Skip to content

Commit 58bfd43

Browse files
authored
Add unicode handling to keyword match (#52)
* Add Unicode handling to keyword match
* Handle keywords that start or end with non-word characters
* Update tests
* Apply Ruff formatting
* Address Copilot nits

Thank you to @yehorkardash for identifying this in our JS version.
1 parent eb724bf commit 58bfd43

File tree

2 files changed

+144
-2
lines changed

2 files changed

+144
-2
lines changed

‎src/guardrails/checks/text/keywords.py‎

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,21 @@ def _compile_pattern(keywords: tuple[str, ...]) -> re.Pattern[str]:
7373
Returns:
7474
re.Pattern[str]: Compiled regex pattern to match any given keyword.
7575
"""
76-
escaped= (re.escape(k) forkinkeywords)
77-
pattern_text=r"\b(?:"+"|".join(escaped) +r")\b"
76+
# Build individual patterns with conditional boundary assertions
77+
# Only apply (?<!\w) if keyword starts with word char, (?!\w) if it ends with word char
78+
patterns= []
79+
forkeywordinkeywords:
80+
escaped=re.escape(keyword)
81+
# Check first and last character of the original keyword for word character status
82+
starts_with_word_char=keywordand (keyword[0].isalnum() orkeyword[0] =="_")
83+
ends_with_word_char=keywordand (keyword[-1].isalnum() orkeyword[-1] =="_")
84+
85+
prefix=r"(?<!\w)"ifstarts_with_word_charelse""
86+
suffix=r"(?!\w)"ifends_with_word_charelse""
87+
patterns.append(f"{prefix}{escaped}{suffix}")
88+
89+
# (?<!\w) and (?!\w) use Unicode-aware lookbehind/lookahead to enforce word boundaries.
90+
pattern_text="(?:"+"|".join(patterns) +")"
7891

7992
returnre.compile(pattern_text, re.IGNORECASE)
8093

‎tests/unit/checks/test_keywords.py‎

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,132 @@ async def test_keywords_does_not_trigger_on_benign_text() -> None:
6565
result=awaitkeywords(ctx=None, data="Safe content", config=config)
6666

6767
assertresult.tripwire_triggeredisFalse# noqa: S101
68+
69+
70+
deftest_match_keywords_does_not_match_partial_words() ->None:
71+
"""Ensure substrings embedded in larger words are ignored."""
72+
config=KeywordCfg(keywords=["orld"])
73+
result=match_keywords("Hello, world!", config, guardrail_name="Test Guardrail")
74+
75+
assertresult.tripwire_triggeredisFalse# noqa: S101
76+
77+
78+
deftest_match_keywords_handles_numeric_tokens() ->None:
79+
"""Keywords containing digits should match exact tokens."""
80+
config=KeywordCfg(keywords=["world123"])
81+
result=match_keywords("Hello, world123", config, guardrail_name="Test Guardrail")
82+
83+
assertresult.tripwire_triggeredisTrue# noqa: S101
84+
assertresult.info["matched"] == ["world123"] # noqa: S101
85+
86+
87+
deftest_match_keywords_rejects_partial_numeric_tokens() ->None:
88+
"""Numeric keywords should not match when extra digits follow."""
89+
config=KeywordCfg(keywords=["world123"])
90+
result=match_keywords("Hello, world12345", config, guardrail_name="Test Guardrail")
91+
92+
assertresult.tripwire_triggeredisFalse# noqa: S101
93+
94+
95+
deftest_match_keywords_handles_underscored_tokens() ->None:
96+
"""Underscored keywords should be detected exactly once."""
97+
config=KeywordCfg(keywords=["w_o_r_l_d"])
98+
result=match_keywords("Hello, w_o_r_l_d", config, guardrail_name="Test Guardrail")
99+
100+
assertresult.tripwire_triggeredisTrue# noqa: S101
101+
assertresult.info["matched"] == ["w_o_r_l_d"] # noqa: S101
102+
103+
104+
deftest_match_keywords_rejects_words_embedded_in_underscores() ->None:
105+
"""Words surrounded by underscores should not trigger partial matches."""
106+
config=KeywordCfg(keywords=["world"])
107+
result=match_keywords("Hello, test_world_test", config, guardrail_name="Test Guardrail")
108+
109+
assertresult.tripwire_triggeredisFalse# noqa: S101
110+
111+
112+
deftest_match_keywords_handles_chinese_characters() ->None:
113+
"""Unicode keywords such as Chinese characters should match."""
114+
config=KeywordCfg(keywords=["你好"])
115+
result=match_keywords("你好", config, guardrail_name="Test Guardrail")
116+
117+
assertresult.tripwire_triggeredisTrue# noqa: S101
118+
assertresult.info["matched"] == ["你好"] # noqa: S101
119+
120+
121+
deftest_match_keywords_handles_chinese_tokens_with_digits() ->None:
122+
"""Unicode keywords that include digits should match whole tokens."""
123+
config=KeywordCfg(keywords=["你好123"])
124+
result=match_keywords("你好123", config, guardrail_name="Test Guardrail")
125+
126+
assertresult.tripwire_triggeredisTrue# noqa: S101
127+
assertresult.info["matched"] == ["你好123"] # noqa: S101
128+
129+
130+
deftest_match_keywords_rejects_partial_chinese_tokens_with_digits() ->None:
131+
"""Unicode keywords with trailing digits should not match supersets."""
132+
config=KeywordCfg(keywords=["你好123"])
133+
result=match_keywords("你好12345", config, guardrail_name="Test Guardrail")
134+
135+
assertresult.tripwire_triggeredisFalse# noqa: S101
136+
137+
138+
deftest_match_keywords_applies_boundaries_to_all_keywords() ->None:
139+
"""Every keyword in a multi-token pattern should respect Unicode boundaries."""
140+
config=KeywordCfg(keywords=["test", "hello", "world"])
141+
result=match_keywords("testing hello world", config, guardrail_name="Test Guardrail")
142+
143+
assertresult.tripwire_triggeredisTrue# noqa: S101
144+
assertresult.info["matched"] == ["hello", "world"] # noqa: S101
145+
146+
147+
deftest_match_keywords_detects_email_like_patterns() ->None:
148+
"""Email-like keywords starting with punctuation should match after word chars."""
149+
config=KeywordCfg(keywords=["@corp.com"])
150+
result=match_keywords("foo@corp.com", config, guardrail_name="Test Guardrail")
151+
152+
assertresult.tripwire_triggeredisTrue# noqa: S101
153+
assertresult.info["matched"] == ["@corp.com"] # noqa: S101
154+
155+
156+
deftest_match_keywords_detects_hashtag_patterns() ->None:
157+
"""Hashtag keywords starting with punctuation should match after word chars."""
158+
config=KeywordCfg(keywords=["#leak"])
159+
result=match_keywords("abc#leak", config, guardrail_name="Test Guardrail")
160+
161+
assertresult.tripwire_triggeredisTrue# noqa: S101
162+
assertresult.info["matched"] == ["#leak"] # noqa: S101
163+
164+
165+
deftest_match_keywords_respects_end_boundary_for_punctuation_prefixed() ->None:
166+
"""Punctuation-prefixed keywords ending with word chars need end boundary."""
167+
config=KeywordCfg(keywords=["@leak"])
168+
# Should not match when word chars continue after
169+
result=match_keywords("foo@leakmore", config, guardrail_name="Test Guardrail")
170+
assertresult.tripwire_triggeredisFalse# noqa: S101
171+
172+
# Should match when followed by non-word char
173+
result=match_keywords("foo@leak bar", config, guardrail_name="Test Guardrail")
174+
assertresult.tripwire_triggeredisTrue# noqa: S101
175+
assertresult.info["matched"] == ["@leak"] # noqa: S101
176+
177+
178+
deftest_match_keywords_handles_full_punctuation_keywords() ->None:
179+
"""Keywords consisting only of punctuation should match anywhere."""
180+
config=KeywordCfg(keywords=["@#$"])
181+
result=match_keywords("test@#$test", config, guardrail_name="Test Guardrail")
182+
183+
assertresult.tripwire_triggeredisTrue# noqa: S101
184+
assertresult.info["matched"] == ["@#$"] # noqa: S101
185+
186+
187+
deftest_match_keywords_mixed_punctuation_and_word_chars() ->None:
188+
"""Keywords with both punctuation prefix and suffix should work correctly."""
189+
config=KeywordCfg(keywords=["@user@"])
190+
# Should match when embedded
191+
result=match_keywords("test@user@test", config, guardrail_name="Test Guardrail")
192+
assertresult.tripwire_triggeredisTrue# noqa: S101
193+
194+
# Should match even when followed by more text (no boundaries applied to punctuation edges)
195+
result=match_keywords("test@user@more", config, guardrail_name="Test Guardrail")
196+
assertresult.tripwire_triggeredisTrue# noqa: S101

0 commit comments

Comments
(0)