Add unicode handling to keyword match (#52)

steven10a · web-flow · commit 58bfd4340ca7 · 2025-11-19T10:06:02.000-08:00
* Add unicode handling to keyword match
* Handle non-word letter endings or startings
* updated tests
* Ruff formatting
* Address copilot nits

Thank you to @yehorkardash for identifying this in our JS version
diff --git a/src/guardrails/checks/text/keywords.py b/src/guardrails/checks/text/keywords.py
@@ -73,8 +73,21 @@ def _compile_pattern(keywords: tuple[str, ...]) -> re.Pattern[str]:
 Returns:
 re.Pattern[str]: Compiled regex pattern to match any given keyword.
 """
- escaped = (re.escape(k) for k in keywords)
- pattern_text = r"\b(?:" + "|".join(escaped) + r")\b"
+ # Build individual patterns with conditional boundary assertions
+ # Only apply (?<!\w) if keyword starts with word char, (?!\w) if it ends with word char
+ patterns = []
+ for keyword in keywords:
+ escaped = re.escape(keyword)
+ # Check first and last character of the original keyword for word character status
+ starts_with_word_char = keyword and (keyword[0].isalnum() or keyword[0] == "_")
+ ends_with_word_char = keyword and (keyword[-1].isalnum() or keyword[-1] == "_")
+
+ prefix = r"(?<!\w)" if starts_with_word_char else ""
+ suffix = r"(?!\w)" if ends_with_word_char else ""
+ patterns.append(f"{prefix}{escaped}{suffix}")
+
+ # (?<!\w) and (?!\w) use Unicode-aware lookbehind/lookahead to enforce word boundaries.
+ pattern_text = "(?:" + "|".join(patterns) + ")"
 
 return re.compile(pattern_text, re.IGNORECASE)
 
diff --git a/tests/unit/checks/test_keywords.py b/tests/unit/checks/test_keywords.py
@@ -65,3 +65,132 @@ async def test_keywords_does_not_trigger_on_benign_text() -> None:
 result = await keywords(ctx=None, data="Safe content", config=config)
 
 assert result.tripwire_triggered is False # noqa: S101
+
+
+def test_match_keywords_does_not_match_partial_words() -> None:
+ """Ensure substrings embedded in larger words are ignored."""
+ config = KeywordCfg(keywords=["orld"])
+ result = match_keywords("Hello, world!", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is False # noqa: S101
+
+
+def test_match_keywords_handles_numeric_tokens() -> None:
+ """Keywords containing digits should match exact tokens."""
+ config = KeywordCfg(keywords=["world123"])
+ result = match_keywords("Hello, world123", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is True # noqa: S101
+ assert result.info["matched"] == ["world123"] # noqa: S101
+
+
+def test_match_keywords_rejects_partial_numeric_tokens() -> None:
+ """Numeric keywords should not match when extra digits follow."""
+ config = KeywordCfg(keywords=["world123"])
+ result = match_keywords("Hello, world12345", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is False # noqa: S101
+
+
+def test_match_keywords_handles_underscored_tokens() -> None:
+ """Underscored keywords should be detected exactly once."""
+ config = KeywordCfg(keywords=["w_o_r_l_d"])
+ result = match_keywords("Hello, w_o_r_l_d", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is True # noqa: S101
+ assert result.info["matched"] == ["w_o_r_l_d"] # noqa: S101
+
+
+def test_match_keywords_rejects_words_embedded_in_underscores() -> None:
+ """Words surrounded by underscores should not trigger partial matches."""
+ config = KeywordCfg(keywords=["world"])
+ result = match_keywords("Hello, test_world_test", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is False # noqa: S101
+
+
+def test_match_keywords_handles_chinese_characters() -> None:
+ """Unicode keywords such as Chinese characters should match."""
+ config = KeywordCfg(keywords=["你好"])
+ result = match_keywords("你好", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is True # noqa: S101
+ assert result.info["matched"] == ["你好"] # noqa: S101
+
+
+def test_match_keywords_handles_chinese_tokens_with_digits() -> None:
+ """Unicode keywords that include digits should match whole tokens."""
+ config = KeywordCfg(keywords=["你好123"])
+ result = match_keywords("你好123", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is True # noqa: S101
+ assert result.info["matched"] == ["你好123"] # noqa: S101
+
+
+def test_match_keywords_rejects_partial_chinese_tokens_with_digits() -> None:
+ """Unicode keywords with trailing digits should not match supersets."""
+ config = KeywordCfg(keywords=["你好123"])
+ result = match_keywords("你好12345", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is False # noqa: S101
+
+
+def test_match_keywords_applies_boundaries_to_all_keywords() -> None:
+ """Every keyword in a multi-token pattern should respect Unicode boundaries."""
+ config = KeywordCfg(keywords=["test", "hello", "world"])
+ result = match_keywords("testing hello world", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is True # noqa: S101
+ assert result.info["matched"] == ["hello", "world"] # noqa: S101
+
+
+def test_match_keywords_detects_email_like_patterns() -> None:
+ """Email-like keywords starting with punctuation should match after word chars."""
+ config = KeywordCfg(keywords=["@corp.com"])
+ result = match_keywords("foo@corp.com", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is True # noqa: S101
+ assert result.info["matched"] == ["@corp.com"] # noqa: S101
+
+
+def test_match_keywords_detects_hashtag_patterns() -> None:
+ """Hashtag keywords starting with punctuation should match after word chars."""
+ config = KeywordCfg(keywords=["#leak"])
+ result = match_keywords("abc#leak", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is True # noqa: S101
+ assert result.info["matched"] == ["#leak"] # noqa: S101
+
+
+def test_match_keywords_respects_end_boundary_for_punctuation_prefixed() -> None:
+ """Punctuation-prefixed keywords ending with word chars need end boundary."""
+ config = KeywordCfg(keywords=["@leak"])
+ # Should not match when word chars continue after
+ result = match_keywords("foo@leakmore", config, guardrail_name="Test Guardrail")
+ assert result.tripwire_triggered is False # noqa: S101
+
+ # Should match when followed by non-word char
+ result = match_keywords("foo@leak bar", config, guardrail_name="Test Guardrail")
+ assert result.tripwire_triggered is True # noqa: S101
+ assert result.info["matched"] == ["@leak"] # noqa: S101
+
+
+def test_match_keywords_handles_full_punctuation_keywords() -> None:
+ """Keywords consisting only of punctuation should match anywhere."""
+ config = KeywordCfg(keywords=["@#$"])
+ result = match_keywords("test@#$test", config, guardrail_name="Test Guardrail")
+
+ assert result.tripwire_triggered is True # noqa: S101
+ assert result.info["matched"] == ["@#$"] # noqa: S101
+
+
+def test_match_keywords_mixed_punctuation_and_word_chars() -> None:
+ """Keywords with both punctuation prefix and suffix should work correctly."""
+ config = KeywordCfg(keywords=["@user@"])
+ # Should match when embedded
+ result = match_keywords("test@user@test", config, guardrail_name="Test Guardrail")
+ assert result.tripwire_triggered is True # noqa: S101
+
+ # Should match even when followed by more text (no boundaries applied to punctuation edges)
+ result = match_keywords("test@user@more", config, guardrail_name="Test Guardrail")
+ assert result.tripwire_triggered is True # noqa: S101

-Original file line number
+Diff line change
  Returns:
  re.Pattern[str]: Compiled regex pattern to match any given keyword.
  """
 -escaped= (re.escape(k) forkinkeywords)
 -pattern_text=r"\b(?:"+"|".join(escaped) +r")\b"
 +# Build individual patterns with conditional boundary assertions
 +# Only apply (?<!\w) if keyword starts with word char, (?!\w) if it ends with word char
 +patterns= []
 +forkeywordinkeywords:
 +escaped=re.escape(keyword)
 +# Check first and last character of the original keyword for word character status
 +starts_with_word_char=keywordand (keyword[0].isalnum() orkeyword[0] =="_")
 +ends_with_word_char=keywordand (keyword[-1].isalnum() orkeyword[-1] =="_")
++
 +prefix=r"(?<!\w)"ifstarts_with_word_charelse""
 +suffix=r"(?!\w)"ifends_with_word_charelse""
 +patterns.append(f"{prefix}{escaped}{suffix}")
++
 +# (?<!\w) and (?!\w) use Unicode-aware lookbehind/lookahead to enforce word boundaries.
 +pattern_text="(?:"+"|".join(patterns) +")"
 returnre.compile(pattern_text, re.IGNORECASE)
-Original file line number
+Diff line change
 result=awaitkeywords(ctx=None, data="Safe content", config=config)
 assertresult.tripwire_triggeredisFalse# noqa: S101
++
++
 +deftest_match_keywords_does_not_match_partial_words() ->None:
 +"""Ensure substrings embedded in larger words are ignored."""
 +config=KeywordCfg(keywords=["orld"])
 +result=match_keywords("Hello, world!", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisFalse# noqa: S101
++
++
 +deftest_match_keywords_handles_numeric_tokens() ->None:
 +"""Keywords containing digits should match exact tokens."""
 +config=KeywordCfg(keywords=["world123"])
 +result=match_keywords("Hello, world123", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisTrue# noqa: S101
 +assertresult.info["matched"] == ["world123"] # noqa: S101
++
++
 +deftest_match_keywords_rejects_partial_numeric_tokens() ->None:
 +"""Numeric keywords should not match when extra digits follow."""
 +config=KeywordCfg(keywords=["world123"])
 +result=match_keywords("Hello, world12345", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisFalse# noqa: S101
++
++
 +deftest_match_keywords_handles_underscored_tokens() ->None:
 +"""Underscored keywords should be detected exactly once."""
 +config=KeywordCfg(keywords=["w_o_r_l_d"])
 +result=match_keywords("Hello, w_o_r_l_d", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisTrue# noqa: S101
 +assertresult.info["matched"] == ["w_o_r_l_d"] # noqa: S101
++
++
 +deftest_match_keywords_rejects_words_embedded_in_underscores() ->None:
 +"""Words surrounded by underscores should not trigger partial matches."""
 +config=KeywordCfg(keywords=["world"])
 +result=match_keywords("Hello, test_world_test", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisFalse# noqa: S101
++
++
 +deftest_match_keywords_handles_chinese_characters() ->None:
 +"""Unicode keywords such as Chinese characters should match."""
 +config=KeywordCfg(keywords=["你好"])
 +result=match_keywords("你好", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisTrue# noqa: S101
 +assertresult.info["matched"] == ["你好"] # noqa: S101
++
++
 +deftest_match_keywords_handles_chinese_tokens_with_digits() ->None:
 +"""Unicode keywords that include digits should match whole tokens."""
 +config=KeywordCfg(keywords=["你好123"])
 +result=match_keywords("你好123", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisTrue# noqa: S101
 +assertresult.info["matched"] == ["你好123"] # noqa: S101
++
++
 +deftest_match_keywords_rejects_partial_chinese_tokens_with_digits() ->None:
 +"""Unicode keywords with trailing digits should not match supersets."""
 +config=KeywordCfg(keywords=["你好123"])
 +result=match_keywords("你好12345", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisFalse# noqa: S101
++
++
 +deftest_match_keywords_applies_boundaries_to_all_keywords() ->None:
 +"""Every keyword in a multi-token pattern should respect Unicode boundaries."""
 +config=KeywordCfg(keywords=["test", "hello", "world"])
 +result=match_keywords("testing hello world", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisTrue# noqa: S101
 +assertresult.info["matched"] == ["hello", "world"] # noqa: S101
++
++
 +deftest_match_keywords_detects_email_like_patterns() ->None:
 +"""Email-like keywords starting with punctuation should match after word chars."""
 +config=KeywordCfg(keywords=["@corp.com"])
 +result=match_keywords("foo@corp.com", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisTrue# noqa: S101
 +assertresult.info["matched"] == ["@corp.com"] # noqa: S101
++
++
 +deftest_match_keywords_detects_hashtag_patterns() ->None:
 +"""Hashtag keywords starting with punctuation should match after word chars."""
 +config=KeywordCfg(keywords=["#leak"])
 +result=match_keywords("abc#leak", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisTrue# noqa: S101
 +assertresult.info["matched"] == ["#leak"] # noqa: S101
++
++
 +deftest_match_keywords_respects_end_boundary_for_punctuation_prefixed() ->None:
 +"""Punctuation-prefixed keywords ending with word chars need end boundary."""
 +config=KeywordCfg(keywords=["@leak"])
 +# Should not match when word chars continue after
 +result=match_keywords("foo@leakmore", config, guardrail_name="Test Guardrail")
 +assertresult.tripwire_triggeredisFalse# noqa: S101
++
 +# Should match when followed by non-word char
 +result=match_keywords("foo@leak bar", config, guardrail_name="Test Guardrail")
 +assertresult.tripwire_triggeredisTrue# noqa: S101
 +assertresult.info["matched"] == ["@leak"] # noqa: S101
++
++
 +deftest_match_keywords_handles_full_punctuation_keywords() ->None:
 +"""Keywords consisting only of punctuation should match anywhere."""
 +config=KeywordCfg(keywords=["@#$"])
 +result=match_keywords("test@#$test", config, guardrail_name="Test Guardrail")
++
 +assertresult.tripwire_triggeredisTrue# noqa: S101
 +assertresult.info["matched"] == ["@#$"] # noqa: S101
++
++
 +deftest_match_keywords_mixed_punctuation_and_word_chars() ->None:
 +"""Keywords with both punctuation prefix and suffix should work correctly."""
 +config=KeywordCfg(keywords=["@user@"])
 +# Should match when embedded
 +result=match_keywords("test@user@test", config, guardrail_name="Test Guardrail")
 +assertresult.tripwire_triggeredisTrue# noqa: S101
++
 +# Should match even when followed by more text (no boundaries applied to punctuation edges)
 +result=match_keywords("test@user@more", config, guardrail_name="Test Guardrail")
 +assertresult.tripwire_triggeredisTrue# noqa: S101