Skip to content

Commit bf067db

Browse files
authored
Enhancing URL detection (#57)
* Enhancing URL detection: If allow list has scheme, match exactly * Validating ports * Removing redundant variable assignment * Handle trailing slash in allow lists * Allow scheme-less user input * Fix port matching * improve port matching
1 parent 06c1018 commit bf067db

File tree

2 files changed

+604
-43
lines changed

2 files changed

+604
-43
lines changed

‎src/guardrails/checks/text/urls.py‎

Lines changed: 222 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,21 @@
2727
fromtypingimportAny
2828
fromurllib.parseimportParseResult, urlparse
2929

30-
frompydanticimportBaseModel, Field
30+
frompydanticimportBaseModel, Field, field_validator
3131

3232
fromguardrails.registryimportdefault_spec_registry
3333
fromguardrails.specimportGuardrailSpecMetadata
3434
fromguardrails.typesimportGuardrailResult
3535

3636
__all__= ["urls"]
3737

38+
DEFAULT_PORTS={
39+
"http": 80,
40+
"https": 443,
41+
}
42+
43+
SCHEME_PREFIX_RE=re.compile(r"^[a-z][a-z0-9+.-]*://")
44+
3845

3946
@dataclass(frozen=True, slots=True)
4047
classUrlDetectionResult:
@@ -66,9 +73,53 @@ class URLConfig(BaseModel):
6673
description="Allow subdomains of allowed domains (e.g. api.example.com if example.com is allowed)",
6774
)
6875

76+
@field_validator("allowed_schemes", mode="before")
77+
@classmethod
78+
defnormalize_allowed_schemes(cls, value: Any) ->set[str]:
79+
"""Normalize allowed schemes to bare identifiers without delimiters."""
80+
ifvalueisNone:
81+
return{"https"}
82+
83+
ifisinstance(value, str):
84+
raw_values= [value]
85+
else:
86+
raw_values=list(value)
87+
88+
normalized: set[str] =set()
89+
forentryinraw_values:
90+
ifnotisinstance(entry, str):
91+
raiseTypeError("allowed_schemes entries must be strings")
92+
cleaned=entry.strip().lower()
93+
ifnotcleaned:
94+
continue
95+
# Support inputs like "https://", "HTTPS:", or " https "
96+
ifcleaned.endswith("://"):
97+
cleaned=cleaned[:-3]
98+
cleaned=cleaned.removesuffix(":")
99+
ifcleaned:
100+
normalized.add(cleaned)
101+
102+
ifnotnormalized:
103+
raiseValueError("allowed_schemes must include at least one scheme")
104+
105+
returnnormalized
106+
69107

70108
def_detect_urls(text: str) ->list[str]:
71-
"""Detect URLs using regex."""
109+
"""Detect URLs using regex patterns with deduplication.
110+
111+
Detects URLs with explicit schemes (http, https, ftp, data, javascript,
112+
vbscript), domain-like patterns without schemes, and IP addresses.
113+
Deduplicates to avoid returning both scheme-ful and scheme-less versions
114+
of the same URL.
115+
116+
Args:
117+
text: The text to scan for URLs.
118+
119+
Returns:
120+
List of unique URL strings found in the text, with trailing
121+
punctuation removed.
122+
"""
72123
# Pattern for cleaning trailing punctuation (] must be escaped)
73124
PUNCTUATION_CLEANUP=r"[.,;:!?)\]]+$"
74125

@@ -155,55 +206,110 @@ def _detect_urls(text: str) -> list[str]:
155206
returnlist(dict.fromkeys([urlforurlinfinal_urlsifurl]))
156207

157208

158-
def_validate_url_security(url_string: str, config: URLConfig) ->tuple[ParseResult|None, str]:
159-
"""Validate URL using stdlib urllib.parse."""
209+
def_validate_url_security(url_string: str, config: URLConfig) ->tuple[ParseResult|None, str, bool]:
210+
"""Validate URL security properties using urllib.parse.
211+
212+
Checks URL structure, validates the scheme is allowed, and ensures no
213+
credentials are embedded in userinfo if block_userinfo is enabled.
214+
215+
Args:
216+
url_string: The URL string to validate.
217+
config: Configuration specifying allowed schemes and userinfo policy.
218+
219+
Returns:
220+
A tuple of (parsed_url, error_reason, had_explicit_scheme). If validation
221+
succeeds, parsed_url is a ParseResult, error_reason is empty, and
222+
had_explicit_scheme indicates if the original URL included a scheme.
223+
If validation fails, parsed_url is None and error_reason describes the failure.
224+
"""
160225
try:
161-
# Parse URL - preserve original scheme for validation
226+
# Parse URL - track whether scheme was explicit
227+
has_explicit_scheme=False
162228
if"://"inurl_string:
163229
# Standard URL with double-slash scheme (http://, https://, ftp://, etc.)
164230
parsed_url=urlparse(url_string)
165231
original_scheme=parsed_url.scheme
232+
has_explicit_scheme=True
166233
elif":"inurl_stringandurl_string.split(":", 1)[0] in{"data", "javascript", "vbscript", "mailto"}:
167234
# Special single-colon schemes
168235
parsed_url=urlparse(url_string)
169236
original_scheme=parsed_url.scheme
237+
has_explicit_scheme=True
170238
else:
171-
# Add http scheme for parsing, but remember this is a default
239+
# Add http scheme for parsing only (user didn't specify a scheme)
172240
parsed_url=urlparse(f"http://{url_string}")
173-
original_scheme="http"# Default scheme for scheme-less URLs
241+
original_scheme=None# No explicit scheme
242+
has_explicit_scheme=False
174243

175244
# Basic validation: must have scheme and netloc (except for special schemes)
176245
ifnotparsed_url.scheme:
177-
returnNone, "Invalid URL format"
246+
returnNone, "Invalid URL format", False
178247

179248
# Special schemes like data: and javascript: don't need netloc
180249
special_schemes={"data", "javascript", "vbscript", "mailto"}
181-
iforiginal_schemenotinspecial_schemesandnotparsed_url.netloc:
182-
returnNone, "Invalid URL format"
250+
ifparsed_url.schemenotinspecial_schemesandnotparsed_url.netloc:
251+
returnNone, "Invalid URL format", False
183252

184-
# Security validations - use original scheme
185-
iforiginal_schemenotinconfig.allowed_schemes:
186-
returnNone, f"Blocked scheme: {original_scheme}"
253+
# Security validations - only validate scheme if it was explicitly provided
254+
ifhas_explicit_schemeandoriginal_schemenotinconfig.allowed_schemes:
255+
returnNone, f"Blocked scheme: {original_scheme}", has_explicit_scheme
187256

188-
ifconfig.block_userinfoandparsed_url.username:
189-
returnNone, "Contains userinfo (potential credential injection)"
257+
ifconfig.block_userinfoand(parsed_url.usernameorparsed_url.password):
258+
returnNone, "Contains userinfo (potential credential injection)", has_explicit_scheme
190259

191260
# Everything else (IPs, localhost, private IPs) goes through allow list logic
192-
returnparsed_url, ""
261+
returnparsed_url, "", has_explicit_scheme
193262

194263
except (ValueError, UnicodeError, AttributeError) ase:
195264
# Common URL parsing errors:
196265
# - ValueError: Invalid URL structure, invalid port, etc.
197266
# - UnicodeError: Invalid encoding in URL
198267
# - AttributeError: Unexpected URL structure
199-
returnNone, f"Invalid URL format: {str(e)}"
268+
returnNone, f"Invalid URL format: {str(e)}", False
200269
exceptExceptionase:
201270
# Catch any unexpected errors but provide debugging info
202-
returnNone, f"URL parsing error: {type(e).__name__}: {str(e)}"
271+
returnNone, f"URL parsing error: {type(e).__name__}: {str(e)}", False
272+
273+
274+
def_safe_get_port(parsed: ParseResult, scheme: str) ->int|None:
275+
"""Safely extract port from ParseResult, handling malformed ports.
276+
277+
Args:
278+
parsed: The parsed URL.
279+
scheme: The URL scheme (for default port lookup).
280+
281+
Returns:
282+
The port number, the default port for the scheme, or None if invalid.
283+
"""
284+
try:
285+
returnparsed.portorDEFAULT_PORTS.get(scheme.lower())
286+
exceptValueError:
287+
# Port is out of range (0-65535) or malformed
288+
returnNone
289+
290+
291+
def_is_url_allowed(
292+
parsed_url: ParseResult,
293+
allow_list: list[str],
294+
allow_subdomains: bool,
295+
url_had_explicit_scheme: bool,
296+
) ->bool:
297+
"""Check if parsed URL matches any entry in the allow list.
203298
299+
Supports domain names, IP addresses, CIDR blocks, and full URLs with
300+
paths/ports/query strings. Allow list entries without explicit schemes
301+
match any scheme. Entries with schemes must match exactly against URLs
302+
with explicit schemes, but match any scheme-less URL.
204303
205-
def_is_url_allowed(parsed_url: ParseResult, allow_list: list[str], allow_subdomains: bool) ->bool:
206-
"""Check if URL is allowed."""
304+
Args:
305+
parsed_url: The parsed URL to check.
306+
allow_list: List of allowed URL patterns (domains, IPs, CIDR, full URLs).
307+
allow_subdomains: If True, subdomains of allowed domains are permitted.
308+
url_had_explicit_scheme: Whether the original URL included an explicit scheme.
309+
310+
Returns:
311+
True if the URL matches any allow list entry, False otherwise.
312+
"""
207313
ifnotallow_list:
208314
returnFalse
209315

@@ -212,30 +318,109 @@ def _is_url_allowed(parsed_url: ParseResult, allow_list: list[str], allow_subdom
212318
returnFalse
213319

214320
url_host=url_host.lower()
321+
url_domain=url_host.replace("www.", "")
322+
scheme_lower=parsed_url.scheme.lower() ifparsed_url.schemeelse""
323+
# Safely get port (rejects malformed ports)
324+
url_port=_safe_get_port(parsed_url, scheme_lower)
325+
# Early rejection of malformed ports
326+
try:
327+
_=parsed_url.port# This will raise ValueError for malformed ports
328+
exceptValueError:
329+
returnFalse
330+
url_path=parsed_url.pathor"/"
331+
url_query=parsed_url.query
332+
url_fragment=parsed_url.fragment
333+
334+
try:
335+
url_ip=ip_address(url_host)
336+
except (AddressValueError, ValueError):
337+
url_ip=None
215338

216339
forallowed_entryinallow_list:
217340
allowed_entry=allowed_entry.lower().strip()
218341

219-
# Handle IP addresses and CIDR blocks
342+
has_explicit_scheme=bool(SCHEME_PREFIX_RE.match(allowed_entry))
343+
ifhas_explicit_scheme:
344+
parsed_allowed=urlparse(allowed_entry)
345+
else:
346+
parsed_allowed=urlparse(f"//{allowed_entry}")
347+
allowed_host= (parsed_allowed.hostnameor"").lower()
348+
allowed_scheme=parsed_allowed.scheme.lower() ifparsed_allowed.schemeelse""
349+
# Check if port was explicitly specified (safely)
350+
try:
351+
allowed_port_explicit=parsed_allowed.port
352+
exceptValueError:
353+
allowed_port_explicit=None
354+
allowed_port=_safe_get_port(parsed_allowed, allowed_scheme)
355+
allowed_path=parsed_allowed.path
356+
allowed_query=parsed_allowed.query
357+
allowed_fragment=parsed_allowed.fragment
358+
359+
# Handle IP addresses and CIDR blocks (including schemes)
220360
try:
221-
ip_address(allowed_entry.split("/")[0])
222-
ifallowed_entry==url_hostor ("/"inallowed_entryandip_address(url_host) inip_network(allowed_entry, strict=False)):
361+
allowed_ip=ip_address(allowed_host)
362+
except (AddressValueError, ValueError):
363+
allowed_ip=None
364+
365+
ifallowed_ipisnotNone:
366+
ifurl_ipisNone:
367+
continue
368+
# Scheme matching for IPs: if both allow list and URL have explicit schemes, they must match
369+
ifhas_explicit_schemeandurl_had_explicit_schemeandallowed_schemeandallowed_scheme!=scheme_lower:
370+
continue
371+
# Port matching: enforce if allow list has explicit port
372+
ifallowed_port_explicitisnotNoneandallowed_port!=url_port:
373+
continue
374+
ifallowed_ip==url_ip:
223375
returnTrue
376+
377+
network_spec=allowed_host
378+
ifparsed_allowed.pathnotin ("", "/"):
379+
network_spec=f"{network_spec}{parsed_allowed.path}"
380+
try:
381+
ifnetwork_specand"/"innetwork_specandurl_ipinip_network(network_spec, strict=False):
382+
returnTrue
383+
except (AddressValueError, ValueError):
384+
# Path segment might not represent a CIDR mask; ignore.
385+
pass
386+
continue
387+
388+
ifnotallowed_host:
224389
continue
225-
except (AddressValueError, ValueError):
226-
pass
227390

228-
# Handle domain matching
229-
allowed_domain=allowed_entry.replace("www.", "")
230-
url_domain=url_host.replace("www.", "")
391+
allowed_domain=allowed_host.replace("www.", "")
231392

232-
# Exact match always allowed
233-
ifurl_domain==allowed_domain:
234-
returnTrue
393+
# Port matching: enforce if allow list has explicit port
394+
ifallowed_port_explicitisnotNoneandallowed_port!=url_port:
395+
continue
396+
397+
host_matches=url_domain==allowed_domainor (
398+
allow_subdomainsandurl_domain.endswith(f".{allowed_domain}")
399+
)
400+
ifnothost_matches:
401+
continue
402+
403+
# Scheme matching: if both allow list and URL have explicit schemes, they must match
404+
ifhas_explicit_schemeandurl_had_explicit_schemeandallowed_schemeandallowed_scheme!=scheme_lower:
405+
continue
406+
407+
# Path matching with segment boundary respect
408+
ifallowed_pathnotin ("", "/"):
409+
# Normalize trailing slashes to prevent issues with entries like "/api/"
410+
# which should match "/api/users" but would fail with double-slash check
411+
normalized_allowed_path=allowed_path.rstrip("/")
412+
# Ensure path matching respects segment boundaries to prevent
413+
# "/api" from matching "/api2" or "/api-v2"
414+
ifurl_path!=allowed_pathandurl_path!=normalized_allowed_pathandnoturl_path.startswith(f"{normalized_allowed_path}/"):
415+
continue
416+
417+
ifallowed_queryandallowed_query!=url_query:
418+
continue
419+
420+
ifallowed_fragmentandallowed_fragment!=url_fragment:
421+
continue
235422

236-
# Subdomain matching if enabled
237-
ifallow_subdomainsandurl_domain.endswith(f".{allowed_domain}"):
238-
returnTrue
423+
returnTrue
239424

240425
returnFalse
241426

@@ -258,7 +443,7 @@ async def urls(ctx: Any, data: str, config: URLConfig) -> GuardrailResult:
258443

259444
forurl_stringindetected_urls:
260445
# Validate URL with security checks
261-
parsed_url, error_reason=_validate_url_security(url_string, config)
446+
parsed_url, error_reason, url_had_explicit_scheme=_validate_url_security(url_string, config)
262447

263448
ifparsed_urlisNone:
264449
blocked.append(url_string)
@@ -273,7 +458,7 @@ async def urls(ctx: Any, data: str, config: URLConfig) -> GuardrailResult:
273458
# For hostless schemes, only scheme permission matters (no allow list needed)
274459
# They were already validated for scheme permission in _validate_url_security
275460
allowed.append(url_string)
276-
elif_is_url_allowed(parsed_url, config.url_allow_list, config.allow_subdomains):
461+
elif_is_url_allowed(parsed_url, config.url_allow_list, config.allow_subdomains, url_had_explicit_scheme):
277462
allowed.append(url_string)
278463
else:
279464
blocked.append(url_string)
@@ -282,7 +467,7 @@ async def urls(ctx: Any, data: str, config: URLConfig) -> GuardrailResult:
282467
returnGuardrailResult(
283468
tripwire_triggered=bool(blocked),
284469
info={
285-
"guardrail_name": "URL Filter (Direct Config)",
470+
"guardrail_name": "URL Filter",
286471
"config":{
287472
"allowed_schemes": list(config.allowed_schemes),
288473
"block_userinfo": config.block_userinfo,

0 commit comments

Comments
(0)