openai
diff --git a/‎src/guardrails/checks/text/urls.py‎
Lines changed: 222 additions & 37 deletions b/‎src/guardrails/checks/text/urls.py‎
Lines changed: 222 additions & 37 deletions
@@ -27,14 +27,21 @@
 fromtypingimportAny
 fromurllib.parseimportParseResult, urlparse
 
-frompydanticimportBaseModel, Field
+frompydanticimportBaseModel, Field, field_validator
 
 fromguardrails.registryimportdefault_spec_registry
 fromguardrails.specimportGuardrailSpecMetadata
 fromguardrails.typesimportGuardrailResult
 
 __all__= ["urls"]
 
# Well-known port per scheme; looked up when a URL or allow-list entry
# does not carry a port of its own.
DEFAULT_PORTS = dict(http=80, https=443)

# Matches a lowercase "scheme://" prefix anchored at the start of the string.
SCHEME_PREFIX_RE = re.compile(r"^[a-z][a-z0-9+.-]*://")
+
 
 @dataclass(frozen=True, slots=True)
 classUrlDetectionResult:
@@ -66,9 +73,53 @@ class URLConfig(BaseModel):
 description="Allow subdomains of allowed domains (e.g. api.example.com if example.com is allowed)",
  )
 
@field_validator("allowed_schemes", mode="before")
@classmethod
def normalize_allowed_schemes(cls, value: Any) -> set[str]:
    """Reduce configured schemes to bare lowercase names.

    Accepts a single string or an iterable of strings. Each entry is
    stripped and lowercased, then a trailing "://" or ":" is dropped, so
    "https://", "HTTPS:" and " https " all normalize to "https". Entries
    that end up empty are ignored.

    Args:
        value: Raw ``allowed_schemes`` input; ``None`` selects ``{"https"}``.

    Returns:
        The set of normalized scheme names.

    Raises:
        TypeError: If an entry is not a string.
        ValueError: If no usable scheme remains after normalization.
    """
    if value is None:
        return {"https"}

    entries = [value] if isinstance(value, str) else list(value)

    schemes: set[str] = set()
    for item in entries:
        if not isinstance(item, str):
            # NOTE(review): pydantic v2 reportedly wraps only ValueError /
            # AssertionError into ValidationError - confirm that a raw
            # TypeError is the intended failure mode here.
            raise TypeError("allowed_schemes entries must be strings")
        # removesuffix is a no-op when the suffix is absent, so the two
        # calls cover "scheme://", "scheme:" and plain "scheme".
        name = item.strip().lower().removesuffix("://").removesuffix(":")
        if name:
            schemes.add(name)

    if not schemes:
        raise ValueError("allowed_schemes must include at least one scheme")
    return schemes
+
 
 def_detect_urls(text: str) ->list[str]:
-"""Detect URLs using regex."""
+"""Detect URLs using regex patterns with deduplication.
+
+ Detects URLs with explicit schemes (http, https, ftp, data, javascript,
+ vbscript), domain-like patterns without schemes, and IP addresses.
+ Deduplicates to avoid returning both scheme-ful and scheme-less versions
+ of the same URL.
+
+ Args:
+ text: The text to scan for URLs.
+
+ Returns:
+ List of unique URL strings found in the text, with trailing
+ punctuation removed.
+ """
 # Pattern for cleaning trailing punctuation (] must be escaped)
 PUNCTUATION_CLEANUP=r"[.,;:!?)\]]+$"
 
@@ -155,55 +206,110 @@ def _detect_urls(text: str) -> list[str]:
 returnlist(dict.fromkeys([urlforurlinfinal_urlsifurl]))
 
 
def _validate_url_security(url_string: str, config: URLConfig) -> tuple[ParseResult | None, str, bool]:
    """Validate URL security properties using urllib.parse.

    Checks URL structure, validates the scheme against the allowed set when
    the URL spelled one out, and rejects embedded credentials when
    ``config.block_userinfo`` is enabled.

    Args:
        url_string: The URL string to validate.
        config: Configuration providing ``allowed_schemes`` and
            ``block_userinfo``.

    Returns:
        A tuple ``(parsed_url, error_reason, had_explicit_scheme)``. On
        success ``parsed_url`` is a ParseResult and ``error_reason`` is
        empty. On failure ``parsed_url`` is None and ``error_reason``
        describes the problem; the flag is False for structural/parsing
        failures and reflects the input for scheme/userinfo rejections.
    """
    # Hostless schemes written with a single colon; they carry no netloc.
    special_schemes = {"data", "javascript", "vbscript", "mailto"}

    try:
        if "://" in url_string:
            # Standard URL with double-slash scheme (http://, https://, ftp://, ...)
            parsed_url = urlparse(url_string)
            original_scheme = parsed_url.scheme
            has_explicit_scheme = True
        elif ":" in url_string and url_string.split(":", 1)[0].lower() in special_schemes:
            # Compare case-insensitively: "JavaScript:..." is still a
            # javascript URL and must go through the scheme check rather
            # than being parsed as a scheme-less host.
            parsed_url = urlparse(url_string)
            original_scheme = parsed_url.scheme
            has_explicit_scheme = True
        else:
            # Add http scheme for parsing only (user didn't specify a scheme)
            parsed_url = urlparse(f"http://{url_string}")
            original_scheme = None
            has_explicit_scheme = False

        # Basic validation: must have a scheme, and a netloc unless hostless
        if not parsed_url.scheme:
            return None, "Invalid URL format", False

        if parsed_url.scheme not in special_schemes and not parsed_url.netloc:
            return None, "Invalid URL format", False

        # Only an explicitly provided scheme is checked against the allowed set
        if has_explicit_scheme and original_scheme not in config.allowed_schemes:
            return None, f"Blocked scheme: {original_scheme}", has_explicit_scheme

        if config.block_userinfo and (parsed_url.username or parsed_url.password):
            return None, "Contains userinfo (potential credential injection)", has_explicit_scheme

        # Everything else (IPs, localhost, private IPs) goes through allow list logic
        return parsed_url, "", has_explicit_scheme

    except (ValueError, UnicodeError, AttributeError) as e:
        # ValueError: invalid URL structure or port; UnicodeError: invalid
        # encoding; AttributeError: unexpected URL structure.
        return None, f"Invalid URL format: {str(e)}", False
    except Exception as e:
        # Catch any unexpected errors but provide debugging info
        return None, f"URL parsing error: {type(e).__name__}: {str(e)}", False
+
+
def _safe_get_port(parsed: ParseResult, scheme: str) -> int | None:
    """Safely extract the port from a ParseResult, handling malformed ports.

    Args:
        parsed: The parsed URL.
        scheme: The URL scheme, used to look up a default port.

    Returns:
        The explicit port when one is present (including 0), otherwise the
        default port for the scheme. None when the port is malformed or out
        of range, or when the scheme has no known default.
    """
    try:
        port = parsed.port
    except ValueError:
        # Port is out of range (0-65535) or not numeric
        return None

    # Compare against None rather than relying on truthiness: an explicit
    # ":0" must not be silently replaced by the scheme's default port.
    if port is not None:
        return port
    return DEFAULT_PORTS.get(scheme.lower())
+
+
def _is_url_allowed(
    parsed_url: ParseResult,
    allow_list: list[str],
    allow_subdomains: bool,
    url_had_explicit_scheme: bool,
) -> bool:
    """Check if a parsed URL matches any entry in the allow list.

    Supports domain names, IP addresses, CIDR blocks, and full URLs with
    paths/ports/query strings. Allow list entries without explicit schemes
    match any scheme. Entries with schemes must match exactly against URLs
    with explicit schemes, but match any scheme-less URL. A leading "www."
    label is ignored on both sides of a domain comparison.

    Args:
        parsed_url: The parsed URL to check.
        allow_list: List of allowed URL patterns (domains, IPs, CIDR, full URLs).
        allow_subdomains: If True, subdomains of allowed domains are permitted.
        url_had_explicit_scheme: Whether the original URL included an explicit scheme.

    Returns:
        True if the URL matches any allow list entry, False otherwise
        (including when the URL has no host or a malformed port).
    """
    if not allow_list:
        return False

    # NOTE(review): the next two lines fall between diff hunks in the
    # reviewed text and were reconstructed from context - confirm.
    url_host = parsed_url.hostname
    if not url_host:
        return False

    url_host = url_host.lower()
    # Strip only a *leading* "www." label. Removing the substring anywhere
    # would turn e.g. "exawww.mple.com" into "example.com" and let an
    # unrelated host pass as an allowed one.
    url_domain = url_host.removeprefix("www.")
    scheme_lower = parsed_url.scheme.lower() if parsed_url.scheme else ""

    # Accessing .port raises ValueError for malformed/out-of-range ports;
    # such URLs are never allowed.
    try:
        _ = parsed_url.port
    except ValueError:
        return False
    url_port = _safe_get_port(parsed_url, scheme_lower)

    url_path = parsed_url.path or "/"
    url_query = parsed_url.query
    url_fragment = parsed_url.fragment

    try:
        url_ip = ip_address(url_host)
    except ValueError:
        url_ip = None

    for allowed_entry in allow_list:
        allowed_entry = allowed_entry.lower().strip()

        has_explicit_scheme = bool(SCHEME_PREFIX_RE.match(allowed_entry))
        try:
            # Scheme-less entries get a "//" prefix so urlparse treats the
            # leading part as the network location.
            parsed_allowed = urlparse(allowed_entry if has_explicit_scheme else f"//{allowed_entry}")
        except ValueError:
            # urlparse rejects some inputs outright (e.g. unbalanced IPv6
            # brackets); a malformed entry matches nothing instead of
            # aborting the whole check.
            continue

        allowed_host = (parsed_allowed.hostname or "").lower()
        allowed_scheme = parsed_allowed.scheme.lower() if parsed_allowed.scheme else ""
        # NOTE(review): a malformed port in an entry is treated as "no port
        # constraint" - confirm that leniency is intended.
        try:
            allowed_port_explicit = parsed_allowed.port
        except ValueError:
            allowed_port_explicit = None
        allowed_port = _safe_get_port(parsed_allowed, allowed_scheme)
        allowed_path = parsed_allowed.path
        allowed_query = parsed_allowed.query
        allowed_fragment = parsed_allowed.fragment

        # Scheme rule shared by IP and domain entries: only enforced when
        # both the entry and the URL spelled a scheme out.
        scheme_conflict = bool(
            has_explicit_scheme
            and url_had_explicit_scheme
            and allowed_scheme
            and allowed_scheme != scheme_lower
        )
        # Port rule: only enforced when the entry names a port explicitly.
        port_conflict = allowed_port_explicit is not None and allowed_port != url_port

        # Handle IP addresses and CIDR blocks (including schemes)
        try:
            allowed_ip = ip_address(allowed_host)
        except ValueError:
            allowed_ip = None

        if allowed_ip is not None:
            if url_ip is None or scheme_conflict or port_conflict:
                continue
            if allowed_ip == url_ip:
                return True

            # An entry like "10.0.0.0/8" parses as host "10.0.0.0" plus
            # path "/8"; glue them back together to get the CIDR spec.
            network_spec = allowed_host
            if parsed_allowed.path not in ("", "/"):
                network_spec = f"{network_spec}{parsed_allowed.path}"
            try:
                if network_spec and "/" in network_spec and url_ip in ip_network(network_spec, strict=False):
                    return True
            except ValueError:
                # Path segment might not represent a CIDR mask; ignore.
                pass
            continue

        if not allowed_host:
            continue

        allowed_domain = allowed_host.removeprefix("www.")
        if not allowed_domain:
            # An entry that is nothing but "www." would otherwise act as a
            # wildcard suffix in the subdomain check below.
            continue

        if port_conflict:
            continue

        host_matches = url_domain == allowed_domain or (
            allow_subdomains and url_domain.endswith(f".{allowed_domain}")
        )
        if not host_matches:
            continue

        if scheme_conflict:
            continue

        # Path matching with segment boundary respect
        if allowed_path not in ("", "/"):
            # Drop trailing slashes so "/api/" still matches "/api/users",
            # and require a "/" boundary so "/api" does not match "/api2".
            normalized_allowed_path = allowed_path.rstrip("/")
            if (
                url_path != allowed_path
                and url_path != normalized_allowed_path
                and not url_path.startswith(f"{normalized_allowed_path}/")
            ):
                continue

        if allowed_query and allowed_query != url_query:
            continue

        if allowed_fragment and allowed_fragment != url_fragment:
            continue

        return True

    return False
 
@@ -258,7 +443,7 @@ async def urls(ctx: Any, data: str, config: URLConfig) -> GuardrailResult:
 
 forurl_stringindetected_urls:
 # Validate URL with security checks
-parsed_url, error_reason=_validate_url_security(url_string, config)
+parsed_url, error_reason, url_had_explicit_scheme=_validate_url_security(url_string, config)
 
 ifparsed_urlisNone:
 blocked.append(url_string)
@@ -273,7 +458,7 @@ async def urls(ctx: Any, data: str, config: URLConfig) -> GuardrailResult:
 # For hostless schemes, only scheme permission matters (no allow list needed)
 # They were already validated for scheme permission in _validate_url_security
 allowed.append(url_string)
-elif_is_url_allowed(parsed_url, config.url_allow_list, config.allow_subdomains):
+elif_is_url_allowed(parsed_url, config.url_allow_list, config.allow_subdomains, url_had_explicit_scheme):
 allowed.append(url_string)
 else:
 blocked.append(url_string)
@@ -282,7 +467,7 @@ async def urls(ctx: Any, data: str, config: URLConfig) -> GuardrailResult:
 returnGuardrailResult(
 tripwire_triggered=bool(blocked),
 info={
-"guardrail_name": "URL Filter (Direct Config)",
+"guardrail_name": "URL Filter",
 "config":{
 "allowed_schemes": list(config.allowed_schemes),
 "block_userinfo": config.block_userinfo,