python · methane · Mar 17, 2024 · Jan 3, 2024 · Jan 3, 2024 · Jan 3, 2024
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
@@ -11,7 +11,7 @@
 sace_prefix = "xn--"
 
 # This assumes query strings, so AllowUnassigned is true
-def nameprep(label):
+def nameprep(label): # type: (str) -> str
  # Map
  newlabel = []
  for c in label:
@@ -25,7 +25,7 @@ def nameprep(label):
  label = unicodedata.normalize("NFKC", label)
 
  # Prohibit
- for c in label:
+ for i, c in enumerate(label):
  if stringprep.in_table_c12(c) or \
  stringprep.in_table_c22(c) or \
  stringprep.in_table_c3(c) or \
@@ -35,7 +35,7 @@ def nameprep(label):
  stringprep.in_table_c7(c) or \
  stringprep.in_table_c8(c) or \
  stringprep.in_table_c9(c):
- raise UnicodeError("Invalid character %r" % c)
+ raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}")
 
  # Check bidi
  RandAL = [stringprep.in_table_d1(x) for x in label]
@@ -46,59 +46,73 @@ def nameprep(label):
  # This is table C.8, which was already checked
  # 2) If a string contains any RandALCat character, the string
  # MUST NOT contain any LCat character.
- if any(stringprep.in_table_d2(x) for x in label):
- raise UnicodeError("Violation of BIDI requirement 2")
+ for i, x in enumerate(label):
+ if stringprep.in_table_d2(x):
+ raise UnicodeEncodeError("idna", label, i, i+1,
+ "Violation of BIDI requirement 2")
  # 3) If a string contains any RandALCat character, a
  # RandALCat character MUST be the first character of the
  # string, and a RandALCat character MUST be the last
  # character of the string.
- if not RandAL[0] or not RandAL[-1]:
- raise UnicodeError("Violation of BIDI requirement 3")
+ if not RandAL[0]:
+ raise UnicodeEncodeError("idna", label, 0, 1,
+ "Violation of BIDI requirement 3")
+ if not RandAL[-1]:
+ raise UnicodeEncodeError("idna", label, len(label)-1, len(label),
+ "Violation of BIDI requirement 3")
 
  return label
 
-def ToASCII(label):
+def ToASCII(label): # type: (str) -> bytes
  try:
  # Step 1: try ASCII
-label = label.encode("ascii")
- except UnicodeError:
+label_ascii = label.encode("ascii")
+ except UnicodeEncodeError:
  pass
  else:
  # Skip to step 3: UseSTD3ASCIIRules is false, so
  # Skip to step 8.
- if 0 < len(label) < 64:
- return label
- raise UnicodeError("label empty or too long")
+ if 0 < len(label_ascii) < 64:
+ return label_ascii
+ if len(label) == 0:
+ raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
+ else:
+ raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
 
  # Step 2: nameprep
  label = nameprep(label)
 
  # Step 3: UseSTD3ASCIIRules is false
  # Step 4: try ASCII
  try:
-label = label.encode("ascii")
- except UnicodeError:
+label_ascii = label.encode("ascii")
+ except UnicodeEncodeError:
  pass
  else:
  # Skip to step 8.
  if 0 < len(label) < 64:
- return label
- raise UnicodeError("label empty or too long")
+ return label_ascii
+ if len(label) == 0:
+ raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
+ else:
+ raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
 
  # Step 5: Check ACE prefix
- if label[:4].lower() == sace_prefix:
- raise UnicodeError("Label starts with ACE prefix")
+ if label.lower().startswith(sace_prefix):
+ raise UnicodeEncodeError(
+ "idna", label, 0, len(sace_prefix), "Label starts with ACE prefix")
 
  # Step 6: Encode with PUNYCODE
-label = label.encode("punycode")
+label_ascii = label.encode("punycode")
 
  # Step 7: Prepend ACE prefix
-label = ace_prefix + label
+label_ascii = ace_prefix + label_ascii
 
  # Step 8: Check size
- if 0 < len(label) < 64:
- return label
- raise UnicodeError("label empty or too long")
+ # do not check for empty as we prepend ace_prefix.
+ if len(label_ascii) < 64:
+ return label_ascii
+ raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
 
 def ToUnicode(label):
  if len(label) > 1024:
@@ -110,41 +124,51 @@ def ToUnicode(label):
  # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
  # preventing us from wasting time decoding a big thing that'll just
  # hit the actual <= 63 length limit in Step 6.
- raise UnicodeError("label way too long")
+ if isinstance(label, str):
+ label = label.encode("utf-8", errors="backslashreplace")
+ raise UnicodeDecodeError("idna", label, 0, len(label), "label way too long")
  # Step 1: Check for ASCII
  if isinstance(label, bytes):
  pure_ascii = True
  else:
  try:
  label = label.encode("ascii")
  pure_ascii = True
- except UnicodeError:
+ except UnicodeEncodeError:
  pure_ascii = False
  if not pure_ascii:
+ assert isinstance(label, str)
  # Step 2: Perform nameprep
  label = nameprep(label)
  # It doesn't say this, but apparently, it should be ASCII now
  try:
  label = label.encode("ascii")
- except UnicodeError:
- raise UnicodeError("Invalid character in IDN label")
+ except UnicodeEncodeError as exc:
+ raise UnicodeEncodeError("idna", label, exc.start, exc.end,
+ "Invalid character in IDN label")
  # Step 3: Check for ACE prefix
- if not label[:4].lower() == ace_prefix:
+ assert isinstance(label, bytes)
+ if not label.lower().startswith(ace_prefix):
  return str(label, "ascii")
 
  # Step 4: Remove ACE prefix
  label1 = label[len(ace_prefix):]
 
  # Step 5: Decode using PUNYCODE
- result = label1.decode("punycode")
+ try:
+ result = label1.decode("punycode")
+ except UnicodeDecodeError as exc:
+ offset = len(ace_prefix)
+ raise UnicodeDecodeError("idna", label, offset+exc.start, offset+exc.end, exc.reason)
 
  # Step 6: Apply ToASCII
  label2 = ToASCII(result)
 
  # Step 7: Compare the result of step 6 with the one of step 3
  # label2 will already be in lower case.
  if str(label, "ascii").lower() != str(label2, "ascii"):
- raise UnicodeError("IDNA does not round-trip", label, label2)
+ raise UnicodeDecodeError("idna", label, 0, len(label),
+ f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")
 
  # Step 8: return the result of step 5
  return result
@@ -156,7 +180,7 @@ def encode(self, input, errors='strict'):
 
  if errors != 'strict':
  # IDNA is quite clear that implementations must be strict
- raise UnicodeError("unsupported error handling "+errors)
+ raise UnicodeError(f"Unsupported error handling: {errors}")
 
  if not input:
  return b'', 0
@@ -168,11 +192,16 @@ def encode(self, input, errors='strict'):
  else:
  # ASCII name: fast path
  labels = result.split(b'.')
- for label in labels[:-1]:
- if not (0 < len(label) < 64):
- raise UnicodeError("label empty or too long")
- if len(labels[-1]) >= 64:
- raise UnicodeError("label too long")
+ for i, label in enumerate(labels[:-1]):
+ if len(label) == 0:
+ offset = sum(len(l) for l in labels[:i]) + i
+ raise UnicodeEncodeError("idna", input, offset, offset+1,
+ "label empty")
+ for i, label in enumerate(labels):
+ if len(label) >= 64:
+ offset = sum(len(l) for l in labels[:i]) + i
+ raise UnicodeEncodeError("idna", input, offset, offset+len(label),
+ "label too long")
  return result, len(input)
 
  result = bytearray()
@@ -182,17 +211,27 @@ def encode(self, input, errors='strict'):
  del labels[-1]
  else:
  trailing_dot = b''
- for label in labels:
+ for i, label in enumerate(labels):
  if result:
  # Join with U+002E
  result.extend(b'.')
- result.extend(ToASCII(label))
+ try:
+ result.extend(ToASCII(label))
+ except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+ offset = sum(len(l) for l in labels[:i]) + i
+ raise UnicodeEncodeError(
+ "idna",
+ input,
+ offset + exc.start,
+ offset + exc.end,
+ exc.reason,
+ )
  return bytes(result+trailing_dot), len(input)
 
  def decode(self, input, errors='strict'):
 
  if errors != 'strict':
- raise UnicodeError("Unsupported error handling "+errors)
+ raise UnicodeError(f"Unsupported error handling: {errors}")
 
  if not input:
  return "", 0
@@ -218,16 +257,23 @@ def decode(self, input, errors='strict'):
  trailing_dot = ''
 
  result = []
- for label in labels:
- result.append(ToUnicode(label))
+ for i, label in enumerate(labels):
+ try:
+ u_label = ToUnicode(label)
+ except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+ offset = sum(len(x) for x in labels[:i]) + len(labels[:i])
+ raise UnicodeDecodeError(
+ "idna", input, offset+exc.start, offset+exc.end, exc.reason)
+ else:
+ result.append(u_label)
 
  return ".".join(result)+trailing_dot, len(input)
 
 class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
  def _buffer_encode(self, input, errors, final):
  if errors != 'strict':
  # IDNA is quite clear that implementations must be strict
- raise UnicodeError("unsupported error handling "+errors)
+ raise UnicodeError(f"Unsupported error handling: {errors}")
 
  if not input:
  return (b'', 0)
@@ -251,7 +297,16 @@ def _buffer_encode(self, input, errors, final):
  # Join with U+002E
  result.extend(b'.')
  size += 1
- result.extend(ToASCII(label))
+ try:
+ result.extend(ToASCII(label))
+ except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+ raise UnicodeEncodeError(
+ "idna",
+ input,
+ size + exc.start,
+ size + exc.end,
+ exc.reason,
+ )
  size += len(label)
 
  result += trailing_dot
@@ -261,7 +316,7 @@ def _buffer_encode(self, input, errors, final):
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
  def _buffer_decode(self, input, errors, final):
  if errors != 'strict':
- raise UnicodeError("Unsupported error handling "+errors)
+ raise UnicodeError(f"Unsupported error handling: {errors}")
 
  if not input:
  return ("", 0)
@@ -271,7 +326,11 @@ def _buffer_decode(self, input, errors, final):
  labels = dots.split(input)
  else:
  # Must be ASCII string
- input = str(input, "ascii")
+ try:
+ input = str(input, "ascii")
+ except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+ raise UnicodeDecodeError("idna", input,
+ exc.start, exc.end, exc.reason)
  labels = input.split(".")
 
  trailing_dot = ''
@@ -288,7 +347,18 @@ def _buffer_decode(self, input, errors, final):
  result = []
  size = 0
  for label in labels:
- result.append(ToUnicode(label))
+ try:
+ u_label = ToUnicode(label)
+ except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+ raise UnicodeDecodeError(
+ "idna",
+ input.encode("ascii", errors="backslashreplace"),
+ size + exc.start,
+ size + exc.end,
+ exc.reason,
+ )
+ else:
+ result.append(u_label)
  if size:
  size += 1
  size += len(label)