mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
gh-85287: Change codecs to raise precise UnicodeEncodeError and UnicodeDecodeError (#113674)
Co-authored-by: Inada Naoki <songofacandy@gmail.com>
This commit is contained in:
parent
c514a975ab
commit
649857a157
9 changed files with 306 additions and 81 deletions
|
@ -1,4 +1,4 @@
|
|||
""" Codec for the Punicode encoding, as specified in RFC 3492
|
||||
""" Codec for the Punycode encoding, as specified in RFC 3492
|
||||
|
||||
Written by Martin v. Löwis.
|
||||
"""
|
||||
|
@ -131,10 +131,11 @@ def decode_generalized_number(extended, extpos, bias, errors):
|
|||
j = 0
|
||||
while 1:
|
||||
try:
|
||||
char = ord(extended[extpos])
|
||||
char = extended[extpos]
|
||||
except IndexError:
|
||||
if errors == "strict":
|
||||
raise UnicodeError("incomplete punicode string")
|
||||
raise UnicodeDecodeError("punycode", extended, extpos, extpos+1,
|
||||
"incomplete punycode string")
|
||||
return extpos + 1, None
|
||||
extpos += 1
|
||||
if 0x41 <= char <= 0x5A: # A-Z
|
||||
|
@ -142,8 +143,8 @@ def decode_generalized_number(extended, extpos, bias, errors):
|
|||
elif 0x30 <= char <= 0x39:
|
||||
digit = char - 22 # 0x30-26
|
||||
elif errors == "strict":
|
||||
raise UnicodeError("Invalid extended code point '%s'"
|
||||
% extended[extpos-1])
|
||||
raise UnicodeDecodeError("punycode", extended, extpos-1, extpos,
|
||||
f"Invalid extended code point '{extended[extpos-1]}'")
|
||||
else:
|
||||
return extpos, None
|
||||
t = T(j, bias)
|
||||
|
@ -155,11 +156,14 @@ def decode_generalized_number(extended, extpos, bias, errors):
|
|||
|
||||
|
||||
def insertion_sort(base, extended, errors):
|
||||
"""3.2 Insertion unsort coding"""
|
||||
"""3.2 Insertion sort coding"""
|
||||
# This function raises UnicodeDecodeError with position in the extended.
|
||||
# Caller should add the offset.
|
||||
char = 0x80
|
||||
pos = -1
|
||||
bias = 72
|
||||
extpos = 0
|
||||
|
||||
while extpos < len(extended):
|
||||
newpos, delta = decode_generalized_number(extended, extpos,
|
||||
bias, errors)
|
||||
|
@ -171,7 +175,9 @@ def insertion_sort(base, extended, errors):
|
|||
char += pos // (len(base) + 1)
|
||||
if char > 0x10FFFF:
|
||||
if errors == "strict":
|
||||
raise UnicodeError("Invalid character U+%x" % char)
|
||||
raise UnicodeDecodeError(
|
||||
"punycode", extended, pos-1, pos,
|
||||
f"Invalid character U+{char:x}")
|
||||
char = ord('?')
|
||||
pos = pos % (len(base) + 1)
|
||||
base = base[:pos] + chr(char) + base[pos:]
|
||||
|
@ -187,11 +193,21 @@ def punycode_decode(text, errors):
|
|||
pos = text.rfind(b"-")
|
||||
if pos == -1:
|
||||
base = ""
|
||||
extended = str(text, "ascii").upper()
|
||||
extended = text.upper()
|
||||
else:
|
||||
base = str(text[:pos], "ascii", errors)
|
||||
extended = str(text[pos+1:], "ascii").upper()
|
||||
return insertion_sort(base, extended, errors)
|
||||
try:
|
||||
base = str(text[:pos], "ascii", errors)
|
||||
except UnicodeDecodeError as exc:
|
||||
raise UnicodeDecodeError("ascii", text, exc.start, exc.end,
|
||||
exc.reason) from None
|
||||
extended = text[pos+1:].upper()
|
||||
try:
|
||||
return insertion_sort(base, extended, errors)
|
||||
except UnicodeDecodeError as exc:
|
||||
offset = pos + 1
|
||||
raise UnicodeDecodeError("punycode", text,
|
||||
offset+exc.start, offset+exc.end,
|
||||
exc.reason) from None
|
||||
|
||||
### Codec APIs
|
||||
|
||||
|
@ -203,7 +219,7 @@ class Codec(codecs.Codec):
|
|||
|
||||
def decode(self, input, errors='strict'):
|
||||
if errors not in ('strict', 'replace', 'ignore'):
|
||||
raise UnicodeError("Unsupported error handling "+errors)
|
||||
raise UnicodeError(f"Unsupported error handling: {errors}")
|
||||
res = punycode_decode(input, errors)
|
||||
return res, len(input)
|
||||
|
||||
|
@ -214,7 +230,7 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
|
|||
class IncrementalDecoder(codecs.IncrementalDecoder):
|
||||
def decode(self, input, final=False):
|
||||
if self.errors not in ('strict', 'replace', 'ignore'):
|
||||
raise UnicodeError("Unsupported error handling "+self.errors)
|
||||
raise UnicodeError(f"Unsupported error handling: {self.errors}")
|
||||
return punycode_decode(input, self.errors)
|
||||
|
||||
class StreamWriter(Codec,codecs.StreamWriter):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue