mirror of
https://github.com/python/cpython.git
synced 2025-07-27 13:14:41 +00:00

of tuple) that provides incremental decoders and encoders (a way to use stateful codecs without the stream API). Functions codecs.getincrementaldecoder() and codecs.getincrementalencoder() have been added.
222 lines
6.4 KiB
Python
222 lines
6.4 KiB
Python
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
|
|
|
|
import stringprep, re, codecs
|
|
from unicodedata import ucd_3_2_0 as unicodedata
|
|
|
|
# IDNA section 3.1
|
|
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
|
|
|
|
# IDNA section 5
|
|
ace_prefix = "xn--"
|
|
uace_prefix = unicode(ace_prefix, "ascii")
|
|
|
|
# This assumes query strings, so AllowUnassigned is true
|
|
def nameprep(label):
    """Prepare a single label according to Nameprep (RFC 3491).

    The label is processed in four stages: mapping (tables B.1 and B.2),
    NFKC normalization, the prohibited-character check (tables C.1.2
    through C.9) and the bidirectional-text check (tables D.1 and D.2).
    Unassigned code points are not rejected, i.e. AllowUnassigned is
    treated as true.

    label -- the unicode label to prepare.

    Returns the prepared unicode label.  Raises UnicodeError if the label
    contains a prohibited character or violates a bidi requirement.
    """
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = u"".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError("Invalid character %s" % repr(c))

    # Check bidi
    # One flag per character; a real list is needed because the first
    # and last flags are indexed below.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    # The checks below look at the whole label, so they only need to run
    # once when at least one RandAL character is present (not once per
    # RandAL character).
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
|
|
|
|
def ToASCII(label):
    """Convert one label to its ASCII form (the IDNA ToASCII operation).

    A label that already encodes as ASCII is returned as is; otherwise
    it is run through nameprep() and, if still not ASCII, encoded with
    the "punycode" codec and prefixed with the ACE prefix.
    UseSTD3ASCIIRules is treated as false.

    label -- the label to convert.

    Returns the ASCII-encoded label.  Raises UnicodeError if the result
    is empty or longer than 63 characters, if the prepared label already
    starts with the ACE prefix, or if nameprep() rejects the label.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        # The range check fails for an empty label as well as for an
        # over-long one, so the message names both causes.
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")
|
|
|
|
def ToUnicode(label):
    """Convert one label to unicode (the IDNA ToUnicode operation).

    A label without the ACE prefix is returned as a unicode string
    decoded from ASCII.  A label with the ACE prefix has the prefix
    removed and the remainder decoded with the "punycode" codec; the
    decoded text is then passed back through ToASCII() and must
    reproduce the lower-cased input label.

    label -- the label to convert; either a byte string or unicode.

    Returns the unicode label.  Raises UnicodeError if a non-ASCII label
    is still not ASCII after nameprep(), if a step it delegates to
    fails, or if the label does not round-trip through ToASCII().
    """
    # Step 1: Check for ASCII
    if isinstance(label, str):
        # Byte strings are taken to be ASCII here; they are decoded as
        # ASCII further down.
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        # Same exception arguments as the former "raise E, (a, b, c)".
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
|
|
|
|
### Codec APIs
|
|
|
|
class Codec(codecs.Codec):
    """Stateless IDNA codec working on whole, dot-separated names."""

    def encode(self,input,errors='strict'):
        """Encode a domain name label by label with ToASCII().

        input  -- the name to encode; it is split on the IDNA dot
                  characters matched by the module-level ``dots`` pattern.
        errors -- must be 'strict'.

        Returns a tuple (encoded name, len(input)).  Labels are joined
        with "." and a trailing dot in the input is preserved.  Raises
        UnicodeError for any other error handler or when a label cannot
        be converted.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        # An empty last label means the name ended with a dot; remember
        # that and drop the empty label so it is not passed to ToASCII().
        if labels and not labels[-1]:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''
        result = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(result)+trailing_dot, len(input)

    def decode(self,input,errors='strict'):
        """Decode a domain name label by label with ToUnicode().

        input  -- the name to decode.  A unicode object is split on the
                  IDNA dot characters; anything else is converted with
                  str(), must decode as ASCII, and is split on ".".
        errors -- must be 'strict'.

        Returns a tuple (unicode name, len(input)).  A trailing dot in
        the input is preserved.  Raises UnicodeError for any other error
        handler, for non-ASCII byte input, or when a label cannot be
        converted.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Decoded only to validate; the result is not used.
            unicode(input, "ascii")
            labels = input.split(".")

        if labels and not labels[-1]:
            trailing_dot = u'.'
            del labels[-1]
        else:
            trailing_dot = u''

        result = [ToUnicode(label) for label in labels]

        return u".".join(result)+trailing_dot, len(input)
|
|
|
|
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental-encoder front end for the IDNA codec."""

    def encode(self, input, final=False):
        """Encode *input* with a fresh Codec and return the encoded name.

        Every call is handled on its own: nothing is buffered between
        calls and *final* is accepted but not consulted.
        """
        encoded, _consumed = Codec().encode(input, self.errors)
        return encoded
|
|
|
|
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental-decoder front end for the IDNA codec."""

    def decode(self, input, final=False):
        """Decode *input* with a fresh Codec and return the decoded name.

        Every call is handled on its own: nothing is buffered between
        calls and *final* is accepted but not consulted.
        """
        decoded, _consumed = Codec().decode(input, self.errors)
        return decoded
|
|
|
|
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining Codec with codecs.StreamWriter; adds nothing of its own."""
|
|
|
|
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining Codec with codecs.StreamReader; adds nothing of its own."""
|
|
|
|
### encodings module API
|
|
|
|
def getregentry():
    """Return the codecs.CodecInfo entry describing the 'idna' codec.

    The stateless encode/decode callables are bound methods of two
    separate Codec instances; the incremental and stream classes are
    passed as classes.
    """
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
|