cpython/Lib/encodings/idna.py
Walter Dörwald 78a0be6ab3 Add a BufferedIncrementalEncoder class that can be used for implementing
an incremental encoder that must retain part of the data between calls
to the encode() method.

Fix the incremental encoder and decoder for the IDNA encoding.

This closes SF patch #1453235.
2006-04-14 18:25:39 +00:00

288 lines
8.3 KiB
Python

# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
import stringprep, re, codecs
from unicodedata import ucd_3_2_0 as unicodedata
# IDNA section 3.1
# Pattern matching any one of the four label-separator characters
# (U+002E, U+3002, U+FF0E, U+FF61); used below to split names into labels.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
# IDNA section 5
# ACE prefix as a plain (str) string: prepended to punycode output and
# tested against labels that have already been encoded to ASCII.
ace_prefix = "xn--"
# The same prefix as a unicode object, for testing labels that are still
# unicode (see ToASCII step 5).
uace_prefix = unicode(ace_prefix, "ascii")
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label goes through four stages, in this order:

    1. Map: characters in stringprep table B.1 are dropped, every other
       character is replaced by ``stringprep.map_table_b2(c)``.
    2. Normalize: the result is NFKC-normalised.
    3. Prohibit: any character found in tables C.1.2, C.2.2 or C.3-C.9
       is rejected.
    4. Bidi: if the label contains a RandALCat character (table D.1) it
       must contain no LCat character (table D.2) and must both start
       and end with a RandALCat character.

    Unassigned code points are not checked (AllowUnassigned is treated
    as true, i.e. the label is handled as a query string).

    Returns the prepared label.
    Raises UnicodeError for a prohibited character or a bidi violation.
    """
    # Map
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])  # B.1: map to nothing

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    # A real list (not whatever map() returns) because it is indexed below.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    # The checks below do not depend on which RandAL character triggered
    # them, so run them once instead of once per RandAL character.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")
        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
def ToASCII(label):
    """Convert one label to its ASCII-compatible form (IDNA ToASCII).

    A label that is already ASCII is returned as is.  Otherwise it is
    run through nameprep; if that makes it ASCII it is returned,
    otherwise it is punycode-encoded and prefixed with the ACE prefix.
    UseSTD3ASCIIRules is treated as false, so no further character
    restrictions are applied.

    Returns the encoded label.
    Raises UnicodeError if the result is empty or longer than 63
    characters, or if a non-ASCII label already starts with the ACE
    prefix after nameprep.
    """
    def checked(candidate):
        # Step 8: Check size
        if 0 < len(candidate) < 64:
            return candidate
        raise UnicodeError("label empty or too long")

    # Step 1: try ASCII
    try:
        plain = label.encode("ascii")
    except UnicodeError:
        plain = None
    if plain is not None:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        return checked(plain)

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        plain = prepared.encode("ascii")
    except UnicodeError:
        plain = None
    if plain is not None:
        # Skip to step 8.
        return checked(plain)

    # Step 5: Check ACE prefix
    if prepared.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    return checked(ace_prefix + prepared.encode("punycode"))
def ToUnicode(label):
    """Convert one label back to unicode (IDNA ToUnicode).

    A label without the ACE prefix is returned as unicode unchanged
    (after nameprep, if it was not ASCII to begin with).  A label with
    the prefix is punycode-decoded, and the decoded text is accepted
    only if ToASCII maps it back to the lower-cased input.

    Returns the decoded unicode label.
    Raises UnicodeError if the label is not ASCII even after nameprep,
    or if the decoded label does not round-trip through ToASCII.
    """
    # Step 1: Check for ASCII
    needs_nameprep = False
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_nameprep = True

    if needs_nameprep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    reencoded = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # reencoded will already be in lower case.
    if label.lower() == reencoded:
        # Step 8: return the result of step 5
        return decoded
    raise UnicodeError("IDNA does not round-trip", label, reencoded)
### Codec APIs
class Codec(codecs.Codec):
    """IDNA codec working on whole domain names.

    Both directions split the name into labels, convert every label
    with ToASCII / ToUnicode and join the results with U+002E.  A single
    trailing dot on the input is carried over to the output.  Only the
    'strict' error handler is accepted.
    """

    def encode(self, input, errors='strict'):
        """Return ``(encoded_name, len(input))`` for a unicode name."""
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        # An empty last label means the name ended in a separator.
        trailing_dot = ''
        if labels and not labels[-1]:
            trailing_dot = '.'
            labels.pop()

        encoded = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(encoded) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Return ``(decoded_name, len(input))`` for an encoded name."""
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Result is discarded: the call only verifies the input is ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = [ToUnicode(label) for label in labels]
        return u".".join(decoded) + trailing_dot, len(input)
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder.

    Only labels known to be complete are encoded.  Unless *final* is
    true, the last label of the input is assumed to be unfinished and is
    not reported as consumed, so it can be presented again together with
    the following input.
    """

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels of *input*.

        Returns ``(encoded, consumed)``: the encoded text as a plain
        (str) string and the number of input characters it covers.
        Raises UnicodeError for any error handler other than 'strict'
        and for labels rejected by ToASCII.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # Must be a plain string: starting from u'' would promote the
        # joined result to unicode whenever no dot gets appended, making
        # the encoder's return type depend on the input.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                # Input ended in a separator: every label is complete.
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    # The separator before the withheld label is consumed.
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator preceding this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder.

    Only labels known to be complete are decoded.  Unless *final* is
    true, the last label of the input is assumed to be unfinished and is
    not reported as consumed, so it can be presented again together with
    the following input.
    """

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels of *input*.

        Returns ``(decoded, consumed)``: the decoded unicode text and the
        number of input characters it covers.
        Raises UnicodeError for any error handler other than 'strict'
        and for labels rejected by ToUnicode.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Result is discarded: the call only verifies the input is ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                # Input ended in a separator: every label is complete.
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    # The separator before the withheld label is consumed.
                    trailing_dot = u'.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            result.append(ToUnicode(label))
            # Count one separator before every label but the first.  This
            # is keyed on the label's position, not on the running size:
            # ToUnicode accepts empty labels, and after a leading empty
            # label the size is still 0 although a separator was consumed.
            if index:
                size += 1
            size += len(label)

        result = u".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
class StreamWriter(Codec, codecs.StreamWriter):
    """codecs.StreamWriter combined with this module's Codec.

    Codec comes first in the bases, so its encode()/decode() are the
    ones found by method resolution.
    """
class StreamReader(Codec, codecs.StreamReader):
    """codecs.StreamReader combined with this module's Codec.

    Codec comes first in the bases, so its encode()/decode() are the
    ones found by method resolution.
    """
### encodings module API
def getregentry():
    """Return the codecs.CodecInfo entry describing the 'idna' codec."""
    # Separate Codec instances for each direction, as bound methods.
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )