mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 03:44:55 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			309 lines
		
	
	
	
		
			9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			309 lines
		
	
	
	
		
			9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
 | 
						|
 | 
						|
import stringprep, re, codecs
 | 
						|
from unicodedata import ucd_3_2_0 as unicodedata
 | 
						|
 | 
						|
# IDNA section 3.1
 | 
						|
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
 | 
						|
 | 
						|
# IDNA section 5
 | 
						|
ace_prefix = b"xn--"
 | 
						|
sace_prefix = "xn--"
 | 
						|
 | 
						|
# This assumes query strings, so AllowUnassigned is true
 | 
						|
def nameprep(label):
 | 
						|
    # Map
 | 
						|
    newlabel = []
 | 
						|
    for c in label:
 | 
						|
        if stringprep.in_table_b1(c):
 | 
						|
            # Map to nothing
 | 
						|
            continue
 | 
						|
        newlabel.append(stringprep.map_table_b2(c))
 | 
						|
    label = "".join(newlabel)
 | 
						|
 | 
						|
    # Normalize
 | 
						|
    label = unicodedata.normalize("NFKC", label)
 | 
						|
 | 
						|
    # Prohibit
 | 
						|
    for c in label:
 | 
						|
        if stringprep.in_table_c12(c) or \
 | 
						|
           stringprep.in_table_c22(c) or \
 | 
						|
           stringprep.in_table_c3(c) or \
 | 
						|
           stringprep.in_table_c4(c) or \
 | 
						|
           stringprep.in_table_c5(c) or \
 | 
						|
           stringprep.in_table_c6(c) or \
 | 
						|
           stringprep.in_table_c7(c) or \
 | 
						|
           stringprep.in_table_c8(c) or \
 | 
						|
           stringprep.in_table_c9(c):
 | 
						|
            raise UnicodeError("Invalid character %r" % c)
 | 
						|
 | 
						|
    # Check bidi
 | 
						|
    RandAL = [stringprep.in_table_d1(x) for x in label]
 | 
						|
    for c in RandAL:
 | 
						|
        if c:
 | 
						|
            # There is a RandAL char in the string. Must perform further
 | 
						|
            # tests:
 | 
						|
            # 1) The characters in section 5.8 MUST be prohibited.
 | 
						|
            # This is table C.8, which was already checked
 | 
						|
            # 2) If a string contains any RandALCat character, the string
 | 
						|
            # MUST NOT contain any LCat character.
 | 
						|
            if any(stringprep.in_table_d2(x) for x in label):
 | 
						|
                raise UnicodeError("Violation of BIDI requirement 2")
 | 
						|
 | 
						|
            # 3) If a string contains any RandALCat character, a
 | 
						|
            # RandALCat character MUST be the first character of the
 | 
						|
            # string, and a RandALCat character MUST be the last
 | 
						|
            # character of the string.
 | 
						|
            if not RandAL[0] or not RandAL[-1]:
 | 
						|
                raise UnicodeError("Violation of BIDI requirement 3")
 | 
						|
 | 
						|
    return label
 | 
						|
 | 
						|
def ToASCII(label):
 | 
						|
    try:
 | 
						|
        # Step 1: try ASCII
 | 
						|
        label = label.encode("ascii")
 | 
						|
    except UnicodeError:
 | 
						|
        pass
 | 
						|
    else:
 | 
						|
        # Skip to step 3: UseSTD3ASCIIRules is false, so
 | 
						|
        # Skip to step 8.
 | 
						|
        if 0 < len(label) < 64:
 | 
						|
            return label
 | 
						|
        raise UnicodeError("label empty or too long")
 | 
						|
 | 
						|
    # Step 2: nameprep
 | 
						|
    label = nameprep(label)
 | 
						|
 | 
						|
    # Step 3: UseSTD3ASCIIRules is false
 | 
						|
    # Step 4: try ASCII
 | 
						|
    try:
 | 
						|
        label = label.encode("ascii")
 | 
						|
    except UnicodeError:
 | 
						|
        pass
 | 
						|
    else:
 | 
						|
        # Skip to step 8.
 | 
						|
        if 0 < len(label) < 64:
 | 
						|
            return label
 | 
						|
        raise UnicodeError("label empty or too long")
 | 
						|
 | 
						|
    # Step 5: Check ACE prefix
 | 
						|
    if label.startswith(sace_prefix):
 | 
						|
        raise UnicodeError("Label starts with ACE prefix")
 | 
						|
 | 
						|
    # Step 6: Encode with PUNYCODE
 | 
						|
    label = label.encode("punycode")
 | 
						|
 | 
						|
    # Step 7: Prepend ACE prefix
 | 
						|
    label = ace_prefix + label
 | 
						|
 | 
						|
    # Step 8: Check size
 | 
						|
    if 0 < len(label) < 64:
 | 
						|
        return label
 | 
						|
    raise UnicodeError("label empty or too long")
 | 
						|
 | 
						|
def ToUnicode(label):
 | 
						|
    # Step 1: Check for ASCII
 | 
						|
    if isinstance(label, bytes):
 | 
						|
        pure_ascii = True
 | 
						|
    else:
 | 
						|
        try:
 | 
						|
            label = label.encode("ascii")
 | 
						|
            pure_ascii = True
 | 
						|
        except UnicodeError:
 | 
						|
            pure_ascii = False
 | 
						|
    if not pure_ascii:
 | 
						|
        # Step 2: Perform nameprep
 | 
						|
        label = nameprep(label)
 | 
						|
        # It doesn't say this, but apparently, it should be ASCII now
 | 
						|
        try:
 | 
						|
            label = label.encode("ascii")
 | 
						|
        except UnicodeError:
 | 
						|
            raise UnicodeError("Invalid character in IDN label")
 | 
						|
    # Step 3: Check for ACE prefix
 | 
						|
    if not label.startswith(ace_prefix):
 | 
						|
        return str(label, "ascii")
 | 
						|
 | 
						|
    # Step 4: Remove ACE prefix
 | 
						|
    label1 = label[len(ace_prefix):]
 | 
						|
 | 
						|
    # Step 5: Decode using PUNYCODE
 | 
						|
    result = label1.decode("punycode")
 | 
						|
 | 
						|
    # Step 6: Apply ToASCII
 | 
						|
    label2 = ToASCII(result)
 | 
						|
 | 
						|
    # Step 7: Compare the result of step 6 with the one of step 3
 | 
						|
    # label2 will already be in lower case.
 | 
						|
    if str(label, "ascii").lower() != str(label2, "ascii"):
 | 
						|
        raise UnicodeError("IDNA does not round-trip", label, label2)
 | 
						|
 | 
						|
    # Step 8: return the result of step 5
 | 
						|
    return result
 | 
						|
 | 
						|
### Codec APIs
 | 
						|
 | 
						|
class Codec(codecs.Codec):
 | 
						|
    def encode(self, input, errors='strict'):
 | 
						|
 | 
						|
        if errors != 'strict':
 | 
						|
            # IDNA is quite clear that implementations must be strict
 | 
						|
            raise UnicodeError("unsupported error handling "+errors)
 | 
						|
 | 
						|
        if not input:
 | 
						|
            return b'', 0
 | 
						|
 | 
						|
        try:
 | 
						|
            result = input.encode('ascii')
 | 
						|
        except UnicodeEncodeError:
 | 
						|
            pass
 | 
						|
        else:
 | 
						|
            # ASCII name: fast path
 | 
						|
            labels = result.split(b'.')
 | 
						|
            for label in labels[:-1]:
 | 
						|
                if not (0 < len(label) < 64):
 | 
						|
                    raise UnicodeError("label empty or too long")
 | 
						|
            if len(labels[-1]) >= 64:
 | 
						|
                raise UnicodeError("label too long")
 | 
						|
            return result, len(input)
 | 
						|
 | 
						|
        result = bytearray()
 | 
						|
        labels = dots.split(input)
 | 
						|
        if labels and not labels[-1]:
 | 
						|
            trailing_dot = b'.'
 | 
						|
            del labels[-1]
 | 
						|
        else:
 | 
						|
            trailing_dot = b''
 | 
						|
        for label in labels:
 | 
						|
            if result:
 | 
						|
                # Join with U+002E
 | 
						|
                result.extend(b'.')
 | 
						|
            result.extend(ToASCII(label))
 | 
						|
        return bytes(result+trailing_dot), len(input)
 | 
						|
 | 
						|
    def decode(self, input, errors='strict'):
 | 
						|
 | 
						|
        if errors != 'strict':
 | 
						|
            raise UnicodeError("Unsupported error handling "+errors)
 | 
						|
 | 
						|
        if not input:
 | 
						|
            return "", 0
 | 
						|
 | 
						|
        # IDNA allows decoding to operate on Unicode strings, too.
 | 
						|
        if not isinstance(input, bytes):
 | 
						|
            # XXX obviously wrong, see #3232
 | 
						|
            input = bytes(input)
 | 
						|
 | 
						|
        if ace_prefix not in input:
 | 
						|
            # Fast path
 | 
						|
            try:
 | 
						|
                return input.decode('ascii'), len(input)
 | 
						|
            except UnicodeDecodeError:
 | 
						|
                pass
 | 
						|
 | 
						|
        labels = input.split(b".")
 | 
						|
 | 
						|
        if labels and len(labels[-1]) == 0:
 | 
						|
            trailing_dot = '.'
 | 
						|
            del labels[-1]
 | 
						|
        else:
 | 
						|
            trailing_dot = ''
 | 
						|
 | 
						|
        result = []
 | 
						|
        for label in labels:
 | 
						|
            result.append(ToUnicode(label))
 | 
						|
 | 
						|
        return ".".join(result)+trailing_dot, len(input)
 | 
						|
 | 
						|
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
 | 
						|
    def _buffer_encode(self, input, errors, final):
 | 
						|
        if errors != 'strict':
 | 
						|
            # IDNA is quite clear that implementations must be strict
 | 
						|
            raise UnicodeError("unsupported error handling "+errors)
 | 
						|
 | 
						|
        if not input:
 | 
						|
            return (b'', 0)
 | 
						|
 | 
						|
        labels = dots.split(input)
 | 
						|
        trailing_dot = b''
 | 
						|
        if labels:
 | 
						|
            if not labels[-1]:
 | 
						|
                trailing_dot = b'.'
 | 
						|
                del labels[-1]
 | 
						|
            elif not final:
 | 
						|
                # Keep potentially unfinished label until the next call
 | 
						|
                del labels[-1]
 | 
						|
                if labels:
 | 
						|
                    trailing_dot = b'.'
 | 
						|
 | 
						|
        result = bytearray()
 | 
						|
        size = 0
 | 
						|
        for label in labels:
 | 
						|
            if size:
 | 
						|
                # Join with U+002E
 | 
						|
                result.extend(b'.')
 | 
						|
                size += 1
 | 
						|
            result.extend(ToASCII(label))
 | 
						|
            size += len(label)
 | 
						|
 | 
						|
        result += trailing_dot
 | 
						|
        size += len(trailing_dot)
 | 
						|
        return (bytes(result), size)
 | 
						|
 | 
						|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
 | 
						|
    def _buffer_decode(self, input, errors, final):
 | 
						|
        if errors != 'strict':
 | 
						|
            raise UnicodeError("Unsupported error handling "+errors)
 | 
						|
 | 
						|
        if not input:
 | 
						|
            return ("", 0)
 | 
						|
 | 
						|
        # IDNA allows decoding to operate on Unicode strings, too.
 | 
						|
        if isinstance(input, str):
 | 
						|
            labels = dots.split(input)
 | 
						|
        else:
 | 
						|
            # Must be ASCII string
 | 
						|
            input = str(input, "ascii")
 | 
						|
            labels = input.split(".")
 | 
						|
 | 
						|
        trailing_dot = ''
 | 
						|
        if labels:
 | 
						|
            if not labels[-1]:
 | 
						|
                trailing_dot = '.'
 | 
						|
                del labels[-1]
 | 
						|
            elif not final:
 | 
						|
                # Keep potentially unfinished label until the next call
 | 
						|
                del labels[-1]
 | 
						|
                if labels:
 | 
						|
                    trailing_dot = '.'
 | 
						|
 | 
						|
        result = []
 | 
						|
        size = 0
 | 
						|
        for label in labels:
 | 
						|
            result.append(ToUnicode(label))
 | 
						|
            if size:
 | 
						|
                size += 1
 | 
						|
            size += len(label)
 | 
						|
 | 
						|
        result = ".".join(result) + trailing_dot
 | 
						|
        size += len(trailing_dot)
 | 
						|
        return (result, size)
 | 
						|
 | 
						|
class StreamWriter(Codec,codecs.StreamWriter):
 | 
						|
    pass
 | 
						|
 | 
						|
class StreamReader(Codec,codecs.StreamReader):
 | 
						|
    pass
 | 
						|
 | 
						|
### encodings module API
 | 
						|
 | 
						|
def getregentry():
 | 
						|
    return codecs.CodecInfo(
 | 
						|
        name='idna',
 | 
						|
        encode=Codec().encode,
 | 
						|
        decode=Codec().decode,
 | 
						|
        incrementalencoder=IncrementalEncoder,
 | 
						|
        incrementaldecoder=IncrementalDecoder,
 | 
						|
        streamwriter=StreamWriter,
 | 
						|
        streamreader=StreamReader,
 | 
						|
    )
 |