mirror of
https://github.com/python/cpython.git
synced 2025-10-15 19:29:46 +00:00
Fix punycode codec and tests.
This commit is contained in:
parent
5c4501af57
commit
a4c612845a
2 changed files with 43 additions and 43 deletions
|
@ -10,15 +10,15 @@ import codecs
|
||||||
|
|
||||||
def segregate(str):
    """3.1 Basic code point segregation

    Split *str* into its basic (ASCII) and extended (non-ASCII) parts.

    Returns a 2-tuple ``(base, extended)``:

    * ``base`` -- a ``bytes`` object holding, in their original order,
      every character of *str* whose code point is below 128.
    * ``extended`` -- a sorted list of the distinct characters of *str*
      whose code point is 128 or above.
    """
    # bytes objects are immutable, so accumulate into a bytearray and
    # convert once at the end.
    base = bytearray()
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(ord(c))
        else:
            extended.add(c)
    extended = sorted(extended)
    return bytes(base), extended
|
||||||
|
|
||||||
def selective_len(str, max):
|
def selective_len(str, max):
|
||||||
"""Return the length of str, considering only characters below max."""
|
"""Return the length of str, considering only characters below max."""
|
||||||
|
@ -75,10 +75,10 @@ def T(j, bias):
|
||||||
if res > 26: return 26
|
if res > 26: return 26
|
||||||
return res
|
return res
|
||||||
|
|
||||||
digits = "abcdefghijklmnopqrstuvwxyz0123456789"
|
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"
|
||||||
def generate_generalized_integer(N, bias):
|
def generate_generalized_integer(N, bias):
|
||||||
"""3.3 Generalized variable-length integers"""
|
"""3.3 Generalized variable-length integers"""
|
||||||
result = []
|
result = b""
|
||||||
j = 0
|
j = 0
|
||||||
while 1:
|
while 1:
|
||||||
t = T(j, bias)
|
t = T(j, bias)
|
||||||
|
@ -107,21 +107,20 @@ def adapt(delta, first, numchars):
|
||||||
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta in *deltas* as a generalized variable-length
    integer and concatenate the encodings.

    *baselen* is the number of basic code points already emitted; it is
    combined with the running delta index to form the ``numchars``
    argument passed to adapt().

    Returns the concatenated encodings as a ``bytes`` object (empty when
    *deltas* is empty).
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    # bytes objects are immutable, so accumulate into a bytearray.
    result = bytearray()
    bias = 72
    for points, delta in enumerate(deltas):
        s = generate_generalized_integer(delta, bias)
        result.extend(s)
        # The bias is re-adapted after every delta; the second argument
        # flags the very first delta.
        bias = adapt(delta, points == 0, baselen + points + 1)
    return bytes(result)
|
||||||
|
|
||||||
def punycode_encode(text):
    """Encode the string *text* and return the result as a bytes-like object.

    The basic (ASCII) characters of *text* come first, followed by a
    ``b"-"`` delimiter and the encoded deltas for the extended
    characters.  When *text* contains no basic characters, neither the
    basic part nor the delimiter is emitted.
    """
    # segregate() already yields the basic part in encoded (bytes) form,
    # so no further .encode("ascii") step is needed here.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + b"-" + extended
    return extended
|
||||||
|
|
||||||
##################### Decoding #####################################
|
##################### Decoding #####################################
|
||||||
|
@ -182,15 +181,13 @@ def insertion_sort(base, extended, errors):
|
||||||
return base
|
return base
|
||||||
|
|
||||||
def punycode_decode(text, errors):
    """Decode the bytes-like *text* and return what insertion_sort() produces.

    *text* is split at its last ``b"-"``: everything before it is the
    basic part, everything after it is the extended part.  If there is
    no ``b"-"`` at all, the whole input is treated as the extended part
    and the basic part is empty.

    *errors* is the codec error-handling scheme; it is applied when
    decoding the basic part and is passed on to insertion_sort().
    """
    pos = text.rfind(b"-")
    if pos == -1:
        base = ""
        extended = str(text, "ascii").upper()
    else:
        base = str(text[:pos], "ascii", errors)
        # The extended part is upper-cased before being handed on.
        extended = str(text[pos + 1:], "ascii").upper()
    return insertion_sort(base, extended, errors)
|
||||||
|
|
||||||
### Codec APIs
|
### Codec APIs
|
||||||
|
|
|
@ -505,48 +505,48 @@ punycode_testcases = [
|
||||||
# A Arabic (Egyptian):
|
# A Arabic (Egyptian):
|
||||||
("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
|
("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
|
||||||
"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
|
"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
|
||||||
"egbpdaj6bu4bxfgehfvwxn"),
|
b"egbpdaj6bu4bxfgehfvwxn"),
|
||||||
# B Chinese (simplified):
|
# B Chinese (simplified):
|
||||||
("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
|
("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
|
||||||
"ihqwcrb4cv8a8dqg056pqjye"),
|
b"ihqwcrb4cv8a8dqg056pqjye"),
|
||||||
# C Chinese (traditional):
|
# C Chinese (traditional):
|
||||||
("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
|
("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
|
||||||
"ihqwctvzc91f659drss3x8bo0yb"),
|
b"ihqwctvzc91f659drss3x8bo0yb"),
|
||||||
# D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
|
# D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
|
||||||
("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
|
("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
|
||||||
"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
|
"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
|
||||||
"\u0065\u0073\u006B\u0079",
|
"\u0065\u0073\u006B\u0079",
|
||||||
"Proprostnemluvesky-uyb24dma41a"),
|
b"Proprostnemluvesky-uyb24dma41a"),
|
||||||
# E Hebrew:
|
# E Hebrew:
|
||||||
("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
|
("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
|
||||||
"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
|
"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
|
||||||
"\u05D1\u05E8\u05D9\u05EA",
|
"\u05D1\u05E8\u05D9\u05EA",
|
||||||
"4dbcagdahymbxekheh6e0a7fei0b"),
|
b"4dbcagdahymbxekheh6e0a7fei0b"),
|
||||||
# F Hindi (Devanagari):
|
# F Hindi (Devanagari):
|
||||||
("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
|
("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
|
||||||
"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
|
"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
|
||||||
"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
|
"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
|
||||||
"\u0939\u0948\u0902",
|
"\u0939\u0948\u0902",
|
||||||
"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
|
b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
|
||||||
|
|
||||||
#(G) Japanese (kanji and hiragana):
|
#(G) Japanese (kanji and hiragana):
|
||||||
("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
|
("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
|
||||||
"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
|
"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
|
||||||
"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
|
b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
|
||||||
|
|
||||||
# (H) Korean (Hangul syllables):
|
# (H) Korean (Hangul syllables):
|
||||||
("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
|
("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
|
||||||
"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
|
"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
|
||||||
"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
|
"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
|
||||||
"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
|
b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
|
||||||
"psd879ccm6fea98c"),
|
b"psd879ccm6fea98c"),
|
||||||
|
|
||||||
# (I) Russian (Cyrillic):
|
# (I) Russian (Cyrillic):
|
||||||
("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
|
("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
|
||||||
"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
|
"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
|
||||||
"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
|
"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
|
||||||
"\u0438",
|
"\u0438",
|
||||||
"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
|
b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
|
||||||
|
|
||||||
# (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
|
# (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
|
||||||
("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
|
("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
|
||||||
|
@ -554,7 +554,7 @@ punycode_testcases = [
|
||||||
"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
|
"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
|
||||||
"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
|
"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
|
||||||
"\u0061\u00F1\u006F\u006C",
|
"\u0061\u00F1\u006F\u006C",
|
||||||
"PorqunopuedensimplementehablarenEspaol-fmd56a"),
|
b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
|
||||||
|
|
||||||
# (K) Vietnamese:
|
# (K) Vietnamese:
|
||||||
# T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
|
# T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
|
||||||
|
@ -563,45 +563,45 @@ punycode_testcases = [
|
||||||
"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
|
"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
|
||||||
"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
|
"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
|
||||||
"\u0056\u0069\u1EC7\u0074",
|
"\u0056\u0069\u1EC7\u0074",
|
||||||
"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
|
b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
|
||||||
|
|
||||||
#(L) 3<nen>B<gumi><kinpachi><sensei>
|
#(L) 3<nen>B<gumi><kinpachi><sensei>
|
||||||
("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
|
("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
|
||||||
"3B-ww4c5e180e575a65lsy2b"),
|
b"3B-ww4c5e180e575a65lsy2b"),
|
||||||
|
|
||||||
# (M) <amuro><namie>-with-SUPER-MONKEYS
|
# (M) <amuro><namie>-with-SUPER-MONKEYS
|
||||||
("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
|
("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
|
||||||
"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
|
"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
|
||||||
"\u004F\u004E\u004B\u0045\u0059\u0053",
|
"\u004F\u004E\u004B\u0045\u0059\u0053",
|
||||||
"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
|
b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
|
||||||
|
|
||||||
# (N) Hello-Another-Way-<sorezore><no><basho>
|
# (N) Hello-Another-Way-<sorezore><no><basho>
|
||||||
("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
|
("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
|
||||||
"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
|
"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
|
||||||
"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
|
"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
|
||||||
"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
|
b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
|
||||||
|
|
||||||
# (O) <hitotsu><yane><no><shita>2
|
# (O) <hitotsu><yane><no><shita>2
|
||||||
("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
|
("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
|
||||||
"2-u9tlzr9756bt3uc0v"),
|
b"2-u9tlzr9756bt3uc0v"),
|
||||||
|
|
||||||
# (P) Maji<de>Koi<suru>5<byou><mae>
|
# (P) Maji<de>Koi<suru>5<byou><mae>
|
||||||
("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
|
("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
|
||||||
"\u308B\u0035\u79D2\u524D",
|
"\u308B\u0035\u79D2\u524D",
|
||||||
"MajiKoi5-783gue6qz075azm5e"),
|
b"MajiKoi5-783gue6qz075azm5e"),
|
||||||
|
|
||||||
# (Q) <pafii>de<runba>
|
# (Q) <pafii>de<runba>
|
||||||
("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
|
("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
|
||||||
"de-jg4avhby1noc0d"),
|
b"de-jg4avhby1noc0d"),
|
||||||
|
|
||||||
# (R) <sono><supiido><de>
|
# (R) <sono><supiido><de>
|
||||||
("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
|
("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
|
||||||
"d9juau41awczczp"),
|
b"d9juau41awczczp"),
|
||||||
|
|
||||||
# (S) -> $1.00 <-
|
# (S) -> $1.00 <-
|
||||||
("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
|
("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
|
||||||
"\u003C\u002D",
|
"\u003C\u002D",
|
||||||
"-> $1.00 <--")
|
b"-> $1.00 <--")
|
||||||
]
|
]
|
||||||
|
|
||||||
for i in punycode_testcases:
|
for i in punycode_testcases:
|
||||||
|
@ -616,7 +616,10 @@ class PunycodeTest(unittest.TestCase):
|
||||||
# code produces only lower case. Converting just puny to
|
# code produces only lower case. Converting just puny to
|
||||||
# lower is also insufficient, since some of the input characters
|
# lower is also insufficient, since some of the input characters
|
||||||
# are upper case.
|
# are upper case.
|
||||||
self.assertEquals(uni.encode("punycode").lower(), puny.lower())
|
self.assertEquals(
|
||||||
|
str(uni.encode("punycode"), "ascii").lower(),
|
||||||
|
str(puny, "ascii").lower()
|
||||||
|
)
|
||||||
|
|
||||||
def test_decode(self):
|
def test_decode(self):
|
||||||
for uni, puny in punycode_testcases:
|
for uni, puny in punycode_testcases:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue