mirror of
https://github.com/python/cpython.git
synced 2025-08-25 11:15:02 +00:00
Issue #10459: Update CJK character names to Unicode 6.0.
This commit is contained in:
parent
249d7e3c2e
commit
5cbc71e50a
4 changed files with 33 additions and 7 deletions
|
@ -88,9 +88,13 @@ class UnicodeNamesTest(unittest.TestCase):
|
||||||
self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
|
self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
|
||||||
self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
|
self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
|
||||||
self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
|
self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
|
||||||
self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", "\u9fa5")
|
self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
|
||||||
self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
|
self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
|
||||||
self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
|
self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
|
||||||
|
self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
|
||||||
|
self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
|
||||||
|
self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
|
||||||
|
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
|
||||||
|
|
||||||
def test_bmp_characters(self):
|
def test_bmp_characters(self):
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
|
@ -32,6 +32,8 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #10459: Update CJK character names to Unicode 6.0.
|
||||||
|
|
||||||
- Issue #4493: urllib.request adds '/' in front of path components which does not
|
- Issue #4493: urllib.request adds '/' in front of path components which does not
|
||||||
start with '/'. Common behavior exhibited by browsers and other clients.
|
start with '/'. Common behavior exhibited by browsers and other clients.
|
||||||
|
|
||||||
|
|
|
@ -866,13 +866,16 @@ static char *hangul_syllables[][3] = {
|
||||||
{ 0, 0, "H" }
|
{ 0, 0, "H" }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
|
||||||
static int
|
static int
|
||||||
is_unified_ideograph(Py_UCS4 code)
|
is_unified_ideograph(Py_UCS4 code)
|
||||||
{
|
{
|
||||||
return (
|
return
|
||||||
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
|
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
|
||||||
(0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
|
(0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
|
||||||
(0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
|
(0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
|
||||||
|
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
|
||||||
|
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
|
|
@ -70,6 +70,15 @@ PRINTABLE_MASK = 0x400
|
||||||
NODELTA_MASK = 0x800
|
NODELTA_MASK = 0x800
|
||||||
NUMERIC_MASK = 0x1000
|
NUMERIC_MASK = 0x1000
|
||||||
|
|
||||||
|
# these ranges need to match unicodedata.c:is_unified_ideograph
|
||||||
|
cjk_ranges = [
|
||||||
|
('3400', '4DB5'),
|
||||||
|
('4E00', '9FCB'),
|
||||||
|
('20000', '2A6D6'),
|
||||||
|
('2A700', '2B734'),
|
||||||
|
('2B740', '2B81D')
|
||||||
|
]
|
||||||
|
|
||||||
def maketables(trace=0):
|
def maketables(trace=0):
|
||||||
|
|
||||||
print("--- Reading", UNICODE_DATA % "", "...")
|
print("--- Reading", UNICODE_DATA % "", "...")
|
||||||
|
@ -81,7 +90,7 @@ def maketables(trace=0):
|
||||||
|
|
||||||
for version in old_versions:
|
for version in old_versions:
|
||||||
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
|
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
|
||||||
old_unicode = UnicodeData(version)
|
old_unicode = UnicodeData(version, cjk_check=False)
|
||||||
print(len(list(filter(None, old_unicode.table))), "characters")
|
print(len(list(filter(None, old_unicode.table))), "characters")
|
||||||
merge_old_version(version, unicode, old_unicode)
|
merge_old_version(version, unicode, old_unicode)
|
||||||
|
|
||||||
|
@ -804,7 +813,8 @@ class UnicodeData:
|
||||||
|
|
||||||
def __init__(self, version,
|
def __init__(self, version,
|
||||||
linebreakprops=False,
|
linebreakprops=False,
|
||||||
expand=1):
|
expand=1,
|
||||||
|
cjk_check=True):
|
||||||
self.changed = []
|
self.changed = []
|
||||||
file = open_data(UNICODE_DATA, version)
|
file = open_data(UNICODE_DATA, version)
|
||||||
table = [None] * 0x110000
|
table = [None] * 0x110000
|
||||||
|
@ -816,6 +826,8 @@ class UnicodeData:
|
||||||
char = int(s[0], 16)
|
char = int(s[0], 16)
|
||||||
table[char] = s
|
table[char] = s
|
||||||
|
|
||||||
|
cjk_ranges_found = []
|
||||||
|
|
||||||
# expand first-last ranges
|
# expand first-last ranges
|
||||||
if expand:
|
if expand:
|
||||||
field = None
|
field = None
|
||||||
|
@ -826,12 +838,17 @@ class UnicodeData:
|
||||||
s[1] = ""
|
s[1] = ""
|
||||||
field = s
|
field = s
|
||||||
elif s[1][-5:] == "Last>":
|
elif s[1][-5:] == "Last>":
|
||||||
|
if s[1].startswith("<CJK Ideograph"):
|
||||||
|
cjk_ranges_found.append((field[0],
|
||||||
|
s[0]))
|
||||||
s[1] = ""
|
s[1] = ""
|
||||||
field = None
|
field = None
|
||||||
elif field:
|
elif field:
|
||||||
f2 = field[:]
|
f2 = field[:]
|
||||||
f2[0] = "%X" % i
|
f2[0] = "%X" % i
|
||||||
table[i] = f2
|
table[i] = f2
|
||||||
|
if cjk_check and cjk_ranges != cjk_ranges_found:
|
||||||
|
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
|
||||||
|
|
||||||
# public attributes
|
# public attributes
|
||||||
self.filename = UNICODE_DATA % ''
|
self.filename = UNICODE_DATA % ''
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue