mirror of
https://github.com/python/cpython.git
synced 2025-11-03 03:22:27 +00:00
Implement names for CJK unified ideographs. Add name to KeyError output.
Verify that the lookup for an existing name succeeds.
This commit is contained in:
parent
8579efc86c
commit
ef7fe2e813
4 changed files with 59 additions and 8 deletions
|
|
@ -2,7 +2,8 @@ test_ucn
|
|||
Testing General Unicode Character Name, and case insensitivity... done.
|
||||
Testing name to code mapping.... done.
|
||||
Testing hangul syllable names.... done.
|
||||
Testing code to name mapping for all characters.... done.
|
||||
Found 22728 characters in the unicode name database
|
||||
Testing names of CJK unified ideographs.... done.
|
||||
Testing code to name mapping for all BMP characters.... done.
|
||||
Found 50212 characters in the unicode name database
|
||||
Testing misc. symbols for unicode character name expansion.... done.
|
||||
Testing unicode character name expansion strict error handling.... done.
|
||||
|
|
|
|||
|
|
@ -80,16 +80,28 @@ else:
|
|||
raise AssertionError, "Found name for U+D7A4"
|
||||
print "done."
|
||||
|
||||
print "Testing code to name mapping for all characters....",
|
||||
print "Testing names of CJK unified ideographs....",
|
||||
exec r"""
|
||||
verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400")
|
||||
verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5")
|
||||
verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00")
|
||||
verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5")
|
||||
verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000")
|
||||
verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6")
|
||||
"""
|
||||
print "done."
|
||||
|
||||
print "Testing code to name mapping for all BMP characters....",
|
||||
count = 0
|
||||
for code in range(65536):
|
||||
for code in range(0x10000):
|
||||
try:
|
||||
char = unichr(code)
|
||||
name = unicodedata.name(char)
|
||||
verify(unicodedata.lookup(name) == char)
|
||||
count += 1
|
||||
except (KeyError, ValueError):
|
||||
pass
|
||||
else:
|
||||
verify(unicodedata.lookup(name) == char)
|
||||
count += 1
|
||||
print "done."
|
||||
|
||||
print "Found", count, "characters in the unicode name database"
|
||||
|
|
|
|||
|
|
@ -318,7 +318,7 @@ Extension modules
|
|||
is now named bsddb185.
|
||||
|
||||
- unicodedata was updated to Unicode 3.2. It now also supports names
|
||||
for Hangul syllables.
|
||||
for Hangul syllables and CJK unified ideographs.
|
||||
|
||||
- resource.getrlimit() now returns longs instead of ints.
|
||||
|
||||
|
|
|
|||
|
|
@ -348,6 +348,16 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
|
|||
return 1;
|
||||
}
|
||||
|
||||
if ((0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
|
||||
(0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
|
||||
(0x20000 <= code && code <= 0x2A6D6)) {/* CJK Ideograph Extension B */
|
||||
if (buflen < 28)
|
||||
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
|
||||
return 0;
|
||||
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (code >= 0x110000)
|
||||
return 0;
|
||||
|
||||
|
|
@ -449,6 +459,30 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
|
|||
*code = SBase + (L*VCount+V)*TCount + T;
|
||||
return 1;
|
||||
}
|
||||
/* Otherwise, it's an illegal syllable name. */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Check for unified ideographs. */
|
||||
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
|
||||
/* Four or five hex digits (0-9, uppercase A-F) must follow. */
|
||||
v = 0;
|
||||
name += 22;
|
||||
namelen -= 22;
|
||||
if (namelen != 4 && namelen != 5)
|
||||
return 0;
|
||||
while (namelen--) {
|
||||
v *= 16;
|
||||
if (*name >= '0' && *name <= '9')
|
||||
v += *name - '0';
|
||||
else if (*name >= 'A' && *name <= 'F')
|
||||
v += *name - 'A' + 10;
|
||||
else
|
||||
return 0;
|
||||
name++;
|
||||
}
|
||||
*code = v;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* the following is the same as python's dictionary lookup, with
|
||||
|
|
@ -535,7 +569,11 @@ unicodedata_lookup(PyObject* self, PyObject* args)
|
|||
return NULL;
|
||||
|
||||
if (!_getcode(name, namelen, &code)) {
|
||||
PyErr_SetString(PyExc_KeyError, "undefined character name");
|
||||
char fmt[] = "undefined character name '%s'";
|
||||
char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
|
||||
sprintf(buf, fmt, name);
|
||||
PyErr_SetString(PyExc_KeyError, buf);
|
||||
PyMem_FREE(buf);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue