mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
#12753: Add support for Unicode name aliases and named sequences.
This commit is contained in:
parent
3764a964ca
commit
931b8aac80
10 changed files with 18125 additions and 17194 deletions
|
@ -21,11 +21,17 @@
|
|||
# 2004-05-29 perky add east asian width information
|
||||
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
|
||||
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
|
||||
# 2011-10-21 ezio add support for name aliases and named sequences
|
||||
#
|
||||
# written by Fredrik Lundh (fredrik@pythonware.com)
|
||||
#
|
||||
|
||||
import sys, os, zipfile
|
||||
import os
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
from textwrap import dedent
|
||||
from operator import itemgetter
|
||||
|
||||
SCRIPT = sys.argv[0]
|
||||
VERSION = "3.2"
|
||||
|
@ -39,6 +45,17 @@ UNIHAN = "Unihan%s.zip"
|
|||
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
|
||||
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
|
||||
LINE_BREAK = "LineBreak%s.txt"
|
||||
NAME_ALIASES = "NameAliases%s.txt"
|
||||
NAMED_SEQUENCES = "NamedSequences%s.txt"
|
||||
|
||||
# Private Use Areas -- in planes 1, 15, 16
|
||||
PUA_1 = range(0xE000, 0xF900)
|
||||
PUA_15 = range(0xF0000, 0xFFFFE)
|
||||
PUA_16 = range(0x100000, 0x10FFFE)
|
||||
|
||||
# we use this ranges of PUA_15 to store name aliases and named sequences
|
||||
NAME_ALIASES_START = 0xF0000
|
||||
NAMED_SEQUENCES_START = 0xF0100
|
||||
|
||||
old_versions = ["3.2.0"]
|
||||
|
||||
|
@ -692,6 +709,39 @@ def makeunicodename(unicode, trace):
|
|||
print("/* name->code dictionary */", file=fp)
|
||||
codehash.dump(fp, trace)
|
||||
|
||||
print(file=fp)
|
||||
print('static const unsigned int aliases_start = %#x;' %
|
||||
NAME_ALIASES_START, file=fp)
|
||||
print('static const unsigned int aliases_end = %#x;' %
|
||||
(NAME_ALIASES_START + len(unicode.aliases)), file=fp)
|
||||
|
||||
print('static const unsigned int name_aliases[] = {', file=fp)
|
||||
for name, codepoint in unicode.aliases:
|
||||
print(' 0x%04X,' % codepoint, file=fp)
|
||||
print('};', file=fp)
|
||||
|
||||
# In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
|
||||
# so we are using Py_UCS2 seq[4]. This needs to be updated if longer
|
||||
# sequences or sequences with non-BMP chars are added.
|
||||
# unicodedata_lookup should be adapted too.
|
||||
print(dedent("""
|
||||
typedef struct NamedSequence {
|
||||
int seqlen;
|
||||
Py_UCS2 seq[4];
|
||||
} named_sequence;
|
||||
"""), file=fp)
|
||||
|
||||
print('static const unsigned int named_sequences_start = %#x;' %
|
||||
NAMED_SEQUENCES_START, file=fp)
|
||||
print('static const unsigned int named_sequences_end = %#x;' %
|
||||
(NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
|
||||
|
||||
print('static const named_sequence named_sequences[] = {', file=fp)
|
||||
for name, sequence in unicode.named_sequences:
|
||||
seq_str = ', '.join('0x%04X' % cp for cp in sequence)
|
||||
print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
|
||||
print('};', file=fp)
|
||||
|
||||
fp.close()
|
||||
|
||||
|
||||
|
@ -726,7 +776,11 @@ def merge_old_version(version, new, old):
|
|||
for k in range(len(old.table[i])):
|
||||
if old.table[i][k] != new.table[i][k]:
|
||||
value = old.table[i][k]
|
||||
if k == 2:
|
||||
if k == 1 and i in PUA_15:
|
||||
# the name is not set in the old.table, but in the
|
||||
# new.table we are using it for aliases and named seq
|
||||
assert value == ''
|
||||
elif k == 2:
|
||||
#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
|
||||
category_changes[i] = CATEGORY_NAMES.index(value)
|
||||
elif k == 4:
|
||||
|
@ -855,6 +909,50 @@ class UnicodeData:
|
|||
self.table = table
|
||||
self.chars = list(range(0x110000)) # unicode 3.2
|
||||
|
||||
# check for name aliases and named sequences, see #12753
|
||||
# aliases and named sequences are not in 3.2.0
|
||||
if version != '3.2.0':
|
||||
self.aliases = []
|
||||
# store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
|
||||
# in order to take advantage of the compression and lookup
|
||||
# algorithms used for the other characters
|
||||
pua_index = NAME_ALIASES_START
|
||||
with open_data(NAME_ALIASES, version) as file:
|
||||
for s in file:
|
||||
s = s.strip()
|
||||
if not s or s.startswith('#'):
|
||||
continue
|
||||
char, name = s.split(';')
|
||||
char = int(char, 16)
|
||||
self.aliases.append((name, char))
|
||||
# also store the name in the PUA 1
|
||||
self.table[pua_index][1] = name
|
||||
pua_index += 1
|
||||
assert pua_index - NAME_ALIASES_START == len(self.aliases)
|
||||
|
||||
self.named_sequences = []
|
||||
# store named seqences in the PUA 1, in range U+F0100..,
|
||||
# in order to take advantage of the compression and lookup
|
||||
# algorithms used for the other characters.
|
||||
|
||||
pua_index = NAMED_SEQUENCES_START
|
||||
with open_data(NAMED_SEQUENCES, version) as file:
|
||||
for s in file:
|
||||
s = s.strip()
|
||||
if not s or s.startswith('#'):
|
||||
continue
|
||||
name, chars = s.split(';')
|
||||
chars = tuple(int(char, 16) for char in chars.split())
|
||||
# check that the structure defined in makeunicodename is OK
|
||||
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
|
||||
assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
|
||||
"the NamedSequence struct and in unicodedata_lookup")
|
||||
self.named_sequences.append((name, chars))
|
||||
# also store these in the PUA 1
|
||||
self.table[pua_index][1] = name
|
||||
pua_index += 1
|
||||
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
|
||||
|
||||
self.exclusions = {}
|
||||
with open_data(COMPOSITION_EXCLUSIONS, version) as file:
|
||||
for s in file:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue