mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
Merged revisions 72054 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk ........ r72054 | antoine.pitrou | 2009-04-27 23:53:26 +0200 (lun., 27 avril 2009) | 5 lines Issue #1734234: Massively speedup `unicodedata.normalize()` when the string is already in normalized form, by performing a quick check beforehand. Original patch by Rauli Ruohonen. ........
This commit is contained in:
parent
57f3d93552
commit
7a0fedfd1d
5 changed files with 2041 additions and 1731 deletions
|
@ -36,6 +36,7 @@ UNICODE_DATA = "UnicodeData%s.txt"
|
|||
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
||||
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
||||
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
|
||||
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
|
||||
|
||||
old_versions = ["3.2.0"]
|
||||
|
||||
|
@ -72,7 +73,8 @@ def maketables(trace=0):
|
|||
unicode = UnicodeData(UNICODE_DATA % version,
|
||||
COMPOSITION_EXCLUSIONS % version,
|
||||
EASTASIAN_WIDTH % version,
|
||||
DERIVED_CORE_PROPERTIES % version)
|
||||
DERIVED_CORE_PROPERTIES % version,
|
||||
DERIVEDNORMALIZATION_PROPS % version)
|
||||
|
||||
print(len(list(filter(None, unicode.table))), "characters")
|
||||
|
||||
|
@ -94,7 +96,7 @@ def maketables(trace=0):
|
|||
|
||||
def makeunicodedata(unicode, trace):
|
||||
|
||||
dummy = (0, 0, 0, 0, 0)
|
||||
dummy = (0, 0, 0, 0, 0, 0)
|
||||
table = [dummy]
|
||||
cache = {0: dummy}
|
||||
index = [0] * len(unicode.chars)
|
||||
|
@ -114,8 +116,10 @@ def makeunicodedata(unicode, trace):
|
|||
bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
|
||||
mirrored = record[9] == "Y"
|
||||
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
|
||||
normalizationquickcheck = record[17]
|
||||
item = (
|
||||
category, combining, bidirectional, mirrored, eastasianwidth
|
||||
category, combining, bidirectional, mirrored, eastasianwidth,
|
||||
normalizationquickcheck
|
||||
)
|
||||
# add entry to index and item tables
|
||||
i = cache.get(item)
|
||||
|
@ -227,7 +231,7 @@ def makeunicodedata(unicode, trace):
|
|||
print("/* a list of unique database records */", file=fp)
|
||||
print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
|
||||
for item in table:
|
||||
print(" {%d, %d, %d, %d, %d}," % item, file=fp)
|
||||
print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
|
||||
print("};", file=fp)
|
||||
print(file=fp)
|
||||
|
||||
|
@ -717,7 +721,7 @@ class UnicodeData:
|
|||
# derived-props] (17)
|
||||
|
||||
def __init__(self, filename, exclusions, eastasianwidth,
|
||||
derivedprops, expand=1):
|
||||
derivedprops, derivednormalizationprops=None, expand=1):
|
||||
self.changed = []
|
||||
file = open(filename)
|
||||
table = [None] * 0x110000
|
||||
|
@ -803,6 +807,29 @@ class UnicodeData:
|
|||
# apply to unassigned code points; ignore them
|
||||
table[char][-1].add(p)
|
||||
|
||||
if derivednormalizationprops:
|
||||
quickchecks = [0] * 0x110000 # default is Yes
|
||||
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
|
||||
for s in open(derivednormalizationprops):
|
||||
if '#' in s:
|
||||
s = s[:s.index('#')]
|
||||
s = [i.strip() for i in s.split(';')]
|
||||
if len(s) < 2 or s[1] not in qc_order:
|
||||
continue
|
||||
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
|
||||
quickcheck_shift = qc_order.index(s[1])*2
|
||||
quickcheck <<= quickcheck_shift
|
||||
if '..' not in s[0]:
|
||||
first = last = int(s[0], 16)
|
||||
else:
|
||||
first, last = [int(c, 16) for c in s[0].split('..')]
|
||||
for char in range(first, last+1):
|
||||
assert not (quickchecks[char]>>quickcheck_shift)&3
|
||||
quickchecks[char] |= quickcheck
|
||||
for i in range(0, 0x110000):
|
||||
if table[i] is not None:
|
||||
table[i].append(quickchecks[i])
|
||||
|
||||
def uselatin1(self):
|
||||
# restrict character range to ISO Latin 1
|
||||
self.chars = list(range(256))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue