Merged revisions 72054 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk ........ r72054 | antoine.pitrou | 2009-04-27 23:53:26 +0200 (lun., 27 avril 2009) | 5 lines Issue #1734234: Massively speedup `unicodedata.normalize()` when the string is already in normalized form, by performing a quick check beforehand. Original patch by Rauli Ruohonen. ........
2025-09-27 02:39:58 +00:00 · 2009-04-27 22:31:40 +00:00 · 2009-04-27 22:31:40 +00:00 · 7a0fedfd1d
commit 7a0fedfd1d
parent 57f3d93552
5 changed files with 2041 additions and 1731 deletions
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -36,6 +36,7 @@ UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
+DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

 old_versions = ["3.2.0"]

@ -72,7 +73,8 @@ def maketables(trace=0):
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
-                          DERIVED_CORE_PROPERTIES % version)
+                          DERIVED_CORE_PROPERTIES % version,
+                          DERIVEDNORMALIZATION_PROPS % version)

    print(len(list(filter(None, unicode.table))), "characters")

@ -94,7 +96,7 @@ def maketables(trace=0):

 def makeunicodedata(unicode, trace):

-    dummy = (0, 0, 0, 0, 0)
+    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
@ -114,8 +116,10 @@ def makeunicodedata(unicode, trace):
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
+            normalizationquickcheck = record[17]
            item = (
-                category, combining, bidirectional, mirrored, eastasianwidth
+                category, combining, bidirectional, mirrored, eastasianwidth,
+                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
@ -227,7 +231,7 @@ def makeunicodedata(unicode, trace):
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
-        print("    {%d, %d, %d, %d, %d}," % item, file=fp)
+        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

@ -717,7 +721,7 @@ class UnicodeData:
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth,
-                 derivedprops, expand=1):
+                 derivedprops, derivednormalizationprops=None, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
@ -803,6 +807,29 @@ class UnicodeData:
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

+        if derivednormalizationprops:
+            quickchecks = [0] * 0x110000 # default is Yes
+            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+            for s in open(derivednormalizationprops):
+                if '#' in s:
+                    s = s[:s.index('#')]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in qc_order:
+                    continue
+                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+                quickcheck_shift = qc_order.index(s[1])*2
+                quickcheck <<= quickcheck_shift
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    assert not (quickchecks[char]>>quickcheck_shift)&3
+                    quickchecks[char] |= quickcheck
+            for i in range(0, 0x110000):
+                if table[i] is not None:
+                    table[i].append(quickchecks[i])
+
    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))