Update Unicode database to Unicode 4.1.

2025-07-18 00:35:17 +00:00 · 2006-03-09 23:38:20 +00:00 · 2006-03-09 23:38:20 +00:00 · 480f1bb67b
commit 480f1bb67b
parent e2b4677253
12 changed files with 17302 additions and 13365 deletions
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -26,13 +26,15 @@
 import sys

 SCRIPT = sys.argv[0]
-VERSION = "2.3"
+VERSION = "2.5"

 # The Unicode Database
-UNIDATA_VERSION = "3.2.0"
-UNICODE_DATA = "UnicodeData.txt"
-COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
-EASTASIAN_WIDTH = "EastAsianWidth.txt"
+UNIDATA_VERSION = "4.1.0"
+UNICODE_DATA = "UnicodeData%s.txt"
+COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
+EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+
+old_versions = ["3.2.0"]

 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@ -57,13 +59,23 @@ UPPER_MASK = 0x80

 def maketables(trace=0):

-    print "--- Reading", UNICODE_DATA, "..."
+    print "--- Reading", UNICODE_DATA % "", "..."

-    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
-                          EASTASIAN_WIDTH)
+    version = ""
+    unicode = UnicodeData(UNICODE_DATA % version,
+                          COMPOSITION_EXCLUSIONS % version,
+                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

+    for version in old_versions:
+        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
+        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
+                                  COMPOSITION_EXCLUSIONS % ("-"+version),
+                                  EASTASIAN_WIDTH % ("-"+version))
+        print len(filter(None, old_unicode.table)), "characters"
+        merge_old_version(version, unicode, old_unicode)
+
    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
@ -119,6 +131,8 @@ def makeunicodedata(unicode, trace):
        if record:
            if record[5]:
                decomp = record[5].split()
+                if len(decomp) > 19:
+                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
@ -278,6 +292,44 @@ def makeunicodedata(unicode, trace):
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

+    # Generate delta tables for old versions
+    for version, table, normalization in unicode.changed:
+        cversion = version.replace(".","_")
+        records = [table[0]]
+        cache = {table[0]:0}
+        index = [0] * len(table)
+        for i, record in enumerate(table):
+            try:
+                index[i] = cache[record]
+            except KeyError:
+                index[i] = cache[record] = len(records)
+                records.append(record)
+        index1, index2, shift = splitbins(index, trace)
+        print >>fp, "static const change_record change_records_%s[] = {" % cversion
+        for record in records:
+            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
+        print >>fp, "};"
+        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
+        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
+        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
+        print >>fp, "{"
+        print >>fp, "\tint index;"
+        print >>fp, "\tif (n >= 0x110000) index = 0;"
+        print >>fp, "\telse {"
+        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
+        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
+              (cversion, shift, ((1<<shift)-1))
+        print >>fp, "\t}"
+        print >>fp, "\treturn change_records_%s+index;" % cversion
+        print >>fp, "}\n"
+        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
+        print >>fp, "{"
+        print >>fp, "\tswitch(n) {"
+        for k, v in normalization:
+            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
+        print >>fp, "\tdefault: return 0;"
+        print >>fp, "\t}\n}\n"
+
    fp.close()

 # --------------------------------------------------------------------
@ -540,6 +592,82 @@ def makeunicodename(unicode, trace):

    fp.close()

+
+def merge_old_version(version, new, old):
+    # Changes to exclusion file not implemented yet
+    if old.exclusions != new.exclusions:
+        raise NotImplementedError, "exclusions differ"
+
+    # In these change records, 0xFF means "no change"
+    bidir_changes = [0xFF]*0x110000
+    category_changes = [0xFF]*0x110000
+    decimal_changes = [0xFF]*0x110000
+    # In numeric data, 0 means "no change",
+    # -1 means "did not have a numeric value
+    numeric_changes = [0] * 0x110000
+    # normalization_changes is a list of key-value pairs
+    normalization_changes = []
+    for i in range(0x110000):
+        if new.table[i] is None:
+            # Characters unassigned in the new version ought to
+            # be unassigned in the old one
+            assert old.table[i] is None
+            continue
+        # check characters unassigned in the old version
+        if old.table[i] is None:
+            # category 0 is "unassigned"
+            category_changes[i] = 0
+            continue
+        # check characters that differ
+        if old.table[i] != new.table[i]:
+            for k in range(len(old.table[i])):
+                if old.table[i][k] != new.table[i][k]:
+                    value = old.table[i][k]
+                    if k == 2:
+                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
+                        category_changes[i] = CATEGORY_NAMES.index(value)
+                    elif k == 4:
+                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
+                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
+                    elif k == 5:
+                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
+                        # We assume that all normalization changes are in 1:1 mappings
+                        assert " " not in value
+                        normalization_changes.append((i, value))
+                    elif k == 6:
+                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
+                        # we only support changes where the old value is a single digit
+                        assert value in "0123456789"
+                        decimal_changes[i] = int(value)
+                    elif k == 8:
+                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
+                        # Since 0 encodes "no change", the old value is better not 0
+                        assert value != "0" and value != "-1"
+                        if not value:
+                            numeric_changes[i] = -1
+                        else:
+                            assert re.match("^[0-9]+$", value)
+                            numeric_changes[i] = int(value)
+                    elif k == 11:
+                        # change to ISO comment, ignore
+                        pass
+                    elif k == 12:
+                        # change to simple uppercase mapping; ignore
+                        pass
+                    elif k == 13:
+                        # change to simple lowercase mapping; ignore
+                        pass
+                    elif k == 14:
+                        # change to simple titlecase mapping; ignore
+                        pass
+                    else:
+                        class Difference(Exception):pass
+                        raise Difference, (hex(i), k, old.table[i], new.table[i])
+    new.changed.append((version, zip(bidir_changes, category_changes,
+                                     decimal_changes, numeric_changes),
+                        normalization_changes))
+    
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
@ -551,6 +679,7 @@ import sys
 class UnicodeData:

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
+        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
@ -569,13 +698,14 @@ class UnicodeData:
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
-                        field = s[:]
+                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
-                    field[0] = hex(i)
-                    table[i] = field
+                    f2 = field[:]
+                    f2[0] = "%X" % i
+                    table[i] = f2

        # public attributes
        self.filename = filename