unicode database compression, step 3:

- use unidb compression for the unicodectype module. smaller, faster, and slightly more portable...
- also mention the unicode directory in Tools/README
2025-11-20 10:57:44 +00:00 · 2000-09-25 17:59:57 +00:00 · 2000-09-25 17:59:57 +00:00 · e9133f7e2e
commit e9133f7e2e
parent e53793bf4c
2 changed files with 100 additions and 9 deletions
--- a/Tools/README
+++ b/Tools/README
@ -21,6 +21,9 @@ scripts		A number of useful single-file programs, e.g. tabnanny.py
 		(by Tim Peters), which checks for inconsistent mixing
 		of tabs and spaces.
 unicode		Tools used to generate unicode database files for
 		Python 2.0 (by Fredrik Lundh).
 versioncheck	A tool to automate checking whether you have the latest
 		version of a package (by Jack Jansen).
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -1,9 +1,13 @@
 #
-# generate a compact version of the unicode property database
+# (re)generate unicode property and type databases
 #
 # this script converts a unicode 3.0 database file to
 # Modules/unicodedata_db.h and Objects/unicodetype_db.h
 #
 # history:
 # 2000-09-24 fl   created (based on bits and pieces from unidb)
 # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
 # 2000-09-25 fl   added character type table
 #
 # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
 #
@ -13,7 +17,7 @@ import sys
 SCRIPT = sys.argv[0]
 VERSION = "1.1"
-UNICODE_DATA = "../UnicodeData-Latest.txt"
+UNICODE_DATA = "UnicodeData-Latest.txt"
 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@ -24,7 +28,16 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]
-def maketable():
+ALPHA_MASK = 0x01
 DECIMAL_MASK = 0x02
 DIGIT_MASK = 0x04
 LOWER_MASK = 0x08
 NUMERIC_MASK = 0x10
 SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
 def maketables():
    unicode = UnicodeData(UNICODE_DATA)
@ -74,7 +87,7 @@ def maketable():
                i = 0
            decomp_index[char] = i
-    FILE = "unicodedata_db.h"
+    FILE = "Modules/unicodedata_db.h"
    sys.stdout = open(FILE, "w")
@ -87,6 +100,9 @@ def maketable():
    print "};"
    print
    # FIXME: the following tables should be made static, and
    # the support code moved into unicodedatabase.c
    print "/* string literals */"
    print "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
@ -106,24 +122,96 @@ def maketable():
    print "    NULL"
    print "};"
-    # split index table
+    # split record index table
    index1, index2, shift = splitbins(index)
-    print "/* index tables used to find the right database record */"
+    print "/* index tables for the database records */"
    print "#define SHIFT", shift
    Array("index1", index1).dump(sys.stdout)
    Array("index2", index2).dump(sys.stdout)
-    # split index table
+    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index)
-    print "/* same, for the decomposition data */"
+    print "/* index tables for the decomposition data */"
    print "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(sys.stdout)
    Array("decomp_index2", index2).dump(sys.stdout)
    sys.stdout = sys.__stdout__
    #
    # 3) unicode type data
    # extract unicode types
    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category in ["Lt", "Lu"]:
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = (int(record[12], 16) - char) & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = (int(record[13], 16) - char) & 0xffff
            else:
                lower = 0
            if record[14]:
                title = (int(record[14], 16) - char) & 0xffff
            else:
                title = 0
            item = (
                flags, upper, lower, title
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    FILE = "Objects/unicodetype_db.h"
    sys.stdout = open(FILE, "w")
    print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print
    print "/* a list of unique character type descriptors */"
    print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print "    {%d, %d, %d, %d}," % item
    print "};"
    print
    # split type index table
    index1, index2, shift = splitbins(index)
    print "/* type indexes */"
    print "#define SHIFT", shift
    Array("index1", index1).dump(sys.stdout)
    Array("index2", index2).dump(sys.stdout)
    sys.stdout = sys.__stdout__
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
@ -259,4 +347,4 @@ def splitbins(t, trace=0):
    return best
 if __name__ == "__main__":
-    maketable()
+    maketables()