mirror of
https://github.com/python/cpython.git
synced 2025-08-01 23:53:15 +00:00
unicode database compression, step 3:
- use unidb compression for the unicodectype module. smaller, faster, and slightly more portable...
- also mention the unicode directory in Tools/README
This commit is contained in:
parent
e53793bf4c
commit
e9133f7e2e
2 changed files with 100 additions and 9 deletions
|
@ -21,6 +21,9 @@ scripts A number of useful single-file programs, e.g. tabnanny.py
|
||||||
(by Tim Peters), which checks for inconsistent mixing
|
(by Tim Peters), which checks for inconsistent mixing
|
||||||
of tabs and spaces.
|
of tabs and spaces.
|
||||||
|
|
||||||
|
unicode Tools used to generate unicode database files for
|
||||||
|
Python 2.0 (by Fredrik Lundh).
|
||||||
|
|
||||||
versioncheck A tool to automate checking whether you have the latest
|
versioncheck A tool to automate checking whether you have the latest
|
||||||
version of a package (by Jack Jansen).
|
version of a package (by Jack Jansen).
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,13 @@
|
||||||
#
|
#
|
||||||
# generate a compact version of the unicode property database
|
# (re)generate unicode property and type databases
|
||||||
|
#
|
||||||
|
# this script converts a unicode 3.0 database file to
|
||||||
|
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
|
||||||
#
|
#
|
||||||
# history:
|
# history:
|
||||||
# 2000-09-24 fl created (based on bits and pieces from unidb)
|
# 2000-09-24 fl created (based on bits and pieces from unidb)
|
||||||
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
|
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
|
||||||
|
# 2000-09-25 fl added character type table
|
||||||
#
|
#
|
||||||
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
|
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
|
||||||
#
|
#
|
||||||
|
@ -13,7 +17,7 @@ import sys
|
||||||
SCRIPT = sys.argv[0]
|
SCRIPT = sys.argv[0]
|
||||||
VERSION = "1.1"
|
VERSION = "1.1"
|
||||||
|
|
||||||
UNICODE_DATA = "../UnicodeData-Latest.txt"
|
UNICODE_DATA = "UnicodeData-Latest.txt"
|
||||||
|
|
||||||
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
|
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
|
||||||
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
|
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
|
||||||
|
@ -24,7 +28,16 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
|
||||||
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
|
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
|
||||||
"ON" ]
|
"ON" ]
|
||||||
|
|
||||||
def maketable():
|
ALPHA_MASK = 0x01
|
||||||
|
DECIMAL_MASK = 0x02
|
||||||
|
DIGIT_MASK = 0x04
|
||||||
|
LOWER_MASK = 0x08
|
||||||
|
NUMERIC_MASK = 0x10
|
||||||
|
SPACE_MASK = 0x20
|
||||||
|
TITLE_MASK = 0x40
|
||||||
|
UPPER_MASK = 0x80
|
||||||
|
|
||||||
|
def maketables():
|
||||||
|
|
||||||
unicode = UnicodeData(UNICODE_DATA)
|
unicode = UnicodeData(UNICODE_DATA)
|
||||||
|
|
||||||
|
@ -74,7 +87,7 @@ def maketable():
|
||||||
i = 0
|
i = 0
|
||||||
decomp_index[char] = i
|
decomp_index[char] = i
|
||||||
|
|
||||||
FILE = "unicodedata_db.h"
|
FILE = "Modules/unicodedata_db.h"
|
||||||
|
|
||||||
sys.stdout = open(FILE, "w")
|
sys.stdout = open(FILE, "w")
|
||||||
|
|
||||||
|
@ -87,6 +100,9 @@ def maketable():
|
||||||
print "};"
|
print "};"
|
||||||
print
|
print
|
||||||
|
|
||||||
|
# FIXME: the following tables should be made static, and
|
||||||
|
# the support code moved into unicodedatabase.c
|
||||||
|
|
||||||
print "/* string literals */"
|
print "/* string literals */"
|
||||||
print "const char *_PyUnicode_CategoryNames[] = {"
|
print "const char *_PyUnicode_CategoryNames[] = {"
|
||||||
for name in CATEGORY_NAMES:
|
for name in CATEGORY_NAMES:
|
||||||
|
@ -106,24 +122,96 @@ def maketable():
|
||||||
print " NULL"
|
print " NULL"
|
||||||
print "};"
|
print "};"
|
||||||
|
|
||||||
# split index table
|
# split record index table
|
||||||
index1, index2, shift = splitbins(index)
|
index1, index2, shift = splitbins(index)
|
||||||
|
|
||||||
print "/* index tables used to find the right database record */"
|
print "/* index tables for the database records */"
|
||||||
print "#define SHIFT", shift
|
print "#define SHIFT", shift
|
||||||
Array("index1", index1).dump(sys.stdout)
|
Array("index1", index1).dump(sys.stdout)
|
||||||
Array("index2", index2).dump(sys.stdout)
|
Array("index2", index2).dump(sys.stdout)
|
||||||
|
|
||||||
# split index table
|
# split decomposition index table
|
||||||
index1, index2, shift = splitbins(decomp_index)
|
index1, index2, shift = splitbins(decomp_index)
|
||||||
|
|
||||||
print "/* same, for the decomposition data */"
|
print "/* index tables for the decomposition data */"
|
||||||
print "#define DECOMP_SHIFT", shift
|
print "#define DECOMP_SHIFT", shift
|
||||||
Array("decomp_index1", index1).dump(sys.stdout)
|
Array("decomp_index1", index1).dump(sys.stdout)
|
||||||
Array("decomp_index2", index2).dump(sys.stdout)
|
Array("decomp_index2", index2).dump(sys.stdout)
|
||||||
|
|
||||||
sys.stdout = sys.__stdout__
|
sys.stdout = sys.__stdout__
|
||||||
|
|
||||||
|
#
|
||||||
|
# 3) unicode type data
|
||||||
|
|
||||||
|
# extract unicode types
|
||||||
|
dummy = (0, 0, 0, 0)
|
||||||
|
table = [dummy]
|
||||||
|
cache = {0: dummy}
|
||||||
|
index = [0] * len(unicode.chars)
|
||||||
|
|
||||||
|
for char in unicode.chars:
|
||||||
|
record = unicode.table[char]
|
||||||
|
if record:
|
||||||
|
# extract database properties
|
||||||
|
category = record[2]
|
||||||
|
bidirectional = record[4]
|
||||||
|
flags = 0
|
||||||
|
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
|
||||||
|
flags |= ALPHA_MASK
|
||||||
|
if category == "Ll":
|
||||||
|
flags |= LOWER_MASK
|
||||||
|
if category == "Zs" or bidirectional in ("WS", "B", "S"):
|
||||||
|
flags |= SPACE_MASK
|
||||||
|
if category in ["Lt", "Lu"]:
|
||||||
|
flags |= TITLE_MASK
|
||||||
|
if category == "Lu":
|
||||||
|
flags |= UPPER_MASK
|
||||||
|
# use delta predictor for upper/lower/title
|
||||||
|
if record[12]:
|
||||||
|
upper = (int(record[12], 16) - char) & 0xffff
|
||||||
|
else:
|
||||||
|
upper = 0
|
||||||
|
if record[13]:
|
||||||
|
lower = (int(record[13], 16) - char) & 0xffff
|
||||||
|
else:
|
||||||
|
lower = 0
|
||||||
|
if record[14]:
|
||||||
|
title = (int(record[14], 16) - char) & 0xffff
|
||||||
|
else:
|
||||||
|
title = 0
|
||||||
|
item = (
|
||||||
|
flags, upper, lower, title
|
||||||
|
)
|
||||||
|
# add entry to index and item tables
|
||||||
|
i = cache.get(item)
|
||||||
|
if i is None:
|
||||||
|
cache[item] = i = len(table)
|
||||||
|
table.append(item)
|
||||||
|
index[char] = i
|
||||||
|
|
||||||
|
FILE = "Objects/unicodetype_db.h"
|
||||||
|
|
||||||
|
sys.stdout = open(FILE, "w")
|
||||||
|
|
||||||
|
print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
|
||||||
|
print
|
||||||
|
print "/* a list of unique character type descriptors */"
|
||||||
|
print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
|
||||||
|
for item in table:
|
||||||
|
print " {%d, %d, %d, %d}," % item
|
||||||
|
print "};"
|
||||||
|
print
|
||||||
|
|
||||||
|
# split type index table
|
||||||
|
index1, index2, shift = splitbins(index)
|
||||||
|
|
||||||
|
print "/* type indexes */"
|
||||||
|
print "#define SHIFT", shift
|
||||||
|
Array("index1", index1).dump(sys.stdout)
|
||||||
|
Array("index2", index2).dump(sys.stdout)
|
||||||
|
|
||||||
|
sys.stdout = sys.__stdout__
|
||||||
|
|
||||||
# --------------------------------------------------------------------
|
# --------------------------------------------------------------------
|
||||||
# the following support code is taken from the unidb utilities
|
# the following support code is taken from the unidb utilities
|
||||||
# Copyright (c) 1999-2000 by Secret Labs AB
|
# Copyright (c) 1999-2000 by Secret Labs AB
|
||||||
|
@ -259,4 +347,4 @@ def splitbins(t, trace=0):
|
||||||
return best
|
return best
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
maketable()
|
maketables()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue