mirror of
https://github.com/python/cpython.git
synced 2025-08-02 08:02:56 +00:00
Added 38,642 missing characters to the Unicode database (by expanding
first-last ranges) -- but thanks to the 2.0 compression scheme, this doesn't add a single byte to the resulting binaries (!). Closes bug #117524.
This commit is contained in:
parent
063ee7bbe6
commit
fad27aee11
4 changed files with 244 additions and 199 deletions
|
@ -9,6 +9,7 @@
|
|||
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
|
||||
# 2000-09-25 fl added character type table
|
||||
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields
|
||||
# 2000-11-03 fl expand first/last ranges
|
||||
#
|
||||
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
|
||||
#
|
||||
|
@ -39,10 +40,13 @@ SPACE_MASK = 0x20
|
|||
TITLE_MASK = 0x40
|
||||
UPPER_MASK = 0x80
|
||||
|
||||
def maketables():
|
||||
def maketables(trace=0):
|
||||
|
||||
unicode = UnicodeData(UNICODE_DATA)
|
||||
|
||||
print "--- Processing", UNICODE_DATA, "..."
|
||||
print len(filter(None, unicode.table)), "characters"
|
||||
|
||||
# extract unicode properties
|
||||
dummy = (0, 0, 0, 0)
|
||||
table = [dummy]
|
||||
|
@ -91,6 +95,11 @@ def maketables():
|
|||
|
||||
FILE = "Modules/unicodedata_db.h"
|
||||
|
||||
print "--- Writing", FILE, "..."
|
||||
|
||||
print len(table), "unique properties"
|
||||
print len(decomp_data), "unique decomposition entries"
|
||||
|
||||
fp = open(FILE, "w")
|
||||
print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
|
||||
print >>fp
|
||||
|
@ -125,7 +134,7 @@ def maketables():
|
|||
print >>fp, "};"
|
||||
|
||||
# split record index table
|
||||
index1, index2, shift = splitbins(index)
|
||||
index1, index2, shift = splitbins(index, trace)
|
||||
|
||||
print >>fp, "/* index tables for the database records */"
|
||||
print >>fp, "#define SHIFT", shift
|
||||
|
@ -133,7 +142,7 @@ def maketables():
|
|||
Array("index2", index2).dump(fp)
|
||||
|
||||
# split decomposition index table
|
||||
index1, index2, shift = splitbins(decomp_index)
|
||||
index1, index2, shift = splitbins(decomp_index, trace)
|
||||
|
||||
print >>fp, "/* index tables for the decomposition data */"
|
||||
print >>fp, "#define DECOMP_SHIFT", shift
|
||||
|
@ -200,12 +209,14 @@ def maketables():
|
|||
table.append(item)
|
||||
index[char] = i
|
||||
|
||||
print len(table), "ctype entries"
|
||||
|
||||
FILE = "Objects/unicodetype_db.h"
|
||||
|
||||
fp = open(FILE, "w")
|
||||
|
||||
print "--- Writing", FILE, "..."
|
||||
|
||||
print len(table), "unique character type entries"
|
||||
|
||||
print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
|
||||
print >>fp
|
||||
print >>fp, "/* a list of unique character type descriptors */"
|
||||
|
@ -216,7 +227,7 @@ def maketables():
|
|||
print >>fp
|
||||
|
||||
# split character type index table
|
||||
index1, index2, shift = splitbins(index)
|
||||
index1, index2, shift = splitbins(index, trace)
|
||||
|
||||
print >>fp, "/* type indexes */"
|
||||
print >>fp, "#define SHIFT", shift
|
||||
|
@ -233,7 +244,7 @@ import string, sys
|
|||
|
||||
class UnicodeData:
|
||||
|
||||
def __init__(self, filename):
|
||||
def __init__(self, filename, expand=1):
|
||||
file = open(filename)
|
||||
table = [None] * 65536
|
||||
while 1:
|
||||
|
@ -244,6 +255,22 @@ class UnicodeData:
|
|||
char = string.atoi(s[0], 16)
|
||||
table[char] = s
|
||||
|
||||
# expand first-last ranges (ignore surrogates and private use)
|
||||
if expand:
|
||||
field = None
|
||||
for i in range(0, 0xD800):
|
||||
s = table[i]
|
||||
if s:
|
||||
if s[1][-6:] == "First>":
|
||||
s[1] = ""
|
||||
field = s[:]
|
||||
elif s[1][-5:] == "Last>":
|
||||
s[1] = ""
|
||||
field = None
|
||||
elif field:
|
||||
field[0] = hex(i)
|
||||
table[i] = field
|
||||
|
||||
# public attributes
|
||||
self.filename = filename
|
||||
self.table = table
|
||||
|
@ -306,8 +333,9 @@ def splitbins(t, trace=0):
|
|||
t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
|
||||
where mask is a bitmask isolating the last "shift" bits.
|
||||
|
||||
If optional arg trace is true (default false), progress info is
|
||||
printed to sys.stderr.
|
||||
If optional arg trace is non-zero (default zero), progress info
|
||||
is printed to sys.stderr. The higher the value, the more info
|
||||
you'll get.
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
@ -341,7 +369,7 @@ def splitbins(t, trace=0):
|
|||
t1.append(index >> shift)
|
||||
# determine memory size
|
||||
b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
|
||||
if trace:
|
||||
if trace > 1:
|
||||
dump(t1, t2, shift, b)
|
||||
if b < bytes:
|
||||
best = t1, t2, shift
|
||||
|
@ -358,4 +386,4 @@ def splitbins(t, trace=0):
|
|||
return best
|
||||
|
||||
if __name__ == "__main__":
|
||||
maketables()
|
||||
maketables(1)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue