unicode database compression, step 2:

- fixed attributions
- moved decomposition data to a separate table, in preparation for step 3
  (which won't happen before 2.0 final, promise!)
- use relative paths in the generator script

I have a lot more stuff in the works for 2.1, but let's leave that for another day...
This commit is contained in:
parent 2101348830
commit cfcea49218
5 changed files with 4613 additions and 4330 deletions
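In outline, step 2 changes the data layout: the per-record decomposition pointer goes away, and decomposition strings move into their own deduplicated table reached through a separate compressed index. A minimal Python sketch of the two layouts (field values are placeholders, not real database entries):

    # Placeholder field values; the real entries come from UnicodeData.txt.
    category, combining, bidirectional, mirrored = 1, 0, 1, 0

    # Step 1 layout: each unique record carried its decomposition string
    # (or NULL) as a fifth field.
    record_v1 = (category, combining, bidirectional, mirrored, "0041 0301")

    # Step 2 layout: records keep only the four scalar fields ...
    record_v2 = (category, combining, bidirectional, mirrored)

    # ... and decomposition strings are deduplicated into a separate table,
    # reached through its own two-level index (built further down).
    decomp_data = ["", "0041 0301"]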
Modules/unicodedata.c

@@ -5,6 +5,7 @@
    Data was extracted from the Unicode 3.0 UnicodeData.txt file.
 
    Written by Marc-Andre Lemburg (mal@lemburg.com).
+   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
 
    Copyright (c) Corporation for National Research Initiatives.
 
@@ -13,8 +14,6 @@ Copyright (c) Corporation for National Research Initiatives.
 #include "Python.h"
 #include "unicodedatabase.h"
 
-#define unicode_db _PyUnicode_Database_GetRecord
-
 /* --- Module API --------------------------------------------------------- */
 
 static PyObject *
@@ -134,15 +133,9 @@ unicodedata_category(PyObject *self,
                      "need a single Unicode character as parameter");
         goto onError;
     }
-    index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->category;
-    if (index < 0 ||
-        index > sizeof(_PyUnicode_CategoryNames) /
-                sizeof(_PyUnicode_CategoryNames[0])) {
-        PyErr_Format(PyExc_SystemError,
-                     "category index out of range: %i",
-                     index);
-        goto onError;
-    }
+    index = (int) _PyUnicode_Database_GetRecord(
+        (int) *PyUnicode_AS_UNICODE(v)
+        )->category;
     return PyString_FromString(_PyUnicode_CategoryNames[index]);
 
  onError:
@@ -164,15 +157,9 @@ unicodedata_bidirectional(PyObject *self,
                      "need a single Unicode character as parameter");
         goto onError;
     }
-    index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->bidirectional;
-    if (index < 0 ||
-        index > sizeof(_PyUnicode_CategoryNames) /
-                sizeof(_PyUnicode_CategoryNames[0])) {
-        PyErr_Format(PyExc_SystemError,
-                     "bidirectional index out of range: %i",
-                     index);
-        goto onError;
-    }
+    index = (int) _PyUnicode_Database_GetRecord(
+        (int) *PyUnicode_AS_UNICODE(v)
+        )->bidirectional;
     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 
  onError:
@@ -194,7 +181,9 @@ unicodedata_combining(PyObject *self,
                      "need a single Unicode character as parameter");
         goto onError;
     }
-    value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->combining;
+    value = (int) _PyUnicode_Database_GetRecord(
+        (int) *PyUnicode_AS_UNICODE(v)
+        )->combining;
     return PyInt_FromLong(value);
 
  onError:
@@ -216,7 +205,9 @@ unicodedata_mirrored(PyObject *self,
                      "need a single Unicode character as parameter");
         goto onError;
     }
-    value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->mirrored;
+    value = (int) _PyUnicode_Database_GetRecord(
+        (int) *PyUnicode_AS_UNICODE(v)
+        )->mirrored;
     return PyInt_FromLong(value);
 
  onError:
@@ -238,10 +229,9 @@ unicodedata_decomposition(PyObject *self,
                      "need a single Unicode character as parameter");
         goto onError;
     }
-    value = unicode_db((int)*PyUnicode_AS_UNICODE(v))->decomposition;
-    if (value == NULL)
-        return PyString_FromString("");
-    else
-        return PyString_FromString(value);
+    value = _PyUnicode_Database_GetDecomposition(
+        (int) *PyUnicode_AS_UNICODE(v)
+        );
+    return PyString_FromString(value);
 
  onError:
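The wrapper functions above keep their Python-level behaviour; only the way they reach the database changes. For orientation, a quick usage sketch (not part of the patch; Python 2 syntax of the era, values as recorded in Unicode 3.0):

    import unicodedata

    assert unicodedata.category(u"A") == "Lu"
    assert unicodedata.bidirectional(u"A") == "L"
    assert unicodedata.combining(u"\u0301") == 230   # COMBINING ACUTE ACCENT
    assert unicodedata.mirrored(u"(") == 1
    assert unicodedata.decomposition(u"\u00c1") == "0041 0301"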
Modules/unicodedata_db.h

File diff suppressed because it is too large (regenerated database tables).
Modules/unicodedatabase.c

@@ -5,6 +5,7 @@
    Data was extracted from the Unicode 3.0 UnicodeData.txt file.
 
    Written by Marc-Andre Lemburg (mal@lemburg.com).
+   Rewritten for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
 
    Copyright (c) Corporation for National Research Initiatives.
 
@@ -29,3 +30,18 @@ _PyUnicode_Database_GetRecord(int code)
     }
     return &_PyUnicode_Database_Records[index];
 }
+
+const char *
+_PyUnicode_Database_GetDecomposition(int code)
+{
+    int index;
+
+    if (code < 0 || code >= 65536)
+        index = 0;
+    else {
+        index = decomp_index1[(code>>DECOMP_SHIFT)];
+        index = decomp_index2[(index<<DECOMP_SHIFT)+
+                              (code&((1<<DECOMP_SHIFT)-1))];
+    }
+    return decomp_data[index];
+}
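The new _PyUnicode_Database_GetDecomposition() reads the string through a two-level index: the high bits of the code point select an entry in decomp_index1, which names a block inside decomp_index2, and the low bits select the slot within that block. A small Python rendering of the same arithmetic, using toy tables with invented values (the real tables cover 65536 code points and DECOMP_SHIFT is chosen by the generator):

    DECOMP_SHIFT = 2
    MASK = (1 << DECOMP_SHIFT) - 1

    decomp_data = ["", "0041 0301"]   # deduplicated decomposition strings
    decomp_index1 = [0, 1]            # one entry per block of 2**DECOMP_SHIFT code points
    decomp_index2 = [0, 0, 0, 0,      # block 0: code points 0-3 have no decomposition
                     0, 1, 0, 0]      # block 1: code point 5 maps to decomp_data[1]

    def get_decomposition(code):
        # Same shape as _PyUnicode_Database_GetDecomposition() above.
        if code < 0 or code >= len(decomp_index1) << DECOMP_SHIFT:
            index = 0
        else:
            index = decomp_index1[code >> DECOMP_SHIFT]
            index = decomp_index2[(index << DECOMP_SHIFT) + (code & MASK)]
        return decomp_data[index]

    assert get_decomposition(5) == "0041 0301"
    assert get_decomposition(3) == "" and get_decomposition(-1) == ""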
Modules/unicodedatabase.h

@@ -5,6 +5,7 @@
    Data was extracted from the Unicode 3.0 UnicodeData.txt file.
 
    Written by Marc-Andre Lemburg (mal@lemburg.com).
+   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
 
    Copyright (c) Corporation for National Research Initiatives.
 
@@ -19,15 +20,14 @@ typedef struct {
     const unsigned char bidirectional; /* index into
                                           _PyUnicode_BidirectionalNames */
     const unsigned char mirrored;      /* true if mirrored in bidir mode */
-    const char *decomposition;         /* pointer to the decomposition
-                                          string or NULL */
 } _PyUnicode_DatabaseRecord;
 
 /* --- Unicode category names --------------------------------------------- */
 
-extern const char *_PyUnicode_CategoryNames[32];
-extern const char *_PyUnicode_BidirectionalNames[21];
+extern const char *_PyUnicode_CategoryNames[];
+extern const char *_PyUnicode_BidirectionalNames[];
 
 /* --- Unicode Database --------------------------------------------------- */
 
 extern const _PyUnicode_DatabaseRecord *_PyUnicode_Database_GetRecord(int ch);
+extern const char *_PyUnicode_Database_GetDecomposition(int ch);
Tools/unicode/makeunicodedata.py

@@ -1,14 +1,19 @@
 #
-# makeunidb.py -- generate a compact version of the unicode property
-# database (unicodedatabase.h)
+# generate a compact version of the unicode property database
+#
+# history:
+# 2000-09-24 fl   created (based on bits and pieces from unidb)
+# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
+#
+# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
 #
 
 import sys
 
 SCRIPT = sys.argv[0]
-VERSION = "1.0"
+VERSION = "1.1"
 
-UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"
+UNICODE_DATA = "../UnicodeData-Latest.txt"
 
 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
     "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@@ -24,13 +29,12 @@ def maketable():
     unicode = UnicodeData(UNICODE_DATA)
 
     # extract unicode properties
-    dummy = (0, 0, 0, 0, "NULL")
+    dummy = (0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
 
-    DECOMPOSITION = [""]
-
+    # 1) database properties
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
@@ -39,12 +43,8 @@ def maketable():
             combining = int(record[3])
             bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
             mirrored = record[9] == "Y"
-            if record[5]:
-                decomposition = '"%s"' % record[5]
-            else:
-                decomposition = "NULL"
             item = (
-                category, combining, bidirectional, mirrored, decomposition
+                category, combining, bidirectional, mirrored
                 )
             # add entry to index and item tables
             i = cache.get(item)
@@ -53,8 +53,26 @@ def maketable():
                 table.append(item)
         index[char] = i
 
-    # FIXME: we really should compress the decomposition stuff
-    # (see the unidb utilities for one way to do this)
+    # 2) decomposition data
+
+    # FIXME: <fl> using the encoding stuff from unidb would save
+    # another 50k or so, but I'll leave that for 2.1...
+
+    decomp_data = [""]
+    decomp_index = [0] * len(unicode.chars)
+
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            if record[5]:
+                try:
+                    i = decomp_data.index(record[5])
+                except ValueError:
+                    i = len(decomp_data)
+                    decomp_data.append(record[5])
+            else:
+                i = 0
+            decomp_index[char] = i
 
     FILE = "unicodedata_db.h"
 
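The loop above assigns each distinct decomposition string a slot by first occurrence, using decomp_data.index(), which is a linear scan per character. The record table earlier in the function uses a dict cache for the same job; a hedged sketch of the identical dedup step written that way (function and argument names invented here for illustration):

    def dedup_decompositions(records):
        # records: one UnicodeData record (or None) per code point;
        # field 5 is the decomposition string, "" when absent.
        decomp_data = [""]
        slot = {"": 0}                # string -> position in decomp_data
        decomp_index = []
        for record in records:
            if record and record[5]:
                text = record[5]
            else:
                text = ""
            if text not in slot:
                slot[text] = len(decomp_data)
                decomp_data.append(text)
            decomp_index.append(slot[text])
        return decomp_data, decomp_index

    data, index = dedup_decompositions([None, ("", "", "", "", "", "0041 0301")])
    assert data == ["", "0041 0301"] and index == [0, 1]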
@@ -65,7 +83,7 @@ def maketable():
     print "/* a list of unique database records */"
     print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
     for item in table:
-        print "    {%d, %d, %d, %d, %s}," % item
+        print "    {%d, %d, %d, %d}," % item
     print "};"
     print
 
@@ -82,6 +100,12 @@ def maketable():
     print "    NULL"
     print "};"
 
+    print "static const char *decomp_data[] = {"
+    for name in decomp_data:
+        print "    \"%s\"," % name
+    print "    NULL"
+    print "};"
+
     # split index table
     index1, index2, shift = splitbins(index)
 
@@ -90,6 +114,14 @@ def maketable():
     Array("index1", index1).dump(sys.stdout)
     Array("index2", index2).dump(sys.stdout)
 
+    # split index table
+    index1, index2, shift = splitbins(decomp_index)
+
+    print "/* same, for the decomposition data */"
+    print "#define DECOMP_SHIFT", shift
+    Array("decomp_index1", index1).dump(sys.stdout)
+    Array("decomp_index2", index2).dump(sys.stdout)
+
     sys.stdout = sys.__stdout__
 
 # --------------------------------------------------------------------
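splitbins() (defined elsewhere in this script; not shown in the diff) is what turns a flat 65536-entry index array into the index1/index2 pair plus shift that the C lookup expects, by storing each 2**shift-sized chunk only once. A simplified sketch for a fixed shift, assuming the table length is a multiple of 2**shift; the real function additionally searches all shifts for the smallest combined size:

    def split_table(t, shift):
        # Store each 2**shift-element chunk of t once in t2 and its chunk
        # number in t1, so that
        #   t[i] == t2[(t1[i >> shift] << shift) + (i & ((1 << shift) - 1))]
        size = 1 << shift
        t1, t2, seen = [], [], {}
        for start in range(0, len(t), size):
            chunk = tuple(t[start:start + size])
            if chunk not in seen:
                seen[chunk] = len(t2) >> shift   # number of this chunk in t2
                t2.extend(chunk)
            t1.append(seen[chunk])
        return t1, t2

    # Round-trip check on a toy table with many repeated chunks.
    table = [0, 0, 1, 1] * 8 + [2, 2, 2, 2] * 8
    t1, t2 = split_table(table, 2)
    mask = (1 << 2) - 1
    assert all(t2[(t1[i >> 2] << 2) + (i & mask)] == table[i] for i in range(len(table)))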