Issue #1734234: Massively speedup unicodedata.normalize() when the
string is already in normalized form, by performing a quick check beforehand.
Original patch by Rauli Ruohonen.
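
(As a quick illustration of the new fast path: a hedged sketch, not part of
the commit; the object-identity behaviour noted in the comments is an
implementation detail of this patch, not a documented guarantee.)

    import unicodedata

    s = u'caf\xe9'                       # ends in precomposed U+00E9, already NFC
    t = unicodedata.normalize('NFC', s)  # quick check answers "Yes"
    assert t == s
    # With this patch, the C code takes the early-return branch
    # (Py_INCREF(input); return input;), so no new string is built.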
Antoine Pitrou 2009-04-27 21:53:26 +00:00
parent 8b8f8cc1b0
commit e988e286b2
5 changed files with 2041 additions and 1731 deletions

Misc/ACKS

@@ -612,6 +612,7 @@ Craig Rowland
 Paul Rubin
 Sam Ruby
 Audun S. Runde
+Rauli Ruohonen
 Jeff Rush
 Sam Rushing
 Mark Russell

Misc/NEWS

@@ -255,6 +255,10 @@ Core and Builtins
 Library
 -------

+- Issue #1734234: Massively speedup ``unicodedata.normalize()`` when the
+  string is already in normalized form, by performing a quick check beforehand.
+  Original patch by Rauli Ruohonen.
+
 - Issue #5853: calling a function of the mimetypes module from several threads
   at once could hit the recursion limit if the mimetypes database hadn't been
   initialized before.

Modules/unicodedata.c

@@ -27,6 +27,7 @@ typedef struct {
     const unsigned char mirrored;          /* true if mirrored in bidir mode */
     const unsigned char east_asian_width;  /* index into
                                               _PyUnicode_EastAsianWidth */
+    const unsigned char normalization_quick_check; /* see is_normalized() */
 } _PyUnicode_DatabaseRecord;

 typedef struct change_record {
@@ -721,6 +722,38 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
     return result;
 }

+/* Return 1 if the input is certainly normalized, 0 if it might not be. */
+static int
+is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+{
+    Py_UNICODE *i, *end;
+    unsigned char prev_combining = 0, quickcheck_mask;
+
+    /* An older version of the database is requested, quickchecks must be
+       disabled. */
+    if (self != NULL)
+        return 0;
+
+    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
+       as described in http://unicode.org/reports/tr15/#Annex8. */
+    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
+
+    i = PyUnicode_AS_UNICODE(input);
+    end = i + PyUnicode_GET_SIZE(input);
+    while (i < end) {
+        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
+        unsigned char combining = record->combining;
+        unsigned char quickcheck = record->normalization_quick_check;
+
+        if (quickcheck & quickcheck_mask)
+            return 0; /* this string might need normalization */
+        if (combining && prev_combining > combining)
+            return 0; /* non-canonical sort order, not normalized */
+        prev_combining = combining;
+    }
+    return 1; /* certainly normalized */
+}
+
 PyDoc_STRVAR(unicodedata_normalize__doc__,
 "normalize(form, unistr)\n\
 \n\
@@ -744,14 +777,34 @@ unicodedata_normalize(PyObject *self, PyObject *args)
         return input;
     }

-    if (strcmp(form, "NFC") == 0)
+    if (strcmp(form, "NFC") == 0) {
+        if (is_normalized(self, input, 1, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfc_nfkc(self, input, 0);
-    if (strcmp(form, "NFKC") == 0)
+    }
+    if (strcmp(form, "NFKC") == 0) {
+        if (is_normalized(self, input, 1, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfc_nfkc(self, input, 1);
-    if (strcmp(form, "NFD") == 0)
+    }
+    if (strcmp(form, "NFD") == 0) {
+        if (is_normalized(self, input, 0, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfd_nfkd(self, input, 0);
-    if (strcmp(form, "NFKD") == 0)
+    }
+    if (strcmp(form, "NFKD") == 0) {
+        if (is_normalized(self, input, 0, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfd_nfkd(self, input, 1);
+    }
     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
     return NULL;
 }
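
(A hedged Python sketch of the two-bit quick-check layout that is_normalized()
above and the generator change below both rely on; the helper name
quickcheck_mask is mine, not part of the commit.)

    # Each database record packs four Unicode quick-check properties into one
    # byte, two bits per property, in the generator's qc_order:
    #   bits 0-1: NFD_QC   bits 2-3: NFKD_QC
    #   bits 4-5: NFC_QC   bits 6-7: NFKC_QC
    # with 0 = Yes, 1 = Maybe, 2 = No for each property.
    def quickcheck_mask(nfc, k):
        # Mirrors the C expression: 3 << ((nfc ? 4 : 0) + (k ? 2 : 0))
        return 3 << ((4 if nfc else 0) + (2 if k else 0))

    assert quickcheck_mask(nfc=False, k=False) == 0x03   # NFD_QC
    assert quickcheck_mask(nfc=False, k=True)  == 0x0c   # NFKD_QC
    assert quickcheck_mask(nfc=True,  k=False) == 0x30   # NFC_QC
    assert quickcheck_mask(nfc=True,  k=True)  == 0xc0   # NFKC_QC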

Modules/unicodedata_db.h: file diff suppressed because it is too large.

Tools/unicode/makeunicodedata.py

@@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

 old_versions = ["3.2.0"]
@@ -66,7 +67,8 @@ def maketables(trace=0):
     version = ""
     unicode = UnicodeData(UNICODE_DATA % version,
                           COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version)
+                          EASTASIAN_WIDTH % version,
+                          DERIVEDNORMALIZATION_PROPS % version)

     print len(filter(None, unicode.table)), "characters"
@@ -87,7 +89,7 @@ def maketables(trace=0):

 def makeunicodedata(unicode, trace):

-    dummy = (0, 0, 0, 0, 0)
+    dummy = (0, 0, 0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
@@ -107,8 +109,10 @@ def makeunicodedata(unicode, trace):
             bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
             mirrored = record[9] == "Y"
             eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
+            normalizationquickcheck = record[16]
             item = (
-                category, combining, bidirectional, mirrored, eastasianwidth
+                category, combining, bidirectional, mirrored, eastasianwidth,
+                normalizationquickcheck
                 )
             # add entry to index and item tables
             i = cache.get(item)
@@ -222,7 +226,7 @@ def makeunicodedata(unicode, trace):
     print >>fp, \
           "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
     for item in table:
-        print >>fp, "    {%d, %d, %d, %d, %d}," % item
+        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
     print >>fp, "};"
     print >>fp
@@ -698,7 +702,8 @@ import sys

 class UnicodeData:

-    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
+    def __init__(self, filename, exclusions, eastasianwidth,
+                 derivednormalizationprops=None, expand=1):
         self.changed = []
         file = open(filename)
         table = [None] * 0x110000
@@ -761,6 +766,28 @@ class UnicodeData:
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(widths[i])
+        if derivednormalizationprops:
+            quickchecks = [0] * 0x110000 # default is Yes
+            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+            for s in open(derivednormalizationprops):
+                if '#' in s:
+                    s = s[:s.index('#')]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in qc_order:
+                    continue
+                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+                quickcheck_shift = qc_order.index(s[1])*2
+                quickcheck <<= quickcheck_shift
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    assert not (quickchecks[char]>>quickcheck_shift)&3
+                    quickchecks[char] |= quickcheck
+            for i in range(0, 0x110000):
+                if table[i] is not None:
+                    table[i].append(quickchecks[i])

     def uselatin1(self):
         # restrict character range to ISO Latin 1
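
(A hedged sketch of what the new parsing loop above does with
DerivedNormalizationProps.txt records, whose fields follow the UCD layout
"codepoint-or-range ; property ; value # comment"; the helper name
parse_qc_line and the sample record are illustrative, not quoted from the
commit.)

    QC_ORDER = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()

    def parse_qc_line(line):
        if '#' in line:
            line = line[:line.index('#')]          # strip the trailing comment
        fields = [f.strip() for f in line.split(';')]
        if len(fields) < 2 or fields[1] not in QC_ORDER:
            return None                            # unrelated property, skip
        value = 'MN'.index(fields[2]) + 1          # Maybe -> 1, No -> 2
        value <<= QC_ORDER.index(fields[1]) * 2    # move into that property's two bits
        if '..' in fields[0]:
            first, last = [int(c, 16) for c in fields[0].split('..')]
        else:
            first = last = int(fields[0], 16)
        return first, last, value

    # A record like "0340..0341 ; NFC_QC; N" yields (0x340, 0x341, 2 << 4),
    # i.e. "No" stored in the NFC_QC bit pair; "Yes" is never listed in the
    # file and so stays at the default of 0 in the generated table.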