mirror of
https://github.com/python/cpython.git
synced 2025-08-08 10:58:51 +00:00
Issue #1734234: Massively speed up unicodedata.normalize() when the
string is already in normalized form, by performing a quick check beforehand. Original patch by Rauli Ruohonen.
This commit is contained in:
parent
8b8f8cc1b0
commit
e988e286b2
5 changed files with 2041 additions and 1731 deletions
|
@ -612,6 +612,7 @@ Craig Rowland
|
||||||
Paul Rubin
|
Paul Rubin
|
||||||
Sam Ruby
|
Sam Ruby
|
||||||
Audun S. Runde
|
Audun S. Runde
|
||||||
|
Rauli Ruohonen
|
||||||
Jeff Rush
|
Jeff Rush
|
||||||
Sam Rushing
|
Sam Rushing
|
||||||
Mark Russell
|
Mark Russell
|
||||||
|
|
|
@ -255,6 +255,10 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #1734234: Massively speed up ``unicodedata.normalize()`` when the
|
||||||
|
string is already in normalized form, by performing a quick check beforehand.
|
||||||
|
Original patch by Rauli Ruohonen.
|
||||||
|
|
||||||
- Issue #5853: calling a function of the mimetypes module from several threads
|
- Issue #5853: calling a function of the mimetypes module from several threads
|
||||||
at once could hit the recursion limit if the mimetypes database hadn't been
|
at once could hit the recursion limit if the mimetypes database hadn't been
|
||||||
initialized before.
|
initialized before.
|
||||||
|
|
|
@ -27,6 +27,7 @@ typedef struct {
|
||||||
const unsigned char mirrored; /* true if mirrored in bidir mode */
|
const unsigned char mirrored; /* true if mirrored in bidir mode */
|
||||||
const unsigned char east_asian_width; /* index into
|
const unsigned char east_asian_width; /* index into
|
||||||
_PyUnicode_EastAsianWidth */
|
_PyUnicode_EastAsianWidth */
|
||||||
|
const unsigned char normalization_quick_check; /* see is_normalized() */
|
||||||
} _PyUnicode_DatabaseRecord;
|
} _PyUnicode_DatabaseRecord;
|
||||||
|
|
||||||
typedef struct change_record {
|
typedef struct change_record {
|
||||||
|
@ -721,6 +722,38 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return 1 if the input is certainly normalized, 0 if it might not be. */
|
||||||
|
static int
|
||||||
|
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
|
||||||
|
{
|
||||||
|
Py_UNICODE *i, *end;
|
||||||
|
unsigned char prev_combining = 0, quickcheck_mask;
|
||||||
|
|
||||||
|
/* An older version of the database is requested, quickchecks must be
|
||||||
|
disabled. */
|
||||||
|
if (self != NULL)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
|
||||||
|
as described in http://unicode.org/reports/tr15/#Annex8. */
|
||||||
|
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
|
||||||
|
|
||||||
|
i = PyUnicode_AS_UNICODE(input);
|
||||||
|
end = i + PyUnicode_GET_SIZE(input);
|
||||||
|
while (i < end) {
|
||||||
|
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
|
||||||
|
unsigned char combining = record->combining;
|
||||||
|
unsigned char quickcheck = record->normalization_quick_check;
|
||||||
|
|
||||||
|
if (quickcheck & quickcheck_mask)
|
||||||
|
return 0; /* this string might need normalization */
|
||||||
|
if (combining && prev_combining > combining)
|
||||||
|
return 0; /* non-canonical sort order, not normalized */
|
||||||
|
prev_combining = combining;
|
||||||
|
}
|
||||||
|
return 1; /* certainly normalized */
|
||||||
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(unicodedata_normalize__doc__,
|
PyDoc_STRVAR(unicodedata_normalize__doc__,
|
||||||
"normalize(form, unistr)\n\
|
"normalize(form, unistr)\n\
|
||||||
\n\
|
\n\
|
||||||
|
@ -744,14 +777,34 @@ unicodedata_normalize(PyObject *self, PyObject *args)
|
||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strcmp(form, "NFC") == 0)
|
if (strcmp(form, "NFC") == 0) {
|
||||||
|
if (is_normalized(self, input, 1, 0)) {
|
||||||
|
Py_INCREF(input);
|
||||||
|
return input;
|
||||||
|
}
|
||||||
return nfc_nfkc(self, input, 0);
|
return nfc_nfkc(self, input, 0);
|
||||||
if (strcmp(form, "NFKC") == 0)
|
}
|
||||||
|
if (strcmp(form, "NFKC") == 0) {
|
||||||
|
if (is_normalized(self, input, 1, 1)) {
|
||||||
|
Py_INCREF(input);
|
||||||
|
return input;
|
||||||
|
}
|
||||||
return nfc_nfkc(self, input, 1);
|
return nfc_nfkc(self, input, 1);
|
||||||
if (strcmp(form, "NFD") == 0)
|
}
|
||||||
|
if (strcmp(form, "NFD") == 0) {
|
||||||
|
if (is_normalized(self, input, 0, 0)) {
|
||||||
|
Py_INCREF(input);
|
||||||
|
return input;
|
||||||
|
}
|
||||||
return nfd_nfkd(self, input, 0);
|
return nfd_nfkd(self, input, 0);
|
||||||
if (strcmp(form, "NFKD") == 0)
|
}
|
||||||
|
if (strcmp(form, "NFKD") == 0) {
|
||||||
|
if (is_normalized(self, input, 0, 1)) {
|
||||||
|
Py_INCREF(input);
|
||||||
|
return input;
|
||||||
|
}
|
||||||
return nfd_nfkd(self, input, 1);
|
return nfd_nfkd(self, input, 1);
|
||||||
|
}
|
||||||
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
|
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0"
|
||||||
UNICODE_DATA = "UnicodeData%s.txt"
|
UNICODE_DATA = "UnicodeData%s.txt"
|
||||||
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
||||||
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
||||||
|
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
|
||||||
|
|
||||||
old_versions = ["3.2.0"]
|
old_versions = ["3.2.0"]
|
||||||
|
|
||||||
|
@ -66,7 +67,8 @@ def maketables(trace=0):
|
||||||
version = ""
|
version = ""
|
||||||
unicode = UnicodeData(UNICODE_DATA % version,
|
unicode = UnicodeData(UNICODE_DATA % version,
|
||||||
COMPOSITION_EXCLUSIONS % version,
|
COMPOSITION_EXCLUSIONS % version,
|
||||||
EASTASIAN_WIDTH % version)
|
EASTASIAN_WIDTH % version,
|
||||||
|
DERIVEDNORMALIZATION_PROPS % version)
|
||||||
|
|
||||||
print len(filter(None, unicode.table)), "characters"
|
print len(filter(None, unicode.table)), "characters"
|
||||||
|
|
||||||
|
@ -87,7 +89,7 @@ def maketables(trace=0):
|
||||||
|
|
||||||
def makeunicodedata(unicode, trace):
|
def makeunicodedata(unicode, trace):
|
||||||
|
|
||||||
dummy = (0, 0, 0, 0, 0)
|
dummy = (0, 0, 0, 0, 0, 0)
|
||||||
table = [dummy]
|
table = [dummy]
|
||||||
cache = {0: dummy}
|
cache = {0: dummy}
|
||||||
index = [0] * len(unicode.chars)
|
index = [0] * len(unicode.chars)
|
||||||
|
@ -107,8 +109,10 @@ def makeunicodedata(unicode, trace):
|
||||||
bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
|
bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
|
||||||
mirrored = record[9] == "Y"
|
mirrored = record[9] == "Y"
|
||||||
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
|
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
|
||||||
|
normalizationquickcheck = record[16]
|
||||||
item = (
|
item = (
|
||||||
category, combining, bidirectional, mirrored, eastasianwidth
|
category, combining, bidirectional, mirrored, eastasianwidth,
|
||||||
|
normalizationquickcheck
|
||||||
)
|
)
|
||||||
# add entry to index and item tables
|
# add entry to index and item tables
|
||||||
i = cache.get(item)
|
i = cache.get(item)
|
||||||
|
@ -222,7 +226,7 @@ def makeunicodedata(unicode, trace):
|
||||||
print >>fp, \
|
print >>fp, \
|
||||||
"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
|
"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
|
||||||
for item in table:
|
for item in table:
|
||||||
print >>fp, " {%d, %d, %d, %d, %d}," % item
|
print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
|
||||||
print >>fp, "};"
|
print >>fp, "};"
|
||||||
print >>fp
|
print >>fp
|
||||||
|
|
||||||
|
@ -698,7 +702,8 @@ import sys
|
||||||
|
|
||||||
class UnicodeData:
|
class UnicodeData:
|
||||||
|
|
||||||
def __init__(self, filename, exclusions, eastasianwidth, expand=1):
|
def __init__(self, filename, exclusions, eastasianwidth,
|
||||||
|
derivednormalizationprops=None, expand=1):
|
||||||
self.changed = []
|
self.changed = []
|
||||||
file = open(filename)
|
file = open(filename)
|
||||||
table = [None] * 0x110000
|
table = [None] * 0x110000
|
||||||
|
@ -761,6 +766,28 @@ class UnicodeData:
|
||||||
for i in range(0, 0x110000):
|
for i in range(0, 0x110000):
|
||||||
if table[i] is not None:
|
if table[i] is not None:
|
||||||
table[i].append(widths[i])
|
table[i].append(widths[i])
|
||||||
|
if derivednormalizationprops:
|
||||||
|
quickchecks = [0] * 0x110000 # default is Yes
|
||||||
|
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
|
||||||
|
for s in open(derivednormalizationprops):
|
||||||
|
if '#' in s:
|
||||||
|
s = s[:s.index('#')]
|
||||||
|
s = [i.strip() for i in s.split(';')]
|
||||||
|
if len(s) < 2 or s[1] not in qc_order:
|
||||||
|
continue
|
||||||
|
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
|
||||||
|
quickcheck_shift = qc_order.index(s[1])*2
|
||||||
|
quickcheck <<= quickcheck_shift
|
||||||
|
if '..' not in s[0]:
|
||||||
|
first = last = int(s[0], 16)
|
||||||
|
else:
|
||||||
|
first, last = [int(c, 16) for c in s[0].split('..')]
|
||||||
|
for char in range(first, last+1):
|
||||||
|
assert not (quickchecks[char]>>quickcheck_shift)&3
|
||||||
|
quickchecks[char] |= quickcheck
|
||||||
|
for i in range(0, 0x110000):
|
||||||
|
if table[i] is not None:
|
||||||
|
table[i].append(quickchecks[i])
|
||||||
|
|
||||||
def uselatin1(self):
|
def uselatin1(self):
|
||||||
# restrict character range to ISO Latin 1
|
# restrict character range to ISO Latin 1
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue