bpo-31979: Simplify transforming decimals to ASCII (#4336)

Simplify the transformation of decimal digits to ASCII that is used by the int(), float() and complex() parsers.

This also speeds up parsing non-ASCII numbers by around 20%.
This commit is contained in:
Serhiy Storchaka 2017-11-13 21:23:48 +02:00 committed by GitHub
parent ce12629c84
commit 9b6c60cbce
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 63 additions and 139 deletions

View file

@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
/* --- Unicode Object ----------------------------------------------------- */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
static inline Py_ssize_t
findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch,
@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
return _PyUnicode_TranslateCharmap(str, mapping, errors);
}
/* fixup() callback: rewrite self in place so that every non-ASCII
   whitespace character becomes ' ' and every non-ASCII decimal digit
   becomes the corresponding ASCII digit '0'..'9'.

   Returns 0 if no character was rewritten.  Otherwise returns the largest
   value among 127, the replacement characters written, and the non-ASCII
   characters that were left untouched. */
static Py_UCS4
fix_decimal_and_space_to_ascii(PyObject *self)
{
    /* PyUnicode_READY(self) is deliberately not called: this function is
       only used as a callback from fixup(), which does it already. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    void *data = PyUnicode_DATA(self);
    Py_UCS4 highest = 127;
    int changed = 0;
    Py_ssize_t pos;

    for (pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        Py_UCS4 replacement = 0;

        /* ASCII characters are never rewritten and never raise the max. */
        if (ch <= 127) {
            continue;
        }

        if (Py_UNICODE_ISSPACE(ch)) {
            replacement = ' ';
        }
        else {
            const int digit = Py_UNICODE_TODECIMAL(ch);
            if (digit >= 0) {
                replacement = '0' + digit;
            }
        }

        if (replacement == 0) {
            /* Neither whitespace nor a decimal digit: keep the character
               and account for it in the maximum. */
            highest = Py_MAX(highest, ch);
            continue;
        }

        changed = 1;
        highest = Py_MAX(highest, replacement);
        PyUnicode_WRITE(kind, data, pos, replacement);
    }

    return changed ? highest : 0;
}
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
if (PyUnicode_IS_ASCII(unicode)) {
/* If the string is already ASCII, just return the same string */
Py_INCREF(unicode);
return unicode;
}
return fixup(unicode, fix_decimal_and_space_to_ascii);
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
PyObject *result = PyUnicode_New(len, 127);
if (result == NULL) {
return NULL;
}
Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
int kind = PyUnicode_KIND(unicode);
const void *data = PyUnicode_DATA(unicode);
Py_ssize_t i;
/* Fill out[] with the ASCII form of unicode: characters below 127 are
   copied as-is, other whitespace becomes ' ', and other decimal digits
   become '0'..'9'.  The first character that fits none of these is
   replaced by '?' and the result is cut off right after it. */
for (i = 0; i < len; ++i) {
    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    if (ch < 127) {
        out[i] = ch;
    }
    else if (Py_UNICODE_ISSPACE(ch)) {
        out[i] = ' ';
    }
    else {
        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal < 0) {
            out[i] = '?';
            /* The result is shortened to i + 1 characters, so it needs a
               terminator at its new end: without it, code that reads the
               buffer as a NUL-terminated C string would run into the
               unwritten bytes that follow.  i < len, so index i + 1 is
               at most len, which is the terminator slot of a string
               created with PyUnicode_New(len, 127). */
            out[i + 1] = '\0';
            _PyUnicode_LENGTH(result) = i + 1;
            break;
        }
        out[i] = '0' + decimal;
    }
}
return result;
}
PyObject *
@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
return tailmatch(str, substr, start, end, direction);
}
/* Apply the fixfct filter to the Unicode object self and return a
reference to the resulting object.

fixfct is only ever run on copies of self (u and v below), which it edits
in place.  Its return value is 0 if it changed nothing, otherwise the
maximum character of the string after its edits.

Returns NULL if _PyUnicode_Copy() or PyUnicode_New() fails. */
static PyObject *
fixup(PyObject *self,
Py_UCS4 (*fixfct)(PyObject *s))
{
PyObject *u;
Py_UCS4 maxchar_old, maxchar_new = 0;
PyObject *v;
/* Work on a copy so that fixfct can write into it. */
u = _PyUnicode_Copy(self);
if (u == NULL)
return NULL;
maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
/* fix functions return the new maximum character in a string,
if the kind of the resulting unicode object does not change,
everything is fine. Otherwise we need to change the string kind
and re-run the fix function. */
maxchar_new = fixfct(u);
if (maxchar_new == 0) {
/* no changes */;
/* For an exact str the untouched copy is dropped and a new reference
to self is returned; otherwise the copy u is returned. */
if (PyUnicode_CheckExact(self)) {
Py_DECREF(u);
Py_INCREF(self);
return self;
}
else
return u;
}
/* NOTE(review): align_maxchar() is defined outside this view; presumably
it rounds the value up to the limit of its storage kind so that it can be
compared with PyUnicode_MAX_CHAR_VALUE() -- confirm. */
maxchar_new = align_maxchar(maxchar_new);
if (maxchar_new == maxchar_old)
return u;
/* In case the maximum character changed, we need to
convert the string to the new category. */
v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
if (v == NULL) {
Py_DECREF(u);
return NULL;
}
if (maxchar_new > maxchar_old) {
/* If the maxchar increased so that the kind changed, not all
characters are representable anymore and we need to fix the
string again. This only happens in very few cases. */
/* The characters are copied from the original self, not from u, and
fixfct is then re-run on the wider string v. */
_PyUnicode_FastCopyCharacters(v, 0,
self, 0, PyUnicode_GET_LENGTH(self));
maxchar_old = fixfct(v);
assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
}
else {
/* The maximum got smaller: u already holds the fixed characters, so
they are copied into the narrower string v. */
_PyUnicode_FastCopyCharacters(v, 0,
u, 0, PyUnicode_GET_LENGTH(self));
}
/* In both branches the intermediate copy is no longer needed. */
Py_DECREF(u);
assert(_PyUnicode_CheckConsistency(v, 1));
return v;
}
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{