Move the slowest UTF-8 decoder to its own subfunction

 * Create decode_utf8_errors()
 * Reuse unicode_fromascii()
 * decode_utf8_errors() no longer refits the partially decoded string at the beginning
 * Remove refit_partial_string(); use unicode_adjust_maxchar() instead
This commit is contained in:
Victor Stinner 2011-12-11 20:09:03 +01:00
parent 84def3774d
commit 785938eebd

View file

@ -1784,7 +1784,7 @@ _PyUnicode_ClearStaticStrings()
static PyObject* static PyObject*
unicode_fromascii(const unsigned char* s, Py_ssize_t size) unicode_fromascii(const unsigned char* s, Py_ssize_t size)
{ {
PyObject *res; PyObject *unicode;
#ifdef Py_DEBUG #ifdef Py_DEBUG
const unsigned char *p; const unsigned char *p;
const unsigned char *end = s + size; const unsigned char *end = s + size;
@ -1794,11 +1794,12 @@ unicode_fromascii(const unsigned char* s, Py_ssize_t size)
#endif #endif
if (size == 1) if (size == 1)
return get_latin1_char(s[0]); return get_latin1_char(s[0]);
res = PyUnicode_New(size, 127); unicode = PyUnicode_New(size, 127);
if (!res) if (!unicode)
return NULL; return NULL;
memcpy(PyUnicode_1BYTE_DATA(res), s, size); memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
return res; assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
} }
static Py_UCS4 static Py_UCS4
@ -4320,126 +4321,38 @@ _ucs4loop:
return 65537; return 65537;
} }
/* Called when we encountered some error that wasn't detected in the original
scan, e.g. an encoded surrogate character. The original maxchar computation
may have been incorrect, so redo it. */
static int
refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
{
PyObject *tmp;
Py_ssize_t k;
Py_UCS4 maxchar;
for (k = 0, maxchar = 0; k < n; k++)
maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
if (tmp == NULL)
return -1;
PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
Py_DECREF(*unicode);
*unicode = tmp;
return 0;
}
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string /* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
in case of errors. Implicit parameters: unicode, kind, data, has_errors, in case of errors. Implicit parameters: unicode, kind, data, onError.
onError. Potential resizing overallocates, so the result needs to shrink Potential resizing overallocates, so the result needs to shrink at the end.
at the end.
*/ */
#define WRITE_MAYBE_FAIL(index, value) \ #define WRITE_MAYBE_FAIL(index, value) \
do { \ do { \
if (has_errors) { \ Py_ssize_t pos = index; \
Py_ssize_t pos = index; \ if (pos > PyUnicode_GET_LENGTH(unicode) && \
if (pos > PyUnicode_GET_LENGTH(unicode) && \ unicode_resize(&unicode, pos + pos/8) < 0) \
unicode_resize(&unicode, pos + pos/8) < 0) \ goto onError; \
goto onError; \ if (unicode_putchar(&unicode, &pos, value) < 0) \
if (unicode_putchar(&unicode, &pos, value) < 0) \ goto onError; \
goto onError; \
} \
else \
PyUnicode_WRITE(kind, data, index, value); \
} while (0) } while (0)
PyObject * PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s, decode_utf8_errors(const char *starts,
Py_ssize_t size, Py_ssize_t size,
const char *errors, const char *errors,
Py_ssize_t *consumed) Py_ssize_t *consumed,
const char *s,
PyObject *unicode,
Py_ssize_t i)
{ {
const char *starts = s;
int n; int n;
int k; int k;
Py_ssize_t startinpos; Py_ssize_t startinpos;
Py_ssize_t endinpos; Py_ssize_t endinpos;
const char *e, *aligned_end; const char *e = starts + size;
PyObject *unicode; const char *aligned_end;
const char *errmsg = ""; const char *errmsg = "";
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
Py_UCS4 maxchar = 0;
Py_ssize_t unicode_size;
Py_ssize_t i;
int kind;
void *data;
int has_errors = 0;
if (size == 0) {
if (consumed)
*consumed = 0;
return (PyObject *)PyUnicode_New(0, 0);
}
maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
if (maxchar < 128 && size == unicode_size) {
if (consumed)
*consumed = size;
if (size == 1)
return get_latin1_char((unsigned char)s[0]);
unicode = PyUnicode_New(unicode_size, maxchar);
if (!unicode)
return NULL;
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
}
/* In case of errors, maxchar and size computation might be incorrect;
code below refits and resizes as necessary. */
unicode = PyUnicode_New(unicode_size, maxchar);
if (!unicode)
return NULL;
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
/* Unpack UTF-8 encoded data */
i = 0;
e = s + size;
switch (kind) {
case PyUnicode_1BYTE_KIND:
has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
break;
case PyUnicode_2BYTE_KIND:
has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
break;
case PyUnicode_4BYTE_KIND:
has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
break;
}
if (!has_errors) {
/* Ensure the unicode size calculation was correct */
assert(i == unicode_size);
assert(s == e);
if (consumed)
*consumed = s-starts;
return unicode;
}
/* Fall through to the generic decoding loop for the rest of
the string */
if (refit_partial_string(&unicode, kind, data, i) < 0)
goto onError;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
@ -4591,11 +4504,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
continue; continue;
utf8Error: utf8Error:
if (!has_errors) {
if (refit_partial_string(&unicode, kind, data, i) < 0)
goto onError;
has_errors = 1;
}
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"utf8", errmsg, "utf8", errmsg,
@ -4604,22 +4512,18 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
goto onError; goto onError;
/* Update data because unicode_decode_call_errorhandler might have /* Update data because unicode_decode_call_errorhandler might have
re-created or resized the unicode object. */ re-created or resized the unicode object. */
data = PyUnicode_DATA(unicode);
kind = PyUnicode_KIND(unicode);
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
} }
/* Ensure the unicode_size calculation above was correct: */
assert(has_errors || i == unicode_size);
if (consumed) if (consumed)
*consumed = s-starts; *consumed = s-starts;
/* Adjust length and ready string when it contained errors and /* Adjust length and ready string when it contained errors and
is of the old resizable kind. */ is of the old resizable kind. */
if (has_errors) { if (unicode_resize(&unicode, i) < 0)
if (PyUnicode_Resize(&unicode, i) < 0) goto onError;
goto onError; unicode_adjust_maxchar(&unicode);
} if (unicode == NULL)
goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
@ -4629,12 +4533,78 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
onError: onError:
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
Py_DECREF(unicode); Py_XDECREF(unicode);
return NULL; return NULL;
} }
#undef WRITE_MAYBE_FAIL #undef WRITE_MAYBE_FAIL
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
Py_UCS4 maxchar = 0;
Py_ssize_t unicode_size;
int has_errors = 0;
PyObject *unicode;
int kind;
void *data;
const char *starts = s;
const char *e;
Py_ssize_t i;
if (size == 0) {
if (consumed)
*consumed = 0;
return (PyObject *)PyUnicode_New(0, 0);
}
maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
if (maxchar < 128 && size == unicode_size) {
if (consumed)
*consumed = size;
return unicode_fromascii(s, size);
}
unicode = PyUnicode_New(unicode_size, maxchar);
if (!unicode)
return NULL;
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
/* Unpack UTF-8 encoded data */
i = 0;
e = starts + size;
switch (kind) {
case PyUnicode_1BYTE_KIND:
has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
break;
case PyUnicode_2BYTE_KIND:
has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
break;
case PyUnicode_4BYTE_KIND:
has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
break;
}
if (!has_errors) {
/* Ensure the unicode size calculation was correct */
assert(i == unicode_size);
assert(s == e);
if (consumed)
*consumed = size;
return unicode;
}
/* In case of errors, maxchar and size computation might be incorrect;
code below refits and resizes as necessary. */
return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
}
#ifdef __APPLE__ #ifdef __APPLE__
/* Simplified UTF-8 decoder using surrogateescape error handler, /* Simplified UTF-8 decoder using surrogateescape error handler,