mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Call PyUnicode_DecodeUTF8Stateful() directly instead of PyUnicode_DecodeUTF8()
* Remove micro-optimization from PyUnicode_FromStringAndSize(): PyUnicode_DecodeUTF8Stateful() already has these optimizations (for size=0 and a single ASCII char). * Rename utf8_max_char_size_and_char_count() to utf8_scanner(), and remove a useless variable
This commit is contained in:
parent
382955ff4e
commit
a1d12bb119
1 changed files with 14 additions and 33 deletions
|
@ -1717,28 +1717,10 @@ PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
|
||||||
"Negative size passed to PyUnicode_FromStringAndSize");
|
"Negative size passed to PyUnicode_FromStringAndSize");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
if (u != NULL)
|
||||||
/* If the Unicode data is known at construction time, we can apply
|
return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
|
||||||
some optimizations which share commonly used objects.
|
else
|
||||||
Also, this means the input must be UTF-8, so fall back to the
|
return (PyObject *)_PyUnicode_New(size);
|
||||||
UTF-8 decoder at the end. */
|
|
||||||
if (u != NULL) {
|
|
||||||
|
|
||||||
/* Optimization for empty strings */
|
|
||||||
if (size == 0 && unicode_empty != NULL) {
|
|
||||||
Py_INCREF(unicode_empty);
|
|
||||||
return unicode_empty;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Single characters are shared when using this constructor.
|
|
||||||
Restrict to ASCII, since the input must be UTF-8. */
|
|
||||||
if (size == 1 && (unsigned char)*u < 128)
|
|
||||||
return get_latin1_char((unsigned char)*u);
|
|
||||||
|
|
||||||
return PyUnicode_DecodeUTF8(u, size, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
return (PyObject *)_PyUnicode_New(size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
|
@ -1749,15 +1731,16 @@ PyUnicode_FromString(const char *u)
|
||||||
PyErr_SetString(PyExc_OverflowError, "input too long");
|
PyErr_SetString(PyExc_OverflowError, "input too long");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
|
||||||
return PyUnicode_FromStringAndSize(u, size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
_PyUnicode_FromId(_Py_Identifier *id)
|
_PyUnicode_FromId(_Py_Identifier *id)
|
||||||
{
|
{
|
||||||
if (!id->object) {
|
if (!id->object) {
|
||||||
id->object = PyUnicode_FromString(id->string);
|
id->object = PyUnicode_DecodeUTF8Stateful(id->string,
|
||||||
|
strlen(id->string),
|
||||||
|
NULL, NULL);
|
||||||
if (!id->object)
|
if (!id->object)
|
||||||
return NULL;
|
return NULL;
|
||||||
PyUnicode_InternInPlace(&id->object);
|
PyUnicode_InternInPlace(&id->object);
|
||||||
|
@ -2443,7 +2426,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
|
||||||
{
|
{
|
||||||
/* UTF-8 */
|
/* UTF-8 */
|
||||||
const char *s = va_arg(count, const char*);
|
const char *s = va_arg(count, const char*);
|
||||||
PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
|
PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
|
||||||
if (!str)
|
if (!str)
|
||||||
goto fail;
|
goto fail;
|
||||||
/* since PyUnicode_DecodeUTF8 returns already flexible
|
/* since PyUnicode_DecodeUTF8 returns already flexible
|
||||||
|
@ -2482,7 +2465,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
|
||||||
*callresult++ = NULL;
|
*callresult++ = NULL;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
|
str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
|
||||||
if (!str_obj)
|
if (!str_obj)
|
||||||
goto fail;
|
goto fail;
|
||||||
if (PyUnicode_READY(str_obj)) {
|
if (PyUnicode_READY(str_obj)) {
|
||||||
|
@ -2947,7 +2930,7 @@ PyUnicode_Decode(const char *s,
|
||||||
if (normalize_encoding(encoding, lower, sizeof(lower))) {
|
if (normalize_encoding(encoding, lower, sizeof(lower))) {
|
||||||
if ((strcmp(lower, "utf-8") == 0) ||
|
if ((strcmp(lower, "utf-8") == 0) ||
|
||||||
(strcmp(lower, "utf8") == 0))
|
(strcmp(lower, "utf8") == 0))
|
||||||
return PyUnicode_DecodeUTF8(s, size, errors);
|
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
||||||
else if ((strcmp(lower, "latin-1") == 0) ||
|
else if ((strcmp(lower, "latin-1") == 0) ||
|
||||||
(strcmp(lower, "latin1") == 0) ||
|
(strcmp(lower, "latin1") == 0) ||
|
||||||
(strcmp(lower, "iso-8859-1") == 0))
|
(strcmp(lower, "iso-8859-1") == 0))
|
||||||
|
@ -3260,7 +3243,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
|
||||||
#ifdef HAVE_MBCS
|
#ifdef HAVE_MBCS
|
||||||
return PyUnicode_DecodeMBCS(s, size, NULL);
|
return PyUnicode_DecodeMBCS(s, size, NULL);
|
||||||
#elif defined(__APPLE__)
|
#elif defined(__APPLE__)
|
||||||
return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
|
return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
|
||||||
#else
|
#else
|
||||||
PyInterpreterState *interp = PyThreadState_GET()->interp;
|
PyInterpreterState *interp = PyThreadState_GET()->interp;
|
||||||
/* Bootstrap check: if the filesystem codec is implemented in Python, we
|
/* Bootstrap check: if the filesystem codec is implemented in Python, we
|
||||||
|
@ -4240,11 +4223,9 @@ PyUnicode_DecodeUTF8(const char *s,
|
||||||
PyUnicode_DecodeUTF8Stateful.
|
PyUnicode_DecodeUTF8Stateful.
|
||||||
*/
|
*/
|
||||||
static Py_UCS4
|
static Py_UCS4
|
||||||
utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
|
utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
|
||||||
Py_ssize_t *unicode_size)
|
|
||||||
{
|
{
|
||||||
Py_ssize_t char_count = 0;
|
Py_ssize_t char_count = 0;
|
||||||
const unsigned char *p = (const unsigned char *)s;
|
|
||||||
const unsigned char *end = p + string_size;
|
const unsigned char *end = p + string_size;
|
||||||
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
|
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
|
||||||
|
|
||||||
|
@ -4563,7 +4544,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
return unicode_empty;
|
return unicode_empty;
|
||||||
}
|
}
|
||||||
|
|
||||||
maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
|
maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
|
||||||
|
|
||||||
/* When the string is ASCII only, just use memcpy and return.
|
/* When the string is ASCII only, just use memcpy and return.
|
||||||
unicode_size may be != size if there is an incomplete UTF-8
|
unicode_size may be != size if there is an incomplete UTF-8
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue