mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
Issue #27938: Add a fast-path for us-ascii encoding
Other changes:
* Rewrite _Py_normalize_encoding() as a C implementation of encodings.normalize_encoding(). For example, " utf-8 " is now normalized to "utf_8", so the fast path is now used for more name variants of the same encoding.
* Avoid strcpy() when encoding is NULL: call the UTF-8 codec directly.
This commit is contained in:
parent
a9ab165cd2
commit
942889aae2
1 changed files with 110 additions and 56 deletions
|
@ -3100,9 +3100,9 @@ PyUnicode_FromEncodedObject(PyObject *obj,
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Convert encoding to lower case and replace '_' with '-' in order to
|
/* Normalize an encoding name: C implementation of
|
||||||
catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
|
encodings.normalize_encoding(). Return 1 on success, or 0 on error (encoding
|
||||||
1 on success. */
|
is longer than lower_len-1). */
|
||||||
int
|
int
|
||||||
_Py_normalize_encoding(const char *encoding,
|
_Py_normalize_encoding(const char *encoding,
|
||||||
char *lower,
|
char *lower,
|
||||||
|
@ -3111,30 +3111,39 @@ _Py_normalize_encoding(const char *encoding,
|
||||||
const char *e;
|
const char *e;
|
||||||
char *l;
|
char *l;
|
||||||
char *l_end;
|
char *l_end;
|
||||||
|
int punct;
|
||||||
|
|
||||||
|
assert(encoding != NULL);
|
||||||
|
|
||||||
if (encoding == NULL) {
|
|
||||||
/* 6 == strlen("utf-8") + 1 */
|
|
||||||
if (lower_len < 6)
|
|
||||||
return 0;
|
|
||||||
strcpy(lower, "utf-8");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
e = encoding;
|
e = encoding;
|
||||||
l = lower;
|
l = lower;
|
||||||
l_end = &lower[lower_len - 1];
|
l_end = &lower[lower_len - 1];
|
||||||
while (*e) {
|
punct = 0;
|
||||||
if (l == l_end)
|
while (1) {
|
||||||
return 0;
|
char c = *e;
|
||||||
if (Py_ISUPPER(*e)) {
|
if (c == 0) {
|
||||||
*l++ = Py_TOLOWER(*e++);
|
break;
|
||||||
}
|
}
|
||||||
else if (*e == '_') {
|
|
||||||
*l++ = '-';
|
if (Py_ISALNUM(c) || c == '.') {
|
||||||
e++;
|
if (punct && l != lower) {
|
||||||
|
if (l == l_end) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
*l++ = '_';
|
||||||
|
}
|
||||||
|
punct = 0;
|
||||||
|
|
||||||
|
if (l == l_end) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
*l++ = Py_TOLOWER(c);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
*l++ = *e++;
|
punct = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
e++;
|
||||||
}
|
}
|
||||||
*l = '\0';
|
*l = '\0';
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -3148,28 +3157,51 @@ PyUnicode_Decode(const char *s,
|
||||||
{
|
{
|
||||||
PyObject *buffer = NULL, *unicode;
|
PyObject *buffer = NULL, *unicode;
|
||||||
Py_buffer info;
|
Py_buffer info;
|
||||||
char lower[11]; /* Enough for any encoding shortcut */
|
char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
|
||||||
|
|
||||||
|
if (encoding == NULL) {
|
||||||
|
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/* Shortcuts for common default encodings */
|
/* Shortcuts for common default encodings */
|
||||||
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
|
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
|
||||||
if ((strcmp(lower, "utf-8") == 0) ||
|
char *lower = buflower;
|
||||||
(strcmp(lower, "utf8") == 0))
|
|
||||||
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
/* Fast paths */
|
||||||
else if ((strcmp(lower, "latin-1") == 0) ||
|
if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
|
||||||
(strcmp(lower, "latin1") == 0) ||
|
lower += 3;
|
||||||
(strcmp(lower, "iso-8859-1") == 0) ||
|
if (*lower == '_') {
|
||||||
(strcmp(lower, "iso8859-1") == 0))
|
/* Match "utf8" and "utf_8" */
|
||||||
return PyUnicode_DecodeLatin1(s, size, errors);
|
lower++;
|
||||||
#ifdef HAVE_MBCS
|
}
|
||||||
else if (strcmp(lower, "mbcs") == 0)
|
|
||||||
return PyUnicode_DecodeMBCS(s, size, errors);
|
if (lower[0] == '8' && lower[1] == 0) {
|
||||||
#endif
|
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
||||||
else if (strcmp(lower, "ascii") == 0)
|
}
|
||||||
return PyUnicode_DecodeASCII(s, size, errors);
|
else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
|
||||||
else if (strcmp(lower, "utf-16") == 0)
|
return PyUnicode_DecodeUTF16(s, size, errors, 0);
|
||||||
return PyUnicode_DecodeUTF16(s, size, errors, 0);
|
}
|
||||||
else if (strcmp(lower, "utf-32") == 0)
|
else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
|
||||||
return PyUnicode_DecodeUTF32(s, size, errors, 0);
|
return PyUnicode_DecodeUTF32(s, size, errors, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (strcmp(lower, "ascii") == 0
|
||||||
|
|| strcmp(lower, "us_ascii") == 0) {
|
||||||
|
return PyUnicode_DecodeASCII(s, size, errors);
|
||||||
|
}
|
||||||
|
#ifdef HAVE_MBCS
|
||||||
|
else if (strcmp(lower, "mbcs") == 0) {
|
||||||
|
return PyUnicode_DecodeMBCS(s, size, errors);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
else if (strcmp(lower, "latin1") == 0
|
||||||
|
|| strcmp(lower, "latin_1") == 0
|
||||||
|
|| strcmp(lower, "iso_8859_1") == 0
|
||||||
|
|| strcmp(lower, "iso8859_1") == 0) {
|
||||||
|
return PyUnicode_DecodeLatin1(s, size, errors);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Decode via the codec registry */
|
/* Decode via the codec registry */
|
||||||
|
@ -3512,34 +3544,56 @@ PyUnicode_AsEncodedString(PyObject *unicode,
|
||||||
const char *errors)
|
const char *errors)
|
||||||
{
|
{
|
||||||
PyObject *v;
|
PyObject *v;
|
||||||
char lower[11]; /* Enough for any encoding shortcut */
|
char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
|
||||||
|
|
||||||
if (!PyUnicode_Check(unicode)) {
|
if (!PyUnicode_Check(unicode)) {
|
||||||
PyErr_BadArgument();
|
PyErr_BadArgument();
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (encoding == NULL) {
|
||||||
|
return _PyUnicode_AsUTF8String(unicode, errors);
|
||||||
|
}
|
||||||
|
|
||||||
/* Shortcuts for common default encodings */
|
/* Shortcuts for common default encodings */
|
||||||
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
|
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
|
||||||
if ((strcmp(lower, "utf-8") == 0) ||
|
char *lower = buflower;
|
||||||
(strcmp(lower, "utf8") == 0))
|
|
||||||
{
|
/* Fast paths */
|
||||||
if (errors == NULL || strcmp(errors, "strict") == 0)
|
if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
|
||||||
return _PyUnicode_AsUTF8String(unicode, NULL);
|
lower += 3;
|
||||||
else
|
if (*lower == '_') {
|
||||||
|
/* Match "utf8" and "utf_8" */
|
||||||
|
lower++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lower[0] == '8' && lower[1] == 0) {
|
||||||
return _PyUnicode_AsUTF8String(unicode, errors);
|
return _PyUnicode_AsUTF8String(unicode, errors);
|
||||||
|
}
|
||||||
|
else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
|
||||||
|
return _PyUnicode_EncodeUTF16(unicode, errors, 0);
|
||||||
|
}
|
||||||
|
else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
|
||||||
|
return _PyUnicode_EncodeUTF32(unicode, errors, 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if ((strcmp(lower, "latin-1") == 0) ||
|
else {
|
||||||
(strcmp(lower, "latin1") == 0) ||
|
if (strcmp(lower, "ascii") == 0
|
||||||
(strcmp(lower, "iso-8859-1") == 0) ||
|
|| strcmp(lower, "us_ascii") == 0) {
|
||||||
(strcmp(lower, "iso8859-1") == 0))
|
return _PyUnicode_AsASCIIString(unicode, errors);
|
||||||
return _PyUnicode_AsLatin1String(unicode, errors);
|
}
|
||||||
#ifdef HAVE_MBCS
|
#ifdef HAVE_MBCS
|
||||||
else if (strcmp(lower, "mbcs") == 0)
|
else if (strcmp(lower, "mbcs") == 0) {
|
||||||
return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
|
return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
else if (strcmp(lower, "ascii") == 0)
|
else if (strcmp(lower, "latin1") == 0 ||
|
||||||
return _PyUnicode_AsASCIIString(unicode, errors);
|
strcmp(lower, "latin_1") == 0 ||
|
||||||
|
strcmp(lower, "iso_8859_1") == 0 ||
|
||||||
|
strcmp(lower, "iso8859_1") == 0) {
|
||||||
|
return _PyUnicode_AsLatin1String(unicode, errors);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Encode via the codec registry */
|
/* Encode via the codec registry */
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue