mirror of
https://github.com/python/cpython.git
synced 2025-08-29 05:05:03 +00:00
Reorganized PyUnicode_DecodeUnicodeEscape a bit (in order to make it
less likely that bug #132817 ever appears again).
This commit is contained in:
parent
b95896b2d2
commit
ccc7473fc8
1 changed file with 73 additions and 114 deletions
|
@ -1110,9 +1110,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
const char *errors)
|
const char *errors)
|
||||||
{
|
{
|
||||||
PyUnicodeObject *v;
|
PyUnicodeObject *v;
|
||||||
Py_UNICODE *p = NULL, *buf = NULL;
|
Py_UNICODE *p, *buf;
|
||||||
const char *end;
|
const char *end;
|
||||||
Py_UCS4 chr;
|
char* message;
|
||||||
|
Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
|
||||||
|
|
||||||
/* Escaped strings will always be longer than the resulting
|
/* Escaped strings will always be longer than the resulting
|
||||||
Unicode string, so we start with size here and then reduce the
|
Unicode string, so we start with size here and then reduce the
|
||||||
|
@ -1122,12 +1123,14 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
goto onError;
|
goto onError;
|
||||||
if (size == 0)
|
if (size == 0)
|
||||||
return (PyObject *)v;
|
return (PyObject *)v;
|
||||||
|
|
||||||
p = buf = PyUnicode_AS_UNICODE(v);
|
p = buf = PyUnicode_AS_UNICODE(v);
|
||||||
end = s + size;
|
end = s + size;
|
||||||
|
|
||||||
while (s < end) {
|
while (s < end) {
|
||||||
unsigned char c;
|
unsigned char c;
|
||||||
Py_UNICODE x;
|
Py_UNICODE x;
|
||||||
int i;
|
int i, digits;
|
||||||
|
|
||||||
/* Non-escape characters are interpreted as Unicode ordinals */
|
/* Non-escape characters are interpreted as Unicode ordinals */
|
||||||
if (*s != '\\') {
|
if (*s != '\\') {
|
||||||
|
@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
*p++ = x;
|
*p++ = x;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* \xXX with two hex digits */
|
/* hex escapes */
|
||||||
|
/* \xXX */
|
||||||
case 'x':
|
case 'x':
|
||||||
for (x = 0, i = 0; i < 2; i++) {
|
digits = 2;
|
||||||
c = (unsigned char)s[i];
|
message = "truncated \\xXX escape";
|
||||||
if (!isxdigit(c)) {
|
goto hexescape;
|
||||||
if (unicodeescape_decoding_error(&s, &x, errors,
|
|
||||||
"truncated \\xXX"))
|
|
||||||
goto onError;
|
|
||||||
i++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
x = (x<<4) & ~0xF;
|
|
||||||
if (c >= '0' && c <= '9')
|
|
||||||
x += c - '0';
|
|
||||||
else if (c >= 'a' && c <= 'f')
|
|
||||||
x += 10 + c - 'a';
|
|
||||||
else
|
|
||||||
x += 10 + c - 'A';
|
|
||||||
}
|
|
||||||
s += i;
|
|
||||||
*p++ = x;
|
|
||||||
break;
|
|
||||||
|
|
||||||
/* \uXXXX with 4 hex digits */
|
/* \uXXXX */
|
||||||
case 'u':
|
case 'u':
|
||||||
for (x = 0, i = 0; i < 4; i++) {
|
digits = 4;
|
||||||
c = (unsigned char)s[i];
|
message = "truncated \\uXXXX escape";
|
||||||
if (!isxdigit(c)) {
|
goto hexescape;
|
||||||
if (unicodeescape_decoding_error(&s, &x, errors,
|
|
||||||
"truncated \\uXXXX"))
|
|
||||||
goto onError;
|
|
||||||
i++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
x = (x<<4) & ~0xF;
|
|
||||||
if (c >= '0' && c <= '9')
|
|
||||||
x += c - '0';
|
|
||||||
else if (c >= 'a' && c <= 'f')
|
|
||||||
x += 10 + c - 'a';
|
|
||||||
else
|
|
||||||
x += 10 + c - 'A';
|
|
||||||
}
|
|
||||||
s += i;
|
|
||||||
*p++ = x;
|
|
||||||
break;
|
|
||||||
|
|
||||||
/* \UXXXXXXXX with 8 hex digits */
|
/* \UXXXXXXXX */
|
||||||
case 'U':
|
case 'U':
|
||||||
for (chr = 0, i = 0; i < 8; i++) {
|
digits = 8;
|
||||||
|
message = "truncated \\UXXXXXXXX escape";
|
||||||
|
hexescape:
|
||||||
|
chr = 0;
|
||||||
|
for (i = 0; i < digits; i++) {
|
||||||
c = (unsigned char) s[i];
|
c = (unsigned char) s[i];
|
||||||
if (!isxdigit(c)) {
|
if (!isxdigit(c)) {
|
||||||
if (unicodeescape_decoding_error(&s, &x, errors,
|
if (unicodeescape_decoding_error(&s, &x, errors, message))
|
||||||
"truncated \\uXXXX"))
|
|
||||||
goto onError;
|
goto onError;
|
||||||
|
chr = x;
|
||||||
i++;
|
i++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1230,64 +1204,6 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
chr += 10 + c - 'A';
|
chr += 10 + c - 'A';
|
||||||
}
|
}
|
||||||
s += i;
|
s += i;
|
||||||
goto store;
|
|
||||||
|
|
||||||
case 'N':
|
|
||||||
/* Ok, we need to deal with Unicode Character Names now,
|
|
||||||
* make sure we've imported the hash table data...
|
|
||||||
*/
|
|
||||||
if (ucnhash_CAPI == NULL) {
|
|
||||||
PyObject *mod = 0, *v = 0;
|
|
||||||
mod = PyImport_ImportModule("unicodedata");
|
|
||||||
if (mod == NULL)
|
|
||||||
goto ucnhashError;
|
|
||||||
v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
|
|
||||||
Py_DECREF(mod);
|
|
||||||
if (v == NULL)
|
|
||||||
goto ucnhashError;
|
|
||||||
ucnhash_CAPI = PyCObject_AsVoidPtr(v);
|
|
||||||
Py_DECREF(v);
|
|
||||||
if (ucnhash_CAPI == NULL)
|
|
||||||
goto ucnhashError;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*s == '{') {
|
|
||||||
const char *start = s + 1;
|
|
||||||
const char *endBrace = start;
|
|
||||||
|
|
||||||
/* look for the closing brace */
|
|
||||||
while (*endBrace != '}' && endBrace < end)
|
|
||||||
endBrace++;
|
|
||||||
if (endBrace != end && *endBrace == '}') {
|
|
||||||
if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
|
|
||||||
if (unicodeescape_decoding_error(
|
|
||||||
&s, &x, errors,
|
|
||||||
"Invalid Unicode Character Name")
|
|
||||||
)
|
|
||||||
goto onError;
|
|
||||||
goto ucnFallthrough;
|
|
||||||
}
|
|
||||||
s = endBrace + 1;
|
|
||||||
goto store;
|
|
||||||
} else {
|
|
||||||
if (unicodeescape_decoding_error(
|
|
||||||
&s, &x, errors,
|
|
||||||
"Unicode name missing closing brace"))
|
|
||||||
goto onError;
|
|
||||||
goto ucnFallthrough;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (unicodeescape_decoding_error(
|
|
||||||
&s, &x, errors,
|
|
||||||
"Missing opening brace for Unicode Character Name escape"))
|
|
||||||
goto onError;
|
|
||||||
ucnFallthrough:
|
|
||||||
/* fall through on purpose */
|
|
||||||
default:
|
|
||||||
*p++ = '\\';
|
|
||||||
*p++ = (unsigned char)s[-1];
|
|
||||||
break;
|
|
||||||
store:
|
store:
|
||||||
/* when we get here, chr is a 32-bit unicode character */
|
/* when we get here, chr is a 32-bit unicode character */
|
||||||
if (chr <= 0xffff)
|
if (chr <= 0xffff)
|
||||||
|
@ -1301,10 +1217,53 @@ store:
|
||||||
} else {
|
} else {
|
||||||
if (unicodeescape_decoding_error(
|
if (unicodeescape_decoding_error(
|
||||||
&s, &x, errors,
|
&s, &x, errors,
|
||||||
"Illegal Unicode character")
|
"illegal Unicode character")
|
||||||
)
|
)
|
||||||
goto onError;
|
goto onError;
|
||||||
|
*p++ = x; /* store replacement character */
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* \N{name} */
|
||||||
|
case 'N':
|
||||||
|
message = "malformed \\N character escape";
|
||||||
|
if (ucnhash_CAPI == NULL) {
|
||||||
|
/* load the unicode data module */
|
||||||
|
PyObject *m, *v;
|
||||||
|
m = PyImport_ImportModule("unicodedata");
|
||||||
|
if (m == NULL)
|
||||||
|
goto ucnhashError;
|
||||||
|
v = PyObject_GetAttrString(m, "ucnhash_CAPI");
|
||||||
|
Py_DECREF(m);
|
||||||
|
if (v == NULL)
|
||||||
|
goto ucnhashError;
|
||||||
|
ucnhash_CAPI = PyCObject_AsVoidPtr(v);
|
||||||
|
Py_DECREF(v);
|
||||||
|
if (ucnhash_CAPI == NULL)
|
||||||
|
goto ucnhashError;
|
||||||
|
}
|
||||||
|
if (*s == '{') {
|
||||||
|
const char *start = s+1;
|
||||||
|
/* look for the closing brace */
|
||||||
|
while (*s != '}' && s < end)
|
||||||
|
s++;
|
||||||
|
if (s > start && s < end && *s == '}') {
|
||||||
|
/* found a name. look it up in the unicode database */
|
||||||
|
message = "unknown Unicode character name";
|
||||||
|
s++;
|
||||||
|
if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
|
||||||
|
goto store;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (unicodeescape_decoding_error(&s, &x, errors, message))
|
||||||
|
goto onError;
|
||||||
|
*p++ = x;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
*p++ = '\\';
|
||||||
|
*p++ = (unsigned char)s[-1];
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (_PyUnicode_Resize(v, (int)(p - buf)))
|
if (_PyUnicode_Resize(v, (int)(p - buf)))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue