Reorganized PyUnicode_DecodeUnicodeEscape a bit (in order to make it less likely that bug #132817 ever appears again).
This commit is contained in:
Fredrik Lundh 2001-02-18 22:13:49 +00:00
parent b95896b2d2
commit ccc7473fc8

View file

@ -1110,9 +1110,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
const char *errors) const char *errors)
{ {
PyUnicodeObject *v; PyUnicodeObject *v;
Py_UNICODE *p = NULL, *buf = NULL; Py_UNICODE *p, *buf;
const char *end; const char *end;
Py_UCS4 chr; char* message;
Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
/* Escaped strings will always be longer than the resulting /* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the Unicode string, so we start with size here and then reduce the
@ -1122,12 +1123,14 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
goto onError; goto onError;
if (size == 0) if (size == 0)
return (PyObject *)v; return (PyObject *)v;
p = buf = PyUnicode_AS_UNICODE(v); p = buf = PyUnicode_AS_UNICODE(v);
end = s + size; end = s + size;
while (s < end) { while (s < end) {
unsigned char c; unsigned char c;
Py_UNICODE x; Py_UNICODE x;
int i; int i, digits;
/* Non-escape characters are interpreted as Unicode ordinals */ /* Non-escape characters are interpreted as Unicode ordinals */
if (*s != '\\') { if (*s != '\\') {
@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = x; *p++ = x;
break; break;
/* \xXX with two hex digits */ /* hex escapes */
/* \xXX */
case 'x': case 'x':
for (x = 0, i = 0; i < 2; i++) { digits = 2;
c = (unsigned char)s[i]; message = "truncated \\xXX escape";
if (!isxdigit(c)) { goto hexescape;
if (unicodeescape_decoding_error(&s, &x, errors,
"truncated \\xXX"))
goto onError;
i++;
break;
}
x = (x<<4) & ~0xF;
if (c >= '0' && c <= '9')
x += c - '0';
else if (c >= 'a' && c <= 'f')
x += 10 + c - 'a';
else
x += 10 + c - 'A';
}
s += i;
*p++ = x;
break;
/* \uXXXX with 4 hex digits */ /* \uXXXX */
case 'u': case 'u':
for (x = 0, i = 0; i < 4; i++) { digits = 4;
c = (unsigned char)s[i]; message = "truncated \\uXXXX escape";
if (!isxdigit(c)) { goto hexescape;
if (unicodeescape_decoding_error(&s, &x, errors,
"truncated \\uXXXX"))
goto onError;
i++;
break;
}
x = (x<<4) & ~0xF;
if (c >= '0' && c <= '9')
x += c - '0';
else if (c >= 'a' && c <= 'f')
x += 10 + c - 'a';
else
x += 10 + c - 'A';
}
s += i;
*p++ = x;
break;
/* \UXXXXXXXX with 8 hex digits */ /* \UXXXXXXXX */
case 'U': case 'U':
for (chr = 0, i = 0; i < 8; i++) { digits = 8;
message = "truncated \\UXXXXXXXX escape";
hexescape:
chr = 0;
for (i = 0; i < digits; i++) {
c = (unsigned char) s[i]; c = (unsigned char) s[i];
if (!isxdigit(c)) { if (!isxdigit(c)) {
if (unicodeescape_decoding_error(&s, &x, errors, if (unicodeescape_decoding_error(&s, &x, errors, message))
"truncated \\uXXXX"))
goto onError; goto onError;
chr = x;
i++; i++;
break; break;
} }
@ -1230,64 +1204,6 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
chr += 10 + c - 'A'; chr += 10 + c - 'A';
} }
s += i; s += i;
goto store;
case 'N':
/* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data...
*/
if (ucnhash_CAPI == NULL) {
PyObject *mod = 0, *v = 0;
mod = PyImport_ImportModule("unicodedata");
if (mod == NULL)
goto ucnhashError;
v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
Py_DECREF(mod);
if (v == NULL)
goto ucnhashError;
ucnhash_CAPI = PyCObject_AsVoidPtr(v);
Py_DECREF(v);
if (ucnhash_CAPI == NULL)
goto ucnhashError;
}
if (*s == '{') {
const char *start = s + 1;
const char *endBrace = start;
/* look for the closing brace */
while (*endBrace != '}' && endBrace < end)
endBrace++;
if (endBrace != end && *endBrace == '}') {
if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Invalid Unicode Character Name")
)
goto onError;
goto ucnFallthrough;
}
s = endBrace + 1;
goto store;
} else {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Unicode name missing closing brace"))
goto onError;
goto ucnFallthrough;
}
break;
}
if (unicodeescape_decoding_error(
&s, &x, errors,
"Missing opening brace for Unicode Character Name escape"))
goto onError;
ucnFallthrough:
/* fall through on purpose */
default:
*p++ = '\\';
*p++ = (unsigned char)s[-1];
break;
store: store:
/* when we get here, chr is a 32-bit unicode character */ /* when we get here, chr is a 32-bit unicode character */
if (chr <= 0xffff) if (chr <= 0xffff)
@ -1301,10 +1217,53 @@ store:
} else { } else {
if (unicodeescape_decoding_error( if (unicodeescape_decoding_error(
&s, &x, errors, &s, &x, errors,
"Illegal Unicode character") "illegal Unicode character")
) )
goto onError; goto onError;
*p++ = x; /* store replacement character */
} }
break;
/* \N{name} */
case 'N':
message = "malformed \\N character escape";
if (ucnhash_CAPI == NULL) {
/* load the unicode data module */
PyObject *m, *v;
m = PyImport_ImportModule("unicodedata");
if (m == NULL)
goto ucnhashError;
v = PyObject_GetAttrString(m, "ucnhash_CAPI");
Py_DECREF(m);
if (v == NULL)
goto ucnhashError;
ucnhash_CAPI = PyCObject_AsVoidPtr(v);
Py_DECREF(v);
if (ucnhash_CAPI == NULL)
goto ucnhashError;
}
if (*s == '{') {
const char *start = s+1;
/* look for the closing brace */
while (*s != '}' && s < end)
s++;
if (s > start && s < end && *s == '}') {
/* found a name. look it up in the unicode database */
message = "unknown Unicode character name";
s++;
if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
goto store;
}
}
if (unicodeescape_decoding_error(&s, &x, errors, message))
goto onError;
*p++ = x;
break;
default:
*p++ = '\\';
*p++ = (unsigned char)s[-1];
break;
} }
} }
if (_PyUnicode_Resize(v, (int)(p - buf))) if (_PyUnicode_Resize(v, (int)(p - buf)))