Changed \x to consume exactly two hex digits, also for Unicode strings. Closes PEP 223.

Also added the \U escape (eight hex digits).
This commit is contained in:
Fredrik Lundh 2000-09-03 11:29:49 +00:00
parent 03dd010b4f
commit df84675f93

View file

@ -1163,6 +1163,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
PyUnicodeObject *v; PyUnicodeObject *v;
Py_UNICODE *p = NULL, *buf = NULL; Py_UNICODE *p = NULL, *buf = NULL;
const char *end; const char *end;
Py_UCS4 chr;
/* Escaped strings will always be longer than the resulting /* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the Unicode string, so we start with size here and then reduce the
@ -1214,28 +1215,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = x; *p++ = x;
break; break;
/* \xXXXX escape with 1-n hex digits. for compatibility /* \xXX with two hex digits */
with 8-bit strings, this code ignores all but the last
two digits */
case 'x': case 'x':
x = 0; for (x = 0, i = 0; i < 2; i++) {
c = (unsigned char)*s; c = (unsigned char)s[i];
if (isxdigit(c)) { if (!isxdigit(c)) {
do { if (unicodeescape_decoding_error(&s, &x, errors,
x = (x<<4) & 0xF0; "truncated \\xXX"))
if ('0' <= c && c <= '9') goto onError;
i++;
break;
}
x = (x<<4) & ~0xF;
if (c >= '0' && c <= '9')
x += c - '0'; x += c - '0';
else if ('a' <= c && c <= 'f') else if (c >= 'a' && c <= 'f')
x += 10 + c - 'a'; x += 10 + c - 'a';
else else
x += 10 + c - 'A'; x += 10 + c - 'A';
c = (unsigned char)*++s;
} while (isxdigit(c));
*p++ = (unsigned char) x;
} else {
*p++ = '\\';
*p++ = (unsigned char)s[-1];
} }
s += i;
*p++ = x;
break; break;
/* \uXXXX with 4 hex digits */ /* \uXXXX with 4 hex digits */
@ -1261,36 +1261,50 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = x; *p++ = x;
break; break;
/* \UXXXXXXXX with 8 hex digits */
case 'U':
for (chr = 0, i = 0; i < 8; i++) {
c = (unsigned char)s[i];
if (!isxdigit(c)) {
if (unicodeescape_decoding_error(&s, &x, errors,
"truncated \\uXXXX"))
goto onError;
i++;
break;
}
chr = (chr<<4) & ~0xF;
if (c >= '0' && c <= '9')
chr += c - '0';
else if (c >= 'a' && c <= 'f')
chr += 10 + c - 'a';
else
chr += 10 + c - 'A';
}
s += i;
goto store;
case 'N': case 'N':
/* Ok, we need to deal with Unicode Character Names now, /* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data... * make sure we've imported the hash table data...
*/ */
if (pucnHash == NULL) if (pucnHash == NULL) {
{
PyObject *mod = 0, *v = 0; PyObject *mod = 0, *v = 0;
mod = PyImport_ImportModule("ucnhash"); mod = PyImport_ImportModule("ucnhash");
if (mod == NULL) if (mod == NULL)
goto onError; goto onError;
v = PyObject_GetAttrString(mod,"ucnhashAPI"); v = PyObject_GetAttrString(mod,"ucnhashAPI");
Py_DECREF(mod); Py_DECREF(mod);
if (v == NULL) if (v == NULL)
{
goto onError; goto onError;
}
pucnHash = PyCObject_AsVoidPtr(v); pucnHash = PyCObject_AsVoidPtr(v);
Py_DECREF(v); Py_DECREF(v);
if (pucnHash == NULL) if (pucnHash == NULL)
{
goto onError; goto onError;
} }
}
if (*s == '{') if (*s == '{') {
{
const char *start = s + 1; const char *start = s + 1;
const char *endBrace = start; const char *endBrace = start;
Py_UCS4 value;
unsigned long j; unsigned long j;
/* look for either the closing brace, or we /* look for either the closing brace, or we
@ -1303,8 +1317,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
{ {
endBrace++; endBrace++;
} }
if (endBrace != end && *endBrace == '}') if (endBrace != end && *endBrace == '}') {
{
j = pucnHash->hash(start, endBrace - start); j = pucnHash->hash(start, endBrace - start);
if (j > pucnHash->cKeys || if (j > pucnHash->cKeys ||
mystrnicmp( mystrnicmp(
@ -1321,30 +1334,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
} }
goto ucnFallthrough; goto ucnFallthrough;
} }
value = ((_Py_UnicodeCharacterName *) chr = ((_Py_UnicodeCharacterName *)
(pucnHash->getValue(j)))->value; (pucnHash->getValue(j)))->value;
if (value < 1<<16)
{
/* In UCS-2 range, easy solution.. */
*p++ = value;
}
else
{
/* Oops, its in UCS-4 space, */
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFFF */
value -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
*p++ = 0xD800 + (value >> 10);
/* low surrogate = bottom 10 bits added to DC00 */
*p++ = 0xDC00 + (value & ~0xFC00);
}
s = endBrace + 1; s = endBrace + 1;
} goto store;
else } else {
{
if (unicodeescape_decoding_error( if (unicodeescape_decoding_error(
&s, &x, errors, &s, &x, errors,
"Unicode name missing closing brace")) "Unicode name missing closing brace"))
@ -1363,6 +1357,23 @@ ucnFallthrough:
*p++ = '\\'; *p++ = '\\';
*p++ = (unsigned char)s[-1]; *p++ = (unsigned char)s[-1];
break; break;
store:
/* when we get here, chr is a 32-bit unicode character */
if (chr <= 0xffff)
/* UCS-2 character */
*p++ = (Py_UNICODE) chr;
else if (chr <= 0x10ffff) {
/* UCS-4 character. store as two surrogate characters */
chr -= 0x10000L;
*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
} else {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Illegal Unicode character")
)
goto onError;
}
} }
} }
if (_PyUnicode_Resize(v, (int)(p - buf))) if (_PyUnicode_Resize(v, (int)(p - buf)))