Changed \x to consume exactly two hex digits, also for Unicode strings.

Closes PEP 223. Also added the \U escape (eight hex digits).
2025-10-21 22:22:48 +00:00 · 2000-09-03 11:29:49 +00:00 · 2000-09-03 11:29:49 +00:00 · df84675f93
commit df84675f93
parent 03dd010b4f
1 changed files with 66 additions and 55 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1163,6 +1163,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
    PyUnicodeObject *v;
    Py_UNICODE *p = NULL, *buf = NULL;
    const char *end;
    Py_UCS4 chr;
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
@ -1214,28 +1215,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
            *p++ = x;
            break;
-        /* \xXXXX escape with 1-n hex digits.  for compatibility
+        /* \xXX with two hex digits */
           with 8-bit strings, this code ignores all but the last
           two digits */
        case 'x':
-            x = 0;
+            for (x = 0, i = 0; i < 2; i++) {
-            c = (unsigned char)*s;
+                c = (unsigned char)s[i];
-            if (isxdigit(c)) {
+                if (!isxdigit(c)) {
-                do {
+                    if (unicodeescape_decoding_error(&s, &x, errors,
-                    x = (x<<4) & 0xF0;
+                                                     "truncated \\xXX"))
-                    if ('0' <= c && c <= '9')
+                        goto onError;
-                        x += c - '0';
+                    i++;
-                    else if ('a' <= c && c <= 'f')
+                    break;
-                        x += 10 + c - 'a';
+                }
-                    else
+                x = (x<<4) & ~0xF;
-                        x += 10 + c - 'A';
+                if (c >= '0' && c <= '9')
-                    c = (unsigned char)*++s;
+                    x += c - '0';
-                } while (isxdigit(c));
+                else if (c >= 'a' && c <= 'f')
-                *p++ = (unsigned char) x;
+                    x += 10 + c - 'a';
-            } else {
+                else
-                *p++ = '\\';
+                    x += 10 + c - 'A';
                *p++ = (unsigned char)s[-1];
            }
            s += i;
            *p++ = x;
            break;
        /* \uXXXX with 4 hex digits */
@ -1261,36 +1261,50 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
            *p++ = x;
            break;
        /* \UXXXXXXXX with 8 hex digits */
        case 'U':
            for (chr = 0, i = 0; i < 8; i++) {
                c = (unsigned char)s[i];
                if (!isxdigit(c)) {
                    if (unicodeescape_decoding_error(&s, &x, errors,
                                                     "truncated \\uXXXX"))
                        goto onError;
                    i++;
                    break;
                }
                chr = (chr<<4) & ~0xF;
                if (c >= '0' && c <= '9')
                    chr += c - '0';
                else if (c >= 'a' && c <= 'f')
                    chr += 10 + c - 'a';
                else
                    chr += 10 + c - 'A';
            }
            s += i;
            goto store;
        case 'N':
            /* Ok, we need to deal with Unicode Character Names now,
             * make sure we've imported the hash table data...
             */
-            if (pucnHash == NULL)
+            if (pucnHash == NULL) {
            {
                PyObject *mod = 0, *v = 0;
                mod = PyImport_ImportModule("ucnhash");
                if (mod == NULL)
                    goto onError;
                v = PyObject_GetAttrString(mod,"ucnhashAPI");
                Py_DECREF(mod);
                if (v == NULL)
                {
                    goto onError;
                }
                pucnHash = PyCObject_AsVoidPtr(v);
                Py_DECREF(v);
                if (pucnHash == NULL)
                {
                    goto onError;
                }
            }
-            if (*s == '{')
+            if (*s == '{') {
            {
                const char *start = s + 1;
                const char *endBrace = start;
                Py_UCS4 value;
                unsigned long j;
                /* look for either the closing brace, or we
@ -1303,8 +1317,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                {
                    endBrace++;
                }
-                if (endBrace != end && *endBrace == '}')
+                if (endBrace != end && *endBrace == '}') {
                {
                    j = pucnHash->hash(start, endBrace - start);
                    if (j > pucnHash->cKeys ||
                        mystrnicmp(
@ -1321,30 +1334,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                        }
                        goto ucnFallthrough;
                    }
-                    value = ((_Py_UnicodeCharacterName *)
+                    chr = ((_Py_UnicodeCharacterName *)
-                               (pucnHash->getValue(j)))->value;
+                           (pucnHash->getValue(j)))->value;
                    if (value < 1<<16)
                    {
                        /* In UCS-2 range, easy solution.. */
                        *p++ = value;
                    }
                    else
                    {
                        /* Oops, its in UCS-4 space, */
                        /*  compute and append the two surrogates: */
                        /*  translate from 10000..10FFFF to 0..FFFFF */
                        value -= 0x10000;
                        /* high surrogate = top 10 bits added to D800 */
                        *p++ = 0xD800 + (value >> 10);
                        /* low surrogate  = bottom 10 bits added to DC00 */
                        *p++ = 0xDC00 + (value & ~0xFC00);
                    }
                    s = endBrace + 1;
-                }
+                    goto store;
-                else
+                } else {
                {
                    if (unicodeescape_decoding_error(
                            &s, &x, errors,
                            "Unicode name missing closing brace"))
@ -1363,6 +1357,23 @@ ucnFallthrough:
            *p++ = '\\';
            *p++ = (unsigned char)s[-1];
            break;
 store:
            /* when we get here, chr is a 32-bit unicode character */
            if (chr <= 0xffff)
                /* UCS-2 character */
                *p++ = (Py_UNICODE) chr;
            else if (chr <= 0x10ffff) {
                /* UCS-4 character.  store as two surrogate characters */
                chr -= 0x10000L;
                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
                *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
            } else {
                if (unicodeescape_decoding_error(
                    &s, &x, errors,
                    "Illegal Unicode character")
                    )
                    goto onError;
            }
        }
    }
    if (_PyUnicode_Resize(v, (int)(p - buf)))