reorganized PyUnicode_DecodeUnicodeEscape a bit (in order to make it
less likely that bug #132817 ever appears again)
2025-10-17 04:08:28 +00:00 · 2001-02-18 22:13:49 +00:00 · 2001-02-18 22:13:49 +00:00 · ccc7473fc8
commit ccc7473fc8
parent b95896b2d2
1 changed files with 73 additions and 114 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1110,9 +1110,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
 					const char *errors)
 {
    PyUnicodeObject *v;
-    Py_UNICODE *p = NULL, *buf = NULL;
+    Py_UNICODE *p, *buf;
    const char *end;
-    Py_UCS4 chr;
+    char* message;
    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
@ -1122,12 +1123,14 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = buf = PyUnicode_AS_UNICODE(v);
    end = s + size;
    while (s < end) {
        unsigned char c;
        Py_UNICODE x;
-        int i;
+        int i, digits;
        /* Non-escape characters are interpreted as Unicode ordinals */
        if (*s != '\\') {
@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
            *p++ = x;
            break;
-        /* \xXX with two hex digits */
+        /* hex escapes */
        /* \xXX */
        case 'x':
-            for (x = 0, i = 0; i < 2; i++) {
+            digits = 2;
-                c = (unsigned char)s[i];
+            message = "truncated \\xXX escape";
-                if (!isxdigit(c)) {
+            goto hexescape;
                    if (unicodeescape_decoding_error(&s, &x, errors,
                                                     "truncated \\xXX"))
                        goto onError;
                    i++;
                    break;
                }
                x = (x<<4) & ~0xF;
                if (c >= '0' && c <= '9')
                    x += c - '0';
                else if (c >= 'a' && c <= 'f')
                    x += 10 + c - 'a';
                else
                    x += 10 + c - 'A';
            }
            s += i;
            *p++ = x;
            break;
-        /* \uXXXX with 4 hex digits */
+        /* \uXXXX */
        case 'u':
-            for (x = 0, i = 0; i < 4; i++) {
+            digits = 4;
-                c = (unsigned char)s[i];
+            message = "truncated \\uXXXX escape";
-                if (!isxdigit(c)) {
+            goto hexescape;
                    if (unicodeescape_decoding_error(&s, &x, errors,
                                                     "truncated \\uXXXX"))
                        goto onError;
                    i++;
                    break;
                }
                x = (x<<4) & ~0xF;
                if (c >= '0' && c <= '9')
                    x += c - '0';
                else if (c >= 'a' && c <= 'f')
                    x += 10 + c - 'a';
                else
                    x += 10 + c - 'A';
            }
            s += i;
            *p++ = x;
            break;
-        /* \UXXXXXXXX with 8 hex digits */
+        /* \UXXXXXXXX */
        case 'U':
-            for (chr = 0, i = 0; i < 8; i++) {
+            digits = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            chr = 0;
            for (i = 0; i < digits; i++) {
                c = (unsigned char) s[i];
                if (!isxdigit(c)) {
-                    if (unicodeescape_decoding_error(&s, &x, errors,
+                    if (unicodeescape_decoding_error(&s, &x, errors, message))
                                                     "truncated \\uXXXX"))
                        goto onError;
                    chr = x;
                    i++;
                    break;
                }
@ -1230,64 +1204,6 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                    chr += 10 + c - 'A';
            }
            s += i;
            goto store;
        case 'N':
            /* Ok, we need to deal with Unicode Character Names now,
             * make sure we've imported the hash table data...
             */
            if (ucnhash_CAPI == NULL) {
                PyObject *mod = 0, *v = 0;
                mod = PyImport_ImportModule("unicodedata");
                if (mod == NULL)
                    goto ucnhashError;
                v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
                Py_DECREF(mod);
                if (v == NULL)
                    goto ucnhashError;
                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
                Py_DECREF(v);
                if (ucnhash_CAPI == NULL)
                    goto ucnhashError;
            }
            if (*s == '{') {
                const char *start = s + 1;
                const char *endBrace = start;
                /* look for the closing brace */
                while (*endBrace != '}' && endBrace < end)
                    endBrace++;
                if (endBrace != end && *endBrace == '}') {
                    if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
                        if (unicodeescape_decoding_error(
                                &s, &x, errors,
                                "Invalid Unicode Character Name")
                            )
                            goto onError;
                        goto ucnFallthrough;
                    }
                    s = endBrace + 1;
                    goto store;
                } else {
                    if (unicodeescape_decoding_error(
                            &s, &x, errors,
                            "Unicode name missing closing brace"))
                        goto onError;
                    goto ucnFallthrough;
                }
                break;                
            }
            if (unicodeescape_decoding_error(
                    &s, &x, errors,
                    "Missing opening brace for Unicode Character Name escape"))
                goto onError;
 ucnFallthrough:
            /* fall through on purpose */
 		default:
            *p++ = '\\';
            *p++ = (unsigned char)s[-1];
            break;
        store:
            /* when we get here, chr is a 32-bit unicode character */
            if (chr <= 0xffff)
@ -1301,10 +1217,53 @@ store:
            } else {
                if (unicodeescape_decoding_error(
                    &s, &x, errors,
-                    "Illegal Unicode character")
+                    "illegal Unicode character")
                    )
                    goto onError;
                *p++ = x; /* store replacement character */
            }
            break;
        /* \N{name} */
        case 'N':
            message = "malformed \\N character escape";
            if (ucnhash_CAPI == NULL) {
                /* load the unicode data module */
                PyObject *m, *v;
                m = PyImport_ImportModule("unicodedata");
                if (m == NULL)
                    goto ucnhashError;
                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
                Py_DECREF(m);
                if (v == NULL)
                    goto ucnhashError;
                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
                Py_DECREF(v);
                if (ucnhash_CAPI == NULL)
                    goto ucnhashError;
            }
            if (*s == '{') {
                const char *start = s+1;
                /* look for the closing brace */
                while (*s != '}' && s < end)
                    s++;
                if (s > start && s < end && *s == '}') {
                    /* found a name.  look it up in the unicode database */
                    message = "unknown Unicode character name";
                    s++;
                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
                        goto store;
                }
            }
            if (unicodeescape_decoding_error(&s, &x, errors, message))
                goto onError;
            *p++ = x;
            break;
        default:
            *p++ = '\\';
            *p++ = (unsigned char)s[-1];
            break;
        }
    }
    if (_PyUnicode_Resize(v, (int)(p - buf)))