Issue #24848: Fixed bugs in UTF-7 decoding of malformed data:

1. Non-ASCII bytes were accepted after a shift sequence.
2. A low surrogate could be emitted in case of an error in the high surrogate.
3. In some circumstances the '\xfd' character was produced instead of the replacement character '\ufffd' (due to a bug in _PyUnicodeWriter).
2025-07-13 14:25:18 +00:00 · 2015-10-02 13:13:14 +03:00 · 2015-10-02 13:13:14 +03:00 · 58c8f2bb6d
commit 58c8f2bb6d
parent b9d98d532c 28b21e50c8
4 changed files with 75 additions and 11 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4330,31 +4330,31 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
-                s++;
-                if (surrogate) {
-                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
-                        goto onError;
-                    surrogate = 0;
-                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
+                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
+                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
-                if (ch != '-') {
+                if (surrogate && DECODE_DIRECT(ch)) {
+                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
+                        goto onError;
+                }
+                surrogate = 0;
+                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
-                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
-                        goto onError;
+                    s++;
                }
            }
        }
@@ -4368,6 +4368,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
            }
            else { /* begin base64-encoded section */
                inShift = 1;
+                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
@@ -4399,6 +4400,7 @@ utf7Error:

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
+        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
@@ -13291,6 +13293,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
+            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;