Issue #23055: Fixed a buffer overflow in PyUnicode_FromFormatV. Analysis and fix by Guido Vranken.
2025-12-04 00:30:19 +00:00 · 2015-01-27 22:18:34 +02:00 · 2015-01-27 22:18:34 +02:00 · 3f95292be6
commit 3f95292be6
parent f18bf6fd2d
3 changed files with 159 additions and 31 deletions
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -1661,7 +1661,10 @@ class UnicodeTest(string_tests.CommonTest,
    # Test PyUnicode_FromFormat()
    def test_from_format(self):
        support.import_module('ctypes')
-        from ctypes import pythonapi, py_object, c_int
+        from ctypes import (
            pythonapi, py_object, sizeof,
            c_int, c_long, c_longlong, c_ssize_t,
            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
        if sys.maxunicode == 65535:
            name = "PyUnicodeUCS2_FromFormat"
        else:
@ -1675,9 +1678,13 @@ class UnicodeTest(string_tests.CommonTest,
                for arg in args)
            return _PyUnicode_FromFormat(format, *cargs)
        def check_format(expected, format, *args):
            text = PyUnicode_FromFormat(format, *args)
            self.assertEqual(expected, text)
        # ascii format, non-ascii argument
-        text = PyUnicode_FromFormat(b'ascii\x7f=%U', 'unicode\xe9')
+        check_format('ascii\x7f=unicode\xe9',
-        self.assertEqual(text, 'ascii\x7f=unicode\xe9')
+                     b'ascii\x7f=%U', 'unicode\xe9')
        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
        # raises an error
@ -1686,25 +1693,131 @@ class UnicodeTest(string_tests.CommonTest,
            'string, got a non-ASCII byte: 0xe9$',
            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
-        self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd')
+        # test "%c"
-        self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff')
+        check_format('\uabcd',
                     b'%c', c_int(0xabcd))
        check_format('\U0010ffff',
                     b'%c', c_int(0x10ffff))
        with self.assertRaises(OverflowError):
            PyUnicode_FromFormat(b'%c', c_int(0x110000))
        # Issue #18183
        check_format('\U00010000\U00100000',
                     b'%c%c', c_int(0x10000), c_int(0x100000))
-        # other tests
+        # test "%"
-        text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
+        check_format('%',
-        self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")
+                     b'%')
        check_format('%',
                     b'%%')
        check_format('%s',
                     b'%%s')
        check_format('[%]',
                     b'[%%]')
        check_format('%abc',
                     b'%%%s', b'abc')
-        text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz')
+        # test %S
-        self.assertEqual(text, 'repr=abc')
+        check_format("repr=\u20acABC",
                     b'repr=%S', '\u20acABC')
        # test %R
        check_format("repr='\u20acABC'",
                     b'repr=%R', '\u20acABC')
        # test integer formats (%i, %d, %u)
        check_format('010',
                     b'%03i', c_int(10))
        check_format('0010',
                     b'%0.4i', c_int(10))
        check_format('-123',
                     b'%i', c_int(-123))
        check_format('-123',
                     b'%d', c_int(-123))
        check_format('-123',
                     b'%ld', c_long(-123))
        check_format('-123',
                     b'%lld', c_longlong(-123))
        check_format('-123',
                     b'%zd', c_ssize_t(-123))
        check_format('123',
                     b'%u', c_uint(123))
        check_format('123',
                     b'%lu', c_ulong(123))
        check_format('123',
                     b'%llu', c_ulonglong(123))
        check_format('123',
                     b'%zu', c_size_t(123))
        # test long output
        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
        max_longlong = -min_longlong - 1
        check_format(str(min_longlong),
                     b'%lld', c_longlong(min_longlong))
        check_format(str(max_longlong),
                     b'%lld', c_longlong(max_longlong))
        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
        check_format(str(max_ulonglong),
                     b'%llu', c_ulonglong(max_ulonglong))
        PyUnicode_FromFormat(b'%p', c_void_p(-1))
        # test padding (width and/or precision)
        check_format('123'.rjust(10, '0'),
                     b'%010i', c_int(123))
        check_format('123'.rjust(100),
                     b'%100i', c_int(123))
        check_format('123'.rjust(100, '0'),
                     b'%.100i', c_int(123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80i', c_int(123))
        check_format('123'.rjust(10, '0'),
                     b'%010u', c_uint(123))
        check_format('123'.rjust(100),
                     b'%100u', c_uint(123))
        check_format('123'.rjust(100, '0'),
                     b'%.100u', c_uint(123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80u', c_uint(123))
        check_format('123'.rjust(10, '0'),
                     b'%010x', c_int(0x123))
        check_format('123'.rjust(100),
                     b'%100x', c_int(0x123))
        check_format('123'.rjust(100, '0'),
                     b'%.100x', c_int(0x123))
        check_format('123'.rjust(80, '0').rjust(100),
                     b'%100.80x', c_int(0x123))
        # test %A
        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
        # test %V
        check_format('repr=abc',
                     b'repr=%V', 'abc', b'xyz')
        # Test string decode from parameter of %s using utf-8.
        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
        # '\u4eba\u6c11'
-        text = PyUnicode_FromFormat(b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
+        check_format('repr=\u4eba\u6c11',
-        self.assertEqual(text, 'repr=\u4eba\u6c11')
+                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
        #Test replace error handler.
-        text = PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff')
+        check_format('repr=abc\ufffd',
-        self.assertEqual(text, 'repr=abc\ufffd')
+                     b'repr=%V', None, b'abc\xff')
        # not supported: copy the raw format string. These tests are just here
        # to check for crashes and should not be considered as specifications
        check_format('%s',
                     b'%1%s', b'abc')
        check_format('%1abc',
                     b'%1abc')
        check_format('%+i',
                     b'%+i', c_int(10))
        check_format('%s',
                     b'%.%s', b'abc')
    # Test PyUnicode_AsWideChar()
    def test_aswidechar(self):
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -2,6 +2,18 @@
 Python News
 +++++++++++
 What's New in Python 3.2.7?
 ============================
 *Release date: XXXX-XX-XX*
 Core and Builtins
 -----------------
 - Issue #23055: Fixed a buffer overflow in PyUnicode_FromFormatV.  Analysis
  and fix by Guido Vranken.
 What's New in Python 3.2.6?
 ===========================
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -759,15 +759,10 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
     * result in an array) */
    for (f = format; *f; f++) {
         if (*f == '%') {
-             if (*(f+1)=='%')
+             f++;
-                 continue;
+             while (*f && *f != '%' && !Py_ISALPHA((unsigned)*f))
-             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
+                 f++;
-                 ++callcount;
+             if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
             while (Py_ISDIGIT((unsigned)*f))
                 width = (width*10) + *f++ - '0';
             while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
                 ;
             if (*f == 's')
                 ++callcount;
         }
         else if (128 <= (unsigned char)*f) {
@ -794,12 +789,16 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
 #ifdef HAVE_LONG_LONG
            int longlongflag = 0;
 #endif
-            const char* p = f;
+            const char* p = f++;
            width = 0;
            while (Py_ISDIGIT((unsigned)*f))
                width = (width*10) + *f++ - '0';
-            while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
+            precision = 0;
-                ;
+            if (*f == '.') {
                f++;
                while (Py_ISDIGIT((unsigned)*f))
                    precision = (precision*10) + *f++ - '0';
            }
            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
             * they don't affect the amount of space we reserve.
@ -823,16 +822,18 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
            switch (*f) {
            case 'c':
            {
 #ifndef Py_UNICODE_WIDE
                int ordinal = va_arg(count, int);
                if (ordinal < 0 || ordinal > 0x10ffff) {
                    PyErr_SetString(PyExc_OverflowError,
                                    "%c arg not in range(0x110000)");
                    goto fail;
                }
 #ifndef Py_UNICODE_WIDE
                if (ordinal > 0xffff)
                    n += 2;
                else
                    n++;
 #else
                (void)va_arg(count, int);
                n++;
 #endif
                n++;
                break;
            }
            case '%':
@ -840,6 +841,8 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
                break;
            case 'd': case 'u': case 'i': case 'x':
                (void) va_arg(count, int);
                if (width < precision)
                    width = precision;
 #ifdef HAVE_LONG_LONG
                if (longlongflag) {
                    if (width < MAX_LONG_LONG_CHARS)