Issue #10829: Refactor PyUnicode_FromFormat()

* Use the same function to parse the format string in the 3 steps * Fix crashes on invalid format strings
2025-10-21 22:22:48 +00:00 · 2011-03-01 23:44:09 +00:00 · 2011-03-01 23:44:09 +00:00 · 968654515f
commit 968654515f
parent 096f1a85f0
3 changed files with 103 additions and 68 deletions
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -1455,9 +1455,28 @@ class UnicodeTest(string_tests.CommonTest,
            'string, got a non-ASCII byte: 0xe9$',
            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
        # test "%c"
        self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd')
        self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff')
        # test "%"
        self.assertEqual(PyUnicode_FromFormat(b'%'), '%')
        self.assertEqual(PyUnicode_FromFormat(b'%%'), '%')
        self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s')
        self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]')
        self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc')
        # test "%i"
        self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010')
        self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010')
        # not supported: copy the raw format string. these tests are just here
        # to check for crashs and should not be considered as specifications
        self.assertEqual(PyUnicode_FromFormat(b'%1%s', b'abc'), '%s')
        self.assertEqual(PyUnicode_FromFormat(b'%1abc'), '%1abc')
        self.assertEqual(PyUnicode_FromFormat(b'%+i', c_int(10)), '%+i')
        self.assertEqual(PyUnicode_FromFormat(b'%.%s', b'abc'), '%.%s')
        # other tests
        text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
        self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,9 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------
 - Issue #10829: Refactor PyUnicode_FromFormat(), use the same function to parse
  the format string in the 3 steps, fix crashes on invalid format strings.
 - Issue #11246: Fix PyUnicode_FromFormat("%V") to decode the byte string from
  UTF-8 (with replace error handler) instead of ISO-8859-1 (in strict mode).
  Patch written by Ray Allen.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -714,6 +714,70 @@ makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
    *fmt = '\0';
 }
 /* Helper for PyUnicode_FromFormatV(): parse the flags of one "%..."
  * format directive.
  *
  * On entry, f points to the '%' that starts the directive.  The function
  * reads, in order:
  *   - an optional decimal width,
  *   - an optional '.' followed by a decimal precision,
  *   - an optional length modifier: "l", "ll" (only when HAVE_LONG_LONG is
  *     defined) or "z", each recognized only when directly followed by
  *     'd' or 'u'.
  *
  * Every p_* output pointer may be NULL, in which case the corresponding
  * value is parsed but not stored.  Width and precision are 0 when absent;
  * each flag is 1 when its modifier was recognized and 0 otherwise.
  *
  * Returns the position reached after parsing.  In two bogus cases the
  * position is moved one character backward: when the precision is
  * directly followed by '%', and when the end of the string is reached
  * (see the comments in the body).
  *
  * No overflow check is done while accumulating width and precision.
  */
 static const char*
 parse_format_flags(const char *f,
                    int *p_width, int *p_precision,
                    int *p_longflag, int *p_longlongflag, int *p_size_tflag)
 {
    int width = 0;
    int precision = 0;
    /* All three flags must start at 0: each one is only set to 1 on its
       own branch below, and all of them are copied to the caller at the
       end whatever branch was taken. */
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the leading '%' */
    f++;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    while (Py_ISDIGIT((unsigned)*f))
        width = (width*10) + *f++ - '0';
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f))
            precision = (precision*10) + *f++ - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u') {
            longflag = 1;
            ++f;
        }
 #ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u')) {
            longlongflag = 1;
            f += 2;
        }
 #endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
        size_tflag = 1;
        ++f;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
 }
 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 /* size of fixed-size buffer for formatting single arguments */
@ -757,15 +821,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
     * result in an array) */
    for (f = format; *f; f++) {
         if (*f == '%') {
-             if (*(f+1)=='%')
+             /* skip width or width.precision (eg. "1.2" of "%1.2f") */
-                 continue;
+             f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
-             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
+             if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
                 ++callcount;
             while (Py_ISDIGIT((unsigned)*f))
                 width = (width*10) + *f++ - '0';
             while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
                 ;
             if (*f == 's')
                 ++callcount;
         }
         else if (128 <= (unsigned char)*f) {
@ -790,33 +848,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
    for (f = format; *f; f++) {
        if (*f == '%') {
 #ifdef HAVE_LONG_LONG
-            int longlongflag = 0;
+            int longlongflag;
 #endif
-            const char* p = f;
+            const char* p;
            width = 0;
            while (Py_ISDIGIT((unsigned)*f))
                width = (width*10) + *f++ - '0';
            while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
                ;
-            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
+            p = f;
-             * they don't affect the amount of space we reserve.
+            f = parse_format_flags(f, &width, NULL,
-             */
+                                   NULL, &longlongflag, NULL);
            if (*f == 'l') {
                if (f[1] == 'd' || f[1] == 'u') {
                    ++f;
                }
 #ifdef HAVE_LONG_LONG
                else if (f[1] == 'l' &&
                         (f[2] == 'd' || f[2] == 'u')) {
                    longlongflag = 1;
                    f += 2;
                }
 #endif
            }
            else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
                ++f;
            }
            switch (*f) {
            case 'c':
@ -981,40 +1019,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
    for (f = format; *f; f++) {
        if (*f == '%') {
-            const char* p = f++;
+            const char* p;
-            int longflag = 0;
+            int longflag;
-            int longlongflag = 0;
+            int longlongflag;
-            int size_tflag = 0;
+            int size_tflag;
-            zeropad = (*f == '0');
+
-            /* parse the width.precision part */
+            p = f;
-            width = 0;
+            zeropad = (f[1] == '0');
-            while (Py_ISDIGIT((unsigned)*f))
+            f = parse_format_flags(f, &width, &precision,
-                width = (width*10) + *f++ - '0';
+                                   &longflag, &longlongflag, &size_tflag);
            precision = 0;
            if (*f == '.') {
                f++;
                while (Py_ISDIGIT((unsigned)*f))
                    precision = (precision*10) + *f++ - '0';
            }
            /* Handle %ld, %lu, %lld and %llu. */
            if (*f == 'l') {
                if (f[1] == 'd' || f[1] == 'u') {
                    longflag = 1;
                    ++f;
                }
 #ifdef HAVE_LONG_LONG
                else if (f[1] == 'l' &&
                         (f[2] == 'd' || f[2] == 'u')) {
                    longlongflag = 1;
                    f += 2;
                }
 #endif
            }
            /* handle the size_t flag. */
            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
                size_tflag = 1;
                ++f;
            }
            switch (*f) {
            case 'c':