Derived from Martin's SF patch 110609: support unbounded ints in %d,i,u,x,X,o formats.

Note a curious extension to the std C rules: x, X and o formatting can never produce a sign character in C, so the '+' and ' ' flags are meaningless for them. But unbounded ints *can* produce a sign character under these conversions (no fixed- width bitstring is wide enough to hold all negative values in 2's-comp form). So these flags become meaningful in Python when formatting a Python long which is too big to fit in a C long. This required shuffling around existing code, which hacked x and X conversions to death when both the '#' and '0' flags were specified: the hacks weren't strong enough to deal with the simultaneous possibility of the ' ' or '+' flags too, since signs were always meaningless before for x and X conversions. Isomorphic shuffling was required in unicodeobject.c. Also added dozens of non-trivial new unbounded-int test cases to test_format.py.
2025-07-24 03:35:53 +00:00 · 2000-09-21 05:43:11 +00:00 · 2000-09-21 05:43:11 +00:00 · 38fd5b6413
commit 38fd5b6413
parent 31575ce817
4 changed files with 409 additions and 75 deletions
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@ -2427,6 +2427,13 @@ getnextarg(PyObject *args, int arglen, int *p_argidx)
 	return NULL;
 }

+/* Format codes
+ * F_LJUST	'-'
+ * F_SIGN	'+'
+ * F_BLANK	' '
+ * F_ALT	'#'
+ * F_ZERO	'0'
+ */
 #define F_LJUST (1<<0)
 #define F_SIGN	(1<<1)
 #define F_BLANK (1<<2)
@ -2464,22 +2471,164 @@ formatfloat(char *buf, size_t buflen, int flags,
 	return strlen(buf);
 }

+/* _PyString_FormatLong emulates the format codes d, u, o, x and X, and
+ * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
+ * Python's regular ints.
+ * Return value:  a new PyString*, or NULL if error.
+ *  .  *pbuf is set to point into it,
+ *     *plen set to the # of chars following that.
+ *     Caller must decref it when done using pbuf.
+ *     The string starting at *pbuf is of the form
+ *         "-"? ("0x" | "0X")? digit+
+ *     "0x"/"0X" are present only for x and X conversions, with F_ALT
+ *         set in flags.  The case of hex digits will be correct, 
+ *     There will be at least prec digits, zero-filled on the left if
+ *         necessary to get that many.
+ * val		object to be converted
+ * flags	bitmask of format flags; only F_ALT is looked at
+ * prec		minimum number of digits; 0-fill on left if needed
+ * type		a character in [duoxX]; u acts the same as d
+ *
+ * CAUTION:  o, x and X conversions on regular ints can never
+ * produce a '-' sign, but can for Python's unbounded ints.
+ */
+PyObject*
+_PyString_FormatLong(PyObject *val, int flags, int prec, int type,
+		     char **pbuf, int *plen)
+{
+	PyObject *result = NULL;
+	char *buf;
+	int i;
+	int sign;	/* 1 if '-', else 0 */
+	int len;	/* number of characters */
+	int numdigits;	/* len == numnondigits + numdigits */
+	int numnondigits = 0;
+
+	switch (type) {
+	case 'd':
+	case 'u':
+		result = val->ob_type->tp_str(val);
+		break;
+	case 'o':
+		result = val->ob_type->tp_as_number->nb_oct(val);
+		break;
+	case 'x':
+	case 'X':
+		numnondigits = 2;
+		result = val->ob_type->tp_as_number->nb_hex(val);
+		break;
+	default:
+		assert(!"'type' not in [duoxX]");
+	}
+	if (!result)
+		return NULL;
+
+	/* To modify the string in-place, there can only be one reference. */
+	if (result->ob_refcnt != 1) {
+		PyErr_BadInternalCall();
+		return NULL;
+	}
+	buf = PyString_AsString(result);
+	len = PyString_Size(result);
+	if (buf[len-1] == 'L') {
+		--len;
+		buf[len] = '\0';
+	}
+	sign = buf[0] == '-';
+	numnondigits += sign;
+	numdigits = len - numnondigits;
+	assert(numdigits > 0);
+
+	/* Get rid of base marker unless F_ALT */
+	if ((flags & F_ALT) == 0) {
+		/* Need to skip 0x, 0X or 0. */
+		int skipped = 0;
+		switch (type) {
+		case 'o':
+			assert(buf[sign] == '0');
+			/* If 0 is only digit, leave it alone. */
+			if (numdigits > 1) {
+				skipped = 1;
+				--numdigits;
+			}
+			break;
+		case 'x':
+		case 'X':
+			assert(buf[sign] == '0');
+			assert(buf[sign + 1] == 'x');
+			skipped = 2;
+			numnondigits -= 2;
+			break;
+		}
+		if (skipped) {
+			buf += skipped;
+			len -= skipped;
+			if (sign)
+				buf[0] = '-';
+		}
+		assert(len == numnondigits + numdigits);
+		assert(numdigits > 0);
+	}
+
+	/* Fill with leading zeroes to meet minimum width. */
+	if (prec > numdigits) {
+		PyObject *r1 = PyString_FromStringAndSize(NULL,
+					numnondigits + prec);
+		char *b1;
+		if (!r1) {
+			Py_DECREF(result);
+			return NULL;
+		}
+		b1 = PyString_AS_STRING(r1);
+		for (i = 0; i < numnondigits; ++i)
+			*b1++ = *buf++;
+		for (i = 0; i < prec - numdigits; i++)
+			*b1++ = '0';
+		for (i = 0; i < numdigits; i++)
+			*b1++ = *buf++;
+		*b1 = '\0';
+		Py_DECREF(result);
+		result = r1;
+		buf = PyString_AS_STRING(result);
+		len = numnondigits + prec;
+	}
+
+	/* Fix up case for hex conversions. */
+	switch (type) {
+	case 'x':
+		/* Need to convert all upper case letters to lower case. */
+		for (i = 0; i < len; i++)
+			if (buf[i] >= 'A' && buf[i] <= 'F')
+				buf[i] += 'a'-'A';
+		break;
+	case 'X':
+		/* Need to convert 0x to 0X (and -0x to -0X). */
+		if (buf[sign + 1] == 'x')
+			buf[sign + 1] = 'X';
+		break;
+	}
+	*pbuf = buf;
+	*plen = len;
+	return result;
+}
+
 static int
 formatint(char *buf, size_t buflen, int flags,
          int prec, int type, PyObject *v)
 {
 	/* fmt = '%#.' + `prec` + 'l' + `type`
-	   worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
-	char fmt[20];
+	   worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
+	   + 1 + 1 = 24 */
+	char fmt[64];	/* plenty big enough! */
 	long x;
 	if (!PyArg_Parse(v, "l;int argument required", &x))
 		return -1;
 	if (prec < 0)
 		prec = 1;
 	sprintf(fmt, "%%%s.%dl%c", (flags&F_ALT) ? "#" : "", prec, type);
-	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
+	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec, len(x in octal))
 	   worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
-	if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
+	if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
 		PyErr_SetString(PyExc_OverflowError,
 			"formatted integer is too long (precision too long?)");
 		return -1;
@ -2752,25 +2901,29 @@ PyString_Format(PyObject *format, PyObject *args)
 			case 'X':
 				if (c == 'i')
 					c = 'd';
-				pbuf = formatbuf;
-				len = formatint(pbuf, sizeof(formatbuf), flags, prec, c, v);
-				if (len < 0)
-					goto error;
-				sign = (c == 'd');
-				if (flags&F_ZERO) {
-					fill = '0';
-					if ((flags&F_ALT) &&
-					    (c == 'x' || c == 'X') &&
-					    pbuf[0] == '0' && pbuf[1] == c) {
-						*res++ = *pbuf++;
-						*res++ = *pbuf++;
-						rescnt -= 2;
-						len -= 2;
-						width -= 2;
-						if (width < 0)
-							width = 0;
-					}
+				if (PyLong_Check(v) && PyLong_AsLong(v) == -1
+				    && PyErr_Occurred()) {
+					/* Too big for a C long. */
+					PyErr_Clear();
+					temp = _PyString_FormatLong(v, flags,
+						prec, c, &pbuf, &len);
+					if (!temp)
+						goto error;
+					/* unbounded ints can always produce
+					   a sign character! */
+					sign = 1;
 				}
+				else {
+					pbuf = formatbuf;
+					len = formatint(pbuf, sizeof(formatbuf),
+							flags, prec, c, v);
+					if (len < 0)
+						goto error;
+					/* only d conversion is signed */
+					sign = c == 'd';
+				}
+				if (flags & F_ZERO)
+					fill = '0';
 				break;
 			case 'e':
 			case 'E':
@ -2782,7 +2935,7 @@ PyString_Format(PyObject *format, PyObject *args)
 				if (len < 0)
 					goto error;
 				sign = 1;
-				if (flags&F_ZERO)
+				if (flags & F_ZERO)
 					fill = '0';
 				break;
 			case 'c':
@ -2807,11 +2960,11 @@ PyString_Format(PyObject *format, PyObject *args)
 				else if (flags & F_BLANK)
 					sign = ' ';
 				else
-					sign = '\0';
+					sign = 0;
 			}
 			if (width < len)
 				width = len;
-			if (rescnt < width + (sign != '\0')) {
+			if (rescnt < width + (sign != 0)) {
 				reslen -= rescnt;
 				rescnt = width + fmtcnt + 100;
 				reslen += rescnt;
@ -2827,14 +2980,36 @@ PyString_Format(PyObject *format, PyObject *args)
 				if (width > len)
 					width--;
 			}
-			if (width > len && !(flags&F_LJUST)) {
+			if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
+				assert(pbuf[0] == '0');
+				assert(pbuf[1] == c);
+				if (fill != ' ') {
+					*res++ = *pbuf++;
+					*res++ = *pbuf++;
+				}
+				rescnt -= 2;
+				width -= 2;
+				if (width < 0)
+					width = 0;
+				len -= 2;
+			}
+			if (width > len && !(flags & F_LJUST)) {
 				do {
 					--rescnt;
 					*res++ = fill;
 				} while (--width > len);
 			}
-			if (sign && fill == ' ')
-				*res++ = sign;
+			if (fill == ' ') {
+				if (sign)
+					*res++ = sign;
+				if ((flags & F_ALT) &&
+				    (c == 'x' || c == 'X')) {
+					assert(pbuf[0] == '0');
+					assert(pbuf[1] == c);
+					*res++ = *pbuf++;
+					*res++ = *pbuf++;
+				}
+			}
 			memcpy(res, pbuf, len);
 			res += len;
 			rescnt -= len;
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -4668,6 +4668,25 @@ formatfloat(Py_UNICODE *buf,
    return usprintf(buf, fmt, x);
 }

+static PyObject*
+formatlong(PyObject *val, int flags, int prec, int type)
+{
+	char *buf;
+	int i, len;
+	PyObject *str; /* temporary string object. */
+	PyUnicodeObject *result;
+
+	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
+	if (!str)
+		return NULL;
+	result = _PyUnicode_New(len);
+	for (i = 0; i < len; i++)
+		result->str[i] = buf[i];
+	result->str[len] = 0;
+	Py_DECREF(str);
+	return (PyObject*)result;
+}
+
 static int
 formatint(Py_UNICODE *buf,
 	  size_t buflen,
@ -4677,8 +4696,9 @@ formatint(Py_UNICODE *buf,
 	  PyObject *v)
 {
    /* fmt = '%#.' + `prec` + 'l' + `type`
-       worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
-    char fmt[20];
+       worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
+       + 1 + 1 = 24*/
+    char fmt[64]; /* plenty big enough! */
    long x;

    x = PyInt_AsLong(v);
@ -5006,26 +5026,29 @@ PyObject *PyUnicode_Format(PyObject *format,
 	    case 'X':
 		if (c == 'i')
 		    c = 'd';
-		pbuf = formatbuf;
-		len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
-			flags, prec, c, v);
-		if (len < 0)
-		    goto onError;
-		sign = (c == 'd');
-		if (flags & F_ZERO) {
-		    fill = '0';
-		    if ((flags&F_ALT) &&
-			(c == 'x' || c == 'X') &&
-			pbuf[0] == '0' && pbuf[1] == c) {
-			*res++ = *pbuf++;
-			*res++ = *pbuf++;
-			rescnt -= 2;
-			len -= 2;
-			width -= 2;
-			if (width < 0)
-			    width = 0;
-		    }
+		if (PyLong_Check(v) && PyLong_AsLong(v) == -1
+		    && PyErr_Occurred()) {
+		    PyErr_Clear();
+		    temp = formatlong(v, flags, prec, c);
+		    if (!temp)
+			goto onError;
+		    pbuf = PyUnicode_AS_UNICODE(temp);
+		    len = PyUnicode_GET_SIZE(temp);
+		    /* unbounded ints can always produce
+		       a sign character! */
+		    sign = 1;
 		}
+		else {
+		    pbuf = formatbuf;
+		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
+				    flags, prec, c, v);
+		    if (len < 0)
+			goto onError;
+		    /* only d conversion is signed */
+		    sign = c == 'd';
+		}
+		if (flags & F_ZERO)
+		    fill = '0';
 		break;

 	    case 'e':
@ -5039,7 +5062,7 @@ PyObject *PyUnicode_Format(PyObject *format,
 		if (len < 0)
 		    goto onError;
 		sign = 1;
-		if (flags&F_ZERO)
+		if (flags & F_ZERO)
 		    fill = '0';
 		break;

@ -5086,14 +5109,35 @@ PyObject *PyUnicode_Format(PyObject *format,
 		if (width > len)
 		    width--;
 	    }
+	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
+		assert(pbuf[0] == '0');
+		assert(pbuf[1] == c);
+		if (fill != ' ') {
+		    *res++ = *pbuf++;
+		    *res++ = *pbuf++;
+		}
+		rescnt -= 2;
+		width -= 2;
+		if (width < 0)
+		    width = 0;
+		len -= 2;
+	    }
 	    if (width > len && !(flags & F_LJUST)) {
 		do {
 		    --rescnt;
 		    *res++ = fill;
 		} while (--width > len);
 	    }
-	    if (sign && fill == ' ')
-		*res++ = sign;
+	    if (fill == ' ') {
+		if (sign)
+		    *res++ = sign;
+		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
+		    assert(pbuf[0] == '0');
+		    assert(pbuf[1] == c);
+		    *res++ = *pbuf++;
+		    *res++ = *pbuf++;
+		}
+	    }
 	    memcpy(res, pbuf, len * sizeof(Py_UNICODE));
 	    res += len;
 	    rescnt -= len;