Fix massive slowdown in string formatting with the % operator

This commit is contained in:
Antoine Pitrou 2011-10-07 01:54:09 +02:00
parent 438818b4b0
commit 5c0ba36d5f

View file

@@ -12693,17 +12693,13 @@ formatlong(PyObject *val, int flags, int prec, int type)
return result; return result;
} }
static int static Py_UCS4
formatchar(Py_UCS4 *buf, formatchar(PyObject *v)
size_t buflen,
PyObject *v)
{ {
/* presume that the buffer is at least 3 characters long */ /* presume that the buffer is at least 3 characters long */
if (PyUnicode_Check(v)) { if (PyUnicode_Check(v)) {
if (PyUnicode_GET_LENGTH(v) == 1) { if (PyUnicode_GET_LENGTH(v) == 1) {
buf[0] = PyUnicode_READ_CHAR(v, 0); return PyUnicode_READ_CHAR(v, 0);
buf[1] = '\0';
return 1;
} }
goto onError; goto onError;
} }
@@ -12717,38 +12713,45 @@ formatchar(Py_UCS4 *buf,
if (x < 0 || x > 0x10ffff) { if (x < 0 || x > 0x10ffff) {
PyErr_SetString(PyExc_OverflowError, PyErr_SetString(PyExc_OverflowError,
"%c arg not in range(0x110000)"); "%c arg not in range(0x110000)");
return -1; return (Py_UCS4) -1;
} }
buf[0] = (Py_UCS4) x; return (Py_UCS4) x;
buf[1] = '\0';
return 1;
} }
onError: onError:
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
"%c requires int or char"); "%c requires int or char");
return -1; return (Py_UCS4) -1;
} }
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
FORMATBUFLEN is the length of the buffer in which chars are formatted.
*/
#define FORMATBUFLEN (size_t)10
PyObject * PyObject *
PyUnicode_Format(PyObject *format, PyObject *args) PyUnicode_Format(PyObject *format, PyObject *args)
{ {
void *fmt; void *fmt;
int fmtkind; int fmtkind;
PyObject *result; PyObject *result;
Py_UCS4 *res, *res0;
Py_UCS4 max;
int kind; int kind;
Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx; int r;
Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
int args_owned = 0; int args_owned = 0;
PyObject *dict = NULL; PyObject *dict = NULL;
PyObject *temp = NULL;
PyObject *second = NULL;
PyUnicodeObject *uformat; PyUnicodeObject *uformat;
_PyAccu acc;
static PyObject *plus, *minus, *blank, *zero, *percent;
if (!plus && !(plus = get_latin1_char('+')))
return NULL;
if (!minus && !(minus = get_latin1_char('-')))
return NULL;
if (!blank && !(blank = get_latin1_char(' ')))
return NULL;
if (!zero && !(zero = get_latin1_char('0')))
return NULL;
if (!percent && !(percent = get_latin1_char('%')))
return NULL;
if (format == NULL || args == NULL) { if (format == NULL || args == NULL) {
PyErr_BadInternalCall(); PyErr_BadInternalCall();
@@ -12757,18 +12760,13 @@ PyUnicode_Format(PyObject *format, PyObject *args)
uformat = (PyUnicodeObject*)PyUnicode_FromObject(format); uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
if (uformat == NULL || PyUnicode_READY(uformat) == -1) if (uformat == NULL || PyUnicode_READY(uformat) == -1)
return NULL; return NULL;
if (_PyAccu_Init(&acc))
goto onError;
fmt = PyUnicode_DATA(uformat); fmt = PyUnicode_DATA(uformat);
fmtkind = PyUnicode_KIND(uformat); fmtkind = PyUnicode_KIND(uformat);
fmtcnt = PyUnicode_GET_LENGTH(uformat); fmtcnt = PyUnicode_GET_LENGTH(uformat);
fmtpos = 0; fmtpos = 0;
reslen = rescnt = fmtcnt + 100;
res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
if (res0 == NULL) {
PyErr_NoMemory();
goto onError;
}
if (PyTuple_Check(args)) { if (PyTuple_Check(args)) {
arglen = PyTuple_Size(args); arglen = PyTuple_Size(args);
argidx = 0; argidx = 0;
@@ -12783,18 +12781,21 @@ PyUnicode_Format(PyObject *format, PyObject *args)
while (--fmtcnt >= 0) { while (--fmtcnt >= 0) {
if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
if (--rescnt < 0) { PyObject *nonfmt;
rescnt = fmtcnt + 100; Py_ssize_t nonfmtpos;
reslen += rescnt; nonfmtpos = fmtpos++;
res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); while (fmtcnt >= 0 &&
if (res0 == NULL){ PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
PyErr_NoMemory(); fmtpos++;
goto onError; fmtcnt--;
}
res = res0 + reslen - rescnt;
--rescnt;
} }
*res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++); nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
if (nonfmt == NULL)
goto onError;
r = _PyAccu_Accumulate(&acc, nonfmt);
Py_DECREF(nonfmt);
if (r)
goto onError;
} }
else { else {
/* Got a format specifier */ /* Got a format specifier */
@@ -12802,15 +12803,12 @@ PyUnicode_Format(PyObject *format, PyObject *args)
Py_ssize_t width = -1; Py_ssize_t width = -1;
int prec = -1; int prec = -1;
Py_UCS4 c = '\0'; Py_UCS4 c = '\0';
Py_UCS4 fill; Py_UCS4 fill, sign;
int isnumok; int isnumok;
PyObject *v = NULL; PyObject *v = NULL;
PyObject *temp = NULL; void *pbuf = NULL;
void *pbuf; Py_ssize_t pindex, len;
Py_ssize_t pindex; PyObject *signobj = NULL, *fillobj = NULL;
Py_UNICODE sign;
Py_ssize_t len, len1;
Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
fmtpos++; fmtpos++;
if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
@@ -12955,15 +12953,12 @@ PyUnicode_Format(PyObject *format, PyObject *args)
} }
sign = 0; sign = 0;
fill = ' '; fill = ' ';
fillobj = blank;
switch (c) { switch (c) {
case '%': case '%':
pbuf = formatbuf; _PyAccu_Accumulate(&acc, percent);
kind = PyUnicode_4BYTE_KIND; continue;
/* presume that buffer length is at least 1 */
PyUnicode_WRITE(kind, pbuf, 0, '%');
len = 1;
break;
case 's': case 's':
case 'r': case 'r':
@@ -13045,8 +13040,10 @@ PyUnicode_Format(PyObject *format, PyObject *args)
"not %.200s", (char)c, Py_TYPE(v)->tp_name); "not %.200s", (char)c, Py_TYPE(v)->tp_name);
goto onError; goto onError;
} }
if (flags & F_ZERO) if (flags & F_ZERO) {
fill = '0'; fill = '0';
fillobj = zero;
}
break; break;
case 'e': case 'e':
@@ -13066,17 +13063,25 @@ PyUnicode_Format(PyObject *format, PyObject *args)
kind = PyUnicode_KIND(temp); kind = PyUnicode_KIND(temp);
len = PyUnicode_GET_LENGTH(temp); len = PyUnicode_GET_LENGTH(temp);
sign = 1; sign = 1;
if (flags & F_ZERO) if (flags & F_ZERO) {
fill = '0'; fill = '0';
fillobj = zero;
}
break; break;
case 'c': case 'c':
pbuf = formatbuf; {
kind = PyUnicode_4BYTE_KIND; Py_UCS4 ch = formatchar(v);
len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v); if (ch == (Py_UCS4) -1)
if (len < 0)
goto onError; goto onError;
temp = _PyUnicode_FromUCS4(&ch, 1);
if (temp == NULL)
goto onError;
pbuf = PyUnicode_DATA(temp);
kind = PyUnicode_KIND(temp);
len = PyUnicode_GET_LENGTH(temp);
break; break;
}
default: default:
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
@@ -13090,90 +13095,105 @@ PyUnicode_Format(PyObject *format, PyObject *args)
/* pbuf is initialized here. */ /* pbuf is initialized here. */
pindex = 0; pindex = 0;
if (sign) { if (sign) {
if (PyUnicode_READ(kind, pbuf, pindex) == '-' || if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
PyUnicode_READ(kind, pbuf, pindex) == '+') { signobj = minus;
sign = PyUnicode_READ(kind, pbuf, pindex++);
len--; len--;
pindex++;
}
else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
signobj = plus;
len--;
pindex++;
} }
else if (flags & F_SIGN) else if (flags & F_SIGN)
sign = '+'; signobj = plus;
else if (flags & F_BLANK) else if (flags & F_BLANK)
sign = ' '; signobj = blank;
else else
sign = 0; sign = 0;
} }
if (width < len) if (width < len)
width = len; width = len;
if (rescnt - (sign != 0) < width) {
reslen -= rescnt;
rescnt = width + fmtcnt + 100;
reslen += rescnt;
if (reslen < 0) {
Py_XDECREF(temp);
PyErr_NoMemory();
goto onError;
}
res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
if (res0 == 0) {
PyErr_NoMemory();
Py_XDECREF(temp);
goto onError;
}
res = res0 + reslen - rescnt;
}
if (sign) { if (sign) {
if (fill != ' ') if (fill != ' ') {
*res++ = sign; assert(signobj != NULL);
rescnt--; if (_PyAccu_Accumulate(&acc, signobj))
goto onError;
}
if (width > len) if (width > len)
width--; width--;
} }
if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
if (fill != ' ') { if (fill != ' ') {
*res++ = PyUnicode_READ(kind, pbuf, pindex++); second = get_latin1_char(
*res++ = PyUnicode_READ(kind, pbuf, pindex++); PyUnicode_READ(kind, pbuf, pindex + 1));
pindex += 2;
if (second == NULL ||
_PyAccu_Accumulate(&acc, zero) ||
_PyAccu_Accumulate(&acc, second))
goto onError;
Py_CLEAR(second);
} }
rescnt -= 2;
width -= 2; width -= 2;
if (width < 0) if (width < 0)
width = 0; width = 0;
len -= 2; len -= 2;
} }
if (width > len && !(flags & F_LJUST)) { if (width > len && !(flags & F_LJUST)) {
assert(fillobj != NULL);
do { do {
--rescnt; if (_PyAccu_Accumulate(&acc, fillobj))
*res++ = fill; goto onError;
} while (--width > len); } while (--width > len);
} }
if (fill == ' ') { if (fill == ' ') {
if (sign) if (sign) {
*res++ = sign; assert(signobj != NULL);
if (_PyAccu_Accumulate(&acc, signobj))
goto onError;
}
if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
*res++ = PyUnicode_READ(kind, pbuf, pindex++); second = get_latin1_char(
*res++ = PyUnicode_READ(kind, pbuf, pindex++); PyUnicode_READ(kind, pbuf, pindex + 1));
pindex += 2;
if (second == NULL ||
_PyAccu_Accumulate(&acc, zero) ||
_PyAccu_Accumulate(&acc, second))
goto onError;
Py_CLEAR(second);
} }
} }
/* Copy all characters, preserving len */ /* Copy all characters, preserving len */
len1 = len; if (temp != NULL) {
while (len1--) { assert(pbuf == PyUnicode_DATA(temp));
*res++ = PyUnicode_READ(kind, pbuf, pindex++); v = PyUnicode_Substring(temp, pindex, pindex + len);
rescnt--;
} }
else {
const char *p = (const char *) pbuf;
assert(pbuf != NULL);
p = p + PyUnicode_KIND_SIZE(kind, pindex);
v = PyUnicode_FromKindAndData(kind, p, len);
}
if (v == NULL)
goto onError;
r = _PyAccu_Accumulate(&acc, v);
Py_DECREF(v);
if (r)
goto onError;
while (--width >= len) { while (--width >= len) {
--rescnt; if (_PyAccu_Accumulate(&acc, blank))
*res++ = ' '; goto onError;
} }
if (dict && (argidx < arglen) && c != '%') { if (dict && (argidx < arglen) && c != '%') {
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
"not all arguments converted during string formatting"); "not all arguments converted during string formatting");
Py_XDECREF(temp);
goto onError; goto onError;
} }
Py_XDECREF(temp); Py_CLEAR(temp);
} /* '%' */ } /* '%' */
} /* until end */ } /* until end */
if (argidx < arglen && !dict) { if (argidx < arglen && !dict) {
@@ -13182,27 +13202,20 @@ PyUnicode_Format(PyObject *format, PyObject *args)
goto onError; goto onError;
} }
result = _PyAccu_Finish(&acc);
for (max=0, res = res0; res < res0+reslen-rescnt; res++)
if (*res > max)
max = *res;
result = PyUnicode_New(reslen - rescnt, max);
if (!result)
goto onError;
kind = PyUnicode_KIND(result);
for (res = res0; res < res0+reslen-rescnt; res++)
PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
PyMem_Free(res0);
if (args_owned) { if (args_owned) {
Py_DECREF(args); Py_DECREF(args);
} }
Py_DECREF(uformat); Py_DECREF(uformat);
assert(_PyUnicode_CheckConsistency(result, 1)); Py_XDECREF(temp);
Py_XDECREF(second);
return (PyObject *)result; return (PyObject *)result;
onError: onError:
PyMem_Free(res0);
Py_DECREF(uformat); Py_DECREF(uformat);
Py_XDECREF(temp);
Py_XDECREF(second);
_PyAccu_Destroy(&acc);
if (args_owned) { if (args_owned) {
Py_DECREF(args); Py_DECREF(args);
} }