Issue #21118: Use _PyUnicodeWriter API in str.translate() to simplify and

factorize the code
This commit is contained in:
Victor Stinner 2014-04-04 19:37:40 +02:00
parent d129eeb303
commit 1194ea020c

View file

@ -8495,76 +8495,54 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
return -1; return -1;
} }
} }
/* ensure that *outobj is at least requiredsize characters long,
if not reallocate and adjust various state variables. /* lookup the character, write the result into the writer.
Return 0 on success, -1 on error */ Return 1 if the result was written into the writer, return 0 if the mapping
was undefined, raise an exception and return -1 on error. */
static int static int
charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
Py_ssize_t requiredsize) _PyUnicodeWriter *writer)
{ {
Py_ssize_t oldsize = *psize; PyObject *item;
Py_UCS4 *new_outobj;
if (requiredsize > oldsize) { if (charmaptranslate_lookup(ch, mapping, &item))
/* exponentially overallocate to minimize reallocations */
if (requiredsize < 2 * oldsize)
requiredsize = 2 * oldsize;
new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
if (new_outobj == 0)
return -1;
*outobj = new_outobj;
*psize = requiredsize;
}
return 0;
}
/* lookup the character, put the result in the output string and adjust
various state variables. Return a new reference to the object that
was put in the output buffer in *result, or Py_None, if the mapping was
undefined (in which case no character was written).
The caller must decref result.
Return 0 on success, -1 on error. */
static int
charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
PyObject *mapping, Py_UCS4 **output,
Py_ssize_t *osize, Py_ssize_t *opos,
PyObject **res)
{
Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
if (charmaptranslate_lookup(curinp, mapping, res))
return -1; return -1;
if (*res==NULL) {
if (item == NULL) {
/* not found => default to 1:1 mapping */ /* not found => default to 1:1 mapping */
(*output)[(*opos)++] = curinp; if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
}
else if (*res==Py_None)
;
else if (PyLong_Check(*res)) {
/* no overflow check, because we know that the space is enough */
(*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
}
else if (PyUnicode_Check(*res)) {
Py_ssize_t repsize;
if (PyUnicode_READY(*res) == -1)
return -1; return -1;
repsize = PyUnicode_GET_LENGTH(*res);
if (repsize==1) {
/* no overflow check, because we know that the space is enough */
(*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
}
else if (repsize!=0) {
/* more than one character */
Py_ssize_t requiredsize = *opos +
(PyUnicode_GET_LENGTH(input) - ipos) +
repsize - 1;
Py_ssize_t i;
if (charmaptranslate_makespace(output, osize, requiredsize))
return -1;
for(i = 0; i < repsize; i++)
(*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
} }
return 1;
} }
else
if (item == Py_None) {
Py_DECREF(item);
return 0;
}
if (PyLong_Check(item)) {
Py_UCS4 ch = (Py_UCS4)PyLong_AS_LONG(item);
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Py_DECREF(item);
return -1;
}
Py_DECREF(item);
return 1;
}
if (!PyUnicode_Check(item)) {
Py_DECREF(item);
return -1; return -1;
return 0; }
if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
Py_DECREF(item);
return -1;
}
Py_DECREF(item);
return 1;
} }
PyObject * PyObject *
@ -8573,22 +8551,16 @@ _PyUnicode_TranslateCharmap(PyObject *input,
const char *errors) const char *errors)
{ {
/* input object */ /* input object */
char *idata; char *data;
Py_ssize_t size, i; Py_ssize_t size, i;
int kind; int kind;
/* output buffer */ /* output buffer */
Py_UCS4 *output = NULL; _PyUnicodeWriter writer;
Py_ssize_t osize; /* error handler */
PyObject *res;
/* current output position */
Py_ssize_t opos;
char *reason = "character maps to <undefined>"; char *reason = "character maps to <undefined>";
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
/* the following variable is used for caching string comparisons int ignore;
* -1=not initialized, 0=unknown, 1=strict, 2=replace,
* 3=ignore, 4=xmlcharrefreplace */
int known_errorHandler = -1;
if (mapping == NULL) { if (mapping == NULL) {
PyErr_BadArgument(); PyErr_BadArgument();
@ -8597,10 +8569,9 @@ _PyUnicode_TranslateCharmap(PyObject *input,
if (PyUnicode_READY(input) == -1) if (PyUnicode_READY(input) == -1)
return NULL; return NULL;
idata = (char*)PyUnicode_DATA(input); data = (char*)PyUnicode_DATA(input);
kind = PyUnicode_KIND(input); kind = PyUnicode_KIND(input);
size = PyUnicode_GET_LENGTH(input); size = PyUnicode_GET_LENGTH(input);
i = 0;
if (size == 0) { if (size == 0) {
Py_INCREF(input); Py_INCREF(input);
@ -8609,121 +8580,74 @@ _PyUnicode_TranslateCharmap(PyObject *input,
/* allocate enough for a simple 1:1 translation without /* allocate enough for a simple 1:1 translation without
replacements, if we need more, we'll resize */ replacements, if we need more, we'll resize */
osize = size; _PyUnicodeWriter_Init(&writer);
output = PyMem_Malloc(osize * sizeof(Py_UCS4)); if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
opos = 0;
if (output == NULL) {
PyErr_NoMemory();
goto onError; goto onError;
}
ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
i = 0;
while (i<size) { while (i<size) {
/* try to encode it */ /* try to encode it */
PyObject *x = NULL; int translate;
if (charmaptranslate_output(input, i, mapping, PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
&output, &osize, &opos, &x)) { Py_ssize_t newpos;
Py_XDECREF(x); /* startpos for collecting untranslatable chars */
goto onError; Py_ssize_t collstart;
} Py_ssize_t collend;
Py_XDECREF(x); Py_ssize_t coll;
if (x!=Py_None) /* it worked => adjust input pointer */ Py_UCS4 ch;
++i;
else { /* untranslatable character */
PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Py_ssize_t repsize;
Py_ssize_t newpos;
Py_ssize_t uni2;
/* startpos for collecting untranslatable chars */
Py_ssize_t collstart = i;
Py_ssize_t collend = i+1;
Py_ssize_t coll;
/* find all untranslatable characters */ ch = PyUnicode_READ(kind, data, i);
while (collend < size) { translate = charmaptranslate_output(ch, mapping, &writer);
if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) if (translate < 0)
goto onError; goto onError;
Py_XDECREF(x);
if (x!=Py_None) if (translate != 0) {
break; /* it worked => adjust input pointer */
++collend; ++i;
} continue;
/* cache callback name lookup }
* (if not done yet, i.e. it's the first error) */
if (known_errorHandler==-1) { /* untranslatable character */
if ((errors==NULL) || (!strcmp(errors, "strict"))) collstart = i;
known_errorHandler = 1; collend = i+1;
else if (!strcmp(errors, "replace"))
known_errorHandler = 2; /* find all untranslatable characters */
else if (!strcmp(errors, "ignore")) while (collend < size) {
known_errorHandler = 3; PyObject *x;
else if (!strcmp(errors, "xmlcharrefreplace")) ch = PyUnicode_READ(kind, data, collend);
known_errorHandler = 4; if (charmaptranslate_lookup(ch, mapping, &x))
else
known_errorHandler = 0;
}
switch (known_errorHandler) {
case 1: /* strict */
make_translate_exception(&exc,
input, collstart, collend, reason);
if (exc != NULL)
PyCodec_StrictErrors(exc);
goto onError; goto onError;
case 2: /* replace */ Py_XDECREF(x);
/* No need to check for space, this is a 1:1 replacement */ if (x != Py_None)
for (coll = collstart; coll<collend; coll++)
output[opos++] = '?';
/* fall through */
case 3: /* ignore */
i = collend;
break; break;
case 4: /* xmlcharrefreplace */ ++collend;
/* generate replacement (temporarily (mis)uses i) */ }
for (i = collstart; i < collend; ++i) {
char buffer[2+29+1+1]; if (ignore) {
char *cp; i = collend;
sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); }
if (charmaptranslate_makespace(&output, &osize, else {
opos+strlen(buffer)+(size-collend))) repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
goto onError; reason, input, &exc,
for (cp = buffer; *cp; ++cp) collstart, collend, &newpos);
output[opos++] = *cp; if (repunicode == NULL)
} goto onError;
i = collend; if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
break;
default:
repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
reason, input, &exc,
collstart, collend, &newpos);
if (repunicode == NULL)
goto onError;
if (PyUnicode_READY(repunicode) == -1) {
Py_DECREF(repunicode);
goto onError;
}
/* generate replacement */
repsize = PyUnicode_GET_LENGTH(repunicode);
if (charmaptranslate_makespace(&output, &osize,
opos+repsize+(size-collend))) {
Py_DECREF(repunicode);
goto onError;
}
for (uni2 = 0; repsize-->0; ++uni2)
output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
i = newpos;
Py_DECREF(repunicode); Py_DECREF(repunicode);
goto onError;
} }
Py_DECREF(repunicode);
i = newpos;
} }
} }
res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
if (!res)
goto onError;
PyMem_Free(output);
Py_XDECREF(exc); Py_XDECREF(exc);
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
return res; return _PyUnicodeWriter_Finish(&writer);
onError: onError:
PyMem_Free(output); _PyUnicodeWriter_Dealloc(&writer);
Py_XDECREF(exc); Py_XDECREF(exc);
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
return NULL; return NULL;