mirror of
https://github.com/python/cpython.git
synced 2025-07-16 07:45:20 +00:00
Close #14716: str.format() now uses the new "unicode writer" API instead of the
PyAccu API. For example, this makes str.format() 25% to 30% faster on Linux.
This commit is contained in:
parent
9fad160411
commit
202fdca133
2 changed files with 148 additions and 170 deletions
|
@ -2,8 +2,6 @@
|
||||||
unicode_format.h -- implementation of str.format().
|
unicode_format.h -- implementation of str.format().
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "accu.h"
|
|
||||||
|
|
||||||
/* Defines for more efficiently reallocating the string buffer */
|
/* Defines for more efficiently reallocating the string buffer */
|
||||||
#define INITIAL_SIZE_INCREMENT 100
|
#define INITIAL_SIZE_INCREMENT 100
|
||||||
#define SIZE_MULTIPLIER 2
|
#define SIZE_MULTIPLIER 2
|
||||||
|
@ -111,33 +109,6 @@ autonumber_state_error(AutoNumberState state, int field_name_is_empty)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/************************************************************************/
|
|
||||||
/*********** Output string management functions ****************/
|
|
||||||
/************************************************************************/
|
|
||||||
|
|
||||||
/*
|
|
||||||
output_data dumps characters into our output string
|
|
||||||
buffer.
|
|
||||||
|
|
||||||
In some cases, it has to reallocate the string.
|
|
||||||
|
|
||||||
It returns a status: 0 for a failed reallocation,
|
|
||||||
1 for success.
|
|
||||||
*/
|
|
||||||
static int
|
|
||||||
output_data(_PyAccu *acc, PyObject *s, Py_ssize_t start, Py_ssize_t end)
|
|
||||||
{
|
|
||||||
PyObject *substring;
|
|
||||||
int r;
|
|
||||||
|
|
||||||
substring = PyUnicode_Substring(s, start, end);
|
|
||||||
if (substring == NULL)
|
|
||||||
return 0;
|
|
||||||
r = _PyAccu_Accumulate(acc, substring);
|
|
||||||
Py_DECREF(substring);
|
|
||||||
return r == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/************************************************************************/
|
/************************************************************************/
|
||||||
/*********** Format string parsing -- integers and identifiers *********/
|
/*********** Format string parsing -- integers and identifiers *********/
|
||||||
/************************************************************************/
|
/************************************************************************/
|
||||||
|
@ -523,7 +494,7 @@ error:
|
||||||
appends to the output.
|
appends to the output.
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
render_field(PyObject *fieldobj, SubString *format_spec, _PyAccu *acc)
|
render_field(PyObject *fieldobj, SubString *format_spec, unicode_writer_t *writer)
|
||||||
{
|
{
|
||||||
int ok = 0;
|
int ok = 0;
|
||||||
PyObject *result = NULL;
|
PyObject *result = NULL;
|
||||||
|
@ -566,7 +537,8 @@ render_field(PyObject *fieldobj, SubString *format_spec, _PyAccu *acc)
|
||||||
goto done;
|
goto done;
|
||||||
|
|
||||||
assert(PyUnicode_Check(result));
|
assert(PyUnicode_Check(result));
|
||||||
ok = output_data(acc, result, 0, PyUnicode_GET_LENGTH(result));
|
|
||||||
|
ok = (unicode_writer_write_str(writer, result, 0, PyUnicode_GET_LENGTH(result)) == 0);
|
||||||
done:
|
done:
|
||||||
Py_XDECREF(format_spec_object);
|
Py_XDECREF(format_spec_object);
|
||||||
Py_XDECREF(result);
|
Py_XDECREF(result);
|
||||||
|
@ -831,7 +803,7 @@ do_conversion(PyObject *obj, Py_UCS4 conversion)
|
||||||
static int
|
static int
|
||||||
output_markup(SubString *field_name, SubString *format_spec,
|
output_markup(SubString *field_name, SubString *format_spec,
|
||||||
int format_spec_needs_expanding, Py_UCS4 conversion,
|
int format_spec_needs_expanding, Py_UCS4 conversion,
|
||||||
_PyAccu *acc, PyObject *args, PyObject *kwargs,
|
unicode_writer_t *writer, PyObject *args, PyObject *kwargs,
|
||||||
int recursion_depth, AutoNumber *auto_number)
|
int recursion_depth, AutoNumber *auto_number)
|
||||||
{
|
{
|
||||||
PyObject *tmp = NULL;
|
PyObject *tmp = NULL;
|
||||||
|
@ -872,7 +844,7 @@ output_markup(SubString *field_name, SubString *format_spec,
|
||||||
else
|
else
|
||||||
actual_format_spec = format_spec;
|
actual_format_spec = format_spec;
|
||||||
|
|
||||||
if (render_field(fieldobj, actual_format_spec, acc) == 0)
|
if (render_field(fieldobj, actual_format_spec, writer) == 0)
|
||||||
goto done;
|
goto done;
|
||||||
|
|
||||||
result = 1;
|
result = 1;
|
||||||
|
@ -892,7 +864,7 @@ done:
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
|
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
|
||||||
_PyAccu *acc, int recursion_depth, AutoNumber *auto_number)
|
unicode_writer_t *writer, int recursion_depth, AutoNumber *auto_number)
|
||||||
{
|
{
|
||||||
MarkupIterator iter;
|
MarkupIterator iter;
|
||||||
int format_spec_needs_expanding;
|
int format_spec_needs_expanding;
|
||||||
|
@ -902,17 +874,21 @@ do_markup(SubString *input, PyObject *args, PyObject *kwargs,
|
||||||
SubString field_name;
|
SubString field_name;
|
||||||
SubString format_spec;
|
SubString format_spec;
|
||||||
Py_UCS4 conversion;
|
Py_UCS4 conversion;
|
||||||
|
int err;
|
||||||
|
|
||||||
MarkupIterator_init(&iter, input->str, input->start, input->end);
|
MarkupIterator_init(&iter, input->str, input->start, input->end);
|
||||||
while ((result = MarkupIterator_next(&iter, &literal, &field_present,
|
while ((result = MarkupIterator_next(&iter, &literal, &field_present,
|
||||||
&field_name, &format_spec,
|
&field_name, &format_spec,
|
||||||
&conversion,
|
&conversion,
|
||||||
&format_spec_needs_expanding)) == 2) {
|
&format_spec_needs_expanding)) == 2) {
|
||||||
if (!output_data(acc, literal.str, literal.start, literal.end))
|
err = unicode_writer_write_str(writer,
|
||||||
|
literal.str, literal.start,
|
||||||
|
literal.end - literal.start);
|
||||||
|
if (err == -1)
|
||||||
return 0;
|
return 0;
|
||||||
if (field_present)
|
if (field_present)
|
||||||
if (!output_markup(&field_name, &format_spec,
|
if (!output_markup(&field_name, &format_spec,
|
||||||
format_spec_needs_expanding, conversion, acc,
|
format_spec_needs_expanding, conversion, writer,
|
||||||
args, kwargs, recursion_depth, auto_number))
|
args, kwargs, recursion_depth, auto_number))
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -928,7 +904,8 @@ static PyObject *
|
||||||
build_string(SubString *input, PyObject *args, PyObject *kwargs,
|
build_string(SubString *input, PyObject *args, PyObject *kwargs,
|
||||||
int recursion_depth, AutoNumber *auto_number)
|
int recursion_depth, AutoNumber *auto_number)
|
||||||
{
|
{
|
||||||
_PyAccu acc;
|
unicode_writer_t writer;
|
||||||
|
Py_ssize_t initlen;
|
||||||
|
|
||||||
/* check the recursion level */
|
/* check the recursion level */
|
||||||
if (recursion_depth <= 0) {
|
if (recursion_depth <= 0) {
|
||||||
|
@ -937,16 +914,17 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (_PyAccu_Init(&acc))
|
initlen = PyUnicode_GET_LENGTH(input->str) + 100;
|
||||||
|
if (unicode_writer_init(&writer, initlen, 127) == -1)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
if (!do_markup(input, args, kwargs, &acc, recursion_depth,
|
if (!do_markup(input, args, kwargs, &writer, recursion_depth,
|
||||||
auto_number)) {
|
auto_number)) {
|
||||||
_PyAccu_Destroy(&acc);
|
unicode_writer_dealloc(&writer);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
return _PyAccu_Finish(&acc);
|
return unicode_writer_finish(&writer);
|
||||||
}
|
}
|
||||||
|
|
||||||
/************************************************************************/
|
/************************************************************************/
|
||||||
|
|
|
@ -13200,6 +13200,135 @@ unicode_endswith(PyObject *self,
|
||||||
return PyBool_FromLong(result);
|
return PyBool_FromLong(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
PyObject *buffer;
|
||||||
|
void *data;
|
||||||
|
enum PyUnicode_Kind kind;
|
||||||
|
Py_UCS4 maxchar;
|
||||||
|
Py_ssize_t pos;
|
||||||
|
} unicode_writer_t;
|
||||||
|
|
||||||
|
Py_LOCAL_INLINE(void)
|
||||||
|
unicode_writer_update(unicode_writer_t *writer)
|
||||||
|
{
|
||||||
|
writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
|
||||||
|
writer->data = PyUnicode_DATA(writer->buffer);
|
||||||
|
writer->kind = PyUnicode_KIND(writer->buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
Py_LOCAL(int)
|
||||||
|
unicode_writer_init(unicode_writer_t *writer,
|
||||||
|
Py_ssize_t length, Py_UCS4 maxchar)
|
||||||
|
{
|
||||||
|
writer->pos = 0;
|
||||||
|
writer->buffer = PyUnicode_New(length, maxchar);
|
||||||
|
if (writer->buffer == NULL)
|
||||||
|
return -1;
|
||||||
|
unicode_writer_update(writer);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
Py_LOCAL_INLINE(int)
|
||||||
|
unicode_writer_prepare(unicode_writer_t *writer,
|
||||||
|
Py_ssize_t length, Py_UCS4 maxchar)
|
||||||
|
{
|
||||||
|
Py_ssize_t newlen;
|
||||||
|
PyObject *newbuffer;
|
||||||
|
|
||||||
|
if (length > PY_SSIZE_T_MAX - writer->pos) {
|
||||||
|
PyErr_NoMemory();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
newlen = writer->pos + length;
|
||||||
|
|
||||||
|
if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) {
|
||||||
|
/* overallocate 25% to limit the number of resize */
|
||||||
|
if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
|
||||||
|
newlen += newlen / 4;
|
||||||
|
|
||||||
|
if (maxchar > writer->maxchar) {
|
||||||
|
/* resize + widen */
|
||||||
|
newbuffer = PyUnicode_New(newlen, maxchar);
|
||||||
|
if (newbuffer == NULL)
|
||||||
|
return -1;
|
||||||
|
PyUnicode_CopyCharacters(newbuffer, 0,
|
||||||
|
writer->buffer, 0, writer->pos);
|
||||||
|
Py_DECREF(writer->buffer);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
newbuffer = resize_compact(writer->buffer, newlen);
|
||||||
|
if (newbuffer == NULL)
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
writer->buffer = newbuffer;
|
||||||
|
unicode_writer_update(writer);
|
||||||
|
}
|
||||||
|
else if (maxchar > writer->maxchar) {
|
||||||
|
if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
|
||||||
|
return -1;
|
||||||
|
unicode_writer_update(writer);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
Py_LOCAL_INLINE(int)
|
||||||
|
unicode_writer_write_str(
|
||||||
|
unicode_writer_t *writer,
|
||||||
|
PyObject *str, Py_ssize_t start, Py_ssize_t length)
|
||||||
|
{
|
||||||
|
Py_UCS4 maxchar;
|
||||||
|
|
||||||
|
assert(str != NULL);
|
||||||
|
assert(PyUnicode_Check(str));
|
||||||
|
if (PyUnicode_READY(str) == -1)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
assert(0 <= start);
|
||||||
|
assert(0 <= length);
|
||||||
|
assert(start + length <= PyUnicode_GET_LENGTH(str));
|
||||||
|
if (length == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
maxchar = _PyUnicode_FindMaxChar(str, start, start + length);
|
||||||
|
if (unicode_writer_prepare(writer, length, maxchar) == -1)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
assert((writer->pos + length) <= PyUnicode_GET_LENGTH(writer->buffer));
|
||||||
|
copy_characters(writer->buffer, writer->pos,
|
||||||
|
str, start, length);
|
||||||
|
writer->pos += length;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
Py_LOCAL_INLINE(int)
|
||||||
|
unicode_writer_write_char(
|
||||||
|
unicode_writer_t *writer,
|
||||||
|
Py_UCS4 ch)
|
||||||
|
{
|
||||||
|
if (unicode_writer_prepare(writer, 1, ch) == -1)
|
||||||
|
return -1;
|
||||||
|
assert((writer->pos + 1) <= PyUnicode_GET_LENGTH(writer->buffer));
|
||||||
|
PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
|
||||||
|
writer->pos += 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Finish building: resize the buffer to the number of characters written
   (writer->pos) and return it.

   On success the buffer object stored in writer->buffer is returned.  If
   PyUnicode_Resize() fails, the writer's reference is released,
   writer->buffer is reset to NULL and NULL is returned. */
Py_LOCAL(PyObject *)
unicode_writer_finish(unicode_writer_t *writer)
{
    if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) {
        /* Py_CLEAR rather than Py_DECREF: it tolerates a NULL buffer and
           leaves no dangling pointer behind, so a later
           unicode_writer_dealloc() cannot release the reference twice. */
        Py_CLEAR(writer->buffer);
        return NULL;
    }
    return writer->buffer;
}
|
||||||
|
|
||||||
|
Py_LOCAL(void)
|
||||||
|
unicode_writer_dealloc(unicode_writer_t *writer)
|
||||||
|
{
|
||||||
|
Py_CLEAR(writer->buffer);
|
||||||
|
}
|
||||||
|
|
||||||
#include "stringlib/unicode_format.h"
|
#include "stringlib/unicode_format.h"
|
||||||
|
|
||||||
PyDoc_STRVAR(format__doc__,
|
PyDoc_STRVAR(format__doc__,
|
||||||
|
@ -13649,135 +13778,6 @@ formatchar(PyObject *v)
|
||||||
return (Py_UCS4) -1;
|
return (Py_UCS4) -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
PyObject *buffer;
|
|
||||||
void *data;
|
|
||||||
enum PyUnicode_Kind kind;
|
|
||||||
Py_UCS4 maxchar;
|
|
||||||
Py_ssize_t pos;
|
|
||||||
} unicode_writer_t;
|
|
||||||
|
|
||||||
Py_LOCAL_INLINE(void)
|
|
||||||
unicode_writer_update(unicode_writer_t *writer)
|
|
||||||
{
|
|
||||||
writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
|
|
||||||
writer->data = PyUnicode_DATA(writer->buffer);
|
|
||||||
writer->kind = PyUnicode_KIND(writer->buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
Py_LOCAL(int)
|
|
||||||
unicode_writer_init(unicode_writer_t *writer,
|
|
||||||
Py_ssize_t length, Py_UCS4 maxchar)
|
|
||||||
{
|
|
||||||
writer->pos = 0;
|
|
||||||
writer->buffer = PyUnicode_New(length, maxchar);
|
|
||||||
if (writer->buffer == NULL)
|
|
||||||
return -1;
|
|
||||||
unicode_writer_update(writer);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
Py_LOCAL_INLINE(int)
|
|
||||||
unicode_writer_prepare(unicode_writer_t *writer,
|
|
||||||
Py_ssize_t length, Py_UCS4 maxchar)
|
|
||||||
{
|
|
||||||
Py_ssize_t newlen;
|
|
||||||
PyObject *newbuffer;
|
|
||||||
|
|
||||||
if (length > PY_SSIZE_T_MAX - writer->pos) {
|
|
||||||
PyErr_NoMemory();
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
newlen = writer->pos + length;
|
|
||||||
|
|
||||||
if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) {
|
|
||||||
/* overallocate 25% to limit the number of resize */
|
|
||||||
if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
|
|
||||||
newlen += newlen / 4;
|
|
||||||
|
|
||||||
if (maxchar > writer->maxchar) {
|
|
||||||
/* resize + widen */
|
|
||||||
newbuffer = PyUnicode_New(newlen, maxchar);
|
|
||||||
if (newbuffer == NULL)
|
|
||||||
return -1;
|
|
||||||
PyUnicode_CopyCharacters(newbuffer, 0,
|
|
||||||
writer->buffer, 0, writer->pos);
|
|
||||||
Py_DECREF(writer->buffer);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
newbuffer = resize_compact(writer->buffer, newlen);
|
|
||||||
if (newbuffer == NULL)
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
writer->buffer = newbuffer;
|
|
||||||
unicode_writer_update(writer);
|
|
||||||
}
|
|
||||||
else if (maxchar > writer->maxchar) {
|
|
||||||
if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
|
|
||||||
return -1;
|
|
||||||
unicode_writer_update(writer);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
Py_LOCAL_INLINE(int)
|
|
||||||
unicode_writer_write_str(
|
|
||||||
unicode_writer_t *writer,
|
|
||||||
PyObject *str, Py_ssize_t start, Py_ssize_t length)
|
|
||||||
{
|
|
||||||
Py_UCS4 maxchar;
|
|
||||||
|
|
||||||
assert(str != NULL);
|
|
||||||
assert(PyUnicode_Check(str));
|
|
||||||
if (PyUnicode_READY(str) == -1)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
assert(0 <= start);
|
|
||||||
assert(0 <= length);
|
|
||||||
assert(start + length <= PyUnicode_GET_LENGTH(str));
|
|
||||||
if (length == 0)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
maxchar = _PyUnicode_FindMaxChar(str, start, start + length);
|
|
||||||
if (unicode_writer_prepare(writer, length, maxchar) == -1)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
assert((writer->pos + length) <= PyUnicode_GET_LENGTH(writer->buffer));
|
|
||||||
copy_characters(writer->buffer, writer->pos,
|
|
||||||
str, start, length);
|
|
||||||
writer->pos += length;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
Py_LOCAL_INLINE(int)
|
|
||||||
unicode_writer_write_char(
|
|
||||||
unicode_writer_t *writer,
|
|
||||||
Py_UCS4 ch)
|
|
||||||
{
|
|
||||||
if (unicode_writer_prepare(writer, 1, ch) == -1)
|
|
||||||
return -1;
|
|
||||||
assert((writer->pos + 1) <= PyUnicode_GET_LENGTH(writer->buffer));
|
|
||||||
PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
|
|
||||||
writer->pos += 1;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Finish building: resize the buffer to the number of characters written
   (writer->pos) and return it.

   On success the buffer object stored in writer->buffer is returned.  If
   PyUnicode_Resize() fails, the writer's reference is released,
   writer->buffer is reset to NULL and NULL is returned. */
Py_LOCAL(PyObject *)
unicode_writer_finish(unicode_writer_t *writer)
{
    if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) {
        /* Py_CLEAR rather than Py_DECREF: it tolerates a NULL buffer and
           leaves no dangling pointer behind, so a later
           unicode_writer_dealloc() cannot release the reference twice. */
        Py_CLEAR(writer->buffer);
        return NULL;
    }
    return writer->buffer;
}
|
|
||||||
|
|
||||||
Py_LOCAL(void)
|
|
||||||
unicode_writer_dealloc(unicode_writer_t *writer)
|
|
||||||
{
|
|
||||||
Py_CLEAR(writer->buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
PyUnicode_Format(PyObject *format, PyObject *args)
|
PyUnicode_Format(PyObject *format, PyObject *args)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue