bpo-33954: Fix _PyUnicode_InsertThousandsGrouping() (GH-10623)

Fix str.format(), float.__format__() and complex.__format__() methods
for non-ASCII decimal point when using the "n" formatter.

Changes:

* Rewrite _PyUnicode_InsertThousandsGrouping(): it now requires
  a _PyUnicodeWriter object for the buffer and a Python str object
  for digits.
* Rename FILL() macro to unicode_fill(), convert it to static inline function,
  add "assert(0 <= start);" and rework its code.
This commit is contained in:
Victor Stinner 2018-11-26 13:40:01 +01:00 committed by GitHub
parent df108dc661
commit 59423e3ddd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 228 additions and 252 deletions

View file

@ -220,6 +220,38 @@ static PyObject *unicode_empty = NULL;
return unicode_empty; \
} while (0)
/* Fill `length` characters of the buffer `data`, starting at character
   index `start`, with the character `value`.

   `kind` selects how wide each character of `data` is: one of
   PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND
   (PyUnicode_WCHAR_KIND is not supported).  `value` is narrowed to that
   width, so it must already fit; debug builds check this. */
static inline void
unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
             Py_ssize_t start, Py_ssize_t length)
{
    assert(0 <= start);
    /* memset() takes a size_t: a negative length would become a huge count,
       whereas the 2- and 4-byte loops below would silently do nothing. */
    assert(0 <= length);
    assert(kind != PyUnicode_WCHAR_KIND);
    switch (kind) {
    case PyUnicode_1BYTE_KIND: {
        /* The cast below would silently truncate a wider character. */
        assert(value <= 0xff);
        Py_UCS1 ch = (unsigned char)value;
        Py_UCS1 *to = (Py_UCS1 *)data + start;
        memset(to, ch, length);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        /* The cast below would silently truncate a wider character. */
        assert(value <= 0xffff);
        Py_UCS2 ch = (Py_UCS2)value;
        Py_UCS2 *to = (Py_UCS2 *)data + start;
        const Py_UCS2 *end = to + length;
        for (; to < end; ++to) {
            *to = ch;
        }
        break;
    }
    case PyUnicode_4BYTE_KIND: {
        Py_UCS4 ch = value;
        Py_UCS4 *to = (Py_UCS4 *)data + start;
        const Py_UCS4 *end = to + length;
        for (; to < end; ++to) {
            *to = ch;
        }
        break;
    }
    default: Py_UNREACHABLE();
    }
}
/* Forward declaration */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
@ -790,7 +822,6 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/find_max_char.h"
#include "stringlib/localeutil.h"
#include "stringlib/undef.h"
#include "stringlib/ucs1lib.h"
@ -801,7 +832,6 @@ ensure_unicode(PyObject *obj)
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/localeutil.h"
#include "stringlib/undef.h"
#include "stringlib/ucs2lib.h"
@ -812,7 +842,6 @@ ensure_unicode(PyObject *obj)
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/localeutil.h"
#include "stringlib/undef.h"
#include "stringlib/ucs4lib.h"
@ -823,7 +852,6 @@ ensure_unicode(PyObject *obj)
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/localeutil.h"
#include "stringlib/undef.h"
#include "stringlib/unicodedefs.h"
@ -9323,86 +9351,149 @@ any_find_slice(PyObject* s1, PyObject* s2,
return result;
}
/* _PyUnicode_InsertThousandsGrouping() helper functions */
#include "stringlib/localeutil.h"
/**
* InsertThousandsGrouping:
* @writer: Unicode writer.
* @n_buffer: Number of characters in the output buffer (in filling mode,
* counted from writer->pos).
* @digits: Digits we're reading from. Must be NULL in counting mode
* (when @writer is NULL).
* @d_pos: Start of digits string.
* @n_digits: The number of digits in the string, in which we want
* to put the grouping chars.
* @min_width: The minimum width of the digits in the output string.
* Output will be zero-padded on the left to fill.
* @grouping: see definition in localeconv().
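* @thousands_sep: see definition in localeconv().
* @maxchar: Must be non-NULL in counting mode (it is initialized to 127
* there) and NULL in filling mode.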
*
* There are 2 modes: counting and filling. If @writer is NULL,
* we are in counting mode, else filling mode.
* If counting, the required buffer size is returned.
* If filling, we know the buffer will be large enough, so we don't
* need to pass in the buffer size.
* Inserts thousand grouping characters (as defined by grouping and
* thousands_sep) into @writer.
*
* Return value: -1 on error, number of characters otherwise.
**/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
PyObject *unicode, Py_ssize_t index,
_PyUnicodeWriter *writer,
Py_ssize_t n_buffer,
void *digits, Py_ssize_t n_digits,
PyObject *digits,
Py_ssize_t d_pos,
Py_ssize_t n_digits,
Py_ssize_t min_width,
const char *grouping, PyObject *thousands_sep,
const char *grouping,
PyObject *thousands_sep,
Py_UCS4 *maxchar)
{
unsigned int kind, thousands_sep_kind;
char *data, *thousands_sep_data;
Py_ssize_t thousands_sep_len;
Py_ssize_t len;
if (unicode != NULL) {
kind = PyUnicode_KIND(unicode);
data = (char *) PyUnicode_DATA(unicode) + index * kind;
/* Filling mode (writer != NULL) requires digits and no maxchar; counting
   mode (writer == NULL) requires maxchar and no digits. */
if (writer) {
assert(digits != NULL);
assert(maxchar == NULL);
}
else {
kind = PyUnicode_1BYTE_KIND;
data = NULL;
assert(digits == NULL);
assert(maxchar != NULL);
}
thousands_sep_kind = PyUnicode_KIND(thousands_sep);
thousands_sep_data = PyUnicode_DATA(thousands_sep);
thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
if (unicode != NULL && thousands_sep_kind != kind) {
if (thousands_sep_kind < kind) {
thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
if (!thousands_sep_data)
return -1;
}
else {
data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
if (!data)
return -1;
assert(0 <= d_pos);
assert(0 <= n_digits);
assert(0 <= min_width);
assert(grouping != NULL);
if (digits != NULL) {
if (PyUnicode_READY(digits) == -1) {
return -1;
}
}
if (PyUnicode_READY(thousands_sep) == -1) {
return -1;
}
switch (kind) {
case PyUnicode_1BYTE_KIND:
if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
len = asciilib_InsertThousandsGrouping(
(Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
min_width, grouping,
(Py_UCS1 *) thousands_sep_data, thousands_sep_len);
else
len = ucs1lib_InsertThousandsGrouping(
(Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
min_width, grouping,
(Py_UCS1 *) thousands_sep_data, thousands_sep_len);
break;
case PyUnicode_2BYTE_KIND:
len = ucs2lib_InsertThousandsGrouping(
(Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
min_width, grouping,
(Py_UCS2 *) thousands_sep_data, thousands_sep_len);
break;
case PyUnicode_4BYTE_KIND:
len = ucs4lib_InsertThousandsGrouping(
(Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
min_width, grouping,
(Py_UCS4 *) thousands_sep_data, thousands_sep_len);
break;
default:
Py_UNREACHABLE();
Py_ssize_t count = 0;
Py_ssize_t n_zeros;
int loop_broken = 0;
int use_separator = 0; /* First time through, don't append the
separator. They only go between
groups. */
Py_ssize_t buffer_pos;
Py_ssize_t digits_pos;
Py_ssize_t len;
Py_ssize_t n_chars;
Py_ssize_t remaining = n_digits; /* Number of chars remaining to
be looked at */
/* A generator that returns all of the grouping widths, until it
returns 0. */
GroupGenerator groupgen;
GroupGenerator_init(&groupgen, grouping);
const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
/* if digits are not grouped, thousands separator
should be an empty string */
assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
/* Index just past the last of the n_digits digits that start at d_pos. */
digits_pos = d_pos + n_digits;
if (writer) {
/* n_buffer is counted from the writer's current position. */
buffer_pos = writer->pos + n_buffer;
assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
}
if (unicode != NULL && thousands_sep_kind != kind) {
if (thousands_sep_kind < kind)
PyMem_Free(thousands_sep_data);
else
PyMem_Free(data);
else {
buffer_pos = n_buffer;
}
if (unicode == NULL) {
if (!writer) {
/* 127 is the largest ASCII code point. */
*maxchar = 127;
if (len != n_digits) {
*maxchar = Py_MAX(*maxchar,
PyUnicode_MAX_CHAR_VALUE(thousands_sep));
}
}
return len;
while ((len = GroupGenerator_next(&groupgen)) > 0) {
/* Clamp the group width to what is still needed (remaining digits or
   remaining min_width, whichever is larger), with a floor of 1. */
len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
n_zeros = Py_MAX(0, len - remaining);
n_chars = Py_MAX(0, Py_MIN(remaining, len));
/* Use n_zeros zeros and n_chars chars */
/* Count only, don't do anything. */
count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
/* Copy into the writer. */
InsertThousandsGrouping_fill(writer, &buffer_pos,
digits, &digits_pos,
n_chars, n_zeros,
use_separator ? thousands_sep : NULL,
thousands_sep_len, maxchar);
/* Use a separator next time. */
use_separator = 1;
remaining -= n_chars;
min_width -= len;
if (remaining <= 0 && min_width <= 0) {
loop_broken = 1;
break;
}
/* The separator that precedes the next group also counts toward
   min_width. */
min_width -= thousands_sep_len;
}
if (!loop_broken) {
/* We left the loop without using a break statement. */
len = Py_MAX(Py_MAX(remaining, min_width), 1);
n_zeros = Py_MAX(0, len - remaining);
n_chars = Py_MAX(0, Py_MIN(remaining, len));
/* Use n_zeros zeros and n_chars chars */
count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
/* Copy into the writer. */
InsertThousandsGrouping_fill(writer, &buffer_pos,
digits, &digits_pos,
n_chars, n_zeros,
use_separator ? thousands_sep : NULL,
thousands_sep_len, maxchar);
}
return count;
}
@ -10021,30 +10112,6 @@ _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seq
return NULL;
}
/* Fill `length` characters of the buffer `data`, starting at character
   index `start`, with the character `value`.  `kind` selects the character
   width of `data` (PyUnicode_WCHAR_KIND is not supported).

   Each argument is parenthesized and evaluated at most once, so
   expressions with side effects or low-precedence operators are safe. */
#define FILL(kind, data, value, start, length) \
    do { \
        const int fill_kind_ = (int)(kind); \
        const Py_UCS4 fill_value_ = (Py_UCS4)(value); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert(fill_kind_ != PyUnicode_WCHAR_KIND); \
        switch (fill_kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
void
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
Py_UCS4 fill_char)
@ -10056,7 +10123,7 @@ _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
assert(start >= 0);
assert(start + length <= PyUnicode_GET_LENGTH(unicode));
FILL(kind, data, fill_char, start, length);
unicode_fill(kind, data, fill_char, start, length);
}
Py_ssize_t
@ -10127,9 +10194,9 @@ pad(PyObject *self,
kind = PyUnicode_KIND(u);
data = PyUnicode_DATA(u);
if (left)
FILL(kind, data, fill, 0, left);
unicode_fill(kind, data, fill, 0, left);
if (right)
FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
_PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
assert(_PyUnicode_CheckConsistency(u, 1));
return u;
@ -11516,7 +11583,7 @@ unicode_expandtabs_impl(PyObject *self, int tabsize)
if (tabsize > 0) {
incr = tabsize - (line_pos % tabsize);
line_pos += incr;
FILL(kind, dest_data, ' ', j, incr);
unicode_fill(kind, dest_data, ' ', j, incr);
j += incr;
}
}
@ -14792,7 +14859,7 @@ unicode_format_arg_output(struct unicode_formatter_t *ctx,
/* Pad left with the fill character if needed */
if (arg->width > len && !(arg->flags & F_LJUST)) {
sublen = arg->width - len;
FILL(writer->kind, writer->data, fill, writer->pos, sublen);
unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
writer->pos += sublen;
arg->width = len;
}
@ -14824,7 +14891,7 @@ unicode_format_arg_output(struct unicode_formatter_t *ctx,
/* Pad right with the fill character if needed */
if (arg->width > len) {
sublen = arg->width - len;
FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
writer->pos += sublen;
}
return 0;