mirror of
https://github.com/python/cpython.git
synced 2025-10-07 23:51:16 +00:00
PyUnicode_Ready() now sets ascii=1 if maxchar < 128
ascii=1 is no longer reserved for PyASCIIObject. Use PyUnicode_IS_COMPACT_ASCII(obj) to check whether obj is a PyASCIIObject (as before).
This commit is contained in:
parent
1b4f9ceca7
commit
a3b334da6d
3 changed files with 42 additions and 33 deletions
|
@ -224,7 +224,7 @@ typedef struct {
|
||||||
PyUnicode_4BYTE_KIND
|
PyUnicode_4BYTE_KIND
|
||||||
* compact = 1
|
* compact = 1
|
||||||
* ready = 1
|
* ready = 1
|
||||||
* (ascii = 0)
|
* ascii = 0
|
||||||
|
|
||||||
- string created by the legacy API (not ready):
|
- string created by the legacy API (not ready):
|
||||||
|
|
||||||
|
@ -236,7 +236,7 @@ typedef struct {
|
||||||
* data.any is NULL
|
* data.any is NULL
|
||||||
* utf8 is NULL
|
* utf8 is NULL
|
||||||
* interned = SSTATE_NOT_INTERNED
|
* interned = SSTATE_NOT_INTERNED
|
||||||
* (ascii = 0)
|
* ascii = 0
|
||||||
|
|
||||||
- string created by the legacy API, ready:
|
- string created by the legacy API, ready:
|
||||||
|
|
||||||
|
@ -246,7 +246,6 @@ typedef struct {
|
||||||
* compact = 0
|
* compact = 0
|
||||||
* ready = 1
|
* ready = 1
|
||||||
* data.any is not NULL
|
* data.any is not NULL
|
||||||
* (ascii = 0)
|
|
||||||
|
|
||||||
String created by the legacy API becomes ready when calling
|
String created by the legacy API becomes ready when calling
|
||||||
PyUnicode_READY().
|
PyUnicode_READY().
|
||||||
|
@ -278,8 +277,9 @@ typedef struct {
|
||||||
one block for the PyUnicodeObject struct and another for its data
|
one block for the PyUnicodeObject struct and another for its data
|
||||||
buffer. */
|
buffer. */
|
||||||
unsigned int compact:1;
|
unsigned int compact:1;
|
||||||
/* Compact objects which are ASCII-only also have the state.compact
|
/* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
|
||||||
flag set, and use the PyASCIIObject struct. */
|
characters. If ascii is 1 and compact is 1, use the PyASCIIObject
|
||||||
|
structure. */
|
||||||
unsigned int ascii:1;
|
unsigned int ascii:1;
|
||||||
/* The ready flag indicates whether the object layout is initialized
|
/* The ready flag indicates whether the object layout is initialized
|
||||||
completely. This means that this is either a compact object, or
|
completely. This means that this is either a compact object, or
|
||||||
|
@ -304,7 +304,7 @@ typedef struct {
|
||||||
|
|
||||||
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
|
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
|
||||||
PyUnicodeObject structure. The actual string data is initially in the wstr
|
PyUnicodeObject structure. The actual string data is initially in the wstr
|
||||||
block, and copied into the data block using PyUnicode_Ready. */
|
block, and copied into the data block using _PyUnicode_Ready. */
|
||||||
typedef struct {
|
typedef struct {
|
||||||
PyCompactUnicodeObject _base;
|
PyCompactUnicodeObject _base;
|
||||||
union {
|
union {
|
||||||
|
@ -327,7 +327,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
|
||||||
#ifndef Py_LIMITED_API
|
#ifndef Py_LIMITED_API
|
||||||
|
|
||||||
#define PyUnicode_WSTR_LENGTH(op) \
|
#define PyUnicode_WSTR_LENGTH(op) \
|
||||||
(((PyASCIIObject*)op)->state.ascii ? \
|
(PyUnicode_IS_COMPACT_ASCII(op) ? \
|
||||||
((PyASCIIObject*)op)->length : \
|
((PyASCIIObject*)op)->length : \
|
||||||
((PyCompactUnicodeObject*)op)->wstr_length)
|
((PyCompactUnicodeObject*)op)->wstr_length)
|
||||||
|
|
||||||
|
@ -369,10 +369,24 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
|
||||||
#define SSTATE_INTERNED_MORTAL 1
|
#define SSTATE_INTERNED_MORTAL 1
|
||||||
#define SSTATE_INTERNED_IMMORTAL 2
|
#define SSTATE_INTERNED_IMMORTAL 2
|
||||||
|
|
||||||
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)op)->state.ascii)
|
/* Return true if the string contains only ASCII characters, or 0 if not. The
|
||||||
|
string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
|
||||||
|
or Ready calls are performed. */
|
||||||
|
#define PyUnicode_IS_ASCII(op) \
|
||||||
|
(((PyASCIIObject*)op)->state.ascii)
|
||||||
|
|
||||||
|
/* Return true if the string is compact or 0 if not.
|
||||||
|
No type checks or Ready calls are performed. */
|
||||||
|
#define PyUnicode_IS_COMPACT(op) \
|
||||||
|
(((PyASCIIObject*)(op))->state.compact)
|
||||||
|
|
||||||
|
/* Return true if the string is a compact ASCII string (use PyASCIIObject
|
||||||
|
structure), or 0 if not. No type checks or Ready calls are performed. */
|
||||||
|
#define PyUnicode_IS_COMPACT_ASCII(op) \
|
||||||
|
(PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
|
||||||
|
|
||||||
/* String contains only wstr byte characters. This is only possible
|
/* String contains only wstr byte characters. This is only possible
|
||||||
when the string was created with a legacy API and PyUnicode_Ready()
|
when the string was created with a legacy API and _PyUnicode_Ready()
|
||||||
has not been called yet. */
|
has not been called yet. */
|
||||||
#define PyUnicode_WCHAR_KIND 0
|
#define PyUnicode_WCHAR_KIND 0
|
||||||
|
|
||||||
|
@ -399,11 +413,6 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
|
||||||
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
|
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
|
||||||
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
|
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
|
||||||
|
|
||||||
/* Return true if the string is compact or 0 if not.
|
|
||||||
No type checks or Ready calls are performed. */
|
|
||||||
#define PyUnicode_IS_COMPACT(op) \
|
|
||||||
(((PyASCIIObject*)(op))->state.compact)
|
|
||||||
|
|
||||||
/* Return one of the PyUnicode_*_KIND values defined above. */
|
/* Return one of the PyUnicode_*_KIND values defined above. */
|
||||||
#define PyUnicode_KIND(op) \
|
#define PyUnicode_KIND(op) \
|
||||||
(assert(PyUnicode_Check(op)), \
|
(assert(PyUnicode_Check(op)), \
|
||||||
|
@ -500,9 +509,9 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
|
||||||
|
|
||||||
#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready)
|
#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready)
|
||||||
|
|
||||||
/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best
|
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
|
||||||
case. If the canonical representation is not yet set, it will still call
|
case. If the canonical representation is not yet set, it will still call
|
||||||
PyUnicode_Ready().
|
_PyUnicode_Ready().
|
||||||
Returns 0 on success and -1 on errors. */
|
Returns 0 on success and -1 on errors. */
|
||||||
#define PyUnicode_READY(op) \
|
#define PyUnicode_READY(op) \
|
||||||
(assert(PyUnicode_Check(op)), \
|
(assert(PyUnicode_Check(op)), \
|
||||||
|
|
|
@ -288,16 +288,14 @@ _PyUnicode_CheckConsistency(void *op)
|
||||||
ascii = (PyASCIIObject *)op;
|
ascii = (PyASCIIObject *)op;
|
||||||
kind = ascii->state.kind;
|
kind = ascii->state.kind;
|
||||||
|
|
||||||
if (ascii->state.ascii == 1) {
|
if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
|
||||||
assert(kind == PyUnicode_1BYTE_KIND);
|
assert(kind == PyUnicode_1BYTE_KIND);
|
||||||
assert(ascii->state.compact == 1);
|
|
||||||
assert(ascii->state.ready == 1);
|
assert(ascii->state.ready == 1);
|
||||||
}
|
}
|
||||||
else if (ascii->state.compact == 1) {
|
else if (ascii->state.compact == 1) {
|
||||||
assert(kind == PyUnicode_1BYTE_KIND
|
assert(kind == PyUnicode_1BYTE_KIND
|
||||||
|| kind == PyUnicode_2BYTE_KIND
|
|| kind == PyUnicode_2BYTE_KIND
|
||||||
|| kind == PyUnicode_4BYTE_KIND);
|
|| kind == PyUnicode_4BYTE_KIND);
|
||||||
assert(ascii->state.compact == 1);
|
|
||||||
assert(ascii->state.ascii == 0);
|
assert(ascii->state.ascii == 0);
|
||||||
assert(ascii->state.ready == 1);
|
assert(ascii->state.ready == 1);
|
||||||
} else {
|
} else {
|
||||||
|
@ -305,9 +303,9 @@ _PyUnicode_CheckConsistency(void *op)
|
||||||
PyUnicodeObject *unicode = (PyUnicodeObject *)op;
|
PyUnicodeObject *unicode = (PyUnicodeObject *)op;
|
||||||
|
|
||||||
if (kind == PyUnicode_WCHAR_KIND) {
|
if (kind == PyUnicode_WCHAR_KIND) {
|
||||||
assert(!ascii->state.compact == 1);
|
assert(ascii->state.compact == 0);
|
||||||
assert(ascii->state.ascii == 0);
|
assert(ascii->state.ascii == 0);
|
||||||
assert(!ascii->state.ready == 1);
|
assert(ascii->state.ready == 0);
|
||||||
assert(ascii->wstr != NULL);
|
assert(ascii->wstr != NULL);
|
||||||
assert(unicode->data.any == NULL);
|
assert(unicode->data.any == NULL);
|
||||||
assert(compact->utf8 == NULL);
|
assert(compact->utf8 == NULL);
|
||||||
|
@ -317,10 +315,9 @@ _PyUnicode_CheckConsistency(void *op)
|
||||||
assert(kind == PyUnicode_1BYTE_KIND
|
assert(kind == PyUnicode_1BYTE_KIND
|
||||||
|| kind == PyUnicode_2BYTE_KIND
|
|| kind == PyUnicode_2BYTE_KIND
|
||||||
|| kind == PyUnicode_4BYTE_KIND);
|
|| kind == PyUnicode_4BYTE_KIND);
|
||||||
assert(!ascii->state.compact == 1);
|
assert(ascii->state.compact == 0);
|
||||||
assert(ascii->state.ready == 1);
|
assert(ascii->state.ready == 1);
|
||||||
assert(unicode->data.any != NULL);
|
assert(unicode->data.any != NULL);
|
||||||
assert(ascii->state.ascii == 0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -638,7 +635,7 @@ unicode_kind_name(PyObject *unicode)
|
||||||
switch(PyUnicode_KIND(unicode))
|
switch(PyUnicode_KIND(unicode))
|
||||||
{
|
{
|
||||||
case PyUnicode_1BYTE_KIND:
|
case PyUnicode_1BYTE_KIND:
|
||||||
if (PyUnicode_IS_COMPACT_ASCII(unicode))
|
if (PyUnicode_IS_ASCII(unicode))
|
||||||
return "legacy ascii";
|
return "legacy ascii";
|
||||||
else
|
else
|
||||||
return "legacy latin1";
|
return "legacy latin1";
|
||||||
|
@ -654,14 +651,14 @@ unicode_kind_name(PyObject *unicode)
|
||||||
switch(PyUnicode_KIND(unicode))
|
switch(PyUnicode_KIND(unicode))
|
||||||
{
|
{
|
||||||
case PyUnicode_1BYTE_KIND:
|
case PyUnicode_1BYTE_KIND:
|
||||||
if (PyUnicode_IS_COMPACT_ASCII(unicode))
|
if (PyUnicode_IS_ASCII(unicode))
|
||||||
return "ascii";
|
return "ascii";
|
||||||
else
|
else
|
||||||
return "compact latin1";
|
return "latin1";
|
||||||
case PyUnicode_2BYTE_KIND:
|
case PyUnicode_2BYTE_KIND:
|
||||||
return "compact UCS2";
|
return "UCS2";
|
||||||
case PyUnicode_4BYTE_KIND:
|
case PyUnicode_4BYTE_KIND:
|
||||||
return "compact UCS4";
|
return "UCS4";
|
||||||
default:
|
default:
|
||||||
return "<invalid compact kind>";
|
return "<invalid compact kind>";
|
||||||
}
|
}
|
||||||
|
@ -703,7 +700,7 @@ _PyUnicode_Dump(PyObject *op)
|
||||||
if (ascii->wstr == data)
|
if (ascii->wstr == data)
|
||||||
printf("shared ");
|
printf("shared ");
|
||||||
printf("wstr=%p", ascii->wstr);
|
printf("wstr=%p", ascii->wstr);
|
||||||
if (!ascii->state.ascii) {
|
if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
|
||||||
printf(" (%zu), ", compact->wstr_length);
|
printf(" (%zu), ", compact->wstr_length);
|
||||||
if (!ascii->state.compact && compact->utf8 == unicode->data.any)
|
if (!ascii->state.compact && compact->utf8 == unicode->data.any)
|
||||||
printf("shared ");
|
printf("shared ");
|
||||||
|
@ -954,9 +951,9 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
|
||||||
/* check if max_char(from substring) <= max_char(to) */
|
/* check if max_char(from substring) <= max_char(to) */
|
||||||
if (from_kind > to_kind
|
if (from_kind > to_kind
|
||||||
/* latin1 => ascii */
|
/* latin1 => ascii */
|
||||||
|| (PyUnicode_IS_COMPACT_ASCII(to)
|
|| (PyUnicode_IS_ASCII(to)
|
||||||
&& to_kind == PyUnicode_1BYTE_KIND
|
&& to_kind == PyUnicode_1BYTE_KIND
|
||||||
&& !PyUnicode_IS_COMPACT_ASCII(from)))
|
&& !PyUnicode_IS_ASCII(from)))
|
||||||
{
|
{
|
||||||
/* slow path to check for character overflow */
|
/* slow path to check for character overflow */
|
||||||
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
|
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
|
||||||
|
@ -1115,10 +1112,12 @@ unicode_ready(PyObject **p_obj, int replace)
|
||||||
_PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
|
_PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
|
||||||
_PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
|
_PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
|
||||||
if (maxchar < 128) {
|
if (maxchar < 128) {
|
||||||
|
_PyUnicode_STATE(unicode).ascii = 1;
|
||||||
_PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
|
_PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
|
||||||
_PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
|
_PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
_PyUnicode_STATE(unicode).ascii = 0;
|
||||||
_PyUnicode_UTF8(unicode) = NULL;
|
_PyUnicode_UTF8(unicode) = NULL;
|
||||||
_PyUnicode_UTF8_LENGTH(unicode) = 0;
|
_PyUnicode_UTF8_LENGTH(unicode) = 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1132,15 +1132,16 @@ class PyUnicodeObjectPtr(PyObjectPtr):
|
||||||
compact = self.field('_base')
|
compact = self.field('_base')
|
||||||
ascii = compact['_base']
|
ascii = compact['_base']
|
||||||
state = ascii['state']
|
state = ascii['state']
|
||||||
|
is_compact_ascii = (int(state['ascii']) and int(state['compact']))
|
||||||
field_length = long(ascii['length'])
|
field_length = long(ascii['length'])
|
||||||
if not int(state['ready']):
|
if not int(state['ready']):
|
||||||
# string is not ready
|
# string is not ready
|
||||||
may_have_surrogates = True
|
may_have_surrogates = True
|
||||||
field_str = ascii['wstr']
|
field_str = ascii['wstr']
|
||||||
if not int(state['ascii']):
|
if not is_compact_ascii:
|
||||||
field_length = compact('wstr_length')
|
field_length = compact('wstr_length')
|
||||||
else:
|
else:
|
||||||
if int(state['ascii']):
|
if is_compact_ascii:
|
||||||
field_str = ascii.address + 1
|
field_str = ascii.address + 1
|
||||||
elif int(state['compact']):
|
elif int(state['compact']):
|
||||||
field_str = compact.address + 1
|
field_str = compact.address + 1
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue