mirror of
https://github.com/python/cpython.git
synced 2025-08-03 16:39:00 +00:00
This patch finalizes the move from UTF-8 to a default encoding in
the Python Unicode implementation. The internal buffer used for implementing the buffer protocol is renamed to defenc to make this change visible. It now holds the default encoded version of the Unicode object and is calculated on demand (NULL otherwise). Since the default encoding defaults to ASCII, this will mean that Unicode objects which hold non-ASCII characters will no longer work on C APIs using the "s" or "t" parser markers. C APIs must now explicitly provide Unicode support via the "u", "U" or "es"/"es#" parser markers in order to work with non-ASCII Unicode strings. (Note: this patch will also have to be applied to the 1.6 branch of the CVS tree.)
This commit is contained in:
parent
2b83b4601f
commit
bff879cabb
4 changed files with 109 additions and 65 deletions
|
@ -204,8 +204,9 @@ typedef struct {
|
||||||
int length; /* Length of raw Unicode data in buffer */
|
int length; /* Length of raw Unicode data in buffer */
|
||||||
Py_UNICODE *str; /* Raw Unicode buffer */
|
Py_UNICODE *str; /* Raw Unicode buffer */
|
||||||
long hash; /* Hash value; -1 if not set */
|
long hash; /* Hash value; -1 if not set */
|
||||||
PyObject *utf8str; /* UTF-8 encoded version as Python string,
|
PyObject *defenc; /* (Default) Encoded version as Python
|
||||||
or NULL */
|
string, or NULL; this is used for
|
||||||
|
implementing the buffer protocol */
|
||||||
} PyUnicodeObject;
|
} PyUnicodeObject;
|
||||||
|
|
||||||
extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
|
extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
=============================================================================
|
=============================================================================
|
||||||
Python Unicode Integration Proposal Version: 1.4
|
Python Unicode Integration Proposal Version: 1.6
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -41,16 +41,52 @@ General Remarks:
|
||||||
case-insensitive on input (they will be converted to lower case
|
case-insensitive on input (they will be converted to lower case
|
||||||
by all APIs taking an encoding name as input).
|
by all APIs taking an encoding name as input).
|
||||||
|
|
||||||
Encoding names should follow the name conventions as used by the
|
· Encoding names should follow the name conventions as used by the
|
||||||
Unicode Consortium: spaces are converted to hyphens, e.g. 'utf 16' is
|
Unicode Consortium: spaces are converted to hyphens, e.g. 'utf 16' is
|
||||||
written as 'utf-16'.
|
written as 'utf-16'.
|
||||||
|
|
||||||
Codec modules should use the same names, but with hyphens converted
|
· Codec modules should use the same names, but with hyphens converted
|
||||||
to underscores, e.g. utf_8, utf_16, iso_8859_1.
|
to underscores, e.g. utf_8, utf_16, iso_8859_1.
|
||||||
|
|
||||||
· The <default encoding> should be the widely used 'utf-8' format. This
|
|
||||||
is very close to the standard 7-bit ASCII format and thus resembles the
|
Unicode Default Encoding:
|
||||||
standard used programming nowadays in most aspects.
|
-------------------------
|
||||||
|
|
||||||
|
The Unicode implementation has to make some assumption about the
|
||||||
|
encoding of 8-bit strings passed to it for coercion and about the
|
||||||
|
encoding to use as default for conversion of Unicode to strings when no
|
||||||
|
specific encoding is given. This encoding is called <default encoding>
|
||||||
|
throughout this text.
|
||||||
|
|
||||||
|
For this, the implementation maintains a global which can be set in
|
||||||
|
the site.py Python startup script. Subsequent changes are not
|
||||||
|
possible. The <default encoding> can be set and queried using the
|
||||||
|
two sys module APIs:
|
||||||
|
|
||||||
|
sys.setdefaultencoding(encoding)
|
||||||
|
--> Sets the <default encoding> used by the Unicode implementation.
|
||||||
|
encoding has to be an encoding which is supported by the Python
|
||||||
|
installation, otherwise, a LookupError is raised.
|
||||||
|
|
||||||
|
Note: This API is only available in site.py ! It is removed
|
||||||
|
from the sys module by site.py after usage.
|
||||||
|
|
||||||
|
sys.getdefaultencoding()
|
||||||
|
--> Returns the current <default encoding>.
|
||||||
|
|
||||||
|
If not otherwise defined or set, the <default encoding> defaults to
|
||||||
|
'ascii'. This encoding is also the startup default of Python (and in
|
||||||
|
effect before site.py is executed).
|
||||||
|
|
||||||
|
Note that the default site.py startup module contains disabled
|
||||||
|
optional code which can set the <default encoding> according to the
|
||||||
|
encoding defined by the current locale. The locale module is used to
|
||||||
|
extract the encoding from the locale default settings defined by the
|
||||||
|
OS environment (see locale.py). If the encoding cannot be determined,
|
||||||
|
is unknown or unsupported, the code defaults to setting the <default
|
||||||
|
encoding> to 'ascii'. To enable this code, edit the site.py file or
|
||||||
|
place the appropriate code into the sitecustomize.py module of your
|
||||||
|
Python installation.
|
||||||
|
|
||||||
|
|
||||||
Unicode Constructors:
|
Unicode Constructors:
|
||||||
|
@ -159,8 +195,10 @@ other objects have been coerced to Unicode. For strings this means
|
||||||
that they are interpreted as Unicode string using the <default
|
that they are interpreted as Unicode string using the <default
|
||||||
encoding>.
|
encoding>.
|
||||||
|
|
||||||
For the same reason, Unicode objects should return the same hash value
|
Unicode objects should return the same hash value as their ASCII
|
||||||
as their UTF-8 equivalent strings.
|
equivalent strings. Unicode strings holding non-ASCII values are not
|
||||||
|
guaranteed to return the same hash values as the default encoded
|
||||||
|
equivalent string representation.
|
||||||
|
|
||||||
When compared using cmp() (or PyObject_Compare()) the implementation
|
When compared using cmp() (or PyObject_Compare()) the implementation
|
||||||
should mask TypeErrors raised during the conversion to remain in synch
|
should mask TypeErrors raised during the conversion to remain in synch
|
||||||
|
@ -661,11 +699,10 @@ to the compiler's wchar_t which can be 16 or 32 bit depending on the
|
||||||
compiler/libc/platform being used.
|
compiler/libc/platform being used.
|
||||||
|
|
||||||
Unicode objects should have a pointer to a cached Python string object
|
Unicode objects should have a pointer to a cached Python string object
|
||||||
<defencstr> holding the object's value using the current <default
|
<defenc> holding the object's value using the <default encoding>.
|
||||||
encoding>. This is needed for performance and internal parsing (see
|
This is needed for performance and internal parsing (see Internal
|
||||||
Internal Argument Parsing) reasons. The buffer is filled when the
|
Argument Parsing) reasons. The buffer is filled when the first
|
||||||
first conversion request to the <default encoding> is issued on the
|
conversion request to the <default encoding> is issued on the object.
|
||||||
object.
|
|
||||||
|
|
||||||
Interning is not needed (for now), since Python identifiers are
|
Interning is not needed (for now), since Python identifiers are
|
||||||
defined as being ASCII only.
|
defined as being ASCII only.
|
||||||
|
@ -701,11 +738,11 @@ type).
|
||||||
Buffer Interface:
|
Buffer Interface:
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
Implement the buffer interface using the <defencstr> Python string
|
Implement the buffer interface using the <defenc> Python string
|
||||||
object as basis for bf_getcharbuf (corresponds to the "t#" argument
|
object as basis for bf_getcharbuf (corresponds to the "t#" argument
|
||||||
parsing marker) and the internal buffer for bf_getreadbuf (corresponds
|
parsing marker) and the internal buffer for bf_getreadbuf (corresponds
|
||||||
to the "s#" argument parsing marker). If bf_getcharbuf is requested
|
to the "s#" argument parsing marker). If bf_getcharbuf is requested
|
||||||
and the <defencstr> object does not yet exist, it is created first.
|
and the <defenc> object does not yet exist, it is created first.
|
||||||
|
|
||||||
This has the advantage of being able to write to output streams (which
|
This has the advantage of being able to write to output streams (which
|
||||||
typically use this interface) without additional specification of the
|
typically use this interface) without additional specification of the
|
||||||
|
@ -775,8 +812,8 @@ These markers are used by the PyArg_ParseTuple() APIs:
|
||||||
|
|
||||||
"U": Check for Unicode object and return a pointer to it
|
"U": Check for Unicode object and return a pointer to it
|
||||||
|
|
||||||
"s": For Unicode objects: auto convert them to the <default encoding>
|
"s": For Unicode objects: return a pointer to the object's
|
||||||
and return a pointer to the object's <defencstr> buffer.
|
<defenc> buffer (which uses the <default encoding>).
|
||||||
|
|
||||||
"s#": Access to the Unicode object via the bf_getreadbuf buffer interface
|
"s#": Access to the Unicode object via the bf_getreadbuf buffer interface
|
||||||
(see Buffer Interface); note that the length relates to the buffer
|
(see Buffer Interface); note that the length relates to the buffer
|
||||||
|
@ -785,8 +822,7 @@ These markers are used by the PyArg_ParseTuple() APIs:
|
||||||
|
|
||||||
"t#": Access to the Unicode object via the bf_getcharbuf buffer interface
|
"t#": Access to the Unicode object via the bf_getcharbuf buffer interface
|
||||||
(see Buffer Interface); note that the length relates to the buffer
|
(see Buffer Interface); note that the length relates to the buffer
|
||||||
length, not necessarily to the Unicode string length (this may
|
length, not necessarily to the Unicode string length.
|
||||||
be different depending on the <default encoding>).
|
|
||||||
|
|
||||||
"es":
|
"es":
|
||||||
Takes two parameters: encoding (const char *) and
|
Takes two parameters: encoding (const char *) and
|
||||||
|
@ -1007,6 +1043,11 @@ Encodings:
|
||||||
|
|
||||||
History of this Proposal:
|
History of this Proposal:
|
||||||
-------------------------
|
-------------------------
|
||||||
|
1.6: Changed <defencstr> to <defenc> since this is the name used in the
|
||||||
|
implementation. Added notes about the usage of <defenc> in the
|
||||||
|
buffer protocol implementation.
|
||||||
|
1.5: Added notes about setting the <default encoding>. Fixed some
|
||||||
|
typos (thanks to Andrew Kuchling). Changed <defencstr> to <utf8str>.
|
||||||
1.4: Added note about mixed type comparisons and contains tests.
|
1.4: Added note about mixed type comparisons and contains tests.
|
||||||
Changed treating of Unicode objects in format strings (if used
|
Changed treating of Unicode objects in format strings (if used
|
||||||
with '%s' % u they will now cause the format string to be
|
with '%s' % u they will now cause the format string to be
|
||||||
|
|
|
@ -165,9 +165,9 @@ int _PyUnicode_Resize(register PyUnicodeObject *unicode,
|
||||||
|
|
||||||
reset:
|
reset:
|
||||||
/* Reset the object caches */
|
/* Reset the object caches */
|
||||||
if (unicode->utf8str) {
|
if (unicode->defenc) {
|
||||||
Py_DECREF(unicode->utf8str);
|
Py_DECREF(unicode->defenc);
|
||||||
unicode->utf8str = NULL;
|
unicode->defenc = NULL;
|
||||||
}
|
}
|
||||||
unicode->hash = -1;
|
unicode->hash = -1;
|
||||||
|
|
||||||
|
@ -243,7 +243,7 @@ PyUnicodeObject *_PyUnicode_New(int length)
|
||||||
unicode->str[length] = 0;
|
unicode->str[length] = 0;
|
||||||
unicode->length = length;
|
unicode->length = length;
|
||||||
unicode->hash = -1;
|
unicode->hash = -1;
|
||||||
unicode->utf8str = NULL;
|
unicode->defenc = NULL;
|
||||||
return unicode;
|
return unicode;
|
||||||
|
|
||||||
onError:
|
onError:
|
||||||
|
@ -262,9 +262,9 @@ void _PyUnicode_Free(register PyUnicodeObject *unicode)
|
||||||
unicode->str = NULL;
|
unicode->str = NULL;
|
||||||
unicode->length = 0;
|
unicode->length = 0;
|
||||||
}
|
}
|
||||||
if (unicode->utf8str) {
|
if (unicode->defenc) {
|
||||||
Py_DECREF(unicode->utf8str);
|
Py_DECREF(unicode->defenc);
|
||||||
unicode->utf8str = NULL;
|
unicode->defenc = NULL;
|
||||||
}
|
}
|
||||||
/* Add to free list */
|
/* Add to free list */
|
||||||
*(PyUnicodeObject **)unicode = unicode_freelist;
|
*(PyUnicodeObject **)unicode = unicode_freelist;
|
||||||
|
@ -273,7 +273,7 @@ void _PyUnicode_Free(register PyUnicodeObject *unicode)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
PyMem_DEL(unicode->str);
|
PyMem_DEL(unicode->str);
|
||||||
Py_XDECREF(unicode->utf8str);
|
Py_XDECREF(unicode->defenc);
|
||||||
PyObject_DEL(unicode);
|
PyObject_DEL(unicode);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -529,6 +529,33 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return a Python string holding the default encoded value of the
|
||||||
|
Unicode object.
|
||||||
|
|
||||||
|
The resulting string is cached in the Unicode object for subsequent
|
||||||
|
usage by this function. The cached version is needed to implement
|
||||||
|
the character buffer interface and will live (at least) as long as
|
||||||
|
the Unicode object itself.
|
||||||
|
|
||||||
|
The refcount of the string is *not* incremented.
|
||||||
|
|
||||||
|
*** Exported for internal use by the interpreter only !!! ***
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
|
||||||
|
const char *errors)
|
||||||
|
{
|
||||||
|
PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
|
||||||
|
|
||||||
|
if (v)
|
||||||
|
return v;
|
||||||
|
v = PyUnicode_AsEncodedString(unicode, NULL, errors);
|
||||||
|
if (v && errors == NULL)
|
||||||
|
((PyUnicodeObject *)unicode)->defenc = v;
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
|
Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
|
||||||
{
|
{
|
||||||
if (!PyUnicode_Check(unicode)) {
|
if (!PyUnicode_Check(unicode)) {
|
||||||
|
@ -874,35 +901,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Return a Python string holding the UTF-8 encoded value of the
|
|
||||||
Unicode object.
|
|
||||||
|
|
||||||
The resulting string is cached in the Unicode object for subsequent
|
|
||||||
usage by this function. The cached version is needed to implement
|
|
||||||
the character buffer interface and will live (at least) as long as
|
|
||||||
the Unicode object itself.
|
|
||||||
|
|
||||||
The refcount of the string is *not* incremented.
|
|
||||||
|
|
||||||
*** Exported for internal use by the interpreter only !!! ***
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
|
|
||||||
const char *errors)
|
|
||||||
{
|
|
||||||
PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
|
|
||||||
|
|
||||||
if (v)
|
|
||||||
return v;
|
|
||||||
v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
|
|
||||||
PyUnicode_GET_SIZE(unicode),
|
|
||||||
errors);
|
|
||||||
if (v && errors == NULL)
|
|
||||||
((PyUnicodeObject *)unicode)->utf8str = v;
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
|
|
||||||
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
|
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
|
||||||
{
|
{
|
||||||
PyObject *str;
|
PyObject *str;
|
||||||
|
@ -911,7 +909,9 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
|
||||||
PyErr_BadArgument();
|
PyErr_BadArgument();
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
str = _PyUnicode_AsUTF8String(unicode, NULL);
|
str = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
|
||||||
|
PyUnicode_GET_SIZE(unicode),
|
||||||
|
NULL);
|
||||||
if (str == NULL)
|
if (str == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
Py_INCREF(str);
|
Py_INCREF(str);
|
||||||
|
@ -4519,7 +4519,7 @@ unicode_buffer_getcharbuf(PyUnicodeObject *self,
|
||||||
"accessing non-existent unicode segment");
|
"accessing non-existent unicode segment");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
|
str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
|
||||||
if (str == NULL)
|
if (str == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
*ptr = (void *) PyString_AS_STRING(str);
|
*ptr = (void *) PyString_AS_STRING(str);
|
||||||
|
@ -5130,7 +5130,7 @@ _PyUnicode_Fini(void)
|
||||||
u = *(PyUnicodeObject **)u;
|
u = *(PyUnicodeObject **)u;
|
||||||
if (v->str)
|
if (v->str)
|
||||||
PyMem_DEL(v->str);
|
PyMem_DEL(v->str);
|
||||||
Py_XDECREF(v->utf8str);
|
Py_XDECREF(v->defenc);
|
||||||
PyObject_DEL(v);
|
PyObject_DEL(v);
|
||||||
}
|
}
|
||||||
unicode_freelist = NULL;
|
unicode_freelist = NULL;
|
||||||
|
|
|
@ -372,7 +372,7 @@ convertsimple(PyObject *arg, char **p_format, va_list *p_va, char *msgbuf)
|
||||||
|
|
||||||
/* Internal API needed by convertsimple1(): */
|
/* Internal API needed by convertsimple1(): */
|
||||||
extern
|
extern
|
||||||
PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
|
PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
|
||||||
const char *errors);
|
const char *errors);
|
||||||
|
|
||||||
/* Convert a non-tuple argument. Return NULL if conversion went OK,
|
/* Convert a non-tuple argument. Return NULL if conversion went OK,
|
||||||
|
@ -567,7 +567,8 @@ convertsimple1(PyObject *arg, char **p_format, va_list *p_va)
|
||||||
if (PyString_Check(arg))
|
if (PyString_Check(arg))
|
||||||
*p = PyString_AS_STRING(arg);
|
*p = PyString_AS_STRING(arg);
|
||||||
else if (PyUnicode_Check(arg)) {
|
else if (PyUnicode_Check(arg)) {
|
||||||
arg = _PyUnicode_AsUTF8String(arg, NULL);
|
arg = _PyUnicode_AsDefaultEncodedString(
|
||||||
|
arg, NULL);
|
||||||
if (arg == NULL)
|
if (arg == NULL)
|
||||||
return "unicode conversion error";
|
return "unicode conversion error";
|
||||||
*p = PyString_AS_STRING(arg);
|
*p = PyString_AS_STRING(arg);
|
||||||
|
@ -612,7 +613,8 @@ convertsimple1(PyObject *arg, char **p_format, va_list *p_va)
|
||||||
else if (PyString_Check(arg))
|
else if (PyString_Check(arg))
|
||||||
*p = PyString_AsString(arg);
|
*p = PyString_AsString(arg);
|
||||||
else if (PyUnicode_Check(arg)) {
|
else if (PyUnicode_Check(arg)) {
|
||||||
arg = _PyUnicode_AsUTF8String(arg, NULL);
|
arg = _PyUnicode_AsDefaultEncodedString(
|
||||||
|
arg, NULL);
|
||||||
if (arg == NULL)
|
if (arg == NULL)
|
||||||
return "unicode conversion error";
|
return "unicode conversion error";
|
||||||
*p = PyString_AS_STRING(arg);
|
*p = PyString_AS_STRING(arg);
|
||||||
|
@ -644,7 +646,7 @@ convertsimple1(PyObject *arg, char **p_format, va_list *p_va)
|
||||||
/* Get 'e' parameter: the encoding name */
|
/* Get 'e' parameter: the encoding name */
|
||||||
encoding = (const char *)va_arg(*p_va, const char *);
|
encoding = (const char *)va_arg(*p_va, const char *);
|
||||||
if (encoding == NULL)
|
if (encoding == NULL)
|
||||||
return "(encoding is NULL)";
|
encoding = PyUnicode_GetDefaultEncoding();
|
||||||
|
|
||||||
/* Get 's' parameter: the output buffer to use */
|
/* Get 's' parameter: the output buffer to use */
|
||||||
if (*format != 's')
|
if (*format != 's')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue