mirror of
https://github.com/python/cpython.git
synced 2025-08-26 11:45:20 +00:00
M.-A. Lemburg <mal@lemburg.com>:
Added support for user-settable default encodings. The current implementation uses a per-process global that supplies the encoding whenever the encoding parameter is passed as NULL (meaning: use the default encoding).
This commit is contained in:
parent
aff601804d
commit
e4315f58d2
1 changed files with 71 additions and 20 deletions
|
@ -117,6 +117,16 @@ static PyUnicodeObject *unicode_empty = NULL;
|
||||||
static PyUnicodeObject *unicode_freelist = NULL;
|
static PyUnicodeObject *unicode_freelist = NULL;
|
||||||
static int unicode_freelist_size = 0;
|
static int unicode_freelist_size = 0;
|
||||||
|
|
||||||
|
/* Default encoding to use and assume when NULL is passed as encoding
|
||||||
|
parameter; it is initialized by _PyUnicode_Init().
|
||||||
|
|
||||||
|
Always use the PyUnicode_SetDefaultEncoding() and
|
||||||
|
PyUnicode_GetDefaultEncoding() APIs to access this global.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
static char unicode_default_encoding[100];
|
||||||
|
|
||||||
/* --- Unicode Object ----------------------------------------------------- */
|
/* --- Unicode Object ----------------------------------------------------- */
|
||||||
|
|
||||||
static
|
static
|
||||||
|
@ -366,7 +376,7 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
|
||||||
Py_INCREF(unicode_empty);
|
Py_INCREF(unicode_empty);
|
||||||
return (PyObject *)unicode_empty;
|
return (PyObject *)unicode_empty;
|
||||||
}
|
}
|
||||||
return PyUnicode_DecodeUTF8(s, len, "strict");
|
return PyUnicode_Decode(s, len, NULL, "strict");
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *PyUnicode_Decode(const char *s,
|
PyObject *PyUnicode_Decode(const char *s,
|
||||||
|
@ -376,10 +386,16 @@ PyObject *PyUnicode_Decode(const char *s,
|
||||||
{
|
{
|
||||||
PyObject *buffer = NULL, *unicode;
|
PyObject *buffer = NULL, *unicode;
|
||||||
|
|
||||||
/* Shortcut for the default encoding UTF-8 */
|
if (encoding == NULL)
|
||||||
if (encoding == NULL ||
|
encoding = PyUnicode_GetDefaultEncoding();
|
||||||
(strcmp(encoding, "utf-8") == 0))
|
|
||||||
|
/* Shortcuts for common default encodings */
|
||||||
|
if (strcmp(encoding, "utf-8") == 0)
|
||||||
return PyUnicode_DecodeUTF8(s, size, errors);
|
return PyUnicode_DecodeUTF8(s, size, errors);
|
||||||
|
else if (strcmp(encoding, "latin-1") == 0)
|
||||||
|
return PyUnicode_DecodeLatin1(s, size, errors);
|
||||||
|
else if (strcmp(encoding, "ascii") == 0)
|
||||||
|
return PyUnicode_DecodeASCII(s, size, errors);
|
||||||
|
|
||||||
/* Decode via the codec registry */
|
/* Decode via the codec registry */
|
||||||
buffer = PyBuffer_FromMemory((void *)s, size);
|
buffer = PyBuffer_FromMemory((void *)s, size);
|
||||||
|
@ -428,11 +444,19 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
|
||||||
PyErr_BadArgument();
|
PyErr_BadArgument();
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
/* Shortcut for the default encoding UTF-8 */
|
|
||||||
if ((encoding == NULL ||
|
if (encoding == NULL)
|
||||||
(strcmp(encoding, "utf-8") == 0)) &&
|
encoding = PyUnicode_GetDefaultEncoding();
|
||||||
errors == NULL)
|
|
||||||
|
/* Shortcuts for common default encodings */
|
||||||
|
if (errors == NULL) {
|
||||||
|
if (strcmp(encoding, "utf-8") == 0)
|
||||||
return PyUnicode_AsUTF8String(unicode);
|
return PyUnicode_AsUTF8String(unicode);
|
||||||
|
else if (strcmp(encoding, "latin-1") == 0)
|
||||||
|
return PyUnicode_AsLatin1String(unicode);
|
||||||
|
else if (strcmp(encoding, "ascii") == 0)
|
||||||
|
return PyUnicode_AsASCIIString(unicode);
|
||||||
|
}
|
||||||
|
|
||||||
/* Encode via the codec registry */
|
/* Encode via the codec registry */
|
||||||
v = PyCodec_Encode(unicode, encoding, errors);
|
v = PyCodec_Encode(unicode, encoding, errors);
|
||||||
|
@ -476,6 +500,30 @@ int PyUnicode_GetSize(PyObject *unicode)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const char *PyUnicode_GetDefaultEncoding()
|
||||||
|
{
|
||||||
|
return unicode_default_encoding;
|
||||||
|
}
|
||||||
|
|
||||||
|
int PyUnicode_SetDefaultEncoding(const char *encoding)
|
||||||
|
{
|
||||||
|
PyObject *v;
|
||||||
|
|
||||||
|
/* Make sure the encoding is valid. As side effect, this also
|
||||||
|
loads the encoding into the codec registry cache. */
|
||||||
|
v = _PyCodec_Lookup(encoding);
|
||||||
|
if (v == NULL)
|
||||||
|
goto onError;
|
||||||
|
Py_DECREF(v);
|
||||||
|
strncpy(unicode_default_encoding,
|
||||||
|
encoding,
|
||||||
|
sizeof(unicode_default_encoding));
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
onError:
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
/* --- UTF-8 Codec -------------------------------------------------------- */
|
/* --- UTF-8 Codec -------------------------------------------------------- */
|
||||||
|
|
||||||
static
|
static
|
||||||
|
@ -772,7 +820,8 @@ int utf16_decoding_error(const Py_UNICODE **source,
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
PyErr_Format(PyExc_ValueError,
|
PyErr_Format(PyExc_ValueError,
|
||||||
"UTF-16 decoding error; unknown error handling code: %.400s",
|
"UTF-16 decoding error; "
|
||||||
|
"unknown error handling code: %.400s",
|
||||||
errors);
|
errors);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -3057,10 +3106,10 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
|
||||||
static char encode__doc__[] =
|
static char encode__doc__[] =
|
||||||
"S.encode([encoding[,errors]]) -> string\n\
|
"S.encode([encoding[,errors]]) -> string\n\
|
||||||
\n\
|
\n\
|
||||||
Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
|
Return an encoded string version of S. Default encoding is the current\n\
|
||||||
errors may be given to set a different error handling scheme. Default\n\
|
default string encoding. errors may be given to set a different error\n\
|
||||||
is 'strict' meaning that encoding errors raise a ValueError. Other\n\
|
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
|
||||||
possible values are 'ignore' and 'replace'.";
|
a ValueError. Other possible values are 'ignore' and 'replace'.";
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicode_encode(PyUnicodeObject *self, PyObject *args)
|
unicode_encode(PyUnicodeObject *self, PyObject *args)
|
||||||
|
@ -3816,7 +3865,7 @@ unicode_splitlines(PyUnicodeObject *self, PyObject *args)
|
||||||
static
|
static
|
||||||
PyObject *unicode_str(PyUnicodeObject *self)
|
PyObject *unicode_str(PyUnicodeObject *self)
|
||||||
{
|
{
|
||||||
return PyUnicode_AsUTF8String((PyObject *)self);
|
return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
static char strip__doc__[] =
|
static char strip__doc__[] =
|
||||||
|
@ -4246,6 +4295,8 @@ PyObject *PyUnicode_Format(PyObject *format,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
uformat = PyUnicode_FromObject(format);
|
uformat = PyUnicode_FromObject(format);
|
||||||
|
if (uformat == NULL)
|
||||||
|
return NULL;
|
||||||
fmt = PyUnicode_AS_UNICODE(uformat);
|
fmt = PyUnicode_AS_UNICODE(uformat);
|
||||||
fmtcnt = PyUnicode_GET_SIZE(uformat);
|
fmtcnt = PyUnicode_GET_SIZE(uformat);
|
||||||
|
|
||||||
|
@ -4322,13 +4373,10 @@ PyObject *PyUnicode_Format(PyObject *format,
|
||||||
"incomplete format key");
|
"incomplete format key");
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
/* keys are converted to strings (using UTF-8) and
|
/* keys are converted to strings using UTF-8 and
|
||||||
then looked up since Python uses strings to hold
|
then looked up since Python uses strings to hold
|
||||||
variables names etc. in its namespaces and we
|
variables names etc. in its namespaces and we
|
||||||
wouldn't want to break common idioms. The
|
wouldn't want to break common idioms. */
|
||||||
alternative would be using Unicode objects for the
|
|
||||||
lookup but u"abc" and "abc" have different hash
|
|
||||||
values (on purpose). */
|
|
||||||
key = PyUnicode_EncodeUTF8(keystart,
|
key = PyUnicode_EncodeUTF8(keystart,
|
||||||
keylen,
|
keylen,
|
||||||
NULL);
|
NULL);
|
||||||
|
@ -4472,8 +4520,9 @@ PyObject *PyUnicode_Format(PyObject *format,
|
||||||
"%s argument has non-string str()");
|
"%s argument has non-string str()");
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
|
unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
|
||||||
PyString_GET_SIZE(temp),
|
PyString_GET_SIZE(temp),
|
||||||
|
NULL,
|
||||||
"strict");
|
"strict");
|
||||||
Py_DECREF(temp);
|
Py_DECREF(temp);
|
||||||
temp = unicode;
|
temp = unicode;
|
||||||
|
@ -4659,7 +4708,9 @@ void _PyUnicode_Init()
|
||||||
Py_FatalError("Unicode configuration error: "
|
Py_FatalError("Unicode configuration error: "
|
||||||
"sizeof(Py_UNICODE) != 2 bytes");
|
"sizeof(Py_UNICODE) != 2 bytes");
|
||||||
|
|
||||||
|
/* Init the implementation */
|
||||||
unicode_empty = _PyUnicode_New(0);
|
unicode_empty = _PyUnicode_New(0);
|
||||||
|
strcpy(unicode_default_encoding, "utf-8");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Finalize the Unicode implementation */
|
/* Finalize the Unicode implementation */
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue