M.-A. Lemburg <mal@lemburg.com>:

Added support for user-settable default encodings. The
current implementation uses a per-process global that
supplies the value of the encoding parameter whenever it
is passed as NULL (meaning: use the default encoding).
This commit is contained in:
Fred Drake 2000-05-09 19:53:39 +00:00
parent aff601804d
commit e4315f58d2

View file

@ -117,6 +117,16 @@ static PyUnicodeObject *unicode_empty = NULL;
static PyUnicodeObject *unicode_freelist = NULL; static PyUnicodeObject *unicode_freelist = NULL;
static int unicode_freelist_size = 0; static int unicode_freelist_size = 0;
/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init(), which sets it
   to "utf-8".

   The encoding name is kept in a fixed-size buffer of 100 bytes.

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.
*/
static char unicode_default_encoding[100];
/* --- Unicode Object ----------------------------------------------------- */ /* --- Unicode Object ----------------------------------------------------- */
static static
@ -366,7 +376,7 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
Py_INCREF(unicode_empty); Py_INCREF(unicode_empty);
return (PyObject *)unicode_empty; return (PyObject *)unicode_empty;
} }
return PyUnicode_DecodeUTF8(s, len, "strict"); return PyUnicode_Decode(s, len, NULL, "strict");
} }
PyObject *PyUnicode_Decode(const char *s, PyObject *PyUnicode_Decode(const char *s,
@ -376,10 +386,16 @@ PyObject *PyUnicode_Decode(const char *s,
{ {
PyObject *buffer = NULL, *unicode; PyObject *buffer = NULL, *unicode;
/* Shortcut for the default encoding UTF-8 */ if (encoding == NULL)
if (encoding == NULL || encoding = PyUnicode_GetDefaultEncoding();
(strcmp(encoding, "utf-8") == 0))
/* Shortcuts for common default encodings */
if (strcmp(encoding, "utf-8") == 0)
return PyUnicode_DecodeUTF8(s, size, errors); return PyUnicode_DecodeUTF8(s, size, errors);
else if (strcmp(encoding, "latin-1") == 0)
return PyUnicode_DecodeLatin1(s, size, errors);
else if (strcmp(encoding, "ascii") == 0)
return PyUnicode_DecodeASCII(s, size, errors);
/* Decode via the codec registry */ /* Decode via the codec registry */
buffer = PyBuffer_FromMemory((void *)s, size); buffer = PyBuffer_FromMemory((void *)s, size);
@ -428,11 +444,19 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
PyErr_BadArgument(); PyErr_BadArgument();
goto onError; goto onError;
} }
/* Shortcut for the default encoding UTF-8 */
if ((encoding == NULL || if (encoding == NULL)
(strcmp(encoding, "utf-8") == 0)) && encoding = PyUnicode_GetDefaultEncoding();
errors == NULL)
/* Shortcuts for common default encodings */
if (errors == NULL) {
if (strcmp(encoding, "utf-8") == 0)
return PyUnicode_AsUTF8String(unicode); return PyUnicode_AsUTF8String(unicode);
else if (strcmp(encoding, "latin-1") == 0)
return PyUnicode_AsLatin1String(unicode);
else if (strcmp(encoding, "ascii") == 0)
return PyUnicode_AsASCIIString(unicode);
}
/* Encode via the codec registry */ /* Encode via the codec registry */
v = PyCodec_Encode(unicode, encoding, errors); v = PyCodec_Encode(unicode, encoding, errors);
@ -476,6 +500,30 @@ int PyUnicode_GetSize(PyObject *unicode)
return -1; return -1;
} }
/* Return the name of the current default encoding.

   The returned pointer refers to the file-level buffer
   unicode_default_encoding itself, not to a copy, so the caller must
   neither modify nor free it.

   The parameter list is spelled (void) so that the definition is a
   real prototype; an empty () leaves the parameters unspecified in
   pre-C23 C and lets calls with stray arguments compile silently.
*/
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
/* Set the default encoding to the given encoding name.

   encoding: name of the codec to use as the new default. It is first
             looked up through _PyCodec_Lookup(); if that lookup
             fails, the current default is left unchanged.

   Returns 0 on success and -1 if the codec lookup failed (presumably
   with an exception set by _PyCodec_Lookup() -- confirm against the
   codec registry).

   Names that do not fit into the fixed-size buffer are truncated to
   sizeof(unicode_default_encoding) - 1 characters.
*/
int PyUnicode_SetDefaultEncoding(const char *encoding)
{
    PyObject *v;

    /* Make sure the encoding is valid. As side effect, this also
       loads the encoding into the codec registry cache. */
    v = _PyCodec_Lookup(encoding);
    if (v == NULL)
        goto onError;
    Py_DECREF(v);

    /* strncpy() does not write a terminating NUL when the source is
       at least as long as the destination, so copy at most size-1
       bytes and terminate explicitly. Otherwise the strcmp() calls
       on the default encoding could read past the buffer. */
    strncpy(unicode_default_encoding,
            encoding,
            sizeof(unicode_default_encoding) - 1);
    unicode_default_encoding[sizeof(unicode_default_encoding) - 1] = '\0';
    return 0;

 onError:
    return -1;
}
/* --- UTF-8 Codec -------------------------------------------------------- */ /* --- UTF-8 Codec -------------------------------------------------------- */
static static
@ -772,7 +820,8 @@ int utf16_decoding_error(const Py_UNICODE **source,
} }
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"UTF-16 decoding error; unknown error handling code: %.400s", "UTF-16 decoding error; "
"unknown error handling code: %.400s",
errors); errors);
return -1; return -1;
} }
@ -3057,10 +3106,10 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
static char encode__doc__[] = static char encode__doc__[] =
"S.encode([encoding[,errors]]) -> string\n\ "S.encode([encoding[,errors]]) -> string\n\
\n\ \n\
Return an encoded string version of S. Default encoding is 'UTF-8'.\n\ Return an encoded string version of S. Default encoding is the current\n\
errors may be given to set a different error handling scheme. Default\n\ default string encoding. errors may be given to set a different error\n\
is 'strict' meaning that encoding errors raise a ValueError. Other\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\
possible values are 'ignore' and 'replace'."; a ValueError. Other possible values are 'ignore' and 'replace'.";
static PyObject * static PyObject *
unicode_encode(PyUnicodeObject *self, PyObject *args) unicode_encode(PyUnicodeObject *self, PyObject *args)
@ -3816,7 +3865,7 @@ unicode_splitlines(PyUnicodeObject *self, PyObject *args)
static static
PyObject *unicode_str(PyUnicodeObject *self) PyObject *unicode_str(PyUnicodeObject *self)
{ {
return PyUnicode_AsUTF8String((PyObject *)self); return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
} }
static char strip__doc__[] = static char strip__doc__[] =
@ -4246,6 +4295,8 @@ PyObject *PyUnicode_Format(PyObject *format,
return NULL; return NULL;
} }
uformat = PyUnicode_FromObject(format); uformat = PyUnicode_FromObject(format);
if (uformat == NULL)
return NULL;
fmt = PyUnicode_AS_UNICODE(uformat); fmt = PyUnicode_AS_UNICODE(uformat);
fmtcnt = PyUnicode_GET_SIZE(uformat); fmtcnt = PyUnicode_GET_SIZE(uformat);
@ -4322,13 +4373,10 @@ PyObject *PyUnicode_Format(PyObject *format,
"incomplete format key"); "incomplete format key");
goto onError; goto onError;
} }
/* keys are converted to strings (using UTF-8) and /* keys are converted to strings using UTF-8 and
then looked up since Python uses strings to hold then looked up since Python uses strings to hold
variables names etc. in its namespaces and we variables names etc. in its namespaces and we
wouldn't want to break common idioms. The wouldn't want to break common idioms. */
alternative would be using Unicode objects for the
lookup but u"abc" and "abc" have different hash
values (on purpose). */
key = PyUnicode_EncodeUTF8(keystart, key = PyUnicode_EncodeUTF8(keystart,
keylen, keylen,
NULL); NULL);
@ -4472,8 +4520,9 @@ PyObject *PyUnicode_Format(PyObject *format,
"%s argument has non-string str()"); "%s argument has non-string str()");
goto onError; goto onError;
} }
unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp), unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
PyString_GET_SIZE(temp), PyString_GET_SIZE(temp),
NULL,
"strict"); "strict");
Py_DECREF(temp); Py_DECREF(temp);
temp = unicode; temp = unicode;
@ -4659,7 +4708,9 @@ void _PyUnicode_Init()
Py_FatalError("Unicode configuration error: " Py_FatalError("Unicode configuration error: "
"sizeof(Py_UNICODE) != 2 bytes"); "sizeof(Py_UNICODE) != 2 bytes");
/* Init the implementation */
unicode_empty = _PyUnicode_New(0); unicode_empty = _PyUnicode_New(0);
strcpy(unicode_default_encoding, "utf-8");
} }
/* Finalize the Unicode implementation */ /* Finalize the Unicode implementation */