M.-A. Lemburg <mal@lemburg.com>:

Added support for user-settable default encodings. The current implementation uses a per-process global that supplies the value of the encoding parameter whenever that parameter is passed as NULL (meaning: use the default encoding).
2025-11-24 20:30:18 +00:00 · 2000-05-09 19:53:39 +00:00 · 2000-05-09 19:53:39 +00:00 · e4315f58d2
commit e4315f58d2
parent aff601804d
1 changed files with 71 additions and 20 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -117,6 +117,16 @@ static PyUnicodeObject *unicode_empty = NULL;
 static PyUnicodeObject *unicode_freelist = NULL;
 static int unicode_freelist_size = 0;
 /* Default encoding to use and assume when NULL is passed as encoding
    parameter; it is initialized by _PyUnicode_Init(), which sets it
    to "utf-8".

    Always use the PyUnicode_SetDefaultEncoding() and
    PyUnicode_GetDefaultEncoding() APIs to access this global.

    The buffer has a fixed capacity of 100 bytes, so an encoding name
    (plus its terminating NUL) has to fit into that space.
 */
 static char unicode_default_encoding[100];
 /* --- Unicode Object ----------------------------------------------------- */
 static
@ -366,7 +376,7 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
 	Py_INCREF(unicode_empty);
 	return (PyObject *)unicode_empty;
    }
-    return PyUnicode_DecodeUTF8(s, len, "strict");
+    return PyUnicode_Decode(s, len, NULL, "strict");
 }
 PyObject *PyUnicode_Decode(const char *s,
@ -376,10 +386,16 @@ PyObject *PyUnicode_Decode(const char *s,
 {
    PyObject *buffer = NULL, *unicode;
-    /* Shortcut for the default encoding UTF-8 */
+    if (encoding == NULL) 
-    if (encoding == NULL || 
+	encoding = PyUnicode_GetDefaultEncoding();
-        (strcmp(encoding, "utf-8") == 0))
+
    /* Shortcuts for common default encodings */
    if (strcmp(encoding, "utf-8") == 0)
        return PyUnicode_DecodeUTF8(s, size, errors);
    else if (strcmp(encoding, "latin-1") == 0)
        return PyUnicode_DecodeLatin1(s, size, errors);
    else if (strcmp(encoding, "ascii") == 0)
        return PyUnicode_DecodeASCII(s, size, errors);
    /* Decode via the codec registry */
    buffer = PyBuffer_FromMemory((void *)s, size);
@ -428,11 +444,19 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
        PyErr_BadArgument();
        goto onError;
    }
-    /* Shortcut for the default encoding UTF-8 */
+
-    if ((encoding == NULL || 
+    if (encoding == NULL) 
-	 (strcmp(encoding, "utf-8") == 0)) &&
+	encoding = PyUnicode_GetDefaultEncoding();
-	errors == NULL)
+
    /* Shortcuts for common default encodings */
    if (errors == NULL) {
 	if (strcmp(encoding, "utf-8") == 0)
        return PyUnicode_AsUTF8String(unicode);
 	else if (strcmp(encoding, "latin-1") == 0)
 	    return PyUnicode_AsLatin1String(unicode);
 	else if (strcmp(encoding, "ascii") == 0)
 	    return PyUnicode_AsASCIIString(unicode);
    }
    /* Encode via the codec registry */
    v = PyCodec_Encode(unicode, encoding, errors);
@ -476,6 +500,30 @@ int PyUnicode_GetSize(PyObject *unicode)
    return -1;
 }
 /* Return the name of the currently active default encoding.

    The returned pointer refers to the file-level static buffer
    unicode_default_encoding itself, not to a copy. */
 const char *PyUnicode_GetDefaultEncoding()
 {
     const char *current = unicode_default_encoding;

     return current;
 }
 /* Set the default encoding used when NULL is passed as encoding
    parameter.

    The name is first looked up through _PyCodec_Lookup(); if that
    lookup fails, the current default is left untouched and -1 is
    returned. On success the name is copied into the fixed-size
    unicode_default_encoding buffer and 0 is returned.

    Names that do not fit into the buffer are truncated; the stored
    string is always NUL-terminated. */
 int PyUnicode_SetDefaultEncoding(const char *encoding)
 {
     PyObject *v;

     /* Make sure the encoding is valid. As side effect, this also
        loads the encoding into the codec registry cache. */
     v = _PyCodec_Lookup(encoding);
     if (v == NULL)
 	goto onError;
     Py_DECREF(v);

     /* strncpy() does not write a terminating NUL when the source is
        at least as long as the count, so copy at most size-1 bytes and
        terminate explicitly. Otherwise PyUnicode_GetDefaultEncoding()
        could hand out an unterminated string. */
     strncpy(unicode_default_encoding,
 	    encoding,
 	    sizeof(unicode_default_encoding) - 1);
     unicode_default_encoding[sizeof(unicode_default_encoding) - 1] = '\0';
     return 0;

  onError:
     return -1;
 }
 /* --- UTF-8 Codec -------------------------------------------------------- */
 static 
@ -772,7 +820,8 @@ int utf16_decoding_error(const Py_UNICODE **source,
    }
    else {
        PyErr_Format(PyExc_ValueError,
-                     "UTF-16 decoding error; unknown error handling code: %.400s",
+                     "UTF-16 decoding error; "
 		     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
@ -3057,10 +3106,10 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
 static char encode__doc__[] =
 "S.encode([encoding[,errors]]) -> string\n\
 \n\
-Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
+Return an encoded string version of S. Default encoding is the current\n\
-errors may be given to set a different error handling scheme. Default\n\
+default string encoding. errors may be given to set a different error\n\
-is 'strict' meaning that encoding errors raise a ValueError. Other\n\
+handling scheme. Default is 'strict' meaning that encoding errors raise\n\
-possible values are 'ignore' and 'replace'.";
+a ValueError. Other possible values are 'ignore' and 'replace'.";
 static PyObject *
 unicode_encode(PyUnicodeObject *self, PyObject *args)
@ -3816,7 +3865,7 @@ unicode_splitlines(PyUnicodeObject *self, PyObject *args)
 static
 PyObject *unicode_str(PyUnicodeObject *self)
 {
-    return PyUnicode_AsUTF8String((PyObject *)self);
+    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
 }
 static char strip__doc__[] =
@ -4246,6 +4295,8 @@ PyObject *PyUnicode_Format(PyObject *format,
 	return NULL;
    }
    uformat = PyUnicode_FromObject(format);
    if (uformat == NULL)
 	return NULL;
    fmt = PyUnicode_AS_UNICODE(uformat);
    fmtcnt = PyUnicode_GET_SIZE(uformat);
@ -4322,13 +4373,10 @@ PyObject *PyUnicode_Format(PyObject *format,
 				    "incomplete format key");
 		    goto onError;
 		}
-		/* keys are converted to strings (using UTF-8) and
+		/* keys are converted to strings using UTF-8 and
 		   then looked up since Python uses strings to hold
 		   variables names etc. in its namespaces and we
-		   wouldn't want to break common idioms.  The
+		   wouldn't want to break common idioms. */
 		   alternative would be using Unicode objects for the
 		   lookup but u"abc" and "abc" have different hash
 		   values (on purpose). */
 		key = PyUnicode_EncodeUTF8(keystart,
 					   keylen,
 					   NULL);
@ -4472,8 +4520,9 @@ PyObject *PyUnicode_Format(PyObject *format,
 					"%s argument has non-string str()");
 			goto onError;
 		    }
-		    unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
+		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
 						   PyString_GET_SIZE(temp),
 					       NULL,
 						   "strict");
 		    Py_DECREF(temp);
 		    temp = unicode;
@ -4659,7 +4708,9 @@ void _PyUnicode_Init()
        Py_FatalError("Unicode configuration error: "
 		      "sizeof(Py_UNICODE) != 2 bytes");
    /* Init the implementation */
    unicode_empty = _PyUnicode_New(0);
    strcpy(unicode_default_encoding, "utf-8");
 }
 /* Finalize the Unicode implementation */