Change PyUnicode_FromString[AndSize] to expect UTF-8.

2025-11-30 23:08:56 +00:00 · 2007-08-05 20:26:11 +00:00 · 2007-08-05 20:26:11 +00:00 · 9c121069d3
commit 9c121069d3
parent 64ce5052e1
3 changed files with 25 additions and 20 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -427,7 +427,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 {
    PyUnicodeObject *unicode;
    /* If the Unicode data is known at construction time, we can apply
-       some optimizations which share commonly used objects. */
+       some optimizations which share commonly used objects.
+       Also, this means the input must be UTF-8, so fall back to the
+       UTF-8 decoder at the end. */
    if (u != NULL) {

 	/* Optimization for empty strings */
@@ -436,8 +438,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 	    return (PyObject *)unicode_empty;
 	}

-	/* Single characters are shared when using this constructor */
-	if (size == 1) {
+	/* Single characters are shared when using this constructor.
+           Restrict to ASCII, since the input must be UTF-8. */
+	if (size == 1 && Py_CHARMASK(*u) < 128) {
 	    unicode = unicode_latin1[Py_CHARMASK(*u)];
 	    if (!unicode) {
 		unicode = _PyUnicode_New(1);
@@ -449,21 +452,14 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 	    Py_INCREF(unicode);
 	    return (PyObject *)unicode;
 	}
+
+        return PyUnicode_DecodeUTF8(u, size, NULL);
    }

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

-    /* Copy the Unicode data into the new object */
-    if (u != NULL) {
-        Py_UNICODE *p = unicode->str;
-        while (size--)
-            *p++ = Py_CHARMASK(*u++);
-        /* Don't need to write trailing 0 because
-           that's already done by _PyUnicode_New */
-    }
-
    return (PyObject *)unicode;
 }