bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)

Py_EncodeLocale() now uses _Py_EncodeUTF8_surrogateescape(), instead of using temporary unicode and bytes objects. So Py_EncodeLocale() doesn't use the Python C API anymore.
2025-10-12 01:43:12 +00:00 · 2017-12-21 15:45:16 +01:00 · 2017-12-21 15:45:16 +01:00 · e47e698da6
commit e47e698da6
parent fbd605151f
2 changed files with 93 additions and 38 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
 }


+/* UTF-8 encoder using the surrogateescape error handler .
+
+   On success, return a pointer to a newly allocated character string (use
+   PyMem_Free() to free the memory).
+
+   On encoding failure, return NULL and write the position of the invalid
+   surrogate character into *error_pos (if error_pos is set).
+
+   On memory allocation failure, return NULL and write (size_t)-1 into
+   *error_pos (if error_pos is set). */
+char*
+_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
+{
+    const Py_ssize_t max_char_size = 4;
+    Py_ssize_t len = wcslen(text);
+
+    assert(len >= 0);
+
+    char *bytes;
+    if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
+        bytes = PyMem_Malloc((len + 1) * max_char_size);
+    }
+    else {
+        bytes = NULL;
+    }
+    if (bytes == NULL) {
+        if (error_pos != NULL) {
+            *error_pos = (size_t)-1;
+        }
+        return NULL;
+    }
+
+    char *p = bytes;
+    Py_ssize_t i;
+    for (i = 0; i < len;) {
+        Py_UCS4 ch = text[i++];
+
+        if (ch < 0x80) {
+            /* Encode ASCII */
+            *p++ = (char) ch;
+
+        }
+        else if (ch < 0x0800) {
+            /* Encode Latin-1 */
+            *p++ = (char)(0xc0 | (ch >> 6));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+        else if (Py_UNICODE_IS_SURROGATE(ch)) {
+            /* surrogateescape error handler */
+            if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+                if (error_pos != NULL) {
+                    *error_pos = (size_t)i - 1;
+                }
+                goto error;
+            }
+            *p++ = (char)(ch & 0xff);
+        }
+        else if (ch < 0x10000) {
+            *p++ = (char)(0xe0 | (ch >> 12));
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+        else {  /* ch >= 0x10000 */
+            assert(ch <= MAX_UNICODE);
+            /* Encode UCS4 Unicode ordinals */
+            *p++ = (char)(0xf0 | (ch >> 18));
+            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+    }
+    *p++ = '\0';
+
+    size_t final_size = (p - bytes);
+    char *bytes2 = PyMem_Realloc(bytes, final_size);
+    if (bytes2 == NULL) {
+        if (error_pos != NULL) {
+            *error_pos = (size_t)-1;
+        }
+        goto error;
+    }
+    return bytes2;
+
+ error:
+    PyMem_Free(bytes);
+    return NULL;
+}
+
+
 /* Primary internal function which creates utf8 encoded bytes objects.

   Allocation strategy:  if the string is short, convert into a stack buffer