mirror of
https://github.com/python/cpython.git
synced 2025-08-27 12:16:04 +00:00
bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)
Py_EncodeLocale() now uses _Py_EncodeUTF8_surrogateescape(), instead of using temporary unicode and bytes objects. So Py_EncodeLocale() doesn't use the Python C API anymore.
This commit is contained in:
parent
fbd605151f
commit
e47e698da6
2 changed files with 93 additions and 38 deletions
|
@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
|
|||
}
|
||||
|
||||
|
||||
/* UTF-8 encoder using the surrogateescape error handler .
|
||||
|
||||
On success, return a pointer to a newly allocated character string (use
|
||||
PyMem_Free() to free the memory).
|
||||
|
||||
On encoding failure, return NULL and write the position of the invalid
|
||||
surrogate character into *error_pos (if error_pos is set).
|
||||
|
||||
On memory allocation failure, return NULL and write (size_t)-1 into
|
||||
*error_pos (if error_pos is set). */
|
||||
char*
|
||||
_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
const Py_ssize_t max_char_size = 4;
|
||||
Py_ssize_t len = wcslen(text);
|
||||
|
||||
assert(len >= 0);
|
||||
|
||||
char *bytes;
|
||||
if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
|
||||
bytes = PyMem_Malloc((len + 1) * max_char_size);
|
||||
}
|
||||
else {
|
||||
bytes = NULL;
|
||||
}
|
||||
if (bytes == NULL) {
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char *p = bytes;
|
||||
Py_ssize_t i;
|
||||
for (i = 0; i < len;) {
|
||||
Py_UCS4 ch = text[i++];
|
||||
|
||||
if (ch < 0x80) {
|
||||
/* Encode ASCII */
|
||||
*p++ = (char) ch;
|
||||
|
||||
}
|
||||
else if (ch < 0x0800) {
|
||||
/* Encode Latin-1 */
|
||||
*p++ = (char)(0xc0 | (ch >> 6));
|
||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||
}
|
||||
else if (Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
/* surrogateescape error handler */
|
||||
if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)i - 1;
|
||||
}
|
||||
goto error;
|
||||
}
|
||||
*p++ = (char)(ch & 0xff);
|
||||
}
|
||||
else if (ch < 0x10000) {
|
||||
*p++ = (char)(0xe0 | (ch >> 12));
|
||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||
}
|
||||
else { /* ch >= 0x10000 */
|
||||
assert(ch <= MAX_UNICODE);
|
||||
/* Encode UCS4 Unicode ordinals */
|
||||
*p++ = (char)(0xf0 | (ch >> 18));
|
||||
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||
}
|
||||
}
|
||||
*p++ = '\0';
|
||||
|
||||
size_t final_size = (p - bytes);
|
||||
char *bytes2 = PyMem_Realloc(bytes, final_size);
|
||||
if (bytes2 == NULL) {
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
goto error;
|
||||
}
|
||||
return bytes2;
|
||||
|
||||
error:
|
||||
PyMem_Free(bytes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* Primary internal function which creates utf8 encoded bytes objects.
|
||||
|
||||
Allocation strategy: if the string is short, convert into a stack buffer
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue