mirror of
https://github.com/python/cpython.git
synced 2025-08-27 04:05:34 +00:00
bpo-34523: Support surrogatepass in locale codecs (GH-8995)
Add support for the "surrogatepass" error handler in PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault() for the UTF-8 encoding.

Changes:

* _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the surrogatepass error handler (_Py_ERROR_SURROGATEPASS).
* _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use the _Py_error_handler enum instead of "int surrogateescape" to pass the error handler. These functions now return -3 if the error handler is unknown.
* Add unit tests for _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() in test_codecs.
* Rename get_error_handler() to _Py_GetErrorHandler() and expose it as a private function.
* _freeze_importlib no longer needs the config.filesystem_errors="strict" workaround.
This commit is contained in:
parent
c5989cd876
commit
3d4226a832
7 changed files with 423 additions and 117 deletions
|
@ -318,20 +318,8 @@ static int convert_uc(PyObject *obj, void *addr);
|
|||
|
||||
#include "clinic/unicodeobject.c.h"
|
||||
|
||||
typedef enum {
|
||||
_Py_ERROR_UNKNOWN=0,
|
||||
_Py_ERROR_STRICT,
|
||||
_Py_ERROR_SURROGATEESCAPE,
|
||||
_Py_ERROR_REPLACE,
|
||||
_Py_ERROR_IGNORE,
|
||||
_Py_ERROR_BACKSLASHREPLACE,
|
||||
_Py_ERROR_SURROGATEPASS,
|
||||
_Py_ERROR_XMLCHARREFREPLACE,
|
||||
_Py_ERROR_OTHER
|
||||
} _Py_error_handler;
|
||||
|
||||
static _Py_error_handler
|
||||
get_error_handler(const char *errors)
|
||||
_Py_error_handler
|
||||
_Py_GetErrorHandler(const char *errors)
|
||||
{
|
||||
if (errors == NULL || strcmp(errors, "strict") == 0) {
|
||||
return _Py_ERROR_STRICT;
|
||||
|
@ -3327,34 +3315,12 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
static int
|
||||
locale_error_handler(const char *errors, int *surrogateescape)
|
||||
{
|
||||
_Py_error_handler error_handler = get_error_handler(errors);
|
||||
switch (error_handler)
|
||||
{
|
||||
case _Py_ERROR_STRICT:
|
||||
*surrogateescape = 0;
|
||||
return 0;
|
||||
case _Py_ERROR_SURROGATEESCAPE:
|
||||
*surrogateescape = 1;
|
||||
return 0;
|
||||
default:
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"only 'strict' and 'surrogateescape' error handlers "
|
||||
"are supported, not '%s'",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
unicode_encode_locale(PyObject *unicode, const char *errors,
|
||||
int current_locale)
|
||||
{
|
||||
int surrogateescape;
|
||||
if (locale_error_handler(errors, &surrogateescape) < 0)
|
||||
return NULL;
|
||||
_Py_error_handler error_handler = _Py_GetErrorHandler(errors);
|
||||
|
||||
Py_ssize_t wlen;
|
||||
wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
|
||||
|
@ -3373,7 +3339,7 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
|
|||
size_t error_pos;
|
||||
const char *reason;
|
||||
int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
|
||||
current_locale, surrogateescape);
|
||||
current_locale, error_handler);
|
||||
if (res != 0) {
|
||||
if (res == -2) {
|
||||
PyObject *exc;
|
||||
|
@ -3388,6 +3354,9 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
|
|||
}
|
||||
return NULL;
|
||||
}
|
||||
else if (res == -3) {
|
||||
PyErr_SetString(PyExc_ValueError, "unsupported error handler");
|
||||
}
|
||||
else {
|
||||
PyErr_NoMemory();
|
||||
PyMem_Free(wstr);
|
||||
|
@ -3571,9 +3540,7 @@ static PyObject*
|
|||
unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
||||
int current_locale)
|
||||
{
|
||||
int surrogateescape;
|
||||
if (locale_error_handler(errors, &surrogateescape) < 0)
|
||||
return NULL;
|
||||
_Py_error_handler error_handler = _Py_GetErrorHandler(errors);
|
||||
|
||||
if (str[len] != '\0' || (size_t)len != strlen(str)) {
|
||||
PyErr_SetString(PyExc_ValueError, "embedded null byte");
|
||||
|
@ -3584,7 +3551,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
|||
size_t wlen;
|
||||
const char *reason;
|
||||
int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
|
||||
current_locale, surrogateescape);
|
||||
current_locale, error_handler);
|
||||
if (res != 0) {
|
||||
if (res == -2) {
|
||||
PyObject *exc;
|
||||
|
@ -3598,6 +3565,9 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
|||
Py_DECREF(exc);
|
||||
}
|
||||
}
|
||||
else if (res == -3) {
|
||||
PyErr_SetString(PyExc_ValueError, "unsupported error handler");
|
||||
}
|
||||
else {
|
||||
PyErr_NoMemory();
|
||||
}
|
||||
|
@ -4863,7 +4833,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
|||
}
|
||||
|
||||
if (error_handler == _Py_ERROR_UNKNOWN)
|
||||
error_handler = get_error_handler(errors);
|
||||
error_handler = _Py_GetErrorHandler(errors);
|
||||
|
||||
switch (error_handler) {
|
||||
case _Py_ERROR_IGNORE:
|
||||
|
@ -4932,13 +4902,29 @@ onError:
|
|||
is not NULL, write the decoding error message into *reason. */
|
||||
int
|
||||
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
|
||||
const char **reason, int surrogateescape)
|
||||
const char **reason, _Py_error_handler errors)
|
||||
{
|
||||
const char *orig_s = s;
|
||||
const char *e;
|
||||
wchar_t *unicode;
|
||||
Py_ssize_t outpos;
|
||||
|
||||
int surrogateescape = 0;
|
||||
int surrogatepass = 0;
|
||||
switch (errors)
|
||||
{
|
||||
case _Py_ERROR_STRICT:
|
||||
break;
|
||||
case _Py_ERROR_SURROGATEESCAPE:
|
||||
surrogateescape = 1;
|
||||
break;
|
||||
case _Py_ERROR_SURROGATEPASS:
|
||||
surrogatepass = 1;
|
||||
break;
|
||||
default:
|
||||
return -3;
|
||||
}
|
||||
|
||||
/* Note: size will always be longer than the resulting Unicode
|
||||
character count */
|
||||
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
|
||||
|
@ -4971,31 +4957,47 @@ _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
|
|||
#endif
|
||||
}
|
||||
else {
|
||||
if (!ch && s == e)
|
||||
if (!ch && s == e) {
|
||||
break;
|
||||
if (!surrogateescape) {
|
||||
PyMem_RawFree(unicode );
|
||||
if (reason != NULL) {
|
||||
switch (ch) {
|
||||
case 0:
|
||||
*reason = "unexpected end of data";
|
||||
break;
|
||||
case 1:
|
||||
*reason = "invalid start byte";
|
||||
break;
|
||||
/* 2, 3, 4 */
|
||||
default:
|
||||
*reason = "invalid continuation byte";
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (wlen != NULL) {
|
||||
*wlen = s - orig_s;
|
||||
}
|
||||
return -2;
|
||||
}
|
||||
/* surrogateescape */
|
||||
unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
|
||||
|
||||
if (surrogateescape) {
|
||||
unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
|
||||
}
|
||||
else {
|
||||
/* Is it a valid three-byte code? */
|
||||
if (surrogatepass
|
||||
&& (e - s) >= 3
|
||||
&& (s[0] & 0xf0) == 0xe0
|
||||
&& (s[1] & 0xc0) == 0x80
|
||||
&& (s[2] & 0xc0) == 0x80)
|
||||
{
|
||||
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
||||
s += 3;
|
||||
unicode[outpos++] = ch;
|
||||
}
|
||||
else {
|
||||
PyMem_RawFree(unicode );
|
||||
if (reason != NULL) {
|
||||
switch (ch) {
|
||||
case 0:
|
||||
*reason = "unexpected end of data";
|
||||
break;
|
||||
case 1:
|
||||
*reason = "invalid start byte";
|
||||
break;
|
||||
/* 2, 3, 4 */
|
||||
default:
|
||||
*reason = "invalid continuation byte";
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (wlen != NULL) {
|
||||
*wlen = s - orig_s;
|
||||
}
|
||||
return -2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
unicode[outpos] = L'\0';
|
||||
|
@ -5030,13 +5032,29 @@ _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
|
|||
On memory allocation failure, return -1. */
|
||||
int
|
||||
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
|
||||
const char **reason, int raw_malloc, int surrogateescape)
|
||||
const char **reason, int raw_malloc, _Py_error_handler errors)
|
||||
{
|
||||
const Py_ssize_t max_char_size = 4;
|
||||
Py_ssize_t len = wcslen(text);
|
||||
|
||||
assert(len >= 0);
|
||||
|
||||
int surrogateescape = 0;
|
||||
int surrogatepass = 0;
|
||||
switch (errors)
|
||||
{
|
||||
case _Py_ERROR_STRICT:
|
||||
break;
|
||||
case _Py_ERROR_SURROGATEESCAPE:
|
||||
surrogateescape = 1;
|
||||
break;
|
||||
case _Py_ERROR_SURROGATEPASS:
|
||||
surrogatepass = 1;
|
||||
break;
|
||||
default:
|
||||
return -3;
|
||||
}
|
||||
|
||||
if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
|
||||
return -1;
|
||||
}
|
||||
|
@ -5053,8 +5071,19 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
|
|||
|
||||
char *p = bytes;
|
||||
Py_ssize_t i;
|
||||
for (i = 0; i < len; i++) {
|
||||
for (i = 0; i < len; ) {
|
||||
Py_ssize_t ch_pos = i;
|
||||
Py_UCS4 ch = text[i];
|
||||
i++;
|
||||
#if Py_UNICODE_SIZE == 2
|
||||
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
|
||||
&& i < len
|
||||
&& Py_UNICODE_IS_LOW_SURROGATE(text[i]))
|
||||
{
|
||||
ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
|
||||
i++;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ch < 0x80) {
|
||||
/* Encode ASCII */
|
||||
|
@ -5066,11 +5095,11 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
|
|||
*p++ = (char)(0xc0 | (ch >> 6));
|
||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||
}
|
||||
else if (Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
|
||||
/* surrogateescape error handler */
|
||||
if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)i;
|
||||
*error_pos = (size_t)ch_pos;
|
||||
}
|
||||
if (reason != NULL) {
|
||||
*reason = "encoding error";
|
||||
|
@ -6741,7 +6770,7 @@ unicode_encode_ucs1(PyObject *unicode,
|
|||
|
||||
/* cache callback name lookup (if not done yet, i.e. it's the first error) */
|
||||
if (error_handler == _Py_ERROR_UNKNOWN)
|
||||
error_handler = get_error_handler(errors);
|
||||
error_handler = _Py_GetErrorHandler(errors);
|
||||
|
||||
switch (error_handler) {
|
||||
case _Py_ERROR_STRICT:
|
||||
|
@ -6945,7 +6974,7 @@ PyUnicode_DecodeASCII(const char *s,
|
|||
/* byte outsize range 0x00..0x7f: call the error handler */
|
||||
|
||||
if (error_handler == _Py_ERROR_UNKNOWN)
|
||||
error_handler = get_error_handler(errors);
|
||||
error_handler = _Py_GetErrorHandler(errors);
|
||||
|
||||
switch (error_handler)
|
||||
{
|
||||
|
@ -8404,7 +8433,7 @@ charmap_encoding_error(
|
|||
/* cache callback name lookup
|
||||
* (if not done yet, i.e. it's the first error) */
|
||||
if (*error_handler == _Py_ERROR_UNKNOWN)
|
||||
*error_handler = get_error_handler(errors);
|
||||
*error_handler = _Py_GetErrorHandler(errors);
|
||||
|
||||
switch (*error_handler) {
|
||||
case _Py_ERROR_STRICT:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue