mirror of
https://github.com/python/cpython.git
synced 2025-07-07 19:35:27 +00:00
gh-129173: Use _PyUnicodeError_GetParams
in PyCodec_SurrogatePassErrors
(GH-129134)
This commit is contained in:
parent
303043f506
commit
1775091dc1
1 changed files with 197 additions and 155 deletions
352
Python/codecs.c
352
Python/codecs.c
|
@ -1095,7 +1095,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
|
|||
#define ENC_UTF32LE 4
|
||||
|
||||
static int
|
||||
get_standard_encoding(const char *encoding, int *bytelength)
|
||||
get_standard_encoding_impl(const char *encoding, int *bytelength)
|
||||
{
|
||||
if (Py_TOLOWER(encoding[0]) == 'u' &&
|
||||
Py_TOLOWER(encoding[1]) == 't' &&
|
||||
|
@ -1153,165 +1153,204 @@ get_standard_encoding(const char *encoding, int *bytelength)
|
|||
return ENC_UNKNOWN;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
|
||||
{
|
||||
const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
|
||||
if (encoding_cstr == NULL) {
|
||||
return -1;
|
||||
}
|
||||
*code = get_standard_encoding_impl(encoding_cstr, bytelength);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// --- handler: 'surrogatepass' -----------------------------------------------
|
||||
|
||||
static PyObject *
|
||||
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
|
||||
{
|
||||
PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
|
||||
if (encoding == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
int code, bytelength;
|
||||
int rc = get_standard_encoding(encoding, &code, &bytelength);
|
||||
Py_DECREF(encoding);
|
||||
if (rc < 0) {
|
||||
return NULL;
|
||||
}
|
||||
if (code == ENC_UNKNOWN) {
|
||||
goto bail;
|
||||
}
|
||||
|
||||
PyObject *obj;
|
||||
Py_ssize_t objlen, start, end, slen;
|
||||
if (_PyUnicodeError_GetParams(exc,
|
||||
&obj, &objlen,
|
||||
&start, &end, &slen, false) < 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (slen > PY_SSIZE_T_MAX / bytelength) {
|
||||
end = start + PY_SSIZE_T_MAX / bytelength;
|
||||
end = Py_MIN(end, objlen);
|
||||
slen = Py_MAX(0, end - start);
|
||||
}
|
||||
|
||||
PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
|
||||
if (res == NULL) {
|
||||
Py_DECREF(obj);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
|
||||
for (Py_ssize_t i = start; i < end; i++) {
|
||||
/* object is guaranteed to be "ready" */
|
||||
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
|
||||
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
/* Not a surrogate, fail with original exception */
|
||||
Py_DECREF(obj);
|
||||
Py_DECREF(res);
|
||||
goto bail;
|
||||
}
|
||||
switch (code) {
|
||||
case ENC_UTF8: {
|
||||
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
|
||||
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
|
||||
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
|
||||
break;
|
||||
}
|
||||
case ENC_UTF16LE: {
|
||||
*outp++ = (unsigned char)ch;
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
break;
|
||||
}
|
||||
case ENC_UTF16BE: {
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
*outp++ = (unsigned char)ch;
|
||||
break;
|
||||
}
|
||||
case ENC_UTF32LE: {
|
||||
*outp++ = (unsigned char)ch;
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
*outp++ = (unsigned char)(ch >> 16);
|
||||
*outp++ = (unsigned char)(ch >> 24);
|
||||
break;
|
||||
}
|
||||
case ENC_UTF32BE: {
|
||||
*outp++ = (unsigned char)(ch >> 24);
|
||||
*outp++ = (unsigned char)(ch >> 16);
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
*outp++ = (unsigned char)ch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Py_DECREF(obj);
|
||||
PyObject *restuple = Py_BuildValue("(Nn)", res, end);
|
||||
return restuple;
|
||||
|
||||
bail:
|
||||
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
static PyObject *
|
||||
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
|
||||
{
|
||||
PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
|
||||
if (encoding == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
int code, bytelength;
|
||||
int rc = get_standard_encoding(encoding, &code, &bytelength);
|
||||
Py_DECREF(encoding);
|
||||
if (rc < 0) {
|
||||
return NULL;
|
||||
}
|
||||
if (code == ENC_UNKNOWN) {
|
||||
goto bail;
|
||||
}
|
||||
|
||||
PyObject *obj;
|
||||
Py_ssize_t objlen, start, end, slen;
|
||||
if (_PyUnicodeError_GetParams(exc,
|
||||
&obj, &objlen,
|
||||
&start, &end, &slen, true) < 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Try decoding a single surrogate character. If
|
||||
there are more, let the codec call us again. */
|
||||
Py_UCS4 ch = 0;
|
||||
const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
|
||||
p += start;
|
||||
|
||||
if (objlen - start >= bytelength) {
|
||||
switch (code) {
|
||||
case ENC_UTF8: {
|
||||
if ((p[0] & 0xf0) == 0xe0 &&
|
||||
(p[1] & 0xc0) == 0x80 &&
|
||||
(p[2] & 0xc0) == 0x80)
|
||||
{
|
||||
/* it's a three-byte code */
|
||||
ch = ((p[0] & 0x0f) << 12) +
|
||||
((p[1] & 0x3f) << 6) +
|
||||
(p[2] & 0x3f);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ENC_UTF16LE: {
|
||||
ch = p[1] << 8 | p[0];
|
||||
break;
|
||||
}
|
||||
case ENC_UTF16BE: {
|
||||
ch = p[0] << 8 | p[1];
|
||||
break;
|
||||
}
|
||||
case ENC_UTF32LE: {
|
||||
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
|
||||
break;
|
||||
}
|
||||
case ENC_UTF32BE: {
|
||||
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Py_DECREF(obj);
|
||||
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
goto bail;
|
||||
}
|
||||
|
||||
PyObject *res = PyUnicode_FromOrdinal(ch);
|
||||
if (res == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
return Py_BuildValue("(Nn)", res, start + bytelength);
|
||||
|
||||
bail:
|
||||
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* This handler is declared static until someone demonstrates
|
||||
a need to call it directly. */
|
||||
static PyObject *
|
||||
PyCodec_SurrogatePassErrors(PyObject *exc)
|
||||
{
|
||||
PyObject *restuple;
|
||||
PyObject *object;
|
||||
PyObject *encode;
|
||||
const char *encoding;
|
||||
int code;
|
||||
int bytelength;
|
||||
Py_ssize_t i;
|
||||
Py_ssize_t start;
|
||||
Py_ssize_t end;
|
||||
PyObject *res;
|
||||
|
||||
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
|
||||
unsigned char *outp;
|
||||
if (PyUnicodeEncodeError_GetStart(exc, &start))
|
||||
return NULL;
|
||||
if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
||||
return NULL;
|
||||
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
|
||||
return NULL;
|
||||
if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
if (!(encoding = PyUnicode_AsUTF8(encode))) {
|
||||
Py_DECREF(object);
|
||||
Py_DECREF(encode);
|
||||
return NULL;
|
||||
}
|
||||
code = get_standard_encoding(encoding, &bytelength);
|
||||
Py_DECREF(encode);
|
||||
if (code == ENC_UNKNOWN) {
|
||||
/* Not supported, fail with original exception */
|
||||
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (end - start > PY_SSIZE_T_MAX / bytelength)
|
||||
end = start + PY_SSIZE_T_MAX / bytelength;
|
||||
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
|
||||
if (!res) {
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
outp = (unsigned char*)PyBytes_AsString(res);
|
||||
for (i = start; i < end; i++) {
|
||||
/* object is guaranteed to be "ready" */
|
||||
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
|
||||
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
/* Not a surrogate, fail with original exception */
|
||||
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
||||
Py_DECREF(res);
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
switch (code) {
|
||||
case ENC_UTF8:
|
||||
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
|
||||
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
|
||||
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
|
||||
break;
|
||||
case ENC_UTF16LE:
|
||||
*outp++ = (unsigned char) ch;
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
break;
|
||||
case ENC_UTF16BE:
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
*outp++ = (unsigned char) ch;
|
||||
break;
|
||||
case ENC_UTF32LE:
|
||||
*outp++ = (unsigned char) ch;
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
*outp++ = (unsigned char)(ch >> 16);
|
||||
*outp++ = (unsigned char)(ch >> 24);
|
||||
break;
|
||||
case ENC_UTF32BE:
|
||||
*outp++ = (unsigned char)(ch >> 24);
|
||||
*outp++ = (unsigned char)(ch >> 16);
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
*outp++ = (unsigned char) ch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
restuple = Py_BuildValue("(On)", res, end);
|
||||
Py_DECREF(res);
|
||||
Py_DECREF(object);
|
||||
return restuple;
|
||||
if (_PyIsUnicodeEncodeError(exc)) {
|
||||
return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
|
||||
}
|
||||
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
|
||||
const unsigned char *p;
|
||||
Py_UCS4 ch = 0;
|
||||
if (PyUnicodeDecodeError_GetStart(exc, &start))
|
||||
return NULL;
|
||||
if (PyUnicodeDecodeError_GetEnd(exc, &end))
|
||||
return NULL;
|
||||
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
|
||||
return NULL;
|
||||
p = (const unsigned char*)PyBytes_AS_STRING(object);
|
||||
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
if (!(encoding = PyUnicode_AsUTF8(encode))) {
|
||||
Py_DECREF(object);
|
||||
Py_DECREF(encode);
|
||||
return NULL;
|
||||
}
|
||||
code = get_standard_encoding(encoding, &bytelength);
|
||||
Py_DECREF(encode);
|
||||
if (code == ENC_UNKNOWN) {
|
||||
/* Not supported, fail with original exception */
|
||||
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Try decoding a single surrogate character. If
|
||||
there are more, let the codec call us again. */
|
||||
p += start;
|
||||
if (PyBytes_GET_SIZE(object) - start >= bytelength) {
|
||||
switch (code) {
|
||||
case ENC_UTF8:
|
||||
if ((p[0] & 0xf0) == 0xe0 &&
|
||||
(p[1] & 0xc0) == 0x80 &&
|
||||
(p[2] & 0xc0) == 0x80) {
|
||||
/* it's a three-byte code */
|
||||
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
|
||||
}
|
||||
break;
|
||||
case ENC_UTF16LE:
|
||||
ch = p[1] << 8 | p[0];
|
||||
break;
|
||||
case ENC_UTF16BE:
|
||||
ch = p[0] << 8 | p[1];
|
||||
break;
|
||||
case ENC_UTF32LE:
|
||||
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
|
||||
break;
|
||||
case ENC_UTF32BE:
|
||||
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Py_DECREF(object);
|
||||
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
/* it's not a surrogate - fail */
|
||||
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
||||
return NULL;
|
||||
}
|
||||
res = PyUnicode_FromOrdinal(ch);
|
||||
if (res == NULL)
|
||||
return NULL;
|
||||
return Py_BuildValue("(Nn)", res, start + bytelength);
|
||||
else if (_PyIsUnicodeDecodeError(exc)) {
|
||||
return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
|
||||
}
|
||||
else {
|
||||
wrong_exception_type(exc);
|
||||
|
@ -1319,6 +1358,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static PyObject *
|
||||
PyCodec_SurrogateEscapeErrors(PyObject *exc)
|
||||
{
|
||||
|
@ -1438,11 +1478,13 @@ namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
|
|||
}
|
||||
|
||||
|
||||
static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
|
||||
static inline PyObject *
|
||||
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
|
||||
{
|
||||
return PyCodec_SurrogatePassErrors(exc);
|
||||
}
|
||||
|
||||
|
||||
static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
|
||||
{
|
||||
return PyCodec_SurrogateEscapeErrors(exc);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue