mirror of
https://github.com/python/cpython.git
synced 2025-08-30 21:48:47 +00:00
Fixed problems with UTF error reporting macros and some formatting bugs.
This commit is contained in:
parent
cf5f358784
commit
9542f48fd5
1 changed file with 64 additions and 45 deletions
|
@ -633,13 +633,6 @@ int utf8_decoding_error(const char **source,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define UTF8_ERROR(details) \
|
|
||||||
do { \
|
|
||||||
if (utf8_decoding_error(&s, &p, errors, (details))) \
|
|
||||||
goto onError; \
|
|
||||||
goto nextchar; \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
PyObject *PyUnicode_DecodeUTF8(const char *s,
|
PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||||
int size,
|
int size,
|
||||||
const char *errors)
|
const char *errors)
|
||||||
|
@ -648,6 +641,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||||
const char *e;
|
const char *e;
|
||||||
PyUnicodeObject *unicode;
|
PyUnicodeObject *unicode;
|
||||||
Py_UNICODE *p;
|
Py_UNICODE *p;
|
||||||
|
const char *errmsg = "";
|
||||||
|
|
||||||
/* Note: size will always be longer than the resulting Unicode
|
/* Note: size will always be longer than the resulting Unicode
|
||||||
character count */
|
character count */
|
||||||
|
@ -672,36 +666,48 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||||
|
|
||||||
n = utf8_code_length[ch];
|
n = utf8_code_length[ch];
|
||||||
|
|
||||||
if (s + n > e)
|
if (s + n > e) {
|
||||||
UTF8_ERROR("unexpected end of data");
|
errmsg = "unexpected end of data";
|
||||||
|
goto utf8Error;
|
||||||
|
}
|
||||||
|
|
||||||
switch (n) {
|
switch (n) {
|
||||||
|
|
||||||
case 0:
|
case 0:
|
||||||
UTF8_ERROR("unexpected code byte");
|
errmsg = "unexpected code byte";
|
||||||
|
goto utf8Error;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 1:
|
case 1:
|
||||||
UTF8_ERROR("internal error");
|
errmsg = "internal error";
|
||||||
|
goto utf8Error;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 2:
|
case 2:
|
||||||
if ((s[1] & 0xc0) != 0x80)
|
if ((s[1] & 0xc0) != 0x80) {
|
||||||
UTF8_ERROR("invalid data");
|
errmsg = "invalid data";
|
||||||
|
goto utf8Error;
|
||||||
|
}
|
||||||
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
|
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
|
||||||
if (ch < 0x80)
|
if (ch < 0x80) {
|
||||||
UTF8_ERROR("illegal encoding");
|
errmsg = "illegal encoding";
|
||||||
|
goto utf8Error;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
*p++ = (Py_UNICODE)ch;
|
*p++ = (Py_UNICODE)ch;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 3:
|
case 3:
|
||||||
if ((s[1] & 0xc0) != 0x80 ||
|
if ((s[1] & 0xc0) != 0x80 ||
|
||||||
(s[2] & 0xc0) != 0x80)
|
(s[2] & 0xc0) != 0x80) {
|
||||||
UTF8_ERROR("invalid data");
|
errmsg = "invalid data";
|
||||||
|
goto utf8Error;
|
||||||
|
}
|
||||||
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
||||||
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
|
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
|
||||||
UTF8_ERROR("illegal encoding");
|
errmsg = "illegal encoding";
|
||||||
|
goto utf8Error;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
*p++ = (Py_UNICODE)ch;
|
*p++ = (Py_UNICODE)ch;
|
||||||
break;
|
break;
|
||||||
|
@ -709,14 +715,20 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||||
case 4:
|
case 4:
|
||||||
if ((s[1] & 0xc0) != 0x80 ||
|
if ((s[1] & 0xc0) != 0x80 ||
|
||||||
(s[2] & 0xc0) != 0x80 ||
|
(s[2] & 0xc0) != 0x80 ||
|
||||||
(s[3] & 0xc0) != 0x80)
|
(s[3] & 0xc0) != 0x80) {
|
||||||
UTF8_ERROR("invalid data");
|
errmsg = "invalid data";
|
||||||
|
goto utf8Error;
|
||||||
|
}
|
||||||
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
||||||
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
||||||
/* validate and convert to UTF-16 */
|
/* validate and convert to UTF-16 */
|
||||||
if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */
|
if ((ch < 0x10000) || /* minimum value allowed for 4
|
||||||
(ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
|
byte encoding */
|
||||||
UTF8_ERROR("illegal encoding");
|
(ch > 0x10ffff)) { /* maximum value allowed for
|
||||||
|
UTF-16 */
|
||||||
|
errmsg = "illegal encoding";
|
||||||
|
goto utf8Error;
|
||||||
|
}
|
||||||
/* compute and append the two surrogates: */
|
/* compute and append the two surrogates: */
|
||||||
|
|
||||||
/* translate from 10000..10FFFF to 0..FFFF */
|
/* translate from 10000..10FFFF to 0..FFFF */
|
||||||
|
@ -731,12 +743,16 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||||
|
|
||||||
default:
|
default:
|
||||||
/* Other sizes are only needed for UCS-4 */
|
/* Other sizes are only needed for UCS-4 */
|
||||||
UTF8_ERROR("unsupported Unicode code range");
|
errmsg = "unsupported Unicode code range";
|
||||||
|
goto utf8Error;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
s += n;
|
s += n;
|
||||||
|
continue;
|
||||||
nextchar:
|
|
||||||
;
|
utf8Error:
|
||||||
|
if (utf8_decoding_error(&s, &p, errors, errmsg))
|
||||||
|
goto onError;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Adjust length */
|
/* Adjust length */
|
||||||
|
@ -750,9 +766,8 @@ onError:
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef UTF8_ERROR
|
/* Not used anymore, now that the encoder supports UTF-16
|
||||||
|
surrogates. */
|
||||||
/* NOT USED */
|
|
||||||
#if 0
|
#if 0
|
||||||
static
|
static
|
||||||
int utf8_encoding_error(const Py_UNICODE **source,
|
int utf8_encoding_error(const Py_UNICODE **source,
|
||||||
|
@ -783,7 +798,7 @@ int utf8_encoding_error(const Py_UNICODE **source,
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* NOT USED */
|
#endif
|
||||||
|
|
||||||
PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
int size,
|
int size,
|
||||||
|
@ -827,7 +842,7 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
surrogates */
|
surrogates */
|
||||||
cbAllocated += 4*10;
|
cbAllocated += 4*10;
|
||||||
if (_PyString_Resize(&v, cbAllocated))
|
if (_PyString_Resize(&v, cbAllocated))
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* combine the two values */
|
/* combine the two values */
|
||||||
|
@ -938,12 +953,6 @@ int utf16_decoding_error(const Py_UNICODE **source,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define UTF16_ERROR(details) do { \
|
|
||||||
if (utf16_decoding_error(&q, &p, errors, details)) \
|
|
||||||
goto onError; \
|
|
||||||
continue; \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
PyObject *PyUnicode_DecodeUTF16(const char *s,
|
PyObject *PyUnicode_DecodeUTF16(const char *s,
|
||||||
int size,
|
int size,
|
||||||
const char *errors,
|
const char *errors,
|
||||||
|
@ -953,6 +962,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
|
||||||
Py_UNICODE *p;
|
Py_UNICODE *p;
|
||||||
const Py_UNICODE *q, *e;
|
const Py_UNICODE *q, *e;
|
||||||
int bo = 0;
|
int bo = 0;
|
||||||
|
const char *errmsg = "";
|
||||||
|
|
||||||
/* size should be an even number */
|
/* size should be an even number */
|
||||||
if (size % sizeof(Py_UNICODE) != 0) {
|
if (size % sizeof(Py_UNICODE) != 0) {
|
||||||
|
@ -1012,20 +1022,29 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
|
||||||
}
|
}
|
||||||
|
|
||||||
/* UTF-16 code pair: */
|
/* UTF-16 code pair: */
|
||||||
if (q >= e)
|
if (q >= e) {
|
||||||
UTF16_ERROR("unexpected end of data");
|
errmsg = "unexpected end of data";
|
||||||
|
goto utf16Error;
|
||||||
|
}
|
||||||
if (0xDC00 <= *q && *q <= 0xDFFF) {
|
if (0xDC00 <= *q && *q <= 0xDFFF) {
|
||||||
q++;
|
q++;
|
||||||
if (0xD800 <= *q && *q <= 0xDBFF)
|
if (0xD800 <= *q && *q <= 0xDBFF) {
|
||||||
/* This is valid data (a UTF-16 surrogate pair), but
|
/* This is valid data (a UTF-16 surrogate pair), but
|
||||||
we are not able to store this information since our
|
we are not able to store this information since our
|
||||||
Py_UNICODE type only has 16 bits... this might
|
Py_UNICODE type only has 16 bits... this might
|
||||||
change someday, even though it's unlikely. */
|
change someday, even though it's unlikely. */
|
||||||
UTF16_ERROR("code pairs are not supported");
|
errmsg = "code pairs are not supported";
|
||||||
|
goto utf16Error;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
UTF16_ERROR("illegal encoding");
|
errmsg = "illegal encoding";
|
||||||
|
/* Fall through to report the error */
|
||||||
|
|
||||||
|
utf16Error:
|
||||||
|
if (utf16_decoding_error(&q, &p, errors, errmsg))
|
||||||
|
goto onError;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (byteorder)
|
if (byteorder)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue