Fixed problems with UTF error reporting macros and some formatting bugs.

This commit is contained in:
Marc-André Lemburg 2000-07-17 18:23:13 +00:00
parent cf5f358784
commit 9542f48fd5

View file

@ -633,13 +633,6 @@ int utf8_decoding_error(const char **source,
} }
} }
#define UTF8_ERROR(details) \
do { \
if (utf8_decoding_error(&s, &p, errors, (details))) \
goto onError; \
goto nextchar; \
} while (0)
PyObject *PyUnicode_DecodeUTF8(const char *s, PyObject *PyUnicode_DecodeUTF8(const char *s,
int size, int size,
const char *errors) const char *errors)
@ -648,6 +641,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
const char *e; const char *e;
PyUnicodeObject *unicode; PyUnicodeObject *unicode;
Py_UNICODE *p; Py_UNICODE *p;
const char *errmsg = "";
/* Note: size will always be longer than the resulting Unicode /* Note: size will always be longer than the resulting Unicode
character count */ character count */
@ -672,36 +666,48 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
n = utf8_code_length[ch]; n = utf8_code_length[ch];
if (s + n > e) if (s + n > e) {
UTF8_ERROR("unexpected end of data"); errmsg = "unexpected end of data";
goto utf8Error;
}
switch (n) { switch (n) {
case 0: case 0:
UTF8_ERROR("unexpected code byte"); errmsg = "unexpected code byte";
goto utf8Error;
break; break;
case 1: case 1:
UTF8_ERROR("internal error"); errmsg = "internal error";
goto utf8Error;
break; break;
case 2: case 2:
if ((s[1] & 0xc0) != 0x80) if ((s[1] & 0xc0) != 0x80) {
UTF8_ERROR("invalid data"); errmsg = "invalid data";
goto utf8Error;
}
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
if (ch < 0x80) if (ch < 0x80) {
UTF8_ERROR("illegal encoding"); errmsg = "illegal encoding";
goto utf8Error;
}
else else
*p++ = (Py_UNICODE)ch; *p++ = (Py_UNICODE)ch;
break; break;
case 3: case 3:
if ((s[1] & 0xc0) != 0x80 || if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80) (s[2] & 0xc0) != 0x80) {
UTF8_ERROR("invalid data"); errmsg = "invalid data";
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
UTF8_ERROR("illegal encoding"); errmsg = "illegal encoding";
goto utf8Error;
}
else else
*p++ = (Py_UNICODE)ch; *p++ = (Py_UNICODE)ch;
break; break;
@ -709,14 +715,20 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
case 4: case 4:
if ((s[1] & 0xc0) != 0x80 || if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80) (s[3] & 0xc0) != 0x80) {
UTF8_ERROR("invalid data"); errmsg = "invalid data";
goto utf8Error;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f); ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
/* validate and convert to UTF-16 */ /* validate and convert to UTF-16 */
if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */ if ((ch < 0x10000) || /* minimum value allowed for 4
(ch > 0x10ffff)) /* maximum value allowed for UTF-16 */ byte encoding */
UTF8_ERROR("illegal encoding"); (ch > 0x10ffff)) { /* maximum value allowed for
UTF-16 */
errmsg = "illegal encoding";
goto utf8Error;
}
/* compute and append the two surrogates: */ /* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFF */ /* translate from 10000..10FFFF to 0..FFFF */
@ -731,12 +743,16 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
default: default:
/* Other sizes are only needed for UCS-4 */ /* Other sizes are only needed for UCS-4 */
UTF8_ERROR("unsupported Unicode code range"); errmsg = "unsupported Unicode code range";
goto utf8Error;
break;
} }
s += n; s += n;
continue;
nextchar:
; utf8Error:
if (utf8_decoding_error(&s, &p, errors, errmsg))
goto onError;
} }
/* Adjust length */ /* Adjust length */
@ -750,9 +766,8 @@ onError:
return NULL; return NULL;
} }
#undef UTF8_ERROR /* Not used anymore, now that the encoder supports UTF-16
surrogates. */
/* NOT USED */
#if 0 #if 0
static static
int utf8_encoding_error(const Py_UNICODE **source, int utf8_encoding_error(const Py_UNICODE **source,
@ -783,7 +798,7 @@ int utf8_encoding_error(const Py_UNICODE **source,
return -1; return -1;
} }
} }
#endif /* NOT USED */ #endif
PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
int size, int size,
@ -827,7 +842,7 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
surrogates */ surrogates */
cbAllocated += 4*10; cbAllocated += 4*10;
if (_PyString_Resize(&v, cbAllocated)) if (_PyString_Resize(&v, cbAllocated))
goto onError; goto onError;
} }
/* combine the two values */ /* combine the two values */
@ -938,12 +953,6 @@ int utf16_decoding_error(const Py_UNICODE **source,
} }
} }
#define UTF16_ERROR(details) do { \
if (utf16_decoding_error(&q, &p, errors, details)) \
goto onError; \
continue; \
} while(0)
PyObject *PyUnicode_DecodeUTF16(const char *s, PyObject *PyUnicode_DecodeUTF16(const char *s,
int size, int size,
const char *errors, const char *errors,
@ -953,6 +962,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
Py_UNICODE *p; Py_UNICODE *p;
const Py_UNICODE *q, *e; const Py_UNICODE *q, *e;
int bo = 0; int bo = 0;
const char *errmsg = "";
/* size should be an even number */ /* size should be an even number */
if (size % sizeof(Py_UNICODE) != 0) { if (size % sizeof(Py_UNICODE) != 0) {
@ -1012,20 +1022,29 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
} }
/* UTF-16 code pair: */ /* UTF-16 code pair: */
if (q >= e) if (q >= e) {
UTF16_ERROR("unexpected end of data"); errmsg = "unexpected end of data";
goto utf16Error;
}
if (0xDC00 <= *q && *q <= 0xDFFF) { if (0xDC00 <= *q && *q <= 0xDFFF) {
q++; q++;
if (0xD800 <= *q && *q <= 0xDBFF) if (0xD800 <= *q && *q <= 0xDBFF) {
/* This is valid data (a UTF-16 surrogate pair), but /* This is valid data (a UTF-16 surrogate pair), but
we are not able to store this information since our we are not able to store this information since our
Py_UNICODE type only has 16 bits... this might Py_UNICODE type only has 16 bits... this might
change someday, even though it's unlikely. */ change someday, even though it's unlikely. */
UTF16_ERROR("code pairs are not supported"); errmsg = "code pairs are not supported";
goto utf16Error;
}
else else
continue; continue;
} }
UTF16_ERROR("illegal encoding"); errmsg = "illegal encoding";
/* Fall through to report the error */
utf16Error:
if (utf16_decoding_error(&q, &p, errors, errmsg))
goto onError;
} }
if (byteorder) if (byteorder)