Fixed problems with UTF error reporting macros and some formatting bugs.

2025-08-30 21:48:47 +00:00 · 2000-07-17 18:23:13 +00:00 · 2000-07-17 18:23:13 +00:00 · 9542f48fd5
commit 9542f48fd5
parent cf5f358784
1 changed file with 64 additions and 45 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -633,13 +633,6 @@ int utf8_decoding_error(const char **source,
    }
 }
 #define UTF8_ERROR(details) \
  do {                                                      \
      if (utf8_decoding_error(&s, &p, errors, (details)))   \
          goto onError;                                     \
      goto nextchar;                                        \
  } while (0)
 PyObject *PyUnicode_DecodeUTF8(const char *s,
 			       int size,
 			       const char *errors)
@ -648,6 +641,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    /* Note: size will always be longer than the resulting Unicode
       character count */
@ -672,36 +666,48 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
        n = utf8_code_length[ch];
-        if (s + n > e)
+        if (s + n > e) {
-            UTF8_ERROR("unexpected end of data");
+	    errmsg = "unexpected end of data";
 	    goto utf8Error;
 	}
        switch (n) {
        case 0:
-            UTF8_ERROR("unexpected code byte");
+            errmsg = "unexpected code byte";
 	    goto utf8Error;
            break;
        case 1:
-            UTF8_ERROR("internal error");
+            errmsg = "internal error";
 	    goto utf8Error;
            break;
        case 2:
-            if ((s[1] & 0xc0) != 0x80) 
+            if ((s[1] & 0xc0) != 0x80) {
-                UTF8_ERROR("invalid data");
+                errmsg = "invalid data";
 		goto utf8Error;
 	    }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
-            if (ch < 0x80)
+            if (ch < 0x80) {
-                UTF8_ERROR("illegal encoding");
+                errmsg = "illegal encoding";
 		goto utf8Error;
 	    }
 	    else
-				*p++ = (Py_UNICODE)ch;
+		*p++ = (Py_UNICODE)ch;
            break;
        case 3:
            if ((s[1] & 0xc0) != 0x80 || 
-                (s[2] & 0xc0) != 0x80) 
+                (s[2] & 0xc0) != 0x80) {
-                UTF8_ERROR("invalid data");
+                errmsg = "invalid data";
 		goto utf8Error;
 	    }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
-            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
+            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
-                UTF8_ERROR("illegal encoding");
+                errmsg = "illegal encoding";
 		goto utf8Error;
 	    }
 	    else
 				*p++ = (Py_UNICODE)ch;
            break;
@ -709,14 +715,20 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
-                (s[3] & 0xc0) != 0x80)
+                (s[3] & 0xc0) != 0x80) {
-                UTF8_ERROR("invalid data");
+                errmsg = "invalid data";
 		goto utf8Error;
 	    }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
-            if ((ch < 0x10000) ||                  /* minimum value allowed for 4 byte encoding */
+            if ((ch < 0x10000) ||   /* minimum value allowed for 4
-                (ch > 0x10ffff))                   /* maximum value allowed for UTF-16 */
+                                       byte encoding */
-                UTF8_ERROR("illegal encoding");
+                (ch > 0x10ffff)) {  /* maximum value allowed for
                                       UTF-16 */
                errmsg = "illegal encoding";
 		goto utf8Error;
 	    }
            /*  compute and append the two surrogates: */
            /*  translate from 10000..10FFFF to 0..FFFF */
@ -731,12 +743,16 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
        default:
            /* Other sizes are only needed for UCS-4 */
-            UTF8_ERROR("unsupported Unicode code range");
+            errmsg = "unsupported Unicode code range";
 	    goto utf8Error;
 	    break;
        }
        s += n;
-
+	continue;
-      nextchar:
+	
-        ;
+    utf8Error:
      if (utf8_decoding_error(&s, &p, errors, errmsg))
          goto onError;
    }
    /* Adjust length */
@ -750,9 +766,8 @@ onError:
    return NULL;
 }
-#undef UTF8_ERROR
+/* Not used anymore, now that the encoder supports UTF-16
-
+   surrogates. */
 /* NOT USED */
 #if 0
 static
 int utf8_encoding_error(const Py_UNICODE **source,
@ -783,7 +798,7 @@ int utf8_encoding_error(const Py_UNICODE **source,
 	return -1;
    }
 }
-#endif /* NOT USED */
+#endif
 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 			       int size,
@ -827,7 +842,7 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 			       surrogates */
 			    cbAllocated += 4*10;
                            if (_PyString_Resize(&v, cbAllocated))
-		goto onError;
+				goto onError;
                        }
                        /* combine the two values */
@ -938,12 +953,6 @@ int utf16_decoding_error(const Py_UNICODE **source,
    }
 }
 #define UTF16_ERROR(details)  do {                       \
    if (utf16_decoding_error(&q, &p, errors, details))   \
        goto onError;                                    \
    continue;                                            \
 } while(0)
 PyObject *PyUnicode_DecodeUTF16(const char *s,
 				int size,
 				const char *errors,
@ -953,6 +962,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
    Py_UNICODE *p;
    const Py_UNICODE *q, *e;
    int bo = 0;
    const char *errmsg = "";
    /* size should be an even number */
    if (size % sizeof(Py_UNICODE) != 0) {
@ -1012,20 +1022,29 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
 	}
 	/* UTF-16 code pair: */
-	if (q >= e)
+	if (q >= e) {
-	    UTF16_ERROR("unexpected end of data");
+	    errmsg = "unexpected end of data";
 	    goto utf16Error;
 	}
 	if (0xDC00 <= *q && *q <= 0xDFFF) {
 	    q++;
-	    if (0xD800 <= *q && *q <= 0xDBFF)
+	    if (0xD800 <= *q && *q <= 0xDBFF) {
 		/* This is valid data (a UTF-16 surrogate pair), but
 		   we are not able to store this information since our
 		   Py_UNICODE type only has 16 bits... this might
 		   change someday, even though it's unlikely. */
-		UTF16_ERROR("code pairs are not supported");
+		errmsg = "code pairs are not supported";
 		goto utf16Error;
 	    }
 	    else
 		continue;
 	}
-	UTF16_ERROR("illegal encoding");
+	errmsg = "illegal encoding";
 	/* Fall through to report the error */
    utf16Error:
 	if (utf16_decoding_error(&q, &p, errors, errmsg))
 	    goto onError;
    }
    if (byteorder)