Fix to the UTF-8 encoder: it failed on 0-length input strings.

Fix for the UTF-8 decoder: it will now accept isolated surrogates (previously it raised an exception, which caused round-trips to fail). Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for marshalling Unicode objects, so we had better make sure it works for all Unicode code points, including isolated surrogates). Bumped the PYC magic in a non-standard way -- please review. This was needed because the old PYC format used illegal UTF-8 sequences for isolated high surrogates, which now raise an exception.
2025-09-27 10:50:04 +00:00 · 2002-02-07 11:33:49 +00:00 · 2002-02-07 11:33:49 +00:00 · bd3be8f0ca
commit bd3be8f0ca
parent 9273ec726c
4 changed files with 71 additions and 31 deletions
--- a/Lib/test/output/test_unicodedata
+++ b/Lib/test/output/test_unicodedata
@ -1,5 +1,5 @@
 test_unicodedata
 Testing Unicode Database...
-Methods: 6c7a7c02657b69d0fdd7a7d174f573194bba2e18
+Methods: 84b72943b1d4320bc1e64a4888f7cdf62eea219a
 Functions: 41e1d4792185d6474a43c83ce4f593b1bdb01f8a
 API: ok
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -23,7 +23,7 @@ if not sys.platform.startswith('java'):
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'") == '''u"'"''')
    verify(repr(u'"') == """u'"'""")
-    verify(repr(u''.join(map(unichr, range(256)))) ==
+    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
@ -38,6 +38,8 @@ if not sys.platform.startswith('java'):
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    testrepr = repr(u''.join(map(unichr, range(256))))
    verify(testrepr == latin1repr)
 def test(method, input, output, *args):
    if verbose:
@ -495,6 +497,7 @@ else:
 verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
 # UTF-8 specific encoding tests:
 verify(u''.encode('utf-8') == '')
 verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
 verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
 verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
@ -552,14 +555,7 @@ for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)
-# Roundtrip safety for non-BMP (just a few chars)
+# Roundtrip safety for BMP (just the first 256 chars)
 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
 for encoding in ('utf-8',
                 'utf-16', 'utf-16-le', 'utf-16-be',
                 #'raw_unicode_escape',
                 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)
 u = u''.join(map(unichr, range(256)))
 for encoding in (
    'latin-1',
@ -571,6 +567,7 @@ for encoding in (
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)
 # Roundtrip safety for BMP (just the first 128 chars)
 u = u''.join(map(unichr, range(128)))
 for encoding in (
    'ascii',
@ -582,6 +579,19 @@ for encoding in (
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)
 # Roundtrip safety for non-BMP (just a few chars)
 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
 for encoding in ('utf-8',
                 'utf-16', 'utf-16-le', 'utf-16-be',
                 #'raw_unicode_escape',
                 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)
 # UTF-8 must be roundtrip safe for all UCS-2 code points
 u = u''.join(map(unichr, range(0x10000)))
 for encoding in ('utf-8',):
    verify(unicode(u.encode(encoding),encoding) == u)
 print 'done.'
 print 'Testing standard mapping codecs...',
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1065,7 +1065,14 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
 		goto utf8Error;
 	    }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
-            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
+            if (ch < 0x0800) {
 		/* Note: UTF-8 encodings of surrogates are considered
 		   legal UTF-8 sequences; 
 		   XXX For wide builds (UCS-4) we should probably try
 		       to recombine the surrogates into a single code
 		       unit.
 		*/
                errmsg = "illegal encoding";
 		goto utf8Error;
 	    }
@ -1175,11 +1182,15 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
    unsigned int cbWritten = 0;
    int i = 0;
    /* Short-cut for empty strings */
    if (size == 0)
 	return PyString_FromStringAndSize(NULL, 0);
    /* We allocate 4 more bytes to have room for at least one full
       UTF-8 sequence; saves a few cycles in the loop below */
    v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
    if (v == NULL)
        return NULL;
    if (size == 0)
        return v;
    p = PyString_AS_STRING(v);
    while (i < size) {
--- a/Python/import.c
+++ b/Python/import.c
@ -41,8 +41,27 @@ extern time_t PyOS_GetLastModificationTime(char *, FILE *);
       the Unicode -U option is in use.  IMO (Tim's), that's a Bad Idea
       (quite apart from that the -U option doesn't work so isn't used
       anyway).
   XXX MAL, 2002-02-07: I had to modify the MAGIC due to a fix of the
       UTF-8 encoder (it previously produced invalid UTF-8 for unpaired
       high surrogates), so I simply bumped the month value to 20 (invalid
       month) and set the day to 1.  This should be recognizable by any
       algorithm relying on the above scheme. Perhaps we should simply
       start counting in increments of 10 from now on ?!
   Known values:
       Python 1.5:   20121
       Python 1.5.1: 20121
       Python 1.5.2: 20121
       Python 2.0:   50823
       Python 2.0.1: 50823
       Python 2.1:   60202
       Python 2.1.1: 60202
       Python 2.1.2: 60202
       Python 2.2:   60717
       Python 2.3a0: 62001
 */
-#define MAGIC (60717 | ((long)'\r'<<16) | ((long)'\n'<<24))
+#define MAGIC (62001 | ((long)'\r'<<16) | ((long)'\n'<<24))
 /* Magic word as global; note that _PyImport_Init() can change the
   value of this global to accommodate for alterations of how the