mirror of
https://github.com/python/cpython.git
synced 2025-09-27 10:50:04 +00:00
Fix to the UTF-8 encoder: it failed on 0-length input strings.
Fix for the UTF-8 decoder: it will now accept isolated surrogates (previously it raised an exception, which caused round-trips to fail). Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for marshalling Unicode objects, so we had better make sure it works for all Unicode code points, including isolated surrogates). Bumped the PYC magic in a non-standard way -- please review. This was needed because the old PYC format used illegal UTF-8 sequences for isolated high surrogates, which now raise an exception.
This commit is contained in:
parent
9273ec726c
commit
bd3be8f0ca
4 changed files with 71 additions and 31 deletions
|
@ -1,5 +1,5 @@
|
||||||
test_unicodedata
|
test_unicodedata
|
||||||
Testing Unicode Database...
|
Testing Unicode Database...
|
||||||
Methods: 6c7a7c02657b69d0fdd7a7d174f573194bba2e18
|
Methods: 84b72943b1d4320bc1e64a4888f7cdf62eea219a
|
||||||
Functions: 41e1d4792185d6474a43c83ce4f593b1bdb01f8a
|
Functions: 41e1d4792185d6474a43c83ce4f593b1bdb01f8a
|
||||||
API: ok
|
API: ok
|
||||||
|
|
|
@ -23,7 +23,7 @@ if not sys.platform.startswith('java'):
|
||||||
verify(repr(u"'\"") == """u'\\'"'""")
|
verify(repr(u"'\"") == """u'\\'"'""")
|
||||||
verify(repr(u"'") == '''u"'"''')
|
verify(repr(u"'") == '''u"'"''')
|
||||||
verify(repr(u'"') == """u'"'""")
|
verify(repr(u'"') == """u'"'""")
|
||||||
verify(repr(u''.join(map(unichr, range(256)))) ==
|
latin1repr = (
|
||||||
"u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
|
"u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
|
||||||
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
|
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
|
||||||
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
|
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
|
||||||
|
@ -38,6 +38,8 @@ if not sys.platform.startswith('java'):
|
||||||
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
|
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
|
||||||
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
|
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
|
||||||
"\\xfe\\xff'")
|
"\\xfe\\xff'")
|
||||||
|
testrepr = repr(u''.join(map(unichr, range(256))))
|
||||||
|
verify(testrepr == latin1repr)
|
||||||
|
|
||||||
def test(method, input, output, *args):
|
def test(method, input, output, *args):
|
||||||
if verbose:
|
if verbose:
|
||||||
|
@ -495,6 +497,7 @@ else:
|
||||||
verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
|
verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
|
||||||
|
|
||||||
# UTF-8 specific encoding tests:
|
# UTF-8 specific encoding tests:
|
||||||
|
verify(u''.encode('utf-8') == '')
|
||||||
verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
|
verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
|
||||||
verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
|
verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
|
||||||
verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
|
verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
|
||||||
|
@ -552,14 +555,7 @@ for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
|
||||||
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
|
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
|
||||||
verify(unicode(u.encode(encoding),encoding) == u)
|
verify(unicode(u.encode(encoding),encoding) == u)
|
||||||
|
|
||||||
# Roundtrip safety for non-BMP (just a few chars)
|
# Roundtrip safety for BMP (just the first 256 chars)
|
||||||
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
|
|
||||||
for encoding in ('utf-8',
|
|
||||||
'utf-16', 'utf-16-le', 'utf-16-be',
|
|
||||||
#'raw_unicode_escape',
|
|
||||||
'unicode_escape', 'unicode_internal'):
|
|
||||||
verify(unicode(u.encode(encoding),encoding) == u)
|
|
||||||
|
|
||||||
u = u''.join(map(unichr, range(256)))
|
u = u''.join(map(unichr, range(256)))
|
||||||
for encoding in (
|
for encoding in (
|
||||||
'latin-1',
|
'latin-1',
|
||||||
|
@ -571,6 +567,7 @@ for encoding in (
|
||||||
except ValueError,why:
|
except ValueError,why:
|
||||||
print '*** codec for "%s" failed: %s' % (encoding, why)
|
print '*** codec for "%s" failed: %s' % (encoding, why)
|
||||||
|
|
||||||
|
# Roundtrip safety for BMP (just the first 128 chars)
|
||||||
u = u''.join(map(unichr, range(128)))
|
u = u''.join(map(unichr, range(128)))
|
||||||
for encoding in (
|
for encoding in (
|
||||||
'ascii',
|
'ascii',
|
||||||
|
@ -582,6 +579,19 @@ for encoding in (
|
||||||
except ValueError,why:
|
except ValueError,why:
|
||||||
print '*** codec for "%s" failed: %s' % (encoding, why)
|
print '*** codec for "%s" failed: %s' % (encoding, why)
|
||||||
|
|
||||||
|
# Roundtrip safety for non-BMP (just a few chars)
|
||||||
|
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
|
||||||
|
for encoding in ('utf-8',
|
||||||
|
'utf-16', 'utf-16-le', 'utf-16-be',
|
||||||
|
#'raw_unicode_escape',
|
||||||
|
'unicode_escape', 'unicode_internal'):
|
||||||
|
verify(unicode(u.encode(encoding),encoding) == u)
|
||||||
|
|
||||||
|
# UTF-8 must be roundtrip safe for all UCS-2 code points
|
||||||
|
u = u''.join(map(unichr, range(0x10000)))
|
||||||
|
for encoding in ('utf-8',):
|
||||||
|
verify(unicode(u.encode(encoding),encoding) == u)
|
||||||
|
|
||||||
print 'done.'
|
print 'done.'
|
||||||
|
|
||||||
print 'Testing standard mapping codecs...',
|
print 'Testing standard mapping codecs...',
|
||||||
|
|
|
@ -1065,7 +1065,14 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||||
goto utf8Error;
|
goto utf8Error;
|
||||||
}
|
}
|
||||||
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
||||||
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
|
if (ch < 0x0800) {
|
||||||
|
/* Note: UTF-8 encodings of surrogates are considered
|
||||||
|
legal UTF-8 sequences;
|
||||||
|
|
||||||
|
XXX For wide builds (UCS-4) we should probably try
|
||||||
|
to recombine the surrogates into a single code
|
||||||
|
unit.
|
||||||
|
*/
|
||||||
errmsg = "illegal encoding";
|
errmsg = "illegal encoding";
|
||||||
goto utf8Error;
|
goto utf8Error;
|
||||||
}
|
}
|
||||||
|
@ -1175,11 +1182,15 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
unsigned int cbWritten = 0;
|
unsigned int cbWritten = 0;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
|
||||||
|
/* Short-cut for empty strings */
|
||||||
|
if (size == 0)
|
||||||
|
return PyString_FromStringAndSize(NULL, 0);
|
||||||
|
|
||||||
|
/* We allocate 4 more bytes to have room for at least one full
|
||||||
|
UTF-8 sequence; saves a few cycles in the loop below */
|
||||||
v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
|
v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
|
||||||
if (v == NULL)
|
if (v == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (size == 0)
|
|
||||||
return v;
|
|
||||||
|
|
||||||
p = PyString_AS_STRING(v);
|
p = PyString_AS_STRING(v);
|
||||||
while (i < size) {
|
while (i < size) {
|
||||||
|
|
|
@ -41,8 +41,27 @@ extern time_t PyOS_GetLastModificationTime(char *, FILE *);
|
||||||
the Unicode -U option is in use. IMO (Tim's), that's a Bad Idea
|
the Unicode -U option is in use. IMO (Tim's), that's a Bad Idea
|
||||||
(quite apart from that the -U option doesn't work so isn't used
|
(quite apart from that the -U option doesn't work so isn't used
|
||||||
anyway).
|
anyway).
|
||||||
|
|
||||||
|
XXX MAL, 2002-02-07: I had to modify the MAGIC due to a fix of the
|
||||||
|
UTF-8 encoder (it previously produced invalid UTF-8 for unpaired
|
||||||
|
high surrogates), so I simply bumped the month value to 20 (invalid
|
||||||
|
month) and set the day to 1. This should be recognizable by any
|
||||||
|
algorithm relying on the above scheme. Perhaps we should simply
|
||||||
|
start counting in increments of 10 from now on ?!
|
||||||
|
|
||||||
|
Known values:
|
||||||
|
Python 1.5: 20121
|
||||||
|
Python 1.5.1: 20121
|
||||||
|
Python 1.5.2: 20121
|
||||||
|
Python 2.0: 50823
|
||||||
|
Python 2.0.1: 50823
|
||||||
|
Python 2.1: 60202
|
||||||
|
Python 2.1.1: 60202
|
||||||
|
Python 2.1.2: 60202
|
||||||
|
Python 2.2: 60717
|
||||||
|
Python 2.3a0: 62001
|
||||||
*/
|
*/
|
||||||
#define MAGIC (60717 | ((long)'\r'<<16) | ((long)'\n'<<24))
|
#define MAGIC (62001 | ((long)'\r'<<16) | ((long)'\n'<<24))
|
||||||
|
|
||||||
/* Magic word as global; note that _PyImport_Init() can change the
|
/* Magic word as global; note that _PyImport_Init() can change the
|
||||||
value of this global to accommodate for alterations of how the
|
value of this global to accommodate for alterations of how the
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue