mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
Issue #11489: JSON decoder now accepts lone surrogates.
This commit is contained in:
parent
f45bbb6211
commit
c93329b3dd
4 changed files with 73 additions and 41 deletions
|
@ -66,6 +66,16 @@ BACKSLASH = {
|
||||||
'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
|
'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _decode_uXXXX(s, pos):
|
||||||
|
esc = s[pos + 1:pos + 5]
|
||||||
|
if len(esc) == 4 and esc[1] not in 'xX':
|
||||||
|
try:
|
||||||
|
return int(esc, 16)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
msg = "Invalid \\uXXXX escape"
|
||||||
|
raise ValueError(errmsg(msg, s, pos))
|
||||||
|
|
||||||
def py_scanstring(s, end, strict=True,
|
def py_scanstring(s, end, strict=True,
|
||||||
_b=BACKSLASH, _m=STRINGCHUNK.match):
|
_b=BACKSLASH, _m=STRINGCHUNK.match):
|
||||||
"""Scan the string s for a JSON string. End is the index of the
|
"""Scan the string s for a JSON string. End is the index of the
|
||||||
|
@ -115,25 +125,14 @@ def py_scanstring(s, end, strict=True,
|
||||||
raise ValueError(errmsg(msg, s, end))
|
raise ValueError(errmsg(msg, s, end))
|
||||||
end += 1
|
end += 1
|
||||||
else:
|
else:
|
||||||
esc = s[end + 1:end + 5]
|
uni = _decode_uXXXX(s, end)
|
||||||
next_end = end + 5
|
end += 5
|
||||||
if len(esc) != 4:
|
if 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
|
||||||
msg = "Invalid \\uXXXX escape"
|
uni2 = _decode_uXXXX(s, end + 1)
|
||||||
raise ValueError(errmsg(msg, s, end))
|
if 0xdc00 <= uni2 <= 0xdfff:
|
||||||
uni = int(esc, 16)
|
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
|
||||||
if 0xd800 <= uni <= 0xdbff:
|
end += 6
|
||||||
msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
|
|
||||||
if not s[end + 5:end + 7] == '\\u':
|
|
||||||
raise ValueError(errmsg(msg, s, end))
|
|
||||||
esc2 = s[end + 7:end + 11]
|
|
||||||
if len(esc2) != 4:
|
|
||||||
raise ValueError(errmsg(msg, s, end))
|
|
||||||
uni2 = int(esc2, 16)
|
|
||||||
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
|
|
||||||
next_end += 6
|
|
||||||
char = chr(uni)
|
char = chr(uni)
|
||||||
|
|
||||||
end = next_end
|
|
||||||
_append(char)
|
_append(char)
|
||||||
return ''.join(chunks), end
|
return ''.join(chunks), end
|
||||||
|
|
||||||
|
|
|
@ -5,10 +5,6 @@ from test.test_json import PyTest, CTest
|
||||||
class TestScanstring:
|
class TestScanstring:
|
||||||
def test_scanstring(self):
|
def test_scanstring(self):
|
||||||
scanstring = self.json.decoder.scanstring
|
scanstring = self.json.decoder.scanstring
|
||||||
self.assertEqual(
|
|
||||||
scanstring('"z\\ud834\\udd20x"', 1, True),
|
|
||||||
('z\U0001d120x', 16))
|
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
scanstring('"z\U0001d120x"', 1, True),
|
scanstring('"z\U0001d120x"', 1, True),
|
||||||
('z\U0001d120x', 5))
|
('z\U0001d120x', 5))
|
||||||
|
@ -89,6 +85,53 @@ class TestScanstring:
|
||||||
scanstring('["Bad value", truth]', 2, True),
|
scanstring('["Bad value", truth]', 2, True),
|
||||||
('Bad value', 12))
|
('Bad value', 12))
|
||||||
|
|
||||||
|
def test_surrogates(self):
|
||||||
|
scanstring = self.json.decoder.scanstring
|
||||||
|
def assertScan(given, expect):
|
||||||
|
self.assertEqual(scanstring(given, 1, True),
|
||||||
|
(expect, len(given)))
|
||||||
|
|
||||||
|
assertScan('"z\\ud834\\u0079x"', 'z\ud834yx')
|
||||||
|
assertScan('"z\\ud834\\udd20x"', 'z\U0001d120x')
|
||||||
|
assertScan('"z\\ud834\\ud834\\udd20x"', 'z\ud834\U0001d120x')
|
||||||
|
assertScan('"z\\ud834x"', 'z\ud834x')
|
||||||
|
assertScan('"z\\ud834\udd20x12345"', 'z\ud834\udd20x12345')
|
||||||
|
assertScan('"z\\udd20x"', 'z\udd20x')
|
||||||
|
assertScan('"z\ud834\udd20x"', 'z\ud834\udd20x')
|
||||||
|
assertScan('"z\ud834\\udd20x"', 'z\ud834\udd20x')
|
||||||
|
assertScan('"z\ud834x"', 'z\ud834x')
|
||||||
|
|
||||||
|
def test_bad_escapes(self):
|
||||||
|
scanstring = self.json.decoder.scanstring
|
||||||
|
bad_escapes = [
|
||||||
|
'"\\"',
|
||||||
|
'"\\x"',
|
||||||
|
'"\\u"',
|
||||||
|
'"\\u0"',
|
||||||
|
'"\\u01"',
|
||||||
|
'"\\u012"',
|
||||||
|
'"\\uz012"',
|
||||||
|
'"\\u0z12"',
|
||||||
|
'"\\u01z2"',
|
||||||
|
'"\\u012z"',
|
||||||
|
'"\\u0x12"',
|
||||||
|
'"\\u0X12"',
|
||||||
|
'"\\ud834\\"',
|
||||||
|
'"\\ud834\\u"',
|
||||||
|
'"\\ud834\\ud"',
|
||||||
|
'"\\ud834\\udd"',
|
||||||
|
'"\\ud834\\udd2"',
|
||||||
|
'"\\ud834\\uzdd2"',
|
||||||
|
'"\\ud834\\udzd2"',
|
||||||
|
'"\\ud834\\uddz2"',
|
||||||
|
'"\\ud834\\udd2z"',
|
||||||
|
'"\\ud834\\u0x20"',
|
||||||
|
'"\\ud834\\u0X20"',
|
||||||
|
]
|
||||||
|
for s in bad_escapes:
|
||||||
|
with self.assertRaises(ValueError, msg=s):
|
||||||
|
scanstring(s, 1, True)
|
||||||
|
|
||||||
def test_overflow(self):
|
def test_overflow(self):
|
||||||
with self.assertRaises(OverflowError):
|
with self.assertRaises(OverflowError):
|
||||||
self.json.decoder.scanstring(b"xxx", sys.maxsize+1)
|
self.json.decoder.scanstring(b"xxx", sys.maxsize+1)
|
||||||
|
|
|
@ -16,6 +16,8 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #11489: JSON decoder now accepts lone surrogates.
|
||||||
|
|
||||||
- Issue #19545: Avoid chained exceptions while passing stray % to
|
- Issue #19545: Avoid chained exceptions while passing stray % to
|
||||||
time.strptime(). Initial patch by Claudiu Popa.
|
time.strptime(). Initial patch by Claudiu Popa.
|
||||||
|
|
||||||
|
|
|
@ -433,17 +433,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Surrogate pair */
|
/* Surrogate pair */
|
||||||
if ((c & 0xfc00) == 0xd800) {
|
if (Py_UNICODE_IS_HIGH_SURROGATE(c) && end + 6 < len &&
|
||||||
|
PyUnicode_READ(kind, buf, next++) == '\\' &&
|
||||||
|
PyUnicode_READ(kind, buf, next++) == 'u') {
|
||||||
Py_UCS4 c2 = 0;
|
Py_UCS4 c2 = 0;
|
||||||
if (end + 6 >= len) {
|
|
||||||
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
|
||||||
goto bail;
|
|
||||||
}
|
|
||||||
if (PyUnicode_READ(kind, buf, next++) != '\\' ||
|
|
||||||
PyUnicode_READ(kind, buf, next++) != 'u') {
|
|
||||||
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
|
||||||
goto bail;
|
|
||||||
}
|
|
||||||
end += 6;
|
end += 6;
|
||||||
/* Decode 4 hex digits */
|
/* Decode 4 hex digits */
|
||||||
for (; next < end; next++) {
|
for (; next < end; next++) {
|
||||||
|
@ -464,15 +457,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
||||||
goto bail;
|
goto bail;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((c2 & 0xfc00) != 0xdc00) {
|
if (Py_UNICODE_IS_LOW_SURROGATE(c2))
|
||||||
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
c = Py_UNICODE_JOIN_SURROGATES(c, c2);
|
||||||
goto bail;
|
else
|
||||||
}
|
end -= 6;
|
||||||
c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
|
|
||||||
}
|
|
||||||
else if ((c & 0xfc00) == 0xdc00) {
|
|
||||||
raise_errmsg("Unpaired low surrogate", pystr, end - 5);
|
|
||||||
goto bail;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
APPEND_OLD_CHUNK
|
APPEND_OLD_CHUNK
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue