port simplejson upgrade from the trunk #4136

json also now works only with unicode strings

Patch by Antoine Pitrou; updated by me
This commit is contained in:
Benjamin Peterson 2009-05-02 12:36:44 +00:00
parent 7255f18556
commit c6b607d4a9
15 changed files with 2011 additions and 959 deletions

View file

@ -1,10 +1,11 @@
"""Implementation of JSONDecoder
"""
import binascii
import re
import sys
import struct
from json.scanner import Scanner, pattern
from json.scanner import make_scanner
try:
from _json import scanstring as c_scanstring
except ImportError:
@ -14,7 +15,14 @@ __all__ = ['JSONDecoder']
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
NaN, PosInf, NegInf = float('nan'), float('inf'), float('-inf')
def _floatconstants():
_BYTES = binascii.unhexlify(b'7FF80000000000007FF0000000000000')
if sys.byteorder != 'big':
_BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
nan, inf = struct.unpack('dd', _BYTES)
return nan, inf, -inf
NaN, PosInf, NegInf = _floatconstants()
def linecol(doc, pos):
@ -31,61 +39,43 @@ def linecol(doc, pos):
def errmsg(msg, doc, pos, end=None):
# Note that this function is called from _json
lineno, colno = linecol(doc, pos)
if end is None:
fmt = '{0}: line {1} column {2} (char {3})'
return fmt.format(msg, lineno, colno, pos)
#fmt = '%s: line %d column %d (char %d)'
#return fmt % (msg, lineno, colno, pos)
endlineno, endcolno = linecol(doc, end)
fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
#fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
#return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
_CONSTANTS = {
'-Infinity': NegInf,
'Infinity': PosInf,
'NaN': NaN,
'true': True,
'false': False,
'null': None,
}
def JSONConstant(match, context, c=_CONSTANTS):
s = match.group(0)
fn = getattr(context, 'parse_constant', None)
if fn is None:
rval = c[s]
else:
rval = fn(s)
return rval, None
pattern('(-?Infinity|NaN|true|false|null)')(JSONConstant)
def JSONNumber(match, context):
match = JSONNumber.regex.match(match.string, *match.span())
integer, frac, exp = match.groups()
if frac or exp:
fn = getattr(context, 'parse_float', None) or float
res = fn(integer + (frac or '') + (exp or ''))
else:
fn = getattr(context, 'parse_int', None) or int
res = fn(integer)
return res, None
pattern(r'(-?(?:0|[1-9][0-9]*))(\.[0-9]+)?([eE][-+]?[0-9]+)?')(JSONNumber)
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
BACKSLASH = {
'"': '"', '\\': '\\', '/': '/',
'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
}
DEFAULT_ENCODING = "utf-8"
def py_scanstring(s, end, strict=True,
_b=BACKSLASH, _m=STRINGCHUNK.match):
"""Scan the string s for a JSON string. End is the index of the
character in s after the quote that started the JSON string.
Unescapes all valid JSON string escape sequences and raises ValueError
on attempt to decode an invalid string. If strict is False then literal
control characters are allowed in the string.
def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
if encoding is None:
encoding = DEFAULT_ENCODING
Returns a tuple of the decoded string and the index of the character in s
after the end quote."""
chunks = []
_append = chunks.append
begin = end - 1
@ -96,14 +86,16 @@ def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHU
errmsg("Unterminated string starting at", s, begin))
end = chunk.end()
content, terminator = chunk.groups()
# Content is contains zero or more unescaped string characters
if content:
if not isinstance(content, str):
content = str(content, encoding)
_append(content)
# Terminator is the end of string, a literal control character,
# or a backslash denoting that an escape sequence follows
if terminator == '"':
break
elif terminator != '\\':
if strict:
#msg = "Invalid control character %r at" % (terminator,)
msg = "Invalid control character {0!r} at".format(terminator)
raise ValueError(errmsg(msg, s, end))
else:
@ -114,9 +106,10 @@ def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHU
except IndexError:
raise ValueError(
errmsg("Unterminated string starting at", s, begin))
# If not a unicode escape sequence, must be in the lookup table
if esc != 'u':
try:
m = _b[esc]
char = _b[esc]
except KeyError:
msg = "Invalid \\escape: {0!r}".format(esc)
raise ValueError(errmsg(msg, s, end))
@ -124,131 +117,138 @@ def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHU
else:
esc = s[end + 1:end + 5]
next_end = end + 5
msg = "Invalid \\uXXXX escape"
try:
if len(esc) != 4:
raise ValueError
uni = int(esc, 16)
if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
if not s[end + 5:end + 7] == '\\u':
raise ValueError
esc2 = s[end + 7:end + 11]
if len(esc2) != 4:
raise ValueError
uni2 = int(esc2, 16)
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
next_end += 6
m = chr(uni)
except ValueError:
if len(esc) != 4:
msg = "Invalid \\uXXXX escape"
raise ValueError(errmsg(msg, s, end))
uni = int(esc, 16)
# Check for surrogate pair on UCS-4 systems
if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
if not s[end + 5:end + 7] == '\\u':
raise ValueError(errmsg(msg, s, end))
esc2 = s[end + 7:end + 11]
if len(esc2) != 4:
raise ValueError(errmsg(msg, s, end))
uni2 = int(esc2, 16)
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
next_end += 6
char = chr(uni)
end = next_end
_append(m)
_append(char)
return ''.join(chunks), end
# Use speedup
if c_scanstring is not None:
scanstring = c_scanstring
else:
scanstring = py_scanstring
# Use speedup if available
scanstring = c_scanstring or py_scanstring
def JSONString(match, context):
encoding = getattr(context, 'encoding', None)
strict = getattr(context, 'strict', True)
return scanstring(match.string, match.end(), encoding, strict)
pattern(r'"')(JSONString)
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'\s*', FLAGS)
def JSONObject(match, context, _w=WHITESPACE.match):
def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
_w=WHITESPACE.match, _ws=WHITESPACE_STR):
s, end = s_and_end
pairs = []
pairs_append = pairs.append
s = match.string
end = _w(s, match.end()).end()
# Use a slice to prevent IndexError from being raised, the following
# check will raise a more specific ValueError if the string is empty
nextchar = s[end:end + 1]
# Trivial empty object
if nextchar == '}':
return pairs, end + 1
# Normally we expect nextchar == '"'
if nextchar != '"':
raise ValueError(errmsg("Expecting property name", s, end))
if nextchar in _ws:
end = _w(s, end).end()
nextchar = s[end:end + 1]
# Trivial empty object
if nextchar == '}':
return pairs, end + 1
elif nextchar != '"':
raise ValueError(errmsg("Expecting property name", s, end))
end += 1
encoding = getattr(context, 'encoding', None)
strict = getattr(context, 'strict', True)
iterscan = JSONScanner.iterscan
while True:
key, end = scanstring(s, end, encoding, strict)
end = _w(s, end).end()
key, end = scanstring(s, end, strict)
# To skip some function call overhead we optimize the fast paths where
# the JSON key separator is ": " or just ":".
if s[end:end + 1] != ':':
raise ValueError(errmsg("Expecting : delimiter", s, end))
end = _w(s, end + 1).end()
end = _w(s, end).end()
if s[end:end + 1] != ':':
raise ValueError(errmsg("Expecting : delimiter", s, end))
end += 1
try:
value, end = next(iterscan(s, idx=end, context=context))
if s[end] in _ws:
end += 1
if s[end] in _ws:
end = _w(s, end + 1).end()
except IndexError:
pass
try:
value, end = scan_once(s, end)
except StopIteration:
raise ValueError(errmsg("Expecting object", s, end))
pairs_append((key, value))
end = _w(s, end).end()
nextchar = s[end:end + 1]
try:
nextchar = s[end]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end]
except IndexError:
nextchar = ''
end += 1
if nextchar == '}':
break
if nextchar != ',':
elif nextchar != ',':
raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
end = _w(s, end).end()
nextchar = s[end:end + 1]
end += 1
if nextchar != '"':
raise ValueError(errmsg("Expecting property name", s, end - 1))
object_pairs_hook = getattr(context, 'object_pairs_hook', None)
if object_pairs_hook is not None:
result = object_pairs_hook(pairs)
return result, end
pairs = dict(pairs)
object_hook = getattr(context, 'object_hook', None)
if object_hook is not None:
pairs = object_hook(pairs)
return pairs, end
pattern(r'{')(JSONObject)
def JSONArray(match, context, _w=WHITESPACE.match):
def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match):
s, end = s_and_end
values = []
s = match.string
end = _w(s, match.end()).end()
# Look-ahead for trivial empty array
nextchar = s[end:end + 1]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end:end + 1]
# Look-ahead for trivial empty array
if nextchar == ']':
return values, end + 1
iterscan = JSONScanner.iterscan
_append = values.append
while True:
try:
value, end = next(iterscan(s, idx=end, context=context))
value, end = scan_once(s, end)
except StopIteration:
raise ValueError(errmsg("Expecting object", s, end))
values.append(value)
end = _w(s, end).end()
_append(value)
nextchar = s[end:end + 1]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end:end + 1]
end += 1
if nextchar == ']':
break
if nextchar != ',':
elif nextchar != ',':
raise ValueError(errmsg("Expecting , delimiter", s, end))
end = _w(s, end).end()
try:
if s[end] in _ws:
end += 1
if s[end] in _ws:
end = _w(s, end + 1).end()
except IndexError:
pass
return values, end
pattern(r'\[')(JSONArray)
ANYTHING = [
JSONObject,
JSONArray,
JSONString,
JSONConstant,
JSONNumber,
]
JSONScanner = Scanner(ANYTHING)
class JSONDecoder(object):
@ -278,23 +278,14 @@ class JSONDecoder(object):
It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
their corresponding ``float`` values, which is outside the JSON spec.
"""
_scanner = Scanner(ANYTHING)
__all__ = ['__init__', 'decode', 'raw_decode']
def __init__(self, encoding=None, object_hook=None, parse_float=None,
def __init__(self, object_hook=None, parse_float=None,
parse_int=None, parse_constant=None, strict=True,
object_pairs_hook=None):
"""``encoding`` determines the encoding used to interpret any ``str``
objects decoded by this instance (utf-8 by default). It has no
effect when decoding ``unicode`` objects.
Note that currently only encodings that are a superset of ASCII work,
strings of other encodings should be passed in as ``unicode``.
``object_hook``, if specified, will be called with the result of
every JSON object decoded and its return value will be used in
"""``object_hook``, if specified, will be called with the result
of every JSON object decoded and its return value will be used in
place of the given ``dict``. This can be used to provide custom
deserializations (e.g. to support JSON-RPC class hinting).
@ -309,22 +300,25 @@ class JSONDecoder(object):
for JSON integers (e.g. float).
``parse_constant``, if specified, will be called with one of the
following strings: -Infinity, Infinity, NaN, null, true, false.
following strings: -Infinity, Infinity, NaN.
This can be used to raise an exception if invalid JSON numbers
are encountered.
"""
self.encoding = encoding
self.object_hook = object_hook
self.object_pairs_hook = object_pairs_hook
self.parse_float = parse_float
self.parse_int = parse_int
self.parse_constant = parse_constant
self.parse_float = parse_float or float
self.parse_int = parse_int or int
self.parse_constant = parse_constant or _CONSTANTS.__getitem__
self.strict = strict
self.object_pairs_hook = object_pairs_hook
self.parse_object = JSONObject
self.parse_array = JSONArray
self.parse_string = scanstring
self.scan_once = make_scanner(self)
def decode(self, s, _w=WHITESPACE.match):
"""
Return the Python representation of ``s`` (a ``str`` or ``unicode``
"""Return the Python representation of ``s`` (a ``str`` or ``unicode``
instance containing a JSON document)
"""
@ -334,18 +328,17 @@ class JSONDecoder(object):
raise ValueError(errmsg("Extra data", s, end, len(s)))
return obj
def raw_decode(self, s, **kw):
"""Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning
with a JSON document) and return a 2-tuple of the Python
def raw_decode(self, s, idx=0):
"""Decode a JSON document from ``s`` (a ``str`` or ``unicode``
beginning with a JSON document) and return a 2-tuple of the Python
representation and the index in ``s`` where the document ended.
This can be used to decode a JSON document from a string that may
have extraneous data at the end.
"""
kw.setdefault('context', self)
try:
obj, end = next(self._scanner.iterscan(s, **kw))
obj, end = self.scan_once(s, idx)
except StopIteration:
raise ValueError("No JSON object could be decoded")
return obj, end