mirror of
https://github.com/python/cpython.git
synced 2025-10-23 07:02:24 +00:00
gh-88500: Reduce memory use of urllib.unquote
(#96763)
`urllib.unquote_to_bytes` and `urllib.unquote` could both potentially generate `O(len(string))` intermediate `bytes` or `str` objects while computing the unquoted final result depending on the input provided. As Python objects are relatively large, this could consume a lot of RAM. This switches the implementation to using an expanding `bytearray` and a generator internally instead of precomputed `split()` style operations. Microbenchmarks with some antagonistic inputs like `mess = "\u0141%%%20a%fe"*1000` show this is 10-20% slower for unquote and unquote_to_bytes and no different for typical inputs that are short or lack much unicode or % escaping. But the functions are already quite fast anyway so not a big deal. The slowdown scales consistently linearly with input size as expected. Memory usage was observed manually using `/usr/bin/time -v` on `python -m timeit` runs of larger inputs. Unittesting memory consumption is difficult and does not seem worthwhile. Observed memory usage is ~1/2 for `unquote()` and <1/3 for `unquote_to_bytes()` using `python -m timeit -s 'from urllib.parse import unquote, unquote_to_bytes; v="\u0141%01\u0161%20"*500_000' 'unquote_to_bytes(v)'` as a test.
This commit is contained in:
parent
1bb68ba6d9
commit
2e279e85fe
3 changed files with 23 additions and 11 deletions
|
@ -1104,6 +1104,8 @@ class UnquotingTests(unittest.TestCase):
|
||||||
self.assertEqual(result.count('%'), 1,
|
self.assertEqual(result.count('%'), 1,
|
||||||
"using unquote(): not all characters escaped: "
|
"using unquote(): not all characters escaped: "
|
||||||
"%s" % result)
|
"%s" % result)
|
||||||
|
|
||||||
|
def test_unquote_rejects_none_and_tuple(self):
|
||||||
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, None)
|
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, None)
|
||||||
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, ())
|
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, ())
|
||||||
|
|
||||||
|
|
|
@ -600,6 +600,9 @@ _hextobyte = None
|
||||||
|
|
||||||
def unquote_to_bytes(string):
|
def unquote_to_bytes(string):
|
||||||
"""unquote_to_bytes('abc%20def') -> b'abc def'."""
|
"""unquote_to_bytes('abc%20def') -> b'abc def'."""
|
||||||
|
return bytes(_unquote_impl(string))
|
||||||
|
|
||||||
|
def _unquote_impl(string: bytes | bytearray | str) -> bytes | bytearray:
|
||||||
# Note: strings are encoded as UTF-8. This is only an issue if it contains
|
# Note: strings are encoded as UTF-8. This is only an issue if it contains
|
||||||
# unescaped non-ASCII characters, which URIs should not.
|
# unescaped non-ASCII characters, which URIs should not.
|
||||||
if not string:
|
if not string:
|
||||||
|
@ -611,8 +614,8 @@ def unquote_to_bytes(string):
|
||||||
bits = string.split(b'%')
|
bits = string.split(b'%')
|
||||||
if len(bits) == 1:
|
if len(bits) == 1:
|
||||||
return string
|
return string
|
||||||
res = [bits[0]]
|
res = bytearray(bits[0])
|
||||||
append = res.append
|
append = res.extend
|
||||||
# Delay the initialization of the table to not waste memory
|
# Delay the initialization of the table to not waste memory
|
||||||
# if the function is never called
|
# if the function is never called
|
||||||
global _hextobyte
|
global _hextobyte
|
||||||
|
@ -626,10 +629,20 @@ def unquote_to_bytes(string):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
append(b'%')
|
append(b'%')
|
||||||
append(item)
|
append(item)
|
||||||
return b''.join(res)
|
return res
|
||||||
|
|
||||||
_asciire = re.compile('([\x00-\x7f]+)')
|
_asciire = re.compile('([\x00-\x7f]+)')
|
||||||
|
|
||||||
|
def _generate_unquoted_parts(string, encoding, errors):
|
||||||
|
previous_match_end = 0
|
||||||
|
for ascii_match in _asciire.finditer(string):
|
||||||
|
start, end = ascii_match.span()
|
||||||
|
yield string[previous_match_end:start] # Non-ASCII
|
||||||
|
# The ascii_match[1] group == string[start:end].
|
||||||
|
yield _unquote_impl(ascii_match[1]).decode(encoding, errors)
|
||||||
|
previous_match_end = end
|
||||||
|
yield string[previous_match_end:] # Non-ASCII tail
|
||||||
|
|
||||||
def unquote(string, encoding='utf-8', errors='replace'):
|
def unquote(string, encoding='utf-8', errors='replace'):
|
||||||
"""Replace %xx escapes by their single-character equivalent. The optional
|
"""Replace %xx escapes by their single-character equivalent. The optional
|
||||||
encoding and errors parameters specify how to decode percent-encoded
|
encoding and errors parameters specify how to decode percent-encoded
|
||||||
|
@ -641,21 +654,16 @@ def unquote(string, encoding='utf-8', errors='replace'):
|
||||||
unquote('abc%20def') -> 'abc def'.
|
unquote('abc%20def') -> 'abc def'.
|
||||||
"""
|
"""
|
||||||
if isinstance(string, bytes):
|
if isinstance(string, bytes):
|
||||||
return unquote_to_bytes(string).decode(encoding, errors)
|
return _unquote_impl(string).decode(encoding, errors)
|
||||||
if '%' not in string:
|
if '%' not in string:
|
||||||
|
# Is it a string-like object?
|
||||||
string.split
|
string.split
|
||||||
return string
|
return string
|
||||||
if encoding is None:
|
if encoding is None:
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
if errors is None:
|
if errors is None:
|
||||||
errors = 'replace'
|
errors = 'replace'
|
||||||
bits = _asciire.split(string)
|
return ''.join(_generate_unquoted_parts(string, encoding, errors))
|
||||||
res = [bits[0]]
|
|
||||||
append = res.append
|
|
||||||
for i in range(1, len(bits), 2):
|
|
||||||
append(unquote_to_bytes(bits[i]).decode(encoding, errors))
|
|
||||||
append(bits[i + 1])
|
|
||||||
return ''.join(res)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
|
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Reduced the memory usage of :func:`urllib.parse.unquote` and
|
||||||
|
:func:`urllib.parse.unquote_to_bytes` on large values.
|
Loading…
Add table
Add a link
Reference in a new issue