Mirror of https://github.com/python/cpython.git
Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().

commit 923baea9f9
parent 2556c8388c
3 changed files with 58 additions and 18 deletions
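The change replaces the old string-concatenation loop in unquote() with list appends joined at the end, and adds an explicit branch for unicode input. As a rough sketch of the behaviour the patched function is expected to keep (Python 2.7; not output captured from this commit):

import urllib

# Percent escapes are decoded through the _hextochr lookup table.
print repr(urllib.unquote('abc%20def'))    # 'abc def'
# Unicode input now goes through the new _is_unicode() branch.
print repr(urllib.unquote(u'abc%20def'))   # u'abc def'
# No '%' at all: the len(bits) == 1 fast path returns the string unchanged.
print repr(urllib.unquote('plain'))        # 'plain'
# A malformed escape hits the KeyError branch and is kept literally.
print repr(urllib.unquote('%zz'))          # '%zz'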
Lib/urllib.py

@@ -28,6 +28,7 @@ import os
 import time
 import sys
 import base64
+import re
 
 from urlparse import urljoin as basejoin
 
@@ -1198,22 +1199,35 @@ def splitvalue(attr):
 _hexdig = '0123456789ABCDEFabcdef'
 _hextochr = dict((a + b, chr(int(a + b, 16)))
                  for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(s):
     """unquote('abc%20def') -> 'abc def'."""
-    res = s.split('%')
+    if _is_unicode(s):
+        if '%' not in s:
+            return s
+        bits = _asciire.split(s)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(unquote(str(bits[i])).decode('latin1'))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    bits = s.split('%')
     # fastpath
-    if len(res) == 1:
+    if len(bits) == 1:
         return s
-    s = res[0]
-    for item in res[1:]:
+    res = [bits[0]]
+    append = res.append
+    for item in bits[1:]:
         try:
-            s += _hextochr[item[:2]] + item[2:]
+            append(_hextochr[item[:2]])
+            append(item[2:])
         except KeyError:
-            s += '%' + item
-        except UnicodeDecodeError:
-            s += unichr(int(item[:2], 16)) + item[2:]
-    return s
+            append('%')
+            append(item)
+    return ''.join(res)
 
 def unquote_plus(s):
     """unquote('%7e/abc+def') -> '~/abc def'"""
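Since the point of the change is a speed-up, a micro-benchmark is the natural check. The sketch below reassembles the removed concatenation-based path from the '-' lines above under the hypothetical name unquote_old() and times it against the patched urllib.unquote(); it is illustrative only, and absolute numbers depend on the interpreter and machine.

import timeit
import urllib

def unquote_old(s):
    # Pre-patch implementation, reassembled from the removed lines above;
    # the module-global _hextochr is reached here as urllib._hextochr.
    res = s.split('%')
    if len(res) == 1:
        return s
    s = res[0]
    for item in res[1:]:
        try:
            s += urllib._hextochr[item[:2]] + item[2:]
        except KeyError:
            s += '%' + item
        except UnicodeDecodeError:
            s += unichr(int(item[:2], 16)) + item[2:]
    return s

sample = '%41%42%43' * 300   # many escapes: the worst case for repeated +=

print timeit.timeit(lambda: unquote_old(sample), number=1000)
print timeit.timeit(lambda: urllib.unquote(sample), number=1000)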
Lib/urlparse.py

@@ -28,6 +28,8 @@ test_urlparse.py provides a good indicator of parsing behavior.
 
 """
 
+import re
+
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
            "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
 
@@ -311,6 +313,15 @@ def urldefrag(url):
     else:
         return url, ''
 
+try:
+    unicode
+except NameError:
+    def _is_unicode(x):
+        return 0
+else:
+    def _is_unicode(x):
+        return isinstance(x, unicode)
+
 # unquote method for parse_qs and parse_qsl
 # Cannot use directly from urllib as it would create a circular reference
 # because urllib uses urlparse methods (urljoin). If you update this function,
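The helper added here mirrors the one urllib.py already defines, so the unquote() copy below can detect unicode input without assuming the unicode builtin exists. A minimal sketch of how it behaves on Python 2 (illustrative, not part of the commit):

import urlparse

print urlparse._is_unicode(u'abc%20def')   # True: unicode strings take the decode branch below
print urlparse._is_unicode('abc%20def')    # False: byte strings stay on the plain str path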
@@ -319,22 +330,35 @@ def urldefrag(url):
 _hexdig = '0123456789ABCDEFabcdef'
 _hextochr = dict((a+b, chr(int(a+b,16)))
                  for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(s):
     """unquote('abc%20def') -> 'abc def'."""
-    res = s.split('%')
+    if _is_unicode(s):
+        if '%' not in s:
+            return s
+        bits = _asciire.split(s)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(unquote(str(bits[i])).decode('latin1'))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    bits = s.split('%')
     # fastpath
-    if len(res) == 1:
+    if len(bits) == 1:
         return s
-    s = res[0]
-    for item in res[1:]:
+    res = [bits[0]]
+    append = res.append
+    for item in bits[1:]:
         try:
-            s += _hextochr[item[:2]] + item[2:]
+            append(_hextochr[item[:2]])
+            append(item[2:])
         except KeyError:
-            s += '%' + item
-        except UnicodeDecodeError:
-            s += unichr(int(item[:2], 16)) + item[2:]
-    return s
+            append('%')
+            append(item)
+    return ''.join(res)
 
 def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
     """Parse a query given as a string argument.
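Because parse_qs() and parse_qsl() call this private copy of unquote(), query-string parsing picks up the same fast paths. A short illustrative example (expected Python 2.7 behaviour, not output captured from this commit):

import urlparse

# parse_qsl() replaces '+' with a space before each name and value is unquoted.
print urlparse.parse_qsl('a=b+c&d=%7E')   # [('a', 'b c'), ('d', '~')]
print urlparse.parse_qs('x=1&x=2')        # {'x': ['1', '2']}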
Misc/NEWS

@@ -214,6 +214,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().
+
 - Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused
   a failure while decoding empty object literals when object_pairs_hook was
   specified.