Fix Issue #1712522: urllib.quote now supports Unicode. The defaults are

encoding='utf-8' and errors='strict'.
2025-09-26 18:29:57 +00:00 · 2010-07-18 02:27:10 +00:00 · 2010-07-18 02:27:10 +00:00 · 5dba6dfe6a
commit 5dba6dfe6a
parent 5d10d33cd5
4 changed files with 152 additions and 7 deletions
--- a/Doc/library/urllib.rst
+++ b/Doc/library/urllib.rst
@ -202,24 +202,40 @@ High-level interface
 Utility functions
 -----------------
-.. function:: quote(string[, safe])
+.. function:: quote(string[, safe[, encoding[, errors]]])
   Replace special characters in *string* using the ``%xx`` escape. Letters,
   digits, and the characters ``'_.-'`` are never quoted. By default, this
-   function is intended for quoting the path section of the URL.The optional
+   function is intended for quoting the path section of the URL. The optional
   *safe* parameter specifies additional characters that should not be quoted
   --- its default value is ``'/'``.
   *string* may be either a :class:`str` or a :class:`unicode`.
   The optional *encoding* and *errors* parameters specify how to deal with
   non-ASCII characters, as accepted by the :meth:`unicode.encode` method.
   *encoding* defaults to ``'utf-8'``.
   *errors* defaults to ``'strict'``, meaning unsupported characters raise a
   :class:`UnicodeEncodeError`.
   Non-Unicode strings are not encoded by default, and all bytes are allowed.
   Example: ``quote('/~connolly/')`` yields ``'/%7Econnolly/'``.
   Example: ``quote(u'/El Niño/')`` yields ``'/El%20Ni%C3%B1o/'``.
-.. function:: quote_plus(string[, safe])
+   .. versionchanged:: 2.7.1
      Added *encoding* and *errors* parameters.
 .. function:: quote_plus(string[, safe[, encoding[, errors]]])
   Like :func:`quote`, but also replaces spaces by plus signs, as required for
   quoting HTML form values when building up a query string to go into a URL.
   Plus signs in the original string are escaped unless they are included in
   *safe*.  It also does not have *safe* default to ``'/'``.
   Example: ``quote_plus(u'/El Niño/')`` yields ``'%2FEl+Ni%C3%B1o%2F'``.
 .. function:: unquote(string)
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@ -355,6 +355,38 @@ class QuotingTests(unittest.TestCase):
        self.assertEqual(quote_by_default, result,
                         "using quote_plus(): %s != %s" %
                         (quote_by_default, result))
        # Safe expressed as unicode rather than str
        result = urllib.quote(quote_by_default, safe=u"<>")
        self.assertEqual(quote_by_default, result,
                         "using quote(): %r != %r" % (quote_by_default, result))
        # "Safe" non-ASCII bytes should still work
        # (Technically disallowed by the URI standard, but allowed for
        # backwards compatibility with previous versions of Python)
        result = urllib.quote(b"a\xfcb", safe=b"\xfc")
        expect = b"a\xfcb"
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" %
                         (expect, result))
        # Same as above, but with 'safe' as a unicode rather than str
        # "Safe" non-ASCII unicode characters should have no effect
        # (Since URIs are not allowed to have non-ASCII characters)
        result = urllib.quote(b"a\xfcb", safe=u"\xfc")
        expect = urllib.quote(b"a\xfcb", safe="")
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" %
                         (expect, result))
        # Same as above, but quoting a unicode rather than a str
        result = urllib.quote(u"a\xfcb", encoding="latin-1", safe=b"\xfc")
        expect = b"a\xfcb"
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" %
                         (expect, result))
        # Same as above, but with both the quoted value and 'safe' as unicode
        result = urllib.quote(u"a\xfcb", encoding="latin-1", safe=u"\xfc")
        expect = urllib.quote(u"a\xfcb", encoding="latin-1", safe="")
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" %
                         (expect, result))
    def test_default_quoting(self):
        # Make sure all characters that should be quoted are by default sans
@ -406,6 +438,81 @@ class QuotingTests(unittest.TestCase):
                         'alpha%2Bbeta+gamma')
        self.assertEqual(urllib.quote_plus('alpha+beta gamma', '+'),
                         'alpha+beta+gamma')
        # Test with unicode
        self.assertEqual(urllib.quote_plus(u'alpha+beta gamma'),
                         'alpha%2Bbeta+gamma')
        # Test with safe unicode
        self.assertEqual(urllib.quote_plus('alpha+beta gamma', u'+'),
                         'alpha+beta+gamma')
    def test_quote_bytes(self):
        # Exercises urllib.quote() on a byte string (str) containing
        # non-ASCII bytes, first without and then with an explicit encoding.

        # Non-ASCII bytes should quote directly to percent-encoded values
        given = b"\xa2\xd8ab\xff"
        expect = "%A2%D8ab%FF"
        result = urllib.quote(given)
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" % (expect, result))
        # Encoding argument should raise UnicodeDecodeError on bytes input
        # with non-ASCII characters (just as with str.encode): passing an
        # encoding makes quote() call s.encode(), and encoding a byte string
        # first requires an implicit ASCII decode, which fails here.
        self.assertRaises(UnicodeDecodeError, urllib.quote, given,
                            encoding="latin-1")
    def test_quote_with_unicode(self):
        # Exercises urllib.quote() on unicode input across the supported
        # combinations of the 'encoding' and 'errors' arguments.

        # Characters in Latin-1 range, encoded by default in UTF-8
        given = u"\xa2\xd8ab\xff"
        expect = "%C2%A2%C3%98ab%C3%BF"
        result = urllib.quote(given)
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" % (expect, result))
        # Characters in Latin-1 range, with encoding and errors passed
        # explicitly as None (must behave the same as the defaults above)
        result = urllib.quote(given, encoding=None, errors=None)
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" % (expect, result))
        # Characters in Latin-1 range, encoded with Latin-1
        given = u"\xa2\xd8ab\xff"
        expect = "%A2%D8ab%FF"
        result = urllib.quote(given, encoding="latin-1")
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" % (expect, result))
        # Characters in BMP, encoded by default in UTF-8
        given = u"\u6f22\u5b57"              # "Kanji"
        expect = "%E6%BC%A2%E5%AD%97"
        result = urllib.quote(given)
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" % (expect, result))
        # Characters in BMP, encoded with Latin-1 (not representable, so the
        # default errors='strict' must raise)
        given = u"\u6f22\u5b57"
        self.assertRaises(UnicodeEncodeError, urllib.quote, given,
                                    encoding="latin-1")
        # Characters in BMP, encoded with Latin-1, with replace error handling
        given = u"\u6f22\u5b57"
        expect = "%3F%3F"                    # "??"
        result = urllib.quote(given, encoding="latin-1",
                                    errors="replace")
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" % (expect, result))
        # Characters in BMP, encoded with Latin-1, with xmlcharrefreplace
        # error handling
        given = u"\u6f22\u5b57"
        expect = "%26%2328450%3B%26%2323383%3B"      # "&#28450;&#23383;"
        result = urllib.quote(given, encoding="latin-1",
                                    errors="xmlcharrefreplace")
        self.assertEqual(expect, result,
                         "using quote(): %r != %r" % (expect, result))
    def test_quote_plus_with_unicode(self):
        # Checks that urllib.quote_plus() honours the 'encoding' and 'errors'
        # arguments for unicode input while still turning spaces into '+'.

        # Encoding (latin-1) test for quote_plus
        given = u"\xa2\xd8 \xff"
        expect = "%A2%D8+%FF"
        result = urllib.quote_plus(given, encoding="latin-1")
        self.assertEqual(expect, result,
                         "using quote_plus(): %r != %r" % (expect, result))
        # Errors test for quote_plus: characters not representable in Latin-1
        # are replaced by '?' (quoted as %3F) under errors="replace"
        given = u"ab\u6f22\u5b57 cd"
        expect = "ab%3F%3F+cd"
        result = urllib.quote_plus(given, encoding="latin-1",
                                         errors="replace")
        self.assertEqual(expect, result,
                         "using quote_plus(): %r != %r" % (expect, result))
 class UnquotingTests(unittest.TestCase):
    """Tests for unquote() and unquote_plus()
--- a/Lib/urllib.py
+++ b/Lib/urllib.py
@ -1193,7 +1193,7 @@ for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
    _safe_map[c] = c if (i < 128 and c in always_safe) else '%{:02X}'.format(i)
 _safe_quoters = {}
-def quote(s, safe='/'):
+def quote(s, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'
    Each part of a URL, e.g. the path info, the query, etc., has a
@ -1213,10 +1213,28 @@ def quote(s, safe='/'):
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    string and safe may be either str or unicode objects.
    The optional encoding and errors parameters specify how to deal with the
    non-ASCII characters, as accepted by the unicode.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    # fastpath
    if not s:
        return s
    if encoding is not None or isinstance(s, unicode):
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        s = s.encode(encoding, errors)
    if isinstance(safe, unicode):
        # Normalize 'safe' by converting to str and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = (safe, always_safe)
    try:
        (quoter, safe) = _safe_quoters[cachekey]
@ -1230,12 +1248,12 @@ def quote(s, safe='/'):
        return s
    return ''.join(map(quoter, s))
-def quote_plus(s, safe=''):
+def quote_plus(s, safe='', encoding=None, errors=None):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
-        s = quote(s, safe + ' ')
+        s = quote(s, safe + ' ', encoding, errors)
        return s.replace(' ', '+')
-    return quote(s, safe)
+    return quote(s, safe, encoding, errors)
 def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -18,6 +18,10 @@ Core and Builtins
 Library
 -------
 - Issue #1712522: urllib.quote now supports Unicode strings via new encoding
  and errors parameters. The encoding parameter defaults to utf-8 and errors to strict.
  Patch by Matt Giuca.
 - Issue #7646: The fnmatch pattern cache no longer grows without bound.
 - Issue #9136: Fix 'dictionary changed size during iteration'