mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Fixing Issue1712522 - urllib.quote to support Unicode. The default
encoding='utf-8' and errors='strict'.
This commit is contained in:
parent
5d10d33cd5
commit
5dba6dfe6a
4 changed files with 152 additions and 7 deletions
|
@ -202,24 +202,40 @@ High-level interface
|
||||||
Utility functions
|
Utility functions
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
.. function:: quote(string[, safe])
|
.. function:: quote(string[, safe[, encoding[, errors]]])
|
||||||
|
|
||||||
Replace special characters in *string* using the ``%xx`` escape. Letters,
|
Replace special characters in *string* using the ``%xx`` escape. Letters,
|
||||||
digits, and the characters ``'_.-'`` are never quoted. By default, this
|
digits, and the characters ``'_.-'`` are never quoted. By default, this
|
||||||
function is intended for quoting the path section of the URL.The optional
|
function is intended for quoting the path section of the URL. The optional
|
||||||
*safe* parameter specifies additional characters that should not be quoted
|
*safe* parameter specifies additional characters that should not be quoted
|
||||||
--- its default value is ``'/'``.
|
--- its default value is ``'/'``.
|
||||||
|
|
||||||
|
*string* may be either a :class:`str` or a :class:`unicode`.
|
||||||
|
|
||||||
|
The optional *encoding* and *errors* parameters specify how to deal with
|
||||||
|
non-ASCII characters, as accepted by the :meth:`unicode.encode` method.
|
||||||
|
*encoding* defaults to ``'utf-8'``.
|
||||||
|
*errors* defaults to ``'strict'``, meaning unsupported characters raise a
|
||||||
|
:class:`UnicodeEncodeError`.
|
||||||
|
Non-Unicode strings are not encoded by default, and all bytes are allowed.
|
||||||
|
|
||||||
Example: ``quote('/~connolly/')`` yields ``'/%7econnolly/'``.
|
Example: ``quote('/~connolly/')`` yields ``'/%7econnolly/'``.
|
||||||
|
|
||||||
|
Example: ``quote(u'/El Niño/')`` yields ``'/El%20Ni%C3%B1o/'``.
|
||||||
|
|
||||||
.. function:: quote_plus(string[, safe])
|
.. versionchanged:: 2.7.1
|
||||||
|
Added *encoding* and *errors* parameters.
|
||||||
|
|
||||||
|
|
||||||
|
.. function:: quote_plus(string[, safe[, encoding[, errors]]])
|
||||||
|
|
||||||
Like :func:`quote`, but also replaces spaces by plus signs, as required for
|
Like :func:`quote`, but also replaces spaces by plus signs, as required for
|
||||||
quoting HTML form values when building up a query string to go into a URL.
|
quoting HTML form values when building up a query string to go into a URL.
|
||||||
Plus signs in the original string are escaped unless they are included in
|
Plus signs in the original string are escaped unless they are included in
|
||||||
*safe*. It also does not have *safe* default to ``'/'``.
|
*safe*. It also does not have *safe* default to ``'/'``.
|
||||||
|
|
||||||
|
Example: ``quote_plus(u'/El Niño/')`` yields ``'%2FEl+Ni%C3%B1o%2F'``.
|
||||||
|
|
||||||
|
|
||||||
.. function:: unquote(string)
|
.. function:: unquote(string)
|
||||||
|
|
||||||
|
|
|
@ -355,6 +355,38 @@ class QuotingTests(unittest.TestCase):
|
||||||
self.assertEqual(quote_by_default, result,
|
self.assertEqual(quote_by_default, result,
|
||||||
"using quote_plus(): %s != %s" %
|
"using quote_plus(): %s != %s" %
|
||||||
(quote_by_default, result))
|
(quote_by_default, result))
|
||||||
|
# Safe expressed as unicode rather than str
|
||||||
|
result = urllib.quote(quote_by_default, safe=u"<>")
|
||||||
|
self.assertEqual(quote_by_default, result,
|
||||||
|
"using quote(): %r != %r" % (quote_by_default, result))
|
||||||
|
# "Safe" non-ASCII bytes should still work
|
||||||
|
# (Technically disallowed by the URI standard, but allowed for
|
||||||
|
# backwards compatibility with previous versions of Python)
|
||||||
|
result = urllib.quote(b"a\xfcb", safe=b"\xfc")
|
||||||
|
expect = b"a\xfcb"
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" %
|
||||||
|
(expect, result))
|
||||||
|
# Same as above, but with 'safe' as a unicode rather than str
|
||||||
|
# "Safe" non-ASCII unicode characters should have no effect
|
||||||
|
# (Since URIs are not allowed to have non-ASCII characters)
|
||||||
|
result = urllib.quote(b"a\xfcb", safe=u"\xfc")
|
||||||
|
expect = urllib.quote(b"a\xfcb", safe="")
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" %
|
||||||
|
(expect, result))
|
||||||
|
# Same as above, but quoting a unicode rather than a str
|
||||||
|
result = urllib.quote(u"a\xfcb", encoding="latin-1", safe=b"\xfc")
|
||||||
|
expect = b"a\xfcb"
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" %
|
||||||
|
(expect, result))
|
||||||
|
# Same as above, but with both the quoted value and 'safe' as unicode
|
||||||
|
result = urllib.quote(u"a\xfcb", encoding="latin-1", safe=u"\xfc")
|
||||||
|
expect = urllib.quote(u"a\xfcb", encoding="latin-1", safe="")
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" %
|
||||||
|
(expect, result))
|
||||||
|
|
||||||
def test_default_quoting(self):
|
def test_default_quoting(self):
|
||||||
# Make sure all characters that should be quoted are by default sans
|
# Make sure all characters that should be quoted are by default sans
|
||||||
|
@ -406,6 +438,81 @@ class QuotingTests(unittest.TestCase):
|
||||||
'alpha%2Bbeta+gamma')
|
'alpha%2Bbeta+gamma')
|
||||||
self.assertEqual(urllib.quote_plus('alpha+beta gamma', '+'),
|
self.assertEqual(urllib.quote_plus('alpha+beta gamma', '+'),
|
||||||
'alpha+beta+gamma')
|
'alpha+beta+gamma')
|
||||||
|
# Test with unicode
|
||||||
|
self.assertEqual(urllib.quote_plus(u'alpha+beta gamma'),
|
||||||
|
'alpha%2Bbeta+gamma')
|
||||||
|
# Test with safe unicode
|
||||||
|
self.assertEqual(urllib.quote_plus('alpha+beta gamma', u'+'),
|
||||||
|
'alpha+beta+gamma')
|
||||||
|
|
||||||
|
def test_quote_bytes(self):
|
||||||
|
# Non-ASCII bytes should quote directly to percent-encoded values
|
||||||
|
given = b"\xa2\xd8ab\xff"
|
||||||
|
expect = "%A2%D8ab%FF"
|
||||||
|
result = urllib.quote(given)
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" % (expect, result))
|
||||||
|
# Encoding argument should raise UnicodeDecodeError on bytes input
|
||||||
|
# with non-ASCII characters (just as with str.encode).
|
||||||
|
self.assertRaises(UnicodeDecodeError, urllib.quote, given,
|
||||||
|
encoding="latin-1")
|
||||||
|
|
||||||
|
def test_quote_with_unicode(self):
|
||||||
|
# Characters in Latin-1 range, encoded by default in UTF-8
|
||||||
|
given = u"\xa2\xd8ab\xff"
|
||||||
|
expect = "%C2%A2%C3%98ab%C3%BF"
|
||||||
|
result = urllib.quote(given)
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" % (expect, result))
|
||||||
|
# Characters in Latin-1 range, encoded with None (default)
|
||||||
|
result = urllib.quote(given, encoding=None, errors=None)
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" % (expect, result))
|
||||||
|
# Characters in Latin-1 range, encoded with Latin-1
|
||||||
|
given = u"\xa2\xd8ab\xff"
|
||||||
|
expect = "%A2%D8ab%FF"
|
||||||
|
result = urllib.quote(given, encoding="latin-1")
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" % (expect, result))
|
||||||
|
# Characters in BMP, encoded by default in UTF-8
|
||||||
|
given = u"\u6f22\u5b57" # "Kanji"
|
||||||
|
expect = "%E6%BC%A2%E5%AD%97"
|
||||||
|
result = urllib.quote(given)
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" % (expect, result))
|
||||||
|
# Characters in BMP, encoded with Latin-1
|
||||||
|
given = u"\u6f22\u5b57"
|
||||||
|
self.assertRaises(UnicodeEncodeError, urllib.quote, given,
|
||||||
|
encoding="latin-1")
|
||||||
|
# Characters in BMP, encoded with Latin-1, with replace error handling
|
||||||
|
given = u"\u6f22\u5b57"
|
||||||
|
expect = "%3F%3F" # "??"
|
||||||
|
result = urllib.quote(given, encoding="latin-1",
|
||||||
|
errors="replace")
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" % (expect, result))
|
||||||
|
# Characters in BMP, Latin-1, with xmlcharref error handling
|
||||||
|
given = u"\u6f22\u5b57"
|
||||||
|
expect = "%26%2328450%3B%26%2323383%3B" # "&#28450;&#23383;"
|
||||||
|
result = urllib.quote(given, encoding="latin-1",
|
||||||
|
errors="xmlcharrefreplace")
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote(): %r != %r" % (expect, result))
|
||||||
|
|
||||||
|
def test_quote_plus_with_unicode(self):
|
||||||
|
# Encoding (latin-1) test for quote_plus
|
||||||
|
given = u"\xa2\xd8 \xff"
|
||||||
|
expect = "%A2%D8+%FF"
|
||||||
|
result = urllib.quote_plus(given, encoding="latin-1")
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote_plus(): %r != %r" % (expect, result))
|
||||||
|
# Errors test for quote_plus
|
||||||
|
given = u"ab\u6f22\u5b57 cd"
|
||||||
|
expect = "ab%3F%3F+cd"
|
||||||
|
result = urllib.quote_plus(given, encoding="latin-1",
|
||||||
|
errors="replace")
|
||||||
|
self.assertEqual(expect, result,
|
||||||
|
"using quote_plus(): %r != %r" % (expect, result))
|
||||||
|
|
||||||
class UnquotingTests(unittest.TestCase):
|
class UnquotingTests(unittest.TestCase):
|
||||||
"""Tests for unquote() and unquote_plus()
|
"""Tests for unquote() and unquote_plus()
|
||||||
|
|
|
@ -1193,7 +1193,7 @@ for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
|
||||||
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{:02X}'.format(i)
|
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{:02X}'.format(i)
|
||||||
_safe_quoters = {}
|
_safe_quoters = {}
|
||||||
|
|
||||||
def quote(s, safe='/'):
|
def quote(s, safe='/', encoding=None, errors=None):
|
||||||
"""quote('abc def') -> 'abc%20def'
|
"""quote('abc def') -> 'abc%20def'
|
||||||
|
|
||||||
Each part of a URL, e.g. the path info, the query, etc., has a
|
Each part of a URL, e.g. the path info, the query, etc., has a
|
||||||
|
@ -1213,10 +1213,28 @@ def quote(s, safe='/'):
|
||||||
is reserved, but in typical usage the quote function is being
|
is reserved, but in typical usage the quote function is being
|
||||||
called on a path where the existing slash characters are used as
|
called on a path where the existing slash characters are used as
|
||||||
reserved characters.
|
reserved characters.
|
||||||
|
|
||||||
|
string and safe may be either str or unicode objects.
|
||||||
|
|
||||||
|
The optional encoding and errors parameters specify how to deal with
|
||||||
|
non-ASCII characters, as accepted by the unicode.encode method.
|
||||||
|
By default, encoding='utf-8' (characters are encoded with UTF-8), and
|
||||||
|
errors='strict' (unsupported characters raise a UnicodeEncodeError).
|
||||||
"""
|
"""
|
||||||
# fastpath
|
# fastpath
|
||||||
if not s:
|
if not s:
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
if encoding is not None or isinstance(s, unicode):
|
||||||
|
if encoding is None:
|
||||||
|
encoding = 'utf-8'
|
||||||
|
if errors is None:
|
||||||
|
errors = 'strict'
|
||||||
|
s = s.encode(encoding, errors)
|
||||||
|
if isinstance(safe, unicode):
|
||||||
|
# Normalize 'safe' by converting to str and removing non-ASCII chars
|
||||||
|
safe = safe.encode('ascii', 'ignore')
|
||||||
|
|
||||||
cachekey = (safe, always_safe)
|
cachekey = (safe, always_safe)
|
||||||
try:
|
try:
|
||||||
(quoter, safe) = _safe_quoters[cachekey]
|
(quoter, safe) = _safe_quoters[cachekey]
|
||||||
|
@ -1230,12 +1248,12 @@ def quote(s, safe='/'):
|
||||||
return s
|
return s
|
||||||
return ''.join(map(quoter, s))
|
return ''.join(map(quoter, s))
|
||||||
|
|
||||||
def quote_plus(s, safe=''):
|
def quote_plus(s, safe='', encoding=None, errors=None):
|
||||||
"""Quote the query fragment of a URL; replacing ' ' with '+'"""
|
"""Quote the query fragment of a URL; replacing ' ' with '+'"""
|
||||||
if ' ' in s:
|
if ' ' in s:
|
||||||
s = quote(s, safe + ' ')
|
s = quote(s, safe + ' ', encoding, errors)
|
||||||
return s.replace(' ', '+')
|
return s.replace(' ', '+')
|
||||||
return quote(s, safe)
|
return quote(s, safe, encoding, errors)
|
||||||
|
|
||||||
def urlencode(query, doseq=0):
|
def urlencode(query, doseq=0):
|
||||||
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
||||||
|
|
|
@ -18,6 +18,10 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #1712522: urllib.quote supports Unicode strings with encoding and errors
|
||||||
|
parameters. The encoding parameter defaults to utf-8 and errors to strict.
|
||||||
|
Patch by Matt Giuca.
|
||||||
|
|
||||||
- Issue #7646: The fnmatch pattern cache no longer grows without bound.
|
- Issue #7646: The fnmatch pattern cache no longer grows without bound.
|
||||||
|
|
||||||
- Issue #9136: Fix 'dictionary changed size during iteration'
|
- Issue #9136: Fix 'dictionary changed size during iteration'
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue