mirror of
https://github.com/python/cpython.git
synced 2025-09-27 02:39:58 +00:00
bpo-16285: Update urllib quoting to RFC 3986 (#173)
* bpo-16285: Update urllib quoting to RFC 3986 urllib.parse.quote is now based on RFC 3986, and hence includes `'~'` in the set of characters that is not escaped by default. Patch by Christian Theune and Ratnadeep Debnath.
This commit is contained in:
parent
140792bd51
commit
21024f0662
6 changed files with 27 additions and 7 deletions
|
@ -451,13 +451,17 @@ task isn't already covered by the URL parsing functions above.
|
||||||
.. function:: quote(string, safe='/', encoding=None, errors=None)
|
.. function:: quote(string, safe='/', encoding=None, errors=None)
|
||||||
|
|
||||||
Replace special characters in *string* using the ``%xx`` escape. Letters,
|
Replace special characters in *string* using the ``%xx`` escape. Letters,
|
||||||
digits, and the characters ``'_.-'`` are never quoted. By default, this
|
digits, and the characters ``'_.-~'`` are never quoted. By default, this
|
||||||
function is intended for quoting the path section of a URL. The optional *safe*
|
function is intended for quoting the path section of a URL. The optional *safe*
|
||||||
parameter specifies additional ASCII characters that should not be quoted
|
parameter specifies additional ASCII characters that should not be quoted
|
||||||
--- its default value is ``'/'``.
|
--- its default value is ``'/'``.
|
||||||
|
|
||||||
*string* may be either a :class:`str` or a :class:`bytes`.
|
*string* may be either a :class:`str` or a :class:`bytes`.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.7
|
||||||
|
Moved from RFC 2396 to RFC 3986 for quoting URL strings. "~" is now
|
||||||
|
included in the set of unreserved characters.
|
||||||
|
|
||||||
The optional *encoding* and *errors* parameters specify how to deal with
|
The optional *encoding* and *errors* parameters specify how to deal with
|
||||||
non-ASCII characters, as accepted by the :meth:`str.encode` method.
|
non-ASCII characters, as accepted by the :meth:`str.encode` method.
|
||||||
*encoding* defaults to ``'utf-8'``.
|
*encoding* defaults to ``'utf-8'``.
|
||||||
|
|
|
@ -103,6 +103,13 @@ The :const:`~unittest.mock.sentinel` attributes now preserve their identity
|
||||||
when they are :mod:`copied <copy>` or :mod:`pickled <pickle>`.
|
when they are :mod:`copied <copy>` or :mod:`pickled <pickle>`.
|
||||||
(Contributed by Serhiy Storchaka in :issue:`20804`.)
|
(Contributed by Serhiy Storchaka in :issue:`20804`.)
|
||||||
|
|
||||||
|
urllib.parse
|
||||||
|
------------
|
||||||
|
|
||||||
|
:func:`urllib.parse.quote` has been updated from RFC 2396 to RFC 3986,
|
||||||
|
adding ``~`` to the set of characters that is never quoted by default.
|
||||||
|
(Contributed by Christian Theune and Ratnadeep Debnath in :issue:`16285`.)
|
||||||
|
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
=============
|
=============
|
||||||
|
|
|
@ -733,7 +733,7 @@ FF
|
||||||
class QuotingTests(unittest.TestCase):
|
class QuotingTests(unittest.TestCase):
|
||||||
r"""Tests for urllib.quote() and urllib.quote_plus()
|
r"""Tests for urllib.quote() and urllib.quote_plus()
|
||||||
|
|
||||||
According to RFC 2396 (Uniform Resource Identifiers), to escape a
|
According to RFC 3986 (Uniform Resource Identifiers), to escape a
|
||||||
character you write it as '%' + <2 character US-ASCII hex value>.
|
character you write it as '%' + <2 character US-ASCII hex value>.
|
||||||
The Python code of ``'%' + hex(ord(<character>))[2:]`` escapes a
|
The Python code of ``'%' + hex(ord(<character>))[2:]`` escapes a
|
||||||
character properly. Case does not matter on the hex letters.
|
character properly. Case does not matter on the hex letters.
|
||||||
|
@ -761,7 +761,7 @@ class QuotingTests(unittest.TestCase):
|
||||||
do_not_quote = '' .join(["ABCDEFGHIJKLMNOPQRSTUVWXYZ",
|
do_not_quote = '' .join(["ABCDEFGHIJKLMNOPQRSTUVWXYZ",
|
||||||
"abcdefghijklmnopqrstuvwxyz",
|
"abcdefghijklmnopqrstuvwxyz",
|
||||||
"0123456789",
|
"0123456789",
|
||||||
"_.-"])
|
"_.-~"])
|
||||||
result = urllib.parse.quote(do_not_quote)
|
result = urllib.parse.quote(do_not_quote)
|
||||||
self.assertEqual(do_not_quote, result,
|
self.assertEqual(do_not_quote, result,
|
||||||
"using quote(): %r != %r" % (do_not_quote, result))
|
"using quote(): %r != %r" % (do_not_quote, result))
|
||||||
|
|
|
@ -704,7 +704,7 @@ def unquote_plus(string, encoding='utf-8', errors='replace'):
|
||||||
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||||
b'abcdefghijklmnopqrstuvwxyz'
|
b'abcdefghijklmnopqrstuvwxyz'
|
||||||
b'0123456789'
|
b'0123456789'
|
||||||
b'_.-')
|
b'_.-~')
|
||||||
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
|
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
|
||||||
_safe_quoters = {}
|
_safe_quoters = {}
|
||||||
|
|
||||||
|
@ -736,15 +736,18 @@ def quote(string, safe='/', encoding=None, errors=None):
|
||||||
Each part of a URL, e.g. the path info, the query, etc., has a
|
Each part of a URL, e.g. the path info, the query, etc., has a
|
||||||
different set of reserved characters that must be quoted.
|
different set of reserved characters that must be quoted.
|
||||||
|
|
||||||
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
|
RFC 3986 Uniform Resource Identifiers (URI): Generic Syntax lists
|
||||||
the following reserved characters.
|
the following reserved characters.
|
||||||
|
|
||||||
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
||||||
"$" | ","
|
"$" | ","
|
||||||
|
|
||||||
Each of these characters is reserved in some component of a URL,
|
Each of these characters is reserved in some component of a URL,
|
||||||
but not necessarily in all of them.
|
but not necessarily in all of them.
|
||||||
|
|
||||||
|
Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
|
||||||
|
Now, "~" is included in the set of unreserved characters.
|
||||||
|
|
||||||
By default, the quote function is intended for quoting the path
|
By default, the quote function is intended for quoting the path
|
||||||
section of a URL. Thus, it will not encode '/'. This character
|
section of a URL. Thus, it will not encode '/'. This character
|
||||||
is reserved, but in typical usage the quote function is being
|
is reserved, but in typical usage the quote function is being
|
||||||
|
|
|
@ -344,6 +344,7 @@ Kushal Das
|
||||||
Jonathan Dasteel
|
Jonathan Dasteel
|
||||||
Pierre-Yves David
|
Pierre-Yves David
|
||||||
A. Jesse Jiryu Davis
|
A. Jesse Jiryu Davis
|
||||||
|
Ratnadeep Debnath
|
||||||
Merlijn van Deen
|
Merlijn van Deen
|
||||||
John DeGood
|
John DeGood
|
||||||
Ned Deily
|
Ned Deily
|
||||||
|
@ -1518,6 +1519,7 @@ Mikhail Terekhov
|
||||||
Victor Terrón
|
Victor Terrón
|
||||||
Richard M. Tew
|
Richard M. Tew
|
||||||
Tobias Thelen
|
Tobias Thelen
|
||||||
|
Christian Theune
|
||||||
Févry Thibault
|
Févry Thibault
|
||||||
Lowe Thiderman
|
Lowe Thiderman
|
||||||
Nicolas M. Thiéry
|
Nicolas M. Thiéry
|
||||||
|
@ -1528,7 +1530,7 @@ Stephen Thorne
|
||||||
Jeremy Thurgood
|
Jeremy Thurgood
|
||||||
Eric Tiedemann
|
Eric Tiedemann
|
||||||
July Tikhonov
|
July Tikhonov
|
||||||
Tracy Tims
|
|