mirror of
https://github.com/python/cpython.git
synced 2025-09-27 02:39:58 +00:00
Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the resolution of relative URLs, rather than RFCs 1808 and 2396.
Patch by Demian Brecht.
This commit is contained in:
parent
a7eb746278
commit
55ac5b3f7b
4 changed files with 71 additions and 41 deletions
|
@ -267,6 +267,11 @@ or on combining URL components into a URL string.
|
||||||
:func:`urlunsplit`, removing possible *scheme* and *netloc* parts.
|
:func:`urlunsplit`, removing possible *scheme* and *netloc* parts.
|
||||||
|
|
||||||
|
|
||||||
|
.. versionchanged:: 3.5
|
||||||
|
|
||||||
|
Behaviour updated to match the semantics defined in :rfc:`3986`.
|
||||||
|
|
||||||
|
|
||||||
.. function:: urldefrag(url)
|
.. function:: urldefrag(url)
|
||||||
|
|
||||||
If *url* contains a fragment identifier, return a modified version of *url*
|
If *url* contains a fragment identifier, return a modified version of *url*
|
||||||
|
|
|
@ -211,10 +211,6 @@ class UrlParseTestCase(unittest.TestCase):
|
||||||
|
|
||||||
# "abnormal" cases from RFC 1808:
|
# "abnormal" cases from RFC 1808:
|
||||||
self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f')
|
self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f')
|
||||||
self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g')
|
|
||||||
self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g')
|
|
||||||
self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g')
|
|
||||||
self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g')
|
|
||||||
self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.')
|
self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.')
|
||||||
self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g')
|
self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g')
|
||||||
self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..')
|
self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..')
|
||||||
|
@ -229,6 +225,13 @@ class UrlParseTestCase(unittest.TestCase):
|
||||||
#self.checkJoin(RFC1808_BASE, 'http:g', 'http:g')
|
#self.checkJoin(RFC1808_BASE, 'http:g', 'http:g')
|
||||||
#self.checkJoin(RFC1808_BASE, 'http:', 'http:')
|
#self.checkJoin(RFC1808_BASE, 'http:', 'http:')
|
||||||
|
|
||||||
|
# XXX: The following tests are no longer compatible with RFC3986
|
||||||
|
# self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g')
|
||||||
|
# self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g')
|
||||||
|
# self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g')
|
||||||
|
# self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g')
|
||||||
|
|
||||||
|
|
||||||
def test_RFC2368(self):
|
def test_RFC2368(self):
|
||||||
# Issue 11467: path that starts with a number is not parsed correctly
|
# Issue 11467: path that starts with a number is not parsed correctly
|
||||||
self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'),
|
self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'),
|
||||||
|
@ -259,10 +262,6 @@ class UrlParseTestCase(unittest.TestCase):
|
||||||
self.checkJoin(RFC2396_BASE, '../../', 'http://a/')
|
self.checkJoin(RFC2396_BASE, '../../', 'http://a/')
|
||||||
self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g')
|
self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g')
|
||||||
self.checkJoin(RFC2396_BASE, '', RFC2396_BASE)
|
self.checkJoin(RFC2396_BASE, '', RFC2396_BASE)
|
||||||
self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g')
|
|
||||||
self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g')
|
|
||||||
self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g')
|
|
||||||
self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g')
|
|
||||||
self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.')
|
self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.')
|
||||||
self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g')
|
self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g')
|
||||||
self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..')
|
self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..')
|
||||||
|
@ -278,10 +277,17 @@ class UrlParseTestCase(unittest.TestCase):
|
||||||
self.checkJoin(RFC2396_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x')
|
self.checkJoin(RFC2396_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x')
|
||||||
self.checkJoin(RFC2396_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x')
|
self.checkJoin(RFC2396_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x')
|
||||||
|
|
||||||
|
# XXX: The following tests are no longer compatible with RFC3986
|
||||||
|
# self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g')
|
||||||
|
# self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g')
|
||||||
|
# self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g')
|
||||||
|
# self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g')
|
||||||
|
|
||||||
|
|
||||||
def test_RFC3986(self):
|
def test_RFC3986(self):
|
||||||
# Test cases from RFC3986
|
# Test cases from RFC3986
|
||||||
self.checkJoin(RFC3986_BASE, '?y','http://a/b/c/d;p?y')
|
self.checkJoin(RFC3986_BASE, '?y','http://a/b/c/d;p?y')
|
||||||
self.checkJoin(RFC2396_BASE, ';x', 'http://a/b/c/;x')
|
self.checkJoin(RFC3986_BASE, ';x', 'http://a/b/c/;x')
|
||||||
self.checkJoin(RFC3986_BASE, 'g:h','g:h')
|
self.checkJoin(RFC3986_BASE, 'g:h','g:h')
|
||||||
self.checkJoin(RFC3986_BASE, 'g','http://a/b/c/g')
|
self.checkJoin(RFC3986_BASE, 'g','http://a/b/c/g')
|
||||||
self.checkJoin(RFC3986_BASE, './g','http://a/b/c/g')
|
self.checkJoin(RFC3986_BASE, './g','http://a/b/c/g')
|
||||||
|
@ -305,17 +311,17 @@ class UrlParseTestCase(unittest.TestCase):
|
||||||
self.checkJoin(RFC3986_BASE, '../..','http://a/')
|
self.checkJoin(RFC3986_BASE, '../..','http://a/')
|
||||||
self.checkJoin(RFC3986_BASE, '../../','http://a/')
|
self.checkJoin(RFC3986_BASE, '../../','http://a/')
|
||||||
self.checkJoin(RFC3986_BASE, '../../g','http://a/g')
|
self.checkJoin(RFC3986_BASE, '../../g','http://a/g')
|
||||||
|
self.checkJoin(RFC3986_BASE, '../../../g', 'http://a/g')
|
||||||
|
|
||||||
#Abnormal Examples
|
#Abnormal Examples
|
||||||
|
|
||||||
# The 'abnormal scenarios' are incompatible with RFC3986 parsing
|
# The 'abnormal scenarios' are incompatible with RFC3986 parsing
|
||||||
# Tests are here for reference.
|
# Tests are here for reference.
|
||||||
|
|
||||||
#self.checkJoin(RFC3986_BASE, '../../../g','http://a/g')
|
self.checkJoin(RFC3986_BASE, '../../../g','http://a/g')
|
||||||
#self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g')
|
self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g')
|
||||||
#self.checkJoin(RFC3986_BASE, '/./g','http://a/g')
|
self.checkJoin(RFC3986_BASE, '/./g','http://a/g')
|
||||||
#self.checkJoin(RFC3986_BASE, '/../g','http://a/g')
|
self.checkJoin(RFC3986_BASE, '/../g','http://a/g')
|
||||||
|
|
||||||
self.checkJoin(RFC3986_BASE, 'g.','http://a/b/c/g.')
|
self.checkJoin(RFC3986_BASE, 'g.','http://a/b/c/g.')
|
||||||
self.checkJoin(RFC3986_BASE, '.g','http://a/b/c/.g')
|
self.checkJoin(RFC3986_BASE, '.g','http://a/b/c/.g')
|
||||||
self.checkJoin(RFC3986_BASE, 'g..','http://a/b/c/g..')
|
self.checkJoin(RFC3986_BASE, 'g..','http://a/b/c/g..')
|
||||||
|
@ -355,10 +361,8 @@ class UrlParseTestCase(unittest.TestCase):
|
||||||
self.checkJoin(SIMPLE_BASE, '../g','http://a/b/g')
|
self.checkJoin(SIMPLE_BASE, '../g','http://a/b/g')
|
||||||
self.checkJoin(SIMPLE_BASE, '../..','http://a/')
|
self.checkJoin(SIMPLE_BASE, '../..','http://a/')
|
||||||
self.checkJoin(SIMPLE_BASE, '../../g','http://a/g')
|
self.checkJoin(SIMPLE_BASE, '../../g','http://a/g')
|
||||||
self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g')
|
|
||||||
self.checkJoin(SIMPLE_BASE, './../g','http://a/b/g')
|
self.checkJoin(SIMPLE_BASE, './../g','http://a/b/g')
|
||||||
self.checkJoin(SIMPLE_BASE, './g/.','http://a/b/c/g/')
|
self.checkJoin(SIMPLE_BASE, './g/.','http://a/b/c/g/')
|
||||||
self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g')
|
|
||||||
self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h')
|
self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h')
|
||||||
self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h')
|
self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h')
|
||||||
self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g')
|
self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g')
|
||||||
|
@ -372,6 +376,10 @@ class UrlParseTestCase(unittest.TestCase):
|
||||||
self.checkJoin('svn://pathtorepo/dir1', 'dir2', 'svn://pathtorepo/dir2')
|
self.checkJoin('svn://pathtorepo/dir1', 'dir2', 'svn://pathtorepo/dir2')
|
||||||
self.checkJoin('svn+ssh://pathtorepo/dir1', 'dir2', 'svn+ssh://pathtorepo/dir2')
|
self.checkJoin('svn+ssh://pathtorepo/dir1', 'dir2', 'svn+ssh://pathtorepo/dir2')
|
||||||
|
|
||||||
|
# XXX: The following tests are no longer compatible with RFC3986
|
||||||
|
# self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g')
|
||||||
|
# self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g')
|
||||||
|
|
||||||
def test_RFC2732(self):
|
def test_RFC2732(self):
|
||||||
str_cases = [
|
str_cases = [
|
||||||
('http://Test.python.org:5432/foo/', 'test.python.org', 5432),
|
('http://Test.python.org:5432/foo/', 'test.python.org', 5432),
|
||||||
|
|
|
@ -409,11 +409,13 @@ def urljoin(base, url, allow_fragments=True):
|
||||||
return url
|
return url
|
||||||
if not url:
|
if not url:
|
||||||
return base
|
return base
|
||||||
|
|
||||||
base, url, _coerce_result = _coerce_args(base, url)
|
base, url, _coerce_result = _coerce_args(base, url)
|
||||||
bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
|
bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
|
||||||
urlparse(base, '', allow_fragments)
|
urlparse(base, '', allow_fragments)
|
||||||
scheme, netloc, path, params, query, fragment = \
|
scheme, netloc, path, params, query, fragment = \
|
||||||
urlparse(url, bscheme, allow_fragments)
|
urlparse(url, bscheme, allow_fragments)
|
||||||
|
|
||||||
if scheme != bscheme or scheme not in uses_relative:
|
if scheme != bscheme or scheme not in uses_relative:
|
||||||
return _coerce_result(url)
|
return _coerce_result(url)
|
||||||
if scheme in uses_netloc:
|
if scheme in uses_netloc:
|
||||||
|
@ -421,9 +423,7 @@ def urljoin(base, url, allow_fragments=True):
|
||||||
return _coerce_result(urlunparse((scheme, netloc, path,
|
return _coerce_result(urlunparse((scheme, netloc, path,
|
||||||
params, query, fragment)))
|
params, query, fragment)))
|
||||||
netloc = bnetloc
|
netloc = bnetloc
|
||||||
if path[:1] == '/':
|
|
||||||
return _coerce_result(urlunparse((scheme, netloc, path,
|
|
||||||
params, query, fragment)))
|
|
||||||
if not path and not params:
|
if not path and not params:
|
||||||
path = bpath
|
path = bpath
|
||||||
params = bparams
|
params = bparams
|
||||||
|
@ -431,29 +431,42 @@ def urljoin(base, url, allow_fragments=True):
|
||||||
query = bquery
|
query = bquery
|
||||||
return _coerce_result(urlunparse((scheme, netloc, path,
|
return _coerce_result(urlunparse((scheme, netloc, path,
|
||||||
params, query, fragment)))
|
params, query, fragment)))
|
||||||
segments = bpath.split('/')[:-1] + path.split('/')
|
|
||||||
# XXX The stuff below is bogus in various ways...
|
base_parts = bpath.split('/')
|
||||||
if segments[-1] == '.':
|
if base_parts[-1] != '':
|
||||||
segments[-1] = ''
|
# the last item is not a directory, so will not be taken into account
|
||||||
while '.' in segments:
|
# in resolving the relative path
|
||||||
segments.remove('.')
|
del base_parts[-1]
|
||||||
while 1:
|
|
||||||
i = 1
|
# for rfc3986, ignore the entire base path if the relative path starts with '/'.
|
||||||
n = len(segments) - 1
|
if path[:1] == '/':
|
||||||
while i < n:
|
segments = path.split('/')
|
||||||
if (segments[i] == '..'
|
else:
|
||||||
and segments[i-1] not in ('', '..')):
|
segments = base_parts + path.split('/')
|
||||||
del segments[i-1:i+1]
|
|
||||||
break
|
resolved_path = []
|
||||||
i = i+1
|
|
||||||
|
for seg in segments:
|
||||||
|
if seg == '..':
|
||||||
|
try:
|
||||||
|
resolved_path.pop()
|
||||||
|
except IndexError:
|
||||||
|
# ignore any .. segments that would otherwise cause an IndexError
|
||||||
|
# when popped from resolved_path if resolving for rfc3986
|
||||||
|
pass
|
||||||
|
elif seg == '.':
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
break
|
resolved_path.append(seg)
|
||||||
if segments == ['', '..']:
|
|
||||||
segments[-1] = ''
|
if segments[-1] in ('.', '..'):
|
||||||
elif len(segments) >= 2 and segments[-1] == '..':
|
# do some post-processing here. if the last segment was a relative dir,
|
||||||
segments[-2:] = ['']
|
# then we need to append the trailing '/'
|
||||||
return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
|
resolved_path.append('')
|
||||||
params, query, fragment)))
|
|
||||||
|
return _coerce_result(urlunparse((scheme, netloc, '/'.join(
|
||||||
|
resolved_path), params, query, fragment)))
|
||||||
|
|
||||||
|
|
||||||
def urldefrag(url):
|
def urldefrag(url):
|
||||||
"""Removes any existing fragment from URL.
|
"""Removes any existing fragment from URL.
|
||||||
|
|
|
@ -124,6 +124,10 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the
|
||||||
|
resolution of relative URLs, rather than RFCs 1808 and 2396.
|
||||||
|
Patch by Demian Brecht.
|
||||||
|
|
||||||
- Issue #21549: Added the "members" parameter to TarFile.list().
|
- Issue #21549: Added the "members" parameter to TarFile.list().
|
||||||
|
|
||||||
- Issue #19628: Allow compileall recursion depth to be specified with a -r
|
- Issue #19628: Allow compileall recursion depth to be specified with a -r
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue