mirror of
https://github.com/python/cpython.git
synced 2025-07-30 06:34:15 +00:00

It consists of code from urllib, urllib2, urlparse, and robotparser. The old modules have all been removed. The new package has five submodules: urllib.parse, urllib.request, urllib.response, urllib.error, and urllib.robotparser. The urllib.request.urlopen() function uses the url opener from urllib2. Note that the unittests have not been renamed for the beta, but they will be renamed in the future. Joint work with Senthil Kumaran.
317 lines
15 KiB
Python
317 lines
15 KiB
Python
#! /usr/bin/env python
|
|
|
|
from test import support
|
|
import unittest
|
|
import urllib.parse
|
|
|
|
# Base URL (with params, query and fragment) against which the relative
# references in test_RFC1808 are resolved; also used as a urldefrag input.
RFC1808_BASE = "http://a/b/c/d;p?q#f"
|
|
# Base URL (same as above but without the fragment) against which the
# relative references in test_RFC2396 are resolved; also a urldefrag input.
RFC2396_BASE = "http://a/b/c/d;p?q"
|
|
|
|
class UrlParseTestCase(unittest.TestCase):
    """Exercise urllib.parse: urlparse/urlsplit round trips, urljoin
    resolution against the RFC 1808 and RFC 2396 example bases, urldefrag,
    and the attribute accessors of the parse results.
    """

    def checkRoundtrips(self, url, parsed, split):
        """Assert that *url* survives parse -> unparse -> parse unchanged.

        *parsed* is the 6-tuple urlparse() must yield for *url* and *split*
        the 5-tuple urlsplit() must yield.  For each of the two APIs the
        result is compared with the expected tuple, reassembled, compared
        with the original URL and with geturl(), and finally geturl() is
        parsed again to confirm it is a fixpoint.
        """
        parse_fields = ('scheme', 'netloc', 'path',
                        'params', 'query', 'fragment')
        split_fields = ('scheme', 'netloc', 'path', 'query', 'fragment')
        # Accessors derived from netloc; compared after the tuple fields.
        derived = ('username', 'password', 'hostname', 'port')

        plans = (
            (urllib.parse.urlparse, urllib.parse.urlunparse,
             parsed, parse_fields),
            (urllib.parse.urlsplit, urllib.parse.urlunsplit,
             split, split_fields),
        )
        for decompose, recompose, expected, fields in plans:
            first = decompose(url)
            self.assertEqual(first, expected)
            by_name = tuple(getattr(first, name) for name in fields)
            self.assertEqual(by_name, expected)

            # Put it back together and it should be the same.
            rebuilt = recompose(first)
            self.assertEqual(rebuilt, url)
            self.assertEqual(rebuilt, first.geturl())

            # The result of geturl() is a fixpoint; parsing it again must
            # give the same result, field by field.
            again = decompose(first.geturl())
            self.assertEqual(again.geturl(), first.geturl())
            self.assertEqual(again, first)
            for name in fields + derived:
                self.assertEqual(getattr(again, name), getattr(first, name))

    def test_roundtrips(self):
        # Each entry: (url, expected urlparse tuple, expected urlsplit tuple).
        testcases = [
            ('file:///tmp/junk.txt',
             ('file', '', '/tmp/junk.txt', '', '', ''),
             ('file', '', '/tmp/junk.txt', '', '')),
            ('imap://mail.python.org/mbox1',
             ('imap', 'mail.python.org', '/mbox1', '', '', ''),
             ('imap', 'mail.python.org', '/mbox1', '', '')),
            ('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf',
             ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
              '', '', ''),
             ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
              '', '')),
            ('svn+ssh://svn.zope.org/repos/main/ZConfig/trunk/',
             ('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/',
              '', '', ''),
             ('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/',
              '', '')),
        ]
        for case in testcases:
            self.checkRoundtrips(*case)

    def test_http_roundtrips(self):
        # urllib.parse.urlsplit treats 'http:' as an optimized special case,
        # so every case below is run with both 'http' and 'https'.
        # Three cheers for white box knowledge!
        # Entries omit the scheme; it is prepended inside the loop.
        testcases = [
            ('://www.python.org',
             ('www.python.org', '', '', '', ''),
             ('www.python.org', '', '', '')),
            ('://www.python.org#abc',
             ('www.python.org', '', '', '', 'abc'),
             ('www.python.org', '', '', 'abc')),
            ('://www.python.org?q=abc',
             ('www.python.org', '', '', 'q=abc', ''),
             ('www.python.org', '', 'q=abc', '')),
            ('://www.python.org/#abc',
             ('www.python.org', '/', '', '', 'abc'),
             ('www.python.org', '/', '', 'abc')),
            ('://a/b/c/d;p?q#f',
             ('a', '/b/c/d', 'p', 'q', 'f'),
             ('a', '/b/c/d;p', 'q', 'f')),
        ]
        for scheme in ('http', 'https'):
            prefix = (scheme,)
            for tail, parsed_tail, split_tail in testcases:
                self.checkRoundtrips(scheme + tail,
                                     prefix + parsed_tail,
                                     prefix + split_tail)

    def checkJoin(self, base, relurl, expected):
        """Assert that urljoin(base, relurl) equals *expected*.

        The (base, relurl, expected) triple is passed as the failure
        message so a failing case identifies itself.
        """
        joined = urllib.parse.urljoin(base, relurl)
        self.assertEqual(joined, expected, (base, relurl, expected))

    def test_unparse_parse(self):
        # Scheme-less, netloc-less strings must survive both round trips.
        pairs = (
            (urllib.parse.urlsplit, urllib.parse.urlunsplit),
            (urllib.parse.urlparse, urllib.parse.urlunparse),
        )
        for u in ('Python', './Python'):
            for take_apart, reassemble in pairs:
                self.assertEqual(reassemble(take_apart(u)), u)

    def test_RFC1808(self):
        # Each entry: (relative reference, expected result of joining it
        # onto RFC1808_BASE).
        normal_cases = [
            # "normal" cases from RFC 1808:
            ('g:h', 'g:h'),
            ('g', 'http://a/b/c/g'),
            ('./g', 'http://a/b/c/g'),
            ('g/', 'http://a/b/c/g/'),
            ('/g', 'http://a/g'),
            ('//g', 'http://g'),
            ('g?y', 'http://a/b/c/g?y'),
            ('g?y/./x', 'http://a/b/c/g?y/./x'),
            ('#s', 'http://a/b/c/d;p?q#s'),
            ('g#s', 'http://a/b/c/g#s'),
            ('g#s/./x', 'http://a/b/c/g#s/./x'),
            ('g?y#s', 'http://a/b/c/g?y#s'),
            ('g;x', 'http://a/b/c/g;x'),
            ('g;x?y#s', 'http://a/b/c/g;x?y#s'),
            ('.', 'http://a/b/c/'),
            ('./', 'http://a/b/c/'),
            ('..', 'http://a/b/'),
            ('../', 'http://a/b/'),
            ('../g', 'http://a/b/g'),
            ('../..', 'http://a/'),
            ('../../', 'http://a/'),
            ('../../g', 'http://a/g'),
        ]
        abnormal_cases = [
            # "abnormal" cases from RFC 1808:
            ('', 'http://a/b/c/d;p?q#f'),
            ('../../../g', 'http://a/../g'),
            ('../../../../g', 'http://a/../../g'),
            ('/./g', 'http://a/./g'),
            ('/../g', 'http://a/../g'),
            ('g.', 'http://a/b/c/g.'),
            ('.g', 'http://a/b/c/.g'),
            ('g..', 'http://a/b/c/g..'),
            ('..g', 'http://a/b/c/..g'),
            ('./../g', 'http://a/b/g'),
            ('./g/.', 'http://a/b/c/g/'),
            ('g/./h', 'http://a/b/c/g/h'),
            ('g/../h', 'http://a/b/c/h'),
        ]
        # RFC 1808 and RFC 1630 disagree on 'http:g' and 'http:' (according
        # to RFC 1808), so those two cases, which would expect the 1808
        # behavior ('http:g' -> 'http:g', 'http:' -> 'http:'), are
        # deliberately not run.
        for relurl, expected in normal_cases + abnormal_cases:
            self.checkJoin(RFC1808_BASE, relurl, expected)

    def test_RFC2396(self):
        # cases from RFC 2396
        # Each entry: (relative reference, expected result of joining it
        # onto RFC2396_BASE).
        cases = [
            ('?y', 'http://a/b/c/?y'),
            (';x', 'http://a/b/c/;x'),

            ('g:h', 'g:h'),
            ('g', 'http://a/b/c/g'),
            ('./g', 'http://a/b/c/g'),
            ('g/', 'http://a/b/c/g/'),
            ('/g', 'http://a/g'),
            ('//g', 'http://g'),
            ('g?y', 'http://a/b/c/g?y'),
            ('#s', 'http://a/b/c/d;p?q#s'),
            ('g#s', 'http://a/b/c/g#s'),
            ('g?y#s', 'http://a/b/c/g?y#s'),
            ('g;x', 'http://a/b/c/g;x'),
            ('g;x?y#s', 'http://a/b/c/g;x?y#s'),
            ('.', 'http://a/b/c/'),
            ('./', 'http://a/b/c/'),
            ('..', 'http://a/b/'),
            ('../', 'http://a/b/'),
            ('../g', 'http://a/b/g'),
            ('../..', 'http://a/'),
            ('../../', 'http://a/'),
            ('../../g', 'http://a/g'),
            ('', RFC2396_BASE),
            ('../../../g', 'http://a/../g'),
            ('../../../../g', 'http://a/../../g'),
            ('/./g', 'http://a/./g'),
            ('/../g', 'http://a/../g'),
            ('g.', 'http://a/b/c/g.'),
            ('.g', 'http://a/b/c/.g'),
            ('g..', 'http://a/b/c/g..'),
            ('..g', 'http://a/b/c/..g'),
            ('./../g', 'http://a/b/g'),
            ('./g/.', 'http://a/b/c/g/'),
            ('g/./h', 'http://a/b/c/g/h'),
            ('g/../h', 'http://a/b/c/h'),
            ('g;x=1/./y', 'http://a/b/c/g;x=1/y'),
            ('g;x=1/../y', 'http://a/b/c/y'),
            ('g?y/./x', 'http://a/b/c/g?y/./x'),
            ('g?y/../x', 'http://a/b/c/g?y/../x'),
            ('g#s/./x', 'http://a/b/c/g#s/./x'),
            ('g#s/../x', 'http://a/b/c/g#s/../x'),
        ]
        for relurl, expected in cases:
            self.checkJoin(RFC2396_BASE, relurl, expected)

    def test_urldefrag(self):
        # Each entry: (url, url without its fragment, the fragment).
        cases = [
            ('http://python.org#frag', 'http://python.org', 'frag'),
            ('http://python.org', 'http://python.org', ''),
            ('http://python.org/#frag', 'http://python.org/', 'frag'),
            ('http://python.org/', 'http://python.org/', ''),
            ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
            ('http://python.org/?q', 'http://python.org/?q', ''),
            ('http://python.org/p#frag', 'http://python.org/p', 'frag'),
            ('http://python.org/p?q', 'http://python.org/p?q', ''),
            (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
            (RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
        ]
        for url, defrag, frag in cases:
            self.assertEqual(urllib.parse.urldefrag(url), (defrag, frag))

    def test_urlsplit_attributes(self):
        # Attributes are checked in this order for every scenario.
        names = ('scheme', 'netloc', 'path', 'query', 'fragment',
                 'username', 'password', 'hostname', 'port')
        # Each scenario: (url, whether geturl() must reproduce the url,
        # expected attribute values in the order of `names`).
        scenarios = [
            # geturl() won't return exactly the original URL in this case
            # since the scheme is always case-normalized, so that check is
            # skipped here.
            ("HTTP://WWW.PYTHON.ORG/doc/#frag",
             False,
             ("http", "WWW.PYTHON.ORG", "/doc/", "", "frag",
              None, None, "www.python.org", None)),

            ("http://User:Pass@www.python.org:080/doc/?query=yes#frag",
             True,
             ("http", "User:Pass@www.python.org:080", "/doc/",
              "query=yes", "frag",
              "User", "Pass", "www.python.org", 80)),

            # Addressing issue1698, which suggests Username can contain
            # "@" characters.  Though not RFC compliant, many ftp sites allow
            # and request email addresses as usernames.
            ("http://User@example.com:Pass@www.python.org:080/doc/?query=yes#frag",
             True,
             ("http", "User@example.com:Pass@www.python.org:080", "/doc/",
              "query=yes", "frag",
              "User@example.com", "Pass", "www.python.org", 80)),
        ]
        for url, must_roundtrip, wanted in scenarios:
            p = urllib.parse.urlsplit(url)
            for name, value in zip(names, wanted):
                self.assertEqual(getattr(p, name), value)
            if must_roundtrip:
                self.assertEqual(p.geturl(), url)

    def test_attributes_bad_port(self):
        """Check handling of non-integer ports."""
        # Both APIs must keep the raw netloc and raise ValueError only when
        # the port attribute is read.
        for parse in (urllib.parse.urlsplit, urllib.parse.urlparse):
            p = parse("http://www.example.net:foo")
            self.assertEqual(p.netloc, "www.example.net:foo")
            self.assertRaises(ValueError, getattr, p, "port")

    def test_attributes_without_netloc(self):
        # This example is straight from RFC 3261.  It looks like it
        # should allow the username, hostname, and port to be filled
        # in, but doesn't.  Since it's a URI and doesn't use the
        # scheme://netloc syntax, the netloc and related attributes
        # should be left empty.
        uri = "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15"
        for parse in (urllib.parse.urlsplit, urllib.parse.urlparse):
            p = parse(uri)
            self.assertEqual(p.netloc, "")
            for name in ('username', 'password', 'hostname', 'port'):
                self.assertEqual(getattr(p, name), None)
            self.assertEqual(p.geturl(), uri)

    def test_noslash(self):
        # Issue 1637: http://foo.com?query is legal
        result = urllib.parse.urlparse("http://example.com?blahblah=/foo")
        expected = ('http', 'example.com', '', '', 'blahblah=/foo', '')
        self.assertEqual(result, expected)
|
|
|
|
def test_main():
    """Run every test in UrlParseTestCase through support.run_unittest()."""
    case_classes = (UrlParseTestCase,)
    support.run_unittest(*case_classes)
|
|
|
|
if __name__ == "__main__":
    # Executed directly as a script: run the suite via test_main().
    test_main()
|