Fix Issue754016 - urlparse goes wrong with IP:port without scheme

2025-09-26 18:29:57 +00:00 · 2010-08-04 04:50:44 +00:00 · 2010-08-04 04:50:44 +00:00 · 84c7d9f87b
commit 84c7d9f87b
parent 4aa0d4d2d0
3 changed files with 44 additions and 5 deletions
--- a/Doc/library/urllib.parse.rst
+++ b/Doc/library/urllib.parse.rst
@ -48,6 +48,23 @@ The :mod:`urllib.parse` module defines the following functions:
      >>> o.geturl()
      'http://www.cwi.nl:80/%7Eguido/Python.html'
   If the scheme value is not specified, urlparse, following the syntax
   specifications from RFC 1808, expects the netloc value to start with '//'.
   Otherwise, it is not possible to distinguish between the netloc and path
   components, and the indistinguishable component is classified as the path,
   as in a relative URL.
       >>> from urllib.parse import urlparse
       >>> urlparse('//www.cwi.nl:80/%7Eguido/Python.html')
       ParseResult(scheme='', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',
                  params='', query='', fragment='')
       >>> urlparse('www.cwi.nl:80/%7Eguido/Python.html')
       ParseResult(scheme='', netloc='', path='www.cwi.nl:80/%7Eguido/Python.html',
                  params='', query='', fragment='')
       >>> urlparse('help/Python.html')
       ParseResult(scheme='', netloc='', path='help/Python.html', params='',
                  query='', fragment='')
   If the *scheme* argument is specified, it gives the default addressing
   scheme, to be used only if the URL does not specify one.  The default value for
   this argument is the empty string.
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@ -461,6 +461,27 @@ class UrlParseTestCase(unittest.TestCase):
        self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"),
                         ('http', 'example.com', '', '', 'blahblah=/foo', ''))
    def test_withoutscheme(self):
        # Test urlparse without scheme
        # Issue 754016: urlparse goes wrong with IP:port without scheme
        # RFC 1808 specifies that netloc should start with //, urlparse expects
        # the same, otherwise it classifies the portion of url as path.
        self.assertEqual(urllib.parse.urlparse("path"),
                ('','','path','','',''))
        self.assertEqual(urllib.parse.urlparse("//www.python.org:80"),
                ('','www.python.org:80','','','',''))
        self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
                ('http','www.python.org:80','','','',''))
    def test_portseparator(self):
        # Issue 754016 makes changes for port separator ':' from scheme separator
        self.assertEqual(urllib.parse.urlparse("path:80"),
                ('','','path:80','','',''))
        self.assertEqual(urllib.parse.urlparse("http:"),('http','','','','',''))
        self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','',''))
        self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
                ('http','www.python.org:80','','','',''))
    def test_usingsys(self):
        # Issue 3314: sys module is used in the error
        self.assertRaises(TypeError, urllib.parse.urlencode, "foo")
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@ -192,6 +192,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        if url.endswith(':') or not url[i+1].isdigit():
            for c in url[:i]:
                if c not in scheme_chars:
                    break