GH-123599: url2pathname(): handle authority section in file URL (#126844)

In `urllib.request.url2pathname()`, if the authority resolves to the
current host, discard it. If an authority is present but resolves somewhere
else, then on Windows we return a UNC path (as before), and on other
platforms we raise `URLError`.

Affects `pathlib.Path.from_uri()` in the same way.

Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
This commit is contained in:
Barney Gale 2025-04-10 20:58:04 +01:00 committed by GitHub
parent a214db0c54
commit 66cdb2bd8a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 106 additions and 48 deletions

View file

@ -871,6 +871,12 @@ conforming to :rfc:`8089`.
.. versionadded:: 3.13
.. versionchanged:: next
If a URL authority (e.g. a hostname) is present and resolves to a local
address, it is discarded. If an authority is present and *doesn't*
resolve to a local address, then on Windows a UNC path is returned (as
before), and on other platforms a :exc:`ValueError` is raised.
.. method:: Path.as_uri()

View file

@ -158,16 +158,16 @@ The :mod:`urllib.request` module defines the following functions:
>>> 'file:' + pathname2url(path)
'file:///C:/Program%20Files'
.. versionchanged:: 3.14
Paths beginning with a slash are converted to URLs with authority
sections. For example, the path ``/etc/hosts`` is converted to
the URL ``///etc/hosts``.
.. versionchanged:: 3.14
Windows drive letters are no longer converted to uppercase, and ``:``
characters not following a drive letter no longer cause an
:exc:`OSError` exception to be raised on Windows.
.. versionchanged:: 3.14
Paths beginning with a slash are converted to URLs with authority
sections. For example, the path ``/etc/hosts`` is converted to
the URL ``///etc/hosts``.
.. function:: url2pathname(url)
@ -186,6 +186,13 @@ The :mod:`urllib.request` module defines the following functions:
characters not following a drive letter no longer cause an
:exc:`OSError` exception to be raised on Windows.
.. versionchanged:: next
This function calls :func:`socket.gethostbyname` if the URL authority
is neither empty nor ``localhost``. If the authority resolves to a local
IP address then it is discarded; otherwise, on Windows a UNC path is
returned (as before), and on other platforms a
:exc:`~urllib.error.URLError` is raised.
.. function:: getproxies()

View file

@ -1197,6 +1197,25 @@ urllib
supporting SHA-256 digest authentication as specified in :rfc:`7616`.
(Contributed by Calvin Bui in :gh:`128193`.)
* Improve standards compliance when parsing and emitting ``file:`` URLs.
In :func:`urllib.request.url2pathname`:
- Discard URL authorities that resolve to a local IP address.
- Raise :exc:`~urllib.error.URLError` if a URL authority doesn't resolve
to a local IP address, except on Windows where a UNC path is returned.
In :func:`urllib.request.pathname2url`:
- Include an empty URL authority when a path begins with a slash. For
example, the path ``/etc/hosts`` is converted to the URL ``///etc/hosts``.
On Windows, drive letters are no longer converted to uppercase, and ``:``
characters not following a drive letter no longer cause an :exc:`OSError`
exception to be raised.
(Contributed by Barney Gale in :gh:`125866`.)
uuid
----

View file

@ -1278,8 +1278,12 @@ class Path(PurePath):
"""Return a new path from the given 'file' URI."""
if not uri.startswith('file:'):
raise ValueError(f"URI does not start with 'file:': {uri!r}")
from urllib.error import URLError
from urllib.request import url2pathname
path = cls(url2pathname(uri.removeprefix('file:')))
try:
path = cls(url2pathname(uri.removeprefix('file:')))
except URLError as exc:
raise ValueError(exc.reason) from None
if not path.is_absolute():
raise ValueError(f"URI is not absolute: {uri!r}")
return path

View file

@ -3285,10 +3285,14 @@ class PathTest(PurePathTest):
def test_from_uri_posix(self):
P = self.cls
self.assertEqual(P.from_uri('file:/foo/bar'), P('/foo/bar'))
self.assertEqual(P.from_uri('file://foo/bar'), P('//foo/bar'))
self.assertRaises(ValueError, P.from_uri, 'file://foo/bar')
self.assertEqual(P.from_uri('file:///foo/bar'), P('/foo/bar'))
self.assertEqual(P.from_uri('file:////foo/bar'), P('//foo/bar'))
self.assertEqual(P.from_uri('file://localhost/foo/bar'), P('/foo/bar'))
if not is_wasi:
self.assertEqual(P.from_uri('file://127.0.0.1/foo/bar'), P('/foo/bar'))
self.assertEqual(P.from_uri(f'file://{socket.gethostname()}/foo/bar'),
P('/foo/bar'))
self.assertRaises(ValueError, P.from_uri, 'foo/bar')
self.assertRaises(ValueError, P.from_uri, '/foo/bar')
self.assertRaises(ValueError, P.from_uri, '//foo/bar')

View file

@ -11,6 +11,7 @@ from test import support
from test.support import os_helper
from test.support import socket_helper
import os
import socket
try:
import ssl
except ImportError:
@ -1424,6 +1425,17 @@ class Pathname_Tests(unittest.TestCase):
"url2pathname() failed; %s != %s" %
(expect, result))
def test_pathname2url(self):
# Test cases common to Windows and POSIX.
fn = urllib.request.pathname2url
sep = os.path.sep
self.assertEqual(fn(''), '')
self.assertEqual(fn(sep), '///')
self.assertEqual(fn('a'), 'a')
self.assertEqual(fn(f'a{sep}b.c'), 'a/b.c')
self.assertEqual(fn(f'{sep}a{sep}b.c'), '///a/b.c')
self.assertEqual(fn(f'{sep}a{sep}b%#c'), '///a/b%25%23c')
@unittest.skipUnless(sys.platform == 'win32',
'test specific to Windows pathnames.')
def test_pathname2url_win(self):
@ -1466,12 +1478,9 @@ class Pathname_Tests(unittest.TestCase):
'test specific to POSIX pathnames')
def test_pathname2url_posix(self):
fn = urllib.request.pathname2url
self.assertEqual(fn('/'), '///')
self.assertEqual(fn('/a/b.c'), '///a/b.c')
self.assertEqual(fn('//a/b.c'), '////a/b.c')
self.assertEqual(fn('///a/b.c'), '/////a/b.c')
self.assertEqual(fn('////a/b.c'), '//////a/b.c')
self.assertEqual(fn('/a/b%#c'), '///a/b%25%23c')
@unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII')
def test_pathname2url_nonascii(self):
@ -1480,11 +1489,25 @@ class Pathname_Tests(unittest.TestCase):
url = urllib.parse.quote(os_helper.FS_NONASCII, encoding=encoding, errors=errors)
self.assertEqual(urllib.request.pathname2url(os_helper.FS_NONASCII), url)
def test_url2pathname(self):
# Test cases common to Windows and POSIX.
fn = urllib.request.url2pathname
sep = os.path.sep
self.assertEqual(fn(''), '')
self.assertEqual(fn('/'), f'{sep}')
self.assertEqual(fn('///'), f'{sep}')
self.assertEqual(fn('////'), f'{sep}{sep}')
self.assertEqual(fn('foo'), 'foo')
self.assertEqual(fn('foo/bar'), f'foo{sep}bar')
self.assertEqual(fn('/foo/bar'), f'{sep}foo{sep}bar')
self.assertEqual(fn('//localhost/foo/bar'), f'{sep}foo{sep}bar')
self.assertEqual(fn('///foo/bar'), f'{sep}foo{sep}bar')
self.assertEqual(fn('////foo/bar'), f'{sep}{sep}foo{sep}bar')
@unittest.skipUnless(sys.platform == 'win32',
'test specific to Windows pathnames.')
def test_url2pathname_win(self):
fn = urllib.request.url2pathname
self.assertEqual(fn('/'), '\\')
self.assertEqual(fn('/C:/'), 'C:\\')
self.assertEqual(fn("///C|"), 'C:')
self.assertEqual(fn("///C:"), 'C:')
@ -1530,11 +1553,13 @@ class Pathname_Tests(unittest.TestCase):
'test specific to POSIX pathnames')
def test_url2pathname_posix(self):
fn = urllib.request.url2pathname
self.assertEqual(fn('/foo/bar'), '/foo/bar')
self.assertEqual(fn('//foo/bar'), '//foo/bar')
self.assertEqual(fn('///foo/bar'), '/foo/bar')
self.assertEqual(fn('////foo/bar'), '//foo/bar')
self.assertEqual(fn('//localhost/foo/bar'), '/foo/bar')
self.assertRaises(urllib.error.URLError, fn, '//foo/bar')
self.assertRaises(urllib.error.URLError, fn, '//localhost:/foo/bar')
self.assertRaises(urllib.error.URLError, fn, '//:80/foo/bar')
self.assertRaises(urllib.error.URLError, fn, '//:/foo/bar')
self.assertRaises(urllib.error.URLError, fn, '//c:80/foo/bar')
self.assertEqual(fn('//127.0.0.1/foo/bar'), '/foo/bar')
self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar'), '/foo/bar')
@unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII')
def test_url2pathname_nonascii(self):

View file

@ -1450,16 +1450,6 @@ def parse_http_list(s):
return [part.strip() for part in res]
class FileHandler(BaseHandler):
# Use local file or FTP depending on form of URL
def file_open(self, req):
url = req.selector
if url[:2] == '//' and url[2:3] != '/' and (req.host and
req.host != 'localhost'):
if not req.host in self.get_names():
raise URLError("file:// scheme is supported only on localhost")
else:
return self.open_local_file(req)
# names for the localhost
names = None
def get_names(self):
@ -1476,8 +1466,7 @@ class FileHandler(BaseHandler):
def open_local_file(self, req):
import email.utils
import mimetypes
host = req.host
filename = req.selector
filename = _splittype(req.full_url)[1]
localfile = url2pathname(filename)
try:
stats = os.stat(localfile)
@ -1487,21 +1476,21 @@ class FileHandler(BaseHandler):
headers = email.message_from_string(
'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
(mtype or 'text/plain', size, modified))
if host:
host, port = _splitport(host)
if not host or \
(not port and _safe_gethostbyname(host) in self.get_names()):
origurl = 'file:' + pathname2url(localfile)
return addinfourl(open(localfile, 'rb'), headers, origurl)
origurl = f'file:{pathname2url(localfile)}'
return addinfourl(open(localfile, 'rb'), headers, origurl)
except OSError as exp:
raise URLError(exp, exp.filename)
raise URLError('file not on local host')
def _safe_gethostbyname(host):
file_open = open_local_file
def _is_local_authority(authority):
if not authority or authority == 'localhost':
return True
try:
return socket.gethostbyname(host)
except socket.gaierror:
return None
address = socket.gethostbyname(authority)
except (socket.gaierror, AttributeError):
return False
return address in FileHandler().get_names()
class FTPHandler(BaseHandler):
def ftp_open(self, req):
@ -1649,16 +1638,13 @@ class DataHandler(BaseHandler):
def url2pathname(url):
"""OS-specific conversion from a relative URL of the 'file' scheme
to a file system path; not recommended for general use."""
if url[:3] == '///':
# Empty authority section, so the path begins on the third character.
url = url[2:]
elif url[:12] == '//localhost/':
# Skip past 'localhost' authority.
url = url[11:]
authority, url = _splithost(url)
if os.name == 'nt':
if url[:3] == '///':
# Skip past extra slash before UNC drive in URL path.
if not _is_local_authority(authority):
# e.g. file://server/share/file.txt
url = '//' + authority + url
elif url[:3] == '///':
# e.g. file://///server/share/file.txt
url = url[1:]
else:
if url[:1] == '/' and url[2:3] in (':', '|'):
@ -1668,6 +1654,8 @@ def url2pathname(url):
# Older URLs use a pipe after a drive letter
url = url[:1] + ':' + url[2:]
url = url.replace('/', '\\')
elif not _is_local_authority(authority):
raise URLError("file:// scheme is supported only on localhost")
encoding = sys.getfilesystemencoding()
errors = sys.getfilesystemencodeerrors()
return unquote(url, encoding=encoding, errors=errors)

View file

@ -0,0 +1,5 @@
Fix issue where :func:`urllib.request.url2pathname` mishandled file URLs with
authorities. If an authority is present and resolves to a local address, it is
now discarded. If an authority is present but *doesn't* resolve to a local
address, then on Windows a UNC path is returned (as before), and on
other platforms a :exc:`urllib.error.URLError` is now raised.