Patch #1462790: fix urllib2 ProxyHandler for host:port proxies

This commit is contained in:
Georg Brandl 2006-04-02 20:45:34 +00:00
parent 4eb521e595
commit 720096a6bf
2 changed files with 110 additions and 22 deletions

View file

@ -119,7 +119,8 @@ from urllib import (unwrap, unquote, splittype, splithost, quote,
# support for FileHandler, proxies via environment variables
from urllib import localhost, url2pathname, getproxies
__version__ = "2.5"
# used in User-Agent header sent
__version__ = sys.version[:3]
_opener = None
def urlopen(url, data=None):
@ -563,6 +564,80 @@ class HTTPRedirectHandler(BaseHandler):
"lead to an infinite loop.\n" \
"The last 30x error message was:\n"
def _parse_proxy(proxy):
"""Return (scheme, user, password, host/port) given a URL or an authority.
If a URL is supplied, it must have an authority (host:port) component.
According to RFC 3986, having an authority component means the URL must
have two slashes after the scheme:
>>> _parse_proxy('file:/ftp.example.com/')
Traceback (most recent call last):
ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
The first three items of the returned tuple may be None.
Examples of authority parsing:
>>> _parse_proxy('proxy.example.com')
(None, None, None, 'proxy.example.com')
>>> _parse_proxy('proxy.example.com:3128')
(None, None, None, 'proxy.example.com:3128')
The authority component may optionally include userinfo (assumed to be
username:password):
>>> _parse_proxy('joe:password@proxy.example.com')
(None, 'joe', 'password', 'proxy.example.com')
>>> _parse_proxy('joe:password@proxy.example.com:3128')
(None, 'joe', 'password', 'proxy.example.com:3128')
Same examples, but with URLs instead:
>>> _parse_proxy('http://proxy.example.com/')
('http', None, None, 'proxy.example.com')
>>> _parse_proxy('http://proxy.example.com:3128/')
('http', None, None, 'proxy.example.com:3128')
>>> _parse_proxy('http://joe:password@proxy.example.com/')
('http', 'joe', 'password', 'proxy.example.com')
>>> _parse_proxy('http://joe:password@proxy.example.com:3128')
('http', 'joe', 'password', 'proxy.example.com:3128')
Everything after the authority is ignored:
>>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
('ftp', 'joe', 'password', 'proxy.example.com')
Test for no trailing '/' case:
>>> _parse_proxy('http://joe:password@proxy.example.com')
('http', 'joe', 'password', 'proxy.example.com')
"""
from urlparse import _splitnetloc
scheme, r_scheme = splittype(proxy)
if not r_scheme.startswith("/"):
# authority
scheme = None
authority = proxy
else:
# URL
if not r_scheme.startswith("//"):
raise ValueError("proxy URL with no authority: %r" % proxy)
# We have an authority, so for RFC 3986-compliant URLs (by ss 3.
# and 3.3.), path is empty or starts with '/'
end = r_scheme.find("/", 2)
if end == -1:
end = None
authority = r_scheme[2:end]
userinfo, hostport = splituser(authority)
if userinfo is not None:
user, password = splitpasswd(userinfo)
else:
user = password = None
return scheme, user, password, hostport
class ProxyHandler(BaseHandler):
# Proxies must be in front
handler_order = 100
@ -579,30 +654,25 @@ class ProxyHandler(BaseHandler):
def proxy_open(self, req, proxy, type):
orig_type = req.get_type()
type, r_type = splittype(proxy)
if not type or r_type.isdigit():
# proxy is specified without protocol
type = orig_type
host = proxy
else:
host, r_host = splithost(r_type)
user_pass, host = splituser(host)
user, password = splitpasswd(user_pass)
proxy_type, user, password, hostport = _parse_proxy(proxy)
if proxy_type is None:
proxy_type = orig_type
if user and password:
user, password = user_pass.split(':', 1)
user_pass = base64.encodestring('%s:%s' % (unquote(user),
unquote(password))).strip()
req.add_header('Proxy-authorization', 'Basic ' + user_pass)
host = unquote(host)
req.set_proxy(host, type)
if orig_type == type:
user_pass = '%s:%s' % (unquote(user), unquote(password))
creds = base64.encodestring(user_pass).strip()
req.add_header('Proxy-authorization', 'Basic ' + creds)
hostport = unquote(hostport)
req.set_proxy(hostport, proxy_type)
if orig_type == proxy_type:
# let other handlers take care of it
# XXX this only makes sense if the proxy is before the
# other handlers
return None
else:
# need to start over, because the other handlers don't
# grok the proxy's URL type
# e.g. if we have a constructor arg proxies like so:
# {'http': 'ftp://proxy.example.com'}, we may end up turning
# a request for http://acme.example.com/a into one for
# ftp://proxy.example.com/a
return self.parent.open(req)
# feature suggested by Duncan Booth