bpo-36216: Add check for characters in netloc that normalize to separators (GH-12201)

This commit is contained in:
Steve Dower 2019-03-07 08:02:26 -08:00 committed by GitHub
parent 1f58f4fa6a
commit 16e6f7dee7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 61 additions and 0 deletions

View file

@ -396,6 +396,21 @@ def _splitnetloc(url, start=0):
delim = min(delim, wdelim) # use earliest delim position
return url[start:delim], url[delim:] # return (domain, rest)
def _checknetloc(netloc):
if not netloc or netloc.isascii():
return
# looking for characters like \u2100 that expand to 'a/c'
# IDNA uses NFKC equivalence, so normalize for this check
import unicodedata
netloc2 = unicodedata.normalize('NFKC', netloc)
if netloc == netloc2:
return
_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
for c in '/?#@:':
if c in netloc2:
raise ValueError("netloc '" + netloc2 + "' contains invalid " +
"characters under NFKC normalization")
def urlsplit(url, scheme='', allow_fragments=True):
"""Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>
@ -424,6 +439,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
url, fragment = url.split('#', 1)
if '?' in url:
url, query = url.split('?', 1)
_checknetloc(netloc)
v = SplitResult('http', netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)
@ -447,6 +463,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
url, fragment = url.split('#', 1)
if '?' in url:
url, query = url.split('?', 1)
_checknetloc(netloc)
v = SplitResult(scheme, netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)