bpo-43882: urllib.parse should sanitize URLs containing ASCII newlines and tabs. (GH-25595)

* issue43882 - urllib.parse should sanitize URLs containing ASCII newlines and tabs.

Co-authored-by: Gregory P. Smith <greg@krypto.org>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-08-03 16:39:00 +00:00 · 2021-04-29 10:16:50 -07:00 · 2021-04-29 10:16:50 -07:00 · 76cd81d603
commit 76cd81d603
parent 14fc2bdfab
4 changed files with 54 additions and 0 deletions
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -78,6 +78,9 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                '0123456789'
                '+-.')

+# Unsafe characters (ASCII tab, CR, LF) to be stripped from URLs per the WHATWG URL spec
+_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
+
 # XXX: Consider replacing with functools.lru_cache
 MAX_CACHE_SIZE = 20
 _parse_cache = {}
@@ -469,6 +472,9 @@ def urlsplit(url, scheme='', allow_fragments=True):
        else:
            scheme, url = url[:i].lower(), url[i+1:]

+    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
+        url = url.replace(b, "")
+
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or