gh-102153: Start stripping C0 control and space chars in urlsplit (#102508)

`urllib.parse.urlsplit` already partially follows the WHATWG URL spec (see #25595). This change adds further sanitizing to honor the "Remove any leading C0 control or space from input" [rule](https://url.spec.whatwg.org/#url-parsing:~:text=Remove%20any%20leading%20and%20trailing%20C0%20control%20or%20space%20from%20input.) in response to [CVE-2023-24329](https://nvd.nist.gov/vuln/detail/CVE-2023-24329).

---------

Co-authored-by: Gregory P. Smith [Google] <greg@krypto.org>
2025-11-25 04:34:37 +00:00 · 2023-05-17 11:49:20 +03:00 · 2023-05-17 11:49:20 +03:00 · 2f630e1ce1
commit 2f630e1ce1
parent b58bc8c2a9
4 changed files with 119 additions and 3 deletions
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -25,6 +25,10 @@ currently not entirely compliant with this RFC due to defacto
 scenarios for parsing, and for backward compatibility purposes, some
 parsing quirks from older RFCs are retained. The testcases in
 test_urlparse.py provides a good indicator of parsing behavior.
+
+The WHATWG URL Parser spec should also be considered.  We are not compliant with
+it either due to existing user code API behavior expectations (Hyrum's Law).
+It serves as a useful guide when making changes.
 """

 from collections import namedtuple
@@ -80,6 +84,10 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                '0123456789'
                '+-.')

+# Leading and trailing C0 control and space to be stripped per WHATWG spec.
+# == "".join([chr(i) for i in range(0, 0x20 + 1)])
+_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '
+
 # Unsafe bytes to be removed per WHATWG spec
 _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

@@ -464,6 +472,10 @@ def urlsplit(url, scheme='', allow_fragments=True):
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)
+    # Only lstrip url as some applications rely on preserving trailing space.
+    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
+    url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
+    scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)

    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(b, "")