Skip to content

gh-102153: Start stripping C0 control and space chars in urlsplit #102508

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
May 17, 2023
Merged
8 changes: 6 additions & 2 deletions Doc/library/urllib.parse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,9 @@ or on combining URL components into a URL string.
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
decomposed before parsing, no error will be raised.

Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline
``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL.
Following some of the `WHATWG spec`_ that updates RFC 3986, leading C0
control control and space characters are stripped from the URL. ``\n``,
``\r`` and tab ``\t`` characters are removed from the URL at any position.

.. versionchanged:: 3.6
Out-of-range port numbers now raise :exc:`ValueError`, instead of
Expand All @@ -338,6 +339,9 @@ or on combining URL components into a URL string.
.. versionchanged:: 3.10
ASCII newline and tab characters are stripped from the URL.

.. versionchanged:: 3.12
Leading WHATWG C0 control and space characters are stripped from the URL.

.. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser

.. function:: urlunsplit(parts)
Expand Down
61 changes: 60 additions & 1 deletion Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,14 +649,73 @@ def test_urlsplit_remove_unsafe_bytes(self):
self.assertEqual(p.scheme, "http")
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")

def test_urlsplit_strip_url(self):
noise = bytes(range(0, 0x20 + 1))
base_url = "http://User:[email protected]:080/doc/?query=yes#frag"

url = noise.decode("utf-8") + base_url
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, "http")
self.assertEqual(p.netloc, "User:[email protected]:080")
self.assertEqual(p.path, "/doc/")
self.assertEqual(p.query, "query=yes")
self.assertEqual(p.fragment, "frag")
self.assertEqual(p.username, "User")
self.assertEqual(p.password, "Pass")
self.assertEqual(p.hostname, "www.python.org")
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), base_url)

url = noise + base_url.encode("utf-8")
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, b"http")
self.assertEqual(p.netloc, b"User:[email protected]:080")
self.assertEqual(p.path, b"/doc/")
self.assertEqual(p.query, b"query=yes")
self.assertEqual(p.fragment, b"frag")
self.assertEqual(p.username, b"User")
self.assertEqual(p.password, b"Pass")
self.assertEqual(p.hostname, b"www.python.org")
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), base_url.encode("utf-8"))

# Test that trailing space is preserved as some applications rely on
# this within query strings.
query_spaces_url = "https://www.python.org:88/doc/?query= "
p = urllib.parse.urlsplit(noise.decode("utf-8") + query_spaces_url)
self.assertEqual(p.scheme, "https")
self.assertEqual(p.netloc, "www.python.org:88")
self.assertEqual(p.path, "/doc/")
self.assertEqual(p.query, "query= ")
self.assertEqual(p.port, 88)
self.assertEqual(p.geturl(), query_spaces_url)

p = urllib.parse.urlsplit("www.pypi.org ")
# That "hostname" gets considered a "path" due to the
# trailing space and our existing logic... YUCK...
# and re-assembles via geturl aka unurlsplit into the original.
# django.core.validators.URLValidator (at least through v3.2) relies on
# this, for better or worse, to catch it in a ValidationError via its
# regular expressions.
# Here we test the basic round trip concept of such a trailing space.
self.assertEqual(urllib.parse.urlunsplit(p), "www.pypi.org ")

# with scheme as cache-key
url = "//www.python.org/"
scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8")
for _ in range(2):
p = urllib.parse.urlsplit(url, scheme=scheme)
self.assertEqual(p.scheme, "https")
self.assertEqual(p.geturl(), "https://www.python.org/")

def test_attributes_bad_port(self):
"""Check handling of invalid ports."""
for bytes in (False, True):
for parse in (urllib.parse.urlsplit, urllib.parse.urlparse):
for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "६"):
with self.subTest(bytes=bytes, parse=parse, port=port):
netloc = "www.example.net:" + port
url = "http://" + netloc
url = "http://" + netloc + "/"
if bytes:
if netloc.isascii() and port.isascii():
netloc = netloc.encode("ascii")
Expand Down
12 changes: 12 additions & 0 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The testcases in
test_urlparse.py provides a good indicator of parsing behavior.

The WHATWG URL Parser spec should also be considered. We are not compliant with
it either due to existing user code API behavior expectations (Hyrum's Law).
It serves as a useful guide when making changes.
"""

from collections import namedtuple
Expand Down Expand Up @@ -79,6 +83,10 @@
'0123456789'
'+-.')

# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '

# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

Expand Down Expand Up @@ -452,6 +460,10 @@ def urlsplit(url, scheme='', allow_fragments=True):
"""

url, scheme, _coerce_result = _coerce_args(url, scheme)
# Only lstrip url as some applications rely on preserving trailing space.
# (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)

for b in _UNSAFE_URL_BYTES_TO_REMOVE:
url = url.replace(b, "")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
:func:`urllib.parse.urlsplit` now strips leading and trailing C0 control and
space characters following the controlling specification for URLs defined by
WHATWG in response to CVE-2023-24329. Patch by Illia Volochii.