Skip to content

Add URL and intersphinx mapping sanitization utilities to docs config #373

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Apr 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
sys.path.insert(0, os.path.abspath(".."))
from docs.utils import _validate_git_ref # noqa
from docs.utils import slugify_header # noqa
from docs.utils import sanitize_url # noqa
from docs.utils import sanitize_intersphinx_mapping # noqa

# Define the branch reference for linkcode_resolve
DOCS_BUILD_REF: str = _validate_git_ref(os.environ.get("DOCS_BUILD_REF", "stable"))
Expand Down Expand Up @@ -309,7 +311,7 @@
myst_gfm_only = False

myst_html_meta = {
"github_url": f"https://github.com/reactive-firewall/{project}"
"github_url": sanitize_url(f"https://github.com/reactive-firewall/{project}"),
}

# For GH-style admonitions to MyST conversion
Expand Down Expand Up @@ -419,7 +421,7 @@

# -- Link resolver -------------------------------------------------------------

linkcode_url_prefix: str = f"https://github.com/reactive-firewall/{project}"
linkcode_url_prefix: str = sanitize_url(f"https://github.com/reactive-firewall/{project}")

suffix = "/issues/%s"

Expand All @@ -432,9 +434,11 @@

# try to link with official python3 documentation.
# see https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html for more
intersphinx_mapping = {
"python": ("https://docs.python.org/3", (None, "python-inv.txt"))
}
intersphinx_mapping = sanitize_intersphinx_mapping(
{
"python": ("https://docs.python.org/3", (None, "python-inv.txt")),
},
)


def linkcode_resolve(domain, info):
Expand All @@ -450,4 +454,4 @@ def linkcode_resolve(domain, info):
theResult = theResult.replace("/multicast.py", "/multicast/__init__.py")
if "/tests.py" in theResult:
theResult = theResult.replace("/tests.py", "/tests/__init__.py")
return quote(theResult, safe=":/-._")
return sanitize_url(quote(theResult, safe=":/-._"))
194 changes: 194 additions & 0 deletions docs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# limitations under the License.

import re
from urllib.parse import urlparse, urlunparse, quote


# Git reference validation pattern
Expand All @@ -26,6 +27,110 @@
GIT_REF_PATTERN = r'^[a-zA-Z0-9][a-zA-Z0-9_\-./]*$'


# URL allowed scheme list
# Enforces:
#  - URLs must start with https
URL_ALLOWED_SCHEMES = frozenset({"https"})


# URL allowed domain list
# Enforces:
#  - URLs must belong to one of these domains (exact, case-sensitive netloc match)
URL_ALLOWED_NETLOCS = frozenset({"github.com", "readthedocs.com", "docs.python.org"})


# Maximum allowed URL length
MAX_URL_LENGTH = 2048  # Common browser limit
"""Maximum allowed length for URL validation.

    Should be large enough for most URLs but no larger than common browser limits.

    Unit-Testing:

    First set up test fixtures by importing utils.

        >>> import docs.utils as _utils
        >>>

        >>> _utils.MAX_URL_LENGTH is not None
        True
        >>> type(_utils.MAX_URL_LENGTH) is type(int())
        True
        >>> _utils.MAX_URL_LENGTH > 0
        True
        >>> _utils.MAX_URL_LENGTH >= 256
        True
        >>> _utils.MAX_URL_LENGTH <= 2048
        True
        >>>

"""


# Error messages for URL validation
INVALID_LENGTH_ERROR = f"URL exceeds maximum length of {MAX_URL_LENGTH} characters."
"""Length error message for URL validation.

    Unit-Testing:

    First set up test fixtures by importing utils.

        >>> import docs.utils as _utils
        >>>

        >>> _utils.INVALID_LENGTH_ERROR is not None
        True
        >>> type(_utils.INVALID_LENGTH_ERROR) is type(str())
        True
        >>> len(_utils.INVALID_LENGTH_ERROR) > 0
        True
        >>>

"""


INVALID_SCHEME_ERROR = "Invalid URL scheme. Only 'https' is allowed."
"""Scheme error message for URL validation.

    Unit-Testing:

    First set up test fixtures by importing utils.

        >>> import docs.utils as _utils
        >>>

        >>> _utils.INVALID_SCHEME_ERROR is not None
        True
        >>> type(_utils.INVALID_SCHEME_ERROR) is type(str())
        True
        >>> len(_utils.INVALID_SCHEME_ERROR) > 0
        True
        >>>

"""


# Render the allow-list as a sorted, comma-separated string. Interpolating the
# frozenset directly would leak "frozenset({...})" into the user-facing message
# and make its element order depend on string hash randomization.
INVALID_DOMAIN_ERROR = (
    "Invalid or untrusted domain. "
    f"Only {', '.join(sorted(URL_ALLOWED_NETLOCS))} are allowed."
)
"""Domain error message for URL validation.

    Unit-Testing:

    First set up test fixtures by importing utils.

        >>> import docs.utils as _utils
        >>>

        >>> _utils.INVALID_DOMAIN_ERROR is not None
        True
        >>> type(_utils.INVALID_DOMAIN_ERROR) is type(str())
        True
        >>> len(_utils.INVALID_DOMAIN_ERROR) > 0
        True
        >>> "frozenset" in _utils.INVALID_DOMAIN_ERROR
        False
        >>>

"""


def _validate_git_ref(ref: str) -> str:
"""
Validate if the provided string is a valid Git reference.
Expand Down Expand Up @@ -126,3 +231,92 @@ def slugify_header(s: str) -> str:
text = re.sub(r'[^\w\- ]', "", s).strip().lower()
# Then replace consecutive spaces or dashes with a single dash
return re.sub(r'[-\s]+', "-", text)


def _quote_preserving_escapes(component: str, safe: str) -> str:
    """Percent-encode component, leaving existing valid %XX escapes untouched."""
    # Keep every "%" while quoting so existing escapes are not turned into "%25XX".
    encoded = quote(component, safe=safe + "%")
    # Then encode only the "%" characters that do not begin a valid %XX escape.
    return re.sub(r"%(?![0-9A-Fa-f]{2})", "%25", encoded)


def sanitize_url(url: str) -> str:
    """
    Sanitize and validate a URL according to allowed schemes and domains.

    This function validates that the URL is not longer than MAX_URL_LENGTH, uses an
    allowed scheme (https) and points to a trusted domain, then percent-encodes its
    path and query components. Valid percent-escapes already present in the input
    are preserved, so the function is idempotent and safe to apply to a URL that
    was already passed through urllib.parse.quote. The params and fragment
    components are passed through unchanged.

    Args:
        url (str) -- The URL to sanitize.

    Returns:
        str -- The sanitized URL.

    Raises:
        ValueError -- If the URL is too long, has an invalid scheme, or points to an
            untrusted domain.


    Unit-Testing:

        Testcase 0: First set up test fixtures by importing utils.

            >>> import docs.utils as _utils
            >>>

        Testcase 1: Basic URL with spaces and special characters.

            >>> url_fxtr = "https://github.com/user/Hello World!"
            >>> _utils.sanitize_url(url_fxtr)
            'https://github.com/user/Hello%20World%21'
            >>>

        Testcase 2: Already encoded URLs are not double-encoded.

            >>> url_fxtr = "https://github.com/user/Hello%20World%21"
            >>> _utils.sanitize_url(url_fxtr)
            'https://github.com/user/Hello%20World%21'
            >>>

        Testcase 3: A stray percent sign is still encoded.

            >>> url_fxtr = "https://github.com/user/100%"
            >>> _utils.sanitize_url(url_fxtr)
            'https://github.com/user/100%25'
            >>>

    """
    # Validate length before parsing so oversized input is rejected cheaply.
    if len(url) > MAX_URL_LENGTH:
        raise ValueError(INVALID_LENGTH_ERROR)
    parsed_url = urlparse(url)
    # Validate scheme
    if parsed_url.scheme not in URL_ALLOWED_SCHEMES:
        raise ValueError(INVALID_SCHEME_ERROR)
    # Validate netloc (exact match: userinfo, ports and sub-domains are rejected)
    if parsed_url.netloc not in URL_ALLOWED_NETLOCS:
        raise ValueError(INVALID_DOMAIN_ERROR)
    # Sanitize path and query - using the safe parameter to preserve URL structure
    sanitized_path = _quote_preserving_escapes(parsed_url.path, safe="/=")
    sanitized_query = _quote_preserving_escapes(parsed_url.query, safe="&=")
    # Reconstruct the sanitized URL
    return urlunparse((
        parsed_url.scheme,
        parsed_url.netloc,
        sanitized_path,
        parsed_url.params,
        sanitized_query,
        parsed_url.fragment,
    ))


def sanitize_intersphinx_mapping(mapping: dict) -> dict:
    """
    Return a copy of an intersphinx mapping with every URL sanitized.

    Each value of the mapping must be a two-item sequence of (url, extra_value).
    The url is passed through sanitize_url, while the extra value is carried over
    untouched. The input mapping itself is not modified.

    Args:
        mapping (dict) -- A dictionary mapping names to tuples of (url, extra_value).

    Returns:
        dict -- A new dictionary with the same keys, in the same order, whose values
            are (sanitized_url, extra_value) tuples.

    Raises:
        ValueError -- Propagated from sanitize_url when a URL is rejected.

    Unit-Testing:

        Testcase 1: Basic intersphinx mapping.

            >>> mapping = {'python': ('https://docs.python.org/3', None)}
            >>> sanitize_intersphinx_mapping(mapping)
            {'python': ('https://docs.python.org/3', None)}

        Testcase 2: Mapping with URL containing special characters.

            >>> mapping = {'project': ('https://github.com/user/project with spaces', None)}
            >>> result = sanitize_intersphinx_mapping(mapping)
            >>> result['project'][0]
            'https://github.com/user/project%20with%20spaces'
            >>>

    """
    cleaned = {}
    for name, entry in mapping.items():
        # Unpacking enforces the two-item (url, extra_value) shape of each entry.
        target, inventory = entry
        cleaned[name] = (sanitize_url(target), inventory)
    return cleaned
7 changes: 7 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,13 @@ def loadDocstringsFromModule(module: types.ModuleType) -> TestSuite:
except Exception: # pragma: no branch
_LOGGER.warning("Error loading optional debug tests", exc_info=True)

try:
from tests import test_extra
depends.insert(11, test_extra)
EXTRA_TESTS["security"].append(test_extra.ExtraDocsUtilsTestSuite)
except Exception: # pragma: no branch
_LOGGER.warning("Error loading optional extra tests", exc_info=True)

try:
FUZZING_TESTS = {
"slow": [
Expand Down
1 change: 1 addition & 0 deletions tests/check_spelling
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ declare -a SPECIFIC_TYPOS=(
"sentance:sentence" # from #330
"reccomended:recommended" # from #348
"absolutly:absolutely" # from #348
"belone:belong" # from #373
)

function cleanup() {
Expand Down
96 changes: 96 additions & 0 deletions tests/test_extra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

# Multicast Python Module (Testing)
# ..................................
# Copyright (c) 2025, Mr. Walls
# ..................................
# Licensed under MIT (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# ..........................................
# https://www.github.com/reactive-firewall/multicast/LICENSE.md
# ..........................................
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Extra Test module for docs.utils functionality.

This module provides extra test cases for the docs.utils module, focusing on the
utils.sanitize_url method for url encoding.
"""


__module__ = "tests"


try:
try:
import context
except Exception as _: # pragma: no branch
del _ # skipcq - cleanup any error vars early
from . import context
if context.__name__ is None:
raise ModuleNotFoundError("[CWE-758] Failed to import context") from None
else:
from context import unittest
import docs.utils
except Exception as err:
raise ImportError("[CWE-758] Failed to import test context") from err


@context.markWithMetaTag("extra", "security")
class ExtraDocsUtilsTestSuite(context.BasicUsageTestSuite):
    """Test cases for docs.utils module.

    Exercises docs.utils.sanitize_url against a table of input URLs and the
    exact sanitized output expected for each of them.
    """

    __module__ = "tests.test_extra"

    # Each fixture pairs a raw URL with the exact output expected from sanitize_url.
    URL_TEST_FIXTURES = [
        {
            "input_url": "https://github.com/user/repo",
            "expected": "https://github.com/user/repo",
        },
        {
            "input_url": "https://github.com/user/repo with spaces",
            "expected": "https://github.com/user/repo%20with%20spaces",
        },
        {
            # sanitize_url quotes the query with safe="&=", so the query
            # delimiters are preserved and this URL is returned unchanged.
            "input_url": "https://github.com/user/repo?q=test&sort=desc",
            "expected": "https://github.com/user/repo?q=test&sort=desc",
        },
        {
            "input_url": "https://github.com/user/repo#section",
            "expected": "https://github.com/user/repo#section",
        },
        {
            "input_url": "https://github.com/user/repo/<script>alert('xss')</script>",
            "expected": "https://github.com/user/repo/%3Cscript%3Ealert%28%27xss%27%29%3C/script%3E",
        },
    ]

    def test_sanitize_url_GIVEN_raw_url_IS_reliable(self) -> None:
        """Test case 1: Test to ensure reliable URL sanitization."""
        for test_params in self.URL_TEST_FIXTURES:
            # subTest reports every failing fixture instead of stopping at the first.
            with self.subTest(input_url=test_params["input_url"]):
                sanitized_url = docs.utils.sanitize_url(test_params["input_url"])
                # check for results
                self.assertIsNotNone(sanitized_url)
                # Verify results
                if test_params["input_url"] == test_params["expected"]:
                    self.assertEqual(
                        test_params["input_url"], sanitized_url,
                        "Input and output URLs were different, should be the same.",
                    )
                else:
                    self.assertNotEqual(
                        test_params["input_url"], sanitized_url,
                        "Input and output URLs were the same, should be different.",
                    )
                self.assertEqual(sanitized_url, test_params["expected"])


if __name__ == '__main__':
unittest.main()
Loading