Skip to content

bpo-44002: Switch to lru_cache in urllib.parse. #25798

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 12, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import unicodedata
import unittest
import urllib.parse
import warnings

RFC1808_BASE = "http://a/b/c/d;p?q#f"
RFC2396_BASE = "http://a/b/c/d;p?q"
Expand Down Expand Up @@ -1035,8 +1036,29 @@ def test_telurl_params(self):
self.assertEqual(p1.path, '863-1234')
self.assertEqual(p1.params, 'phone-context=+1-914-555')

def test_clear_cache_deprecation(self):
    """urllib.parse.clear_cache() must emit exactly one deprecation warning.

    The expected category is PendingDeprecationWarning for every 3.11 (or
    older) interpreter and DeprecationWarning from 3.12 onwards.
    """
    with warnings.catch_warnings(record=True) as w:
        # Record every warning, even ones already shown once elsewhere.
        warnings.simplefilter("always")
        urllib.parse.clear_cache()
    self.assertEqual(len(w), 1, msg=repr(w))
    # Compare only (major, minor).  The full sys.version_info tuple of any
    # 3.11 build, e.g. (3, 11, 0, 'alpha', 0), sorts *after* (3, 11), so an
    # unsliced "<=" would wrongly select DeprecationWarning on 3.11 itself.
    if sys.version_info[:2] <= (3, 11):
        exc = PendingDeprecationWarning
    else:
        exc = DeprecationWarning
    self.assertIs(w[0].category, exc)
    self.assertIn('clear_cache() will be removed', str(w[0].message))

def test_Quoter_deprecation(self):
    """Reading urllib.parse.Quoter warns once and yields the private class.

    The expected category is PendingDeprecationWarning for every 3.11 (or
    older) interpreter and DeprecationWarning from 3.12 onwards.
    """
    with warnings.catch_warnings(record=True) as w:
        # Record every warning, even ones already shown once elsewhere.
        warnings.simplefilter("always")
        old_class = urllib.parse.Quoter
    self.assertIs(old_class, urllib.parse._Quoter)
    self.assertEqual(len(w), 1, msg=repr(w))
    # Compare only (major, minor).  The full sys.version_info tuple of any
    # 3.11 build, e.g. (3, 11, 0, 'alpha', 0), sorts *after* (3, 11), so an
    # unsliced "<=" would wrongly select DeprecationWarning on 3.11 itself.
    if sys.version_info[:2] <= (3, 11):
        exc = PendingDeprecationWarning
    else:
        exc = DeprecationWarning
    self.assertIs(w[0].category, exc)
    self.assertIn('Quoter will be removed', str(w[0].message))

def test_Quoter_repr(self):
# Checks that the repr() of a quoter instance contains the text 'Quoter'.
# NOTE(review): the two consecutive assignments to `quoter` look like the
# removed and the added side of the diff shown together; presumably only the
# `_Quoter` line remains in the real file -- confirm. Taken literally, the
# first assignment reads the public `Quoter` name and its result is discarded.
quoter = urllib.parse.Quoter(urllib.parse._ALWAYS_SAFE)
quoter = urllib.parse._Quoter(urllib.parse._ALWAYS_SAFE)
self.assertIn('Quoter', repr(quoter))

def test_all(self):
Expand Down
62 changes: 32 additions & 30 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,11 @@
test_urlparse.py provides a good indicator of parsing behavior.
"""

from collections import namedtuple
import functools
import re
import sys
import types
import collections
import warnings

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
Expand Down Expand Up @@ -81,15 +82,15 @@
# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
"""Clear the parse cache and the quoters cache."""
# NOTE(review): this listing appears to interleave both sides of the diff.
# The two dict `.clear()` calls presumably belong to the removed hand-rolled
# caches, and the `cache_clear()` calls at the end replace them -- confirm
# against the real file before relying on this body.
_parse_cache.clear()
_safe_quoters.clear()

# Announce the deprecation (removal planned for Python 3.14).
# NOTE(review): no stacklevel is passed here, whereas the module __getattr__
# hook for `Quoter` passes stacklevel=2; consider doing the same so the
# warning is attributed to the caller rather than to this line.
warnings.warn(
'Deprecated in 3.11. '
'urllib.parse.clear_cache() will be removed in Python 3.14. '
'Use urllib.parse.urlsplit.cache_clear() on Python 3.11 or later.',
PendingDeprecationWarning)
# Reset the caches attached to urlsplit and _byte_quoter_factory.
urlsplit.cache_clear()
_byte_quoter_factory.cache_clear()

# Helpers for bytes handling
# For 3.2, we deliberately require applications that
Expand Down Expand Up @@ -243,8 +244,6 @@ def _hostinfo(self):
return hostname, port


from collections import namedtuple

_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
'SplitResult', 'scheme netloc path query fragment')
Expand Down Expand Up @@ -434,6 +433,7 @@ def _checknetloc(netloc):
raise ValueError("netloc '" + netloc + "' contains invalid " +
"characters under NFKC normalization")

@functools.lru_cache
def urlsplit(url, scheme='', allow_fragments=True):
"""Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>
Expand All @@ -457,12 +457,6 @@ def urlsplit(url, scheme='', allow_fragments=True):

url, scheme, _coerce_result = _coerce_args(url, scheme)
allow_fragments = bool(allow_fragments)
key = url, scheme, allow_fragments, type(url), type(scheme)
cached = _parse_cache.get(key, None)
if cached:
return _coerce_result(cached)
if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
clear_cache()
netloc = query = fragment = ''
i = url.find(':')
if i > 0:
Expand All @@ -486,7 +480,6 @@ def urlsplit(url, scheme='', allow_fragments=True):
url, query = url.split('?', 1)
_checknetloc(netloc)
v = SplitResult(scheme, netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)

def urlunparse(components):
Expand Down Expand Up @@ -789,23 +782,30 @@ def unquote_plus(string, encoding='utf-8', errors='replace'):
b'0123456789'
b'_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}

class Quoter(collections.defaultdict):
"""A mapping from bytes (in range(0,256)) to strings.
def __getattr__(name):
    """Module-level attribute hook serving the deprecated ``Quoter`` name.

    Looking up ``Quoter`` issues a PendingDeprecationWarning (attributed to
    the caller through ``stacklevel=2``) and returns the private ``_Quoter``
    class.  Any other name raises AttributeError.
    """
    if name != 'Quoter':
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
    message = ('Deprecated in 3.11. '
               'urllib.parse.Quoter will be removed in Python 3.14. '
               'It was not intended to be a public API.')
    warnings.warn(message, PendingDeprecationWarning, stacklevel=2)
    return _Quoter

class _Quoter(dict):
"""A mapping from bytes numbers (in range(0,256)) to strings.

String values are percent-encoded byte values, unless the key < 128, and
in the "safe" set (either the specified safe set, or default set).
in either of the specified safe set, or the always safe set.
"""
# Keeps a cache internally, using defaultdict, for efficiency (lookups
# Keeps a cache internally, via __missing__, for efficiency (lookups
# of cached keys don't call Python code at all).
def __init__(self, safe):
"""safe: bytes object."""
self.safe = _ALWAYS_SAFE.union(safe)

def __repr__(self):
# Without this, will just display as a defaultdict
return "<%s %r>" % (self.__class__.__name__, dict(self))
return f"<Quoter {dict(self)!r}>"

def __missing__(self, b):
# Handle a cache miss. Store quoted string in cache and return.
Expand Down Expand Up @@ -884,6 +884,11 @@ def quote_plus(string, safe='', encoding=None, errors=None):
string = quote(string, safe + space, encoding, errors)
return string.replace(' ', '+')

# Few distinct `safe` values are expected in practice: a typical program is
# unlikely to create more than 5 of these.
@functools.lru_cache
def _byte_quoter_factory(safe):
    """Return the per-byte quoting callable for the *safe* byte set.

    A ``_Quoter`` is built for *safe* and its bound ``__getitem__`` is
    returned; ``functools.lru_cache`` memoizes the result per *safe* value.
    """
    quoter = _Quoter(safe)
    return quoter.__getitem__

def quote_from_bytes(bs, safe='/'):
"""Like quote(), but accepts a bytes object rather than a str, and does
not perform string-to-bytes encoding. It always returns an ASCII string.
Expand All @@ -897,14 +902,11 @@ def quote_from_bytes(bs, safe='/'):
# Normalize 'safe' by converting to bytes and removing non-ASCII chars
safe = safe.encode('ascii', 'ignore')
else:
safe = bytes([c for c in safe if c < 128])
safe = bytes(c for c in safe if c < 128)
if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
return bs.decode()
try:
quoter = _safe_quoters[safe]
except KeyError:
_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
return ''.join([quoter(char) for char in bs])
quoter = _byte_quoter_factory(safe)
return ''.join(quoter(char_num) for char_num in bs)

def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
quote_via=quote_plus):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
:mod:`urllib.parse` now uses :func:`functools.lru_cache` for its internal URL
splitting and quoting caches instead of rolling its own like it's the 90s.

The undocumented internal :mod:`urllib.parse` ``Quoter`` class and
``clear_cache()`` APIs are now deprecated, for removal in 3.14.