Skip to content

Commit 337727b

Browse files
committed
feat(core): add max_items functionality, so memory usage is decreased
1 parent 0704349 commit 337727b

File tree

4 files changed

+76
-5
lines changed

4 files changed

+76
-5
lines changed

class_cache/core.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,63 @@
1+
import math
12
from abc import abstractmethod
23
from typing import Any, Callable, ClassVar, Iterable
34

45
from replete.consistent_hash import consistent_hash
56

67
from class_cache.backends import SQLiteBackend
8+
from class_cache.lru_queue import LRUQueue
79
from class_cache.types import CacheInterface, IdType, KeyType, ValueType
810

911
DEFAULT_BACKEND_TYPE = SQLiteBackend
1012

1113

1214
class Cache(CacheInterface[KeyType, ValueType]):
15+
"""
16+
:param max_items: Maximum number of items to keep in memory
17+
:param flush_ratio: Fraction of stored items to write to backend when memory is full
18+
ceiling will be used to calculate the final amount
19+
"""
20+
1321
def __init__(
    self,
    id_: IdType = None,
    backend_type: type[CacheInterface] | Callable[[IdType], CacheInterface] = DEFAULT_BACKEND_TYPE,
    max_items: int = 128,
    *,
    flush_ratio: float = 0.1,
) -> None:
    """Create a cache that keeps at most ``max_items`` entries in memory.

    :param id_: Identifier forwarded to the backend factory.
    :param backend_type: Class or factory that produces the persistent backend.
    :param max_items: Maximum number of items to keep in memory.
    :param flush_ratio: Fraction of stored items to write to the backend when
        memory is full; the eviction count is rounded up with ``math.ceil``.
    """
    super().__init__(id_)
    self._backend = backend_type(id_)

    self._max_items = max_items
    # Pre-compute how many LRU entries each flush evicts.
    # NOTE(review): flush_ratio == 0 yields _flush_amount == 0, so
    # _check_max_items would never free memory — consider validating > 0.
    self._flush_amount = math.ceil(self._max_items * flush_ratio)
    self._lru_queue = LRUQueue()

    # In-memory store plus dirty/tombstone bookkeeping for the backend.
    self._data: dict[KeyType, ValueType] = {}
    self._to_write: set[KeyType] = set()
    self._to_delete: set[KeyType] = set()
2639

2740
@property
2841
def backend(self) -> CacheInterface:
2942
return self._backend
3043

3144
def __contains__(self, key: KeyType) -> bool:
    """Report whether *key* is available in memory or in the backend."""
    # In-memory hit: refresh its LRU position as a side effect.
    if key in self._data:
        self._lru_queue.update(key)
        return True
    # A pending deletion shadows whatever the backend still holds.
    if key in self._to_delete:
        return False
    return key in self._backend
3549

3650
def __setitem__(self, key: KeyType, value: ValueType) -> None:
    """Store *value* under *key* and mark it dirty for the next flush."""
    self._to_write.add(key)
    self._data[key] = value
    # Most-recently-used bookkeeping, then evict if over capacity.
    self._lru_queue.update(key)
    self._check_max_items()
3955

4056
def __getitem__(self, key: KeyType) -> ValueType:
    """Return the value for *key*, loading it from the backend on a miss."""
    miss = key not in self._data
    if miss:
        # Pull from the backend, then enforce the in-memory cap.
        self._data[key] = self._backend[key]
        self._check_max_items()
    # NOTE(review): assumes every access (hit or miss) refreshes the LRU
    # position — confirm against the original indentation.
    self._lru_queue.update(key)
    return self._data[key]
4462

4563
def __iter__(self) -> Iterable[KeyType]:
@@ -54,6 +72,8 @@ def __delitem__(self, key: KeyType) -> None:
5472
# Check that key is present. Can't check self._data, since it can be unloaded
5573
if key not in self:
5674
raise KeyError(key)
75+
if key in self._data:
76+
del self._lru_queue[key]
5777
self._data.pop(key, None)
5878
self._to_delete.add(key)
5979

@@ -69,8 +89,20 @@ def clear(self) -> None:
6989
self._data = {}
7090
self._to_write = set()
7191
self._to_delete = set()
92+
self._lru_queue.clear()
93+
94+
def _check_max_items(self) -> None:
    """Evict least-recently-used entries once the in-memory cap is exceeded."""
    if len(self._data) <= self._max_items:
        return

    evicted = self._lru_queue.pop_many(self._flush_amount)
    # Persist pending writes first so no dirty value is lost on eviction.
    if not self._to_write.isdisjoint(evicted):
        self.write()
    for key in evicted:
        self._data.pop(key)
72103

73104

105+
# TODO: Refactor this, this should use composition, not inheritance. Maybe a wrapper.
74106
class CacheWithDefault(Cache[KeyType, ValueType]):
75107
VERSION = 0
76108
NON_HASH_ATTRIBUTES: ClassVar[frozenset[str]] = frozenset(

class_cache/lru_queue.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def pop(self) -> KeyType:
9292
return last.key
9393

9494
def pop_many(self, count: int) -> list[KeyType]:
95-
if self._check_empty(no_raise=True):
95+
if count == 0 or self._check_empty(no_raise=True):
9696
return []
9797

9898
first = self._root.prev
@@ -128,5 +128,10 @@ def __str__(self) -> str:
128128
result += f"{key} -> "
129129
return result[:-4]
130130

131+
def clear(self) -> None:
    """Remove every entry, leaving only the sentinel root node."""
    # Identity check (`is`, not `==`): an empty queue is the root node
    # linked to itself; equality could be hijacked by a node's __eq__.
    if self._root.next is self._root:
        return
    # NOTE(review): verify _cut also drops the key->node mapping so
    # key lookups don't go stale after a clear.
    self._cut(self._root.next, self._root)
135+
131136

132137
__all__ = ["LRUQueue"]

tests/test_core.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
import resource
2+
3+
import numpy as np
4+
15
from class_cache import Cache, CacheWithDefault
26

37
TEST_DICT = {1: "foo", "foo": "bar", (2, 3): [4, 5]}
@@ -14,8 +18,8 @@ def _get_data(self, key: str) -> str:
1418
return self._name + key
1519

1620

17-
def get_new_cache(id_: str = None, *, clear=True) -> Cache:
18-
cache = Cache(id_)
21+
def get_new_cache(id_: str | None = None, *, clear=True, **kwargs) -> Cache:
    """Build a Cache for tests, optionally clearing any persisted state.

    :param id_: Cache identifier; ``None`` selects the default.
    :param clear: Drop any previously persisted entries before returning.
    :param kwargs: Forwarded to the ``Cache`` constructor (e.g. ``max_items``).
    """
    cache = Cache(id_, **kwargs)
    if clear:
        cache.clear()
    return cache
@@ -118,3 +122,26 @@ def test_len():
118122
del cache
119123
cache = get_new_cache(clear=False)
120124
assert len(cache) == len(TEST_DICT)
125+
126+
127+
def get_max_memory_used() -> int:
    """Return this process's peak resident set size (``ru_maxrss``)."""
    usage = resource.getrusage(resource.RUSAGE_SELF)
    # NOTE(review): ru_maxrss is kilobytes on Linux but bytes on macOS.
    return usage.ru_maxrss
129+
130+
131+
def get_random_array(rng: np.random.Generator) -> np.ndarray:
    """Draw 1024 samples uniformly from [0, 1) using *rng*."""
    return rng.uniform(low=0.0, high=1.0, size=1024)
133+
134+
135+
def test_max_memory_usage():
    """Peak RSS should stay roughly flat when the cache evicts past max_items."""
    cache = get_new_cache(max_items=16)
    rng = np.random.default_rng()
    # Generate one array up front so the generator's own allocations are
    # already counted in the baseline measurement.
    _ = get_random_array(rng)
    baseline = get_max_memory_used()
    for key in range(1024):
        cache[key] = get_random_array(rng)
    peak = get_max_memory_used()
    # NOTE(review): ru_maxrss units are platform-dependent (KB on Linux,
    # bytes on macOS) — confirm the 1_000 threshold on the target OS.
    assert peak - baseline < 1_000
145+
146+
147+
# TODO: Add parallel test for cache as well (threading)

tests/test_lru_queue.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,10 @@ def test_pop_many():
8989
assert small_queue.pop_many(3) == [0, 1, 2]
9090
assert small_queue.pop_many(2) == [3]
9191
assert small_queue.pop_many(1) == []
92+
93+
94+
def test_clear():
    """clear() must leave the queue empty."""
    queue = get_queue()

    queue.clear()
    assert len(queue) == 0

0 commit comments

Comments
 (0)