-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Cache files for different CachingFileManager objects separately #4879
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 28 commits
a80cfd2
d8b3bb7
6bc80e7
e637165
d587bfc
4f4ba13
e93f1f5
257eb00
7d857f3
a3556d1
7c7c4e8
c51e81e
89c2b55
c320acb
2cab733
997c3d4
a3486c8
d24914e
7105ec2
38c2a16
6d6e2dd
d95f9f0
a837f3b
25706eb
1466c82
382d734
46f4fef
3ec678e
929e5d1
fe7b3c3
915976d
06c5d51
cb16f88
e05cb3b
4dfbfc4
a5bf621
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -3,6 +3,7 @@ | |||||
import contextlib | ||||||
import io | ||||||
import threading | ||||||
import uuid | ||||||
import warnings | ||||||
from typing import Any | ||||||
|
||||||
|
@@ -12,12 +13,11 @@ | |||||
from .lru_cache import LRUCache | ||||||
|
||||||
# Global cache for storing open files. | ||||||
FILE_CACHE: LRUCache[str, io.IOBase] = LRUCache( | ||||||
FILE_CACHE: LRUCache[Any, io.IOBase] = LRUCache( | ||||||
maxsize=OPTIONS["file_cache_maxsize"], on_evict=lambda k, v: v.close() | ||||||
) | ||||||
assert FILE_CACHE.maxsize, "file cache must be at least size one" | ||||||
|
||||||
|
||||||
REF_COUNTS: dict[Any, int] = {} | ||||||
|
||||||
_DEFAULT_MODE = utils.ReprObject("<unused>") | ||||||
|
@@ -85,12 +85,13 @@ def __init__( | |||||
kwargs=None, | ||||||
lock=None, | ||||||
cache=None, | ||||||
manager_id=None, | ||||||
dcherian marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
ref_counts=None, | ||||||
): | ||||||
"""Initialize a FileManager. | ||||||
"""Initialize a CachingFileManager. | ||||||
|
||||||
The cache and ref_counts arguments exist solely to facilitate | ||||||
dependency injection, and should only be set for tests. | ||||||
The cache, manager_id and ref_counts arguments exist solely to | ||||||
facilitate dependency injection, and should only be set for tests. | ||||||
|
||||||
Parameters | ||||||
---------- | ||||||
|
@@ -120,6 +121,8 @@ def __init__( | |||||
global variable and contains non-picklable file objects, an | ||||||
unpickled FileManager object will be restored with the default | ||||||
cache. | ||||||
manager_id : hashable, optional | ||||||
Identifier for this CachingFileManager. | ||||||
ref_counts : dict, optional | ||||||
Optional dict to use for keeping track of the number of references to | ||||||
the same file. | ||||||
|
@@ -129,13 +132,17 @@ def __init__( | |||||
self._mode = mode | ||||||
self._kwargs = {} if kwargs is None else dict(kwargs) | ||||||
|
||||||
self._default_lock = lock is None or lock is False | ||||||
self._lock = threading.Lock() if self._default_lock else lock | ||||||
self._use_default_lock = lock is None or lock is False | ||||||
self._lock = threading.Lock() if self._use_default_lock else lock | ||||||
|
||||||
# cache[self._key] stores the file associated with this object. | ||||||
if cache is None: | ||||||
cache = FILE_CACHE | ||||||
self._cache = cache | ||||||
if manager_id is None: | ||||||
# Each call to CachingFileManager should separately open files. | ||||||
manager_id = str(uuid.uuid4()) | ||||||
self._manager_id = manager_id | ||||||
self._key = self._make_key() | ||||||
|
||||||
# ref_counts[self._key] stores the number of CachingFileManager objects | ||||||
|
@@ -153,6 +160,7 @@ def _make_key(self): | |||||
self._args, | ||||||
"a" if self._mode == "w" else self._mode, | ||||||
tuple(sorted(self._kwargs.items())), | ||||||
self._manager_id, | ||||||
) | ||||||
return _HashedSequence(value) | ||||||
|
||||||
|
@@ -224,19 +232,13 @@ def close(self, needs_lock=True): | |||||
file.close() | ||||||
|
||||||
def __del__(self): | ||||||
dcherian marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
# If we're the only CachingFileManager referencing an unclosed file, we | ||||||
# should remove it from the cache upon garbage collection. | ||||||
# If we're the only CachingFileManager referencing an unclosed file, | ||||||
# remove it from the cache upon garbage collection. | ||||||
# | ||||||
# Keeping our own count of file references might seem like overkill, | ||||||
# but it's actually pretty common to reopen files with the same | ||||||
# variable name in a notebook or command line environment, e.g., to | ||||||
# fix the parameters used when opening a file: | ||||||
# >>> ds = xarray.open_dataset('myfile.nc') | ||||||
# >>> ds = xarray.open_dataset('myfile.nc', decode_times=False) | ||||||
# This second assignment to "ds" drops CPython's ref-count on the first | ||||||
# "ds" argument to zero, which can trigger garbage collections. So if | ||||||
# we didn't check whether another object is referencing 'myfile.nc', | ||||||
# the newly opened file would actually be immediately closed! | ||||||
# We keep track of our own reference count because we don't want to | ||||||
# close files if another identical file manager needs it. This can | ||||||
# happen if a CachingFileManager is pickled and unpickled without | ||||||
# closing the original file. | ||||||
ref_count = self._ref_counter.decrement(self._key) | ||||||
|
||||||
if not ref_count and self._key in self._cache: | ||||||
|
@@ -249,30 +251,40 @@ def __del__(self): | |||||
|
||||||
if OPTIONS["warn_for_unclosed_files"]: | ||||||
warnings.warn( | ||||||
"deallocating {}, but file is not already closed. " | ||||||
"This may indicate a bug.".format(self), | ||||||
f"deallocating {self}, but file is not already closed. " | ||||||
"This may indicate a bug.", | ||||||
RuntimeWarning, | ||||||
stacklevel=2, | ||||||
) | ||||||
|
||||||
def __getstate__(self): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
The Any's can be replaced with narrower versions, I couldn't figure them out on a quick glance. |
||||||
"""State for pickling.""" | ||||||
# cache and ref_counts are intentionally omitted: we don't want to try | ||||||
# to serialize these global objects. | ||||||
lock = None if self._default_lock else self._lock | ||||||
return (self._opener, self._args, self._mode, self._kwargs, lock) | ||||||
# cache is intentionally omitted: we don't want to try to serialize | ||||||
# this global object. | ||||||
lock = None if self._use_default_lock else self._lock | ||||||
return ( | ||||||
self._opener, | ||||||
self._args, | ||||||
self._mode, | ||||||
self._kwargs, | ||||||
lock, | ||||||
self._manager_id, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know exactly what this is used for, but make sure that you don't need to do something similar to what is done for lock (replace it with None when it is the default). |
||||||
) | ||||||
|
||||||
def __setstate__(self, state): | ||||||
dcherian marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
"""Restore from a pickle.""" | ||||||
opener, args, mode, kwargs, lock = state | ||||||
self.__init__(opener, *args, mode=mode, kwargs=kwargs, lock=lock) | ||||||
opener, args, mode, kwargs, lock, manager_id = state | ||||||
self.__init__( | ||||||
opener, *args, mode=mode, kwargs=kwargs, lock=lock, manager_id=manager_id | ||||||
) | ||||||
|
||||||
def __repr__(self): | ||||||
dcherian marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
args_string = ", ".join(map(repr, self._args)) | ||||||
if self._mode is not _DEFAULT_MODE: | ||||||
args_string += f", mode={self._mode!r}" | ||||||
return "{}({!r}, {}, kwargs={})".format( | ||||||
type(self).__name__, self._opener, args_string, self._kwargs | ||||||
return ( | ||||||
f"{type(self).__name__}({self._opener!r}, {args_string}, " | ||||||
f"kwargs={self._kwargs}, manager_id={self._manager_id!r})" | ||||||
) | ||||||
|
||||||
|
||||||
|
Uh oh!
There was an error while loading. Please reload this page.