Refactor apiv2.open_dataset #4642

Merged
merged 4 commits on Dec 2, 2020
87 changes: 56 additions & 31 deletions xarray/backends/apiv2.py
@@ -11,37 +11,33 @@
 )


-def dataset_from_backend_dataset(
-    ds,
+def _get_mtime(filename_or_obj):
+    # if passed an actual file path, augment the token with
+    # the file modification time
+    if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj):
+        mtime = os.path.getmtime(filename_or_obj)
+    else:
+        mtime = None
+    return mtime
+
+
+def _chunk_ds(
+    backend_ds,
     filename_or_obj,
     engine,
     chunks,
-    cache,
     overwrite_encoded_chunks,
     **extra_tokens,
 ):
-    if not (isinstance(chunks, (int, dict)) or chunks is None):
-        if chunks != "auto":
-            raise ValueError(
-                "chunks must be an int, dict, 'auto', or None. "
-                "Instead found %s. " % chunks
-            )
-
-    _protect_dataset_variables_inplace(ds, cache)
-    if chunks is not None and engine != "zarr":
+    if engine != "zarr":
         from dask.base import tokenize

-        # if passed an actual file path, augment the token with
-        # the file modification time
-        if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj):
-            mtime = os.path.getmtime(filename_or_obj)
-        else:
-            mtime = None
+        mtime = _get_mtime(filename_or_obj)
         token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens)
         name_prefix = "open_dataset-%s" % token
-        ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
+        ds = backend_ds.chunk(chunks, name_prefix=name_prefix, token=token)

-    elif engine == "zarr":
+    else:

         if chunks == "auto":
             try:
@@ -50,35 +46,64 @@ def dataset_from_backend_dataset(
                 chunks = None

         if chunks is None:
-            return ds
+            return backend_ds

         if isinstance(chunks, int):
-            chunks = dict.fromkeys(ds.dims, chunks)
+            chunks = dict.fromkeys(backend_ds.dims, chunks)

         variables = {}
-        for k, v in ds.variables.items():
+        for k, v in backend_ds.variables.items():
             var_chunks = _get_chunk(k, v, chunks)
             variables[k] = _maybe_chunk(
                 k,
                 v,
                 var_chunks,
                 overwrite_encoded_chunks=overwrite_encoded_chunks,
             )
-        ds2 = ds._replace(variables)
+        ds = backend_ds._replace(variables)
+    return ds
+
+
+def _dataset_from_backend_dataset(
+    backend_ds,
+    filename_or_obj,
+    engine,
+    chunks,
+    cache,
+    overwrite_encoded_chunks,
+    **extra_tokens,
+):
+    if not (isinstance(chunks, (int, dict)) or chunks is None):
+        if chunks != "auto":
+            raise ValueError(
+                "chunks must be an int, dict, 'auto', or None. "
+                "Instead found %s. " % chunks
+            )
+
+    _protect_dataset_variables_inplace(backend_ds, cache)
+    if chunks is None:
+        ds = backend_ds
     else:
-        ds2 = ds
-        ds2._file_obj = ds._file_obj
+        ds = _chunk_ds(
+            backend_ds,
+            filename_or_obj,
+            engine,
+            chunks,
+            overwrite_encoded_chunks,
+            **extra_tokens,
+        )
+
+    ds._file_obj = backend_ds._file_obj

     # Ensure source filename always stored in dataset object (GH issue #2550)
     if "source" not in ds.encoding:
         if isinstance(filename_or_obj, str):
-            ds2.encoding["source"] = filename_or_obj
+            ds.encoding["source"] = filename_or_obj

-    return ds2
+    return ds


-def resolve_decoders_kwargs(decode_cf, engine, **decoders):
+def _resolve_decoders_kwargs(decode_cf, engine, **decoders):
     signature = plugins.ENGINES[engine]["signature"]
     if decode_cf is False:
         for d in decoders:
@@ -225,7 +250,7 @@ def open_dataset(
     if engine is None:
         engine = _autodetect_engine(filename_or_obj)

-    decoders = resolve_decoders_kwargs(
+    decoders = _resolve_decoders_kwargs(
         decode_cf,
         engine=engine,
         mask_and_scale=mask_and_scale,
@@ -249,7 +274,7 @@
         **backend_kwargs,
         **{k: v for k, v in kwargs.items() if v is not None},
     )
-    ds = dataset_from_backend_dataset(
+    ds = _dataset_from_backend_dataset(
         backend_ds,
         filename_or_obj,
         engine,
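
For readers skimming the diff, the accepted values of the chunks argument (the validation and dispatch in _dataset_from_backend_dataset and _chunk_ds above) can be illustrated with a small usage sketch. This is not part of the PR; it goes through the public xarray.open_dataset entry point rather than the private helpers, assumes a recent xarray with a NetCDF-capable backend and dask installed, and the file and variable names are made up:

import numpy as np
import xarray as xr

# write a tiny dataset so there is something to reopen
ds = xr.Dataset({"temperature": (("time", "x"), np.random.rand(100, 10))})
ds.to_netcdf("example.nc")

xr.open_dataset("example.nc")                       # chunks=None: no dask chunking
xr.open_dataset("example.nc", chunks=25)            # int: same chunk size along every dimension
xr.open_dataset("example.nc", chunks={"time": 50})  # dict: per-dimension chunk sizes
xr.open_dataset("example.nc", chunks="auto")        # "auto": let dask choose chunk sizes
# anything else is rejected by the validation shown in _dataset_from_backend_dataset
# above and raises ValueError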