Skip to content

Commit e64da31

Browse files
lobis and pre-commit-ci[bot]
authored and committed
feat: globbing with fsspec (#1061)
* feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb. * sesparate PR for s3 fix (#1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb. 
* sesparate PR for s3 fix (#1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source * feat: simplify object path split (#1028) * simplify object path split * add example from #975 * fix tests * add more test cases * test case update * remove scheme unused regex * feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash * test: improve path object split tests (#1039) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object 
source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb. * sesparate PR for s3 fix (#1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb. * sesparate PR for s3 fix (#1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5. 
* rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source * feat: simplify object path split (#1028) * simplify object path split * add example from #975 * fix tests * add more test cases * test case update * remove scheme unused regex * feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash * add new test case * split big test in two * retry on socket error * xrootd iterator * iterate over different files * iterate over tree * pytest fixture for test directory * pytest fixture for test directory * add annotation to open argument * remove repeated test * test: add test for issue 1054 (newer fsspec failing to parse files with colons in name) (#1055) * add test for issue 1054 * additional test * make sure fsspec fix works * try new test in older fsspec version (need to test windows) * skip test in windows due to colons in name * add explicit object-path split with open * revert use fsspec fork in ci * use fsspec to expand glob * skip root from 
remote_schemas * test iterate over xrootd * test * add temporary install to ci * remove ci debug * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb. * sesparate PR for s3 fix (#1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * 
fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb. * sesparate PR for s3 fix (#1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source * feat: simplify object path split (#1028) * simplify object path split * add example from #975 * fix tests * add more test cases * test case update * remove scheme unused regex * feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash * test: add test for issue 1054 (newer fsspec failing to parse files with colons in name) (#1055) * add test for issue 1054 * additional test * make sure fsspec fix works * try new test in older fsspec version (need to test windows) * skip test in windows due to colons in name * add explicit object-path split with open * revert use fsspec fork in ci * test: improve path object split tests 
(#1039) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb. * sesparate PR for s3 fix (#1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb. 
* sesparate PR for s3 fix (#1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source * feat: simplify object path split (#1028) * simplify object path split * add example from #975 * fix tests * add more test cases * test case update * remove scheme unused regex * feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash * add new test case * split big test in two * retry on socket error * xrootd iterator * iterate over different files * iterate over tree * pytest fixture for test directory * pytest fixture for test directory * add annotation to open argument * remove repeated test * test: add test for issue 1054 (newer fsspec failing to parse files with colons in name) (#1055) * add test for issue 1054 * additional test * make sure fsspec fix works * try new test in older fsspec version 
(need to test windows) * skip test in windows due to colons in name * add explicit object-path split with open * revert use fsspec fork in ci * try to expand all glob strings if they have the protocol * making it work on windows * testing globbing for s3 * add failing test for http globbing * test more handlers, failing test for xrootd (missing files) * understanding error * add class method to extract fsspec options * call super constructor for fsspec source * pass options to regularize files util * python 3.12 aiohttp test in other PR * attempt to hide the ssl destructor error * retry on "expired" * style: pre-commit fixes --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent af0b55a commit e64da31

File tree

6 files changed

+137
-30
lines changed

6 files changed

+137
-30
lines changed

src/uproot/_dask.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def dask(
161161
array from ``TTrees``.
162162
"""
163163

164-
files = uproot._util.regularize_files(files, steps_allowed=True)
164+
files = uproot._util.regularize_files(files, steps_allowed=True, **options)
165165

166166
is_3arg = [len(x) == 3 for x in files]
167167
if any(is_3arg):

src/uproot/_util.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -287,11 +287,6 @@ def regularize_path(path):
287287
return path
288288

289289

290-
# These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs)
291-
_remote_schemes = ["root", "s3", "http", "https"]
292-
_schemes = list({*_remote_schemes, *fsspec.available_protocols()})
293-
294-
295290
def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
296291
"""
297292
Split a path with a colon into a file path and an object-in-file path.
@@ -815,7 +810,9 @@ def regularize_steps(steps):
815810
return out.tolist()
816811

817812

818-
def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allowed):
813+
def _regularize_files_inner(
814+
files, parse_colon, counter, HasBranches, steps_allowed, **options
815+
):
819816
files2 = regularize_path(files)
820817

821818
maybe_steps = None
@@ -830,12 +827,24 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo
830827
else:
831828
file_path, object_path = files, None
832829

830+
# This parses the windows drive letter as a scheme!
833831
parsed_url = urlparse(file_path)
834-
835-
if parsed_url.scheme.lower() in _remote_schemes:
836-
yield file_path, object_path, maybe_steps
837-
832+
scheme = parsed_url.scheme
833+
if "://" in file_path and scheme not in ("file", "local"):
834+
# user specified a protocol, so we use fsspec to expand the glob and return the full paths
835+
file_names_full = [
836+
file.full_name
837+
for file in fsspec.open_files(
838+
files,
839+
**uproot.source.fsspec.FSSpecSource.extract_fsspec_options(options),
840+
)
841+
]
842+
# https://github.com/fsspec/filesystem_spec/issues/1459
843+
# Not all protocols return the full_name attribute correctly (if they have url parameters)
844+
for file_name_full in file_names_full:
845+
yield file_name_full, object_path, maybe_steps
838846
else:
847+
# no protocol, default to local file system
839848
expanded = os.path.expanduser(file_path)
840849
if _regularize_files_isglob.search(expanded) is None:
841850
yield file_path, object_path, maybe_steps
@@ -885,14 +894,15 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo
885894
counter,
886895
HasBranches,
887896
steps_allowed,
897+
**options,
888898
):
889899
yield file_path, object_path, maybe_steps
890900

891901
elif isinstance(files, Iterable):
892902
for file in files:
893903
counter[0] += 1
894904
for file_path, object_path, maybe_steps in _regularize_files_inner(
895-
file, parse_colon, counter, HasBranches, steps_allowed
905+
file, parse_colon, counter, HasBranches, steps_allowed, **options
896906
):
897907
yield file_path, object_path, maybe_steps
898908

@@ -905,7 +915,7 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo
905915
)
906916

907917

908-
def regularize_files(files, steps_allowed):
918+
def regularize_files(files, steps_allowed, **options):
909919
"""
910920
Common code for regularizing the possible file inputs accepted by uproot so they can be used by uproot internal functions.
911921
"""
@@ -915,7 +925,7 @@ def regularize_files(files, steps_allowed):
915925
seen = set()
916926
counter = [0]
917927
for file_path, object_path, maybe_steps in _regularize_files_inner(
918-
files, True, counter, HasBranches, steps_allowed
928+
files, True, counter, HasBranches, steps_allowed, **options
919929
):
920930
if isinstance(file_path, str):
921931
key = (counter[0], file_path, object_path)

src/uproot/behaviors/TBranch.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def iterate(
174174
array from ``TTrees``.
175175
* :doc:`uproot._dask.dask`: returns an unevaluated Dask array from ``TTrees``.
176176
"""
177-
files = uproot._util.regularize_files(files, steps_allowed=False)
177+
files = uproot._util.regularize_files(files, steps_allowed=False, **options)
178178
decompression_executor, interpretation_executor = _regularize_executors(
179179
decompression_executor, interpretation_executor, None
180180
)
@@ -340,7 +340,7 @@ def concatenate(
340340
single concatenated array from ``TTrees``.
341341
* :doc:`uproot._dask.dask`: returns an unevaluated Dask array from ``TTrees``.
342342
"""
343-
files = uproot._util.regularize_files(files, steps_allowed=False)
343+
files = uproot._util.regularize_files(files, steps_allowed=False, **options)
344344
decompression_executor, interpretation_executor = _regularize_executors(
345345
decompression_executor, interpretation_executor, None
346346
)

src/uproot/models/TTree.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -906,7 +906,6 @@ def read_members(self, chunk, cursor, context, file):
906906
uproot.classes["TTree"] = Model_TTree
907907
uproot.classes["ROOT::TIOFeatures"] = Model_ROOT_3a3a_TIOFeatures
908908

909-
910909
fEntriesStruct = struct.Struct(">q")
911910

912911

src/uproot/source/fsspec.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,30 +27,29 @@ class FSSpecSource(uproot.source.chunk.Source):
2727
"""
2828

2929
def __init__(self, file_path: str, **options):
30-
options = dict(uproot.reading.open.defaults, **options)
31-
storage_options = {
32-
k: v
33-
for k, v in options.items()
34-
if k not in uproot.reading.open.defaults.keys()
35-
}
36-
37-
self._fs, self._file_path = fsspec.core.url_to_fs(file_path, **storage_options)
30+
super().__init__()
31+
self._fs, self._file_path = fsspec.core.url_to_fs(
32+
file_path, **self.extract_fsspec_options(options)
33+
)
3834

3935
# What should we do when there is a chain of filesystems?
4036
self._async_impl = self._fs.async_impl
4137

42-
self._executor = None
4338
self._file = None
4439
self._fh = None
4540

46-
self._num_requests = 0
47-
self._num_requested_chunks = 0
48-
self._num_requested_bytes = 0
49-
5041
self._open()
5142

5243
self.__enter__()
5344

45+
@classmethod
46+
def extract_fsspec_options(cls, options: dict) -> dict:
47+
uproot_default_options = dict(uproot.reading.open.defaults)
48+
options = dict(uproot_default_options, **options)
49+
return {
50+
k: v for k, v in options.items() if k not in uproot_default_options.keys()
51+
}
52+
5453
def _open(self):
5554
self._executor = FSSpecLoopExecutor()
5655
self._file = self._fs.open(self._file_path)

tests/test_0692_fsspec_reading.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,3 +388,102 @@ def test_issue_1035(handler):
388388
branch = tree["MuonSpectrometerTrackParticlesAuxDyn.truthParticleLink"]
389389
data = branch.array()
390390
assert len(data) == 40
391+
392+
393+
@pytest.mark.network
394+
@pytest.mark.xrootd
395+
@pytest.mark.parametrize(
396+
"handler",
397+
[
398+
uproot.source.fsspec.FSSpecSource,
399+
None,
400+
],
401+
)
402+
def test_fsspec_globbing_xrootd(handler):
403+
pytest.importorskip("XRootD")
404+
pytest.importorskip("fsspec_xrootd")
405+
iterator = uproot.iterate(
406+
{
407+
"root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_*.root": "Events"
408+
},
409+
["PV_x"],
410+
handler=handler,
411+
)
412+
413+
arrays = [array for array in iterator]
414+
# if more files are added that match the glob, this test needs to be updated
415+
assert len(arrays) == 2
416+
417+
418+
@pytest.mark.network
419+
@pytest.mark.xrootd
420+
@pytest.mark.parametrize(
421+
"handler",
422+
[
423+
uproot.source.fsspec.FSSpecSource,
424+
None,
425+
],
426+
)
427+
def test_fsspec_globbing_xrootd_no_files(handler):
428+
pytest.importorskip("XRootD")
429+
pytest.importorskip("fsspec_xrootd")
430+
iterator = uproot.iterate(
431+
{
432+
"root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/*/ThisFileShouldNotExist.root": "Events"
433+
},
434+
["PV_x"],
435+
handler=handler,
436+
)
437+
with pytest.raises(FileNotFoundError):
438+
arrays = [array for array in iterator]
439+
440+
441+
@pytest.mark.parametrize(
442+
"handler",
443+
[
444+
uproot.source.fsspec.FSSpecSource,
445+
None,
446+
],
447+
)
448+
def test_fsspec_globbing_s3(handler):
449+
pytest.importorskip("s3fs")
450+
if sys.version_info < (3, 11):
451+
pytest.skip(
452+
"https://github.com/scikit-hep/uproot5/pull/1012",
453+
)
454+
455+
iterator = uproot.iterate(
456+
{"s3://pivarski-princeton/pythia_ppZee_run17emb.*.root": "PicoDst"},
457+
["Event/Event.mEventId"],
458+
anon=True,
459+
handler=handler,
460+
)
461+
462+
# if more files are added that match the glob, this test needs to be updated
463+
arrays = [array for array in iterator]
464+
assert len(arrays) == 1
465+
for array in arrays:
466+
assert len(array) == 8004
467+
468+
469+
@pytest.mark.parametrize(
470+
"handler",
471+
[
472+
uproot.source.fsspec.FSSpecSource,
473+
None,
474+
],
475+
)
476+
def test_fsspec_globbing_http(handler):
477+
pytest.importorskip("aiohttp")
478+
479+
# Globbing does not work with http filesystems and will return an empty list of files
480+
# We leave this test here to be notified when this feature is added
481+
iterator = uproot.iterate(
482+
{
483+
"https://github.com/scikit-hep/scikit-hep-testdata/raw/main/src/skhep_testdata/data/uproot-issue*.root": "Events"
484+
},
485+
["MET_pt"],
486+
handler=handler,
487+
)
488+
with pytest.raises(FileNotFoundError):
489+
arrays = [array for array in iterator]

0 commit comments

Comments
 (0)