Skip to content

Commit 02f5114

Browse files
committed
Make a comparison of TreeStore with h5py
1 parent ba42d43 commit 02f5114

File tree

2 files changed

+199
-60
lines changed

2 files changed

+199
-60
lines changed

bench/large-tree-store.py

Lines changed: 193 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
#######################################################################
88

99
"""
10-
Benchmark for TreeStore with large arrays.
10+
Benchmark for TreeStore vs h5py with large arrays.
1111
1212
This benchmark creates N numpy arrays with sizes following a normal distribution
13-
and measures the time and memory consumption for storing them in a TreeStore.
13+
and measures the time and memory consumption for storing them in both TreeStore and h5py.
1414
"""
1515

1616
import os
@@ -22,12 +22,19 @@
2222

2323
import blosc2
2424

25+
try:
26+
import h5py
27+
HAS_H5PY = True
28+
except ImportError:
29+
HAS_H5PY = False
30+
2531
# Configuration
2632
N_ARRAYS = 100 # Number of arrays to store
2733
NGROUPS_MAX = 10
28-
PEAK_SIZE_MB = 1 # Peak size in MB for the normal distribution
34+
PEAK_SIZE_MB = 10 # Peak size in MB for the normal distribution
2935
STDDEV_MB = 2 # Standard deviation in MB
30-
OUTPUT_DIR = "large-tree-store.b2d"
36+
OUTPUT_DIR_TSTORE = "large-tree-store.b2z"
37+
OUTPUT_FILE_H5PY = "large-h5py-store.h5"
3138
MIN_SIZE_MB = 0.1 # Minimum array size in MB
3239
MAX_SIZE_MB = 32 # Maximum array size in MB
3340

@@ -68,12 +75,14 @@ def store_arrays_in_treestore(arrays, output_dir):
6875
print(f"\nStoring {len(arrays)} arrays in TreeStore at {output_dir}...")
6976

7077
# Clean up existing directory
71-
if os.path.exists(output_dir):
78+
if os.path.exists(output_dir) and os.path.isdir(output_dir):
7279
shutil.rmtree(output_dir)
80+
elif os.path.exists(output_dir):
81+
os.remove(output_dir)
7382

7483
start_time = time.time()
7584

76-
with blosc2.TreeStore(output_dir, mode="w") as tstore:
85+
with blosc2.TreeStore(output_dir, mode="w", threshold=2**13) as tstore:
7786
for i, arr in enumerate(arrays):
7887
# Distribute arrays evenly across NGROUPS_MAX subdirectories
7988
group_id = i % NGROUPS_MAX
@@ -96,6 +105,52 @@ def store_arrays_in_treestore(arrays, output_dir):
96105
return total_time
97106

98107

108+
@profile
def store_arrays_in_h5py(arrays, output_file):
    """Store arrays in an HDF5 file through h5py and time the whole write.

    Every array becomes a gzip-compressed, shuffled dataset named
    ``array_NNNN``.  Datasets are assigned round-robin (by index) to
    ``NGROUPS_MAX`` HDF5 groups named ``group_NN``.  A few benchmark settings
    are recorded as attributes on the file root.

    NOTE(review): the ``profile`` decorator is not defined in the visible
    part of this file; presumably it comes from memory_profiler -- confirm.

    Args:
        arrays: Sized iterable of arrays to store.
        output_file: Path of the HDF5 file to create.  An existing file at
            this path is removed first.

    Returns:
        Elapsed wall-clock time in seconds (float), covering dataset
        creation, the metadata attributes and closing the file; or ``None``
        when h5py could not be imported.
    """
    if not HAS_H5PY:
        return None

    print(f"\nStoring {len(arrays)} arrays in h5py at {output_file}...")

    # Clean up existing file
    if os.path.exists(output_file):
        os.remove(output_file)

    # time.time() is kept (rather than a monotonic clock) so both storage
    # benchmarks in this script are measured with the same clock.
    start_time = time.time()

    with h5py.File(output_file, "w") as f:
        for i, arr in enumerate(arrays):
            # Distribute arrays evenly across NGROUPS_MAX groups;
            # require_group() opens the group, creating it on first use.
            grp = f.require_group(f"group_{i % NGROUPS_MAX:02d}")

            # Store array with compression
            grp.create_dataset(f"array_{i:04d}", data=arr, compression="gzip", shuffle=True)

            if (i + 1) % 10 == 0:
                elapsed = time.time() - start_time
                print(f" Stored {i + 1}/{len(arrays)} arrays ({elapsed:.2f}s)")

        # Add some metadata
        f.attrs["n_arrays"] = len(arrays)
        f.attrs["peak_size_mb"] = PEAK_SIZE_MB
        f.attrs["benchmark_timestamp"] = time.time()
        f.attrs["n_groups"] = NGROUPS_MAX

    # Measured after the ``with`` block so the final flush/close is included.
    return time.time() - start_time
152+
153+
99154
def measure_memory_usage(func, *args, **kwargs):
100155
"""Measure memory usage of a function."""
101156
print("\nMeasuring memory usage...")
@@ -113,56 +168,117 @@ def wrapper():
113168
return max_memory_mb, min_memory_mb, memory_increase_mb, mem_usage
114169

115170

116-
def print_statistics(sizes_mb, sizes_elements, total_time, memory_stats):
117-
"""Print benchmark statistics."""
118-
max_mem, min_mem, mem_increase, _ = memory_stats
119-
120-
print("\n" + "="*60)
121-
print("BENCHMARK RESULTS")
122-
print("="*60)
123-
124-
print(f"Configuration:")
125-
print(f" Number of arrays: {N_ARRAYS}")
126-
print(f" Peak size: {PEAK_SIZE_MB} MB")
127-
print(f" Standard deviation: {STDDEV_MB} MB")
128-
print(f" Output directory: {OUTPUT_DIR}")
129-
130-
print(f"\nArray size statistics:")
131-
print(f" Mean size: {np.mean(sizes_mb):.2f} MB")
132-
print(f" Median size: {np.median(sizes_mb):.2f} MB")
133-
print(f" Min size: {np.min(sizes_mb):.2f} MB")
134-
print(f" Max size: {np.max(sizes_mb):.2f} MB")
135-
print(f" Total data: {np.sum(sizes_mb):.2f} MB")
136-
137-
print(f"\nPerformance metrics:")
138-
print(f" Total time: {total_time:.2f} seconds")
139-
print(f" Average time per array: {total_time / N_ARRAYS:.3f} seconds")
140-
print(f" Throughput: {np.sum(sizes_mb) / total_time:.2f} MB/s")
141-
142-
print(f"\nMemory usage:")
143-
print(f" Baseline memory: {min_mem:.2f} MB")
144-
print(f" Peak memory: {max_mem:.2f} MB")
145-
print(f" Memory increase: {mem_increase:.2f} MB")
146-
147-
# Check final directory size
148-
if os.path.exists(OUTPUT_DIR):
171+
def get_storage_size(path):
    """Return the on-disk size of a file or directory tree, in MB.

    Args:
        path: Path to a regular file or to a directory.  For a directory the
            sizes of all files found by ``os.walk`` are summed.

    Returns:
        Size in mebibytes (bytes / 1024**2) as a float.  ``0.0`` is returned
        when ``path`` is neither an existing file nor an existing directory.
    """
    bytes_per_mb = 1024 * 1024

    if os.path.isfile(path):
        return os.path.getsize(path) / bytes_per_mb

    if not os.path.isdir(path):
        return 0.0

    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            try:
                total_size += os.path.getsize(os.path.join(dirpath, filename))
            except OSError:
                # Dangling symlink, or the entry vanished while walking:
                # it occupies no measurable payload, so skip it instead of
                # aborting the whole measurement.
                continue
    return total_size / bytes_per_mb
183+
154184

155-
compressed_size_mb = total_size / (1024 * 1024)
156-
compression_ratio = np.sum(sizes_mb) / compressed_size_mb
185+
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or ``inf`` if the denominator is not positive."""
    return numerator / denominator if denominator > 0 else float("inf")


def _metric_column(total_data_mb, results):
    """Expand a (time, memory_stats, storage_mb) tuple into the six table values."""
    elapsed, memory, storage = results
    return (
        elapsed,                                # Total time (s)
        _safe_ratio(total_data_mb, elapsed),    # Throughput (MB/s)
        memory[0],                              # Peak memory (MB)
        memory[2],                              # Memory increase (MB)
        storage,                                # Storage size (MB)
        _safe_ratio(total_data_mb, storage),    # Compression ratio
    )


def _print_metric_row(label, tstore_value, h5py_value, precision):
    """Print one table row; ``h5py_value`` of None renders the h5py cells as N/A."""
    row = f"{label:<25} {tstore_value:<15.{precision}f} "
    if h5py_value is None:
        row += f"{'N/A':<15} {'N/A':<12}"
    else:
        ratio = _safe_ratio(tstore_value, h5py_value)
        row += f"{h5py_value:<15.{precision}f} {ratio:<12.2f}"
    print(row)


def print_comparison_table(sizes_mb, tstore_results, h5py_results):
    """Print a comparison table of TreeStore vs h5py results.

    Args:
        sizes_mb: Sizes (in MB) of the stored arrays; their sum is the
            uncompressed total used for throughput and compression ratio.
        tstore_results: ``(time_s, memory_stats, storage_mb)`` for TreeStore,
            where ``memory_stats[0]`` is the peak memory and
            ``memory_stats[2]`` the memory increase, both in MB.
        h5py_results: Same layout for h5py, or a falsy value (e.g. ``None``)
            when the h5py benchmark was skipped; the h5py and ratio columns
            then show ``N/A``.

    Any ratio whose denominator is not positive (zero elapsed time, empty or
    missing storage) is reported as ``inf`` instead of raising.
    """
    total_data_mb = np.sum(sizes_mb)

    print("\n" + "="*80)
    print("PERFORMANCE COMPARISON: TreeStore vs h5py")
    print("="*80)

    # Configuration info
    print("Configuration:")
    print(f" Arrays: {N_ARRAYS:,} | Peak size: {PEAK_SIZE_MB} MB | Total data: {total_data_mb:.1f} MB")
    print()

    # Extract results
    has_h5py = bool(h5py_results)
    tstore_column = _metric_column(total_data_mb, tstore_results)
    if has_h5py:
        h5py_column = _metric_column(total_data_mb, h5py_results)
    else:
        h5py_column = (None,) * len(tstore_column)

    # Table header
    print(f"{'Metric':<25} {'TreeStore':<15} {'h5py':<15} {'Ratio (T/H)':<12}")
    print("-" * 70)

    # (label, decimals, blank line after) -- same order as _metric_column().
    layout = (
        ("Total time (s)", 2, False),
        ("Throughput (MB/s)", 1, True),
        ("Peak memory (MB)", 1, False),
        ("Memory increase (MB)", 1, True),
        ("Storage size (MB)", 1, False),
        ("Compression ratio", 2, True),
    )
    for (label, precision, blank_after), tstore_value, h5py_value in zip(
        layout, tstore_column, h5py_column
    ):
        _print_metric_row(label, tstore_value, h5py_value, precision)
        if blank_after:
            print()

    # Summary
    print("Summary:")
    if not has_h5py:
        print(" h5py not available for comparison")
        return

    tstore_time, _, tstore_storage = tstore_results
    h5py_time, _, h5py_storage = h5py_results

    if tstore_time < h5py_time:
        print(f" TreeStore is {_safe_ratio(h5py_time, tstore_time):.1f}x faster")
    else:
        print(f" h5py is {_safe_ratio(tstore_time, h5py_time):.1f}x faster")

    if tstore_storage < h5py_storage:
        print(f" TreeStore uses {_safe_ratio(h5py_storage, tstore_storage):.1f}x less storage")
    else:
        print(f" h5py uses {_safe_ratio(tstore_storage, h5py_storage):.1f}x less storage")
161277

162278

163279
def main():
164280
"""Run the benchmark."""
165-
print("TreeStore Large Array Benchmark")
281+
print("TreeStore vs h5py Large Array Benchmark")
166282
print("="*60)
167283

168284
# Set random seed for reproducibility
@@ -177,19 +293,37 @@ def main():
177293
# Create test arrays
178294
arrays = create_test_arrays(sizes_elements)
179295

180-
# Measure memory usage during storage
181-
memory_stats = measure_memory_usage(store_arrays_in_treestore, arrays, OUTPUT_DIR)
182-
total_time = memory_stats[0] # This will be overwritten, we need the actual time
183-
184-
# Get the actual timing by running the storage function again
185-
# (memory_profiler doesn't return the function result easily)
186-
print("\nRunning final timing measurement...")
187-
actual_time = store_arrays_in_treestore(arrays, OUTPUT_DIR)
188-
189-
# Print results
190-
print_statistics(sizes_mb, sizes_elements, actual_time, memory_stats)
191-
192-
print(f"\nBenchmark completed. Results saved to: {OUTPUT_DIR}")
296+
# Benchmark TreeStore
297+
print("\n" + "="*60)
298+
print("BENCHMARKING TreeStore")
299+
print("="*60)
300+
tstore_memory_stats = measure_memory_usage(store_arrays_in_treestore, arrays, OUTPUT_DIR_TSTORE)
301+
tstore_time = store_arrays_in_treestore(arrays, OUTPUT_DIR_TSTORE)
302+
tstore_storage_size = get_storage_size(OUTPUT_DIR_TSTORE)
303+
tstore_results = (tstore_time, tstore_memory_stats, tstore_storage_size)
304+
305+
# Benchmark h5py if available
306+
h5py_results = None
307+
if HAS_H5PY:
308+
print("\n" + "="*60)
309+
print("BENCHMARKING h5py")
310+
print("="*60)
311+
h5py_memory_stats = measure_memory_usage(store_arrays_in_h5py, arrays, OUTPUT_FILE_H5PY)
312+
h5py_time = store_arrays_in_h5py(arrays, OUTPUT_FILE_H5PY)
313+
h5py_storage_size = get_storage_size(OUTPUT_FILE_H5PY)
314+
h5py_results = (h5py_time, h5py_memory_stats, h5py_storage_size)
315+
else:
316+
print("\n" + "="*60)
317+
print("h5py not available - skipping h5py benchmark")
318+
print("="*60)
319+
320+
# Print comparison table
321+
print_comparison_table(sizes_mb, tstore_results, h5py_results)
322+
323+
print(f"\nBenchmark completed.")
324+
print(f"TreeStore results saved to: {OUTPUT_DIR_TSTORE}")
325+
if HAS_H5PY:
326+
print(f"h5py results saved to: {OUTPUT_FILE_H5PY}")
193327

194328

195329
if __name__ == "__main__":

src/blosc2/dict_store.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,12 @@ def __setitem__(self, key: str, value: np.ndarray | blosc2.NDArray | SChunk | C2
237237

238238
# Save the value to the destination path
239239
if not external_file:
240-
value.save(urlpath=dest_path)
240+
if hasattr(value, "save"):
241+
value.save(urlpath=dest_path)
242+
else:
243+
# An SChunk does not have a save() method
244+
with open(dest_path, "wb") as f:
245+
f.write(value.to_cframe())
241246
else:
242247
# This should be faster than using value.save() ?
243248
shutil.copy2(value.urlpath, dest_path)

0 commit comments

Comments
 (0)