7
7
#######################################################################
8
8
9
9
"""
10
- Benchmark for TreeStore with large arrays.
10
+ Benchmark for TreeStore vs h5py with large arrays.
11
11
12
12
This benchmark creates N numpy arrays with sizes following a normal distribution
13
- and measures the time and memory consumption for storing them in a TreeStore.
13
+ and measures the time and memory consumption for storing them in both TreeStore and h5py.
14
14
"""
15
15
16
16
import os
22
22
23
23
import blosc2
24
24
25
+ try :
26
+ import h5py
27
+ HAS_H5PY = True
28
+ except ImportError :
29
+ HAS_H5PY = False
30
+
25
31
# Configuration
26
32
N_ARRAYS = 100 # Number of arrays to store
27
33
NGROUPS_MAX = 10
28
- PEAK_SIZE_MB = 1 # Peak size in MB for the normal distribution
34
+ PEAK_SIZE_MB = 10 # Peak size in MB for the normal distribution
29
35
STDDEV_MB = 2 # Standard deviation in MB
30
- OUTPUT_DIR = "large-tree-store.b2d"
36
+ OUTPUT_DIR_TSTORE = "large-tree-store.b2z"
37
+ OUTPUT_FILE_H5PY = "large-h5py-store.h5"
31
38
MIN_SIZE_MB = 0.1 # Minimum array size in MB
32
39
MAX_SIZE_MB = 32 # Maximum array size in MB
33
40
@@ -68,12 +75,14 @@ def store_arrays_in_treestore(arrays, output_dir):
68
75
print (f"\n Storing { len (arrays )} arrays in TreeStore at { output_dir } ..." )
69
76
70
77
# Clean up existing directory
71
- if os .path .exists (output_dir ):
78
+ if os .path .exists (output_dir ) and os . path . isdir ( output_dir ) :
72
79
shutil .rmtree (output_dir )
80
+ elif os .path .exists (output_dir ):
81
+ os .remove (output_dir )
73
82
74
83
start_time = time .time ()
75
84
76
- with blosc2 .TreeStore (output_dir , mode = "w" ) as tstore :
85
+ with blosc2 .TreeStore (output_dir , mode = "w" , threshold = 2 ** 13 ) as tstore :
77
86
for i , arr in enumerate (arrays ):
78
87
# Distribute arrays evenly across NGROUPS_MAX subdirectories
79
88
group_id = i % NGROUPS_MAX
@@ -96,6 +105,52 @@ def store_arrays_in_treestore(arrays, output_dir):
96
105
return total_time
97
106
98
107
108
# NOTE(review): `profile` is not defined in the visible part of this file;
# presumably it is memory_profiler's decorator -- confirm against the imports.
@profile
def store_arrays_in_h5py(arrays, output_file):
    """Store arrays in an HDF5 file through h5py and time the operation.

    Each array is written as a gzip-compressed, shuffled dataset named
    ``array_NNNN`` inside a group named ``group_NN``; the group is chosen
    round-robin from the array index modulo ``NGROUPS_MAX``.  Any existing
    file at ``output_file`` is removed before writing.

    Parameters
    ----------
    arrays : sequence
        Arrays to store; each one is passed as ``data=`` to
        ``create_dataset``.
    output_file : str
        Path of the HDF5 file to (re)create.

    Returns
    -------
    float or None
        Elapsed seconds for creating, filling and closing the file, or
        ``None`` when h5py could not be imported (``HAS_H5PY`` is false).
    """
    if not HAS_H5PY:
        return None

    print(f"\nStoring {len(arrays)} arrays in h5py at {output_file}...")

    # Clean up existing file
    if os.path.exists(output_file):
        os.remove(output_file)

    # perf_counter() is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    with h5py.File(output_file, "w") as f:
        for i, arr in enumerate(arrays):
            # Distribute arrays evenly across NGROUPS_MAX groups
            group_name = f"group_{i % NGROUPS_MAX:02d}"
            dataset_name = f"array_{i:04d}"

            # require_group() opens the group, creating it on first use
            grp = f.require_group(group_name)

            # Store array with compression
            grp.create_dataset(dataset_name, data=arr, compression="gzip", shuffle=True)

            if (i + 1) % 10 == 0:
                elapsed = time.perf_counter() - start_time
                print(f"  Stored {i + 1}/{len(arrays)} arrays ({elapsed:.2f}s)")

        # Add some metadata
        f.attrs["n_arrays"] = len(arrays)
        f.attrs["peak_size_mb"] = PEAK_SIZE_MB
        # This is a real wall-clock timestamp, so time.time() is correct here
        f.attrs["benchmark_timestamp"] = time.time()
        f.attrs["n_groups"] = NGROUPS_MAX

    # Taken after the `with` block so that closing the file is included
    return time.perf_counter() - start_time
152
+
153
+
99
154
def measure_memory_usage (func , * args , ** kwargs ):
100
155
"""Measure memory usage of a function."""
101
156
print ("\n Measuring memory usage..." )
@@ -113,56 +168,117 @@ def wrapper():
113
168
return max_memory_mb , min_memory_mb , memory_increase_mb , mem_usage
114
169
115
170
116
- def print_statistics (sizes_mb , sizes_elements , total_time , memory_stats ):
117
- """Print benchmark statistics."""
118
- max_mem , min_mem , mem_increase , _ = memory_stats
119
-
120
- print ("\n " + "=" * 60 )
121
- print ("BENCHMARK RESULTS" )
122
- print ("=" * 60 )
123
-
124
- print (f"Configuration:" )
125
- print (f" Number of arrays: { N_ARRAYS } " )
126
- print (f" Peak size: { PEAK_SIZE_MB } MB" )
127
- print (f" Standard deviation: { STDDEV_MB } MB" )
128
- print (f" Output directory: { OUTPUT_DIR } " )
129
-
130
- print (f"\n Array size statistics:" )
131
- print (f" Mean size: { np .mean (sizes_mb ):.2f} MB" )
132
- print (f" Median size: { np .median (sizes_mb ):.2f} MB" )
133
- print (f" Min size: { np .min (sizes_mb ):.2f} MB" )
134
- print (f" Max size: { np .max (sizes_mb ):.2f} MB" )
135
- print (f" Total data: { np .sum (sizes_mb ):.2f} MB" )
136
-
137
- print (f"\n Performance metrics:" )
138
- print (f" Total time: { total_time :.2f} seconds" )
139
- print (f" Average time per array: { total_time / N_ARRAYS :.3f} seconds" )
140
- print (f" Throughput: { np .sum (sizes_mb ) / total_time :.2f} MB/s" )
141
-
142
- print (f"\n Memory usage:" )
143
- print (f" Baseline memory: { min_mem :.2f} MB" )
144
- print (f" Peak memory: { max_mem :.2f} MB" )
145
- print (f" Memory increase: { mem_increase :.2f} MB" )
146
-
147
- # Check final directory size
148
- if os .path .exists (OUTPUT_DIR ):
171
def get_storage_size(path):
    """Return the on-disk size of a file or directory in MB.

    One MB is taken as 1024 * 1024 bytes.  For a directory, the sizes of all
    files found by ``os.walk`` are summed.  Entries whose size cannot be read
    (for example a dangling symlink, or a file removed while walking) are
    skipped instead of aborting the whole measurement.

    Parameters
    ----------
    path : str
        File or directory to measure.

    Returns
    -------
    float
        Size in MB; ``0.0`` when ``path`` is neither a file nor a directory.
    """
    bytes_per_mb = 1024 * 1024

    if os.path.isfile(path):
        return os.path.getsize(path) / bytes_per_mb

    if os.path.isdir(path):
        total_size = 0
        for dirpath, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                try:
                    total_size += os.path.getsize(filepath)
                except OSError:
                    # Unreadable entry (e.g. broken symlink): ignore it
                    continue
        return total_size / bytes_per_mb

    # Always a float, so callers can format and divide uniformly
    return 0.0
183
+
154
184
155
- compressed_size_mb = total_size / (1024 * 1024 )
156
- compression_ratio = np .sum (sizes_mb ) / compressed_size_mb
185
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or ``inf`` when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else float("inf")


def _print_metric_row(label, tstore_value, h5py_value, precision):
    """Print one table row; an ``h5py_value`` of None renders the h5py and ratio cells as N/A."""
    row = f"{label:<25} {tstore_value:<15.{precision}f} "
    if h5py_value is None:
        row += f"{'N/A':<15} {'N/A':<12}"
    else:
        ratio = _safe_ratio(tstore_value, h5py_value)
        row += f"{h5py_value:<15.{precision}f} {ratio:<12.2f}"
    print(row)


def print_comparison_table(sizes_mb, tstore_results, h5py_results):
    """Print a comparison table of TreeStore vs h5py results.

    Parameters
    ----------
    sizes_mb : array-like
        Sizes of the stored arrays in MB; their sum is the uncompressed
        total used for the throughput and compression-ratio rows.
    tstore_results : tuple
        ``(time_seconds, memory_stats, storage_mb)`` for TreeStore.  Index 0
        of ``memory_stats`` is printed as peak memory and index 2 as the
        memory increase.
    h5py_results : tuple or None
        Same layout for h5py; a falsy value prints ``N/A`` in the h5py and
        ratio columns.

    Notes
    -----
    Every division goes through ``_safe_ratio``, so a zero elapsed time or a
    zero storage size (e.g. the output path is missing) is shown as ``inf``
    instead of raising ``ZeroDivisionError``.
    """
    total_data_mb = np.sum(sizes_mb)

    print("\n" + "=" * 80)
    print("PERFORMANCE COMPARISON: TreeStore vs h5py")
    print("=" * 80)

    # Configuration info
    print("Configuration:")
    print(f"  Arrays: {N_ARRAYS:,} | Peak size: {PEAK_SIZE_MB} MB | Total data: {total_data_mb:.1f} MB")
    print()

    # Extract results
    tstore_time, tstore_memory, tstore_storage = tstore_results
    has_h5py = bool(h5py_results)
    if has_h5py:
        h5py_time, h5py_memory, h5py_storage = h5py_results

    # Table header
    print(f"{'Metric':<25} {'TreeStore':<15} {'h5py':<15} {'Ratio (T/H)':<12}")
    print("-" * 70)

    # Time metrics
    _print_metric_row("Total time (s)", tstore_time, h5py_time if has_h5py else None, 2)
    _print_metric_row(
        "Throughput (MB/s)",
        _safe_ratio(total_data_mb, tstore_time),
        _safe_ratio(total_data_mb, h5py_time) if has_h5py else None,
        1,
    )
    print()

    # Memory metrics
    _print_metric_row("Peak memory (MB)", tstore_memory[0], h5py_memory[0] if has_h5py else None, 1)
    _print_metric_row("Memory increase (MB)", tstore_memory[2], h5py_memory[2] if has_h5py else None, 1)
    print()

    # Storage metrics
    _print_metric_row("Storage size (MB)", tstore_storage, h5py_storage if has_h5py else None, 1)
    _print_metric_row(
        "Compression ratio",
        _safe_ratio(total_data_mb, tstore_storage),
        _safe_ratio(total_data_mb, h5py_storage) if has_h5py else None,
        2,
    )
    print()

    # Summary
    print("Summary:")
    if not has_h5py:
        print("  h5py not available for comparison")
        return

    if tstore_time < h5py_time:
        print(f"  TreeStore is {_safe_ratio(h5py_time, tstore_time):.1f}x faster")
    else:
        print(f"  h5py is {_safe_ratio(tstore_time, h5py_time):.1f}x faster")

    if tstore_storage < h5py_storage:
        print(f"  TreeStore uses {_safe_ratio(h5py_storage, tstore_storage):.1f}x less storage")
    else:
        print(f"  h5py uses {_safe_ratio(tstore_storage, h5py_storage):.1f}x less storage")
161
277
162
278
163
279
def main ():
164
280
"""Run the benchmark."""
165
- print ("TreeStore Large Array Benchmark" )
281
+ print ("TreeStore vs h5py Large Array Benchmark" )
166
282
print ("=" * 60 )
167
283
168
284
# Set random seed for reproducibility
@@ -177,19 +293,37 @@ def main():
177
293
# Create test arrays
178
294
arrays = create_test_arrays (sizes_elements )
179
295
180
- # Measure memory usage during storage
181
- memory_stats = measure_memory_usage (store_arrays_in_treestore , arrays , OUTPUT_DIR )
182
- total_time = memory_stats [0 ] # This will be overwritten, we need the actual time
183
-
184
- # Get the actual timing by running the storage function again
185
- # (memory_profiler doesn't return the function result easily)
186
- print ("\n Running final timing measurement..." )
187
- actual_time = store_arrays_in_treestore (arrays , OUTPUT_DIR )
188
-
189
- # Print results
190
- print_statistics (sizes_mb , sizes_elements , actual_time , memory_stats )
191
-
192
- print (f"\n Benchmark completed. Results saved to: { OUTPUT_DIR } " )
296
+ # Benchmark TreeStore
297
+ print ("\n " + "=" * 60 )
298
+ print ("BENCHMARKING TreeStore" )
299
+ print ("=" * 60 )
300
+ tstore_memory_stats = measure_memory_usage (store_arrays_in_treestore , arrays , OUTPUT_DIR_TSTORE )
301
+ tstore_time = store_arrays_in_treestore (arrays , OUTPUT_DIR_TSTORE )
302
+ tstore_storage_size = get_storage_size (OUTPUT_DIR_TSTORE )
303
+ tstore_results = (tstore_time , tstore_memory_stats , tstore_storage_size )
304
+
305
+ # Benchmark h5py if available
306
+ h5py_results = None
307
+ if HAS_H5PY :
308
+ print ("\n " + "=" * 60 )
309
+ print ("BENCHMARKING h5py" )
310
+ print ("=" * 60 )
311
+ h5py_memory_stats = measure_memory_usage (store_arrays_in_h5py , arrays , OUTPUT_FILE_H5PY )
312
+ h5py_time = store_arrays_in_h5py (arrays , OUTPUT_FILE_H5PY )
313
+ h5py_storage_size = get_storage_size (OUTPUT_FILE_H5PY )
314
+ h5py_results = (h5py_time , h5py_memory_stats , h5py_storage_size )
315
+ else :
316
+ print ("\n " + "=" * 60 )
317
+ print ("h5py not available - skipping h5py benchmark" )
318
+ print ("=" * 60 )
319
+
320
+ # Print comparison table
321
+ print_comparison_table (sizes_mb , tstore_results , h5py_results )
322
+
323
+ print (f"\n Benchmark completed." )
324
+ print (f"TreeStore results saved to: { OUTPUT_DIR_TSTORE } " )
325
+ if HAS_H5PY :
326
+ print (f"h5py results saved to: { OUTPUT_FILE_H5PY } " )
193
327
194
328
195
329
if __name__ == "__main__" :
0 commit comments