
Commit b1c1ad4

add ability to handle direct memory allocated datasets in addition to shared memory
1 parent 78750c1 commit b1c1ad4

File tree

6 files changed, +42 -25 lines changed

docs/benchmarks/ebm-benchmark.ipynb

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@
 "force_recreate = False\n",
 "exist_ok = True\n",
 "TIMEOUT_SEC = 60 * 60 * 24 * 180 # 180 days\n",
-"wheel_filepaths = ['interpret_core-0.6.13-py3-none-any.whl', 'powerlift-0.1.12-py3-none-any.whl']\n",
+"wheel_filepaths = ['interpret_core-0.6.15-py3-none-any.whl', 'powerlift-0.1.12-py3-none-any.whl']\n",
 "\n",
 "import datetime\n",
 "experiment_name = datetime.datetime.now().strftime('%Y_%m_%d_%H%M__') + 'myexperiment'\n",

python/interpret-core/interpret/glassbox/_ebm/_boost.py

Lines changed: 8 additions & 5 deletions

@@ -17,7 +17,7 @@ def boost(
     shm_name,
     bag_idx,
     callback,
-    dataset_name,
+    dataset,
     intercept_rounds,
     intercept_learning_rate,
     intercept,
@@ -56,10 +56,12 @@ def boost(
 ):
     try:
         develop._develop_options = develop_options  # restore these in this process
+        shared_dataset = None
         try:
-            shared_dataset = shared_memory.SharedMemory(name=dataset_name)
-            # we do not know the length of the dataset, so we create a 1-element array
-            dataset = np.ndarray(1, dtype=np.ubyte, buffer=shared_dataset.buf)
+            if isinstance(dataset, str):  # if str it is shared memory
+                shared_dataset = shared_memory.SharedMemory(name=dataset)
+                # we do not know the length of the dataset, so we create a 1-element array
+                dataset = np.ndarray(1, dtype=np.ubyte, buffer=shared_dataset.buf)
 
             shm = None
             try:
@@ -372,6 +374,7 @@ def boost(
                 if shm is not None:
                     shm.close()
         finally:
-            shared_dataset.close()
+            if shared_dataset is not None:
+                shared_dataset.close()
     except Exception as e:
        return e, None, None, None, None
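Net effect in boost: the dataset parameter is now polymorphic. A str names a shared-memory block to attach to; anything else is an already materialized numpy byte array and is used directly. A minimal sketch of the dispatch, factored into a hypothetical resolve_dataset helper that is not part of the commit:

from multiprocessing import shared_memory

import numpy as np


def resolve_dataset(dataset):
    """Return (dataset view, SharedMemory handle or None)."""
    shared_dataset = None
    if isinstance(dataset, str):  # a str is a shared-memory name
        shared_dataset = shared_memory.SharedMemory(name=dataset)
        # The true length is unknown here; a 1-element ubyte view suffices,
        # presumably because the native code reads the real size from the
        # dataset header inside the buffer.
        dataset = np.ndarray(1, dtype=np.ubyte, buffer=shared_dataset.buf)
    return dataset, shared_dataset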

python/interpret-core/interpret/glassbox/_ebm/_ebm.py

Lines changed: 16 additions & 9 deletions

@@ -1166,7 +1166,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
                 shm_name,
                 idx,
                 callback,
-                shared.name,
+                shared.name if shared.name is not None else shared.dataset,
                 n_intercept_rounds,
                 develop.get_option("intercept_learning_rate"),
                 bagged_intercept[idx],
@@ -1292,7 +1292,9 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
             (
                 shm_name,
                 idx,
-                shared.name,
+                shared.name
+                if shared.name is not None
+                else shared.dataset,
                 bagged_intercept[idx],
                 internal_bags[idx],
                 scores_bags[idx],
@@ -1434,7 +1436,9 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
                 shm_name,
                 idx,
                 callback,
-                shared.name,
+                shared.name
+                if shared.name is not None
+                else shared.dataset,
                 0,  # intercept should already be close for pairs
                 0.0,  # intercept should already be close for pairs
                 bagged_intercept[idx],
@@ -1570,7 +1574,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
             None,
             0,
             None,
-            shared.name,
+            shared.dataset,
             develop.get_option("n_intercept_rounds_final"),
             develop.get_option("intercept_learning_rate"),
             np.zeros(n_scores, np.float64),
@@ -1741,8 +1745,9 @@ def estimate_mem(self, X):
         # One shared memory copy of the data mapped into all processes, plus a copy of
         # the test and train data for each outer bag. Assume all processes are started
         # at some point and are eating up memory.
-
-        max_bytes = n_bytes_mains + n_bytes_mains * self.outer_bags
+        # When we cannot use shared memory the parent has a copy of the dataset and
+        # all the children share one copy.
+        max_bytes = n_bytes_mains + n_bytes_mains + n_bytes_mains * self.outer_bags
 
         n_features_in = len(bins)
 
@@ -1766,7 +1771,7 @@ def estimate_mem(self, X):
         # each outer bag makes a copy of the features. Only the training features
         # are kept for interaction detection, but don't estimate that for now.
         interaction_detection_bytes = (
-            n_bytes_pairs + n_bytes_pairs * self.outer_bags
+            n_bytes_pairs + n_bytes_pairs + n_bytes_pairs * self.outer_bags
         )
 
         max_bytes = max(max_bytes, interaction_detection_bytes)
@@ -1777,8 +1782,10 @@ def estimate_mem(self, X):
         # then the resulting data storage should take approx 14 bits in total,
         # so as a loose approximation we can add the bits in a pair.
         interaction_multiple *= 2.0
-        interaction_boosting_bytes = n_bytes_pairs + int(
-            n_bytes_pairs * interaction_multiple * self.outer_bags
+        interaction_boosting_bytes = (
+            n_bytes_pairs
+            + n_bytes_pairs
+            + int(n_bytes_pairs * interaction_multiple * self.outer_bags)
         )
 
         max_bytes = max(max_bytes, interaction_boosting_bytes)
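The arithmetic change in estimate_mem adds one extra copy of the dataset to each worst-case term: with shared memory the estimate was one mapped copy plus one train/test copy per outer bag; with direct allocation the parent keeps its own copy and the children collectively account for another. A worked illustration with invented sizes:

# Worked illustration of the revised estimate; the byte counts are
# invented, only the formula mirrors the updated estimate_mem().
n_bytes_mains = 100_000_000  # hypothetical compressed dataset size
outer_bags = 8

# before: one shared-memory copy mapped into all processes + one copy per bag
old_estimate = n_bytes_mains + n_bytes_mains * outer_bags  # 900,000,000

# after: parent copy + one copy shared by the children + one copy per bag
new_estimate = n_bytes_mains + n_bytes_mains + n_bytes_mains * outer_bags  # 1,000,000,000

assert new_estimate - old_estimate == n_bytes_mains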

python/interpret-core/interpret/utils/_compressed_dataset.py

Lines changed: 8 additions & 4 deletions

@@ -102,11 +102,15 @@ def bin_native(
         raise ValueError(msg)
 
     if shared is not None:
-        shared_mem = shared_memory.SharedMemory(create=True, size=n_bytes, name=None)
-        shared.shared_memory = shared_mem
-        shared.name = shared_mem.name
+        # shared_mem = shared_memory.SharedMemory(create=True, size=n_bytes, name=None)
+        # shared.shared_memory = shared_mem
+        # shared.name = shared_mem.name
+        # dataset = np.ndarray(n_bytes, dtype=np.ubyte, buffer=shared_mem.buf)
+
+        # Large amounts of shared memory cannot be allocated inside docker images
+        # so for now allocate a normal numpy array.
+        dataset = np.empty(n_bytes, np.ubyte)
 
-        dataset = np.ndarray(n_bytes, dtype=np.ubyte, buffer=shared_mem.buf)
         shared.dataset = dataset
 
     native.fill_dataset_header(len(feature_idxs), n_weights, 1, dataset)
python/interpret-core/interpret/utils/_measure_interactions.py

Lines changed: 1 addition & 1 deletion

@@ -270,7 +270,7 @@ def measure_interactions(
     ranked_interactions = rank_interactions(
         None,
         0,
-        dataset_name=shared.name,
+        dataset=shared.name if shared.name is not None else shared.dataset,
         intercept=None,
         bag=None,
         init_scores=init_score,
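Every call site now applies the same selection: pass shared.name when a shared-memory block was created, otherwise pass the array itself so child processes receive a pickled copy. A hypothetical one-line helper expressing the repeated pattern (not present in the commit):

def dataset_argument(shared):
    # `shared` is assumed to carry .name (str or None) and .dataset
    # (np.ndarray), mirroring what bin_native populates.
    return shared.name if shared.name is not None else shared.dataset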

python/interpret-core/interpret/utils/_rank_interactions.py

Lines changed: 8 additions & 5 deletions

@@ -20,7 +20,7 @@
 def rank_interactions(
     shm_name,
     bag_idx,
-    dataset_name,
+    dataset,
     intercept,
     bag,
     init_scores,
@@ -44,10 +44,12 @@ def rank_interactions(
     try:
         develop._develop_options = develop_options  # restore these in this process
 
+        shared_dataset = None
         try:
-            shared_dataset = shared_memory.SharedMemory(name=dataset_name)
-            # we do not know the length of the dataset, so we create a 1-element array
-            dataset = np.ndarray(1, dtype=np.ubyte, buffer=shared_dataset.buf)
+            if isinstance(dataset, str):  # if str it is shared memory
+                shared_dataset = shared_memory.SharedMemory(name=dataset)
+                # we do not know the length of the dataset, so we create a 1-element array
+                dataset = np.ndarray(1, dtype=np.ubyte, buffer=shared_dataset.buf)
 
             shm = None
             try:
@@ -100,6 +102,7 @@ def rank_interactions(
                 if shm is not None:
                     shm.close()
         finally:
-            shared_dataset.close()
+            if shared_dataset is not None:
+                shared_dataset.close()
     except Exception as e:
         return e
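Both worker entry points share the same guarded-cleanup shape: initialize shared_dataset = None before the try, attach only when the argument is a shared-memory name, and close the handle in finally only if one was opened (close() detaches this process's mapping; unlinking remains the creator's responsibility). A minimal self-contained sketch, with the worker body elided and the name worker being illustrative:

from multiprocessing import shared_memory

import numpy as np


def worker(dataset):
    shared_dataset = None  # stays None when a direct array is passed in
    try:
        if isinstance(dataset, str):  # a str names a shared-memory block
            shared_dataset = shared_memory.SharedMemory(name=dataset)
            dataset = np.ndarray(1, dtype=np.ubyte, buffer=shared_dataset.buf)
        ...  # work against `dataset` here
    finally:
        if shared_dataset is not None:
            shared_dataset.close()  # detach; the creator unlinks separately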
