Commit 0e83d96
add option to include X, y, and fixed memory in the memory estimation
1 parent 7241c8f commit 0e83d96

File tree

6 files changed: 151 additions & 27 deletions

python/interpret-core/interpret-core.pyproj

Lines changed: 1 addition & 0 deletions

@@ -84,6 +84,7 @@
     <Compile Include="interpret\utils\_compressed_dataset.py" />
     <Compile Include="interpret\utils\_histogram.py" />
     <Compile Include="interpret\utils\_link.py" />
+    <Compile Include="interpret\utils\_measure_mem.py" />
     <Compile Include="interpret\utils\_misc.py" />
     <Compile Include="interpret\utils\_purify.py" />
     <Compile Include="interpret\utils\_rank_interactions.py" />

python/interpret-core/interpret/glassbox/_ebm/_ebm.py

Lines changed: 58 additions & 13 deletions

@@ -72,6 +72,7 @@
     remove_extra_bins,
 )
 from ...utils._shared_dataset import SharedDataset
+from ...utils._measure_mem import total_bytes

 _log = logging.getLogger(__name__)

@@ -496,8 +497,8 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
         """Fit model to provided samples.

         Args:
-            X: NumPy array for training samples.
-            y: NumPy array as training labels.
+            X: {array-like, sparse matrix} of shape (n_samples, n_features). Training data.
+            y: array-like of shape (n_samples,). Target values.
             sample_weight: Optional array of weights per sample. Should be same length as X and y.
             bags: Optional bag definitions. The first dimension should have length equal to the number of samples.
                 The second dimension should have length equal to the number of outer_bags. The contents should be
@@ -1695,18 +1696,33 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):

         return self

-    def estimate_mem(self, X, y=None):
+    def estimate_mem(self, X, y=None, data_multiplier=0.0):
         """Estimate memory usage of the model.
         Args:
-            X: dataset
+            X: {array-like, sparse matrix} of shape (n_samples, n_features). Training data.
+            y: array-like of shape (n_samples,). Target values.
+            data_multiplier: The data in X needs to be allocated by the caller.
+                If data_multiplier is set to 0.0 then this function only estimates the additional
+                memory consumed by the fit function. If data_multiplier is set to 1.0 then
+                it will include the memory allocated to X by the caller. Often the caller will make
+                copies of X before calling fit, and in that case the data_multiplier could be set to a
+                value above 1.0 if the caller would like this function to include that in the memory estimate.
+
         Returns:
             Estimated memory usage in bytes.
             The estimate does not include the memory from the
             caller's copy of X, nor the process's code or other data.
             The estimate will be more accurate for larger datasets.
         """

+        n_bytes = total_bytes(X)
+        if y is not None:
+            n_bytes += total_bytes(y)
+
+        n_bytes = int(n_bytes * data_multiplier)
+
         if y is not None:
+            y_id = id(y)
             n_classes = Native.Task_Unknown
             y = clean_dimensions(y, "y")
             if y.ndim != 1:
@@ -1757,10 +1773,18 @@ def estimate_mem(self, X, y=None):
                 _log.error(msg)
                 raise ValueError(msg)

+            if y_id != id(y):
+                # in fit we'll also make a copy of y that cannot be deleted until the end
+                n_bytes += total_bytes(y)
+
         n_samples = None if y is None else len(y)
+        X_id = id(X)
         X, n_samples = preclean_X(
             X, self.feature_names, self.feature_types, n_samples, "y"
         )
+        if X_id != id(X):
+            # a copy was made, and we'll need to also do this in fit, so add the new memory too
+            n_bytes += total_bytes(X)

         if y is None:
             n_classes = Native.Task_Regression
@@ -1794,11 +1818,19 @@ def estimate_mem(self, X, y=None):
             feature_types_in,
             None,
         )
-
-        bin_lengths = [
-            len(x[0]) + 2 if isinstance(x[0], dict) else len(x[0]) + 3 for x in bins
-        ]
-        n_tensor_bytes = sum(bin_lengths) * np.float64().nbytes * self.outer_bags * 2
+        # first calculate the number of cells in the mains for all features
+        n_tensor_bytes = sum(
+            2
+            if len(x[0]) == 0
+            else max(x[0].values()) + 2
+            if isinstance(x[0], dict)
+            else len(x[0]) + 3
+            for x in bins
+            if len(x) != 0
+        )
+        # We have 2 copies of the update tensors in C++ (current and best) and we extract
+        # one more in python for the update before tearing down the C++ data.
+        n_tensor_bytes = n_tensor_bytes * np.float64().nbytes * self.outer_bags * 3

         # One shared memory copy of the data mapped into all processes, plus a copy of
         # the test and train data for each outer bag. Assume all processes are started
@@ -1831,6 +1863,19 @@ def estimate_mem(self, X, y=None):
             None,
         )

+        bin_lengths = [x[0] if len(x) == 1 else x[1] for x in bins if len(x) != 0]
+        bin_lengths = [
+            2
+            if len(x) == 0
+            else max(x.values()) + 2
+            if isinstance(x, dict)
+            else len(x) + 3
+            for x in bin_lengths
+        ]
+        bin_lengths.sort()
+        # we use the 75th percentile bin length to estimate the number of bins
+        n_bad_case_bins = bin_lengths[len(bin_lengths) // 4 * 3]
+
         # each outer bag makes a copy of the features. Only the training features
         # are kept for interaction detection, but don't estimate that for now.
         interaction_detection_bytes = (
@@ -1839,15 +1884,15 @@ def estimate_mem(self, X, y=None):

         max_bytes = max(max_bytes, interaction_detection_bytes)

-        bin_lengths.sort()
-        n_bad_case_bins = bin_lengths[len(bin_lengths) // 4 * 3]
+        # We have 2 copies of the update tensors in C++ (current and best) and we extract
+        # one more in python for the update before tearing down the C++ data.
         interaction_boosting_bytes = (
             n_bad_case_bins
             * n_bad_case_bins
             * np.float64().nbytes
             * self.outer_bags
             * interactions
-            * 2
+            * 3
         )

         # We merge the interactions together to make a combined interaction
@@ -1866,7 +1911,7 @@ def estimate_mem(self, X, y=None):

         max_bytes = max(max_bytes, interaction_boosting_bytes)

-        return max_bytes
+        return int(n_bytes + max_bytes)

     def to_jsonable(self, detail="all"):
         """Convert the model to a JSONable representation.

python/interpret-core/interpret/utils/_clean_simple.py

Lines changed: 3 additions & 3 deletions

@@ -86,7 +86,7 @@ def clean_dimensions(data, param_name):
         data = np.array(data, np.object_)
     elif callable(getattr(data, "__array__", None)):
         data = data.__array__()
-    elif isinstance(data, str):
+    elif isinstance(data, (str, bytes)):
         # we have just 1 item, so re-pack it and return
         ret = np.empty(1, np.object_)
         ret[0] = data
@@ -131,7 +131,7 @@ def clean_dimensions(data, param_name):
     while idx < n:
         item = data[idx]

-        if isinstance(item, str):
+        if isinstance(item, (str, bytes)):
             if n_second_dim is not None and n_second_dim != 1:
                 msg = (
                     f"{param_name} is not consistent in length for the second dimension"
@@ -180,7 +180,7 @@ def clean_dimensions(data, param_name):
         while sub_idx < n_items:
             subitem = item[sub_idx]

-            if isinstance(subitem, str):
+            if isinstance(subitem, (str, bytes)):
                 sub_idx = sub_idx + 1
                 continue
python/interpret-core/interpret/utils/_clean_x.py

Lines changed: 4 additions & 4 deletions

@@ -650,7 +650,7 @@ def _encode_pandas_categorical_initial(X_col, pd_categories, is_ordered, processing):
             _log.error(msg)
             raise ValueError(msg)
     else:
-        if isinstance(processing, str):
+        if isinstance(processing, (str, bytes)):
             # isinstance(, str) also works for np.str_

             # don't allow strings to get to the for loop below
@@ -1133,7 +1133,7 @@ def _process_dict_column(X_col, is_initial, feature_type, min_unique_continuous):
         raise ValueError(msg)
     elif isinstance(X_col, _list_tuple_types):
         X_col = np.array(X_col, np.object_)
-    elif isinstance(X_col, str):
+    elif isinstance(X_col, (str, bytes)):
         # isinstance(, str) also works for np.str_

         # don't allow strings to get to the np.array conversion below
@@ -1814,7 +1814,7 @@ def preclean_X(X, feature_names, feature_types, n_samples=None, sample_source="y"):
         msg = "X cannot be None"
         _log.error(msg)
         raise TypeError(msg)
-    elif isinstance(X, str):
+    elif isinstance(X, (str, bytes)):
         # str objects are iterable, so don't allow them to get to the list() conversion below
         # isinstance(, str) also works for np.str_
         msg = "X cannot be a str type"
@@ -1900,7 +1900,7 @@ def preclean_X(X, feature_names, feature_types, n_samples=None, sample_source="y"):
             is_copied = True
             X = list(X)
             X[idx] = _reshape_1D_if_possible(sample)
-        elif isinstance(sample, str):
+        elif isinstance(sample, (str, bytes)):
             # isinstance(, str) also works for np.str_
             break  # this is only legal if we have one sample
         else:
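
An illustrative aside on why these checks, both in _clean_simple.py and _clean_x.py, now group bytes with str: like str, bytes is iterable, so without the guard a single bytes value would be decomposed by the generic sequence handling:

list("ab")                       # ['a', 'b']
list(b"ab")                      # [97, 98] -- a lone bytes label would decay into ints
isinstance(b"ab", (str, bytes))  # True, so it is treated as one scalar item instead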
python/interpret-core/interpret/utils/_measure_mem.py

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
+# Copyright (c) 2023 The InterpretML Contributors
+# Distributed under the MIT software license
+
+from collections.abc import Iterable
+import sys
+from ._misc import safe_isinstance
+import numpy as np
+from typing import Any
+
+
+def total_bytes(obj: Any) -> int:
+    n_bytes = 0
+    items = [obj]
+
+    seen_ids = set()
+    while items:
+        item = items.pop()
+
+        obj_id = id(item)
+        if obj_id in seen_ids:
+            continue
+        seen_ids.add(obj_id)
+
+        if safe_isinstance(item, "pandas.DataFrame"):
+            n_bytes += item.memory_usage().sum()
+            # pandas only includes the pointer to the object but not the object
+            for col in item.select_dtypes(include=["object"]):
+                for val in item[col]:
+                    try:
+                        n_bytes += sys.getsizeof(val)
+                    except Exception:
+                        pass
+                    if isinstance(val, Iterable) and not isinstance(val, (str, bytes)):
+                        try:
+                            items.extend(val)
+                        except Exception:
+                            pass
+        elif safe_isinstance(item, "pandas.Series"):
+            n_bytes += item.memory_usage()
+            if item.dtype == "O":
+                for val in item:
+                    try:
+                        n_bytes += sys.getsizeof(val)
+                    except Exception:
+                        pass
+                    if isinstance(val, Iterable) and not isinstance(val, (str, bytes)):
+                        try:
+                            items.extend(val)
+                        except Exception:
+                            pass
+        elif isinstance(item, np.ndarray):
+            n_bytes += item.nbytes
+            if item.dtype == "O":
+                items.extend(item.flat)
+        elif safe_isinstance(item, "scipy.sparse.spmatrix") or safe_isinstance(
+            item, "scipy.sparse.sparray"
+        ):
+            n_bytes += item.data.nbytes + item.indptr.nbytes + item.indices.nbytes
+        elif isinstance(item, dict):
+            try:
+                n_bytes += sys.getsizeof(item)
+            except Exception:
+                pass
+            items.extend(item.values())
+            items.extend(item.keys())
+        else:
+            try:
+                n_bytes += sys.getsizeof(item)
+            except Exception:
+                pass
+            if isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
+                try:
+                    items.extend(item)
+                except Exception:
+                    pass
+
+    return int(n_bytes)
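
A short illustration (not part of the commit) of what total_bytes measures compared to sys.getsizeof, which only sees the outermost container:

import sys
import numpy as np
from interpret.utils._measure_mem import total_bytes

arr = np.zeros((1000, 10), dtype=np.float64)
print(arr.nbytes)        # 80000 bytes of array data
print(total_bytes(arr))  # 80000: the ndarray branch counts item.nbytes

nested = {"a": arr, "b": ["x" * 100] * 5}
print(sys.getsizeof(nested))  # dict header only; the array and strings are missed
print(total_bytes(nested))    # walks keys and values, deduplicating by id(), so
                              # the 5 references to the same str are counted once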

python/interpret-core/tests/glassbox/ebm/test_ebm.py

Lines changed: 8 additions & 7 deletions

@@ -1386,12 +1386,13 @@ def __call__(self, bag_index, step_index, progress, metric):


 def test_estimate_mem():
-    X, y, names, types = make_synthetic(seed=42, output_type="float", n_samples=10000)
+    for t in ["object", "pandas", "str", "float", "csc_matrix", "csc_array"]:
+        X, y, names, types = make_synthetic(seed=42, output_type=t, n_samples=10000)

-    ebm = ExplainableBoostingClassifier(names, types, interactions=[])
-    n_bytes = ebm.estimate_mem(X, y)
-    # print(n_bytes)
+        ebm = ExplainableBoostingClassifier(names, types, interactions=[])
+        n_bytes_classifier = ebm.estimate_mem(X, y, 1.0)

-    ebm = ExplainableBoostingClassifier(names, types)
-    n_bytes = ebm.estimate_mem(X, y)
-    # print(n_bytes)
+        ebm = ExplainableBoostingClassifier(names, types)
+        n_bytes_regressor = ebm.estimate_mem(X, y, 1.0)
+
+        # print(f"datatype={t}, bytes_classifier[mains]={n_bytes_classifier}, bytes_regressor[+interactions]={n_bytes_regressor}")
