Commit f7a55ed

Speed up MCMC by passing data less often
The concept is presented here: https://emcee.readthedocs.io/en/stable/tutorials/parallel/#parallel . We're frequently passing the same data to the log posterior (every time the model is called, per emcee), which is quite slow. The workaround is to create a separate log_posterior module and then initialize each process with global variables containing the args for calculating the log posterior. It's not great coding convention, but it appears to be a ~3x speed-up, so it's a good trade-off.
1 parent d5a4c73 commit f7a55ed
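For reference, a minimal, self-contained sketch of the pattern from the emcee tutorial (the names _init_worker, _data, and log_prob here are illustrative, not from this repository): the Pool initializer pickles the data once per worker process at startup, whereas passing it through the sampler's args re-pickles it on every log-probability call.

import multiprocessing

import numpy as np

_data = None  # set once per worker process by the Pool initializer

def _init_worker(data):
    # Stash the (potentially large) data in a module-level global,
    # so it is sent to each worker once rather than on every call
    global _data
    _data = data

def log_prob(theta):
    # Read _data from the worker's globals; nothing is re-passed per call
    return -0.5 * np.sum((theta - _data) ** 2)

if __name__ == '__main__':
    data = np.random.default_rng(42).normal(size=5000)
    ctx = multiprocessing.get_context('spawn')
    with ctx.Pool(initializer=_init_worker, initargs=(data,)) as pool:
        print(pool.map(log_prob, [np.zeros(5000) for _ in range(4)]))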

File tree

2 files changed: +151 -110 lines changed
src/bayesian_inference/log_posterior.py (new file)

Lines changed: 147 additions & 0 deletions

@@ -0,0 +1,147 @@
"""Define the likelihood separately for performance reasons.

In doing so, we can use global variables. This isn't a nice thing to do, but it may improve MCMC performance
during multiprocessing.
"""

import logging

import numpy as np
from scipy.linalg import lapack

from bayesian_inference import emulation

logger = logging.getLogger(__name__)


min = None
max = None
emulation_config = None
emulation_results = None
experimental_results = None
emulator_cov_unexplained = None


def initialize_pool_variables(local_min, local_max, local_emulation_config, local_emulation_results, local_experimental_results, local_emulator_cov_unexplained) -> None:
    """Store the args needed by log_posterior() as module-level globals in each worker process."""
    global min
    global max
    global emulation_config
    global emulation_results
    global experimental_results
    global emulator_cov_unexplained
    min = local_min
    max = local_max
    emulation_config = local_emulation_config
    emulation_results = local_emulation_results
    experimental_results = local_experimental_results
    emulator_cov_unexplained = local_emulator_cov_unexplained


#---------------------------------------------------------------
def log_posterior(X):
    """
    Evaluate the log-posterior for a given set of input parameters.

    This function is called by https://emcee.readthedocs.io/en/stable/user/sampler/

    The remaining inputs -- the min/max boundaries for each emulator parameter, the
    emulation configuration object, the dict of emulation groups, the arrays of
    experimental results, and the unexplained emulator covariance -- are read from
    the module-level globals set by initialize_pool_variables().

    :param X: input ndarray of parameter space values
    """
    # Convert to 2darray of shape (n_samples, n_parameters)
    X = np.array(X, copy=False, ndmin=2)

    # Initialize log-posterior array, which we will populate and return
    log_posterior = np.zeros(X.shape[0])

    # Check if any samples are outside the parameter bounds, and set log-posterior to -inf for those
    inside = np.all((X > min) & (X < max), axis=1)
    log_posterior[~inside] = -np.inf

    # Evaluate log-posterior for samples inside parameter bounds
    n_samples = np.count_nonzero(inside)
    n_features = experimental_results['y'].shape[0]

    if n_samples > 0:

        # Get experimental data
        data_y = experimental_results['y']
        data_y_err = experimental_results['y_err']

        # Compute emulator prediction
        # Returns dict of matrices of emulator predictions:
        #   emulator_predictions['central_value'] -- (n_samples, n_features)
        #   emulator_predictions['cov'] -- (n_samples, n_features, n_features)
        emulator_predictions = emulation.predict(X[inside], emulation_config,
                                                 emulation_group_results=emulation_results,
                                                 emulator_cov_unexplained=emulator_cov_unexplained)

        # Construct array to store the difference between emulator prediction and experimental data
        # (using broadcasting to subtract each data point from each emulator prediction)
        assert data_y.shape[0] == emulator_predictions['central_value'].shape[1]
        dY = emulator_predictions['central_value'] - data_y

        # Construct the covariance matrix
        # TODO: include full experimental data covariance matrix -- currently we only include uncorrelated data uncertainty
        #-------------------------
        covariance_matrix = np.zeros((n_samples, n_features, n_features))
        covariance_matrix += emulator_predictions['cov']
        covariance_matrix += np.diag(data_y_err**2)

        # Compute log likelihood at each point in the sample
        # We take constant priors, so the log-likelihood is just the log-posterior
        # (since above we set the log-posterior to -inf for samples outside the parameter bounds)
        log_posterior[inside] += list(map(_loglikelihood, dY, covariance_matrix))

    return log_posterior

#---------------------------------------------------------------
def _loglikelihood(y, cov):
    """
    Evaluate the multivariate-normal log-likelihood for difference vector `y`
    and covariance matrix `cov`:

        log_p = -1/2*[(y^T).(C^-1).y + log(det(C))] + const.

    The likelihood is NOT NORMALIZED, since this does not affect MCMC.
    The normalization const = -n/2*log(2*pi), where n is the dimensionality.

    Arguments `y` and `cov` MUST be np.arrays with dtype == float64 and shapes
    (n) and (n, n), respectively. These requirements are NOT CHECKED.

    The calculation follows algorithm 2.1 in Rasmussen and Williams (Gaussian
    Processes for Machine Learning).
    """
    # Compute the Cholesky decomposition of the covariance.
    # Use bare LAPACK function to avoid scipy.linalg wrapper overhead.
    L, info = lapack.dpotrf(cov, clean=False)

    if info < 0:
        raise ValueError(
            'lapack dpotrf error: '
            'the {}-th argument had an illegal value'.format(-info)
        )
    elif info > 0:
        raise np.linalg.LinAlgError(
            'lapack dpotrf error: '
            'the leading minor of order {} is not positive definite'
            .format(info)
        )

    # Solve for alpha = cov^-1.y using the Cholesky decomp.
    alpha, info = lapack.dpotrs(L, y)

    if info != 0:
        raise ValueError(
            'lapack dpotrs error: '
            'the {}-th argument had an illegal value'.format(-info)
        )

    return -.5*np.dot(y, alpha) - np.log(L.diagonal()).sum()
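As a quick sanity check (not part of this commit, and with made-up test values), the unnormalized _loglikelihood should agree with scipy.stats.multivariate_normal.logpdf up to the dropped constant n/2*log(2*pi):

import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.default_rng(0)
n = 4
A = rng.normal(size=(n, n))
cov = A @ A.T + n * np.eye(n)   # symmetric positive-definite covariance
y = rng.normal(size=n)

# _loglikelihood omits only the -n/2*log(2*pi) normalization term
ref = multivariate_normal(mean=np.zeros(n), cov=cov).logpdf(y)
assert np.isclose(_loglikelihood(y, cov), ref + 0.5 * n * np.log(2 * np.pi))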

src/bayesian_inference/mcmc.py

Lines changed: 4 additions & 110 deletions
@@ -21,11 +21,11 @@
 import emcee
 import multiprocessing
 import numpy as np
-from scipy.linalg import lapack

 from bayesian_inference import common_base
 from bayesian_inference import data_IO
 from bayesian_inference import emulation
+from bayesian_inference import log_posterior

 logger = logging.getLogger(__name__)

@@ -75,13 +75,13 @@ def run_mcmc(config, closure_index=-1):
     # NOTE: We use `get_context` here to avoid having to globally specify the context. Plus, it then should be fine
     # to repeatedly call this function. (`set_context` can only be called once - otherwise, it's a runtime error.)
     ctx = multiprocessing.get_context('spawn')
-    with ctx.Pool() as pool:
+    with ctx.Pool(initializer=log_posterior.initialize_pool_variables, initargs=[min, max, emulation_config, emulation_results, experimental_results, emulator_cov_unexplained]) as pool:

         # Construct sampler (we create a dummy daughter class from emcee.EnsembleSampler, to add some logging info)
         # Note: we pass the emulators and experimental data as args to the log_posterior function
         logger.info('Initializing sampler...')
-        sampler = LoggingEnsembleSampler(config.n_walkers, ndim, _log_posterior,
-                                         args=[min, max, emulation_config, emulation_results, experimental_results, emulator_cov_unexplained],
+        sampler = LoggingEnsembleSampler(config.n_walkers, ndim, log_posterior.log_posterior,
+                                         #args=[min, max, emulation_config, emulation_results, experimental_results, emulator_cov_unexplained],
                                          pool=pool)

         # Generate random starting positions for each walker
@@ -183,112 +183,6 @@ def map_parameters(posterior, method='quantile'):

     return map_parameters

-    [106 deleted lines: the _log_posterior and _loglikelihood implementations, moved to the
-    new log_posterior module shown above, with _log_posterior's explicit arguments replaced
-    by the module-level globals set in initialize_pool_variables()]

 ####################################################################################################################
 class LoggingEnsembleSampler(emcee.EnsembleSampler):
     '''
