Commit f7a55ed

Speed up MCMC by passing data less often
The concept is presented here: https://emcee.readthedocs.io/en/stable/tutorials/parallel/#parallel . We're frequently passing the same data to the log posterior (every time the model is called, per emcee), which is quite slow. The workaround is to create a separate log_posterior module and then initialize each process with global variables containing the args for calculating the log posterior. It's not great coding convention, but it appears to be a ~3x speed-up, so it's a good trade-off.
1 parent d5a4c73 commit f7a55ed
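For reference, a minimal, self-contained sketch of the pattern from the emcee tutorial (the names _init_worker, _data, and log_prob here are illustrative, not from this repository): the Pool initializer pickles the data once per worker process at startup, whereas passing it through the sampler's args re-pickles it on every log-probability call.

import multiprocessing

import numpy as np

_data = None  # set once per worker process by the Pool initializer

def _init_worker(data):
    # Stash the (potentially large) data in a module-level global,
    # so it is sent to each worker once rather than on every call
    global _data
    _data = data

def log_prob(theta):
    # Read _data from the worker's globals; nothing is re-passed per call
    return -0.5 * np.sum((theta - _data) ** 2)

if __name__ == '__main__':
    data = np.random.default_rng(42).normal(size=5000)
    ctx = multiprocessing.get_context('spawn')
    with ctx.Pool(initializer=_init_worker, initargs=(data,)) as pool:
        print(pool.map(log_prob, [np.zeros(5000) for _ in range(4)]))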

File tree

2 files changed: +151 -110 lines changed
src/bayesian_inference/log_posterior.py (new file)

Lines changed: 147 additions & 0 deletions

@@ -0,0 +1,147 @@
"""Define the likelihood separately for performance reasons.

In doing so, we can use global variables. This isn't a nice thing to do, but it may improve MCMC performance
during multiprocessing.
"""

import logging

import numpy as np
from scipy.linalg import lapack

from bayesian_inference import emulation

logger = logging.getLogger(__name__)


min = None
max = None
emulation_config = None
emulation_results = None
experimental_results = None
emulator_cov_unexplained = None


def initialize_pool_variables(local_min, local_max, local_emulation_config, local_emulation_results, local_experimental_results, local_emulator_cov_unexplained) -> None:
    """Store the args needed by log_posterior() as module-level globals in each worker process."""
    global min
    global max
    global emulation_config
    global emulation_results
    global experimental_results
    global emulator_cov_unexplained
    min = local_min
    max = local_max
    emulation_config = local_emulation_config
    emulation_results = local_emulation_results
    experimental_results = local_experimental_results
    emulator_cov_unexplained = local_emulator_cov_unexplained


#---------------------------------------------------------------
def log_posterior(X):
    """
    Evaluate the log-posterior for a given set of input parameters.

    This function is called by https://emcee.readthedocs.io/en/stable/user/sampler/

    The remaining inputs -- the min/max boundaries for each emulator parameter, the
    emulation configuration object, the dict of emulation groups, the arrays of
    experimental results, and the unexplained emulator covariance -- are read from
    the module-level globals set by initialize_pool_variables().

    :param X: input ndarray of parameter space values
    """
    # Convert to 2darray of shape (n_samples, n_parameters)
    X = np.array(X, copy=False, ndmin=2)

    # Initialize log-posterior array, which we will populate and return
    log_posterior = np.zeros(X.shape[0])

    # Check if any samples are outside the parameter bounds, and set log-posterior to -inf for those
    inside = np.all((X > min) & (X < max), axis=1)
    log_posterior[~inside] = -np.inf

    # Evaluate log-posterior for samples inside parameter bounds
    n_samples = np.count_nonzero(inside)
    n_features = experimental_results['y'].shape[0]

    if n_samples > 0:

        # Get experimental data
        data_y = experimental_results['y']
        data_y_err = experimental_results['y_err']

        # Compute emulator prediction
        # Returns dict of matrices of emulator predictions:
        #   emulator_predictions['central_value'] -- (n_samples, n_features)
        #   emulator_predictions['cov'] -- (n_samples, n_features, n_features)
        emulator_predictions = emulation.predict(X[inside], emulation_config,
                                                 emulation_group_results=emulation_results,
                                                 emulator_cov_unexplained=emulator_cov_unexplained)

        # Construct array to store the difference between emulator prediction and experimental data
        # (using broadcasting to subtract each data point from each emulator prediction)
        assert data_y.shape[0] == emulator_predictions['central_value'].shape[1]
        dY = emulator_predictions['central_value'] - data_y

        # Construct the covariance matrix
        # TODO: include full experimental data covariance matrix -- currently we only include uncorrelated data uncertainty
        #-------------------------
        covariance_matrix = np.zeros((n_samples, n_features, n_features))
        covariance_matrix += emulator_predictions['cov']
        covariance_matrix += np.diag(data_y_err**2)

        # Compute log likelihood at each point in the sample
        # We take constant priors, so the log-likelihood is just the log-posterior
        # (since above we set the log-posterior to -inf for samples outside the parameter bounds)
        log_posterior[inside] += list(map(_loglikelihood, dY, covariance_matrix))

    return log_posterior

#---------------------------------------------------------------
def _loglikelihood(y, cov):
    """
    Evaluate the multivariate-normal log-likelihood for difference vector `y`
    and covariance matrix `cov`:

        log_p = -1/2*[(y^T).(C^-1).y + log(det(C))] + const.

    The likelihood is NOT NORMALIZED, since this does not affect MCMC.
    The normalization const = -n/2*log(2*pi), where n is the dimensionality.

    Arguments `y` and `cov` MUST be np.arrays with dtype == float64 and shapes
    (n) and (n, n), respectively. These requirements are NOT CHECKED.

    The calculation follows algorithm 2.1 in Rasmussen and Williams (Gaussian
    Processes for Machine Learning).
    """
    # Compute the Cholesky decomposition of the covariance.
    # Use bare LAPACK function to avoid scipy.linalg wrapper overhead.
    L, info = lapack.dpotrf(cov, clean=False)

    if info < 0:
        raise ValueError(
            'lapack dpotrf error: '
            'the {}-th argument had an illegal value'.format(-info)
        )
    elif info > 0:
        raise np.linalg.LinAlgError(
            'lapack dpotrf error: '
            'the leading minor of order {} is not positive definite'
            .format(info)
        )

    # Solve for alpha = cov^-1.y using the Cholesky decomp.
    alpha, info = lapack.dpotrs(L, y)

    if info != 0:
        raise ValueError(
            'lapack dpotrs error: '
            'the {}-th argument had an illegal value'.format(-info)
        )

    return -.5*np.dot(y, alpha) - np.log(L.diagonal()).sum()
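As a quick sanity check (not part of this commit, and with made-up test values), the unnormalized _loglikelihood should agree with scipy.stats.multivariate_normal.logpdf up to the dropped constant n/2*log(2*pi):

import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.default_rng(0)
n = 4
A = rng.normal(size=(n, n))
cov = A @ A.T + n * np.eye(n)   # symmetric positive-definite covariance
y = rng.normal(size=n)

# _loglikelihood omits only the -n/2*log(2*pi) normalization term
ref = multivariate_normal(mean=np.zeros(n), cov=cov).logpdf(y)
assert np.isclose(_loglikelihood(y, cov), ref + 0.5 * n * np.log(2 * np.pi))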

src/bayesian_inference/mcmc.py

Lines changed: 4 additions & 110 deletions
@@ -21,11 +21,11 @@
 import emcee
 import multiprocessing
 import numpy as np
-from scipy.linalg import lapack

 from bayesian_inference import common_base
 from bayesian_inference import data_IO
 from bayesian_inference import emulation
+from bayesian_inference import log_posterior

 logger = logging.getLogger(__name__)

@@ -75,13 +75,13 @@ def run_mcmc(config, closure_index=-1):
     # NOTE: We use `get_context` here to avoid having to globally specify the context. Plus, it then should be fine
     # to repeatedly call this function. (`set_context` can only be called once - otherwise, it's a runtime error.)
     ctx = multiprocessing.get_context('spawn')
-    with ctx.Pool() as pool:
+    with ctx.Pool(initializer=log_posterior.initialize_pool_variables, initargs=[min, max, emulation_config, emulation_results, experimental_results, emulator_cov_unexplained]) as pool:

         # Construct sampler (we create a dummy daughter class from emcee.EnsembleSampler, to add some logging info)
         # Note: we pass the emulators and experimental data as args to the log_posterior function
         logger.info('Initializing sampler...')
-        sampler = LoggingEnsembleSampler(config.n_walkers, ndim, _log_posterior,
-                                         args=[min, max, emulation_config, emulation_results, experimental_results, emulator_cov_unexplained],
+        sampler = LoggingEnsembleSampler(config.n_walkers, ndim, log_posterior.log_posterior,
+                                         #args=[min, max, emulation_config, emulation_results, experimental_results, emulator_cov_unexplained],
                                          pool=pool)

         # Generate random starting positions for each walker
@@ -183,112 +183,6 @@ def map_parameters(posterior, method='quantile'):

     return map_parameters

-    [106 deleted lines: the _log_posterior and _loglikelihood implementations, moved to the
-    new log_posterior module shown above, with _log_posterior's explicit arguments replaced
-    by the module-level globals set in initialize_pool_variables()]

 ####################################################################################################################
 class LoggingEnsembleSampler(emcee.EnsembleSampler):
     '''
