Improvements and refactoring for preprocessing and MCMC #1

Open · wants to merge 17 commits into base: main
3,273 changes: 1,726 additions & 1,547 deletions pdm.lock

Large diffs are not rendered by default.

32 changes: 27 additions & 5 deletions pyproject.toml
@@ -11,7 +11,7 @@ name = "bayesian_inference"
description = "Analysis pipeline to implement Bayesian inference in high-energy physics "
license = {text = "BSD-3-Clause"}
# NOTE: <3.12 cap needed for tensorflow-io-gcs-filesystem
requires-python = ">=3.8,<3.12"
requires-python = ">=3.10,<3.13"
authors = [
{ name = "James Mulligan", email = "[email protected]" },
{ name = "Raymond Ehlers", email = "[email protected]" },
@@ -36,15 +36,15 @@ dependencies = [
"statsmodels>=0.14.0",
]

# NOTE: If moving to hatch, move this to "[project.optional-dependencies]"
[tool.pdm.dev-dependencies]
[project.optional-dependencies]
dev = [
"ruff>=0.0.209",
"ruff >=0.0.209",
"black >=22.1.0",
"mypy >=0.931",
"ipython >=8.0",
"ipykernel >=6.15.1",
"pytest>=7.4.0",
"pytest >=7.4.0",
"pocoMC >=1.2.2",
]
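With the dev group now under `[project.optional-dependencies]`, it installs as a standard extra, e.g. `pip install -e ".[dev]"`; PDM resolves it the same way via `pdm install -G dev`.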

[tool.hatch]
@@ -56,6 +56,28 @@ version = { source = "file", path = "src/bayesian_inference/__init__.py" }
[tool.black]
line-length = 120

[tool.mypy]
files = ["src", "tests"]
python_version = "3.10"
warn_unused_configs = true
strict = true
show_error_codes = true
enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]
warn_unreachable = true
no_implicit_reexport = false
disallow_untyped_defs = false
disallow_incomplete_defs = false
exclude = [".venv*"]

[[tool.mypy.overrides]]
module = [
"sklearn",
"sklearn.decompoistion",
"sklearn.gaussian_process",
"sklearn.preprocessing",
]
ignore_missing_imports = true
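These `[[tool.mypy.overrides]]` module names are matched literally against import paths, so they must mirror the actual `sklearn` submodules being imported; with `warn_unused_configs = true` set above, an entry that matches nothing is reported as unused.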

[tool.ruff]
exclude = [
'.git',
63 changes: 37 additions & 26 deletions src/bayesian_inference/emulation.py
@@ -1,4 +1,3 @@
#! /usr/bin/env python
'''
Module related to emulators, with functionality to train and call emulators for a given analysis run

@@ -16,20 +15,19 @@

import logging
import os
import yaml
import pickle
from pathlib import Path
from typing import Any

import attrs
import numpy as np
import numpy.typing as npt
import pickle
import sklearn.preprocessing as sklearn_preprocessing
import sklearn.decomposition as sklearn_decomposition
import sklearn.gaussian_process as sklearn_gaussian_process
import sklearn.preprocessing as sklearn_preprocessing
import yaml

from bayesian_inference import data_IO
from bayesian_inference import common_base
from bayesian_inference import common_base, data_IO

logger = logging.getLogger(__name__)

@@ -61,9 +59,9 @@ def fit_emulator_group(config: EmulationGroupConfig) -> dict[str, Any]:
'''

# Check if emulator already exists
if os.path.exists(config.emulation_outputfile):
if config.emulation_outputfile.exists():
if config.force_retrain:
os.remove(config.emulation_outputfile)
config.emulation_outputfile.unlink()
logger.info(f'Removed {config.emulation_outputfile}')
else:
logger.info(f'Emulators already exist: {config.emulation_outputfile} (to force retrain, set force_retrain: True)')
@@ -72,7 +70,7 @@ def fit_emulator_group(config: EmulationGroupConfig) -> dict[str, Any]:
# Initialize predictions into a single 2D array: (design_point_index, observable_bins) i.e. (n_samples, n_features)
# A consistent order of observables is enforced internally in data_IO
# NOTE: One sample corresponds to one design point, while one feature is one bin of one observable
logger.info(f'Doing PCA...')
logger.info('Doing PCA...')
Y = data_IO.predictions_matrix_from_h5(config.output_dir, filename=config.observables_filename, observable_filter=config.observable_filter)

# Use sklearn to:
@@ -112,6 +110,8 @@ def fit_emulator_group(config: EmulationGroupConfig) -> dict[str, Any]:
max_n_components = config.max_n_components_to_calculate
if max_n_components is not None:
logger.info(f"Running with max n_pc={max_n_components}")
# NOTE-STAT: STAT uses whiten=True, but here we use whiten=False.
# NOTE-STAT: RJE thinks this doesn't matter, based on the comments above.
pca = sklearn_decomposition.PCA(n_components=max_n_components, svd_solver='full', whiten=False) # Include all PCs here, so we can access them later
# Scale data and perform PCA
Y_pca = pca.fit_transform(scaler.fit_transform(Y))
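For orientation, the scale-then-project step above reduces to a small self-contained sketch (toy shapes; `scaler` is assumed to be an sklearn `StandardScaler`, whose construction falls outside the visible hunk):

```python
# Minimal sketch of the scale-then-PCA step (illustrative sizes).
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(42)
Y = rng.normal(size=(50, 200))  # (n_design_points, n_observable_bins)

scaler = StandardScaler()
pca = PCA(n_components=10, svd_solver='full', whiten=False)
Y_pca = pca.fit_transform(scaler.fit_transform(Y))  # (n_samples, n_components)

# Fraction of the total variance captured by the retained PCs
print(Y_pca.shape, pca.explained_variance_ratio_.sum())
```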
@@ -164,7 +164,7 @@ def fit_emulator_group(config: EmulationGroupConfig) -> dict[str, Any]:
# Fit a GP (optimize the kernel hyperparameters) to map each design point to each of its PCs
# Note that Y_PCA=(n_samples, n_components), so each PC corresponds to a row (i.e. a column of Y_PCA.T)
logger.info("")
logger.info(f'Fitting GPs...')
logger.info('Fitting GPs...')
logger.info(f' The design has {design.shape[1]} parameters')
emulators = [sklearn_gaussian_process.GaussianProcessRegressor(kernel=kernel,
alpha=config.alpha,
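The pattern described above (one GP per PC, i.e. per column of `Y_PCA.T`) looks like the following minimal sketch; the kernel, shapes, and settings are illustrative stand-ins for the config-driven values in the truncated call:

```python
# One independent GP per retained principal component: each GP maps the
# design parameters to one column of Y_pca (illustrative kernel and sizes).
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

rng = np.random.default_rng(2)
design = rng.uniform(size=(50, 4))  # (n_design_points, n_parameters)
Y_pca = rng.normal(size=(50, 5))    # (n_samples, n_components)

emulators = [
    GaussianProcessRegressor(
        kernel=RBF(length_scale=np.ones(design.shape[1])),  # anisotropic: one length scale per parameter
        alpha=1e-10,
        n_restarts_optimizer=2,
    ).fit(design, y)
    for y in Y_pca.T  # iterate over PCs, i.e. columns of Y_pca
]

# Each emulator predicts the mean and uncertainty of its PC at new design points
mean, std = emulators[0].predict(design[:3], return_std=True)
```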
@@ -199,7 +199,8 @@ def read_emulators(config: EmulationGroupConfig) -> dict[str, Any]:

with filename.open("rb") as f:
results = pickle.load(f)
return results
return results # noqa: RET504


####################################################################################################################
def write_emulators(config: EmulationGroupConfig, output_dict: dict[str, Any]) -> None:
@@ -208,10 +209,11 @@ def write_emulators(config: EmulationGroupConfig, output_dict: dict[str, Any]) -
filename = Path(config.emulation_outputfile)

with filename.open('wb') as f:
pickle.dump(output_dict, f)
pickle.dump(output_dict, f)


####################################################################################################################
def compute_emulator_cov_unexplained(emulation_config, emulation_results):
def compute_emulator_cov_unexplained(emulation_config, emulation_results) -> dict:
'''
Compute the predictive variance due to PC truncation, for all emulator groups.
See further details in compute_emulator_group_cov_unexplained().
@@ -222,6 +224,8 @@
for emulation_group_name, emulation_group_config in emulation_config.emulation_groups_config.items():
emulation_group_result = emulation_results.get(emulation_group_name)
emulator_cov_unexplained[emulation_group_name] = compute_emulator_group_cov_unexplained(emulation_group_config, emulation_group_result)
return emulator_cov_unexplained


####################################################################################################################
def compute_emulator_group_cov_unexplained(emulation_group_config, emulation_group_result):
@@ -243,12 +247,15 @@
We will generally pre-compute this once in mcmc.py to save time, although we define this function
here to allow us to re-compute it as needed if it is not pre-computed (e.g. when plotting).
'''
# TODO: NOTE-STAT: Compare this more carefully with STAT L145 and on.
pca = emulation_group_result['PCA']['pca']
S_unexplained = pca.components_.T[:,emulation_group_config.n_pc:]
D_unexplained = np.diag(pca.explained_variance_[emulation_group_config.n_pc:])
emulator_cov_unexplained = S_unexplained.dot(D_unexplained.dot(S_unexplained.T))

return emulator_cov_unexplained
# NOTE-STAT: bayesian-inference does not include a small term for numerical stability
return emulator_cov_unexplained # noqa: RET504
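As a toy check of this formula: under a full PCA, the retained and discarded pieces together reproduce the sample covariance, so the unexplained term is exactly the remainder. A minimal sketch (sizes and `n_pc` are illustrative):

```python
# Toy check of the PC-truncation covariance S_unexplained D S_unexplained^T.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
Y = rng.normal(size=(50, 30))  # (n_samples, n_features)
n_pc = 5                       # number of retained PCs (illustrative)

pca = PCA(svd_solver='full', whiten=False).fit(Y)
S_unexplained = pca.components_.T[:, n_pc:]              # (n_features, n_discarded)
D_unexplained = np.diag(pca.explained_variance_[n_pc:])  # (n_discarded, n_discarded)
cov_unexplained = S_unexplained @ D_unexplained @ S_unexplained.T

# Retained + discarded pieces reproduce the full sample covariance
S_retained = pca.components_.T[:, :n_pc]
cov_retained = S_retained @ np.diag(pca.explained_variance_[:n_pc]) @ S_retained.T
assert np.allclose(cov_retained + cov_unexplained, np.cov(Y, rowvar=False))
```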


####################################################################################################################
def nd_block_diag(arrays):
@@ -351,7 +358,7 @@ def convert(self, group_matrices: dict[str, dict[str, npt.NDArray[np.float64]]])
:return: Converted matrix for each available value type.
"""
if self._available_value_types is None:
self._available_value_types = set([
self._available_value_types = set([ # noqa: C403
value_type
for group in group_matrices.values()
for value_type in group
@@ -443,9 +450,9 @@ def predict(parameters: npt.NDArray[np.float64],

# Compute unexplained variance due to PC truncation for this emulator group, if not already precomputed
if emulator_cov_unexplained:
emulator_group_cov_unexplained=emulator_cov_unexplained[emulation_group_name]
emulator_group_cov_unexplained = emulator_cov_unexplained[emulation_group_name]
else:
emulator_group_cov_unexplained=compute_emulator_group_cov_unexplained(emulation_group_config, emulation_group_result)
emulator_group_cov_unexplained = compute_emulator_group_cov_unexplained(emulation_group_config, emulation_group_result)

predict_output[emulation_group_name] = predict_emulation_group(
parameters,
@@ -519,6 +526,7 @@ def predict_emulation_group(parameters, results, emulation_group_config, emulato
# So C_Y[i] = S * C_Y_PCA[i] * S^T.
# Note: should be equivalent to: https://github.com/jdmulligan/STAT/blob/master/src/emulator.py#L145
# TODO: one can make this faster with broadcasting/einsum
# TODO: NOTE-STAT: Compare this more carefully with STAT L286 and on.
n_features = pca.components_.shape[1]
S = pca.components_.T[:,:emulation_group_config.n_pc]
emulator_cov_reconstructed_scaled = np.zeros((n_samples, n_features, n_features))
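On the TODO above: the per-sample reconstruction C_Y[i] = S @ C_Y_PCA[i] @ S.T collapses into a single `einsum`; a sketch under assumed toy shapes:

```python
# One-shot equivalent of looping C_Y[i] = S @ C_Y_PCA[i] @ S.T over samples
# (illustrative shapes; S plays the role of pca.components_.T[:, :n_pc]).
import numpy as np

n_samples, n_features, n_pc = 4, 30, 5
rng = np.random.default_rng(1)
S = rng.normal(size=(n_features, n_pc))
cov_pca = rng.normal(size=(n_samples, n_pc, n_pc))  # per-sample PC-space covariance

cov_loop = np.zeros((n_samples, n_features, n_features))
for i in range(n_samples):
    cov_loop[i] = S @ cov_pca[i] @ S.T

# Single contraction over all samples at once
cov_einsum = np.einsum('fk,skl,gl->sfg', S, cov_pca, S)
assert np.allclose(cov_loop, cov_einsum)
```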
@@ -560,7 +568,7 @@ def __init__(self, analysis_name='', parameterization='', analysis_config='', co
self.analysis_config = analysis_config
self.config_file = config_file

with open(self.config_file, 'r') as stream:
with open(self.config_file) as stream:
config = yaml.safe_load(stream)

# Observable inputs
@@ -591,13 +599,14 @@ def __init__(self, analysis_name='', parameterization='', analysis_config='', co
# Validation for noise configuration
if 'noise' in self.active_kernels:
# Check we have the appropriate keys
assert all(k in self.active_kernels['noise'].keys() for k in ["type", "args"]), "Noise configuration must have keys 'type' and 'args'"
assert all(k in self.active_kernels['noise'] for k in ["type", "args"]), "Noise configuration must have keys 'type' and 'args'"
if self.active_kernels['noise']["type"] == "white":
# Validate arguments
# We don't want to do too much since we'll just be reinventing the wheel, but a bit can be helpful.
assert set(self.active_kernels['noise']["args"]) == set(["noise_level", "noise_level_bounds"]), "Must provide arguments 'noise_level' and 'noise_level_bounds' for white noise kernel"
assert set(self.active_kernels['noise']["args"]) == set(["noise_level", "noise_level_bounds"]), "Must provide arguments 'noise_level' and 'noise_level_bounds' for white noise kernel" # noqa: C405
else:
raise ValueError("Unsupported noise kernel")
msg = "Unsupported noise kernel"
raise ValueError(msg)
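For reference, a hypothetical kernel configuration that passes these checks, rendered as a Python dict for a self-contained test (key names come straight from the asserts; the values are illustrative):

```python
# Hypothetical noise-kernel configuration satisfying the validation above.
active_kernels = {
    "noise": {
        "type": "white",
        "args": {
            "noise_level": 0.5,                 # illustrative value
            "noise_level_bounds": [1e-5, 1e5],  # illustrative bounds
        },
    },
}

assert all(k in active_kernels["noise"] for k in ["type", "args"])
assert set(active_kernels["noise"]["args"]) == {"noise_level", "noise_level_bounds"}
```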

# GPR
self.n_restarts = emulator_configuration["GPR"]['n_restarts']
@@ -615,11 +624,11 @@ def __init__(self, analysis_name='', parameterization='', analysis_config='', co
)

# Output options
self.output_dir = os.path.join(config['output_dir'], f'{analysis_name}_{parameterization}')
self.output_dir = Path(config['output_dir']) / f'{analysis_name}_{parameterization}'
emulation_outputfile_name = 'emulation.pkl'
if emulation_group_name is not None:
emulation_outputfile_name = f'emulation_group_{emulation_group_name}.pkl'
self.emulation_outputfile = os.path.join(self.output_dir, emulation_outputfile_name)
self.emulation_outputfile = Path(self.output_dir) / emulation_outputfile_name

@attrs.define
class EmulationConfig(common_base.CommonBase):
@@ -685,7 +694,8 @@ def read_all_emulator_groups(self) -> dict[str, dict[str, npt.NDArray[np.float64
def observable_filter(self) -> data_IO.ObservableFilter:
if self._observable_filter is None:
if not self.emulation_groups_config:
raise ValueError("Need to specify emulation groups to provide an observable filter")
msg = "Need to specify emulation groups to provide an observable filter"
raise ValueError(msg)
# Accumulate the include and exclude lists from all emulation groups
include_list: list[str] = []
exclude_list: list[str] = self.config.get("global_observable_exclude_list", [])
@@ -703,7 +713,8 @@ def observable_filter(self) -> data_IO.ObservableFilter:
def sort_observables_in_matrix(self) -> SortEmulationGroupObservables:
if self._sort_observables_in_matrix is None:
if not self.emulation_groups_config:
raise ValueError("Need to specify emulation groups to provide an sorting for observable group observables")
msg = "Need to specify emulation groups to provide an sorting for observable group observables"
raise ValueError(msg)
# Accumulate the include and exclude lists from all emulation groups
self._sort_observables_in_matrix = SortEmulationGroupObservables.learn_mapping(self)
return self._sort_observables_in_matrix