Source code for openadmet.models.active_learning.committee

"""Committee regressor for active learning with uncertainty estimation."""

from os import PathLike
from typing import Any, ClassVar

import joblib
import numpy as np
import pandas as pd
import uncertainty_toolbox as uct
from loguru import logger

from openadmet.models.active_learning.acquisition import _ACQUISITION_FUNCTIONS
from openadmet.models.active_learning.ensemble_base import EnsembleBase, ensemblers
from openadmet.models.architecture.model_base import ModelBase


[docs]@ensemblers.register("CommitteeRegressor") class CommitteeRegressor(EnsembleBase): """ Committee Regressor. Attributes ---------- type : ClassVar[str] The type of the ensemble model. _calibration_model : Any The calibration model used for uncertainty calibration. _calibration_methods : dict A dictionary mapping calibration method names to their corresponding functions. """ type: ClassVar[str] = "CommitteeRegressor" _calibration_model: Any = None _calibration_methods: dict = { "isotonic-regression": "_isotonic_regression_calibration", "scaling-factor": "_scaling_factor_calibration", None: "_do_nothing_calibration", } @property def calibrated(self): """ Check if the committee regressor has a calibration model. Returns ------- bool True if the committee regressor has a calibration model, False otherwise. """ return self._calibration_model is not None @classmethod def from_models(cls, models: list = []): """ Create a committee from list of models. Parameters ---------- models : list A list of committee model members. """ # Initialize class from model list instance = cls( models=models, ) return instance def _isotonic_regression_calibration(self, X, y, **kwargs): """ Configure uncertainty calibration using isotonic regression. Parameters ---------- X : array-like of shape (n_samples, n_features) The input validation set samples to calibrate. y : array-like of shape (n_samples, n_features) The target validation set values. **kwargs : dict Additional keyword arguments to be passed to the committee's predict method. """ # Reset calibration model self._calibration_model = None if isinstance(y, (pd.Series, pd.DataFrame)): y = y.to_numpy() # Predict on recalibration (validation) set y_pred_mean, y_pred_std = self._predict(X, return_std=True, **kwargs) # Fit a separate isotonic regression model for each target dimension calibration_models = [] for i in range(y.shape[-1]): # Get the predictive uncertainties in terms of expected proportions and # observed proportions on the recalibration set y_exp_props, y_obs_props = ( uct.metrics_calibration.get_proportion_lists_vectorized( y_pred_mean[:, i], y_pred_std[:, i], y[:, i] ) ) # Train a recalibration model iso_model = uct.recalibration.iso_recal(y_exp_props, y_obs_props).predict # Append to per-dimension list calibration_models.append(iso_model) # Create per-dimension calibration model self._calibration_model = {"isotonic-regression": calibration_models} def _scaling_factor_calibration(self, X, y, **kwargs): """ Configure uncertainty calibration using scaling factor. Parameters ---------- X : array-like of shape (n_samples, n_features) The input validation set samples to calibrate. y : array-like of shape (n_samples, n_features) The target validation set values. **kwargs : dict Additional keyword arguments to be passed to the committee's predict method. """ # Reset calibration model self._calibration_model = None if isinstance(y, (pd.Series, pd.DataFrame)): y = y.to_numpy() # Predict on recalibration (validation) set y_pred_mean, y_pred_std = self._predict(X, return_std=True, **kwargs) # Fit a separate scaling factor for each target dimension calibration_models = [] for i in range(y.shape[-1]): # Determine scale factor scale_factor = uct.recalibration.optimize_recalibration_ratio( y_pred_mean[:, i], y_pred_std[:, i], y[:, i], criterion="miscal" ) calibration_models.append(scale_factor) self._calibration_model = {"scaling-factor": calibration_models} def _do_nothing_calibration(self, X, y, **kwargs): """ Passthrough function for no calibration. Parameters ---------- X : array-like of shape (n_samples, n_features) The input validation set samples to calibrate. y : array-like of shape (n_samples, n_features) The target validation set values. **kwargs : dict Additional keyword arguments to be passed to the committee's predict method. """ pass
[docs] def calibrate_uncertainty(self, X, y, method="isotonic-regression", **kwargs): """ Configure uncertainty calibration using selected method. Parameters ---------- X : array-like of shape (n_samples, n_features) The input validation set samples to calibrate. y : array-like of shape (n_samples, n_features) The target validation set values. method : str The calibration method to use. Options are "isotonic-regression" or "scaling-factor". **kwargs : dict Additional keyword arguments to be passed to the committee's predict method. """ # Validate method selection if method not in self._calibration_methods: raise ValueError( f"Invalid calibration method: {method}. " f"Valid options are: {self._calibration_methods.keys()}." ) getattr(self, self._calibration_methods[method])(X, y, **kwargs)
def _get_calibration_function(self): if "scaling-factor" in self._calibration_model: # Create per-dimension calibration model return lambda x: np.stack( [ self._calibration_model["scaling-factor"][i] * (x[:, i]) for i in range(x.shape[-1]) ], axis=1, ) elif "isotonic-regression" in self._calibration_model: # Create per-dimension calibration model return lambda x: np.stack( [ self._calibration_model["isotonic-regression"][i](x[:, i]) for i in range(x.shape[-1]) ], axis=1, )
[docs] def plot_uncertainty_calibration(self, X, y, **kwargs): """ Plot uncertainty calibration for the committee model. Parameters ---------- X : array-like of shape (n_samples, n_features) The input test set samples to calibrate. y : array-like of shape (n_samples, n_features) The target test set values. **kwargs : dict Additional keyword arguments to be passed to the committee's predict method. Returns ------- list A list of plots for each target dimension. """ if isinstance(y, (pd.Series, pd.DataFrame)): y = y.to_numpy() # Predict on recalibration (validation) set y_pred_mean, y_pred_std = self.predict(X, return_std=True, **kwargs) # Plot calibration plots = [] for i in range(y.shape[-1]): plots.append( uct.viz.plot_calibration( y_pred_mean[:, i].flatten(), y_pred_std[:, i].flatten(), y[:, i].flatten(), ) ) # If only one plot is generated, return it directly if len(plots) == 1: return plots[0] return plots
@classmethod def train( cls, X, y, mod_class: ModelBase = None, mod_params: dict = {}, n_models: int = 1, use_bagging: bool = True, ): """ Train committee regressor members on bootstrapped data subsets. Parameters ---------- X : array-like of shape (n_samples, n_features) The input samples to train on. y : array-like of shape (n_samples,) The target values. mod_class : ModelBase The type of model to use for training. mod_params : dict The parameters to pass to the model. n_models : int The number of models in the committee, by default 1. use_bagging : bool Whether to use bagging (bootstrap aggregation) for training models. If False, models are trained on the full dataset. Returns ------- CommitteeRegressor An instance of the CommitteeRegressor class. """ # Verify estimator input if mod_class is None: raise ValueError("Model type must be provided.") # Initialize set of models models = [] for i in range(n_models): # Update random state if present current_mod_params = mod_params.copy() if ( "random_state" in current_mod_params and current_mod_params["random_state"] is not None ): current_mod_params["random_state"] += i # Initialize model model = mod_class(**current_mod_params) model.build() if use_bagging: # Bootstrap the data bootstrap_idx = np.random.choice( X.shape[0], size=X.shape[0], replace=True ) # Train the model on the bootstrapped data model.train(X[bootstrap_idx, :], y[bootstrap_idx, :]) else: # Train the model on the full data model.train(X, y) # Add to list models.append(model) # Instantiate the committee regressor return cls.from_models(models)
[docs] def query(self, X, query_strategy: str = None, **kwargs): """ Query the committee to select instances for labeling. Parameters ---------- X : array-like The input data from which instances are to be queried. query_strategy : str, optional The query strategy to use for selecting instances. **kwargs : dict Additional keyword arguments to be passed to the committee's query method. Returns ------- np.array Values of the query strategy applied to the input data `X`. """ if query_strategy.lower() not in _ACQUISITION_FUNCTIONS: raise ValueError( f"Invalid query strategy: {query_strategy}. " f"Valid options are: {list(_ACQUISITION_FUNCTIONS.keys())}" ) mean, std = self.predict(X, return_std=True, **kwargs) return _ACQUISITION_FUNCTIONS[query_strategy](mean, std, **kwargs)
def _predict(self, X, return_std=False, return_all=False, **kwargs): """ Make predictions using the committee model. Parameters ---------- X : array-like of shape (n_samples, n_features) The input samples to predict. return_std : bool, optional Whether to return the standard deviation of the predictions. Mutually exclusive with ``return_all``. return_all : bool, optional Whether to return the raw per-member predictions of shape (n_samples, n_tasks, n_members) instead of the mean (and std). Mutually exclusive with ``return_std``. **kwargs : dict Additional keyword arguments to pass to the committee's predict method. Returns ------- array-like or tuple mean, or (mean, std), or ndarray of shape (n_samples, n_tasks, n_members) depending on the values of return_std and return_all. """ if return_std and return_all: raise ValueError( "return_std and return_all are mutually exclusive. " "When return_all=True, compute mean and std from the returned array as needed." ) # Make predictions: (n_samples, n_tasks, n_members) preds = np.stack([model.predict(X, **kwargs) for model in self.models], axis=-1) if return_all: return preds # Compute mean mean = np.mean(preds, axis=-1) if not return_std: return mean # Compute standard deviation, guard against zero std std = np.maximum(np.std(preds, axis=-1), 1e-8) if self.calibrated: std = self._get_calibration_function()(std) return mean, std
[docs] def predict(self, X, return_std=False, return_all=False, **kwargs): """ Make predictions using the committee model. Parameters ---------- X : array-like of shape (n_samples, n_features) The input samples to predict. return_std : bool, optional Whether to return the standard deviation of the predictions. Mutually exclusive with ``return_all``. return_all : bool, optional Whether to return the raw per-member predictions of shape (n_samples, n_tasks, n_members) instead of the mean (and std). Mutually exclusive with ``return_std``. **kwargs : dict Additional keyword arguments to pass to the committee's predict method. Returns ------- array-like or tuple mean, or (mean, std), or ndarray of shape (n_samples, n_tasks, n_members) depending on the values of return_std and return_all. """ if return_std is True and not self.calibrated: logger.warning( "Standard deviation not calibrated: consider calling `calibrate_uncertainty`." ) return self._predict(X, return_std=return_std, return_all=return_all, **kwargs)
def _save_calibration_model(self, path: PathLike = "calibration_model.pkl"): # Save calibration model if self.calibrated: with open(path, "wb") as f: joblib.dump(self._calibration_model, f) else: logger.warning( "Standard deviation not calibrated: consider calling `calibrate_uncertainty` before saving." ) def _load_calibration_model(self, path: PathLike = "calibration_model.pkl"): # Load calibration model with open(path, "rb") as f: self._calibration_model = joblib.load(f) logger.info(f"Successfully loaded calibration from {path}")
[docs] def save( self, paths: list[PathLike], calibration_path: PathLike = "calibration_model.pkl", ): """ Save the committee model to the provided paths. Parameters ---------- paths : list of PathLike The file paths to save the model weights. calibration_path: PathLike Path to save calibration model. """ # Check number of paths match if self.n_models != len(paths): raise ValueError( f"Number of models ({self.n_models}) in the committee does not match the number of paths ({len(paths)})." ) # Save each model to the provided paths for model, path in zip(self.models, paths): model.save(path) # Save calibration model self._save_calibration_model(calibration_path)
@classmethod def load( cls, paths: list[PathLike], models: list[ModelBase] = None, calibration_path: PathLike = None, ): """ Load a committee model from the provided paths. Parameters ---------- paths : list of PathLike The file paths to the model weights. models : list of ModelBase Model instances associated with path to weights. calibration_path : PathLike The file path to the calibration model. Returns ------- CommitteeRegressor A committee model created from the loaded models. """ # Check model type if models is None: raise ValueError("Must provide a list of model instances to load.") # Check lengths match if len(paths) != len(models): raise ValueError("Number of paths and models do not match.") # Load each model from the provided paths [model.load(path) for model, path in zip(models, paths)] # Create a CommitteeRegressor instance from the loaded models instance = cls.from_models(models) # Load calibration model if calibration_path is not None: try: instance._load_calibration_model(calibration_path) except Exception as e: logger.warning( f"Failed to load calibration model from {calibration_path}: {e}, if a a calibration model is not expected, this warning can be ignored." ) return instance
[docs] def serialize( self, param_paths: list[PathLike], serial_paths: list[PathLike], calibration_path: PathLike = "calibration_model.pkl", ): """ Save the model to json files and pickled files. Parameters ---------- param_paths : list of PathLike The file paths to save the model weights. serial_paths : list of PathLike The file paths to save the model architecture. calibration_path : PathLike The file path to save the calibration model. """ # Check number of paths match if len(param_paths) != len(serial_paths): raise ValueError( f"Number of parameter files ({len(param_paths)}) and serial files ({len(serial_paths)}) do not match." ) # Check number of models match if self.n_models != len(param_paths): raise ValueError( f"Number of models ({self.n_models}) in the committee does not match the number of parameter files ({len(param_paths)})." ) # Serialize each model for model, param_path, serial_path in zip( self.models, param_paths, serial_paths ): model.serialize(param_path, serial_path) # Save calibration model self._save_calibration_model(calibration_path)
@classmethod def deserialize( cls, param_paths: list[PathLike], serial_paths: list[PathLike], mod_class: ModelBase = None, calibration_path: PathLike = None, ): """ Create a model from parameters and a pickled model. Parameters ---------- param_paths : list of PathLike The file paths to the model parameters. serial_paths : list of PathLike The file paths to the model serializations. mod_class : ModelBase Model class to update with the deserialized parameters. calibration_path : PathLike The file path to the calibration model. Returns ------- Committee A committee model created from the deserialized parameters. """ # Check model type if mod_class is None: raise ValueError("Must provide a model type to load.") # Check lengths match if len(param_paths) != len(serial_paths): raise ValueError( f"Number of parameter files {len(param_paths)} and serial files {len(serial_paths)} do not match." ) # Deserialize each model models = [] for param_path, serial_path in zip(param_paths, serial_paths): models.append(mod_class.deserialize(param_path, serial_path)) # Create a CommitteeRegressor instance from the deserialized models instance = cls.from_models(models) # Load calibration model if calibration_path is not None: try: instance._load_calibration_model(calibration_path) except Exception as e: logger.warning( f"Failed to load calibration model from {calibration_path}: {e}, if a a calibration model is not expected, this warning can be ignored." ) return instance