# Source code for openadmet.models.active_learning.committee

"""Committee regressor for active learning with uncertainty estimation."""

from os import PathLike
from typing import Any, ClassVar

import joblib
import numpy as np
import pandas as pd
import uncertainty_toolbox as uct
from loguru import logger

from openadmet.models.active_learning.acquisition import _ACQUISITION_FUNCTIONS
from openadmet.models.active_learning.ensemble_base import EnsembleBase, ensemblers
from openadmet.models.architecture.model_base import ModelBase


@ensemblers.register("CommitteeRegressor")
class CommitteeRegressor(EnsembleBase):
    """
    Committee Regressor.

    Attributes
    ----------
    type : ClassVar[str]
        The type of the ensemble model.
    _calibration_model : Any
        The calibration model used for uncertainty calibration.
    _calibration_methods : dict
        A dictionary mapping calibration method names to their corresponding functions.

    """

    type: ClassVar[str] = "CommitteeRegressor"
    _calibration_model: Any = None
    _calibration_methods: dict = {
        "isotonic-regression": "_isotonic_regression_calibration",
        "scaling-factor": "_scaling_factor_calibration",
        None: "_do_nothing_calibration",
    }

    @property
    def calibrated(self):
        """
        Report whether an uncertainty calibration model is attached.

        Returns
        -------
        bool
            False while ``_calibration_model`` is None, True otherwise.

        """
        calibration = self._calibration_model
        return calibration is not None

    @classmethod
    def from_models(cls, models: list = None):
        """
        Create a committee from a list of models.

        Parameters
        ----------
        models : list, optional
            A list of committee model members. When omitted, a new empty list is
            created for this call, so committees built without arguments never
            share one default list object.

        Returns
        -------
        CommitteeRegressor
            An instance of the class constructed with ``models=models``.

        """
        # A literal ``[]`` default would be a single list shared by every call
        if models is None:
            models = []

        return cls(models=models)

    def _isotonic_regression_calibration(self, X, y, **kwargs):
        """
        Configure uncertainty calibration using isotonic regression.

        One recalibration model is fitted per target column and the list of their
        ``predict`` callables is stored under the ``"isotonic-regression"`` key of
        ``_calibration_model``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input validation set samples to calibrate.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            The target validation set values. 1-D targets (for example a
            ``pd.Series``) are treated as a single target column.
        **kwargs : dict
            Additional keyword arguments to be passed to the committee's predict method.

        """
        # Reset calibration model so the prediction below yields uncalibrated std
        self._calibration_model = None

        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.to_numpy()
        elif not hasattr(y, "ndim"):
            # Plain sequences (lists, tuples) have no shape to index against
            y = np.asarray(y)

        # A 1-D target vector is a single column; without this, ``y.shape[-1]``
        # would be the sample count and ``y[:, i]`` would raise
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        # Predict on recalibration (validation) set
        # NOTE(review): the column indexing below assumes 2-D predictions of
        # shape (n_samples, n_targets) -- confirm against the member models
        y_pred_mean, y_pred_std = self._predict(X, return_std=True, **kwargs)

        # Fit a separate isotonic regression model for each target dimension
        calibration_models = []
        for i in range(y.shape[-1]):
            # Get the predictive uncertainties in terms of expected proportions and
            # observed proportions on the recalibration set
            y_exp_props, y_obs_props = (
                uct.metrics_calibration.get_proportion_lists_vectorized(
                    y_pred_mean[:, i], y_pred_std[:, i], y[:, i]
                )
            )

            # Train a recalibration model and keep only its predict callable
            iso_model = uct.recalibration.iso_recal(y_exp_props, y_obs_props).predict
            calibration_models.append(iso_model)

        # Create per-dimension calibration model
        self._calibration_model = {"isotonic-regression": calibration_models}

    def _scaling_factor_calibration(self, X, y, **kwargs):
        """
        Configure uncertainty calibration using a scaling factor.

        One scale factor is optimized per target column (criterion ``"miscal"``)
        and the list of factors is stored under the ``"scaling-factor"`` key of
        ``_calibration_model``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input validation set samples to calibrate.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            The target validation set values. 1-D targets (for example a
            ``pd.Series``) are treated as a single target column.
        **kwargs : dict
            Additional keyword arguments to be passed to the committee's predict method.

        """
        # Reset calibration model so the prediction below yields uncalibrated std
        self._calibration_model = None

        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.to_numpy()
        elif not hasattr(y, "ndim"):
            # Plain sequences (lists, tuples) have no shape to index against
            y = np.asarray(y)

        # A 1-D target vector is a single column; without this, ``y.shape[-1]``
        # would be the sample count and ``y[:, i]`` would raise
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        # Predict on recalibration (validation) set
        # NOTE(review): the column indexing below assumes 2-D predictions of
        # shape (n_samples, n_targets) -- confirm against the member models
        y_pred_mean, y_pred_std = self._predict(X, return_std=True, **kwargs)

        # Fit a separate scaling factor for each target dimension
        calibration_models = [
            uct.recalibration.optimize_recalibration_ratio(
                y_pred_mean[:, i], y_pred_std[:, i], y[:, i], criterion="miscal"
            )
            for i in range(y.shape[-1])
        ]

        self._calibration_model = {"scaling-factor": calibration_models}

    def _do_nothing_calibration(self, X, y, **kwargs):
        """
        Accept the calibration arguments and deliberately do nothing.

        This is the handler registered for ``method=None``; it leaves
        ``_calibration_model`` untouched.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input validation set samples (ignored).
        y : array-like of shape (n_samples, n_features)
            The target validation set values (ignored).
        **kwargs : dict
            Additional keyword arguments (ignored).

        """
        return None

[docs]    def calibrate_uncertainty(self, X, y, method="isotonic-regression", **kwargs):
        """
        Configure uncertainty calibration using selected method.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input validation set samples to calibrate.
        y : array-like of shape (n_samples, n_features)
            The target validation set values.
        method : str
            The calibration method to use. Options are "isotonic-regression" or "scaling-factor".
        **kwargs : dict
            Additional keyword arguments to be passed to the committee's predict method.

        """
        # Validate method selection
        if method not in self._calibration_methods:
            raise ValueError(
                f"Invalid calibration method: {method}. "
                f"Valid options are: {self._calibration_methods.keys()}."
            )

        getattr(self, self._calibration_methods[method])(X, y, **kwargs)

    def _get_calibration_function(self):
        """
        Build the callable that applies the stored calibration to a std array.

        The returned function takes a 2-D array ``x``, transforms each column
        ``x[:, i]`` with the i-th stored calibration entry, and stacks the results
        back along axis 1.

        Returns
        -------
        callable
            Column-wise calibration function for the stored calibration model.

        Raises
        ------
        ValueError
            If the stored calibration model holds neither a ``"scaling-factor"``
            nor an ``"isotonic-regression"`` entry.

        """
        calibration = self._calibration_model

        if "scaling-factor" in calibration:
            factors = calibration["scaling-factor"]
            # Multiply every column by its own scale factor
            return lambda x: np.stack(
                [factors[i] * (x[:, i]) for i in range(x.shape[-1])],
                axis=1,
            )

        if "isotonic-regression" in calibration:
            iso_models = calibration["isotonic-regression"]
            # Pass every column through its own recalibration callable
            return lambda x: np.stack(
                [iso_models[i](x[:, i]) for i in range(x.shape[-1])],
                axis=1,
            )

        # Fail here with a clear message instead of returning None and letting
        # the caller crash on "'NoneType' object is not callable"
        raise ValueError(
            "Unsupported calibration model: expected a 'scaling-factor' or "
            f"'isotonic-regression' entry, got {list(calibration)}."
        )

[docs]    def plot_uncertainty_calibration(self, X, y, **kwargs):
        """
        Plot uncertainty calibration for the committee model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input test set samples to calibrate.
        y : array-like of shape (n_samples, n_features)
            The target test set values.
        **kwargs : dict
            Additional keyword arguments to be passed to the committee's predict method.

        Returns
        -------
        list
            A list of plots for each target dimension.

        """
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.to_numpy()

        # Predict on recalibration (validation) set
        y_pred_mean, y_pred_std = self.predict(X, return_std=True, **kwargs)

        # Plot calibration
        plots = []
        for i in range(y.shape[-1]):
            plots.append(
                uct.viz.plot_calibration(
                    y_pred_mean[:, i].flatten(),
                    y_pred_std[:, i].flatten(),
                    y[:, i].flatten(),
                )
            )

        # If only one plot is generated, return it directly
        if len(plots) == 1:
            return plots[0]

        return plots

    @classmethod
    def train(
        cls,
        X,
        y,
        mod_class: ModelBase = None,
        mod_params: dict = None,
        n_models: int = 1,
        use_bagging: bool = True,
    ):
        """
        Train committee regressor members, optionally on bootstrapped data subsets.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples to train on.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            The target values.
        mod_class : ModelBase
            The type of model to use for training.
        mod_params : dict, optional
            The parameters to pass to the model. Defaults to no parameters. If a
            non-None ``"random_seed"`` entry is present, member ``i`` receives
            ``random_seed + i``.
        n_models : int
            The number of models in the committee, by default 1.
        use_bagging : bool
            Whether to use bagging (bootstrap aggregation) for training models.
            If False, models are trained on the full dataset.

        Returns
        -------
        CommitteeRegressor
            An instance of the CommitteeRegressor class.

        Raises
        ------
        ValueError
            If ``mod_class`` is None.

        """
        # Verify estimator input
        if mod_class is None:
            raise ValueError("Model type must be provided.")

        # Avoid a shared mutable default for the parameter dictionary
        base_params = {} if mod_params is None else mod_params

        # Initialize set of models
        models = []
        for i in range(n_models):
            # Vary each member's seed so the ensemble is not a set of identical models
            current_mod_params = base_params.copy()
            if current_mod_params.get("random_seed") is not None:
                current_mod_params["random_seed"] += i

            # Initialize model
            model = mod_class(**current_mod_params)
            model.build()

            if use_bagging:
                # Bootstrap the data: draw n_samples row indices with replacement
                bootstrap_idx = np.random.choice(
                    X.shape[0], size=X.shape[0], replace=True
                )

                # Index targets along the first axis only, so both 1-D
                # (n_samples,) and 2-D (n_samples, n_targets) targets work
                model.train(X[bootstrap_idx, :], y[bootstrap_idx])

            else:
                # Train the model on the full data
                model.train(X, y)

            # Add to list
            models.append(model)

        # Instantiate the committee regressor
        return cls.from_models(models)

[docs]    def query(self, X, query_strategy: str = None, **kwargs):
        """
        Query the committee to select instances for labeling.

        Parameters
        ----------
        X : array-like
            The input data from which instances are to be queried.
        query_strategy : str, optional
            The query strategy to use for selecting instances.
        **kwargs : dict
            Additional keyword arguments to be passed to the committee's query method.

        Returns
        -------
        np.array
            Values of the query strategy applied to the input data `X`.

        """
        if query_strategy.lower() not in _ACQUISITION_FUNCTIONS:
            raise ValueError(
                f"Invalid query strategy: {query_strategy}. "
                f"Valid options are: {list(_ACQUISITION_FUNCTIONS.keys())}"
            )

        mean, std = self.predict(X, return_std=True, **kwargs)

        return _ACQUISITION_FUNCTIONS[query_strategy](mean, std, **kwargs)

    def _predict(self, X, return_std=False, return_all=False, **kwargs):
        """
        Aggregate the predictions of all committee members.

        Every member's ``predict`` output is stacked along a new trailing axis.
        Depending on the flags, that stack is returned as-is or reduced to a mean
        and optionally a standard deviation over the member axis.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples to predict.
        return_std : bool, optional
            If True, also return the member-wise standard deviation, floored at
            1e-8 and passed through the calibration function when the committee
            is calibrated. Mutually exclusive with ``return_all``.
        return_all : bool, optional
            If True, return the stacked per-member predictions (members on the
            last axis) instead of the mean. Mutually exclusive with ``return_std``.
        **kwargs : dict
            Additional keyword arguments forwarded to each member's predict method.

        Returns
        -------
        array-like or tuple
            mean, or (mean, std), or the stacked per-member array, depending on
            the values of return_std and return_all.

        Raises
        ------
        ValueError
            If both ``return_std`` and ``return_all`` are set.

        """
        if return_all and return_std:
            raise ValueError(
                "return_std and return_all are mutually exclusive. "
                "When return_all=True, compute mean and std from the returned array as needed."
            )

        # One prediction array per member, with members on a new trailing axis
        member_outputs = [member.predict(X, **kwargs) for member in self.models]
        stacked = np.stack(member_outputs, axis=-1)

        if return_all:
            return stacked

        # Collapse the member axis
        ensemble_mean = np.mean(stacked, axis=-1)
        if not return_std:
            return ensemble_mean

        # Floor the spread so downstream code never sees an exact zero std
        ensemble_std = np.maximum(np.std(stacked, axis=-1), 1e-8)
        if self.calibrated:
            calibrate = self._get_calibration_function()
            ensemble_std = calibrate(ensemble_std)

        return ensemble_mean, ensemble_std

[docs]    def predict(self, X, return_std=False, return_all=False, **kwargs):
        """
        Make predictions using the committee model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples to predict.
        return_std : bool, optional
            Whether to return the standard deviation of the predictions.
            Mutually exclusive with ``return_all``.
        return_all : bool, optional
            Whether to return the raw per-member predictions of shape
            (n_samples, n_tasks, n_members) instead of the mean (and std).
            Mutually exclusive with ``return_std``.
        **kwargs : dict
            Additional keyword arguments to pass to the committee's predict method.

        Returns
        -------
        array-like or tuple
            mean, or (mean, std), or ndarray of shape (n_samples, n_tasks, n_members)
            depending on the values of return_std and return_all.

        """
        if return_std is True and not self.calibrated:
            logger.warning(
                "Standard deviation not calibrated: consider calling `calibrate_uncertainty`."
            )

        return self._predict(X, return_std=return_std, return_all=return_all, **kwargs)

    def _save_calibration_model(self, path: PathLike = "calibration_model.pkl"):
        """
        Write the calibration model to ``path`` with joblib.

        Nothing is written when the committee is not calibrated; a warning is
        logged instead.

        Parameters
        ----------
        path : PathLike
            Destination file for the dumped calibration model.

        """
        # Nothing to persist without a calibration model
        if not self.calibrated:
            logger.warning(
                "Standard deviation not calibrated: consider calling `calibrate_uncertainty` before saving."
            )
            return

        with open(path, "wb") as handle:
            joblib.dump(self._calibration_model, handle)

    def _load_calibration_model(self, path: PathLike = "calibration_model.pkl"):
        """
        Read a calibration model from ``path`` with joblib and attach it.

        Parameters
        ----------
        path : PathLike
            File previously produced by dumping a calibration model.

        """
        # NOTE: joblib.load unpickles the file; only load files from trusted sources
        with open(path, "rb") as handle:
            restored = joblib.load(handle)

        self._calibration_model = restored
        logger.info(f"Successfully loaded calibration from {path}")

[docs]    def save(
        self,
        paths: list[PathLike],
        calibration_path: PathLike = "calibration_model.pkl",
    ):
        """
        Save the committee model to the provided paths.

        Parameters
        ----------
        paths : list of PathLike
            The file paths to save the model weights.
        calibration_path: PathLike
            Path to save calibration model.

        """
        # Check number of paths match
        if self.n_models != len(paths):
            raise ValueError(
                f"Number of models ({self.n_models}) in the committee does not match the number of paths ({len(paths)})."
            )

        # Save each model to the provided paths
        for model, path in zip(self.models, paths):
            model.save(path)

        # Save calibration model
        self._save_calibration_model(calibration_path)

    @classmethod
    def load(
        cls,
        paths: list[PathLike],
        models: list[ModelBase] = None,
        calibration_path: PathLike = None,
    ):
        """
        Load a committee model from the provided paths.

        Parameters
        ----------
        paths : list of PathLike
            The file paths to the model weights.
        models : list of ModelBase
            Model instances associated with path to weights; ``models[i].load``
            is called with ``paths[i]``.
        calibration_path : PathLike, optional
            The file path to the calibration model. When None, no calibration is
            loaded.

        Returns
        -------
        CommitteeRegressor
            A committee model created from the loaded models.

        Raises
        ------
        ValueError
            If ``models`` is None or its length differs from ``paths``.

        """
        # Check model type
        if models is None:
            raise ValueError("Must provide a list of model instances to load.")

        # Check lengths match
        if len(paths) != len(models):
            raise ValueError("Number of paths and models do not match.")

        # Load each model from the provided paths; a plain loop, since the calls
        # are made for their side effect and any return value is discarded
        for model, path in zip(models, paths):
            model.load(path)

        # Create a CommitteeRegressor instance from the loaded models
        instance = cls.from_models(models)

        # Load calibration model (best effort: a failure is logged, not raised)
        if calibration_path is not None:
            try:
                instance._load_calibration_model(calibration_path)
            except Exception as e:
                logger.warning(
                    f"Failed to load calibration model from {calibration_path}: {e}, if a calibration model is not expected, this warning can be ignored."
                )

        return instance

[docs]    def serialize(
        self,
        param_paths: list[PathLike],
        serial_paths: list[PathLike],
        calibration_path: PathLike = "calibration_model.pkl",
    ):
        """
        Save the model to json files and pickled files.

        Parameters
        ----------
        param_paths : list of PathLike
            The file paths to save the model weights.
        serial_paths : list of PathLike
            The file paths to save the model architecture.
        calibration_path : PathLike
            The file path to save the calibration model.

        """
        # Check number of paths match
        if len(param_paths) != len(serial_paths):
            raise ValueError(
                f"Number of parameter files ({len(param_paths)}) and serial files ({len(serial_paths)}) do not match."
            )

        # Check number of models match
        if self.n_models != len(param_paths):
            raise ValueError(
                f"Number of models ({self.n_models}) in the committee does not match the number of parameter files ({len(param_paths)})."
            )

        # Serialize each model
        for model, param_path, serial_path in zip(
            self.models, param_paths, serial_paths
        ):
            model.serialize(param_path, serial_path)

        # Save calibration model
        self._save_calibration_model(calibration_path)

    @classmethod
    def deserialize(
        cls,
        param_paths: list[PathLike],
        serial_paths: list[PathLike],
        mod_class: ModelBase = None,
        calibration_path: PathLike = None,
    ):
        """
        Create a committee from per-member parameter and serialization files.

        Parameters
        ----------
        param_paths : list of PathLike
            The file paths to the model parameters.
        serial_paths : list of PathLike
            The file paths to the model serializations.
        mod_class : ModelBase
            Model class whose ``deserialize`` is called for each pair of paths.
        calibration_path : PathLike, optional
            The file path to the calibration model. When None, no calibration is
            loaded.

        Returns
        -------
        CommitteeRegressor
            A committee model created from the deserialized members.

        Raises
        ------
        ValueError
            If ``mod_class`` is None or the two path lists differ in length.

        """
        # Check model type
        if mod_class is None:
            raise ValueError("Must provide a model type to load.")

        # Check lengths match
        if len(param_paths) != len(serial_paths):
            raise ValueError(
                f"Number of parameter files {len(param_paths)} and serial files {len(serial_paths)} do not match."
            )

        # Deserialize each model from its pair of files
        models = [
            mod_class.deserialize(param_path, serial_path)
            for param_path, serial_path in zip(param_paths, serial_paths)
        ]

        # Create a CommitteeRegressor instance from the deserialized models
        instance = cls.from_models(models)

        # Load calibration model (best effort: a failure is logged, not raised)
        if calibration_path is not None:
            try:
                instance._load_calibration_model(calibration_path)
            except Exception as e:
                logger.warning(
                    f"Failed to load calibration model from {calibration_path}: {e}, if a calibration model is not expected, this warning can be ignored."
                )

        return instance