Source code for openadmet.models.features.feature_base

"""Base classes and utilities for molecular featurizers."""

from abc import ABC, abstractmethod
from collections.abc import Iterable

import numpy as np
from class_registry import ClassRegistry, RegistryKeyError
from molfeat.trans import MoleculeTransformer
from pydantic import BaseModel
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset

# Module-level registry of featurizer classes; `get_featurizer_class` resolves
# a feature-type key against it.
# NOTE(review): `unique=True` presumably makes registering a second class under
# an already-used key an error instead of a silent overwrite — confirm against
# the class_registry documentation.
featurizers = ClassRegistry(unique=True)


def get_featurizer_class(feat_type):
    """
    Retrieve a featurizer class from the registry by type.

    Parameters
    ----------
    feat_type
        Registry key identifying the featurizer class to look up.

    Returns
    -------
    type
        The class registered in ``featurizers`` under ``feat_type``.

    Raises
    ------
    ValueError
        If ``featurizers`` has no class registered under ``feat_type``. The
        originating ``RegistryKeyError`` is attached as ``__cause__``.

    """
    try:
        feat_class = featurizers.get_class(feat_type)
    except RegistryKeyError as err:
        # Chain explicitly so the traceback presents the registry miss as the
        # direct cause, not as an unrelated error raised while handling it.
        raise ValueError(
            f"Feature type {feat_type} not found in feature catalouge"
        ) from err
    return feat_class
class FeaturizerBase(BaseModel, ABC):
    """
    Abstract interface shared by all molecular featurizers.

    Concrete featurizers derive from this class and provide a ``featurize``
    implementation that turns SMILES strings into a feature representation
    usable by machine learning models.
    """

    @abstractmethod
    def featurize(self, smiles: Iterable[str], *args, **kwargs):
        """
        Convert SMILES strings into model-ready features.

        Parameters
        ----------
        smiles : Iterable[str]
            The SMILES strings to featurize.
        *args
            Extra positional arguments, interpreted by the subclass.
        **kwargs
            Extra keyword arguments, interpreted by the subclass.

        Returns
        -------
        Any
            The features in whatever form suits the target model (for
            example numpy arrays or dataloaders), optionally accompanied by
            processing information.

        """
class DeepLearningFeaturizer(FeaturizerBase):
    """
    Abstract featurizer for deep learning workflows.

    Narrows the ``FeaturizerBase`` contract to a fixed output shape: a
    subclass's ``featurize`` is expected to hand back a DataLoader, an array
    of indices, a StandardScaler and a PyTorch Dataset, in that order.
    """

    @abstractmethod
    def featurize(
        self, smiles: Iterable[str], y: Iterable[float] = None
    ) -> tuple[DataLoader, np.ndarray, StandardScaler, Dataset]:
        """
        Convert SMILES strings into inputs for a deep learning model.

        Parameters
        ----------
        smiles : Iterable[str]
            The SMILES strings to featurize.
        y : Iterable[float], optional
            Target values paired with ``smiles``; defaults to ``None``.

        Returns
        -------
        tuple
            A 4-tuple made up of:

            - DataLoader: PyTorch DataLoader over the dataset.
            - np.ndarray: indices corresponding to the original input.
            - StandardScaler: scaler used for any scaling applied during
              featurization.
            - Dataset: PyTorch Dataset holding the features and targets.

        """
class MolfeatFeaturizer(FeaturizerBase):
    """
    Abstract base for featurizers backed by the molfeat library.

    Holds a molfeat ``MoleculeTransformer`` and exposes it through the
    read-only ``transformer`` property. Subclasses set it up in ``_prepare``,
    which runs automatically at the end of construction.

    Attributes
    ----------
    _transformer : MoleculeTransformer
        The molfeat transformer that performs the featurization.

    """

    # Starts out unset (None); `_prepare` is the hook meant to configure it.
    _transformer: MoleculeTransformer = None

    def __init__(self, *args, **kwargs):
        """
        Construct the featurizer and run its preparation hook.

        Parameters
        ----------
        *args
            Positional arguments passed through to the parent initializer.
        **kwargs
            Keyword arguments passed through to the parent initializer.

        """
        super().__init__(*args, **kwargs)
        # Runs only after the parent initializer has finished.
        self._prepare()

    @abstractmethod
    def _prepare(self):
        """
        Set up the underlying molfeat transformer.

        Subclasses must implement this hook to initialize or configure the
        transformer; it is invoked once from ``__init__``.
        """

    @property
    def transformer(self):
        """Expose the underlying transformer, e.g. for SkLearn pipelines."""
        return self._transformer