# Source code for openadmet.models.features.combine

"""Combine features from multiple featurizers into a single feature array."""

from functools import reduce

import numpy as np
from numpy.typing import ArrayLike
from pydantic import Field, field_validator

from openadmet.models.features.feature_base import (
    FeaturizerBase,
    featurizers,
    get_featurizer_class,
)


@featurizers.register("FeatureConcatenator")
class FeatureConcatenator(FeaturizerBase):
    """
    Concatenate features from multiple featurizers into a single feature array.

    Attributes
    ----------
    featurizers : list of FeaturizerBase
        List of featurizer instances to concatenate.

    """

    featurizers: list[FeaturizerBase] = Field(
        ..., description="List of featurizers to concatenate"
    )

    @field_validator("featurizers", mode="before")
    @classmethod
    def validate_featurizers(cls, value):
        """
        Validate and construct the list of featurizers.

        If passed a dictionary of parameters, construct the relevant featurizers
        and pack them into the featurizers list. If a list is provided, use it
        directly. Any other type is returned unchanged.

        Parameters
        ----------
        value : dict or list
            Dictionary mapping featurizer type names to their keyword
            parameters (``None`` is treated as "no parameters"), or a list of
            featurizer instances.

        Returns
        -------
        list
            Featurizer instances sorted by class name (for dict and list
            input); the untouched input for any other type.

        """
        if isinstance(value, dict):
            # ``feat_params or {}`` lets a config entry with no parameters
            # (value ``None``) build the featurizer with its defaults instead
            # of failing on ``**None``.
            processed_featurizers = [
                get_featurizer_class(feat_type)(**(feat_params or {}))
                for feat_type, feat_params in value.items()
            ]
        elif isinstance(value, list):
            processed_featurizers = value
        else:
            # Unexpected type: hand it back untouched. This is a "before"
            # validator, so the regular validation of the annotated field
            # type still runs on the returned value.
            return value

        # Sort by class name so the column order produced by ``featurize`` does
        # not depend on the order in which the featurizers were specified.
        return sorted(processed_featurizers, key=lambda f: f.__class__.__name__)

    def featurize(self, smiles: list[str]) -> tuple[np.ndarray, np.ndarray]:
        """
        Featurize SMILES strings with every featurizer and concatenate the results.

        Parameters
        ----------
        smiles : list of str
            List of SMILES strings to featurize.

        Returns
        -------
        tuple of (np.ndarray, np.ndarray)
            The column-wise concatenated feature array and the indices that
            were returned by every featurizer, as produced by ``concatenate``.

        """
        features = []
        indices = []
        for feat in self.featurizers:
            # Each featurizer is expected to return a (features, indices) pair.
            feat_res, idx = feat.featurize(smiles)
            features.append(feat_res)
            indices.append(idx)

        return self.concatenate(features, indices)

    @staticmethod
    def concatenate(
        feats: list[ArrayLike], indices: list[np.ndarray]
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Concatenate feature arrays, keeping only entries present in all of them.

        Parameters
        ----------
        feats : list of array-like
            List of feature arrays to concatenate. Row ``i`` of ``feats[k]``
            is taken to belong to the entry ``indices[k][i]``. A 1d array is
            treated as a single row.
        indices : list of np.ndarray
            List of index arrays indicating valid entries for each feature array.

        Returns
        -------
        tuple of (np.ndarray, np.ndarray)
            Tuple of (concatenated feature array, common indices). The rows of
            the feature array follow the order of the common indices.

        """
        # Coerce to arrays so plain array-likes (e.g. nested lists) are accepted,
        # and promote 1d features from a single input to a single 2d row.
        arrays = [np.asarray(feat) for feat in feats]
        arrays = [arr.reshape(1, -1) if arr.ndim == 1 else arr for arr in arrays]

        # Entries that every featurizer managed to featurize.
        common_indices = reduce(np.intersect1d, indices)

        filtered_feats = []
        for arr, idx in zip(arrays, indices):
            idx = np.asarray(idx)
            # Locate every common index inside ``idx`` and pick the rows in
            # that order. A plain boolean mask would keep the rows in the
            # order of ``idx``, which misaligns the blocks (and the returned
            # indices) whenever ``idx`` is not sorted the same way everywhere.
            order = np.argsort(idx, kind="stable")
            rows = order[np.searchsorted(idx, common_indices, sorter=order)]
            filtered_feats.append(arr[rows])

        # Stack the per-featurizer blocks side by side (column wise).
        concat_feats = np.concatenate(filtered_feats, axis=1)

        return (
            concat_feats,
            common_indices,
        )