Source code for openadmet.models.features.molfeat_fingerprint
"""Fingerprint featurizer using molfeat library."""
from collections.abc import Iterable
from typing import Any, ClassVar
import datamol as dm
import numpy as np
from molfeat.trans import MoleculeTransformer
from molfeat.trans.fp import FPVecTransformer
from pydantic import Field
from openadmet.models.features.feature_base import MolfeatFeaturizer, featurizers
[docs]@featurizers.register("FingerprintFeaturizer")
class FingerprintFeaturizer(MolfeatFeaturizer):
"""
Fingerprint featurizer for molecules, relies on molfeat backend.
Attributes
----------
type : ClassVar[str]
The type of the featurizer.
fp_type : str
The type of fingerprint to use (e.g., 'ecfp4', 'morgan', 'rdkit', etc.).
dtype : Any
The data type to use for the fingerprint (e.g., np.float32).
n_jobs : int
The number of jobs to use for featurization, -1 for maximum parallelism.
"""
type: ClassVar[str] = "FingerprintFeaturizer"
fp_type: str = Field(
..., title="Fingerprint type", description="The type of fingerprint to use"
)
dtype: Any = Field(
np.float32,
title="Data type",
description="The data type to use for the fingerprint",
)
n_jobs: int = Field(
-1,
title="Number of jobs",
description="The number of jobs to use for featurization, -1 for maximum parallelism",
)
def _prepare(self):
"""Prepare the featurizer."""
vec_featurizer = FPVecTransformer(self.fp_type, dtype=self.dtype)
self._transformer = MoleculeTransformer(
vec_featurizer,
n_jobs=self.n_jobs,
dtype=self.dtype,
parallel_kwargs={"progress": False},
verbose=True,
)
[docs] def featurize(self, smiles: Iterable[str]) -> tuple[np.ndarray, np.ndarray]:
"""
Featurize a list of SMILES strings.
Parameters
----------
smiles : Iterable[str]
List or iterable of SMILES strings to featurize.
Returns
-------
tuple
Tuple of (features, indices). Features is a 2D numpy array of shape (
n_samples, n_features) and indices is a 1D numpy array of the indices of the
successfully featurized molecules.
"""
with dm.without_rdkit_log():
feat, indices = self._transformer(smiles, ignore_errors=True)
# datamol returns with an extra dimension
return np.squeeze(feat), indices