Source code for openadmet.models.features.molfeat_properties
"""Molecular descriptor featurizer using molfeat library."""
from collections.abc import Iterable
from typing import Any, ClassVar
import datamol as dm
import numpy as np
from molfeat.trans import MoleculeTransformer
from pydantic import Field, field_validator
from openadmet.models.features.feature_base import MolfeatFeaturizer, featurizers
@featurizers.register("DescriptorFeaturizer")
class DescriptorFeaturizer(MolfeatFeaturizer):
    """
    Molecular descriptor featurizer, relies on molfeat backend.

    Attributes
    ----------
    descr_type : str
        The type of descriptor to use, must be one of 'mordred', 'desc2d',
        'desc3d'.
    dtype : Any
        The data type requested from the molfeat transformer for the
        computed features.
    n_jobs : int
        The number of jobs to use for featurization, -1 for maximum parallelism.

    """

    type: ClassVar[str] = "DescriptorFeaturizer"

    descr_type: str = Field(
        ...,
        title="Descriptor type",
        description="The type of descriptor to use, must be one of 'mordred', 'desc2d', 'desc3d'",
    )
    dtype: Any = Field(
        np.float32,
        title="Data type",
        description="The data type to use for the fingerprint",
    )
    n_jobs: int = Field(
        -1,
        title="Number of jobs",
        description="The number of jobs to use for featurization, -1 for maximum parallelism",
    )

    @field_validator("descr_type")
    @classmethod
    def validate_descr_type(cls, value):
        """
        Validate the descriptor type.

        Parameters
        ----------
        value : str
            Candidate value for ``descr_type``.

        Returns
        -------
        str
            The value, unchanged, when it names a supported descriptor set.

        Raises
        ------
        ValueError
            If ``value`` is not one of 'mordred', 'desc2d', 'desc3d'.

        """
        if value not in ("mordred", "desc2d", "desc3d"):
            raise ValueError(
                "Descriptor type must be one of 'mordred', 'desc2d', 'desc3d'"
            )
        return value

    def _prepare(self):
        """Build the molfeat ``MoleculeTransformer`` from the current field values."""
        self._transformer = MoleculeTransformer(
            self.descr_type,
            n_jobs=self.n_jobs,
            dtype=self.dtype,
            parallel_kwargs={"progress": False},
            verbose=True,
        )

    def featurize(self, smiles: Iterable[str]) -> tuple[np.ndarray, np.ndarray]:
        """
        Featurize a list of SMILES strings.

        Parameters
        ----------
        smiles : Iterable[str]
            List or iterable of SMILES strings to featurize.

        Returns
        -------
        tuple
            Tuple of (features, indices). Features is a 2D numpy array of shape
            (n_samples, n_features) and indices holds the indices of the
            successfully featurized molecules, as returned by the transformer.

        """
        # RDKit logging is silenced while the transformer runs; molecules that
        # fail are skipped (ignore_errors=True) rather than raising.
        with dm.without_rdkit_log():
            feat, indices = self._transformer(smiles, ignore_errors=True)

        # The backend can return the features with an extra singleton
        # dimension. Only singleton axes *between* the sample axis and the
        # feature axis are dropped: a blanket ``np.squeeze`` would also remove
        # the sample axis when a single molecule is featurized and hand back a
        # 1D array, breaking the documented (n_samples, n_features) shape.
        if isinstance(feat, np.ndarray) and feat.ndim >= 2:
            inner_singletons = tuple(
                axis for axis in range(1, feat.ndim - 1) if feat.shape[axis] == 1
            )
            if inner_singletons:
                feat = np.squeeze(feat, axis=inner_singletons)
            return feat, indices

        # Anything that is not a >=2D numpy array keeps the previous handling.
        return np.squeeze(feat), indices