Source code for bonafide.utils.feature_output

"""Output formatting after atom and bond featurization."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

import pandas as pd
from rdkit import Chem

from bonafide.utils.constants import UNDESIRED_ATOM_BOND_PROPERTIES2
from bonafide.utils.helper_functions_output import (
    get_energy_based_reduced_features,
    get_non_energy_based_reduced_features,
)

if TYPE_CHECKING:
    from bonafide.utils.molecule_vault import MolVault


[docs] class FeatureOutput: """Format the output of the calculated atom or bond features. Attributes ---------- _index_name : str The name of the index of the pandas DataFrame, either "ATOM_INDEX" or "BOND_INDEX". _loc : str The name of the current location in the code for logging purposes. feature_type : str The type of features to return, either "atom" or "bond". ignore_invalid : bool Whether to ignore invalid conformers during feature reduction. indices : List[int] The list of atom or bond indices to include. mol_vault : MolVault The instance of the dataclass for storing all relevant data on the molecule for which features were calculated. reduce : bool Whether to reduce the features to their minimum, maximum, and mean values across all conformers. If energies are available, also Boltzmann-averaged values are calculated as well as the data for the lowest- and highest-energy conformers. """ def __init__( self, mol_vault: MolVault, indices: List[int], feature_type: str, reduce: bool, ignore_invalid: bool, _loc: str, ) -> None: # Initialize attributes self.mol_vault = mol_vault self.indices = indices self.feature_type = feature_type self.reduce = reduce self.ignore_invalid = ignore_invalid self._loc = _loc # Define name of the index of the DataFrame if self.feature_type == "atom": self._index_name = "ATOM_INDEX" if self.feature_type == "bond": self._index_name = "BOND_INDEX"
[docs] def get_results( self, output_format: str ) -> Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol]: """Get the atom and bond features, respectively, in the desired output format. Parameters ---------- output_format : str The name of the desired output format, can be "df", "dict", or "mol_object". Returns ------- Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol] The features in the desired output format. """ # Base case (no feature formatting needed) if output_format == "mol_object" and self.reduce is False: _mols = self._clear_mols(mols=self.mol_vault.mol_objects) _mols = self._fill_missing_features(mols=_mols) return _mols # Get the features of all conformers in one DataFrame combined_df = None for conf_idx, mol in enumerate(self.mol_vault.mol_objects): # Skip invalid conformers if self.mol_vault.is_valid[conf_idx] is False: logging.warning( f"'{self.mol_vault.namespace}' | {self._loc}()\nThe conformer with index " f"'{conf_idx}' is invalid. It is ignored during any feature output " "calculation and formatting." ) continue # Get feature DataFrame for this conformer and add it to the DataFrame for # all conformers combined_df = self._get_feature_df( mol=mol, conformer_idx=conf_idx, combined_df=combined_df ) # Check if any valid conformers were found if combined_df is None: _errmsg = "No valid conformers were found in the molecule vault." logging.error(f"'{self.mol_vault.namespace}' | {self._loc}()\n{_errmsg}") raise ValueError(f"{self._loc}(): {_errmsg}") # Clean the DataFrame from undesired properties combined_df = combined_df.drop(columns=UNDESIRED_ATOM_BOND_PROPERTIES2, errors="ignore") # Get the reduced DataFrame assert isinstance(self.mol_vault.size, int) # for type checker if self.reduce is True and self.mol_vault.size > 1: processed_df = self._reduce_conformer_data(df=combined_df) elif self.reduce is True and self.mol_vault.size == 1: logging.warning( f"'{self.mol_vault.namespace}' | {self._loc}()\nThe 'reduce' parameter was set to " "True but the molecule vault only contains one conformer. Therefore, unreduced " "data is returned." ) processed_df = combined_df else: processed_df = combined_df # Postprocess the DataFrame (remove energy related columns) processed_df = self._postprocess_df(df=processed_df) match output_format: # DataFrame output case "df": return processed_df # Dict output case "dict": # Special case: dict output without reduce and multiple conformers if self.reduce is False and self.mol_vault.size > 1: result_dict = {} for idx in processed_df.index.unique(): subset = processed_df.loc[[idx]] result_dict[idx] = { col: subset[col].tolist() for col in processed_df.columns } return result_dict # Otherwse just return the dict (since this is either a reduced dataframe - one # entry per feature - or a single mol object) else: return processed_df.to_dict("index") # Mol output with reduce (the non-reduce case is handled above) case _: _mol = self._cast_reduced_props_to_mol( df=processed_df, mol=self.mol_vault.mol_objects[0] ) return _mol
[docs] def _clear_mols(self, mols: List[Chem.rdchem.Mol]) -> List[Chem.rdchem.Mol]: """Remove all properties from all atoms or bonds in the given list of molecule objects. Parameters ---------- mols : List[Chem.rdchem.Mol] The list of RDKit molecule objects to clean. Returns ------- List[Chem.rdchem.Mol] The list of cleaned RDKit molecule objects. """ _mols = [] for mol in mols: _mol = Chem.Mol(mol) # Remove all bond properties if self.feature_type == "atom": for bond in _mol.GetBonds(): for prop in bond.GetPropNames(): bond.ClearProp(prop) # Remove all atom properties if self.feature_type == "bond": for atom in _mol.GetAtoms(): for prop in atom.GetPropNames(): atom.ClearProp(prop) _mols.append(_mol) return _mols
[docs] def _fill_missing_features(self, mols: List[Chem.rdchem.Mol]) -> List[Chem.rdchem.Mol]: """Fill missing features in the given list of molecule objects with ``NaN`` values. Parameters ---------- mols : List[Chem.rdchem.Mol] The list of RDKit molecule objects to process. Returns ------- List[Chem.rdchem.Mol] The list of RDKit molecule objects with missing features filled with ``NaN`` values. """ # Get all feature names feature_names = set() for mol in mols: # Atoms if self.feature_type == "atom": for atom in mol.GetAtoms(): feature_names.update(atom.GetPropNames()) # Bonds if self.feature_type == "bond": for bond in mol.GetBonds(): feature_names.update(bond.GetPropNames()) # Fill missing features with NaN and for mol in mols: # Atoms if self.feature_type == "atom": for atom in mol.GetAtoms(): if atom.GetIdx() in self.indices: for feature in feature_names: if atom.HasProp(feature) == 0: atom.SetProp(feature, "NaN") # Bonds if self.feature_type == "bond": for bond in mol.GetBonds(): if bond.GetIdx() in self.indices: for feature in feature_names: if bond.HasProp(feature) == 0: bond.SetProp(feature, "NaN") # Check if an atom or bond has only NaN features for mol in mols: # Atoms if self.feature_type == "atom": for atom in mol.GetAtoms(): if atom.GetIdx() in self.indices: if all( [atom.HasProp(f) and atom.GetProp(f) == "NaN" for f in feature_names] ): logging.warning( f"'{self.mol_vault.namespace}' | {self._loc}()\nThe atom with " f"index {atom.GetIdx()} has all features as NaN values. This is " "probably due to the fact that no features were calculated for " "this atom." ) # Bonds if self.feature_type == "bond": for bond in mol.GetBonds(): if bond.GetIdx() in self.indices: if all( [bond.HasProp(f) and bond.GetProp(f) == "NaN" for f in feature_names] ): logging.warning( f"'{self.mol_vault.namespace}' | {self._loc}()\nThe bond with " f"index {bond.GetIdx()} has all features as NaN values. This is " "probably due to the fact that no features were calculated for " "this bond." ) return mols
[docs] def _get_feature_df( self, mol: Chem.rdchem.Mol, conformer_idx: int, combined_df: Optional[pd.DataFrame] ) -> pd.DataFrame: """Get all atom or bond properties as a pandas DataFrame. Parameters ---------- mol : Chem.rdchem.Mol The RDKit molecule object with calculated features as atom and bond properties. conformer_idx : int The index of the conformer in the molecule vault. combined_df : Optional[pd.DataFrame] The DataFrame with the features from all conformers. This is ``None`` if the current conformer is the first valid conformer. Returns ------- pd.DataFrame The pandas DataFrame with the atoms or bonds as rows and the features as columns. """ # Get atom properties if self.feature_type == "atom": idx_to_prop = {idx: mol.GetAtomWithIdx(idx).GetPropsAsDict() for idx in self.indices} # Get bond properties if self.feature_type == "bond": idx_to_prop = {idx: mol.GetBondWithIdx(idx).GetPropsAsDict() for idx in self.indices} # Get DataFrame df = pd.DataFrame(idx_to_prop).T df.index.name = self._index_name # Add additional information to the DataFrame assert isinstance(self.mol_vault.size, int) # for type checker if self.mol_vault.size > 1: df["conformer_name"] = self.mol_vault.conformer_names[conformer_idx] if self.mol_vault.energies_n_read is True: df["conformer_energy"] = self.mol_vault.energies_n[conformer_idx][0] # Add Boltzmann weights if available if all( [ self.mol_vault.boltzmann_weights != (None, [None]), self.mol_vault.boltzmann_weights != (), ] ): df["boltzmann_weight"] = self.mol_vault.boltzmann_weights[-1][conformer_idx] # type: ignore[index, misc] # Add to combined DataFrame if combined_df is None: return df else: return pd.concat([combined_df, df])
[docs] def _reduce_conformer_data(self, df: pd.DataFrame) -> pd.DataFrame: """Reduce conformer data by calculating various statistics and Boltzmann-weighted averages. Parameters ---------- df : pd.DataFrame The feature pandas DataFrame containing the data for the individual conformers. Returns ------- pd.DataFrame The feature pandas DataFrame with the reduced conformer data. """ _exclude_cols = ["conformer_name", "conformer_energy", "boltzmann_weight"] # Non-energy-based reduced feature values mean_features, min_features, max_features = get_non_energy_based_reduced_features( df=df, exclude_cols=_exclude_cols, feature_type=self.feature_type, _namespace=self.mol_vault.namespace, _loc=self._loc, ) logging.info( f"'{self.mol_vault.namespace}' | {self._loc}()\nCalculation of the mean, min, and max " f"feature values across all conformers per {self.feature_type} done." ) # Build the results DataFrame (reduced DataFrame) reduced_df = pd.concat( [mean_features, min_features, max_features], axis=1, ) reduced_df.index.name = self._index_name # Energy-based reduced feature values if self.mol_vault.energies_n_read is False: logging.warning( f"'{self.mol_vault.namespace}' | {self._loc}()\nNo energies are available for the " "individual conformers. Therefore, the energy-based feature calculations are " "skipped and only the min, max, and mean values are returned in the reduced" " DataFrame." ) logging.info( f"'{self.mol_vault.namespace}' | {self._loc}()\nCalculation of the feature " "values of the lowest- and highest-energy conformer and the Boltzmann-weighted " f"average across all conformers per {self.feature_type} done." ) return reduced_df # Check if any of the conformer energies are NaN, if they are and the user has asked to not # ignore these structures, then the energy-based calculations cannot be performed elif df["conformer_energy"].isna().any(): if self.ignore_invalid is True: logging.warning( f"'{self.mol_vault.namespace}' | {self._loc}()\nSome conformers are invalid " "and 'ignore_invalid' was set to True. Therefore, Boltzmann-weighted features " "will be calculated only considering the valid conformers." ) else: logging.info( f"'{self.mol_vault.namespace}' | {self._loc}()\nSome conformers are invalid " "and 'ignore_invalid' was set to False. Therefore, all energy-based feature " "calculations are skipped and only the min, max, and mean values are returned " "in the reduced DataFrame." ) logging.info( f"'{self.mol_vault.namespace}' | {self._loc}()\nCalculation of the feature " "values of the lowest- and highest-energy conformer and the Boltzmann-weighted " f"average across all conformers per {self.feature_type} done." ) return reduced_df # Calculate min energy, max energy, and Boltzmann-weighted average features min_e_features, max_e_features, boltzmann_features = get_energy_based_reduced_features( df=df, exclude_cols=_exclude_cols, feature_type=self.feature_type, _namespace=self.mol_vault.namespace, _loc=self._loc, ) logging.info( f"'{self.mol_vault.namespace}' | {self._loc}()\nCalculation of the feature values of " "the lowest- and highest-energy conformer and the Boltzmann-weighted average across " f"all conformers per {self.feature_type} done." ) # Rebuild the results DataFrame (reduced DataFrame) to have a nicer column order reduced_df = pd.concat( [ mean_features, min_features, max_features, min_e_features, max_e_features, boltzmann_features, ], axis=1, ) reduced_df.index.name = self._index_name return reduced_df
[docs] def _postprocess_df(self, df: pd.DataFrame) -> pd.DataFrame: """Postprocess the feature DataFrame by removing unneeded columns and check if any atom or bond has all features as ``NaN`` values. Parameters ---------- df : pd.DataFrame The formatted feature pandas DataFrame before postprocessing. Returns ------- pd.DataFrame The postprocessed feature pandas DataFrame. """ # Check if conformer_energy is in the DataFrame if "conformer_energy" in df.columns: _energies_all_nan = bool(df["conformer_energy"].isna().all()) else: _energies_all_nan = True # Remove columns assert isinstance(self.mol_vault.size, int) # for type checker if any([self.mol_vault.size <= 1, self.reduce is True, _energies_all_nan]): df = df.drop(["conformer_energy", "boltzmann_weight"], axis=1, errors="ignore") # Check for NaN rows df_ = df.drop( ["conformer_name", "conformer_energy", "boltzmann_weight"], axis=1, errors="ignore" ) nan_indices = df_.index[df_.isna().all(axis=1)].tolist() nan_indices = list(set(nan_indices)) nan_indices.sort() for idx in nan_indices: logging.warning( f"'{self.mol_vault.namespace}' | {self._loc}()\nThe {self.feature_type} with index " f"{idx} has all features as NaN values. This is probably due to the fact that no " f"features were calculated for this {self.feature_type}." ) return df
[docs] def _cast_reduced_props_to_mol(self, df: pd.DataFrame, mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol: """Cast the features in the reduced DataFrame to atom or bond properties in a molecule object. The provided RDKit molecule object is copied and cleaned from all properties and conformers. Parameters ---------- df : pd.DataFrame The feature DataFrame containing the reduced data. mol : Chem.rdchem.Mol The RDKit molecule object to which the features should be added as properties. Returns ------- Chem.rdchem.Mol The RDKit molecule object with the features added as atom or bond properties. """ _mol = Chem.Mol(mol) # Clear properties for atom in _mol.GetAtoms(): for prop in atom.GetPropNames(): atom.ClearProp(prop) for bond in _mol.GetBonds(): for prop in bond.GetPropNames(): bond.ClearProp(prop) # Remove conformer _mol.RemoveAllConformers() # Iterate over the feature DataFrame and set properties in the mol object for idx, row in df.iterrows(): for feature_name in df.columns: value = row[feature_name] dtype = type(value).__name__ # Atom properties if self.feature_type == "atom": obj = _mol.GetAtomWithIdx(idx) # Bond properties if self.feature_type == "bond": obj = _mol.GetBondWithIdx(idx) # Set property if "int" in dtype: obj.SetIntProp(feature_name, int(value)) elif "float" in dtype: obj.SetDoubleProp(feature_name, float(value)) elif dtype == "str": obj.SetProp(feature_name, value) elif dtype == "bool": obj.SetBoolProp(feature_name, value) return _mol