"""BONAFIDE main module."""
from __future__ import annotations
import logging
import os
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
import pandas as pd
from rdkit import Chem
from bonafide._bonafide import _AtomBondFeaturizer
from bonafide.utils.constants import (
DETERMINE_BONDS_METHODS,
ELECTRONIC_STRUCTURE_ENGINES,
FEATURE_TYPES,
INPUT_FILE_EXTENSIONS,
INPUT_TYPES,
REDOX_STATES,
REDOX_STATES2,
)
from bonafide.utils.custom_featurizer_input_validation import custom_featurizer_data_validator
from bonafide.utils.feature_factories import FEATURE_FACTORIES
from bonafide.utils.helper_functions import get_function_or_method_name, standardize_string
from bonafide.utils.io_ import read_mol_object, read_sd_file, read_xyz_file
from bonafide.utils.molecule_vault import MolVault
from bonafide.utils.string_formatting import (
_make_bold_end,
_make_bold_start,
_make_green_end,
_make_green_start,
_make_underlined_end,
_make_underlined_start,
)
if TYPE_CHECKING:
import ipywidgets
from PIL import PngImagePlugin
[docs]
class AtomBondFeaturizer(_AtomBondFeaturizer):
"""Main class of the Bond and Atom Featurizer and Descriptor Extractor (BONAFIDE).
It implements all the methods available to the user to calculate atom- and/or bond-specific
features.
Parameters
----------
log_file_name : str, optional
The name of the log file to which all logging messages are written, by default
"bonafide.log". A file with this name must not already exist.
Attributes
----------
_atom_feature_indices_2D : List[int]
The list of atom feature indices that can be calculated for molecules for which only 2D
information is available.
_atom_feature_indices_3D : List[int]
The list of atom feature indices that can be calculated for molecules for which 3D
information is available.
_bond_feature_indices_2D : List[int]
The list of bond feature indices that can be calculated for molecules for which only 2D
information is available.
_bond_feature_indices_3D : List[int]
The list of bond feature indices that can be calculated for molecules for which 3D
information is available.
_feature_config : Dict[str, Any]
The configuration settings for the individual programs used for feature calculation. The
default settings are loaded from the ``_feature_config.toml`` file. The current settings
can be inspected with the ``print_options()`` method and changed using the
``set_options()`` method.
_feature_info : Dict[int, Dict[str, Any]]
The metadata of all implemented atom and bond features, e.g., the name of the feature, its
dimensionality requirements (either 2D or 3D), or the program it is calculated with
(origin). The data is loaded from the ``_feature_info.json`` file and should not be
manually modified.
_feature_info_df : pd.DataFrame
A pandas DataFrame containing the feature indices (as index of the DataFrame) and their key
characteristics of all implemented atom and bond features.
_functional_groups_smarts : Dict[str, List[Tuple[str, Chem.rdchem.Mol]]]
A dictionary containing the names and SMARTS patterns of different functional groups.
_init_directory : str
The path to the directory where the ``AtomBondFeaturizer`` object was initialized.
_keep_output_files : bool
If ``True``, all output files created during the feature calculations are kept. If
``False``, they are removed when the calculation is done.
_loc : str
The location string representing the current class and method for logging purposes.
_namespace : Optional[str]
The namespace for the molecule as defined by the user when reading in the molecule.
_output_directory : Optional[str]
The path to the directory where all output files created during the feature calculations
are stored (if requested).
_periodic_table : Dict[str, element]
A dictionary representing the periodic table with element symbols as keys and mendeleev
``element`` objects as values.
mol_vault : Optional[MolVault]
Dataclass object for storing all relevant data on the molecule for which features should be
calculated.
"""
def __init__(self, log_file_name: str = "bonafide.log") -> None:
    """Create an empty featurizer, start logging, and load the packaged default data.

    Parameters
    ----------
    log_file_name : str, optional
        The name of the log file that is passed on to ``_init_logging()``, by default
        "bonafide.log".

    Returns
    -------
    None
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"

    # Feature metadata and program settings (filled by the loader calls below)
    self._feature_config = {}
    self._feature_info = {}
    self._feature_info_df = pd.DataFrame()

    # Feature indices, split by feature type and required dimensionality
    self._atom_feature_indices_2D = []
    self._atom_feature_indices_3D = []
    self._bond_feature_indices_2D = []
    self._bond_feature_indices_3D = []

    # Lookup tables
    self._periodic_table = {}
    self._functional_groups_smarts = {}

    # Molecule-related state; nothing is attached at construction time
    self.mol_vault = None  # type: ignore[assignment]
    self._namespace = None

    # Output handling
    self._keep_output_files = False
    self._output_directory = None
    self._init_directory = os.getcwd()

    # Start logging first so that the loader calls can already write to the log
    self._init_logging(log_file_name=log_file_name)

    # Load the configuration toml file, then the feature info file
    self._load_config_file()
    self._load_feature_info_file()
[docs]
def list_atom_features(self, **kwargs: Any) -> pd.DataFrame:
    """Return an overview of the available atom features.

    The result can be narrowed down by passing any of the following optional keyword
    arguments:

    * name
    * origin
    * dimensionality
    * data_type
    * requires_electronic_structure_data
    * requires_bond_data
    * requires_charge
    * requires_multiplicity
    * config_path
    * factory

    Parameters
    ----------
    **kwargs : Any
        Optional keyword arguments that are forwarded to ``_list_features()`` as filters.
        If none are given, all atom features are returned.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing the selected atom features and their characteristics.

    Raises
    ------
    ValueError
        If ``feature_type`` is passed as a keyword argument.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"

    # The feature type is fixed to "atom" here; all other keywords are passed through
    if "feature_type" not in kwargs:
        return self._list_features(feature_type="atom", **kwargs)

    _message = "Invalid input: 'feature_type' is set automatically and must not be provided."
    logging.error(f"'{self._namespace}' | {self._loc}()\n{_message}")
    raise ValueError(f"{self._loc}(): {_message}")
[docs]
def list_bond_features(self, **kwargs: Any) -> pd.DataFrame:
    """Return an overview of the available bond features.

    The result can be narrowed down by passing any of the following optional keyword
    arguments:

    * name
    * origin
    * dimensionality
    * data_type
    * requires_electronic_structure_data
    * requires_bond_data
    * requires_charge
    * requires_multiplicity
    * config_path
    * factory

    Parameters
    ----------
    **kwargs : Any
        Optional keyword arguments that are forwarded to ``_list_features()`` as filters.
        If none are given, all bond features are returned.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing the selected bond features and their characteristics.

    Raises
    ------
    ValueError
        If ``feature_type`` is passed as a keyword argument.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"

    # The feature type is fixed to "bond" here; all other keywords are passed through
    if "feature_type" not in kwargs:
        return self._list_features(feature_type="bond", **kwargs)

    _message = "Invalid input: 'feature_type' is set automatically and must not be provided."
    logging.error(f"'{self._namespace}' | {self._loc}()\n{_message}")
    raise ValueError(f"{self._loc}(): {_message}")
[docs]
def print_options(self, origin: Optional[Union[str, List[str]]] = None) -> None:
    """Print the configuration settings of the individual programs for feature calculation.

    By providing input to the ``origin`` parameter, it can be selected which program's
    settings are printed. Valid origins are:

    * alfabet
    * bonafide
    * dbstep
    * dscribe
    * kallisto
    * mendeleev
    * morfeus
    * multiwfn
    * psi4
    * qmdesc
    * rdkit
    * xtb

    Parameters
    ----------
    origin : Optional[Union[str, List[str]]], optional
        The name(s) of the program(s) for which the configuration settings should be printed.
        Can either be given as string or list of multiple programs, by default ``None``. If
        kept ``None``, the settings of all programs are printed. A single string is
        standardized with ``standardize_string()`` in the same way as the entries of a list.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no configuration settings are loaded (``_feature_config`` is empty).
    TypeError
        If ``origin`` is neither ``None``, a string, nor a list.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    logging.info(f"'{self._namespace}' | {self._loc}() | START\n> 'origin': {origin}\n-----")

    # Nothing can be printed if the default settings were never loaded
    if not self._feature_config:
        _errmsg = (
            "The configuration settings cannot be printed because the default settings could "
            "not be loaded."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise ValueError(f"{self._loc}(): {_errmsg}")

    # Format origin input
    _valid_origins = list(self._feature_config.keys())
    if origin is None:
        _for_printing = self._feature_config
    elif isinstance(origin, (str, list)):
        # A single program name is handled like a one-element list so that it is
        # standardized before it is validated and used as a key of the configuration
        # dictionary (previously only list entries were standardized).
        _requested = [origin] if isinstance(origin, str) else origin
        _requested = [standardize_string(inp_data=o) for o in _requested]
        for o in _requested:
            self._check_is_str_in_list(
                parameter_name="origin", value=o, allowed_values=_valid_origins
            )
        _for_printing = {o: self._feature_config[o] for o in _requested}
    else:
        _errmsg = (
            "Invalid input to 'origin': must be either None, of type str, or a list of "
            f"strings but obtained {type(origin).__name__}."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise TypeError(f"{self._loc}(): {_errmsg}")

    # Print the location of the default configuration file next to this module
    _file_name = "_feature_config.toml"
    _toml_config_file_path = os.path.join(os.path.dirname(__file__), _file_name)
    print(
        f"{_make_underlined_start}{_make_bold_start}Default configuration settings at:"
        f"{_make_bold_end}{_make_underlined_end}"
    )
    print(_toml_config_file_path)
    print()

    # Print one highlighted header per program followed by its settings
    for orig, data in _for_printing.items():
        print(
            f"{_make_underlined_start}{_make_bold_start}{_make_green_start}{orig}"
            f"{_make_green_end}{_make_bold_end}{_make_underlined_end}"
        )
        for d, vals in data.items():
            print(f" {_make_bold_start}{d}{_make_bold_end}: {vals}")
        print()

    logging.info(f"'{self._namespace}' | {self._loc}()\nConfiguration settings were printed.")
    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
[docs]
def set_options(self, configs: Union[Tuple[str, Any], List[Tuple[str, Any]]]) -> None:
    """Change configuration settings for the individual programs used for feature calculation.

    The input to this method must be a 2-tuple (or a list thereof), where the first entry is
    the path to the configuration setting that should be changed (point-separated) and the
    second entry is the new value.

    The part of the path in front of the last point (the branch) is standardized with
    ``standardize_string()``; the option name after the last point is passed on unchanged.

    For listing all available configuration settings and their current values, see the
    ``print_options()`` method.

    Parameters
    ----------
    configs : Union[Tuple[str, Any], List[Tuple[str, Any]]]
        A 2-tuple or a list of 2-tuples containing the configuration paths and their new
        values, e.g.: ("bonafide.autocorrelation.depth", 3)

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``configs`` is an empty list or contains a tuple that does not have exactly two
        entries.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    logging.info(f"'{self._namespace}' | {self._loc}() | START\n> 'configs': {configs}\n-----")

    # Check input types
    self._check_is_of_type(expected_type=[tuple, list], value=configs, parameter_name="configs")
    if isinstance(configs, tuple):
        configs = [configs]

    # Check if list is empty
    if len(configs) == 0:
        _errmsg = "Invalid input to 'configs': must not be an empty list."
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise ValueError(f"{self._loc}(): {_errmsg}")

    # Check individual entries in list
    for entry in configs:
        self._check_is_of_type(
            expected_type=tuple,
            value=entry,
            parameter_name="configs",
            prefix="each element of the list",
        )
        if len(entry) != 2:
            _errmsg = (
                f"Invalid input to 'configs': must be a 2-tuple or a list of 2-tuples but "
                f"obtained a list containing a tuple of length {len(entry)}."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

    # Iterate over all configuration changes
    for config_path, value in configs:
        self._check_is_of_type(
            expected_type=str,
            value=config_path,
            parameter_name="configs",
            prefix="first entry of each 2-tuple",
        )

        # Make branch of config path case insensitive, actual option name stays
        # case-sensitive. The path is split at its last point; the branch is standardized
        # in its natural character order instead of on a reversed copy of the string.
        if "." in config_path:
            _branch, _, _option_name = config_path.rpartition(".")
            config_path = f"{standardize_string(inp_data=_branch)}.{_option_name}"

        # Apply changes
        self._set_options(config_path, value)

    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
[docs]
def show_molecule(
    self,
    index_type: Optional[str] = "atom",
    in_3D: bool = False,
    image_size: Tuple[int, int] = (500, 500),
) -> Union[PngImagePlugin.PngImageFile, ipywidgets.VBox]:
    """Render the molecule stored in the molecule vault, optionally with indices.

    The depiction is either an interactive 3D view (only allowed if the vault's
    dimensionality is not "2D") or a 2D drawing. The rendering itself is done by
    ``mol_vault.render_mol()``.

    Parameters
    ----------
    index_type : Optional[str], optional
        The type of indices to add to the structure, either "atom", "bond", or ``None``. By
        default "atom".
    in_3D : bool, optional
        If ``True``, the molecule is shown in 3D (if 3D information is available), by default
        ``False``.
    image_size : Tuple[int, int], optional
        The size of the displayed image in pixels (width, height), by default (500, 500).

    Returns
    -------
    Union[PngImagePlugin.PngImageFile, ipywidgets.VBox]
        A 2D or 3D depiction of the molecule, either as an image or an interactive 3D view.

    Raises
    ------
    ValueError
        If 3D rendering is requested for a vault with "2D" dimensionality, or if
        ``image_size`` is not a 2-tuple of positive integers.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    logging.info(
        f"'{self._namespace}' | {self._loc}() | START\n"
        f"> 'index_type': {index_type}\n"
        f"> 'in_3D': {in_3D}\n"
        f"> 'image_size': {image_size}\n"
        "-----"
    )

    # Pre-checks
    self._check_is_initialized(error_message="showing the molecule")

    # Type checks of all three parameters, in the order of the signature
    _type_checks = (
        ("index_type", index_type, [str, None]),
        ("in_3D", in_3D, bool),
        ("image_size", image_size, tuple),
    )
    for _parameter_name, _value, _expected_type in _type_checks:
        self._check_is_of_type(
            expected_type=_expected_type, value=_value, parameter_name=_parameter_name
        )

    # Allowed index types are all feature types plus the string "none"
    _allowed_index_types = [*FEATURE_TYPES, "none"]
    index_type = self._check_is_str_in_list(
        parameter_name="index_type", value=index_type, allowed_values=_allowed_index_types
    )

    # Don't allow 3D rendering if no 3D information is available
    if in_3D is True and self.mol_vault.dimensionality == "2D":
        _message = "A molecule cannot be rendered in 3D if no 3D information is available."
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_message}")
        raise ValueError(f"{self._loc}(): {_message}")

    # The image size must consist of exactly two entries ...
    _n_entries = len(image_size)
    if _n_entries != 2:
        _message = (
            f"Invalid input to 'image_size': must be a 2-tuple but obtained a tuple of "
            f"length {_n_entries}."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_message}")
        raise ValueError(f"{self._loc}(): {_message}")

    # ... and both must be true integers (the exact type check rejects bool) above zero
    if any(type(_entry) is not int or _entry <= 0 for _entry in image_size):
        _message = (
            f"Invalid input to 'image_size': both entries of the 2-tuple must be "
            f"positive integers but obtained {image_size}."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_message}")
        raise ValueError(f"{self._loc}(): {_message}")

    # Show molecule
    _depiction = self.mol_vault.render_mol(
        idx_type=index_type, in_3D=in_3D, image_size=image_size
    )

    # Compose the log message describing what was rendered
    _dimensionality = "3D" if in_3D is True else "2D"
    _without_indices = index_type is None or index_type.lower() == "none"
    _suffix = "." if _without_indices else f" with '{index_type}' indices."
    logging.info(
        f"'{self._namespace}' | {self._loc}()\nMolecule was rendered in "
        f"{_dimensionality}{_suffix}"
    )
    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
    return _depiction
[docs]
def set_charge(self, charge: int) -> None:
    """Store the total charge of the molecule in the molecule vault.

    Parameters
    ----------
    charge : int
        The total charge of the molecule that is used for feature calculation.

    Returns
    -------
    None
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    logging.info(f"'{self._namespace}' | {self._loc}() | START\n> 'charge': {charge}\n-----")

    # Pre-check and input type check are delegated to the shared helper methods
    self._check_is_initialized(error_message="setting the charge")
    self._check_is_of_type(expected_type=int, value=charge, parameter_name="charge")

    # Set charge
    self.mol_vault.charge = charge

    _log_prefix = f"'{self._namespace}' | {self._loc}()"
    logging.info(f"{_log_prefix}\nMolecular charge was set to {charge}.")
    logging.info(f"{_log_prefix} | DONE\n")
[docs]
def set_multiplicity(self, multiplicity: int) -> None:
    """Store the spin multiplicity of the molecule in the molecule vault.

    Parameters
    ----------
    multiplicity : int
        The spin multiplicity of the molecule that is used for feature calculation. Must be
        at least 1.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``multiplicity`` is smaller than 1.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    logging.info(
        f"'{self._namespace}' | {self._loc}() | START\n> 'multiplicity': {multiplicity}\n-----"
    )

    # Pre-check and input type check are delegated to the shared helper methods
    self._check_is_initialized(error_message="setting the multiplicity")
    self._check_is_of_type(expected_type=int, value=multiplicity, parameter_name="multiplicity")

    # Reject values below the smallest allowed multiplicity
    if multiplicity < 1:
        _message = f"Invalid input to 'multiplicity': must be >= 1 but obtained {multiplicity}."
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_message}")
        raise ValueError(f"{self._loc}(): {_message}")

    # Set multiplicity
    self.mol_vault.multiplicity = multiplicity

    _log_prefix = f"'{self._namespace}' | {self._loc}()"
    logging.info(f"{_log_prefix}\nSpin multiplicity was set to {multiplicity}.")
    logging.info(f"{_log_prefix} | DONE\n")
[docs]
def attach_smiles(
    self,
    smiles: str,
    align: bool = True,
    connectivity_method: str = "connect_the_dots",
    covalent_radius_factor: Union[int, float] = 1.3,
) -> None:
    """Attach a SMILES string to a molecule vault that is hosting a 3D molecule.

    Before attaching a SMILES string, the compatibility of the SMILES string with the already
    existing molecule in the vault is checked. The ``align`` parameter allows to decide whether
    to keep the initial atom order (``align=True``) or apply the one of the SMILES string
    (``align=False``).

    The additional optional parameters ``connectivity_method`` and ``covalent_radius_factor``
    influence how the atom connectivity of the RDKit molecule object(s) initially hosted in the
    molecule vault is determined (required for attaching the SMILES string).

    A SMILES string can only be attached to a molecule vault for which the bonds are not
    determined yet. This also means that once a SMILES string is attached to a molecule
    vault, it cannot be changed anymore. A SMILES string cannot be attached to a molecule vault
    hosting a 2D molecule.

    Parameters
    ----------
    smiles : str
        The SMILES string that should be attached to the molecule vault.
    align : bool, optional
        If ``True``, the atom indices of the initially provided 3D structures are preserved, if
        ``False``, the atoms are re-ordered according to the order in the SMILES string, by
        default ``True``.
    connectivity_method : str, optional
        The name of the method that is used to determine the atom connectivity. Available
        options are "connect_the_dots", "van_der_waals", and "hueckel", by default
        "connect_the_dots".
    covalent_radius_factor : Union[int, float], optional
        A scaling factor that is applied to the covalent radii of the atoms when determining the
        bonds with the van-der-Waals method. Must be larger than 0, by default 1.3.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the bonds of the molecule are already determined, if ``covalent_radius_factor`` is
        not larger than 0, if the "hueckel" method is selected but no charge is set, or if
        ``align=False`` is requested after atom or bond features were calculated.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    logging.info(
        f"'{self._namespace}' | {self._loc}() | START\n"
        f"> 'smiles': {smiles}\n"
        f"> 'align': {align}\n"
        f"> 'connectivity_method': {connectivity_method}\n"
        f"> 'covalent_radius_factor': {covalent_radius_factor}\n"
        "-----"
    )

    # Pre-checks
    self._check_is_initialized(error_message="attaching a SMILES string")
    self._check_is_2D("Attaching a SMILES string to a 2D ensemble is not allowed.")

    # Check if bonds were already determined
    if self.mol_vault.bonds_determined is True:
        _errmsg = (
            "A SMILES string can only be attached to a molecule vault that has its "
            "bonds not yet determined."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise ValueError(f"{self._loc}(): {_errmsg}")

    # Check input types
    self._check_is_of_type(expected_type=str, value=smiles, parameter_name="smiles")
    self._check_is_of_type(expected_type=bool, value=align, parameter_name="align")
    self._check_is_of_type(
        expected_type=str, value=connectivity_method, parameter_name="connectivity_method"
    )
    self._check_is_of_type(
        expected_type=[int, float],
        value=covalent_radius_factor,
        parameter_name="covalent_radius_factor",
    )

    # Check covalent_radius_factor input
    covalent_radius_factor = float(covalent_radius_factor)
    if covalent_radius_factor <= 0:
        _errmsg = (
            f"Invalid input to 'covalent_radius_factor': must be > 0 but "
            f"obtained {covalent_radius_factor}."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise ValueError(f"{self._loc}(): {_errmsg}")

    # Check connectivity_method input
    connectivity_method = self._check_is_str_in_list(
        parameter_name="connectivity_method",
        value=connectivity_method,
        allowed_values=DETERMINE_BONDS_METHODS,
    )

    # Check if charge is set if Hueckel method is selected
    if connectivity_method == "hueckel" and self.mol_vault.charge is None:
        _errmsg = (
            "Set the charge of the molecule vault with the set_charge() method "
            "before determining bonds with the Hueckel method."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise ValueError(f"{self._loc}(): {_errmsg}")

    # Check if features have already been calculated: this is the case as soon as at least
    # one of the per-conformer caches is non-empty. An empty list of caches counts as
    # "nothing calculated" (the former set comparison against {0} reported the opposite).
    _atom_features_calculated = any(
        len(c) != 0 for c in self.mol_vault.atom_feature_cache_n
    )
    _bond_features_calculated = any(len(c) != 0 for c in self.mol_vault.bond_feature_cache)
    if (_atom_features_calculated or _bond_features_calculated) and align is False:
        _errmsg = (
            'Attaching a SMILES string with "align=False" is not allowed after features '
            "were calculated. This is due to the fact that some features contain atom or bond "
            "indices which would be rendered meaningless when re-ordering of the atoms of the "
            "molecule in the vault is performed."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise ValueError(f"{self._loc}(): {_errmsg}")

    # Attach SMILES
    self._attach_smiles(
        smiles=smiles,
        align=align,
        connectivity_method=connectivity_method,
        covalent_radius_factor=covalent_radius_factor,
    )

    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
[docs]
def attach_energy(
    self,
    energy_data: Union[Tuple[Union[int, float], str], List[Tuple[Union[int, float], str]]],
    state: str = "n",
    prune_by_energy: Optional[Tuple[Union[int, float], str]] = None,
) -> None:
    """Attach molecular energy values to a molecule vault hosting a 3D molecule.

    The input to ``energy_data`` can either be a single 2-tuple or a list of 2-tuples. Each
    2-tuple must contain the energy value (first entry) and the respective energy unit (second
    entry). Supported energy units are "Eh", "kcal/mol", and "kJ/mol".

    The ``state`` parameter allows to specify to which redox state of the molecule the energy
    values should be attached to.

    If desired, the conformer ensemble can be pruned based on the attached energy values for
    state "n" (actual molecule) through the ``prune_by_energy`` parameter. For any other
    state, a warning is logged and no pruning is performed.

    Parameters
    ----------
    energy_data : Union[Tuple[Union[int, float], str], List[Tuple[Union[int, float], str]]]
        A 2-tuple or a list of 2-tuples containing the energy values and respective units. The
        number of 2-tuples must be equal to the number of conformers in the molecule vault.
    state : str, optional
        The redox state of the electronic structure data to be attached, by default "n". Can
        either be

        * "n" (actual molecule),
        * "n+1" (actual molecule plus one electron), or
        * "n-1" (actual molecule minus one electron).
    prune_by_energy : Optional[Tuple[Union[int, float], str]], optional
        If a value other than ``None`` is provided, all conformers with a relative energy above
        this value are set to be invalid and ignored during feature calculation and any
        further processing. The input must be a 2-tuple in which the first entry is the
        relative energy cutoff value and the second entry is the respective energy unit.
        Supported units are "Eh", "kcal/mol", and "kJ/mol". If ``None``, no pruning is
        performed, by default ``None``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If an entry of ``energy_data`` is not a 2-tuple or if the number of energy values does
        not match the number of conformers in the molecule vault.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    # All user inputs are logged, including the pruning cutoff
    logging.info(
        f"'{self._namespace}' | {self._loc}() | START\n"
        f"> 'energy_data': {energy_data}\n"
        f"> 'state': {state}\n"
        f"> 'prune_by_energy': {prune_by_energy}\n"
        "-----"
    )

    # Pre-checks
    self._check_is_initialized(error_message="attaching energy data")
    self._check_is_2D(
        "Attaching molecular energy values to a molecule vault hosting a 2D molecule is "
        "not allowed."
    )

    # Check input types
    # The input to prune_by_energy is checked in MolVault.prune_ensemble_by_energy()
    self._check_is_of_type(
        expected_type=[tuple, list], value=energy_data, parameter_name="energy_data"
    )

    # A single 2-tuple is wrapped; a list is copied so the caller's list is left untouched
    if isinstance(energy_data, tuple):
        energy_data_formatted = [energy_data]
    else:
        energy_data_formatted = list(energy_data)

    for energy_unit_pair in energy_data_formatted:
        self._check_is_of_type(
            expected_type=tuple,
            value=energy_unit_pair,
            parameter_name="energy_data",
            prefix="each entry",
        )
        if len(energy_unit_pair) != 2:
            _errmsg = (
                f"Invalid entry in 'energy_data': each entry must be a 2-tuple but obtained "
                f"a tuple of length {len(energy_unit_pair)}."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

    self._check_is_of_type(expected_type=str, value=state, parameter_name="state")

    # Check state input
    state = self._check_is_str_in_list(
        parameter_name="state", value=state, allowed_values=REDOX_STATES2
    )

    # Check that energies matches number of conformers
    if len(energy_data_formatted) != self.mol_vault.size:
        _errmsg = (
            f"The number of provided energy values ({len(energy_data_formatted)}) for state "
            f"'{state}' does not match the number of conformers in the molecule vault "
            f"({self.mol_vault.size})."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise ValueError(f"{self._loc}(): {_errmsg}")

    # Attach energies
    self._attach_energy(energy_data=energy_data_formatted, state=state)

    # Prune conformer ensemble if requested (only based on state "n" energies)
    if prune_by_energy is not None:
        if state == "n":
            self.mol_vault.prune_ensemble_by_energy(
                energy_cutoff=prune_by_energy, _called_from=self._loc
            )
        else:
            logging.warning(
                f"'{self._namespace}' | {self._loc}()\nPruning the conformer ensemble based on "
                f"energy values for state '{state}' is not supported. No pruning was performed."
            )

    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
[docs]
def attach_electronic_structure(
    self, electronic_structure_data: Union[str, List[str]], state: str = "n"
) -> None:
    """Attach electronic structure data files to a molecule vault hosting a 3D molecule.

    The input can either be a single file path or a list of file paths. The ``state`` parameter
    allows to specify to which redox state of the molecule the electronic structure data
    should be attached to.

    Parameters
    ----------
    electronic_structure_data : Union[str, List[str]]
        A list of file paths to the electronic structure files or a single file path. The
        number of files must be equal to the number of conformers in the molecule vault.
    state : str, optional
        The redox state of the electronic structure data to be attached, by default "n". Can
        either be

        * "n" (actual molecule),
        * "n+1" (actual molecule plus one electron), or
        * "n-1" (actual molecule minus one electron).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the number of files does not match the number of conformers in the molecule vault
        or if electronic structure data is already attached for the selected state.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    logging.info(
        f"'{self._namespace}' | {self._loc}() | START\n"
        f"> 'electronic_structure_data': {electronic_structure_data}\n"
        f"> 'state': {state}\n"
        "-----"
    )

    # Pre-checks
    self._check_is_initialized(error_message="attaching electronic structure data")
    self._check_is_2D(
        "Attaching electronic structure files to a molecule vault hosting a "
        "2D molecule is not allowed."
    )

    # Check input types
    self._check_is_of_type(
        expected_type=[str, list],
        value=electronic_structure_data,
        parameter_name="electronic_structure_data",
    )

    # isinstance() instead of an exact type comparison: an instance of a str subclass must
    # be treated as one file path and not be iterated character by character
    if isinstance(electronic_structure_data, str):
        el_struc_list = [electronic_structure_data]
    else:
        el_struc_list = list(electronic_structure_data)

    self._check_is_of_type(expected_type=str, value=state, parameter_name="state")

    # Check state input
    state = self._check_is_str_in_list(
        parameter_name="state", value=state, allowed_values=REDOX_STATES2
    )

    # Determine the electronic structure list and types based on the state
    if state == "n+1":
        _el_struc_list = self.mol_vault.electronic_strucs_n_plus1
        _el_struc_types = self.mol_vault.electronic_struc_types_n_plus1
    elif state == "n-1":
        _el_struc_list = self.mol_vault.electronic_strucs_n_minus1
        _el_struc_types = self.mol_vault.electronic_struc_types_n_minus1
    else:
        _el_struc_list = self.mol_vault.electronic_strucs_n
        _el_struc_types = self.mol_vault.electronic_struc_types_n

    # Check if number of electronic structure files matches number of conformers
    if len(el_struc_list) != self.mol_vault.size:
        _errmsg = (
            f"The number of provided electronic structure files ({len(el_struc_list)}) "
            "does not match the number of conformers in the molecule vault "
            f"({self.mol_vault.size})."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise ValueError(f"{self._loc}(): {_errmsg}")

    # Data can only be attached once per state
    if len(_el_struc_list) != 0:
        _errmsg = (
            f"Electronic structure data for state '{state}' is already attached to the "
            "molecule vault and cannot be attached again."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
        raise ValueError(f"{self._loc}(): {_errmsg}")

    self._attach_electronic_structure(
        electronic_struc_list=el_struc_list,
        _el_struc_list=_el_struc_list,  # type: ignore[arg-type]
        _el_struc_types=_el_struc_types,  # type: ignore[arg-type]
        state=state,
    )

    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
[docs]
def determine_bonds(
    self,
    connectivity_method: str = "connect_the_dots",
    covalent_radius_factor: Union[int, float] = 1.3,
    allow_charged_fragments: bool = True,
    embed_chiral: bool = True,
) -> None:
    """Determine the chemical bonds of each conformer of a molecule vault hosting a
    3D molecule.

    Use this method for a molecule that was provided without information on its bonds
    (connectivity and bond type). Bond information is required for the calculation of certain
    atom and all bond features.

    The optional parameters ``connectivity_method``, ``covalent_radius_factor``,
    ``allow_charged_fragments``, and ``embed_chiral`` influence how the bonds of the
    individual RDKit molecule object(s) are determined. After validation, they are passed on
    to ``_determine_bonds()``.

    Parameters
    ----------
    connectivity_method : str, optional
        The name of the method that is used to determine the atom connectivity and bond type.
        Available options are "connect_the_dots", "van_der_waals", and "hueckel", by default
        "connect_the_dots".
    covalent_radius_factor : Union[int, float], optional
        A scaling factor that is applied to the covalent radii of the atoms when determining
        the bonds with the van-der-Waals method. Must be larger than 0, by default 1.3.
    allow_charged_fragments : bool, optional
        If ``True``, fragments with a net charge are allowed when determining the bonds of the
        molecule, by default ``True``.
    embed_chiral : bool, optional
        If ``True``, chiral centers are embedded when determining the bonds of the molecule,
        by default ``True``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the bonds are already determined, if ``covalent_radius_factor`` is not larger than
        0, or if the "hueckel" method is selected but no charge is set.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    logging.info(
        f"'{self._namespace}' | {self._loc}() | START\n"
        f"> 'connectivity_method': {connectivity_method}\n"
        f"> 'covalent_radius_factor': {covalent_radius_factor}\n"
        f"> 'allow_charged_fragments': {allow_charged_fragments}\n"
        f"> 'embed_chiral': {embed_chiral}\n"
        "-----"
    )

    # Pre-checks
    self._check_is_initialized(error_message="determining bonds")
    self._check_is_2D("All bonds are already defined.")

    # Bonds can only be determined once
    if self.mol_vault.bonds_determined is True:
        _message = (
            "The bonds of the molecule in the molecule vault are already determined "
            "and cannot be determined again."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_message}")
        raise ValueError(f"{self._loc}(): {_message}")

    # Type checks of all parameters, in the order of the signature
    _type_checks = (
        ("connectivity_method", connectivity_method, str),
        ("covalent_radius_factor", covalent_radius_factor, [int, float]),
        ("allow_charged_fragments", allow_charged_fragments, bool),
        ("embed_chiral", embed_chiral, bool),
    )
    for _parameter_name, _value, _expected_type in _type_checks:
        self._check_is_of_type(
            expected_type=_expected_type, value=_value, parameter_name=_parameter_name
        )

    # The scaling factor is always handled as float and must be strictly positive
    covalent_radius_factor = float(covalent_radius_factor)
    if covalent_radius_factor <= 0:
        _message = (
            f"Invalid input to 'covalent_radius_factor': must be > 0 but "
            f"obtained {covalent_radius_factor}."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_message}")
        raise ValueError(f"{self._loc}(): {_message}")

    # Only the methods listed in DETERMINE_BONDS_METHODS are accepted
    connectivity_method = self._check_is_str_in_list(
        parameter_name="connectivity_method",
        value=connectivity_method,
        allowed_values=DETERMINE_BONDS_METHODS,
    )

    # The Hueckel method cannot be used as long as no charge is set
    _uses_hueckel = connectivity_method == "hueckel"
    if _uses_hueckel and self.mol_vault.charge is None:
        _message = (
            "Set the charge of the molecule vault with the set_charge() method "
            "before determining bonds with the Hueckel method."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{_message}")
        raise ValueError(f"{self._loc}(): {_message}")

    # Determine the bonds
    _settings = {
        "connectivity_method": connectivity_method,
        "covalent_radius_factor": covalent_radius_factor,
        "allow_charged_fragments": allow_charged_fragments,
        "embed_chiral": embed_chiral,
    }
    self._determine_bonds(**_settings)

    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
[docs]
def calculate_electronic_structure(
    self,
    engine: str,
    redox: str = "n",
    prune_by_energy: Optional[Tuple[Union[int, float], str]] = None,
) -> None:
    """Run electronic structure calculations for all conformers of a 3D molecule vault.

    Either the Psi4 or the xtb engine can be selected. Through ``redox`` the user decides
    whether only the actual molecule or additionally its one-electron-oxidized and/or
    one-electron-reduced form is calculated. A redox state for which energies were already
    read or electronic structure data is already attached is skipped with a warning.

    Parameters
    ----------
    engine : str
        The name of the electronic structure program to be used, either "psi4" or "xtb".
    redox : str, optional
        Selects the redox states to be calculated, by default "n". Possible values are

        * "n": only the actual molecule,
        * "n-1": the actual molecule and its one-electron-oxidized form,
        * "n+1": the actual molecule and its one-electron-reduced form,
        * "all": the actual molecule plus both its one-electron-oxidized and -reduced form.
    prune_by_energy : Optional[Tuple[Union[int, float], str]], optional
        A 2-tuple of the relative energy cutoff and its unit ("Eh", "kcal/mol", or
        "kJ/mol"). Conformers with a relative energy above the cutoff are marked invalid and
        ignored during feature calculation and any further processing. Pruning is only
        carried out after the calculation of state "n"; for every other calculated state a
        warning is logged instead. If ``None``, no pruning is performed, by default ``None``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the charge or the multiplicity of the molecule vault has not been set. The
        validation helpers called by this method may raise further errors.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    start_banner = "\n".join(
        [
            f"'{self._namespace}' | {self._loc}() | START",
            f"> 'engine': {engine}",
            f"> 'redox': {redox}",
            f"> 'prune_by_energy': {prune_by_energy}",
            "-----",
        ]
    )
    logging.info(start_banner)

    # General requirements: an initialized vault that hosts 3D data
    self._check_is_initialized(error_message="calculating electronic structures")
    self._check_is_2D(
        "Electronic structure calculations are not feasible for 2D ensembles. "
        "Please provide a 3D ensemble."
    )

    # Type validation; prune_by_energy is validated in MolVault.prune_ensemble_by_energy()
    for parameter_name, parameter_value in (("engine", engine), ("redox", redox)):
        self._check_is_of_type(
            expected_type=str, value=parameter_value, parameter_name=parameter_name
        )

    # Charge and multiplicity (checked in this order) must be defined by the user beforehand
    for vault_property in ("charge", "multiplicity"):
        if getattr(self.mol_vault, vault_property) is None:
            problem = (
                f"Set the {vault_property} of the molecule vault with the "
                f"set_{vault_property}() method before calculating the electronic structure."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{problem}")
            raise ValueError(f"{self._loc}(): {problem}")

    # Value validation of the string inputs
    engine = self._check_is_str_in_list(
        parameter_name="engine", value=engine, allowed_values=ELECTRONIC_STRUCTURE_ENGINES
    )
    redox = self._check_is_str_in_list(
        parameter_name="redox", value=redox, allowed_values=REDOX_STATES
    )

    # Translate the redox keyword into the list of states; anything that is not one of
    # the three explicit keys is treated as "all"
    states_by_keyword = {
        "n": ["n"],
        "n+1": ["n", "n+1"],
        "n-1": ["n", "n-1"],
    }
    requested_states = states_by_keyword.get(redox, ["n", "n-1", "n+1"])

    # Vault attributes that indicate already available data for the individual states:
    # (flag set when energies were read, container of electronic structure data)
    existing_data_attributes = {
        "n": ("energies_n_read", "electronic_strucs_n"),
        "n+1": ("energies_n_plus1_read", "electronic_strucs_n_plus1"),
        "n-1": ("energies_n_minus1_read", "electronic_strucs_n_minus1"),
    }

    for state in requested_states:
        # Never overwrite data (energies or electronic structures) that is already there
        flag_name, container_name = existing_data_attributes[state]
        energies_were_read = getattr(self.mol_vault, flag_name) is True
        structures_attached = len(getattr(self.mol_vault, container_name)) > 0
        if energies_were_read or structures_attached:
            logging.warning(
                f"'{self._namespace}' | {self._loc}()\nElectronic structure or energy data "
                f"for state '{state}' is already attached to the molecule vault. Electronic "
                "structure calculations are skipped for this state."
            )
            continue

        self._calculate_electronic_structure(engine=engine, state=state)

        # Pruning is optional and only meaningful for the actual molecule (state "n")
        if prune_by_energy is None:
            continue
        if state == "n":
            self.mol_vault.prune_ensemble_by_energy(
                energy_cutoff=prune_by_energy, _called_from=self._loc
            )
        else:
            logging.warning(
                f"'{self._namespace}' | {self._loc}()\nPruning the conformer ensemble based on "
                f"energy values for state '{state}' is not possible. No pruning was performed."
            )

    # Restore the location string before writing the final log message
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
[docs]
def featurize_atoms(
    self,
    atom_indices: Union[str, int, List[int]],
    feature_indices: Union[str, int, List[int]],
) -> None:
    """Compute the requested atom features for the requested atoms.

    The available atom features are listed by the ``list_atom_features()`` method. Depending
    on the feature, 3D information, electronic structure data, or information on the chemical
    bonds in the molecule must be available.

    Parameters
    ----------
    atom_indices : Union[str, int, List[int]]
        A single atom index, a list of atom indices, or "all" to featurize every atom. The
        selection is replaced by "all" (with a logged warning) if the rearrangement of the
        feature indices reports that all atoms are required.
    feature_indices : Union[str, int, List[int]]
        A single feature index, a list of feature indices, or "all" to calculate every atom
        feature.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If no feature information is loaded. The validation helpers called by this method
        may raise further errors.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    start_banner = "\n".join(
        [
            f"'{self._namespace}' | {self._loc}() | START",
            f"> 'atom_indices': {atom_indices}",
            f"> 'feature_indices': {feature_indices}",
            "-----",
        ]
    )
    logging.info(start_banner)

    # Requirements: initialized vault and loaded feature information
    self._check_is_initialized(error_message="featurizing atoms")
    if len(self._feature_info) == 0:
        problem = (
            "The feature information file was not sucessfully loaded. Therefore, atom "
            "features cannot be calculated."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{problem}")
        raise RuntimeError(f"{self._loc}(): {problem}")

    # Validate the feature selection against the dimensionality of the vault
    selected_features = self._check_feature_indices(
        feature_indices=feature_indices,
        feature_type="atom",
        dimensionality=str(self.mol_vault.dimensionality),
    )

    # Features with iterable options may need their base feature for every atom of the
    # molecule; in that case the user's atom selection is widened to "all".
    selected_features, needs_all_atoms = self._rearrange_feature_indices(
        feature_indices=selected_features
    )
    if needs_all_atoms is True:
        atom_indices = "all"
        logging.warning(
            f"'{self._namespace}' | {self._loc}()\nThe 'atom_indices' parameter was set to "
            "'all' because at least one of the selected features requires its iterable option "
            "base feature(s) to be available for all atoms."
        )

    # Validate the (possibly widened) atom selection
    selected_atoms = self._check_atom_indices(atom_indices=atom_indices)

    considered = [(idx, self._feature_info[idx]["name"]) for idx in selected_features]
    logging.info(f"'{self._namespace}' | {self._loc}()\nConsidered features: {considered}.")

    # Do the actual work
    logging.info(f"'{self._namespace}' | {self._loc}()\nCalculating atom features now ...")
    self._run_featurization(feature_indices=selected_features, atom_bond_indices=selected_atoms)
    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
[docs]
def featurize_bonds(
    self,
    bond_indices: Union[str, int, List[int]],
    feature_indices: Union[str, int, List[int]],
) -> None:
    """Compute the requested bond features for the requested bonds.

    The available bond features are listed by the ``list_bond_features()`` method. Every bond
    feature needs information on the chemical bonds in the molecule; some additionally need
    3D information or electronic structure data.

    Parameters
    ----------
    bond_indices : Union[str, int, List[int]]
        A single bond index, a list of bond indices, or "all" to featurize every bond.
    feature_indices : Union[str, int, List[int]]
        A single feature index, a list of feature indices, or "all" to calculate every bond
        feature.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If no feature information is loaded. The validation helpers called by this method
        may raise further errors.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    start_banner = "\n".join(
        [
            f"'{self._namespace}' | {self._loc}() | START",
            f"> 'bond_indices': {bond_indices}",
            f"> 'feature_indices': {feature_indices}",
            "-----",
        ]
    )
    logging.info(start_banner)

    # Requirements: initialized vault and loaded feature information
    self._check_is_initialized(error_message="featurizing bonds")
    if len(self._feature_info) == 0:
        problem = (
            "The feature information file was not sucessfully loaded. Therefore, bond "
            "features cannot be calculated."
        )
        logging.error(f"'{self._namespace}' | {self._loc}()\n{problem}")
        raise RuntimeError(f"{self._loc}(): {problem}")

    # Validate the bond selection first, then the feature selection
    selected_bonds = self._check_bond_indices(bond_indices=bond_indices)
    selected_features = self._check_feature_indices(
        feature_indices=feature_indices,
        feature_type="bond",
        dimensionality=str(self.mol_vault.dimensionality),
    )

    considered = [(idx, self._feature_info[idx]["name"]) for idx in selected_features]
    logging.info(f"'{self._namespace}' | {self._loc}()\nConsidered features: {considered}.")

    # Do the actual work
    logging.info(f"'{self._namespace}' | {self._loc}()\nCalculating bond features now ...")
    self._run_featurization(feature_indices=selected_features, atom_bond_indices=selected_bonds)
    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
[docs]
def return_atom_features(
    self,
    atom_indices: Union[str, int, List[int]] = "all",
    output_format: str = "df",
    reduce: bool = False,
    temperature: Union[int, float] = 298.15,
    ignore_invalid: bool = True,
) -> Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol]:
    """Hand back the atom features that were calculated so far.

    The data for selected or all atoms is delivered as a pandas DataFrame, as a hierarchical
    dictionary, or embedded as atom properties in one or multiple RDKit molecule objects.

    In the dictionary format, the outer keys are the atom indices. Each value is itself a
    dictionary that maps the feature names to the respective feature values.

    Parameters
    ----------
    atom_indices : Union[str, int, List[int]], optional
        A single atom index, a list of atom indices, or "all" for all atoms, by default
        "all". Atoms without calculated data are reported with ``NaN`` as feature value.
    output_format : str, optional
        The requested output format, by default "df":

        * "df": a pandas DataFrame,
        * "dict": a hierarchical dictionary,
        * "mol_object": one or multiple RDKit molecule objects carrying the features as
          atom properties.
    reduce : bool, optional
        Only relevant for molecule vaults hosting a 3D molecule with more than one
        conformer. If ``True``, each feature is condensed to one value per atom across all
        conformers by reporting its minimum, maximum, and mean value. If energy data is
        available in the molecule vault, the Boltzmann-weighted average at the given
        temperature as well as the data for the lowest- and highest-energy conformer are
        added. If ``False``, the features of every conformer are returned separately, by
        default ``False``.
    temperature : Union[int, float], optional
        The temperature in Kelvin used for the Boltzmann weighting, by default 298.15.
    ignore_invalid : bool, optional
        If set to ``True``, invalid conformers in the molecule vault do not prevent the
        feature reduction. If set to ``False``, the presence of any invalid conformer leads
        to returning the unreduced features. In both cases, invalid conformers are left out
        when the mean, min, and max feature values are calculated, by default ``True``.

    Returns
    -------
    Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol]
        The atom features in the requested output format.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"

    # All the work happens in the shared atom/bond implementation
    request = {
        "feature_type": "atom",
        "atom_bond_indices": atom_indices,
        "output_format": output_format,
        "reduce": reduce,
        "temperature": temperature,
        "ignore_invalid": ignore_invalid,
    }
    return self._return_features(**request)
[docs]
def return_bond_features(
    self,
    bond_indices: Union[str, int, List[int]] = "all",
    output_format: str = "df",
    reduce: bool = False,
    temperature: Union[int, float] = 298.15,
    ignore_invalid: bool = True,
) -> Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol]:
    """Hand back the bond features that were calculated so far.

    The data for selected or all bonds is delivered as a pandas DataFrame, as a hierarchical
    dictionary, or embedded as bond properties in one or multiple RDKit molecule objects.

    In the dictionary format, the outer keys are the bond indices. Each value is itself a
    dictionary that maps the feature names to the respective feature values.

    Parameters
    ----------
    bond_indices : Union[str, int, List[int]], optional
        A single bond index, a list of bond indices, or "all" for all bonds, by default
        "all". Bonds without calculated data are reported with ``NaN`` as feature value.
    output_format : str, optional
        The requested output format, by default "df":

        * "df": a pandas DataFrame,
        * "dict": a hierarchical dictionary,
        * "mol_object": one or multiple RDKit molecule objects carrying the features as
          bond properties.
    reduce : bool, optional
        Only relevant for molecule vaults hosting a 3D molecule with more than one
        conformer. If ``True``, each feature is condensed to one value per bond across all
        conformers by reporting its minimum, maximum, and mean value. If energy data is
        available in the molecule vault, the Boltzmann-weighted average at the given
        temperature as well as the data for the lowest- and highest-energy conformer are
        added. If ``False``, the features of every conformer are returned separately, by
        default ``False``.
    temperature : Union[int, float], optional
        The temperature in Kelvin used for the Boltzmann weighting, by default 298.15.
    ignore_invalid : bool, optional
        If set to ``True``, invalid conformers in the molecule vault do not prevent the
        feature reduction. If set to ``False``, the presence of any invalid conformer leads
        to returning the unreduced features. In both cases, invalid conformers are left out
        when the mean, min, and max feature values are calculated, by default ``True``.

    Returns
    -------
    Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol]
        The bond features in the requested output format.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"

    # All the work happens in the shared atom/bond implementation
    request = {
        "feature_type": "bond",
        "atom_bond_indices": bond_indices,
        "output_format": output_format,
        "reduce": reduce,
        "temperature": temperature,
        "ignore_invalid": ignore_invalid,
    }
    return self._return_features(**request)
[docs]
def clear_atom_feature_cache(self, origin: Optional[Union[str, List[str]]] = None) -> None:
    """Remove cached atom features from the molecule vault.

    Clearing the cache allows previously calculated atom features to be recalculated, for
    example after the configuration settings of a featurizer were changed (see the
    ``set_options()`` method).

    Parameters
    ----------
    origin : Optional[Union[str, List[str]]]
        The name, or a list of names, of the program(s) whose features should be removed
        (e.g., "rdkit", "xtb"). If ``None``, all features are cleared, by default ``None``.

    Returns
    -------
    None
    """
    # The method name must be looked up right here, in the frame of the public method
    method_name = get_function_or_method_name()
    self._loc = f"{self.__class__.__name__}.{method_name}"
    self._clear_feature_cache(origin=origin, feature_type="atom")
[docs]
def clear_bond_feature_cache(self, origin: Optional[Union[str, List[str]]] = None) -> None:
    """Remove cached bond features from the molecule vault.

    Clearing the cache allows previously calculated bond features to be recalculated, for
    example after the configuration settings of a featurizer were changed (see the
    ``set_options()`` method).

    Parameters
    ----------
    origin : Optional[Union[str, List[str]]]
        The name, or a list of names, of the program(s) whose features should be removed
        (e.g., "rdkit", "xtb"). If ``None``, all features are cleared, by default ``None``.

    Returns
    -------
    None
    """
    # The method name must be looked up right here, in the frame of the public method
    method_name = get_function_or_method_name()
    self._loc = f"{self.__class__.__name__}.{method_name}"
    self._clear_feature_cache(origin=origin, feature_type="bond")
[docs]
def add_custom_featurizer(self, custom_metadata: Dict[str, Any]) -> None:
    """Register a user-defined featurizer within the BONAFIDE framework.

    On success, the custom feature receives the next free feature index (current maximum
    index plus one) and can afterwards be used like any built-in feature.

    Parameters
    ----------
    custom_metadata : Dict[str, Any]
        The description of the custom featurizer. The following entries are required:

        * name (str): The name of the custom feature.
        * origin (str): The origin program of the custom feature (e.g., "custom").
        * feature_type (str): The type of the custom feature (either "atom" or "bond").
        * dimensionality (str): The dimensionality of the custom feature (either "2D" or
          "3D").
        * data_type (str): The data type of the custom feature given as string (either
          "str", "int", "float", or "bool").
        * requires_electronic_structure_data (bool): Whether electronic structure data is
          required for calculating the custom feature.
        * requires_bond_data (bool): Whether bond data is required for calculating the
          custom feature.
        * requires_charge (bool): Whether the charge of the molecule is required for
          calculating the custom feature.
        * requires_multiplicity (bool): Whether the multiplicity of the molecule is required
          for calculating the custom feature.
        * config_path (dict): Optional parameters passed to the custom featurizer. The keys
          of this dictionary will be available as attributes in the custom featurizer class.
        * factory (callable): The factory class for calculating the custom feature. It must
          inherit from ``BaseFeaturizer`` from ``bonafide/utils/base_featurizer.py``.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If the feature information or the default feature configuration settings are not
        loaded. The validation helpers called by this method may raise further errors.

    Notes
    -----
    In the dictionary returned by the input validation, the "factory" entry is replaced by
    the name of the factory class and the "config_path" entry by the validated origin before
    the dictionary is stored in the feature information.
    """
    self._loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"
    start_banner = "\n".join(
        [
            f"'{self._namespace}' | {self._loc}() | START",
            f"> 'custom_metadata': {custom_metadata}",
            "-----",
        ]
    )
    logging.info(start_banner)

    # Both internal dictionaries must be populated; they are inspected one after the other
    required_data = (
        ("_feature_info", "feature information file"),
        ("_feature_config", "default feature configuration settings file"),
    )
    for attribute_name, file_description in required_data:
        if len(getattr(self, attribute_name)) == 0:
            problem = (
                f"The {file_description} was not sucessfully loaded. "
                "Therefore, a custom featurizer cannot be added."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{problem}")
            raise RuntimeError(f"{self._loc}(): {problem}")

    # Type check followed by the in-depth validation of the user input
    self._check_is_of_type(
        expected_type=dict, value=custom_metadata, parameter_name="custom_metadata"
    )
    origin, custom_metadata = custom_featurizer_data_validator(
        custom_metadata=custom_metadata,
        feature_info=self._feature_info,
        namespace=str(self._namespace),
        loc=self._loc,
        feature_config=self._feature_config,
    )

    # Make the factory class available under its own name ...
    factory_class = custom_metadata["factory"]
    factory_name = factory_class.__name__
    FEATURE_FACTORIES[factory_name] = factory_class
    # ... and keep only that name in the metadata
    custom_metadata["factory"] = factory_name

    # Store the configuration settings of the custom featurizer under its origin
    self._feature_config[origin] = custom_metadata["config_path"]
    self._check_config_dict()

    # Append the metadata to the feature information under the next free index; from now
    # on, config_path only refers to the origin under which the settings were stored
    feature_idx = max(self._feature_info) + 1
    custom_metadata["config_path"] = origin
    self._feature_info[feature_idx] = custom_metadata
    self._process_feature_info_dict()

    logging.info(
        f"'{self._namespace}' | {self._loc}()\nCustom {custom_metadata['feature_type']} "
        f"feature '{custom_metadata['name']}' (produced in '{factory_name}') successfully "
        f"added to the feature space with INDEX {feature_idx}."
    )
    logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")