Source code for bonafide._bonafide

"""BONAFIDE base class with the abstract implementations of all public as well as all private
methods.
"""

from __future__ import annotations

import copy
import json
import logging
import os
import tomllib
from abc import ABC, abstractmethod
from importlib.metadata import version
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdDetermineBonds

from bonafide._bonafide_utils import _AtomBondFeaturizerUtils
from bonafide.utils.constants import (
    ATTRIBUTE_BLACK_LIST,
    ELECTRONIC_STRUCTURE_DATA_FILE_EXTENSIONS,
    FEATURE_TYPES,
    OUTPUT_TYPES,
)
from bonafide.utils.feature_factories import FEATURE_FACTORIES
from bonafide.utils.feature_output import FeatureOutput
from bonafide.utils.helper_functions import clean_up, flatten_dict, standardize_string
from bonafide.utils.helper_functions_chemistry import (
    bind_smiles_with_xyz,
    get_charge_from_mol_object,
    get_molecular_formula,
)
from bonafide.utils.input_validation import config_data_validator
from bonafide.utils.io_ import extract_energy_from_string, read_smiles
from bonafide.utils.logging_format import IndentationFormatter
from bonafide.utils.sp_psi4 import Psi4SP
from bonafide.utils.sp_xtb import XtbSP

if TYPE_CHECKING:
    import ipywidgets
    from mendeleev import element
    from PIL import PngImagePlugin

    from bonafide.utils.molecule_vault import MolVault



[docs]
class _AtomBondFeaturizer(ABC, _AtomBondFeaturizerUtils):
    # Base class of the featurizer: the public methods are declared as abstract methods and the
    # private helper methods are implemented here.

    # Feature indices available for 2D and 3D molecules; filled by
    # ``_process_feature_info_dict()``.
    _atom_feature_indices_2D: List[int]
    _atom_feature_indices_3D: List[int]
    _bond_feature_indices_2D: List[int]
    _bond_feature_indices_3D: List[int]
    # Configuration settings tree read from ``_feature_config.toml`` by ``_load_config_file()``.
    _feature_config: Dict[str, Any]
    # Feature metadata read from ``_feature_info.json`` (keys converted to int) by
    # ``_load_feature_info_file()``.
    _feature_info: Dict[int, Dict[str, Any]]
    # DataFrame view of ``_feature_info``; built by ``_process_feature_info_dict()``.
    _feature_info_df: pd.DataFrame
    # NOTE(review): not used in the part of the class reviewed here -- presumably SMARTS
    # patterns per functional group; confirm where it is populated.
    _functional_groups_smarts: Dict[str, List[Tuple[str, Chem.rdchem.Mol]]]
    # NOTE(review): not used in the part of the class reviewed here.
    _init_directory: str
    # Set to ``True`` by ``_setup_output_directory()`` once an output directory was created.
    _keep_output_files: bool
    # Prefix used in log and error messages (formatted as ``{self._loc}()``).
    _loc: str
    # Label that is prepended to the log messages.
    _namespace: Optional[str]
    # Absolute path of the output directory; set by ``_setup_output_directory()``.
    _output_directory: Optional[str]
    # NOTE(review): not used in the part of the class reviewed here.
    _periodic_table: Dict[str, element]
    # Container for the molecule data (mol objects, charge, energies, validity flags, ...).
    mol_vault: MolVault


[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Accepts
    # arbitrary keyword arguments and returns a pandas DataFrame.
    # NOTE(review): the keyword arguments are presumably the column filters handled by
    # ``_list_features()`` for the feature type "atom" -- confirm against the implementation.
    @abstractmethod
    def list_atom_features(self, **kwargs: Any) -> pd.DataFrame: ...



[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Accepts
    # arbitrary keyword arguments and returns a pandas DataFrame.
    # NOTE(review): the keyword arguments are presumably the column filters handled by
    # ``_list_features()`` for the feature type "bond" -- confirm against the implementation.
    @abstractmethod
    def list_bond_features(self, **kwargs: Any) -> pd.DataFrame: ...



[docs]
    # Abstract method; the implementation is provided by the concrete subclass. ``origin`` is
    # either a string, a list of strings, or ``None`` (default).
    # NOTE(review): ``origin`` presumably selects the feature origin(s) whose configuration
    # settings are printed -- confirm against the implementation.
    @abstractmethod
    def print_options(self, origin: Optional[Union[str, List[str]]] = None) -> None: ...



[docs]
    # Abstract method; the implementation is provided by the concrete subclass. ``configs`` is
    # either a single ``(str, Any)`` tuple or a list of such tuples.
    # NOTE(review): each tuple presumably holds the point-separated configuration path and the
    # new value that are processed by ``_set_options()`` -- confirm against the implementation.
    @abstractmethod
    def set_options(self, configs: Union[Tuple[str, Any], List[Tuple[str, Any]]]) -> None: ...



[docs]
    # Abstract method; the implementation is provided by the concrete subclass.
    #
    # Parameters (as declared in the signature):
    #   input_value      -- a string or an RDKit mol object.
    #   namespace        -- a string; NOTE(review): presumably stored as ``_namespace``.
    #   input_format     -- a string, default "smiles".
    #   read_energy      -- a boolean, default ``False``.
    #   prune_by_energy  -- ``None`` (default) or a 2-tuple of a number and a string;
    #                       NOTE(review): presumably an energy window and its unit.
    #   output_directory -- ``None`` (default) or a string; NOTE(review): presumably forwarded
    #                       to ``_setup_output_directory()``.
    @abstractmethod
    def read_input(

        self,
        input_value: Union[str, Chem.rdchem.Mol],
        namespace: str,
        input_format: str = "smiles",
        read_energy: bool = False,
        prune_by_energy: Optional[Tuple[Union[int, float], str]] = None,
        output_directory: Optional[str] = None,
    ) -> None: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Returns either
    # a PIL PNG image or an ipywidgets ``VBox``.
    #
    # Parameters (as declared in the signature):
    #   index_type -- a string or ``None``, default "atom".
    #   in_3D      -- a boolean, default ``False``; NOTE(review): presumably selects which of
    #                 the two return types is produced -- confirm against the implementation.
    #   image_size -- a 2-tuple of integers, default (500, 500).
    @abstractmethod
    def show_molecule(

        self,
        index_type: Optional[str] = "atom",
        in_3D: bool = False,
        image_size: Tuple[int, int] = (500, 500),
    ) -> Union[PngImagePlugin.PngImageFile, ipywidgets.VBox]: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Takes the
    # charge as an integer.
    # NOTE(review): presumably sets the charge of the molecule in the molecule vault.
    @abstractmethod
    def set_charge(self, charge: int) -> None: ...



[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Takes the
    # multiplicity as an integer.
    # NOTE(review): presumably sets the spin multiplicity of the molecule in the molecule vault.
    @abstractmethod
    def set_multiplicity(self, multiplicity: int) -> None: ...



[docs]
    # Abstract method; the implementation is provided by the concrete subclass. The parameters
    # mirror those of the private ``_attach_smiles()`` method, which executes the attachment of
    # a SMILES string to a molecule vault hosting a 3D molecule.
    # NOTE(review): presumably the implementation validates the input and then delegates to
    # ``_attach_smiles()`` -- confirm against the implementation.
    #
    # Defaults: ``align=True``, ``connectivity_method="connect_the_dots"``,
    # ``covalent_radius_factor=1.3``.
    @abstractmethod
    def attach_smiles(

        self,
        smiles: str,
        align: bool = True,
        connectivity_method: str = "connect_the_dots",
        covalent_radius_factor: Union[int, float] = 1.3,
    ) -> None: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass.
    #
    # Parameters (as declared in the signature):
    #   energy_data     -- a single (number, string) tuple or a list of such tuples.
    #   state           -- a string, default "n".
    #   prune_by_energy -- ``None`` (default) or a 2-tuple of a number and a string.
    # NOTE(review): presumably the implementation normalizes ``energy_data`` to a list and
    # delegates to ``_attach_energy()`` -- confirm against the implementation.
    @abstractmethod
    def attach_energy(

        self,
        energy_data: Union[Tuple[Union[int, float], str], List[Tuple[Union[int, float], str]]],
        state: str = "n",
        prune_by_energy: Optional[Tuple[Union[int, float], str]] = None,
    ) -> None: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass.
    #
    # Parameters (as declared in the signature):
    #   electronic_structure_data -- a string or a list of strings; NOTE(review): presumably
    #                                file path(s), confirm against the implementation.
    #   state                     -- a string, default "n".
    #   enable_structure_sanity_check             -- a boolean, default ``True``.
    #   structure_sanity_check_relative_tolerance -- a number, default 0.001.
    #   structure_sanity_check_absolute_tolerance -- a number, default 0.0001.
    @abstractmethod
    def attach_electronic_structure(

        self,
        electronic_structure_data: Union[str, List[str]],
        state: str = "n",
        enable_structure_sanity_check: bool = True,
        structure_sanity_check_relative_tolerance: Union[int, float] = 0.001,
        structure_sanity_check_absolute_tolerance: Union[int, float] = 0.0001,
    ) -> None: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass.
    #
    # Parameters (as declared in the signature):
    #   connectivity_method     -- a string, default "connect_the_dots".
    #   covalent_radius_factor  -- a number, default 1.3.
    #   allow_charged_fragments -- a boolean, default ``True``.
    #   embed_chiral            -- a boolean, default ``True``.
    # NOTE(review): the parameter names match options of RDKit's ``rdDetermineBonds`` module;
    # presumably they are forwarded to it -- confirm against the implementation.
    @abstractmethod
    def determine_bonds(

        self,
        connectivity_method: str = "connect_the_dots",
        covalent_radius_factor: Union[int, float] = 1.3,
        allow_charged_fragments: bool = True,
        embed_chiral: bool = True,
    ) -> None: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass.
    #
    # Parameters (as declared in the signature):
    #   engine          -- a string; NOTE(review): presumably selects the program used for the
    #                      calculation (the module imports ``Psi4SP`` and ``XtbSP``) -- confirm.
    #   redox           -- a string, default "n".
    #   prune_by_energy -- ``None`` (default) or a 2-tuple of a number and a string.
    @abstractmethod
    def calculate_electronic_structure(

        self,
        engine: str,
        redox: str = "n",
        prune_by_energy: Optional[Tuple[Union[int, float], str]] = None,
    ) -> None: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Both
    # ``atom_indices`` and ``feature_indices`` are either a string, an integer, or a list of
    # integers.
    # NOTE(review): the string form is presumably a keyword such as "all" -- confirm against
    # the implementation.
    @abstractmethod
    def featurize_atoms(

        self,
        atom_indices: Union[str, int, List[int]],
        feature_indices: Union[str, int, List[int]],
    ) -> None: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Both
    # ``bond_indices`` and ``feature_indices`` are either a string, an integer, or a list of
    # integers.
    # NOTE(review): the string form is presumably a keyword such as "all" -- confirm against
    # the implementation.
    @abstractmethod
    def featurize_bonds(

        self,
        bond_indices: Union[str, int, List[int]],
        feature_indices: Union[str, int, List[int]],
    ) -> None: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Returns a
    # pandas DataFrame, a dictionary keyed by integers, a list of RDKit mol objects, or a
    # single RDKit mol object.
    #
    # Parameters (as declared in the signature):
    #   atom_indices   -- a string, an integer, or a list of integers, default "all".
    #   output_format  -- a string, default "df"; NOTE(review): presumably selects which of the
    #                     return types is produced -- confirm against the implementation.
    #   reduce         -- a boolean, default ``False``.
    #   temperature    -- a number, default 298.15; NOTE(review): unit presumably Kelvin.
    #   ignore_invalid -- a boolean, default ``True``.
    @abstractmethod
    def return_atom_features(

        self,
        atom_indices: Union[str, int, List[int]] = "all",
        output_format: str = "df",
        reduce: bool = False,
        temperature: Union[int, float] = 298.15,
        ignore_invalid: bool = True,
    ) -> Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol]: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Returns a
    # pandas DataFrame, a dictionary keyed by integers, a list of RDKit mol objects, or a
    # single RDKit mol object.
    #
    # Parameters (as declared in the signature):
    #   bond_indices   -- a string, an integer, or a list of integers, default "all".
    #   output_format  -- a string, default "df"; NOTE(review): presumably selects which of the
    #                     return types is produced -- confirm against the implementation.
    #   reduce         -- a boolean, default ``False``.
    #   temperature    -- a number, default 298.15; NOTE(review): unit presumably Kelvin.
    #   ignore_invalid -- a boolean, default ``True``.
    @abstractmethod
    def return_bond_features(

        self,
        bond_indices: Union[str, int, List[int]] = "all",
        output_format: str = "df",
        reduce: bool = False,
        temperature: Union[int, float] = 298.15,
        ignore_invalid: bool = True,
    ) -> Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol]: ...


[docs]
    # Abstract method; the implementation is provided by the concrete subclass. ``origin`` is
    # either a string, a list of strings, or ``None`` (default).
    # NOTE(review): ``None`` presumably clears the cached atom features of all origins --
    # confirm against the implementation.
    @abstractmethod
    def clear_atom_feature_cache(self, origin: Optional[Union[str, List[str]]] = None) -> None: ...



[docs]
    # Abstract method; the implementation is provided by the concrete subclass. ``origin`` is
    # either a string, a list of strings, or ``None`` (default).
    # NOTE(review): ``None`` presumably clears the cached bond features of all origins --
    # confirm against the implementation.
    @abstractmethod
    def clear_bond_feature_cache(self, origin: Optional[Union[str, List[str]]] = None) -> None: ...



[docs]
    # Abstract method; the implementation is provided by the concrete subclass. Takes a
    # dictionary with string keys describing the custom featurizer.
    # NOTE(review): the required keys of ``custom_metadata`` are not visible here -- see the
    # implementation.
    @abstractmethod
    def add_custom_featurizer(self, custom_metadata: Dict[str, Any]) -> None: ...



[docs]
    def _init_logging(self, log_file_name: Any) -> None:
        """Set up the logging to a file with the provided log file name.

        Initially, the input is checked for validity. If the input is valid, old handlers are
        removed from the root logger, a file handler with the BONAFIDE log format is attached
        to the root logger, the capturing of warnings is enabled, and the log file header is
        written.

        Parameters
        ----------
        log_file_name : Any
            The name of the log file to which the logging messages should be written. The type
            check is delegated to ``self._check_is_of_type()`` (a string is expected).

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If ``log_file_name`` is an empty or whitespace-only string.
        FileExistsError
            If something already exists at the path ``log_file_name``.
        """
        # Check input type
        self._check_is_of_type(
            expected_type=str, value=log_file_name, parameter_name="log_file_name"
        )

        # Check if input is empty
        if log_file_name.strip() == "":
            _errmsg = "Invalid input to 'log_file_name': must not be an empty string."
            raise ValueError(f"{self._loc}(): {_errmsg}")

        # Check if log file already exists
        if os.path.exists(log_file_name):
            _errmsg = (
                f"The log file at '{os.path.abspath(log_file_name)}' already exists. "
                "Remove or rename the file before running BONAFIDE with the provided "
                "log file name."
            )
            raise FileExistsError(f"{self._loc}(): {_errmsg}")

        # Remove potential old handlers to avoid logging conflicts
        _root_logger = logging.getLogger()
        for handler in _root_logger.handlers[:]:  # Use slice to avoid modification during iteration
            # Don't remove pytest-related handlers when tests are run
            if (
                type(handler).__name__ in ["LogCaptureHandler", "_LiveLoggingNullHandler"]
                and os.environ.get("BONAFIDE_PYTEST_TESTING_SESSION", "0") == "1"
            ):
                continue
            handler.close()
            _root_logger.removeHandler(hdlr=handler)

        # Setup logging
        _handler = logging.FileHandler(filename=log_file_name)
        _format = "%(asctime)s | %(levelname)s | %(message)s"
        _dateformat = "%Y-%m-%d %H:%M:%S"
        _handler.setFormatter(IndentationFormatter(fmt=_format, datefmt=_dateformat))

        # Attach the handler directly to the root logger. ``logging.basicConfig()`` does nothing
        # if the root logger still has handlers, which is the case whenever the pytest handlers
        # are kept above; the file handler would then be opened but never be used.
        _root_logger.setLevel(logging.INFO)
        _root_logger.addHandler(_handler)

        # Capture warnings from other programs that calculate the features
        logging.captureWarnings(capture=True)

        # Log file header
        logging.info(
            "=======================================================================\n"
            r"            ____   __   __ _   __   ____  ___  ____  ____             " + "\n"
            r"           (  _ \ /  \ (  ( \ / _\ (  __)(   )(    \(  __)            " + "\n"
            r"            ) B ((  O )/  N // A  \ )F_)  )I(  ) D ( )E_)             " + "\n"
            r"           (____/ \__/ \_)__)\_/\_/(__)  (___)(____/(____)            " + "\n"
            "                                                                       \n"
            "               Features for Atoms and Bonds in Molecules               \n"
            "=======================================================================\n"
            f"* Version:                 {version(distribution_name='bonafide')}\n"
            "* Documentation:           https://molecularai.github.io/atom-bond-featurizer\n"
            "* GitHub:                  https://github.com/MolecularAI/atom-bond-featurizer\n"
            "* DOI:                     10.26434/chemrxiv.15001386/v1\n"
            "* Reference:               ChemRXiv 2026\n"
            "\n"
            f"* Installation directory:  {os.path.dirname(__file__)}\n"
        )



[docs]
    def _load_config_file(self) -> None:
        """Read the default configuration settings from the ``_feature_config.toml`` file.

        The file is expected next to this module. Its parsed content is stored in
        ``self._feature_config`` and subsequently checked for disallowed keys with
        ``self._check_config_dict()``.

        Returns
        -------
        None

        Raises
        ------
        RuntimeError
            If the file cannot be opened or parsed.
        """
        toml_name = "_feature_config.toml"
        toml_path = os.path.join(os.path.dirname(__file__), toml_name)

        # Any failure while opening or parsing the file is reported in the same way
        try:
            with open(toml_path, "rb") as toml_handle:
                self._feature_config = tomllib.load(toml_handle)
        except Exception as e:
            _errmsg = f"Error while reading the '{toml_name}' file: {e}."
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise RuntimeError(f"{self._loc}(): {_errmsg}")

        # Validate the keys of the loaded settings
        self._check_config_dict()

        _infmsg = (
            f"'{self._namespace}' | {self._loc}()\nDefault feature configuration settings "
            f"successfully loaded from '{toml_path}'."
        )
        logging.info(_infmsg)



[docs]
    def _check_config_dict(self) -> None:
        """Ensure that the configuration settings do not contain disallowed keys.

        The flattened keys of ``self._feature_config`` are compared with
        ``ATTRIBUTE_BLACK_LIST``. The listed names are used internally for other data and must
        therefore not appear as keys of the configuration settings.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            For the first key that is found in ``ATTRIBUTE_BLACK_LIST``.
        """
        flattened_keys = flatten_dict(self._feature_config, [])

        for candidate in flattened_keys:
            # Allowed keys need no further treatment
            if candidate not in ATTRIBUTE_BLACK_LIST:
                continue

            _errmsg = (
                f"'{candidate}' is not an allowed key in the '_feature_config.toml' "
                "configuration file."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")



[docs]
    def _load_feature_info_file(self) -> None:
        """Read the ``_feature_info.json`` feature configuration file that stores all implemented
        features with their associated metadata.

        The file is expected next to this module and is decoded as UTF-8. After reading the
        file, its keys are converted to integers and the data is processed with
        ``self._process_feature_info_dict()`` to define the atom and bond feature indices for 2D
        and 3D molecules.

        Returns
        -------
        None

        Raises
        ------
        RuntimeError
            If the file cannot be opened or parsed.
        """
        # Read file
        _file_name = "_feature_info.json"
        _feature_info_file_path = os.path.join(os.path.dirname(__file__), _file_name)
        try:
            # Explicit encoding: without it, the decoding depends on the locale of the platform
            with open(_feature_info_file_path, "r", encoding="utf-8") as feature_file:
                self._feature_info = json.load(feature_file)
        except Exception as e:
            _errmsg = f"Error while reading the '{_file_name}' file: {e}."
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise RuntimeError(f"{self._loc}(): {_errmsg}")

        # JSON object keys are always strings, the feature indices are used as integers
        self._feature_info = {int(idx): data for idx, data in self._feature_info.items()}
        self._process_feature_info_dict()

        logging.info(
            f"'{self._namespace}' | {self._loc}()\nFeature information file successfully loaded "
            f"from '{_feature_info_file_path}'.\n"
        )



[docs]
    def _process_feature_info_dict(self) -> None:
        """Derive the feature index lists and the feature information DataFrame from
        ``self._feature_info``.

        The DataFrame has one row per feature index (index named "INDEX", stored as Python
        integers). The 2D index lists contain the features with dimensionality "2D", the 3D
        index lists contain all features of the respective feature type because all 2D features
        are also valid for 3D molecules.

        Returns
        -------
        None
        """
        # One row per feature, the index column is named "INDEX"
        info_table = (
            pd.DataFrame(self._feature_info).T.reset_index(names="INDEX").set_index("INDEX")
        )
        info_table.index = pd.Index(info_table.index.map(int), dtype=object)

        # Row selections that are shared by the index lists
        atom_rows = info_table["feature_type"] == "atom"
        bond_rows = info_table["feature_type"] == "bond"
        rows_2d = info_table["dimensionality"] == "2D"

        # Features for 2D molecules
        self._atom_feature_indices_2D = list(info_table[atom_rows & rows_2d].index)
        self._bond_feature_indices_2D = list(info_table[bond_rows & rows_2d].index)

        # Features for 3D molecules (no restriction on the dimensionality)
        self._atom_feature_indices_3D = list(info_table[atom_rows].index)
        self._bond_feature_indices_3D = list(info_table[bond_rows].index)

        self._feature_info_df = info_table



[docs]
    def _list_features(self, feature_type: str, **kwargs: Any) -> pd.DataFrame:
        """Display all available features for atoms or bonds.

        Parameters
        ----------
        feature_type : str
            The type of features to be listed, either "atom" or "bond".
        **kwargs: Any
            Additional optional keyword arguments for filtering the feature DataFrame. If empty,
            all features are returned.

        Returns
        -------
        pd.DataFrame
            A pandas DataFrame containing the selected features and their characteristics.
        """
        logging.info(
            f"'{self._namespace}' | {self._loc}() | START\n> 'arguments':  {kwargs}\n-----"
        )

        # Pre-checks
        if self._feature_info_df.shape == (0, 0):
            _errmsg = (
                "The feature information file was not successfully loaded. Therefore, "
                f"{feature_type} features cannot be listed."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise RuntimeError(f"{self._loc}(): {_errmsg}")

        df = self._feature_info_df.copy()
        df["name"] = [n.split("-")[-1] for n in df["name"]]
        df = df[df["feature_type"] == feature_type]

        _cols = list(df.columns)
        _cols.remove("feature_type")
        _filters = []

        for col_name, filter_key in kwargs.items():
            # Check if valid column name was passed
            if col_name not in _cols:
                _errmsg = (
                    f"Invalid input to '**kwargs': '{col_name}' is not a valid column in "
                    f"the feature DataFrame, available: {_cols}."
                )
                logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                raise ValueError(f"{self._loc}(): {_errmsg}")

            # Format filter key dependent on passed column name
            if col_name in ["name", "origin", "feature_type", "data_type", "config_path"]:
                filter_key = standardize_string(inp_data=filter_key)
            elif col_name in ["dimensionality"]:
                filter_key = standardize_string(inp_data=filter_key, case="upper")

            # Filter DataFrame
            df = df[df[col_name] == filter_key]
            _filters.append(f"{col_name}={filter_key}")

        logging.info(
            f"'{self._namespace}' | {self._loc}()\nFeature DataFrame was compiled. "
            f"Applied filters: {_filters}."
        )
        logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
        return df



[docs]
    def _set_options(self, config_path: str, value: Any) -> None:
        """Execute the change of the configuration settings for the individual programs used for
        feature calculation.

        The configuration settings tree is walked along the point-separated path, the new value
        is set, and the affected branch is validated with ``config_data_validator()``. If the
        validation (or the preceding collection of the settings) fails, the old value is
        restored before the error is propagated.

        Parameters
        ----------
        config_path : str
            The path to the configuration setting to be changed (point-separated).
        value : Any
            The new value for the configuration setting.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If the path does not point to an existing setting, including paths that continue
            beyond a setting (a setting is not a branch of the tree).
        """
        config_key_list = config_path.split(".")
        branch_keys = config_key_list[:-1]
        setting_name = config_key_list[-1]

        # Walk through the configurations dictionary. Only dictionaries are branches; stepping
        # into a plain value must be reported as invalid input and not fail with a TypeError.
        section: Any = self._feature_config
        for key in branch_keys:
            if not isinstance(section, dict) or key not in section:
                _errmsg = (
                    f"Invalid input to 'configs': '{key}' is not a valid branch within "
                    "the configuration settings tree."
                )
                logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                raise ValueError(f"{self._loc}(): {_errmsg}")
            section = section[key]

        # The setting must already exist in the addressed branch
        if not isinstance(section, dict) or setting_name not in section:
            _errmsg = (
                f"Invalid input to 'configs': '{setting_name}' is not a valid option "
                f"for '{'.'.join(branch_keys)}'."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

        # Set the value
        _old_value = section[setting_name]
        section[setting_name] = value

        # Validate the new configuration settings; every failure after the value was set
        # restores the old value so that the settings are never left in an unchecked state
        try:
            params = self._get_configs(key_list=list(branch_keys), include_root_data=True)
            params["feature_info"] = self._feature_info
            params = config_data_validator(
                config_path=list(branch_keys), params=params, _namespace=self._namespace
            )
        except Exception:
            section[setting_name] = _old_value
            raise

        logging.info(
            f"'{self._namespace}' | {self._loc}()\n'{'.'.join(config_key_list)}' was updated. "
            f"The new value is: {params[setting_name]}."
        )



[docs]
    def _setup_output_directory(self, dir_path: str) -> None:
        """Create the directory that receives all output files of the feature calculation.

        On success, the absolute path of the directory is stored in ``self._output_directory``
        and ``self._keep_output_files`` is set to ``True``.

        Parameters
        ----------
        dir_path : str
            The path to the output directory to be created.

        Returns
        -------
        None

        Raises
        ------
        FileExistsError
            If a directory already exists at ``dir_path``.
        PermissionError
            If the parent directory of ``dir_path`` is not writable.
        RuntimeError
            If the creation of the directory fails for any other reason.
        """
        _prefix = f"'{self._namespace}' | {self._loc}()"
        target_dir = os.path.abspath(dir_path)
        parent_dir = os.path.dirname(target_dir)

        # An already existing directory is never reused
        if os.path.isdir(target_dir):
            _errmsg = (
                f"The directory at '{target_dir}' already exists and can therefore not be used as "
                "output directory."
            )
            logging.error(f"{_prefix}\n{_errmsg}")
            raise FileExistsError(f"{self._loc}(): {_errmsg}")

        # The new directory can only be created inside a writable parent directory
        if not os.access(path=parent_dir, mode=os.W_OK):
            _errmsg = (
                f"The directory at '{parent_dir}' is not writable and can therefore not be used "
                "as parent directory of the output directory."
            )
            logging.error(f"{_prefix}\n{_errmsg}")
            raise PermissionError(f"{self._loc}(): {_errmsg}")

        try:
            os.mkdir(path=target_dir)
        except Exception as e:
            _errmsg = f"Creating the output directory at '{target_dir}' failed: {e}."
            logging.error(f"{_prefix}\n{_errmsg}")
            raise RuntimeError(f"{self._loc}(): {_errmsg}")

        # A user-defined output directory implies that the output files are kept
        self._keep_output_files = True
        self._output_directory = target_dir
        logging.info(
            f"{_prefix}\nOutput directory was created at "
            f"'{target_dir}'. All generated output files will be kept."
        )



[docs]
    def _attach_smiles(
        self, smiles: str, align: bool, connectivity_method: str, covalent_radius_factor: float
    ) -> None:
        """Execute the attachment of a SMILES string to a molecule vault hosting a 3D molecule.

        The SMILES string is first checked against the first mol object of the molecule vault
        (number of atoms, molecular formula, and formal charge). Afterwards, it is bound to
        every mol object of the vault. Conformers for which the binding fails are set to be
        invalid; the failure is logged but not raised.

        For details on how atom connectivity is determined in the SMILES attachment process,
        please refer to the RDKit documentation
        (https://rdkit.org/docs/source/rdkit.Chem.rdDetermineBonds.html, last accessed on
        29.09.2025).

        Parameters
        ----------
        smiles : str
            The SMILES string that should be attached to the molecule vault.
        align : bool
            If ``True``, the atom indices of the initially provided 3D structure(s) are preserved,
            if ``False``, the atoms are re-ordered according to the order in the SMILES string.
        connectivity_method : str
            The name of the method that is used to determine atom connectivity when binding the
            SMILES string to the molecule vault. Available options are "connect_the_dots",
            "van_der_waals", and "hueckel".
        covalent_radius_factor : float
            A scaling factor that is applied to the covalent radii of the atoms when determining the
            atom connectivity with the van-der-Waals method.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If the SMILES string cannot be read or if it is not compatible with the molecule in
            the molecule vault (number of atoms, molecular formula, or formal charge).
        """
        # Get the first mol object of the molecule vault as reference
        ref_mol = Chem.Mol(self.mol_vault.mol_objects[0])

        # Read in the provided SMILES string
        smiles_mol, error_message = read_smiles(smiles)
        if error_message is not None:
            logging.error(f"'{self._namespace}' | {self._loc}()\n{error_message}")
            raise ValueError(f"{self._loc}(): {error_message}")

        # Check if provided SMILES is compatible with already existing molecule in the vault
        assert smiles_mol is not None  # for type checker
        _ref_atom_count = ref_mol.GetNumAtoms()
        _smiles_atom_count = smiles_mol.GetNumAtoms()
        if _ref_atom_count != _smiles_atom_count:
            _errmsg = (
                "The number of atoms of the structure(s) in the molecule vault "
                f"({_ref_atom_count}) does not match the number of atoms from the SMILES string "
                f"({_smiles_atom_count}). Did you add hydrogen atoms to the SMILES string?"
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

        # Double-check with molecular formula
        _ref_formula = get_molecular_formula(mol=ref_mol)
        _smiles_formula = get_molecular_formula(mol=smiles_mol)
        if _ref_formula != _smiles_formula:
            _errmsg = (
                f"Molecular formula of the molecule in the molecule vault ({_ref_formula}) does "
                "not match the molecular formula of the molecule represented by the SMILES string "
                f"({_smiles_formula})."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

        # Double-check with molecular charge
        _ref_charge = self.mol_vault.charge
        _smiles_charge = get_charge_from_mol_object(mol=smiles_mol)
        if _ref_charge != _smiles_charge:
            _errmsg = (
                f"Formal charge of the molecule in the molecule vault ({_ref_charge}) does not "
                "match the formal charge of the molecule represented by the SMILES string "
                f"({_smiles_charge})."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

        # Add smiles string to the molecule vault
        self.mol_vault.smiles = smiles

        if align is False:
            _wmsg = (
                "The align parameter is set to False. This will change the atom order of the "
                "molecule in the vault to the one defined by the SMILES string. Ensure that the "
                "new atom order is consistent with any follow-up steps; most importantly, with the "
                "atom order of electronic structure data files that might be attached and used for "
                "feature calculation."
            )
            logging.warning(f"'{self._namespace}' | {self._loc}()\n{_wmsg}")

        # Update the bond information of the mol objects of the molecule vault
        for idx, mol in enumerate(self.mol_vault.mol_objects):
            # Copy smiles_mol to avoid modifying it throughout the loop
            _smiles_mol = Chem.Mol(smiles_mol)

            # Try to attach the SMILES string to the conformer. A failure is either signaled by
            # an exception or by a returned error message; both are handled identically below.
            _failure: Optional[str]
            try:
                assert isinstance(self.mol_vault.charge, int)  # for type checker
                new_mol, _failure = bind_smiles_with_xyz(
                    smiles_mol=_smiles_mol,
                    xyz_mol=mol,
                    align=align,
                    connectivity_method=connectivity_method,
                    covalent_radius_factor=covalent_radius_factor,
                    charge=self.mol_vault.charge,
                )
            except Exception as e:
                _failure = f"{e}"

            # Error handling: invalidate the conformer and continue with the next one
            if _failure is not None:
                _errmsg = (
                    f"Attaching the SMILES string to the conformer with index {idx} failed: "
                    f"{_failure}. Therefore, the conformer is set to be invalid."
                )
                self.mol_vault.is_valid[idx] = False
                logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                continue

            # Do attachment
            self.mol_vault.mol_objects[idx] = new_mol
            _infmsg = (
                f"'{self._namespace}' | {self._loc}()\nSMILES string was "
                f"successfully attached to the conformer with index {idx}.\n"
            )
            if align is True:
                _infmsg += "Initial atom order was maintained within the molecule vault."
            else:
                _infmsg += "Atom order of the provided SMILES string was applied."
            logging.info(_infmsg)

        # Set bonds as determined
        self.mol_vault.bonds_determined = True

        # Update elements list with the potentially changed atom order
        self.mol_vault.get_elements()



[docs]
    def _attach_energy(
        self,
        energy_data: List[Tuple[Union[int, float], str]],
        state: str,
    ) -> None:
        """Execute the attachment of energy data to a molecule vault hosting a 3D molecule.

        Every energy value is checked, converted with ``extract_energy_from_string()``, and
        appended to the energy list of the requested state (labelled with the unit "kj_mol").
        The value and unit as submitted are stored in the corresponding input energy list.
        Conformers with an energy of ``None`` are set to be invalid.

        Parameters
        ----------
        energy_data : List[Tuple[Union[int, float], str]]
            The list of 2-tuples containing the energy values and respective units to be attached to
            the molecule vault.
        state : str
            The redox state of the energy data to be attached. Can either be "n" (actual molecule),
            "n+1" (actual molecule plus one electron), or "n-1" (actual molecule minus one
            electron).

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If ``state`` is not one of the supported states, if energy data was already attached
            for ``state``, or if ``extract_energy_from_string()`` reports an error message.
        RuntimeError
            If ``extract_energy_from_string()`` raises an exception.
        """
        # Names of the molecule vault attributes that belong to the individual states:
        # (read flag, energy list, energy list as submitted)
        _state_attributes = {
            "n": ("energies_n_read", "energies_n", "_input_energies_n"),
            "n-1": ("energies_n_minus1_read", "energies_n_minus1", "_input_energies_n_minus1"),
            "n+1": ("energies_n_plus1_read", "energies_n_plus1", "_input_energies_n_plus1"),
        }

        # An unknown state has no target lists; fail early with a clear message instead of
        # running into an unbound variable when the first energy is attached
        if not isinstance(state, str) or state not in _state_attributes:
            _errmsg = (
                f"Invalid input to 'state': '{state}' is not supported, "
                f"available: {list(_state_attributes)}."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

        _read_flag, _energies_attr, _input_energies_attr = _state_attributes[state]

        # Check if energy was already read
        if getattr(self.mol_vault, _read_flag) is True:
            _errmsg = (
                f"Energy data for state '{state}' is already attached to the molecule vault "
                "and cannot be attached again."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

        # Get correct energy data list
        _energy_data_list = getattr(self.mol_vault, _energies_attr)
        _energy_data_list_as_submitted = getattr(self.mol_vault, _input_energies_attr)

        # Process all energy data
        energy_value: Optional[Union[int, float]]
        for idx, (energy_value, unit) in enumerate(energy_data):
            # Check energy value (None is accepted and handled below)
            if energy_value is not None:
                self._check_is_of_type(
                    expected_type=[int, float],
                    value=energy_value,
                    parameter_name="energy_data",
                    prefix=f"energy value for conformer with index {idx}",
                )

            # Check unit
            self._check_is_of_type(
                expected_type=str,
                value=unit,
                parameter_name="energy_data",
                prefix=f"energy unit for conformer with index {idx}",
            )

            # Handle energy input (check unit and convert to kJ/mol)
            if energy_value is not None:
                _energy_value_pair = f"{energy_value} {unit}"
                try:
                    energy_value_as_submitted, unit_as_submitted, energy_value, error_message = (
                        extract_energy_from_string(line=_energy_value_pair)
                    )
                except Exception as e:
                    _errmsg = f"Reading of input to 'energy_data' failed: {e}."
                    logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                    raise RuntimeError(f"{self._loc}(): {_errmsg}")

                if error_message is not None:
                    logging.error(f"'{self._namespace}' | {self._loc}()\n{error_message}")
                    raise ValueError(f"{self._loc}(): {error_message}.")

            else:
                energy_value_as_submitted = None
                unit_as_submitted = unit

            # Attach data
            _energy_data_list.append((energy_value, "kj_mol"))
            _energy_data_list_as_submitted.append((energy_value_as_submitted, unit_as_submitted))

            # The state counts as read as soon as the first energy was attached
            setattr(self.mol_vault, _read_flag, True)

            if energy_value is None:
                logging.warning(
                    f"'{self._namespace}' | {self._loc}()\nThe energy of conformer with index "
                    f"{idx} for state '{state}' is None. The conformer is therefore set to be "
                    "invalid. This is probably not intended."
                )
                self.mol_vault.is_valid[idx] = False

            logging.info(
                f"'{self._namespace}' | {self._loc}()\nEnergy was attached to conformer with "
                f"index {idx} for state '{state}'."
            )



[docs]
    def _attach_electronic_structure(
        self,
        electronic_struc_list: List[str],
        _el_struc_list: List[str],
        _el_struc_types: List[str],
        state: str,
        enable_structure_sanity_check: bool,
        structure_sanity_check_relative_tolerance: float,
        structure_sanity_check_absolute_tolerance: float,
    ) -> None:
        """Execute the attachment of electronic structure data file(s) to a molecule vault hosting
        a 3D molecule including the sanity check.

        Parameters
        ----------
        electronic_struc_list : List[str]
            The list of paths to the electronic structure data files to be attached to the
            molecule vault.
        _el_struc_list : List[str]
            The attribute of the ``MolVault`` object that stores the paths to the electronic
            structure data files.
        _el_struc_types : List[str]
            The attribute of the ``MolVault`` object that stores the file types of the
            electronic structure data files (file extensions).
        state : str
            The redox state of the electronic structure data to be attached. Can either be "n"
            (actual molecule), "n+1" (actual molecule plus one electron), or "n-1" (actual molecule
            minus one electron).
        enable_structure_sanity_check : bool
            Whether to perform a sanity check of the structure provided with the electronic
            structure data.
        structure_sanity_check_relative_tolerance : float
            The relative tolerance for the structure sanity check.
        structure_sanity_check_absolute_tolerance : float
            The absolute tolerance for the structure sanity check.

        Returns
        -------
        None
        """
        # Loop over all provided electronic structure files and process them
        for idx, el_struc_file in enumerate(electronic_struc_list):
            # Determine file type
            if el_struc_file is not None:
                file_type = os.path.splitext(el_struc_file)[-1][1:]

                # Check if the file exists
                if not os.path.exists(el_struc_file):
                    # Avoid leaving the MolVault in a partially-attached state when raising
                    _el_struc_list.clear()
                    _el_struc_types.clear()

                    _errmsg = (
                        f"Invalid input to 'electronic_structure_data': path to the input file at "
                        f"'{el_struc_file}' is invalid."
                    )
                    logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                    raise FileNotFoundError(f"{self._loc}(): {_errmsg}")

                file_path = os.path.join(os.getcwd(), el_struc_file)

                # Check file type
                _file_type = standardize_string(inp_data=file_type)
                if _file_type not in ELECTRONIC_STRUCTURE_DATA_FILE_EXTENSIONS:
                    logging.warning(
                        f"'{self._namespace}' | {self._loc}()\nElectronic structure data files of "
                        f"type '{file_type}' may lead to unexpected behavior or errors during "
                        "feature calculation. Ensure that the obtained results are valid. BONAFIDE "
                        "was developed and extensively tested with 'molden' and 'fchk' files."
                    )

                # Make a sanity check of the structure to be attached against the already present
                # one if requested
                if enable_structure_sanity_check is True:
                    error_message = self._attach_electronic_structure_sanity_check(
                        conformer_idx=idx,
                        electronic_structure_data_file_path=file_path,
                        structure_sanity_check_absolute_tolerance=structure_sanity_check_absolute_tolerance,
                        structure_sanity_check_relative_tolerance=structure_sanity_check_relative_tolerance,
                    )
                    if error_message is not None:
                        # Avoid leaving the MolVault in a partially-attached state when raising
                        _el_struc_list.clear()
                        _el_struc_types.clear()

                        _errmsg = (
                            f"The structure sanity check of the electronic structure data file "
                            f"failed for conformer with index {idx} for state '{state}': "
                            f"{error_message}. The conformer is therefore set to be invalid."
                        )
                        self.mol_vault.is_valid[idx] = False
                        logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                        raise ValueError(f"{self._loc}(): {_errmsg}")

                    logging.info(
                        f"'{self._namespace}' | {self._loc}()\nElectronic structure data "
                        f"(*.{file_type} file) was validated and attached to conformer with index "
                        f"{idx} for state '{state}'.",
                    )

                else:
                    logging.warning(
                        f"'{self._namespace}' | {self._loc}()\nElectronic structure data "
                        f"(*.{file_type} file) was attached to conformer with index {idx} for "
                        f"state '{state}' without a sanity check of the structure. Ensure that "
                        "the attached structure is valid and consistent with the already "
                        "present one."
                    )

            else:
                file_path = None
                file_type = None
                logging.warning(
                    f"'{self._namespace}' | {self._loc}()\nElectronic structure data of type None "
                    f"was attached to conformer with index {idx} for state '{state}'. The "
                    f"conformer is therefore set to be invalid. This is probably not intended."
                )
                self.mol_vault.is_valid[idx] = False

            # Attach the electronic structure file and the file type to the conformer ensemble
            _el_struc_list.append(file_path)
            _el_struc_types.append(file_type)



[docs]
    def _determine_bonds(
        self,
        connectivity_method: str,
        covalent_radius_factor: float,
        allow_charged_fragments: bool,
        embed_chiral: bool,
    ) -> None:
        """Determine the chemical bonds of every conformer stored in the molecule vault.

        The bonds are determined with ``rdDetermineBonds.DetermineBonds()`` from RDKit, see
        https://rdkit.org/docs/source/rdkit.Chem.rdDetermineBonds.html (last accessed on
        29.09.2025). A conformer for which RDKit raises an exception is flagged as invalid and a
        warning is logged; the remaining conformers are still processed. Afterwards, the
        conformers are compared, the properties of the mol objects are cleaned, and the files
        "nul" and "run.out" are handed to ``clean_up()``.

        Parameters
        ----------
        connectivity_method : str
            The name of the method that is used to determine the bonds. Available options are
            "connect_the_dots", "van_der_waals", and "hueckel".
        covalent_radius_factor : float
            A scaling factor for the covalent radii of the atoms. It is passed to RDKit as
            ``covFactor`` whenever the Hueckel method is not used.
        allow_charged_fragments : bool
            If ``True``, charged fragments are allowed when determining the bonds. If ``False``,
            unpaired electrons are introduced according to the valence of the respective atom.
        embed_chiral : bool
            If ``True``, chiral information will be added to the molecule when determining the
            bonds.

        Returns
        -------
        None
        """
        # The vault is flagged up front, independent of the outcome for individual conformers
        self.mol_vault.bonds_determined = True

        # Translate the method name into the switches understood by RDKit
        hueckel_requested = connectivity_method == "hueckel"
        vdw_requested = connectivity_method != "connect_the_dots"

        for conf_idx, conformer in enumerate(self.mol_vault.mol_objects):
            try:
                # Arguments shared by all methods
                rdkit_kwargs: Dict[str, Any] = {
                    "mol": conformer,
                    "charge": self.mol_vault.charge,
                    "allowChargedFragments": allow_charged_fragments,
                    "embedChiral": embed_chiral,
                }
                # Method-specific arguments
                if hueckel_requested:
                    rdkit_kwargs["useHueckel"] = True
                else:
                    rdkit_kwargs["covFactor"] = covalent_radius_factor
                    rdkit_kwargs["useVdw"] = vdw_requested

                rdDetermineBonds.DetermineBonds(**rdkit_kwargs)

            except Exception as e:
                _errmsg = (
                    "Determining the chemical bonds failed for conformer with index "
                    f"{conf_idx} which is therefore set to be invalid: {e}."
                )
                self.mol_vault.is_valid[conf_idx] = False
                logging.warning(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                continue

            logging.info(
                f"'{self._namespace}' | {self._loc}()\nChemical bonds were determined for "
                f"conformer with index {conf_idx}."
            )

        # All conformers must describe the same molecule
        self.mol_vault.compare_conformers()

        # Remove undesired property values that were introduced while determining the bonds
        self.mol_vault.clean_properties()

        # Remove leftover files (after Hueckel calculation)
        clean_up(to_be_removed=["nul", "run.out"])



[docs]
    def _calculate_electronic_structure(self, engine: str, state: str) -> None:
        """Execute the calculation of the electronic structure of all conformers of a molecule
        vault hosting a 3D molecule.

        Parameters
        ----------
        engine : str
            The name of the electronic structure program to be used, either "psi4" or "xtb".
        state : str
            The redox state of the electronic structure data to be calculated. Can either be "n"
            (actual molecule), "n+1" (actual molecule plus one electron), or "n-1" (actual molecule
            minus one electron).

        Returns
        -------
        None
        """
        # Fetch program-specific parameters and check their validity
        params = self._feature_config[engine]
        params = config_data_validator(
            config_path=[engine], params=params, _namespace=self._namespace
        )

        params["mol_vault"] = self.mol_vault
        params["state"] = state
        params["_keep_output_files"] = self._keep_output_files

        # Adjust charge and multiplicity if needed
        _init_charge = self.mol_vault.charge
        _init_multiplicity = self.mol_vault.multiplicity

        try:
            assert isinstance(self.mol_vault.charge, int)  # for type checker
            assert isinstance(self.mol_vault.multiplicity, int)  # for type checker
            if state == "n+1":
                self.mol_vault.charge -= 1
            if state == "n-1":
                self.mol_vault.charge += 1
            if state in ["n+1", "n-1"]:
                if self.mol_vault.multiplicity == 1:
                    self.mol_vault.multiplicity = 2
                else:
                    self.mol_vault.multiplicity -= 1

            # Initialize respective class for sp calculation(s)
            sp: Union[Psi4SP, XtbSP]
            if engine == "psi4":
                sp = Psi4SP(**params)
            if engine == "xtb":
                sp = XtbSP(**params)

            # Change current working directory to the output files directory
            assert self._output_directory is not None  # for type checker
            os.chdir(self._output_directory)

            try:
                # Run the calculation(s)
                _write_el_struc_file = False
                if self._keep_output_files is True:
                    _write_el_struc_file = True
                energies, electronic_strucs = sp.run(
                    state=state, write_el_struc_file=_write_el_struc_file
                )
            finally:
                # Always reset current working directory to the path where the featurizer was initialized
                os.chdir(self._init_directory)

            # Save the results to the molecule vault
            _init_log = self._loc
            logging.info(
                f"'{self._namespace}' | {self._loc}()\nSingle-point energy calculations done for "
                f"state '{state}'. The calculated energy data for all conformers is automatically "
                f"attached to the molecule vault for state '{state}'."
            )
            self.attach_energy(energy_data=energies, state=state)  # type: ignore[arg-type]
            self._loc = _init_log

            # Automatically attach electronic structure to the molecule vault
            if self._keep_output_files is True:
                logging.info(
                    f"'{self._namespace}' | {self._loc}()\nThe calculated electronic structure "
                    f"data files are automatically attached to the molecule vault for state '{state}'."
                )
                self.attach_electronic_structure(
                    electronic_structure_data=electronic_strucs,  # type: ignore[arg-type]
                    state=state,
                )
            else:
                logging.warning(
                    f"'{self._namespace}' | {self._loc}()\nThe electronic structure data files were "
                    "calculated but not attached to the molecule vault because the output files were "
                    "deleted after the calculations. Specify an output directory in the read_input() "
                    "method ('output_directory' parameter) to keep the output files and automatically "
                    "attach them to the molecule vault."
                )

        finally:
            # Always reset charge and multiplicity to initial values
            self.mol_vault.charge = _init_charge
            self.mol_vault.multiplicity = _init_multiplicity



[docs]
    def _run_featurization(self, feature_indices: List[int], atom_bond_indices: List[int]) -> None:
        """Calculate the requested atom or bond features.

        Features are calculated by running through four nested loops in the following order:

        1. Loop over all requested feature indices.
        2. Loop over all iterable options (if applicable, otherwise a dummy iterable option None
           is used that remains without any effect).
        3. Loop over all conformers in the molecule vault.
        4. Loop over all requested atom or bond indices.

        Parameters
        ----------
        feature_indices : List[int]
            The indices of the features to be calculated.
        atom_bond_indices : List[int]
            The indices of the atoms or bonds for which the features should be calculated.

        Returns
        -------
        None
        """
        elements = copy.deepcopy(self.mol_vault.elements)
        charge = copy.deepcopy(self.mol_vault.charge)
        multiplicity = copy.deepcopy(self.mol_vault.multiplicity)

        #################################
        # Loop over all feature indices #
        #################################
        for feature_idx in feature_indices:
            f_configs = self._feature_info[feature_idx]
            feature_name = f_configs["name"]
            feature_type = f_configs["feature_type"]
            feature_dimensionality = f_configs["dimensionality"]
            config_key_list = f_configs["config_path"].split(".")
            data_type = f_configs["data_type"]
            factory = f_configs["factory"]

            requires_el_struc_data = f_configs["requires_electronic_structure_data"]
            requires_bond_data = f_configs["requires_bond_data"]
            requires_charge = f_configs["requires_charge"]
            requires_multiplicity = f_configs["requires_multiplicity"]

            # Check if for the requested feature, data on the electronic structure is required
            assert isinstance(self.mol_vault.size, int)  # for type checker
            if all(
                [
                    requires_el_struc_data is True,
                    len(self.mol_vault.electronic_strucs_n) < self.mol_vault.size,
                ]
            ):
                _errmsg = (
                    f"For calculating the '{feature_name}' feature (INDEX = {feature_idx}), "
                    "electronic structure data is required but is not available. Attach "
                    "precomputed electronic structure data with the attach_electronic_structure() "
                    "method or calculate it from scratch with the calculate_electronic_structure() "
                    "method."
                )
                logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                raise ValueError(f"{self._loc}(): {_errmsg}")

            # Check if for the requested feature, data on the bonds is required
            if requires_bond_data is True and self.mol_vault.bonds_determined is False:
                _errmsg = (
                    f"For calculating the '{feature_name}' feature (INDEX = {feature_idx}), "
                    "bond data is required but is not available. Determine bonds with the "
                    "determine_bonds() method or provide bond information, e.g., through a "
                    "SMILES string or an SD file."
                )
                logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                raise ValueError(f"{self._loc}(): {_errmsg}")

            # Check if for the requested feature, the charge of the molecule is required
            if requires_charge is True and self.mol_vault.charge is None:
                _errmsg = (
                    f"For calculating the '{feature_name}' feature (INDEX = {feature_idx}), "
                    "the charge of the molecule is required but is not set. Set the charge with "
                    "the set_charge() method."
                )
                logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                raise ValueError(f"{self._loc}(): {_errmsg}")

            # Check if for the requested feature, the multiplicity of the molecule is required
            if requires_multiplicity is True and self.mol_vault.multiplicity is None:
                _errmsg = (
                    f"For calculating the '{feature_name}' feature (INDEX = {feature_idx}), "
                    "the multiplicity of the molecule is required but is not set. Set the "
                    "multiplicity with the set_multiplicity() method before calculating "
                    "this feature."
                )
                logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                raise ValueError(f"{self._loc}(): {_errmsg}")

            # Check if a 3D feature is attempted to be calculated for a 2D ensemble
            # Not required because _atom_feature_indices_2D and _bond_feature_indices_2D exclude
            # all 3D features.

            # Set up the input dictionary for the feature calculation and validate the data
            params = self._get_configs(key_list=config_key_list, include_root_data=True)

            # Temporarily add the entire self._feature_info to the params dictionary,
            # this is removed by config_data_validator after validation. This is required to
            # validate the input to iterable_options for the autocorrelation features (see
            # _ValidateIterableIntOptionMixin in utils/input_validation.py)
            params["feature_info"] = self._feature_info
            params = config_data_validator(
                config_path=config_key_list, params=params, _namespace=self._namespace
            )

            # Add additional information to the params dictionary
            params["_periodic_table"] = self._periodic_table
            params["_functional_groups_smarts"] = self._functional_groups_smarts
            params["_keep_output_files"] = self._keep_output_files
            params["feature_name"] = feature_name
            params["feature_type"] = feature_type
            params["feature_dimensionality"] = feature_dimensionality
            params["elements"] = elements
            params["charge"] = charge
            params["multiplicity"] = multiplicity
            params["ensemble_dimensionality"] = self.mol_vault.dimensionality

            # Add feature caches to the params dictionary
            if feature_type == "atom":
                params["feature_cache"] = self.mol_vault.atom_feature_cache_n
                params["feature_cache_n_minus1"] = self.mol_vault.atom_feature_cache_n_minus1
                params["feature_cache_n_plus1"] = self.mol_vault.atom_feature_cache_n_plus1
            if feature_type == "bond":
                params["feature_cache"] = self.mol_vault.bond_feature_cache

            params["global_feature_cache"] = self.mol_vault.global_feature_cache

            # Handle iterable option. In case no iterable option is present, a dummy iterable
            # option is created (None) which remains without any effect
            if "iterable_option" not in params:
                _iterable_option = [None]
            else:
                _iterable_option = params["iterable_option"]

            ##############################
            # Loop over iterable options #
            ##############################
            for iter_opt in _iterable_option:
                params["iterable_option"] = iter_opt

                ############################
                # Loop over all conformers #
                ############################
                for conf_idx, mol in enumerate(self.mol_vault.mol_objects):
                    # Skip conformers that were labeled as invalid
                    if self.mol_vault.is_valid[conf_idx] is False:
                        logging.warning(
                            f"'{self._namespace}' | {self._loc}()\nSkipping conformer with index "
                            f"{conf_idx} because it was set to be invalid."
                        )
                        continue

                    conformer_name = self.mol_vault.conformer_names[conf_idx]

                    coordinates = None
                    if self.mol_vault.dimensionality == "3D":
                        coordinates = mol.GetConformer(0).GetPositions()

                    energy_n = None
                    if len(self.mol_vault.energies_n) > 0:
                        energy_n = self.mol_vault.energies_n[conf_idx]

                    energy_n_minus1 = None
                    if len(self.mol_vault.energies_n_minus1) > 0:
                        energy_n_minus1 = self.mol_vault.energies_n_minus1[conf_idx]

                    energy_n_plus1 = None
                    if len(self.mol_vault.energies_n_plus1) > 0:
                        energy_n_plus1 = self.mol_vault.energies_n_plus1[conf_idx]

                    electronic_struc_n = None
                    electronic_struc_type_n = None
                    if len(self.mol_vault.electronic_strucs_n) > 0:
                        electronic_struc_n = self.mol_vault.electronic_strucs_n[conf_idx]
                        electronic_struc_type_n = self.mol_vault.electronic_struc_types_n[conf_idx]

                    electronic_struc_n_plus1 = None
                    if len(self.mol_vault.electronic_strucs_n_plus1) > 0:
                        electronic_struc_n_plus1 = self.mol_vault.electronic_strucs_n_plus1[
                            conf_idx
                        ]

                    electronic_struc_n_minus1 = None
                    if len(self.mol_vault.electronic_strucs_n_minus1) > 0:
                        electronic_struc_n_minus1 = self.mol_vault.electronic_strucs_n_minus1[
                            conf_idx
                        ]

                    params["mol"] = Chem.Mol(mol)  # protect the actual mol object by copying it
                    params["conformer_name"] = conformer_name
                    params["conformer_idx"] = conf_idx
                    params["coordinates"] = coordinates
                    params["energy_n"] = energy_n
                    params["energy_n_minus1"] = energy_n_minus1
                    params["energy_n_plus1"] = energy_n_plus1
                    params["electronic_struc_n"] = electronic_struc_n
                    params["electronic_struc_n_plus1"] = electronic_struc_n_plus1
                    params["electronic_struc_n_minus1"] = electronic_struc_n_minus1
                    params["electronic_struc_type_n"] = electronic_struc_type_n

                    ######################################
                    # Loop over all atom or bond indices #
                    ######################################
                    for atom_bond_idx in atom_bond_indices:
                        params["atom_bond_idx"] = atom_bond_idx

                        # Change current working directory to the output files directory
                        assert self._output_directory is not None  # for type checker
                        os.chdir(self._output_directory)

                        try:
                            # Calculate feature with callable factory class
                            calc_feature = FEATURE_FACTORIES[factory]()
                            feature_value, error_message = calc_feature(**params)
                        finally:
                            # Always reset current working directory to path where featurizer was initialized
                            os.chdir(self._init_directory)

                        # Adjust feature name in case the feature class modified it
                        # (when iterable options are used)
                        _feature_name = calc_feature.feature_name

                        # Write the feature value to the RDKit mol object
                        self._set_feature(
                            conf_idx=conf_idx,
                            mol=mol,
                            atom_bond_idx=atom_bond_idx,
                            feature_type=feature_type,
                            feature_name=_feature_name,
                            feature_value=feature_value,
                            error_message=error_message,
                            data_type=data_type,
                        )

        # Clean up the mol objects after property calculation
        self.mol_vault.clean_properties()



[docs]
    def _set_feature(
        self,
        conf_idx: int,
        mol: Chem.rdchem.Mol,
        atom_bond_idx: int,
        feature_type: str,
        feature_name: str,
        feature_value: Optional[Union[int, float, bool, str]],
        error_message: Optional[str],
        data_type: str,
    ) -> None:
        """Set a feature value for the specified atom or bond.

        The feature is stored as property of the respective RDKit atom or bond object.

        Parameters
        ----------
        conf_idx : int
            The index of the conformer in the molecule vault.
        mol : Chem.rdchem.Mol
            The RDKit molecule object within which the feature value should be set.
        atom_bond_idx : int
            The index of the atom or bond for which the feature value should be set.
        feature_type : str
            The type of the feature, either "atom" or "bond".
        feature_name : str
            The name of the feature for which the value should be set.
        feature_value : Optional[Union[int, float, bool, str]]
            The calculated feature value that should be set. If the feature calculation failed,
            this is ``None``.
        error_message : Optional[str]
            Any error message that occurred during feature calculation. If no error occurred, this
            is ``None``.
        data_type : str
            The expected data type of the feature value, either int, float, bool, or str.

        Returns
        -------
        None
        """
        # Log error message if not None
        if error_message is not None:
            logging.error(
                f"'{self._namespace}' | {self._loc}()\nFeature calculation for '{feature_name}' "
                f"of feature type '{feature_type}' (index = {atom_bond_idx}) and data type "
                f"'{data_type}' failed for conformer with index {conf_idx}: {error_message}."
            )
            return

        # Check if feature value is None
        if feature_value is None:
            logging.warning(
                f"'{self._namespace}' | {self._loc}()\nFeature calculation for '{feature_name}' "
                f"of feature type '{feature_type}' (index = {atom_bond_idx}) and data type "
                f"'{data_type}' terminated without error but the feature value is None for "
                f"conformer with index {conf_idx}."
            )
            return

        # Log if the feature calculation run successfully and the feature value is '_inaccessible'.
        # This is for example the case for certain vdW surface features if the atom does not
        # contribute to the vdW surface.
        _helper_bool = False
        try:
            _helper_bool = feature_value == "_inaccessible"
        except Exception:
            pass
        if _helper_bool is True and error_message is None:
            logging.warning(
                f"'{self._namespace}' | {self._loc}()\nFeature calculation for '{feature_name}' "
                f"of feature type '{feature_type}' (index = {atom_bond_idx}) and data type "
                f"'{data_type}' terminated without error but the feature value is '_inaccessible' "
                f"for conformer with index {conf_idx}. Probably the feature is not defined for "
                f"the requested {feature_type}.",
            )
            data_type = "str"

        # Check if correct data type is provided
        _obtained_data_type = type(feature_value).__name__
        if _obtained_data_type != data_type:
            _errmsg = (
                f"Data type '{_obtained_data_type}' of feature value '{feature_value}' associated "
                f"with feature '{feature_name}' calculated for {feature_type} with index "
                f"'{atom_bond_idx}' does not match the expected data type '{data_type}'."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise RuntimeError(f"{self._loc}(): {_errmsg}")

        # Get atom or bond object
        feature_type = self._check_is_str_in_list(
            parameter_name="feature_type",
            value=feature_type,
            allowed_values=FEATURE_TYPES,
        )
        if feature_type == "atom":
            obj = mol.GetAtomWithIdx(atom_bond_idx)
        if feature_type == "bond":
            obj = mol.GetBondWithIdx(atom_bond_idx)

        # Save result
        if data_type == "int":
            obj.SetIntProp(feature_name, feature_value)
        elif data_type == "float":
            obj.SetDoubleProp(feature_name, feature_value)
        elif data_type == "bool":
            obj.SetBoolProp(feature_name, feature_value)
        elif data_type == "str":
            obj.SetProp(feature_name, feature_value)
        else:
            _errmsg = (
                f"Data type {data_type} associated with '{feature_name}' is not supported. "
                "Supported feature types are int, float, bool, and str."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise RuntimeError(f"{self._loc}(): {_errmsg}")

        if feature_value != "_inaccessible":
            logging.info(
                f"'{self._namespace}' | {self._loc}()\nFeature calculation for '{feature_name}' "
                f"of feature type '{feature_type}' (index = {atom_bond_idx}) and data type "
                f"'{data_type}' successful for conformer with index {conf_idx}."
            )



[docs]
    def _return_features(
        self,
        feature_type: str,
        atom_bond_indices: Union[str, int, List[int]],
        output_format: str,
        reduce: bool,
        temperature: Union[int, float],
        ignore_invalid: bool,
    ) -> Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol]:
        """Return the calculated atom or bond features.

        The inputs are validated, it is checked that at least one atom/bond of any conformer
        in the molecule vault carries a property, the Boltzmann weights are refreshed if
        required, and the results are collected through ``FeatureOutput``.

        Parameters
        ----------
        feature_type : str
            The type of features to be returned, either "atom" or "bond".
        atom_bond_indices : Union[str, int, List[int]]
            The indices of the atoms or bonds for which features should be returned.
        output_format : str
            The name of the desired output format, can be "df", "dict", or "mol_object".
        reduce : bool
            If ``True``, the features are reduced to a set of single values per atom or bond
            across all conformers. If ``False``, the features are returned for each conformer
            separately.
        temperature : Union[int, float]
            The temperature in Kelvin at which the Boltzmann-weighted values are calculated.
            Must be greater than 0.
        ignore_invalid : bool
            Whether to ignore conformers that were labeled as invalid when calculating the
            features.

        Returns
        -------
        Union[pd.DataFrame, Dict[int, Dict[str, Any]], List[Chem.rdchem.Mol], Chem.rdchem.Mol]
            The atom or bond features in the desired output format.

        Raises
        ------
        ValueError
            If ``temperature`` is not greater than 0 (this includes NaN) or if no features of
            the requested type were found on any atom/bond of the molecule vault.

        Notes
        -----
        The type, index, and output format checks are delegated to helper methods of the
        class, which raise their own exceptions on invalid input.
        """
        _loggingmsg = (
            f"'{self._namespace}' | {self._loc}() | START\n"
            f"> 'feature_type':        {feature_type}\n"
            f"> '{feature_type}_indices':        {atom_bond_indices}\n"
            f"> 'output_format':       {output_format}\n"
            f"> 'reduce':              {reduce}\n"
        )

        # 'reduce' is not type-checked yet at this point. An identity comparison is safe for
        # any object, so no exception handling is required here.
        if reduce is True:
            _loggingmsg += (
                f"> 'temperature':         {temperature}\n"
                f"> 'ignore_invalid':      {ignore_invalid}\n"
            )
        _loggingmsg += "-----"
        logging.info(_loggingmsg)

        # Pre-checks
        self._check_is_initialized(error_message="returning features")

        # Check input types
        self._check_is_of_type(
            expected_type=str, value=output_format, parameter_name="output_format"
        )
        self._check_is_of_type(expected_type=bool, value=reduce, parameter_name="reduce")
        self._check_is_of_type(
            expected_type=[int, float], value=temperature, parameter_name="temperature"
        )
        self._check_is_of_type(
            expected_type=bool, value=ignore_invalid, parameter_name="ignore_invalid"
        )

        # Check provided temperature. The condition is written as 'not > 0' (instead of
        # '<= 0') so that NaN, for which every comparison is False, is rejected as well.
        if not temperature > 0:
            _errmsg = (
                "Invalid input to 'temperature': must be greater than 0 but "
                f"obtained {temperature}."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

        # Check provided output_format
        output_format = self._check_is_str_in_list(
            parameter_name="output_format", value=output_format, allowed_values=OUTPUT_TYPES
        )

        # Check provided indices and if features were calculated. any() stops at the first
        # atom/bond that carries a property instead of scanning all remaining conformers.
        _data_found = False
        if feature_type == "atom":
            atom_bond_indices_checked = self._check_atom_indices(atom_indices=atom_bond_indices)
            _data_found = any(
                atom.GetPropsAsDict() != {}
                for mol in self.mol_vault.mol_objects
                for atom in mol.GetAtoms()
            )
        elif feature_type == "bond":
            atom_bond_indices_checked = self._check_bond_indices(bond_indices=atom_bond_indices)
            _data_found = any(
                bond.GetPropsAsDict() != {}
                for mol in self.mol_vault.mol_objects
                for bond in mol.GetBonds()
            )

        # Check if any features were calculated. This also catches a feature type other than
        # "atom" or "bond", for which no indices were checked above.
        if _data_found is False:
            _errmsg = (
                f"No '{feature_type}' features were calculated yet. Run the "
                f"featurize_{feature_type}s() method before trying to return '{feature_type}' "
                "features."
            )
            logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
            raise ValueError(f"{self._loc}(): {_errmsg}")

        logging.info(
            f"'{self._namespace}' | {self._loc}()\nFetching '{feature_type}' features from "
            "molecule vault."
        )

        # Update the Boltzmann weights before getting the features (if required): only for a
        # 3D ensemble with more than one conformer, with energies read, and reduce=True.
        assert isinstance(self.mol_vault.size, int)  # for type checker
        if (
            self.mol_vault.dimensionality == "3D"
            and self.mol_vault.energies_n_read is True
            and self.mol_vault.size > 1
            and reduce is True
        ):
            self.mol_vault.update_boltzmann_weights(
                temperature=temperature,
                ignore_invalid=ignore_invalid,
            )

        # Get features
        fout = FeatureOutput(
            mol_vault=self.mol_vault,
            indices=atom_bond_indices_checked,
            feature_type=feature_type,
            reduce=reduce,
            ignore_invalid=ignore_invalid,
            _loc=self._loc,
        )
        res = fout.get_results(output_format=output_format)

        logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")
        return res



[docs]
    def _clear_feature_cache(
        self, feature_type: str, origin: Optional[Union[str, List[str]]]
    ) -> None:
        """Clear the atom or bond feature cache of the molecule vault.

        The ``_functional_groups_smarts`` attribute is reset to an empty dictionary on every
        call, independent of ``feature_type`` and ``origin``.

        Parameters
        ----------
        feature_type : str
            The type of the feature(s) to be cleared, either "atom" or "bond".
        origin : Optional[Union[str, List[str]]]
            The name or a list of the names of the program(s) of the feature(s) to be cleared
            (e.g., "rdkit", "xtb"). If ``None``, all features of the specified type are cleared.
            Every name is standardized and must be a key of the feature configuration.

        Returns
        -------
        None

        Raises
        ------
        TypeError
            If ``origin`` is neither ``None``, a string, nor a list containing only strings.
        """
        logging.info(
            f"'{self._namespace}' | {self._loc}() | START\n"
            f"> 'origin':        {origin}\n"
            f"> 'feature_type':  {feature_type}\n"
            "-----"
        )

        if origin is not None:
            # Check input types. isinstance() is used so that subclasses of str and list are
            # accepted as well.
            if isinstance(origin, str):
                origin = [standardize_string(inp_data=origin)]
            elif isinstance(origin, list) and all(isinstance(o, str) for o in origin):
                origin = [standardize_string(inp_data=o) for o in origin]
            else:
                _errmsg = (
                    "Invalid input to 'origin': must be either None, of type str, or a list of "
                    f"strings but obtained {type(origin).__name__}."
                )
                logging.error(f"'{self._namespace}' | {self._loc}()\n{_errmsg}")
                raise TypeError(f"{self._loc}(): {_errmsg}")

            # Check if provided origins are valid
            _valid_origins = list(self._feature_config.keys())
            for o in origin:
                self._check_is_str_in_list(
                    parameter_name="origin", value=o, allowed_values=_valid_origins
                )

        # Clear feature cache
        self.mol_vault.clear_feature_cache_(feature_type=feature_type, origins=origin)

        # Also reset _functional_groups_smarts to allow the addition of new functional groups
        self._functional_groups_smarts = {}

        logging.info(f"'{self._namespace}' | {self._loc}() | DONE\n")