Source code for bonafide.features.alfabet_

"""Bond dissociation energy features from ``ALFABET``."""

import logging
from typing import Dict

import pandas as pd

from bonafide.utils.base_featurizer import BaseFeaturizer
from bonafide.utils.driver import external_driver
from bonafide.utils.helper_functions import get_function_or_method_name
from bonafide.utils.helper_functions_chemistry import get_atom_bond_mapping_dicts



[docs]
class _Alfabet2DBond(BaseFeaturizer):
    """Parent feature factory for the 2D bond ALFABET features.

    The bond dissociation energy and the bond dissociation free energy are both obtained from
    a single ALFABET prediction, which is why the two feature classes share this parent.

    For details, please refer to the ALFABET repository (https://github.com/NREL/alfabet,
    last accessed on 09.09.2025).
    """

    # Path to the Python interpreter that is passed to the external driver to execute the
    # ALFABET helper script.
    python_interpreter_path: str

    def __init__(self) -> None:
        # NOTE(review): ``extraction_mode`` is assigned before the base-class initializer is
        # called; presumably ``BaseFeaturizer.__init__`` relies on it - confirm before
        # reordering these two statements.
        self.extraction_mode = "multi"
        super().__init__()


[docs]
    def calculate(self) -> None:
        """Calculate the ``alfabet2D-bond-bond_dissociation_energy`` and
        ``alfabet2D-bond-bond_dissociation_free_energy`` feature.

        ALFABET is run in its separate Python environment through a helper script that is
        handed to the external driver together with the configured Python interpreter. This was
        necessary because ALFABET was not compatible with BONAFIDE's python environment.

        Returns
        -------
        None
            If the helper script exits with a non-zero return code, the return code and the
            captured stderr are stored in ``self._err`` and no results are read. Otherwise the
            ALFABET output file is parsed by ``_read_output_file()``.
        """
        # Get the canonical SMILES string and the bond mapping dictionary to ensure that ALFABET
        # is run with the canonical SMILES string to avoid potential issues with different
        # atom/bond orderings.
        _, mapping_dict_bonds, canonical_smiles = get_atom_bond_mapping_dicts(self.mol)

        # Name of the CSV file written by the helper script (read back in _read_output_file)
        output_file_name = f"Alfabet2DBond_{self.conformer_name}.csv"

        # Python script for ALFABET. The SMILES string and the file name are embedded through
        # repr() so that backslashes (e.g., cis/trans bond markers in SMILES) and quotes always
        # result in a valid string literal denoting exactly the intended value.
        alfabet_script_str = "\n".join(
            [
                "import pandas as pd",
                "import os",
                "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'",
                "from alfabet import model",
                f"df = model.predict([{repr(str(canonical_smiles))}])",
                f"df.to_csv({repr(output_file_name)}, index=False)",
            ]
        )

        # Run ALFABET; the namespace is the conformer name without its last "__"-separated part
        res = external_driver(
            program_path=self.python_interpreter_path,
            program_input=alfabet_script_str,
            input_file_extension=".py",
            namespace=self.conformer_name.rsplit("__", 1)[0],
            dependencies=["pandas", "alfabet"],
            capture_output=True,
            text=True,
            check=False,
        )

        # Check for errors
        if res.returncode != 0:
            self._err = f"returncode: {res.returncode}, stderr: {res.stderr}"
            return

        # Save the results
        self._read_output_file(mapping_dict=mapping_dict_bonds)



[docs]
    def _read_output_file(self, mapping_dict: Dict[int, int]) -> None:
        """Read the ALFABET output CSV file and write the results to the results dictionary.

        Only the bonds that can be predicted by ALFABET will have an entry in the file. If
        molecules with no hydrogen atoms added are passed to BONAFIDE, the X-H dissociation
        energies still will be predicted by ALFABET, but the results will not appear in the final
        BONAFIDE output, as the bonds do not exist in the actual input molecule. Add hydrogen atoms
        to the molecule before passing it to BONAFIDE to avoid this.

        Rows that ALFABET flags as invalid trigger a warning but are still written to the
        results.

        Parameters
        ----------
        mapping_dict : Dict[int, int]
            The mapping dictionary to map the bond indices from the canonical SMILES string to the
            bond indices of the input molecule. This is included as a safeguard to ensure that
            the bond indices are handled correctly. Bond indices without an entry are skipped.

        Returns
        -------
        None
        """
        _loc = f"{self.__class__.__name__}.{get_function_or_method_name()}"

        # Read the output file
        df = pd.read_csv(f"Alfabet2DBond_{self.conformer_name}.csv")

        # Get the data and write it to the results dictionary
        for _, row_data in df.iterrows():
            bond_idx = int(row_data["bond_index"])
            bde = row_data["bde_pred"]
            bdfe = row_data["bdfe_pred"]
            valid = row_data["is_valid"]

            # Depending on the row dtype, pandas hands back either a built-in bool or a
            # numpy.bool_; an identity comparison with ``False`` would miss the latter, so the
            # truth value is tested instead. Missing values are not treated as invalid.
            if pd.notna(valid) and not bool(valid):
                _namespace = self.conformer_name.rsplit("__", 1)[0]
                logging.warning(
                    f"'{_namespace}' | {_loc}()\nPrediction of the bond dissociation (free) "
                    f"energy with ALFABET for bond with index {bond_idx} was labeled as invalid. "
                    "Check your input and the output."
                )

            if bond_idx in mapping_dict:
                self.results[mapping_dict[bond_idx]] = {
                    "alfabet2D-bond-bond_dissociation_energy": bde,
                    "alfabet2D-bond-bond_dissociation_free_energy": bdfe,
                }





[docs]
class Alfabet2DBondBondDissociationEnergy(_Alfabet2DBond):
    """Feature factory for the 2D bond feature "bond_dissociation_energy", calculated with
    ALFABET.

    The index of this feature is 0 (see the ``list_atom_features()`` and
    ``list_bond_features()`` method). The corresponding configuration settings can be found
    under "alfabet" in the _feature_config.toml file.

    The class adds no logic of its own: the value is written under the
    ``alfabet2D-bond-bond_dissociation_energy`` key by the inherited ``calculate()`` method.
    """

    def __init__(self) -> None:
        super().__init__()


    # No ``calculate()`` override is needed: this feature is automatically calculated in
    # _Alfabet2DBond



[docs]
class Alfabet2DBondBondDissociationFreeEnergy(_Alfabet2DBond):
    """Feature factory for the 2D bond feature "bond_dissociation_free_energy", calculated with
    ALFABET.

    The index of this feature is 1 (see the ``list_atom_features()`` and
    ``list_bond_features()`` method). The corresponding configuration settings can be found
    under "alfabet" in the _feature_config.toml file.

    The class adds no logic of its own: the value is written under the
    ``alfabet2D-bond-bond_dissociation_free_energy`` key by the inherited ``calculate()``
    method.
    """

    def __init__(self) -> None:
        super().__init__()


    # No ``calculate()`` override is needed: this feature is automatically calculated in
    # _Alfabet2DBond