Source code for bonafide.utils.io_

"""Utility functions for input/output operations."""

from __future__ import annotations

import re
from typing import TYPE_CHECKING, List, Optional, Tuple

from rdkit import Chem

from bonafide.utils.constants import EH_TO_KJ_MOL, ELEMENT_SYMBOLS, ENERGY_UNITS, KCAL_MOL_TO_KJ_MOL

if TYPE_CHECKING:
    import numpy as np
    from numpy.typing import NDArray



[docs]
def read_mol_object(
    mol: Chem.rdchem.Mol,
) -> Tuple[Chem.rdchem.Mol, List[Chem.rdchem.Mol], Optional[str]]:
    """Process an RDKit molecule object for incorporation into a molecule vault.

    The conformer molecule-level properties are moved to properties of the processed molecule
    objects. The mol objects are not sanitized.

    Parameters
    ----------
    mol : Chem.rdchem.Mol
        The RDKit molecule object to be processed. It can contain one or more conformers.

    Returns
    -------
    Tuple[Chem.rdchem.Mol, List[Chem.rdchem.Mol], Optional[str]]
        A tuple containing:

        * The initial input RDKit molecule object.
        * A list of RDKit molecule objects, each containing one conformer of the input molecule.
        * An error message if the input molecule object is not valid, otherwise ``None``.
    """
    _errmsg = None

    # Ensure validity of RDKit mol object
    # Sanitization is not performed as this might modify the user mol object in an undesired way.
    # Instead, the user is responsible to ensure that the input mol object is valid.
    if type(mol) != Chem.rdchem.Mol:
        _errmsg = "the input value is not a valid RDKit mol object"
        return mol, [], _errmsg

    if mol.GetNumConformers() < 1:
        return mol, [Chem.Mol(mol)], _errmsg

    # Generate a separate mol object for each conformer
    mols = []
    props = []
    for idx, conf in enumerate(mol.GetConformers()):
        if conf.Is3D() is False:
            _errmsg = f"the conformer with index {idx} is 2D; only 3D conformers are supported"
            return mol, [], _errmsg

        mol_ = Chem.Mol(mol)
        mol_.RemoveAllConformers()
        mol_.AddConformer(conf=conf, assignId=True)
        mols.append(mol_)
        props.append(conf.GetPropsAsDict())

    # Move conformer properties to mol object level
    for prop, m in zip(props, mols):
        for prop_name, value in prop.items():
            dtype = type(value).__name__

            if "int" in dtype:
                m.SetIntProp(prop_name, int(value))
            elif "float" in dtype:
                m.SetDoubleProp(prop_name, float(value))
            elif dtype == "str":
                m.SetProp(prop_name, value)
            elif dtype == "bool":
                m.SetBoolProp(prop_name, value)

    return mol, mols, _errmsg




[docs]
def read_smiles(smiles: str) -> Tuple[Optional[Chem.rdchem.Mol], Optional[str]]:
    """Read a SMILES string and return an RDKit molecule object and an error message
    (``None`` if no error occurs).

    Explicit hydrogen atoms are not removed. Sanitization is performed.

    Parameters
    ----------
    smiles : str
        The SMILES string of a molecule.

    Returns
    -------
    Tuple[Optional[Chem.rdchem.Mol], Optional[str]]
        A tuple containing:

        * An RDKit molecule object if the SMILES string could be parsed, otherwise ``None``.
        * An error message if the SMILES string could not be parsed or sanitized, otherwise
          ``None``.
    """
    _errmsg = None
    mol = None

    # Set params such that hydrogens are not removed and sanitization is performed
    params = Chem.SmilesParserParams()
    params.removeHs = False
    params.sanitize = True

    # Generate mol object
    try:
        mol = Chem.MolFromSmiles(SMILES=smiles, params=params)
    except Exception as e:
        _errmsg = f"Generation of RDKit mol object from SMILES string failed: {e}."
        return mol, _errmsg

    if mol is None:
        _errmsg = "Generation of RDKit mol object from SMILES string failed as it resulted in None."

    return mol, _errmsg




[docs]
def read_smarts(smarts: str) -> Tuple[Optional[Chem.rdchem.Mol], Optional[str]]:
    """Read a SMARTS pattern and return an RDKit molecule object and an error message
    (``None`` if no error).

    Parameters
    ----------
    smarts : str
        The SMARTS pattern.

    Returns
    -------
    Tuple[Optional[Chem.rdchem.Mol], Optional[str]]
        A tuple containing:

        * An RDKit molecule object if the SMARTS pattern could be parsed, otherwise ``None``.
        * An error message if the SMARTS pattern could not be parsed, otherwise ``None``.
    """
    _errmsg = None
    mol = None

    # Check if SMARTS string is empty
    smarts = smarts.strip()
    if smarts == "":
        _errmsg = "the SMARTS string must not be empty"
        return mol, _errmsg

    # Try to generate RDKit mol object
    try:
        mol = Chem.MolFromSmarts(SMARTS=smarts)
    except Exception as e:
        _errmsg = f"generation of RDKit mol object failed for SMARTS string '{smarts}': {e}"
        return mol, _errmsg

    # Check if mol is None
    if mol is None:
        _errmsg = (
            f"generation of RDKit mol object failed for SMARTS string '{smarts}' as it "
            "resulted in None"
        )

    return mol, _errmsg




[docs]
def _validate_xyz(
    file_lines: List[str], number_of_atoms: int
) -> Tuple[List[str], List[str], Optional[str]]:
    """Validate the individual lines of an XYZ file with one or more conformers.

    The following points are ensured:

    * The first line of each structure block contains only a valid integer specifying the number of
      atoms in the block.
    * The number of atoms specified in the first line of each block matches the number of atoms
      specified in the first line of the first block.
    * Each atom line contains exactly one valid element symbol and three valid cartesian
      coordinates (x, y, z) that can be converted to floats.
    * The number of atom lines in each block matches the number of atoms specified in the first
      line of the file.
    * The elements in each block are identical and in the same order as found in the first
      structure block.

    Please note: These checks are not exhaustive and beyond them the user is responsible to ensure
    that the individual structure blocks represent conformers of the same molecule.

    Parameters
    ----------
    file_lines : List[str]
        The individual lines of the XYZ file.
    number_of_atoms : int
        The number of atoms in the molecule as defined by the first line of the XYZ file.

    Returns
    -------
    Tuple[List[str], List[str], Optional[str]]
        A tuple containing:

        * A list of the comment lines of each conformer block.
        * A list of strings, each string representing one conformer's atom lines.
        * An error message if the file lines are not valid, otherwise ``None``.
    """
    _errmsg = None

    # Get individual atom blocks and check if they are valid
    comment_line_in_blocks: List[str] = []
    elements = None
    atom_lines_in_blocks: List[str] = []

    for block_idx, start_line_idx in enumerate(range(0, len(file_lines), number_of_atoms + 2)):
        # Check if first line in block is a valid number of atoms line
        try:
            n_atoms_in_block = int(file_lines[start_line_idx])
        except Exception as e:
            _errmsg = (
                f"the first line of XYZ block with index {block_idx} is not a single valid "
                f"integer specifying the number of atoms in the molecule: {e}."
            )
            return comment_line_in_blocks, atom_lines_in_blocks, _errmsg

        # Check if number of atoms in block matches number of atoms in first block
        if n_atoms_in_block != number_of_atoms:
            _errmsg = (
                f"the number of atoms specified in the first line of XYZ block with index "
                f"{block_idx} ({n_atoms_in_block}) does not match the number of atoms "
                f"specified in the XYZ block with index 0 ({number_of_atoms})"
            )
            return comment_line_in_blocks, atom_lines_in_blocks, _errmsg

        # Add comment line of block to list of all comment lines
        comment_line_in_blocks.append(file_lines[start_line_idx + 1])

        # Loop over individual atom lines in block (skip first two lines)
        atom_lines_in_block = []
        elements_in_block = []
        for line in file_lines[start_line_idx + 2 : start_line_idx + 2 + number_of_atoms]:
            splitted = line.split()

            # Check file format (element symbol + 3 coordinates per line)
            if len(splitted) != 4:
                _errmsg = (
                    f"a line in xzy block with index {block_idx} does not contain "
                    "exactly one element symbol and three cartesian coordinates (x, y, z)"
                )
                return comment_line_in_blocks, atom_lines_in_blocks, _errmsg

            # Check if element symbol is valid
            if splitted[0] not in ELEMENT_SYMBOLS:
                _errmsg = (
                    f"element symbol '{splitted[0]}' in xzy block with index "
                    f"{block_idx} is not a valid element symbol"
                )
                return comment_line_in_blocks, atom_lines_in_blocks, _errmsg

            # Check if coordinates are valid floats
            try:
                for coord in splitted[1:]:
                    float(coord)
            except Exception as e:
                _errmsg = (
                    f"one of the coordinates in xzy block with index {block_idx} is not a "
                    f"valid float: {e}."
                )
                return comment_line_in_blocks, atom_lines_in_blocks, _errmsg

            # Add atom line and element
            atom_lines_in_block.append(line)
            elements_in_block.append(splitted[0])

        # Check if number of atom lines matches number of atoms
        if len(atom_lines_in_block) != number_of_atoms:
            _errmsg = (
                f"the number of atom lines in XYZ block with index {block_idx} "
                f"({len(atom_lines_in_block)}) does not match the number of atoms of the molecule "
                f"specified in the first line of the file ({number_of_atoms})"
            )
            return comment_line_in_blocks, atom_lines_in_blocks, _errmsg

        # Check if elements are consistent for all conformers
        if elements is not None:
            if elements != elements_in_block:
                _errmsg = (
                    f"the elements in the XYZ block of conformer with index {block_idx} are not "
                    "identical and/or in the same order as found in the conformer with index 0"
                )
                return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
        else:
            elements = elements_in_block

        # Add atom lines of block to list of all blocks
        atom_lines_in_blocks.append("".join(atom_lines_in_block))

    return comment_line_in_blocks, atom_lines_in_blocks, _errmsg




[docs]
def read_xyz_file(file_path: str) -> Tuple[Optional[List[str]], Optional[str]]:
    """Read an XYZ file with one or more conformers and validate its content.

    The first line of each conformer block contains the number of atoms, the second line is a
    comment line, and the subsequent lines contain the atom symbols and their cartesian coordinates
    (in Angstrom). The individual conformers cannot be separated by empty lines. The file content
    is validated (see ``_validate_xyz()`` for details).

    Parameters
    ----------
    file_path : str
        The path to the XYZ file.

    Returns
    -------
    Tuple[Optional[List[str]], Optional[str]]
        A tuple containing:

        * A list of strings, each representing one conformer's XYZ block.
        * An error message if the file could not be read or is not valid, otherwise ``None``.
    """
    _errmsg = None
    xyz_blocks = None

    # Ensure correct file extension (just for double-checking)
    if not file_path.endswith(".xyz"):
        _errmsg = "the file does not have the expected .xyz file extension"
        return xyz_blocks, _errmsg

    # Try to open the file
    try:
        with open(file_path, "r") as f:
            lines = f.readlines()
    except Exception as e:
        _errmsg = f"opening of the file failed: {e}"
        return xyz_blocks, _errmsg

    # Strip away empty lines at the end of the file
    while len(lines) != 0 and lines[-1].strip() == "":
        lines.pop(-1)

    # Check if file is empty
    if len(lines) == 0:
        _errmsg = "the file is empty or only contains empty lines"
        return xyz_blocks, _errmsg

    # Try to get the number of atoms of the first conformer
    try:
        n_atoms = int(lines[0])
    except Exception:
        _errmsg = (
            "the first line of the file is not a single valid integer defining the number "
            "of atoms in the molecule"
        )
        return xyz_blocks, _errmsg

    # Check if at least as many lines as atoms + 2 are present
    if len(lines) < n_atoms + 2:
        _errmsg = (
            f"the file contains fewer non-empty lines ({len(lines)}) than required for a single "
            f"structure block with {n_atoms} atoms"
        )
        return xyz_blocks, _errmsg

    # Validate the input file
    comment_line_in_blocks, atom_lines_in_blocks, _errmsg = _validate_xyz(
        file_lines=lines, number_of_atoms=n_atoms
    )
    if _errmsg is not None:
        return xyz_blocks, _errmsg

    # Format XYZ blocks
    try:
        xyz_blocks = []
        for comment, atom_lines in zip(comment_line_in_blocks, atom_lines_in_blocks):
            xyz_block = f"{n_atoms}\n{comment}{atom_lines}"
            xyz_blocks.append(xyz_block)
    except Exception as e:
        _errmsg = (
            "the file was successfully read and validated, but the "
            f"final formatting of the individual XYZ blocks failed: {e}"
        )

    return xyz_blocks, _errmsg




[docs]
def _validate_sdf(sdf_mols: List[Optional[Chem.rdchem.Mol]]) -> Tuple[Optional[str], Optional[str]]:
    """Validate the individual RDKit molecule objects generated from an SD file with one or more
    conformers.

    The following points are ensured:

    * All conformers could be successfully converted to RDKit molecule objects that are not
      ``None``.
    * All elements in the conformers represent valid element symbols.
    * All conformers represent the same molecule (checked by comparing their SMILES string and
      chemical element symbols). Stereochemistry is not considered for this check, but a warning
      is issued if the conformers have different stereochemical information.
    * All conformers possess 3D coordinates.

    Parameters
    ----------
    sdf_mols : List[Optional[Chem.rdchem.Mol]]
        A list of RDKit molecule objects generated from the SD file (see the ``read_sd_file()``
        function). ``None`` can be present in the list if individual conformers could not be
        parsed.

    Returns
    -------
    Tuple[Optional[str], Optional[str]]
        A tuple containing:

        * An error message if the molecule objects are not valid, otherwise ``None``.
        * A warning message if the conformers have different stereochemical information, otherwise
          ``None``.
    """
    _errmsg = None
    _stereomsg = None
    _stereowarn = []

    _smiles = None
    _smiles_no_stereo = None
    _elements = None

    for idx, mol in enumerate(sdf_mols):
        # Check if mol is None
        if mol is None:
            _errmsg = (
                f"generation of RDKit mol object from SDF block failed for conformer "
                f"with index {idx} as it resulted in None"
            )
            return _errmsg, _stereomsg

        # Validate elements
        els = [atom.GetSymbol() for atom in mol.GetAtoms()]
        for el in els:
            if el not in ELEMENT_SYMBOLS:
                _errmsg = (
                    f"element symbol '{el}' in SDF block with index {idx} is not a valid "
                    "element symbol"
                )
                return _errmsg, _stereomsg

        # Compare conformers by elements and SMILES
        # The try/except block is necessary as the base mol objects are not sanitized and might
        # contain issues that cause the generation of the SMILES string to fail.
        try:
            smi = Chem.MolToSmiles(mol)
            smi_no_stereo = Chem.MolToSmiles(mol, isomericSmiles=False)
        except Exception as e:
            _errmsg = (
                "the generation of the SMILES string from the RDKit mol object for conformer "
                f"validation failed for conformer with index {idx}: {e}"
            )
            return _errmsg, _stereomsg

        if _smiles is None:
            _smiles = smi
            _smiles_no_stereo = smi_no_stereo
            _elements = els
        else:
            # SMILES
            if smi_no_stereo != _smiles_no_stereo:
                _errmsg = (
                    f"the generated SMILES string of the conformer with index {idx} "
                    f"('{smi_no_stereo}') does not match the SMILES string of the conformer with "
                    f"index 0 ('{_smiles_no_stereo}'); stereochemical information excluded"
                )
                return _errmsg, _stereomsg

            if smi_no_stereo == _smiles_no_stereo and smi != _smiles:
                _stereowarn.append(idx)

            # Elements
            if els != _elements:
                _errmsg = (
                    f"the elements of the conformer with index {idx} are not identical "
                    "and/or in the same order as found in the conformer with index 0"
                )
                return _errmsg, _stereomsg

        # Ensure that conformer is 3D
        if mol.GetConformer().Is3D() is False:
            _errmsg = (
                f"the conformer with index {idx} is not 3D. Only SD files containing 3D "
                "information are supported"
            )
            return _errmsg, _stereomsg

    # Format stereochemistry warning message if applicable
    if len(_stereowarn) > 0:
        _stereomsg = (
            f"File validation: the conformers with the following indices have different "
            "stereochemical information than the conformer with index 0 (but identical atom "
            f"connectivity): {_stereowarn}. Ensure that this is intended."
        )

    return _errmsg, _stereomsg




[docs]
def read_sd_file(
    file_path: str,
) -> Tuple[Optional[List[Chem.rdchem.Mol]], Optional[str], Optional[str]]:
    """Read an SD file with one or more conformers.

    Explicit hydrogen atoms are not removed. Sanitization is not performed at this stage. Instead,
    it is done for every conformer separately in
    ``bonafide.utils.molecule_vault.MolVault.initialize_mol``. The file must comply with the SD
    file format (see https://en.wikipedia.org/wiki/Chemical_table_file, last accessed on
    23.09.2025).

    Parameters
    ----------
    file_path : str
        The path to the SD file.

    Returns
    -------
    Tuple[Optional[List[Chem.rdchem.Mol]], Optional[str], Optional[str]]
        A tuple containing:

        * A list of RDKit molecule objects if the file could be read and validated, otherwise
          ``None``.
        * An error message if the file could not be read or is not valid, otherwise ``None``.
        * A warning message if the conformers have different stereochemical information, otherwise
          ``None``.
    """
    sdf_mols = None
    _errmsg = None
    _stereomsg = None

    # Ensure correct file extension (just for double-checking)
    if not file_path.endswith(".sdf"):
        _errmsg = "the file does not have the expected .sdf file extension"
        return sdf_mols, _errmsg, _stereomsg

    # Try to open the file
    try:
        suppl = Chem.SDMolSupplier(fileName=file_path, sanitize=False, removeHs=False)
    except Exception as e:
        _errmsg = f"opening of the file failed: {e}"
        return sdf_mols, _errmsg, _stereomsg

    # Get mol objects
    sdf_mols = [mol for mol in suppl]

    # Validate the mol objects
    _errmsg, _stereomsg = _validate_sdf(sdf_mols)
    if _errmsg is not None:
        _errmsg = f"validation of the file failed: {_errmsg}"
        sdf_mols = None

    return sdf_mols, _errmsg, _stereomsg




[docs]
def write_sd_file(mol: Chem.rdchem.Mol, file_path: str) -> None:
    """Write an SD file from an RDKit mol object.

    Parameters
    ----------
    mol : Chem.rdchem.Mol
        An RDKit molecule object.
    file_path : str
        The path to the file the data is written to.

    Returns
    -------
    None
    """
    with Chem.SDWriter(file_path) as writer:
        writer.write(mol)




[docs]
def extract_energy_from_string(
    line: str,
) -> Tuple[Optional[float], Optional[str], Optional[float], Optional[str]]:
    """Read the energy and its unit from a string and convert it to kJ/mol.

    Supported energy units are: kcal/mol, kJ/mol, and Eh (Hartree).

    Parameters
    ----------
    line : str
        A string containing the energy value and its unit.

    Returns
    -------
    Tuple[Optional[float], Optional[str], Optional[float], Optional[str]]
        A tuple containing:

        * The energy value as submitted if found (or ``None`` if no valid energy is found)
        * The unit as submitted if found (or ``None`` if no valid unit is found)
        * The energy value converted to kJ/mol (or ``None`` if no valid energy is found)
        * An error message (``None`` if no error occurred).
    """
    # Energy conversions
    _conversions = {
        "kj_mol": 1,
        "kcal_mol": KCAL_MOL_TO_KJ_MOL,
        "eh": EH_TO_KJ_MOL,
    }

    _line = line
    line = str(line).lower()

    energy_as_submitted = None
    unit_as_submitted = None
    energy = None
    _errmsg = None

    # Search for energy value and unit in string
    for base_unit, symbol_list in ENERGY_UNITS.items():
        for symbol in symbol_list:
            pattern = (
                rf"([-+]?(?:\d{{1,3}}(?:,\d{{3}})*|\d+)(?:\.\d+)?(?:[eE][-+]?\d+)?)\s*{symbol}"
            )
            matches = re.findall(pattern, line)

            if len(matches) > 0:
                energy_as_submitted = matches[0]
                energy_as_submitted = float(energy_as_submitted.replace(",", ""))
                unit_as_submitted = base_unit
                break

    # Check if data was found and convert to kJ/mol
    if energy_as_submitted is not None and unit_as_submitted is not None:
        energy_as_submitted = float(energy_as_submitted)
        energy = float(energy_as_submitted * _conversions[unit_as_submitted])
    else:
        _errmsg = (
            f"no valid energy value with a supported unit (kJ/mol, kcal/mol, Eh) could be "
            f"extracted from the string '{_line}'"
        )

    return energy_as_submitted, unit_as_submitted, energy, _errmsg




[docs]
def write_xyz_file_from_coordinates_array(
    elements: NDArray[np.str_],
    coordinates: NDArray[np.float64],
    file_path: str,
) -> None:
    """Write a list of elements and their coordinates to an XYZ file.

    Parameters
    ----------
    elements : NDArray[np.str\_]
        The element symbols of the molecule.
    coordinates : NDArray[np.float64]
        The cartesian coordinates of the structure.
    file_path : str
        The path to the output XYZ file.

    Returns
    -------
    None
    """
    # Format file content
    xyz = f"{len(elements)}\n\n"
    for el, coord in zip(elements, coordinates):
        xyz += f"{el}    {coord[0]}    {coord[1]}    {coord[2]}\n"

    # Write file
    with open(file_path, "w") as f:
        f.write(xyz)