"""Utility functions for input/output operations."""
from __future__ import annotations
import re
from typing import TYPE_CHECKING, List, Optional, Tuple
from rdkit import Chem
from bonafide.utils.constants import EH_TO_KJ_MOL, ELEMENT_SYMBOLS, ENERGY_UNITS, KCAL_MOL_TO_KJ_MOL
if TYPE_CHECKING:
import numpy as np
from numpy.typing import NDArray
[docs]
def read_mol_object(
mol: Chem.rdchem.Mol,
) -> Tuple[Chem.rdchem.Mol, List[Chem.rdchem.Mol], Optional[str]]:
"""Process an RDKit molecule object for incorporation into a molecule vault.
The conformer molecule-level properties are moved to properties of the processed molecule
objects. The mol objects are not sanitized.
Parameters
----------
mol : Chem.rdchem.Mol
The RDKit molecule object to be processed. It can contain one or more conformers.
Returns
-------
Tuple[Chem.rdchem.Mol, List[Chem.rdchem.Mol], Optional[str]]
A tuple containing:
* The initial input RDKit molecule object.
* A list of RDKit molecule objects, each containing one conformer of the input molecule.
* An error message if the input molecule object is not valid, otherwise ``None``.
"""
_errmsg = None
# Ensure validity of RDKit mol object
# Sanitization is not performed as this might modify the user mol object in an undesired way.
# Instead, the user is responsible to ensure that the input mol object is valid.
if type(mol) != Chem.rdchem.Mol:
_errmsg = "the input value is not a valid RDKit mol object"
return mol, [], _errmsg
if mol.GetNumConformers() < 1:
return mol, [Chem.Mol(mol)], _errmsg
# Generate a separate mol object for each conformer
mols = []
props = []
for idx, conf in enumerate(mol.GetConformers()):
if conf.Is3D() is False:
_errmsg = f"the conformer with index {idx} is 2D; only 3D conformers are supported"
return mol, [], _errmsg
mol_ = Chem.Mol(mol)
mol_.RemoveAllConformers()
mol_.AddConformer(conf=conf, assignId=True)
mols.append(mol_)
props.append(conf.GetPropsAsDict())
# Move conformer properties to mol object level
for prop, m in zip(props, mols):
for prop_name, value in prop.items():
dtype = type(value).__name__
if "int" in dtype:
m.SetIntProp(prop_name, int(value))
elif "float" in dtype:
m.SetDoubleProp(prop_name, float(value))
elif dtype == "str":
m.SetProp(prop_name, value)
elif dtype == "bool":
m.SetBoolProp(prop_name, value)
return mol, mols, _errmsg
[docs]
def read_smiles(smiles: str) -> Tuple[Optional[Chem.rdchem.Mol], Optional[str]]:
"""Read a SMILES string and return an RDKit molecule object and an error message
(``None`` if no error occurs).
Explicit hydrogen atoms are not removed. Sanitization is performed.
Parameters
----------
smiles : str
The SMILES string of a molecule.
Returns
-------
Tuple[Optional[Chem.rdchem.Mol], Optional[str]]
A tuple containing:
* An RDKit molecule object if the SMILES string could be parsed, otherwise ``None``.
* An error message if the SMILES string could not be parsed or sanitized, otherwise
``None``.
"""
_errmsg = None
mol = None
# Set params such that hydrogens are not removed and sanitization is performed
params = Chem.SmilesParserParams()
params.removeHs = False
params.sanitize = True
# Generate mol object
try:
mol = Chem.MolFromSmiles(SMILES=smiles, params=params)
except Exception as e:
_errmsg = f"Generation of RDKit mol object from SMILES string failed: {e}."
return mol, _errmsg
if mol is None:
_errmsg = "Generation of RDKit mol object from SMILES string failed as it resulted in None."
return mol, _errmsg
[docs]
def read_smarts(smarts: str) -> Tuple[Optional[Chem.rdchem.Mol], Optional[str]]:
"""Read a SMARTS pattern and return an RDKit molecule object and an error message
(``None`` if no error).
Parameters
----------
smarts : str
The SMARTS pattern.
Returns
-------
Tuple[Optional[Chem.rdchem.Mol], Optional[str]]
A tuple containing:
* An RDKit molecule object if the SMARTS pattern could be parsed, otherwise ``None``.
* An error message if the SMARTS pattern could not be parsed, otherwise ``None``.
"""
_errmsg = None
mol = None
# Check if SMARTS string is empty
smarts = smarts.strip()
if smarts == "":
_errmsg = "the SMARTS string must not be empty"
return mol, _errmsg
# Try to generate RDKit mol object
try:
mol = Chem.MolFromSmarts(SMARTS=smarts)
except Exception as e:
_errmsg = f"generation of RDKit mol object failed for SMARTS string '{smarts}': {e}"
return mol, _errmsg
# Check if mol is None
if mol is None:
_errmsg = (
f"generation of RDKit mol object failed for SMARTS string '{smarts}' as it "
"resulted in None"
)
return mol, _errmsg
[docs]
def _validate_xyz(
file_lines: List[str], number_of_atoms: int
) -> Tuple[List[str], List[str], Optional[str]]:
"""Validate the individual lines of an XYZ file with one or more conformers.
The following points are ensured:
* The first line of each structure block contains only a valid integer specifying the number of
atoms in the block.
* The number of atoms specified in the first line of each block matches the number of atoms
specified in the first line of the first block.
* Each atom line contains exactly one valid element symbol and three valid cartesian
coordinates (x, y, z) that can be converted to floats.
* The number of atom lines in each block matches the number of atoms specified in the first
line of the file.
* The elements in each block are identical and in the same order as found in the first
structure block.
Please note: These checks are not exhaustive and beyond them the user is responsible to ensure
that the individual structure blocks represent conformers of the same molecule.
Parameters
----------
file_lines : List[str]
The individual lines of the XYZ file.
number_of_atoms : int
The number of atoms in the molecule as defined by the first line of the XYZ file.
Returns
-------
Tuple[List[str], List[str], Optional[str]]
A tuple containing:
* A list of the comment lines of each conformer block.
* A list of strings, each string representing one conformer's atom lines.
* An error message if the file lines are not valid, otherwise ``None``.
"""
_errmsg = None
# Get individual atom blocks and check if they are valid
comment_line_in_blocks: List[str] = []
elements = None
atom_lines_in_blocks: List[str] = []
for block_idx, start_line_idx in enumerate(range(0, len(file_lines), number_of_atoms + 2)):
# Check if first line in block is a valid number of atoms line
try:
n_atoms_in_block = int(file_lines[start_line_idx])
except Exception as e:
_errmsg = (
f"the first line of XYZ block with index {block_idx} is not a single valid "
f"integer specifying the number of atoms in the molecule: {e}."
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Check if number of atoms in block matches number of atoms in first block
if n_atoms_in_block != number_of_atoms:
_errmsg = (
f"the number of atoms specified in the first line of XYZ block with index "
f"{block_idx} ({n_atoms_in_block}) does not match the number of atoms "
f"specified in the XYZ block with index 0 ({number_of_atoms})"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Add comment line of block to list of all comment lines
comment_line_in_blocks.append(file_lines[start_line_idx + 1])
# Loop over individual atom lines in block (skip first two lines)
atom_lines_in_block = []
elements_in_block = []
for line in file_lines[start_line_idx + 2 : start_line_idx + 2 + number_of_atoms]:
splitted = line.split()
# Check file format (element symbol + 3 coordinates per line)
if len(splitted) != 4:
_errmsg = (
f"a line in xzy block with index {block_idx} does not contain "
"exactly one element symbol and three cartesian coordinates (x, y, z)"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Check if element symbol is valid
if splitted[0] not in ELEMENT_SYMBOLS:
_errmsg = (
f"element symbol '{splitted[0]}' in xzy block with index "
f"{block_idx} is not a valid element symbol"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Check if coordinates are valid floats
try:
for coord in splitted[1:]:
float(coord)
except Exception as e:
_errmsg = (
f"one of the coordinates in xzy block with index {block_idx} is not a "
f"valid float: {e}."
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Add atom line and element
atom_lines_in_block.append(line)
elements_in_block.append(splitted[0])
# Check if number of atom lines matches number of atoms
if len(atom_lines_in_block) != number_of_atoms:
_errmsg = (
f"the number of atom lines in XYZ block with index {block_idx} "
f"({len(atom_lines_in_block)}) does not match the number of atoms of the molecule "
f"specified in the first line of the file ({number_of_atoms})"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Check if elements are consistent for all conformers
if elements is not None:
if elements != elements_in_block:
_errmsg = (
f"the elements in the XYZ block of conformer with index {block_idx} are not "
"identical and/or in the same order as found in the conformer with index 0"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
else:
elements = elements_in_block
# Add atom lines of block to list of all blocks
atom_lines_in_blocks.append("".join(atom_lines_in_block))
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
[docs]
def read_xyz_file(file_path: str) -> Tuple[Optional[List[str]], Optional[str]]:
"""Read an XYZ file with one or more conformers and validate its content.
The first line of each conformer block contains the number of atoms, the second line is a
comment line, and the subsequent lines contain the atom symbols and their cartesian coordinates
(in Angstrom). The individual conformers cannot be separated by empty lines. The file content
is validated (see ``_validate_xyz()`` for details).
Parameters
----------
file_path : str
The path to the XYZ file.
Returns
-------
Tuple[Optional[List[str]], Optional[str]]
A tuple containing:
* A list of strings, each representing one conformer's XYZ block.
* An error message if the file could not be read or is not valid, otherwise ``None``.
"""
_errmsg = None
xyz_blocks = None
# Ensure correct file extension (just for double-checking)
if not file_path.endswith(".xyz"):
_errmsg = "the file does not have the expected .xyz file extension"
return xyz_blocks, _errmsg
# Try to open the file
try:
with open(file_path, "r") as f:
lines = f.readlines()
except Exception as e:
_errmsg = f"opening of the file failed: {e}"
return xyz_blocks, _errmsg
# Strip away empty lines at the end of the file
while len(lines) != 0 and lines[-1].strip() == "":
lines.pop(-1)
# Check if file is empty
if len(lines) == 0:
_errmsg = "the file is empty or only contains empty lines"
return xyz_blocks, _errmsg
# Try to get the number of atoms of the first conformer
try:
n_atoms = int(lines[0])
except Exception:
_errmsg = (
"the first line of the file is not a single valid integer defining the number "
"of atoms in the molecule"
)
return xyz_blocks, _errmsg
# Check if at least as many lines as atoms + 2 are present
if len(lines) < n_atoms + 2:
_errmsg = (
f"the file contains fewer non-empty lines ({len(lines)}) than required for a single "
f"structure block with {n_atoms} atoms"
)
return xyz_blocks, _errmsg
# Validate the input file
comment_line_in_blocks, atom_lines_in_blocks, _errmsg = _validate_xyz(
file_lines=lines, number_of_atoms=n_atoms
)
if _errmsg is not None:
return xyz_blocks, _errmsg
# Format XYZ blocks
try:
xyz_blocks = []
for comment, atom_lines in zip(comment_line_in_blocks, atom_lines_in_blocks):
xyz_block = f"{n_atoms}\n{comment}{atom_lines}"
xyz_blocks.append(xyz_block)
except Exception as e:
_errmsg = (
"the file was successfully read and validated, but the "
f"final formatting of the individual XYZ blocks failed: {e}"
)
return xyz_blocks, _errmsg
[docs]
def _validate_sdf(sdf_mols: List[Optional[Chem.rdchem.Mol]]) -> Tuple[Optional[str], Optional[str]]:
"""Validate the individual RDKit molecule objects generated from an SD file with one or more
conformers.
The following points are ensured:
* All conformers could be successfully converted to RDKit molecule objects that are not
``None``.
* All elements in the conformers represent valid element symbols.
* All conformers represent the same molecule (checked by comparing their SMILES string and
chemical element symbols). Stereochemistry is not considered for this check, but a warning
is issued if the conformers have different stereochemical information.
* All conformers possess 3D coordinates.
Parameters
----------
sdf_mols : List[Optional[Chem.rdchem.Mol]]
A list of RDKit molecule objects generated from the SD file (see the ``read_sd_file()``
function). ``None`` can be present in the list if individual conformers could not be
parsed.
Returns
-------
Tuple[Optional[str], Optional[str]]
A tuple containing:
* An error message if the molecule objects are not valid, otherwise ``None``.
* A warning message if the conformers have different stereochemical information, otherwise
``None``.
"""
_errmsg = None
_stereomsg = None
_stereowarn = []
_smiles = None
_smiles_no_stereo = None
_elements = None
for idx, mol in enumerate(sdf_mols):
# Check if mol is None
if mol is None:
_errmsg = (
f"generation of RDKit mol object from SDF block failed for conformer "
f"with index {idx} as it resulted in None"
)
return _errmsg, _stereomsg
# Validate elements
els = [atom.GetSymbol() for atom in mol.GetAtoms()]
for el in els:
if el not in ELEMENT_SYMBOLS:
_errmsg = (
f"element symbol '{el}' in SDF block with index {idx} is not a valid "
"element symbol"
)
return _errmsg, _stereomsg
# Compare conformers by elements and SMILES
# The try/except block is necessary as the base mol objects are not sanitized and might
# contain issues that cause the generation of the SMILES string to fail.
try:
smi = Chem.MolToSmiles(mol)
smi_no_stereo = Chem.MolToSmiles(mol, isomericSmiles=False)
except Exception as e:
_errmsg = (
"the generation of the SMILES string from the RDKit mol object for conformer "
f"validation failed for conformer with index {idx}: {e}"
)
return _errmsg, _stereomsg
if _smiles is None:
_smiles = smi
_smiles_no_stereo = smi_no_stereo
_elements = els
else:
# SMILES
if smi_no_stereo != _smiles_no_stereo:
_errmsg = (
f"the generated SMILES string of the conformer with index {idx} "
f"('{smi_no_stereo}') does not match the SMILES string of the conformer with "
f"index 0 ('{_smiles_no_stereo}'); stereochemical information excluded"
)
return _errmsg, _stereomsg
if smi_no_stereo == _smiles_no_stereo and smi != _smiles:
_stereowarn.append(idx)
# Elements
if els != _elements:
_errmsg = (
f"the elements of the conformer with index {idx} are not identical "
"and/or in the same order as found in the conformer with index 0"
)
return _errmsg, _stereomsg
# Ensure that conformer is 3D
if mol.GetConformer().Is3D() is False:
_errmsg = (
f"the conformer with index {idx} is not 3D. Only SD files containing 3D "
"information are supported"
)
return _errmsg, _stereomsg
# Format stereochemistry warning message if applicable
if len(_stereowarn) > 0:
_stereomsg = (
f"File validation: the conformers with the following indices have different "
"stereochemical information than the conformer with index 0 (but identical atom "
f"connectivity): {_stereowarn}. Ensure that this is intended."
)
return _errmsg, _stereomsg
[docs]
def read_sd_file(
file_path: str,
) -> Tuple[Optional[List[Chem.rdchem.Mol]], Optional[str], Optional[str]]:
"""Read an SD file with one or more conformers.
Explicit hydrogen atoms are not removed. Sanitization is not performed at this stage. Instead,
it is done for every conformer separately in
``bonafide.utils.molecule_vault.MolVault.initialize_mol``. The file must comply with the SD
file format (see https://en.wikipedia.org/wiki/Chemical_table_file, last accessed on
23.09.2025).
Parameters
----------
file_path : str
The path to the SD file.
Returns
-------
Tuple[Optional[List[Chem.rdchem.Mol]], Optional[str], Optional[str]]
A tuple containing:
* A list of RDKit molecule objects if the file could be read and validated, otherwise
``None``.
* An error message if the file could not be read or is not valid, otherwise ``None``.
* A warning message if the conformers have different stereochemical information, otherwise
``None``.
"""
sdf_mols = None
_errmsg = None
_stereomsg = None
# Ensure correct file extension (just for double-checking)
if not file_path.endswith(".sdf"):
_errmsg = "the file does not have the expected .sdf file extension"
return sdf_mols, _errmsg, _stereomsg
# Try to open the file
try:
suppl = Chem.SDMolSupplier(fileName=file_path, sanitize=False, removeHs=False)
except Exception as e:
_errmsg = f"opening of the file failed: {e}"
return sdf_mols, _errmsg, _stereomsg
# Get mol objects
sdf_mols = [mol for mol in suppl]
# Validate the mol objects
_errmsg, _stereomsg = _validate_sdf(sdf_mols)
if _errmsg is not None:
_errmsg = f"validation of the file failed: {_errmsg}"
sdf_mols = None
return sdf_mols, _errmsg, _stereomsg
[docs]
def write_sd_file(mol: Chem.rdchem.Mol, file_path: str) -> None:
"""Write an SD file from an RDKit mol object.
Parameters
----------
mol : Chem.rdchem.Mol
An RDKit molecule object.
file_path : str
The path to the file the data is written to.
Returns
-------
None
"""
with Chem.SDWriter(file_path) as writer:
writer.write(mol)
[docs]
def write_xyz_file_from_coordinates_array(
elements: NDArray[np.str_],
coordinates: NDArray[np.float64],
file_path: str,
) -> None:
"""Write a list of elements and their coordinates to an XYZ file.
Parameters
----------
elements : NDArray[np.str\_]
The element symbols of the molecule.
coordinates : NDArray[np.float64]
The cartesian coordinates of the structure.
file_path : str
The path to the output XYZ file.
Returns
-------
None
"""
# Format file content
xyz = f"{len(elements)}\n\n"
for el, coord in zip(elements, coordinates):
xyz += f"{el} {coord[0]} {coord[1]} {coord[2]}\n"
# Write file
with open(file_path, "w") as f:
f.write(xyz)