"""Utility functions for input/output operations."""
from __future__ import annotations
import re
from typing import TYPE_CHECKING, List, Optional, Tuple
from rdkit import Chem
from bonafide.utils.constants import EH_TO_KJ_MOL, ELEMENT_SYMBOLS, ENERGY_UNITS, KCAL_MOL_TO_KJ_MOL
if TYPE_CHECKING:
import numpy as np
from numpy.typing import NDArray
[docs]
def read_mol_object(
mol: Chem.rdchem.Mol,
) -> Tuple[Chem.rdchem.Mol, List[Chem.rdchem.Mol], Optional[str]]:
"""Process an RDKit molecule object for incorporation into a molecule vault.
The conformer molecule-level properties are moved to properties of the processed molecule
objects.
Parameters
----------
mol : Chem.rdchem.Mol
The RDKit molecule object to be processed. It can contain one or more conformers.
Returns
-------
Tuple[Chem.rdchem.Mol, List[Chem.rdchem.Mol]]
A tuple containing:
* The initial input RDKit molecule object.
* A list of RDKit molecule objects, each containing one conformer of the input molecule.
* An error message if the input molecule object is not valid, otherwise ``None``.
"""
_errmsg = None
# Ensure validity of RDKit mol object
# Sanitization is not performed as this might modify the user mol object in an undesired way.
# Instead, the user is responsible to ensure that the input mol object is valid.
if type(mol) != Chem.rdchem.Mol:
_errmsg = "the input value is not a valid RDKit mol object"
return mol, [], _errmsg
if mol.GetNumConformers() < 1:
return mol, [Chem.Mol(mol)], _errmsg
# Generate a separate mol object for each conformer
mols = []
props = []
for idx, conf in enumerate(mol.GetConformers()):
if conf.Is3D() is False:
_errmsg = f"the conformer with index {idx} is 2D; only 3D conformers are supported"
return mol, [], _errmsg
mol_ = Chem.Mol(mol)
mol_.RemoveAllConformers()
mol_.AddConformer(conf=conf, assignId=True)
mols.append(mol_)
props.append(conf.GetPropsAsDict())
# Move conformer properties to mol object level
for prop, m in zip(props, mols):
for prop_name, value in prop.items():
dtype = type(value).__name__
if "int" in dtype:
m.SetIntProp(prop_name, int(value))
elif "float" in dtype:
m.SetDoubleProp(prop_name, float(value))
elif dtype == "str":
m.SetProp(prop_name, value)
elif dtype == "bool":
m.SetBoolProp(prop_name, value)
return mol, mols, _errmsg
[docs]
def read_smiles(smiles: str) -> Tuple[Optional[Chem.rdchem.Mol], Optional[str]]:
"""Read a SMILES string and return an RDKit molecule object and an error message
(``None`` if no error).
Initially, ``sanitize=False`` is set in ``Chem.MolFromSmiles()`` to preserve the hydrogen atoms
if they are given in the SMILES string. If the molecule object is successfully created, it is
tried to be sanitized.
Parameters
----------
smiles : str
The SMILES string of a molecule.
Returns
-------
Tuple[Optional[Chem.rdchem.Mol], Optional[str]]
A tuple containing:
* An RDKit molecule object if the SMILES string could be parsed, otherwise ``None``.
* An error message if the SMILES string could not be parsed or sanitized, otherwise
``None``.
"""
_errmsg = None
mol = None
# Try to generate RDKit mol object without sanitization first
try:
mol = Chem.MolFromSmiles(SMILES=smiles, sanitize=False)
except Exception as e:
_errmsg = f"Generation of RDKit mol object failed for SMILES string '{smiles}': {e}."
return mol, _errmsg
# Try to sanitize the mol object
if mol is not None:
try:
Chem.SanitizeMol(mol)
except Exception as e:
_errmsg = (
f"Sanitization of the RDKit mol object generated from SMILES string "
f"'{smiles}' failed: {e}."
)
else:
_errmsg = (
f"Generation of RDKit mol object failed for SMILES string '{smiles}' as it "
"resulted in None."
)
return mol, _errmsg
[docs]
def read_smarts(smarts: str) -> Tuple[Optional[Chem.rdchem.Mol], Optional[str]]:
"""Read a SMARTS pattern and return an RDKit molecule object and an error message
(``None`` if no error).
Parameters
----------
smarts : str
The SMARTS pattern.
Returns
-------
Tuple[Optional[Chem.rdchem.Mol], Optional[str]]
A tuple containing:
* An RDKit molecule object if the SMARTS pattern could be parsed, otherwise ``None``.
* An error message if the SMARTS pattern could not be parsed, otherwise ``None``.
"""
_errmsg = None
mol = None
# Check if SMARTS string is empty
smarts = smarts.strip()
if smarts == "":
_errmsg = "the SMARTS string must not be empty"
return mol, _errmsg
# Try to generate RDKit mol object
try:
mol = Chem.MolFromSmarts(SMARTS=smarts)
except Exception as e:
_errmsg = f"generation of RDKit mol object failed for SMARTS string '{smarts}': {e}"
return mol, _errmsg
# Check if mol is None
if mol is None:
_errmsg = (
f"generation of RDKit mol object failed for SMARTS string '{smarts}' as it "
"resulted in None"
)
return mol, _errmsg
[docs]
def _validate_xyz(
file_lines: List[str], number_of_atoms: int
) -> Tuple[List[str], List[str], Optional[str]]:
"""Validate the individual lines of an XYZ file with one or more conformers.
The following points are ensured:
* The first line of each structure block contains only a valid integer specifying the number of
atoms in the block.
* The number of atoms specified in the first line of each block matches the number of atoms
specified in the first line of the first block.
* Each atom line contains exactly one valid element symbol and three valid cartesian
coordinates (x, y, z) that can be converted to floats.
* The number of atom lines in each block matches the number of atoms specified in the first
line of the file.
* The elements in each block are identical and in the same order as found in the first
structure block.
Please note: These checks are not exhaustive and beyond them the user is responsible to ensure
that the individual structure blocks represent conformers of the same molecule.
Parameters
----------
file_lines : List[str]
The individual lines of the XYZ file.
number_of_atoms : int
The number of atoms in the molecule as defined by the first line of the XYZ file.
Returns
-------
Tuple[List[str], List[str], Optional[str]]
A tuple containing:
* A list of the comment lines of each conformer block.
* A list of strings, each string representing one conformer's atom lines.
* An error message if the file lines are not valid, otherwise ``None``.
"""
_errmsg = None
# Get individual atom blocks and check if they are valid
comment_line_in_blocks: List[str] = []
elements = None
atom_lines_in_blocks: List[str] = []
for block_idx, start_line_idx in enumerate(range(0, len(file_lines), number_of_atoms + 2)):
# Check if first line in block is a valid number of atoms line
try:
n_atoms_in_block = int(file_lines[start_line_idx])
except Exception as e:
_errmsg = (
f"the first line of XYZ block with index {block_idx} is not a single valid "
f"integer specifying the number of atoms in the molecule: {e}."
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Check if number of atoms in block matches number of atoms in first block
if n_atoms_in_block != number_of_atoms:
_errmsg = (
f"the number of atoms specified in the first line of XYZ block with index "
f"{block_idx} ({n_atoms_in_block}) does not match the number of atoms "
f"specified in the XYZ block with index 0 ({number_of_atoms})"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Add comment line of block to list of all comment lines
comment_line_in_blocks.append(file_lines[start_line_idx + 1])
# Loop over individual atom lines in block (skip first two lines)
atom_lines_in_block = []
elements_in_block = []
for line in file_lines[start_line_idx + 2 : start_line_idx + 2 + number_of_atoms]:
splitted = line.split()
# Check file format (element symbol + 3 coordinates per line)
if len(splitted) != 4:
_errmsg = (
f"a line in xzy block with index {block_idx} does not contain "
"exactly one element symbol and three cartesian coordinates (x, y, z)"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Check if element symbol is valid
if splitted[0] not in ELEMENT_SYMBOLS:
_errmsg = (
f"element symbol '{splitted[0]}' in xzy block with index "
f"{block_idx} is not a valid element symbol"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Check if coordinates are valid floats
try:
for coord in splitted[1:]:
float(coord)
except Exception as e:
_errmsg = (
f"one of the coordinates in xzy block with index {block_idx} is not a "
f"valid float: {e}."
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Add atom line and element
atom_lines_in_block.append(line)
elements_in_block.append(splitted[0])
# Check if number of atom lines matches number of atoms
if len(atom_lines_in_block) != number_of_atoms:
_errmsg = (
f"the number of atom lines in XYZ block with index {block_idx} "
f"({len(atom_lines_in_block)}) does not match the number of atoms of the molecule "
f"specified in the first line of the file ({number_of_atoms})"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
# Check if elements are consistent for all conformers
if elements is not None:
if elements != elements_in_block:
_errmsg = (
f"the elements in the XYZ block of conformer with index {block_idx} are not "
"identical and/or in the same order as found in the conformer with index 0"
)
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
else:
elements = elements_in_block
# Add atom lines of block to list of all blocks
atom_lines_in_blocks.append("".join(atom_lines_in_block))
return comment_line_in_blocks, atom_lines_in_blocks, _errmsg
[docs]
def read_xyz_file(file_path: str) -> Tuple[Optional[List[str]], Optional[str]]:
"""Read an XYZ file with one or more conformers and validate its content.
The first line of each conformer block contains the number of atoms, the second line is a
comment line, and the subsequent lines contain the atom symbols and their cartesian coordinates
(in Angstrom). The individual conformers cannot be separated by empty lines. The file content
is validated (see ``_validate_xyz()`` for details).
Parameters
----------
file_path : str
The path to the XYZ file.
Returns
-------
Tuple[Optional[List[str]], Optional[str]]
A tuple containing:
* A list of strings, each representing one conformer's XYZ block.
* An error message if the file could not be read or is not valid, otherwise ``None``.
"""
_errmsg = None
xyz_blocks = None
# Ensure correct file extension (just for double-checking)
if not file_path.endswith(".xyz"):
_errmsg = "the file does not have the expected .xyz file extension"
return xyz_blocks, _errmsg
# Try to open the file
try:
with open(file_path, "r") as f:
lines = f.readlines()
except Exception as e:
_errmsg = f"opening of the file failed: {e}"
return xyz_blocks, _errmsg
# Strip away empty lines at the end of the file
while len(lines) != 0 and lines[-1].strip() == "":
lines.pop(-1)
# Check if file is empty
if len(lines) == 0:
_errmsg = "the file is empty or only contains empty lines"
return xyz_blocks, _errmsg
# Try to get the number of atoms of the first conformer
try:
n_atoms = int(lines[0])
except Exception:
_errmsg = (
"the first line of the file is not a single valid integer defining the number "
"of atoms in the molecule"
)
return xyz_blocks, _errmsg
# Check if at least as many lines as atoms + 2 are present
if len(lines) < n_atoms + 2:
_errmsg = (
f"the file contains fewer non-empty lines ({len(lines)}) than required for a single "
f"structure block with {n_atoms} atoms"
)
return xyz_blocks, _errmsg
# Validate the input file
comment_line_in_blocks, atom_lines_in_blocks, _errmsg = _validate_xyz(
file_lines=lines, number_of_atoms=n_atoms
)
if _errmsg is not None:
return xyz_blocks, _errmsg
# Format XYZ blocks
try:
xyz_blocks = []
for comment, atom_lines in zip(comment_line_in_blocks, atom_lines_in_blocks):
xyz_block = f"{n_atoms}\n{comment}{atom_lines}"
xyz_blocks.append(xyz_block)
except Exception as e:
_errmsg = (
"the file was successfully read and validated, but the "
f"final formatting of the individual XYZ blocks failed: {e}"
)
return xyz_blocks, _errmsg
[docs]
def _validate_sdf(sdf_mols: List[Optional[Chem.rdchem.Mol]]) -> Optional[str]:
"""Validate the individual RDKit molecule objects generated from an SD file with one or more
conformers.
The following points are ensured:
* All conformers could be successfully converted to RDKit molecule objects that are not
``None``.
* All elements in the conformers represent valid element symbols.
* All conformers represent the same molecule (checked by comparing their SMILES and InChIKey
string as well as their chemical element symbols).
* All conformers possess 3D coordinates.
Parameters
----------
sdf_mols : List[Optional[Chem.rdchem.Mol]]
A list of RDKit molecule objects generated from the SD file (see the ``read_sd_file()``
function). ``None`` can be present in the list if individual conformers could not be
parsed.
Returns
-------
Optional[str]
An error message if the molecule objects are not valid, otherwise ``None``.
"""
_errmsg = None
_smiles = None
_inchikey = None
_elements = None
for idx, mol in enumerate(sdf_mols):
# Check if mol is None
if mol is None:
_errmsg = (
f"generation of RDKit mol object from SDF block failed for conformer "
f"with index {idx} as it resulted in None"
)
return _errmsg
# Validate elements
els = [atom.GetSymbol() for atom in mol.GetAtoms()]
for el in els:
if el not in ELEMENT_SYMBOLS:
_errmsg = (
f"element symbol '{el}' in SDF block with index {idx} is not a valid "
"element symbol"
)
return _errmsg
# Compare conformers by SMILES, InChIKey, and elements
smi = Chem.MolToSmiles(mol)
ikey = Chem.MolToInchiKey(mol)
if _smiles is None:
_smiles = smi
_inchikey = ikey
_elements = els
else:
# SMILES
if smi != _smiles:
_errmsg = (
f"the generated SMILES string of the conformer with index {idx} "
f"('{smi}') does not match the SMILES string of the conformer with index 0 "
f"('{_smiles}')"
)
return _errmsg
# InChIKey
if ikey != _inchikey:
_errmsg = (
f"the generated InChIKey of the conformer with index {idx} "
f"('{ikey}') does not match the InChIKey of the conformer with index 0 "
f"('{_inchikey}')"
)
return _errmsg
# Elements
if els != _elements:
_errmsg = (
f"the elements of the conformer with index {idx} are not identical "
"and/or in the same order as found in the conformer with index 0"
)
return _errmsg
# Ensure that conformer is 3D
if mol.GetConformer().Is3D() is False:
_errmsg = (
f"the conformer with index {idx} is not 3D. Only SD files containing 3D "
"information are supported"
)
return _errmsg
return _errmsg
[docs]
def read_sd_file(
file_path: str,
) -> Tuple[Optional[List[Optional[Chem.rdchem.Mol]]], Optional[str]]:
"""Read an SD file with one or more conformers.
The file must comply with the SD file format (see
https://en.wikipedia.org/wiki/Chemical_table_file, last accessed on 23.09.2025).
Parameters
----------
file_path : str
Path to the SD file.
Returns
-------
Tuple[Optional[List[Optional[Chem.rdchem.Mol]]], Optional[str]]
A tuple containing:
* A list of RDKit molecule objects if the file could be read , otherwise ``None``. The
mol objects can also be ``None`` if individual conformers could not be parsed.
* An error message if the file could not be read or is not valid, otherwise ``None``.
"""
sdf_mols = None
_errmsg = None
# Ensure correct file extension (just for double-checking)
if not file_path.endswith(".sdf"):
_errmsg = "the file does not have the expected .sdf file extension"
return sdf_mols, _errmsg
# Try to open the file
try:
suppl = Chem.SDMolSupplier(file_path, sanitize=False)
except Exception as e:
_errmsg = f"opening of the file failed: {e}"
return sdf_mols, _errmsg
# Get mol objects
sdf_mols = [mol for mol in suppl]
# Validate the mol objects
_errmsg = _validate_sdf(sdf_mols)
if _errmsg is not None:
_errmsg = f"validation of the file failed: {_errmsg}"
return sdf_mols, _errmsg
[docs]
def write_sd_file(mol: Chem.rdchem.Mol, file_path: str) -> None:
"""Write an SD file from an RDKit mol object.
Parameters
----------
mol : Chem.rdchem.Mol
An RDKit molecule object.
file_path : str
The path to the file the data is written to.
Returns
-------
None
"""
with Chem.SDWriter(file_path) as writer:
writer.write(mol)
[docs]
def write_xyz_file_from_coordinates_array(
elements: NDArray[np.str_],
coordinates: NDArray[np.float64],
file_path: str,
) -> None:
"""Write a list of elements and their coordinates to an XYZ file.
Parameters
----------
elements : NDArray[np.str\_]
The element symbols of the molecule.
coordinates : NDArray[np.float64]
The cartesian coordinates of the structure.
file_path : str
The path to the output XYZ file.
Returns
-------
None
"""
# Format file content
xyz = f"{len(elements)}\n\n"
for el, coord in zip(elements, coordinates):
xyz += f"{el} {coord[0]} {coord[1]} {coord[2]}\n"
# Write file
with open(file_path, "w") as f:
f.write(xyz)