Source code for maize.utilities.chem.chem

"""Utility functions and types for chemistry."""

from collections import defaultdict
from contextlib import redirect_stderr
import io
import logging
from pathlib import Path
import signal
from types import FrameType
from typing import Any, Callable, List, Literal, NoReturn, Sequence, TypeVar, ParamSpec, cast
from typing_extensions import Self

import numpy as np
from numpy.typing import NDArray
from rdkit import Chem, rdBase
from rdkit.Chem import AllChem, rdFMCS, rdchem, rdGeometry
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions

from maize.utilities.execution import CommandRunner
from maize.utilities.validation import FailValidator

rdBase.WrapLogs()

log = logging.getLogger("run")

# This is vital to have all properties transferred properly between processes
# See also: https://github.com/rdkit/rdkit/issues/1320
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)

# Allows partial sanitization of charged molecules with non-standard valences
_SANITIZE_FLAGS = (
    Chem.SanitizeFlags.SANITIZE_FINDRADICALS
    | Chem.SanitizeFlags.SANITIZE_KEKULIZE
    | Chem.SanitizeFlags.SANITIZE_SETAROMATICITY
    | Chem.SanitizeFlags.SANITIZE_SETCONJUGATION
    | Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION
    | Chem.SanitizeFlags.SANITIZE_SYMMRINGS
)
SCHRODINGER_VARIANT_TAG = "s_lp_Variant"



[docs]
class ChemistryException(Exception):
    """Exception raised for issues with executing RDKit or Openbabel"""



_P = ParamSpec("_P")
_T = TypeVar("_T")



[docs]
def result_check(
    func: Callable[_P, _T], none_check: bool = True, false_check: bool = False
) -> Callable[_P, _T]:
    """
    Calls an RDKit or Openbabel function and raises an exception if
    something goes wrong, instead of just returning ``None``.

    Parameters
    ----------
    func
        The function to wrap
    none_check
        Whether to check for a ``None`` return value
    false_check
        Whether to check for a ``False`` return value

    Returns
    -------
    Callable[_P, _T]
        The wrapped function

    """

    # RDKit does not throw exceptions, but just returns `None`
    # if something went wrong. It also writes the cause to its
    # internal logger, so we need to redirect this output to
    # stderr (with the call to `WrapLogs()` above) and then
    # capture it to get useful information. The same goes for
    # Openbabel, except that it returns ``False`` on failure.
    def wrapped(*args: _P.args, **kwargs: _P.kwargs) -> _T:
        logger = io.StringIO()
        with redirect_stderr(logger):
            res = func(*args, **kwargs)

        out = logger.getvalue()
        if (
            (none_check and res is None)
            or (false_check and not res)
            or any(val in out for val in ("error", "Failed"))
        ):
            raise ChemistryException(
                f"{func.__name__} raised the following error: {logger.getvalue()}"
            )
        return res

    return wrapped



def _timeout(timeout: int) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
    """
    Allows a wrapped function to timeout by raising a `TimeoutError`.

    Parameters
    ----------
    timeout
        Timeout in seconds, has to be an integer value

    Returns
    -------
    Callable[[Callable[_P, _T]], Callable[_P, _T]]
        Wrapped function

    Raises
    ------
    TimeoutError
        If the wrapped function times out

    """

    def _handler(signum: int, frame: FrameType | None) -> NoReturn:
        raise TimeoutError("Function timed out")

    def wrapper(func: Callable[_P, _T]) -> Callable[_P, _T]:
        def inner(*args: _P.args, **kwargs: _P.kwargs) -> _T:
            signal.signal(signal.SIGALRM, handler=_handler)
            signal.alarm(timeout)
            try:
                return func(*args, **kwargs)
            finally:
                signal.alarm(0)

        return inner

    return wrapper



[docs]
def convert(files: list[Path] | Path, output: Path, extra: list[str] | None = None) -> None:
    """
    Convert one or more molecule files into another using openbabel.
    Will detect the input and output formats from the suffixes.

    Parameters
    ----------
    file
        Input file(s) to convert, if multiple are given will
        attempt to aggregate output into a single file
    output
        Output file path
    extra
        Additional arguments to pass to ``obabel``

    """
    if isinstance(files, Path):
        files = [files]

    inp_format = files[0].suffix.strip(".")
    out_format = output.suffix.strip(".")
    for file in files:
        if not file.exists():
            raise FileNotFoundError(f"File at '{file.as_posix()}' not found")

    all_files = " ".join(file.as_posix() for file in files)
    command = f"obabel {all_files} -i{inp_format} -o{out_format} -O {output.as_posix()}"
    extra = extra or []
    for arg in extra:
        command += " " + arg

    cmd = CommandRunner(validators=[FailValidator("error")])
    cmd.run_validate(command, verbose=True)




[docs]
def save_sdf_library(
    file: Path,
    mols: list["IsomerCollection"],
    conformers: bool = False,
    split_strategy: Literal["schrodinger", "inchi"] = "inchi",
) -> None:
    """
    Saves a list of IsomerCollection instances to an SDF file.
    Will only write one Isomer and one conformer for each molecule.

    Parameters
    ----------
    file
        Path to the output SDF file
    smiles
        List of `IsomerCollection` to write
    conformers
        Whether to write all conformers
    split_strategy
        How to determine which isomer belongs to which molecule, from special tags or
        the name. ``schrodinger`` expects a name like ``1:0`` (molecule:isomer), ``inchi``
        expects an InChI key as a name, which can be split into molecule and isomer.

    """
    with Chem.SDWriter(file.as_posix()) as writer:
        for i, mol in enumerate(mols):
            for j, iso in enumerate(mol.molecules):
                if split_strategy == "schrodinger":
                    iso.name = f"{i}:{j}"
                elif split_strategy == "inchi":
                    iso.name = iso.inchi

                if conformers:
                    for i in range(iso.n_conformers):
                        writer.write(iso._molecule, confId=i)
                else:
                    writer.write(iso._molecule)




[docs]
def load_sdf_library(
    file: Path, split_strategy: Literal["schrodinger", "inchi", "schrodinger-tag", "none"] = "inchi"
) -> list["IsomerCollection"]:
    """
    Loads an SDF library containing multiple molecules with potentially
    multiple isomers each, and creates an `IsomerCollection` for all of them.

    Parameters
    ----------
    file
        Path to the SDF file
    split_strategy
        How to determine which isomer belongs to which molecule, from special tags or
        the name. ``schrodinger`` expects a name like ``1:0`` (molecule:isomer),
        ``schrodinger-tag`` expects a tag named ``s_lp_Variant``, ``inchi`` expects an
        InChI key as a name, which can be split into molecule and isomer, and ``'none'``
        provides a separate `IsomerCollection` for each entry.

    Returns
    -------
    list[IsomerCollection]
        Created molecules

    """
    mols: dict[int | str, dict[int | str, list[Chem.rdchem.Mol]]] = defaultdict(
        lambda: defaultdict(list)
    )
    with Chem.SDMolSupplier(file.as_posix(), removeHs=False) as supp:
        for i, mol in enumerate(supp):
            if mol is None:
                continue
            iso = Isomer(mol)

            mol_idx: int | str
            # Split based on special name: molecule:isomer
            if split_strategy == "schrodinger":
                if iso.name is None or len(parts := iso.name.split(":")) < 2:
                    raise ChemistryException(
                        f"Parsed molecule {iso} does not have a suitable name,"
                        "expected format molecule-index:isomer-index"
                    )
                mol_idx, iso_idx = (int(part) for part in parts)
                mols[mol_idx][iso_idx].append(mol)

            # Split based on special tag created by Schrodinger
            elif split_strategy == "schrodinger-tag":
                if not iso.has_tag(SCHRODINGER_VARIANT_TAG):
                    raise ChemistryException(
                        f"Parsed molecule does not have the expected '{SCHRODINGER_VARIANT_TAG}'"
                    )

                # The tag will be something like 'file.smi:1-1' with the indices
                # representing molecule and isomer number, respectively
                tag = iso.get_tag(SCHRODINGER_VARIANT_TAG)
                assert isinstance(tag, str)
                if ":" in tag:
                    mol_idx, iso_idx = (int(part) for part in tag.split(":")[-1].split("-"))
                else:
                    mol_idx, iso_nr = tag.split("-")
                    iso_idx = int(iso_nr)
                mols[mol_idx][iso_idx].append(mol)

            # Split based on InChI key: moleculeinchi-isomerinchi (e.g. ABCDEF-GHIJKL-M)
            elif split_strategy == "inchi":
                if iso.name is None or len(inchi_parts := iso.name.split("-")) != 3:
                    raise ChemistryException(
                        f"Name of parsed molecule ('{iso.name}') is not a valid InChI key"
                    )
                mol_part, *iso_part = inchi_parts
                mols[mol_part]["-".join(iso_part)].append(mol)

            # No split, each entry is a separate molecule
            elif split_strategy == "none":
                mols[i][0].append(mol)

    return [
        IsomerCollection([Isomer.from_rdmols(isos) for isos in mol.values()])
        for mol in mols.values()
    ]



def _lib2dict(lib: list["IsomerCollection"]) -> dict[str, dict[str, "Isomer"]]:
    """Provides a nested dictionary from a library using InChI keys"""
    dic: dict[str, dict[str, "Isomer"]] = defaultdict(dict)
    for mol in lib:
        for iso in mol.molecules:
            mol_id, iso_id, state = iso.inchi.split("-")
            dic[mol_id][f"{iso_id}-{state}"] = iso
    return dic


def _dict2lib(dic: dict[str, dict[str, "Isomer"]]) -> list["IsomerCollection"]:
    """Provides a list of `IsomerCollection` from a nested dictionary"""
    return [IsomerCollection(list(isodict.values())) for isodict in dic.values()]



[docs]
def merge_libraries(
    base: list["IsomerCollection"], other: list["IsomerCollection"]
) -> list["IsomerCollection"]:
    base_dic, other_dic = _lib2dict(base), _lib2dict(other)
    for key, iso_dic in base_dic.items():
        if key in other_dic:
            iso_dic.update(other_dic[key])
    return _dict2lib(base_dic)




[docs]
def save_smiles(file: Path, smiles: list[str]) -> None:
    """
    Saves a list of SMILES to a ``.smi`` file.

    Parameters
    ----------
    file
        Path to the output file
    smiles
        List of SMILES codes to write

    """
    if file.suffix != ".smi":
        raise ValueError("File needs to have the .smi suffix")

    with open(file, "w", encoding="utf8") as out:
        out.writelines("\n".join(smiles))




[docs]
def mcs(*mols: "Isomer") -> "Isomer":
    """
    Finds the maximum common substructure (MCS) between multiple molecules.

    Parameters
    ----------
    mols
        Reference `Isomer` molecules

    Returns
    -------
    Isomer
        Isomer instance using the MCS

    """
    mcs = rdFMCS.FindMCS([mol._molecule for mol in mols])
    return Isomer(result_check(Chem.MolFromSmarts)(mcs.smartsString))




[docs]
def rmsd(mol: "Isomer", ref: "Isomer") -> NDArray[np.float32]:
    """
    Calculate the unaligned root-mean-square deviation between
    all conformers of an isomer and a reference molecule.

    Parameters
    ----------
    mol
        Molecule in the form of an `Isomer`
    ref
        Reference molecule

    Returns
    -------
    NDArray[np.float32]
        RMSDs for all conformers

    """
    inds_ref, inds_mol = np.array(mol.atommap(ref)).T
    diff = mol.coordinates[:, inds_mol] - ref.coordinates[:, inds_ref]
    return cast(NDArray[np.float32], np.sqrt((diff**2).sum(axis=-1).mean(axis=1)))



_TAG_METHODS = {
    bool: "SetBoolProp",
    int: "SetIntProp",
    float: "SetDoubleProp",
}


_ValidRDKitTagType = bool | int | float | str | list[float | int] | NDArray[np.float_ | np.int_]


def _prop_setter(
    obj: Chem.rdchem.Conformer | Chem.rdchem.Mol | Chem.rdchem.Atom,
    tag: str,
    value: _ValidRDKitTagType,
) -> None:
    """Sets RDKit properties using the appropriate setter."""

    # Arrays require a conversion to a string; numpy arrays and
    # lists serialize differently (with and without commas as
    # delimiters), so we convert to list by default
    if isinstance(value, (list, np.ndarray)):
        value = str(list(value))

    # Numpy floats and ints need special treatment
    elif isinstance(value, (float, np.float_)):
        value = float(value)

    elif isinstance(value, (int, np.int_)):
        value = int(value)

    # Should just be a string
    if not isinstance(value, (bool, int, float, str)):
        value = str(value)

    # `SetProp` is the default for all strings
    getattr(obj, _TAG_METHODS.get(type(value), "SetProp"))(tag, value)


def _prop_converter(raw: _ValidRDKitTagType) -> _ValidRDKitTagType:
    """Convert raw RDKit tags to an appropriate python type"""
    if isinstance(raw, (bool, int, float)):
        return raw

    # This should never happen
    if not isinstance(raw, str):
        raise TypeError(f"Got invalid type '{type(raw)}' for value '{raw}' from rdkit tag")

    # We have an array and attempt to deserialize to an ndarray of floats
    if raw.startswith("[") and raw.endswith("]"):
        return np.fromstring(raw[1:-1], sep=",")
    return raw



[docs]
class Conformer:
    """
    Thin shim layer for rdkit conformers. Each conformer will have an `Isomer` as its parent.

    Parameters
    ----------
    rd_conf
        RDKit conformer object
    parent
        `Isomer` parent instance
    _rd_parent
        The actual rdkit molecule parent of the conformer

    """


[docs]
    def __init__(
        self,
        rd_conf: Chem.rdchem.Conformer,
        parent: "Isomer",
        _rd_parent: Chem.rdchem.Mol | None = None,
    ) -> None:
        self._conf = rdchem.Conformer(rd_conf)
        self.parent = parent

        # We need this to ensure the parent rdkit molecule instance does not go out of scope,
        # as this means we would lose access to our conformer (conformers cannot exist alone
        # and always need a parent molecule instance to exist). See this related issue for a
        # (now fixed) example: https://github.com/rdkit/rdkit/issues/3492
        self._rd_parent = parent._molecule if _rd_parent is None else _rd_parent

        coords = []
        for i in range(self._conf.GetNumAtoms()):
            pos = self._conf.GetAtomPosition(i)
            coords.append([pos.x, pos.y, pos.z])
        self._coordinates = np.array(coords)


    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(n_atoms={self.n_atoms}, "
            f"orphan={not self._conf.HasOwningMol()})"
        )


[docs]
    @classmethod
    def from_rdmol(cls, rd_mol: Chem.rdchem.Mol, parent: "Isomer", renumber: bool = True) -> Self:
        """
        Create a conformer from an RDKit molecule.

        Parameters
        ----------
        rd_mol
            RDKit molecule instance
        parent
            Parent `Isomer` instance
        renumber
            Ensure atom numbers of the molecule match the parent

        """
        if (n_atoms_conf := rd_mol.GetNumAtoms()) != parent.n_atoms:
            raise ChemistryException(
                f"Atom number mismatch, parent molecule has {parent.n_atoms}, "
                f"conformer has {n_atoms_conf}"
            )

        # We can't fully sanitize as we might have charged molecules
        # (for example with N valence of 4), so we disable sanitization
        # at load time and do a partial sanitizing step after, see also:
        # https://www.rdkit.org/docs/Cookbook.html#explicit-valence-error-partial-sanitization
        Chem.SanitizeMol(rd_mol, _SANITIZE_FLAGS, catchErrors=True)

        # We ensure that the atom numbering is consistent between the new conformer and the parent,
        # as the passed in molecule might not have any relation to the parent. This can be important
        # when converting from files that treat protonations differently, for example PDBQT.
        # Based on: https://www.rdkit.org/docs/Cookbook.html#reorder-atoms
        if renumber:
            parent_canon_order = np.array(Chem.CanonicalRankAtoms(parent._molecule)).argsort()
            newmol_canon_order = np.array(Chem.CanonicalRankAtoms(rd_mol)).argsort()
            inds = np.arange(parent.n_atoms)
            inds[parent_canon_order] = newmol_canon_order

            # FIXME This fails occasionally, example:
            # C1[C@H]2[C@@H]([C@@H](S1)CCCCC(=O)O)NC(=S)N2
            rd_mol = Chem.RenumberAtoms(rd_mol, [int(i) for i in inds])

        rd_conf = result_check(rd_mol.GetConformer)(0)
        conf = cls(rd_conf=rd_conf, parent=parent, _rd_parent=rd_mol)

        # Properties are not automatically copied over to the conformer
        for key, value in rd_mol.GetPropsAsDict().items():
            conf.set_tag(key, value)

        return conf


    @property
    def n_atoms(self) -> int:
        return cast(int, result_check(self._conf.GetNumAtoms)())

    @property
    def tags(self) -> dict[str, Any]:
        """Return all tags"""
        return cast(dict[str, Any], self._conf.GetPropsAsDict())

    @property
    def coordinates(self) -> NDArray[np.float32]:
        """Coordinates of the conformer"""
        if self._conf.HasOwningMol():
            coords = []
            for i in range(self._conf.GetNumAtoms()):
                pos = self._conf.GetAtomPosition(i)
                coords.append([pos.x, pos.y, pos.z])
            return np.array(coords)
        return self._coordinates

    @coordinates.setter
    def coordinates(self, coords: NDArray[np.float32]) -> None:
        """Set the coordinates"""
        if self._conf.HasOwningMol():
            self._set_conf_coords(coords)
        self._coordinates = coords

    # RDKit conformers are intrinsically tied to their parent molecule instance.
    # Therefore, calling `RemoveConformer` on the parent will partially deallocate
    # the memory associated with the conformer, leaving an orphan instance with no
    # atoms or coordinates. We thus need to internally keep track of the coordinates
    # and regenerate the Conformer wrapper manually when assigning an existing
    # conformer to an RDKit molecule, regardless of if it's been orphaned or not.
    # See this classic RDKit bug: https://github.com/rdkit/rdkit/issues/3817

[docs]
    def regenerate(self) -> None:
        """Recreate the internal RDKit conformer object to avoid deallocation"""
        self._set_conf_coords(coords=self._coordinates)



[docs]
    def set_tag(self, tag: str, value: _ValidRDKitTagType) -> None:
        """
        Sets a tag / property for the whole isomer.

        Parameters
        ----------
        tag
            Tag to set
        value
            Corresponding value for all conformers

        """
        if not self._conf.HasOwningMol():
            log.warning("Conformer is orphaned and will not remember any set tags")
        _prop_setter(self._conf, tag, value)



[docs]
    def get_tag(self, tag: str, default: _ValidRDKitTagType | None = None) -> _ValidRDKitTagType:
        """
        Returns the value for the specified tag.

        Parameters
        ----------
        tag
            The tag to lookup
        default
            A default value to return if the key is not found

        Returns
        -------
        Any
            The value of the tag

        Raises
        ------
        KeyError
            If the specified tag couldn't be found

        """
        if tag not in self.tags:
            if default is None:
                raise KeyError(f"Tag '{tag}' not found in molecule")
            return default
        return _prop_converter(self.tags[tag])



[docs]
    def has_tag(self, tag: str) -> bool:
        """
        Returns whether the specified tag is set.

        Parameters
        ----------
        tag
            The tag to lookup

        Returns
        -------
        bool
            Whether the tag is defined

        """
        return cast(bool, result_check(self._conf.HasProp)(tag))


    def _set_conf_coords(self, coords: NDArray[np.float32]) -> None:
        """Set the coordinates of the wrapped RDKit conformer"""
        for i in range(self._conf.GetNumAtoms()):
            self._conf.SetAtomPosition(i, rdGeometry.Point3D(*coords[i]))




[docs]
class Isomer:
    """
    Thin shim layer for rdkit molecules. Here, an isomer refers
    to a unique chemical form of a molecule, i.e. a form separated
    by major energy barriers. Note that SMILES codes do not
    necessarily map to a single isomer! Some examples of unique isomers:

    * A ring conformer (e.g. for cyclohexane)
    * Cis-trans isomers
    * Chirality
    * Tautomers
    * Different protonation states

    Parameters
    ----------
    rd_mol
        RDKit molecule instance

    """


[docs]
    def __init__(self, rd_mol: Chem.rdchem.Mol) -> None:
        self._molecule = rd_mol
        self._init_conformers()
        self.score_tag = "score"


    def __repr__(self) -> str:
        scores = f", scores={self.scores}" if self.scored else ""
        return (
            f"{self.__class__.__name__}(n_atoms={self.n_atoms}, "
            f"n_conformers={self.n_conformers}, charge={self.charge}{scores})"
        )

    # RDKit is for some reason unable to pickle conformer objects directly,
    # but only as part of an rdmol object. Because `Isomer` keeps references
    # to our Conformer wrapper objects, which in turn have a reference to the
    # RDKit conformer, pickling will fail. We therefore remove our conformer
    # proxies before pickling and restore them later with `_init_conformers()`.
    def __setstate__(self, state: dict[str, Any]) -> None:
        self.__dict__ = state
        self._init_conformers()

    def __getstate__(self) -> dict[str, Any]:
        inst = self.__dict__.copy()
        inst["_conformers"] = []
        return inst


[docs]
    @classmethod
    def from_rdmols(cls, rd_mols: list[Chem.rdchem.Mol], sanitize: bool = True) -> Self:
        """
        Create a molecule from multiple RDKit molecules acting as conformers.

        Parameters
        ----------
        rd_mols
            List of RDKit molecule instances
        sanitize
            Whether to sanitize the molecule

        Returns
        -------
        Molecule
            Molecule instance

        """
        first, *rest = rd_mols
        iso = cls(first)
        for conf in rest:
            iso.add_conformer(Conformer.from_rdmol(conf, parent=iso))
        return iso



[docs]
    @classmethod
    def from_smiles(cls, smiles: str, sanitize: bool = True) -> Self:
        """
        Create a molecule from a SMILES string.

        Parameters
        ----------
        smiles
            The SMILES string to initialize the molecule with
        sanitize
            Whether to sanitize the molecule

        Returns
        -------
        Molecule
            Molecule instance

        Raises
        ------
        ChemistryException
            If there was an error parsing the SMILES code

        """
        rd_mol = result_check(Chem.MolFromSmiles)(smiles, sanitize=sanitize)
        return cls(rd_mol=rd_mol)



[docs]
    @classmethod
    def from_sdf(cls, file: Path, read_conformers: bool = False) -> Self:
        """
        Create a molecule from an SDF file.

        Parameters
        ----------
        file
            The SDF file to initialize the molecule with
        read_conformers
            Whether to read all conformers

        Returns
        -------
        Molecule
            Molecule instance

        Raises
        ------
        ChemistryException
            If there was an error parsing the SDF

        """
        rd_mols = []
        with Chem.SDMolSupplier(file.as_posix()) as suppl:
            for rd_mol in suppl:
                if rd_mol is None:
                    continue
                if not read_conformers:
                    return cls(rd_mol)
                rd_mols.append(rd_mol)
        if not rd_mols:
            raise ChemistryException(f"SDF file '{file.name}' produced an empty molecule")

        return cls.from_rdmols(rd_mols)



[docs]
    @classmethod
    def from_sdf_block(cls, sdf: str) -> Self:
        """
        Create a molecule from an SDF string.

        Parameters
        ----------
        sdf
            The SDF string to initialize the molecule with

        Returns
        -------
        Molecule
            Molecule instance

        Raises
        ------
        ChemistryException
            If there was an error parsing the SDF

        """
        rd_mol = result_check(Chem.MolFromMolBlock)(sdf)
        return cls(rd_mol=rd_mol)


    @property
    def name(self) -> str | None:
        """Returns the molecule name"""
        if self._molecule.HasProp("_Name"):
            return cast(str, self._molecule.GetProp("_Name"))
        return None

    @name.setter
    def name(self, value: str) -> None:
        self._molecule.SetProp("_Name", value)

    @property
    def inchi(self) -> str:
        """Returns the InChI key for the molecule"""
        # We need the fixed-H option here to ensure unique InChIs for different
        # protonation states, see: https://www.inchi-trust.org/technical-faq-2/#15.24
        return cast(str, result_check(Chem.MolToInchiKey)(self._molecule, options="-FixedH"))

    @property
    def n_atoms(self) -> int:
        """Number of atoms in the molecule"""
        return cast(int, self._molecule.GetNumAtoms())

    @property
    def n_conformers(self) -> int:
        """Number of conformers in the molecule"""
        return cast(int, self._molecule.GetNumConformers())

    @property
    def charge(self) -> int:
        """The charge of the molecule"""
        return cast(int, result_check(Chem.GetFormalCharge)(self._molecule))

    @property
    def scores(self) -> NDArray[np.float32]:
        """The scores of the molecule"""
        conf_scores = np.array(
            [conf.get_tag(self.score_tag, np.nan) for conf in self.conformers]
        ).flatten()
        iso_scores = np.array(self.get_tag(self.score_tag, [np.nan])).flatten()
        if (
            conf_scores is None
            or len(conf_scores) < len(iso_scores)
            or (~np.isfinite(conf_scores)).all()
        ):
            return iso_scores
        return conf_scores

    @scores.setter
    def scores(self, scores: NDArray[np.float32]) -> None:
        """Set the scores"""
        self.set_tag(self.score_tag, scores)
        for conf, score in zip(self.conformers, scores):
            conf.set_tag(self.score_tag, score)

    @property
    def scored(self) -> bool:
        """Whether the molecule was scored"""
        return len(self.scores) > 0 and bool(np.isfinite(self.scores).any())

    @property
    def conformers(self) -> list[Conformer]:
        """Get all stored isomer conformers in an iterable structure"""
        return self._conformers

    @property
    def coordinates(self) -> NDArray[np.float32]:
        """Get coordinates of all stored conformers as one NDArray"""
        return np.array([conf.coordinates for conf in self.conformers])

    @property
    def tags(self) -> dict[str, Any]:
        """Return all tags"""
        return cast(dict[str, Any], self._molecule.GetPropsAsDict())


[docs]
    def to_sdf(self, path: Path, write_conformers: bool = False) -> None:
        """
        Generate an SDF file for the isomer.

        Parameters
        ----------
        path
            Output file path
        write_conformers
            Whether to write all conformers

        """
        with Chem.SDWriter(path.as_posix()) as writer:
            if write_conformers:
                for conf in self._molecule.GetConformers():
                    writer.write(self._molecule, confId=conf.GetId())
            else:
                writer.write(self._molecule)



[docs]
    def to_smiles(self) -> str:
        """Generate a SMILES code for the isomer"""
        return cast(str, result_check(Chem.MolToSmiles)(self._molecule))



[docs]
    def to_mol_block(self) -> str:
        """Generate a MOL block for the isomer."""
        return cast(str, Chem.MolToMolBlock(self._molecule))



[docs]
    def addh(self) -> None:
        """Adds hydrogens to the molecule"""
        self._molecule = Chem.AddHs(self._molecule)



[docs]
    def set_tag(self, tag: str, value: _ValidRDKitTagType) -> None:
        """
        Sets a tag / property for the whole isomer.

        Parameters
        ----------
        tag
            Tag to set
        value
            Corresponding value for all conformers

        """
        _prop_setter(self._molecule, tag, value)



[docs]
    def get_tag(self, tag: str, default: _ValidRDKitTagType | None = None) -> _ValidRDKitTagType:
        """
        Returns the value for the specified tag.

        Parameters
        ----------
        tag
            The tag to lookup
        default
            A default value to return if the key is not found

        Returns
        -------
        Any
            The value of the tag

        Raises
        ------
        KeyError
            If the specified tag couldn't be found

        """
        if tag not in self.tags:
            if default is None:
                raise KeyError(f"Tag '{tag}' not found in molecule")
            return default
        return _prop_converter(self.tags[tag])



[docs]
    def has_tag(self, tag: str) -> bool:
        """
        Returns whether the specified tag is set.

        Parameters
        ----------
        tag
            The tag to lookup

        Returns
        -------
        bool
            Whether the tag is defined

        """
        return cast(bool, result_check(self._molecule.HasProp)(tag))



[docs]
    def set_atomic_tag(self, idx: int, tag: str, value: _ValidRDKitTagType) -> None:
        """
        Sets a tag / property for a single atom.

        Parameters
        ----------
        idx
            Index of the atom
        tag
            Tag to set
        value
            Corresponding value for the atom

        """
        atom = result_check(self._molecule.GetAtomWithIdx)(idx)
        _prop_setter(atom, tag, value)



[docs]
    def get_atomic_tag(self, idx: int, tag: str, default: Any = None) -> Any:
        """
        Returns the value for the specified tag of an atom.

        Parameters
        ----------
        idx
            Index of the atom
        tag
            The tag to lookup
        default
            A default value to return if the key is not found

        Returns
        -------
        Any
            The value of the tag

        Raises
        ------
        KeyError
            If the specified tag couldn't be found

        """
        atom = result_check(self._molecule.GetAtomWithIdx)(idx)
        if not atom.HasProp(tag):
            if default is None:
                raise KeyError(f"Tag '{tag}' not found in atom {idx}")
            return default
        return atom.GetProp(tag)



[docs]
    def has_atomic_tag(self, idx: int, tag: str) -> bool:
        """
        Returns whether the specified atomic tag is set.

        Parameters
        ----------
        idx
            Index of the atom
        tag
            The tag to lookup

        Returns
        -------
        bool
            Whether the tag is defined

        """
        atom = result_check(self._molecule.GetAtomWithIdx)(idx)
        return bool(result_check(atom.HasProp)(tag))



[docs]
    def add_conformer(self, conf: Conformer) -> None:
        """
        Adds a conformer to the isomer.

        Parameters
        ----------
        conf
            Conformer instance

        """
        conf.regenerate()
        result_check(self._molecule.AddConformer, none_check=False)(conf._conf)
        self._conformers.append(conf)



[docs]
    def remove_conformer(self, idx: int) -> None:
        """
        Remove a conformer.

        Parameters
        ----------
        idx
            Index of the conformer to be removed

        """
        result_check(self._molecule.RemoveConformer, none_check=False)(idx)
        self._conformers.pop(idx)



[docs]
    def clear_conformers(self) -> None:
        """Remove all conformers."""
        self._molecule.RemoveAllConformers()
        self._conformers: list[Conformer] = []



[docs]
    def update_conformers_from_mol_block(
        self, block: str, score_parser: Callable[[dict[str, str]], float] | None = None
    ) -> None:
        """
        Update molecule conformers from a mol block.

        Parameters
        ----------
        mol
            The mol block
        score_parser
            Function used to parse a score from the mol properties

        Raises
        ------
        ChemistryException
            If there was an error parsing the mol block

        """
        self.clear_conformers()
        mol = result_check(Chem.MolFromMolBlock)(block, removeHs=False)
        try:
            conf = Conformer.from_rdmol(mol, parent=self)
            self.add_conformer(conf)
        except ValueError as err:
            raise ChemistryException("Unable to parse conformer, error: %s", err)

        # Only set scores if we actually got some, otherwise they might be set externally
        if score_parser is not None:
            self.scores = np.array(score_parser(mol.GetPropsAsDict()))



[docs]
    def update_conformers_from_sdf(
        self, sdf: Path, score_parser: Callable[[dict[str, str]], float] | None = None
    ) -> None:
        """
        Update molecule conformers from an SDF file.

        Parameters
        ----------
        sdf
            The SDF file to initialize the molecule with
        score_parser
            Function used to parse a score from the SDF properties

        Raises
        ------
        ChemistryException
            If there was an error parsing the SDF

        """
        scores = []
        self.clear_conformers()
        with Chem.SDMolSupplier(sdf.as_posix(), removeHs=False, sanitize=False) as supp:
            for mol in supp:
                if mol is None:
                    continue
                try:
                    conf = Conformer.from_rdmol(mol, parent=self)
                    self.add_conformer(conf)
                except ValueError as err:
                    log.warning("Unable to parse conformer, error: %s", err)
                    continue

                if score_parser is not None:
                    score = score_parser(mol.GetPropsAsDict())
                    scores.append(score)

        # Only set scores if we actually got some, otherwise they might be set externally
        if score_parser is not None:
            self.scores = np.array(scores)



[docs]
    def atommap(self, mol: "Isomer") -> list[tuple[int, int]]:
        """
        Finds the atom index mappings based on the MCS.

        Parameters
        ----------
        mol
            Reference `Isomer`

        Returns
        -------
        list[tuple[int, int]]
            Pairs of atom indices

        """
        mcs_mol = mcs(self, mol)
        ref_substructure = result_check(mol._molecule.GetSubstructMatch)(mcs_mol._molecule)
        iso_substructure = result_check(self._molecule.GetSubstructMatch)(mcs_mol._molecule)
        return list(zip(ref_substructure, iso_substructure))



[docs]
    def generate_stereoisomers(self, n_max: int = 32) -> list[Self]:
        """
        Generates possible enantiomers.

        Parameters
        ----------
        n_max
            Maximum number of stereoisomers to generate

        Returns
        -------
        list[Molecule]
            List of new molecules representing distinct stereoisomers

        """
        if n_max <= 1:
            return [self]
        enum_options = StereoEnumerationOptions(unique=True, maxIsomers=n_max, tryEmbedding=True)
        return [
            self.__class__(isomer)
            for isomer in EnumerateStereoisomers(self._molecule, options=enum_options)
        ]



[docs]
    def to_pdb(self, path: Path) -> None:
        """
        Writes a molecule to a PDB file.

        Parameters
        ----------
        path
            Path to the PDB file to write

        """
        result_check(Chem.MolToPDBFile, none_check=False)(self._molecule, filename=path.as_posix())
        if not path.exists():
            raise ChemistryException(f"File at '{path.as_posix()}' was not written")



[docs]
    def embed(self, n_conformers: int = 1) -> None:
        """
        Generate a 3D embedding of the molecule.

        Parameters
        ----------
        n_conformers
            Number of conformers to generate

        Raises
        ------
        ChemistryException
            If there was an error generating the embeddings

        """
        self.clear_conformers()
        self.addh()
        result_check(AllChem.EmbedMultipleConfs)(
            self._molecule, numConfs=n_conformers, maxAttempts=100
        )
        self._init_conformers()


    def _init_conformers(self) -> None:
        """Initializes `Conformer` instances based on contained RDKit conformers."""
        self._conformers = [
            Conformer(rd_conf, parent=self)
            for rd_conf in result_check(self._molecule.GetConformers)()
        ]




[docs]
class IsomerCollection:
    """
    Represents a collection of isomers / enumerations of a molecule.

    Parameters
    ----------
    molecules
        Molecules part of the collection and sharing a common topology

    """

    smiles: str | None = None


[docs]
    def __init__(self, molecules: Sequence[Isomer]) -> None:
        self.molecules = list(molecules)


    def __repr__(self) -> str:
        scores = f", best_score={self.best_score:4.4}" if self.scored else ""
        smiles = "" if self.smiles is None else f"'{self.smiles}', "
        return f"{self.__class__.__name__}({smiles}n_isomers={self.n_isomers}{scores})"


[docs]
    @classmethod
    def from_smiles(
        cls, smiles: str, max_isomers: int = 8, sanitize: bool = True, timeout: int = 10
    ) -> Self:
        """
        Create an isomer collection from a SMILES string.

        Parameters
        ----------
        smiles
            The SMILES string to initialize the collection with
        max_isomers
            Maximum number of isomers to generate
        sanitize
            Whether to sanitize the molecule
        timeout
            Timeout in seconds before failing stereoisomer generation

        Returns
        -------
        IsomerCollection
            IsomerCollection instance

        Raises
        ------
        ChemistryException
            If there was an error parsing the SMILES code

        See Also
        --------
        steps.mai.molecule.Gypsum
            A more advanced approach of generating any
            kind of isomer or high-energy conformers

        """
        mol = Isomer.from_smiles(smiles, sanitize=sanitize)
        try:
            isomers = _timeout(timeout)(mol.generate_stereoisomers)(n_max=max_isomers)
        except TimeoutError:
            log.warning(f"Stereoisomer generation for {mol} (SMILES={smiles}) timed out")
            isomers = []
        collection = cls(isomers)
        collection.smiles = smiles
        return collection



[docs]
    @classmethod
    def from_sdf(cls, file: Path) -> Self:
        """
        Create an isomer collection from an SDF file
        containing different isomers or conformers.

        Parameters
        ----------
        file
            The SDF file input

        Returns
        -------
        IsomerCollection
            IsomerCollection instance

        Raises
        ------
        ChemistryException
            If there was an error parsing the SMILES code
        """
        mols = []
        smiles = None
        with Chem.SDMolSupplier(file.as_posix(), removeHs=False) as supp:
            for mol in supp:
                if mol is None:
                    continue
                mols.append(Isomer(mol))
                if mol.HasProp("SMILES"):
                    smiles = mol.GetProp("SMILES")
        coll = cls(mols)
        if smiles is not None:
            coll.smiles = smiles
        return coll


    @property
    def n_isomers(self) -> int:
        """Number of contained isomers"""
        return len(self.molecules)

    @property
    def scored(self) -> bool:
        """Whether the isomers have been scored"""
        return all(iso.scored for iso in self.molecules)

    @property
    def best_score(self) -> float:
        """The best score among all isomers / conformers"""
        if self.scored and self.n_isomers > 0:
            it = (cast(float, mol.scores.min()) for mol in self.molecules if mol.scores is not None)
            return min(it)
        return np.nan

    @best_score.setter
    def best_score(self, score: float) -> None:
        for mol in self.molecules:
            mol.scores = np.array([score])

    @property
    def inchi(self) -> str:
        """Returns the InChI key for the molecule"""
        if self.n_isomers > 0:
            return self.molecules[0].inchi.split("-")[0]
        return ""


[docs]
    def to_sdf(self, path: Path) -> None:
        """
        Write all isomers to an SDF file.

        Parameters
        ----------
        path
            Output file path
        """
        with Chem.SDWriter(path.as_posix()) as writer:
            for iso in self.molecules:
                writer.write(iso._molecule)


[docs]
    def to_smiles(self) -> List[str]:
        """
        return smiles for an isomer collection
        """
        return [iso.to_smiles() for iso in self.molecules]



[docs]
    def embed(self, n_conformers: int = 1) -> None:
        """
        Generate a 3D embedding of all isomers.

        Parameters
        ----------
        n_conformers
            Number of conformers to generate

        Raises
        ------
        ChemistryException
            If there was an error generating the embeddings

        """
        for mol in self.molecules:
            mol.embed(n_conformers)



[docs]
    def remove_isomer(self, isomer: Isomer) -> None:
        """Remove isomer from collection"""
        self.molecules.remove(isomer)