Source code for maize.steps.mai.molecule.gypsum

"""GypsumDL prepares 3D small molecule conformers and isomers"""

# pylint: disable=import-outside-toplevel, import-error

from pathlib import Path
from typing import Any

from maize.core.node import Node
from maize.core.interface import Input, Output, Parameter, Flag
from maize.utilities.testing import TestRig
from maize.utilities.validation import SuccessValidator
from maize.utilities.resources import cpu_count
from maize.utilities.execution import ProcessError

from maize.utilities.chem import IsomerCollection, save_smiles


DEFAULT_FILE_NAME = "untitled_line_{0}__input{0}.sdf"
FAILED_SMILES_FILE = "gypsum_dl_failed.smi"


[docs] class Gypsum(Node): """ Converts SMILES codes into a set of 3D molecules using Gypsum-DL [#ropp2019]_. 3D embedding can fail, and in those cases it falls back on RDKit. Notes ----- The implementation in this node does not use the MPI capabilities of Gypsum, and simply installing ``MPI4PY`` can cause problems executing this step on some HPC systems. So it might be better to simply not install it for this use case. References ---------- .. [#ropp2019] Ropp, P.J., Spiegel, J.O., Walker, J.L. et al. Gypsum-DL: an open-source program for preparing small-molecule libraries for structure-based virtual screening. J Cheminform 11, 34 (2019). `DOI <https://doi.org/10.1186/s13321-019-0358-3>`_ See Also -------- :class:`~maize.steps.mai.molecule.Smiles2Molecules` : A simple, fast, and less accurate alternative to Gypsum, using RDKit embedding functionality. """ required_callables = ["gypsum"] inp: Input[list[str]] = Input() """SMILES input""" out: Output[list[IsomerCollection]] = Output() """Molecule output""" n_variants: Parameter[int] = Parameter(default=1) """Maximum number of variants to generate""" thoroughness: Parameter[int] = Parameter(default=3) """ Multiplier for the number of sampled conformers to evaluate energies. Higher numbers will increase the computational cost by performing more UFF energy evaluations. """ ph_range: Parameter[tuple[float, float]] = Parameter(default=(6.4, 8.4)) """The pH range in which to generate variants (min, max)""" use_filters: Flag = Flag(default=True) """Whether to use additional substructure filters from the Durrant lab""" n_jobs: Parameter[int] = Parameter(default=cpu_count()) """Number of parallel processes to use""" timeout: Parameter[int] = Parameter(default=5) """Timeout per SMILES in seconds, will attempt an RDKit embedding after""" def run(self) -> None: smiles = [smi.strip() for smi in self.inp.receive()] smiles_path = Path("input.smi") save_smiles(smiles_path, smiles) command = ( f"{self.runnable['gypsum']} --source {smiles_path.as_posix()} " f"--max_variants_per_compound {self.n_variants.value} " f"--thoroughness {self.thoroughness.value} --separate_output_files " f"--min_ph {self.ph_range.value[0]} --max_ph {self.ph_range.value[1]} " f"--job_manager multiprocessing --num_processors {self.n_jobs.value} " ) if self.use_filters.value: command += "--use_durrant_lab_filters" # With our settings Gypsum produces one SDF file per SMILES, # each of which can have one or more isomers / conformers res = self.run_command( command, verbose=True, validators=[SuccessValidator("Finished Gypsum-DL")], timeout=10 + len(smiles) * self.timeout.value, raise_on_failure=False, ) failed = set() if res.returncode == 130: # Timeout self.logger.warning("Timed out during embedding") failed = set(smiles) elif res.returncode > 0: raise ProcessError("Gypsum failed for SMILES: %s", smiles) # Gypsum can fail to embed certain SMILES, but helpfully writes out those separately if Path(FAILED_SMILES_FILE).exists(): self.logger.info("Found failed SMILES file") with Path(FAILED_SMILES_FILE).open() as failed_file: failed = {smi.split()[0] for smi in failed_file.readlines()} self.logger.info("Failed SMILES:\n'%s'", "\n".join(failed)) mols = [] for i, smi in enumerate(smiles): gypsum_index = i + 1 file = Path(DEFAULT_FILE_NAME.format(gypsum_index)) self.logger.debug("Checking SMILES '%s'", smi) if smi in failed: self.logger.warning( "Skipping failed embedding for SMILES '%s', falling back to RDKit", smi ) mol = IsomerCollection.from_smiles(smi) mol.embed() if any(isomer.n_conformers == 0 for isomer in mol.molecules): self.logger.warning("Coordinate generation for isomer '%s' failed", smi) # We already check for failed embeddings so this shouldn't really happen elif not file.exists() or file.stat().st_size == 0: raise FileNotFoundError( f"Gypsum output for '{smi}' at '{file.as_posix()}' not found or empty" ) # All good! else: mol = IsomerCollection.from_sdf(file) for isomer in mol.molecules: isomer.name = isomer.inchi mols.append(mol) self.out.send(mols)
class TestSuiteGypsum: def test_Gypsum(self, temp_working_dir: Any, test_config: Any, example_smiles: Any) -> None: rig = TestRig(Gypsum, config=test_config) res = rig.setup_run( inputs={"inp": [example_smiles]}, parameters={"n_variants": 2}, ) mols = res["out"].get() assert mols is not None assert len(mols) == 5 assert mols[0].molecules[0].n_conformers == 1 assert mols[0].molecules[0].charge <= 2 assert 51 <= mols[0].molecules[0].n_atoms <= 53 for mol in mols: assert mol.n_isomers <= 2 assert not mol.scored