Source code for icolos.core.workflow_steps.calculation.esp_sim

from copy import deepcopy
import tempfile
from typing import List
from icolos.core.containers.compound import Conformer, Enumeration
from icolos.core.workflow_steps.step import StepBase
from pydantic import BaseModel

try:
    from espsim import GetEspSim, GetShapeSim
except ImportError:
    print(
        "WARNING - Could not import module espsim, check it is installed in your environment"
    )

from rdkit.Chem import AllChem, Mol
from rdkit import Chem
from rdkit.Chem import rdFMCS
from rdkit.Chem import AllChem
from icolos.core.workflow_steps.step import _LE
from icolos.utils.general.parallelization import SubtaskContainer, Parallelizer
import os

# Based on https://github.com/hesther/espsim


[docs]class StepEspSim(StepBase, BaseModel):
    def __init__(self, **data):
        super().__init__(**data)

    def _compute_esp_sim(self, ref: Mol, trg: Enumeration, tmp_dir: str):
        """
        :param ref : Reference molecule, the known binder against which to calculate similarity
        :param trg: Icolos enumeration of the target molecule, as a smiles string.  Embedded with RDKit
        """
        # Create mol object from trg smile string
        # housekeeping for data appending later

        trg_mol = Chem.AddHs(Chem.MolFromSmiles(trg.get_smile()))
        AllChem.EmbedMolecule(ref, AllChem.ETKDG())
        AllChem.EmbedMolecule(trg_mol, AllChem.ETKDG())
        mols = [Chem.RemoveHs(x) for x in [ref, trg_mol]]
        mcs = rdFMCS.FindMCS(
            mols,
            threshold=0.8,
            completeRingsOnly=True,
            ringMatchesRingOnly=True,
        )
        self._logger.log(
            f"Computed mcs: {mcs} for enumeration {trg.get_smile()}", _LE.DEBUG
        )

        patt = Chem.MolFromSmarts(mcs.smartsString)
        refMol = mols[0]
        refMatch = refMol.GetSubstructMatch(patt)

        mv = trg_mol.GetSubstructMatch(patt)
        AllChem.AlignMol(trg_mol, refMol, atomMap=list(zip(mv, refMatch)))

        # Generate an single conformer for the reference

        # All atom alignment of the two

        charge_method = self._get_additional_setting("charge_method", default="am1-bcc")
        esp_sim = GetEspSim(trg_mol, ref, partialCharges=charge_method)
        shape_sim = GetShapeSim(trg_mol, ref)
        self._logger.log(
            f"Computed EspSim: {esp_sim}, ShapeSim: {shape_sim} for mol {trg.get_smile()}",
            _LE.DEBUG,
        )

        # now attach the mols as conformers attach the scores to the mol objects
        trg_conf = Conformer(conformer=trg_mol)
        trg_conf.get_molecule().SetProp("shape_sim", str(shape_sim))
        trg_conf.get_molecule().SetProp("esp_sim", str(esp_sim))

        trg_conf.write(os.path.join(tmp_dir, "conformer.sdf"))

    def _get_arguments(self, std_args: List) -> List:

        for flag in self.settings.arguments.flags:
            std_args.append(flag)
        for key, value in self.settings.arguments.parameters.items():
            std_args.append(f"{key}=")
            std_args.append(value)
        return std_args

    def _prepare_batch(self, batch):
        target_enums = []
        tmp_dirs = []

        for sublist in batch:
            for task in sublist:
                target_enums.append(task.data)
                tmp_dirs.append(tempfile.mkdtemp())
        return target_enums, tmp_dirs

    def _parse_output(self, trgs: List[Enumeration], tmp_dirs: List[str]) -> None:
        for tmp_dir, trg in zip(tmp_dirs, trgs):
            # grab the written sdf object
            sdf_path = os.path.join(tmp_dir, "conformer.sdf")
            mol_supplier = Chem.SDMolSupplier(sdf_path, removeHs=False)
            for mol in mol_supplier:  # should only be one conformer!
                conf = Conformer(conformer=mol)
                comp = self.get_compound_by_name(trg.get_compound_name())
                comp.find_enumeration(trg.get_enumeration_id()).add_conformer(conf)

    def _execute_espsim_parallel(self):
        # embed the reference compound

        parallelizer = Parallelizer(func=self._compute_esp_sim)
        ref_compound = Chem.AddHs(
            Chem.MolFromSmiles(self.settings.additional["ref_smiles"])
        )
        while self._subtask_container.done() is False:
            next_batch = self._get_sublists(get_first_n_lists=self._get_number_cores())

            _ = [sub.increment_tries() for element in next_batch for sub in element]
            _ = [sub.set_status_failed() for element in next_batch for sub in element]

            trgs, tmp_dirs = self._prepare_batch(next_batch)

            refs = [ref_compound for _ in range(len(next_batch))]

            parallelizer.execute_parallel(ref=refs, trg=trgs, tmp_dir=tmp_dirs)
            # hand over the embedded reference (compute once) and target compound (smiles string to be embedded)
            self._parse_output(tmp_dirs=tmp_dirs, trgs=trgs)

            for task, tmp_dir in zip(next_batch, tmp_dirs):
                for subtask in task:
                    if os.path.isfile(os.path.join(tmp_dir, "conformer.sdf")):
                        subtask.set_status_success()
                    else:
                        subtask.set_status_failed()

            self._remove_temporary(tmp_dirs)

[docs]    def execute(self):
        """
        esp-sim does molecular alignment with RDkit, then computes coulombic overlap integral + tanimoto similarity for shape measurement

        Use case takes a reference compound (known binder) and compare to REINVENT compounds

        Usage:
        * Define reference compound using settings.additional, as a smile string, to be embedded by RDkit
        * The remaining compounds are embedded using a preceeding RDkit embedding
        * attach the resulting scores to the enumeration
        """

        all_enums = []
        for compound in self.get_compounds():
            for enumeration in compound:
                all_enums.append(deepcopy(enumeration))
        if self._get_additional_setting("charge_method", default="am1-bcc") == "resp":
            # resp doesn#t play well with Icolos parallelization
            ref_compound = Chem.AddHs(
                Chem.MolFromSmiles(self.settings.additional["ref_smiles"])
            )
            tmp_dirs = []
            for enum in all_enums:
                tmp_dir = tempfile.mkdtemp()
                tmp_dirs.append(tmp_dir)
                self._compute_esp_sim(ref_compound, enum, tmp_dir=tmp_dir)
            self._parse_output(all_enums, tmp_dirs=tmp_dirs)
            self._remove_temporary(tmp_dirs)
        else:
            self.execution.parallelization.max_length_sublists = 1
            # unroll the provided compounds,
            self._subtask_container = SubtaskContainer(max_tries=3)
            self._subtask_container.load_data(all_enums)
            self._execute_espsim_parallel()