# Source code for icolos.core.workflow_steps.calculation.shaep

from icolos.utils.execute_external.execute import Executor
from icolos.core.workflow_steps.calculation.base import StepCalculationBase
from icolos.core.workflow_steps.step import _LE
from icolos.utils.enums.step_enums import StepShaepEnum
from icolos.utils.enums.program_parameters import PantherEnum, ShaepEnum
from icolos.core.containers.compound import Conformer
import tempfile
from pydantic import BaseModel
import os

_SSE = StepShaepEnum()
_SE = ShaepEnum()
_PE = PantherEnum()


class StepShaep(StepCalculationBase, BaseModel):
    """Workflow step that runs ShaEP on conformers and tags them with the results.

    For every conformer, the first ``mol2`` file found in the step's generic
    data and the conformer itself are written to a fresh temporary directory,
    the ShaEP executable is invoked on them, and the shape / ESP similarity
    values read from the ShaEP output are stored as properties on the
    conformer's molecule.
    """

    def __init__(self, **data):
        super().__init__(**data)

        self._initialize_backend(executor=Executor)

    def _prepare_tmp_input_dir(self) -> str:
        """Create a new temporary directory and return its path.

        The caller is responsible for removing the directory again.
        """
        return tempfile.mkdtemp()

    def _get_reference_file(self):
        """Return the first ``mol2`` file stored in the step's generic data.

        NOTE(review): the ``ni_*`` naming used by the callers suggests this is
        a negative image produced upstream -- confirm against the workflow.

        Raises:
            IndexError: if the generic data holds no ``mol2`` file.
        """
        mol2_files = self.data.generic.get_files_by_extension("mol2")
        try:
            return mol2_files[0]
        except IndexError as err:
            # same exception type as a bare "[0]", but with an actionable message
            raise IndexError(
                "ShaEP requires a mol2 file in the generic data of the step, "
                "but none was found."
            ) from err

    def _execute_backend(self, conf_path: str, tmp_dir: str, ni_path: str):
        """Invoke the ShaEP executable for a single conformer.

        Args:
            conf_path: Path of the conformer file to be scored.
            tmp_dir: Directory holding the input file ``ni_path``; the
                similarity output is written there as well.
            ni_path: File name of the ``mol2`` input, relative to ``tmp_dir``.
        """
        # positional arguments: <mol2 input> <conformer file> <similarity output>
        arguments = [
            os.path.join(tmp_dir, ni_path),
            conf_path,
            os.path.join(tmp_dir, _SE.OUTPUT_SIMILARITY),
        ]
        # "check=True" is forwarded to the executor; presumably it makes a
        # failing ShaEP call raise instead of passing silently
        self._backend_executor.execute(
            command=_SE.SHAEP_EXECUTABLE, arguments=arguments, check=True
        )

    def _parse_output(self, tmp_dir: str, conformer: Conformer):
        """Read the ShaEP similarity file and annotate ``conformer`` with the values.

        Args:
            tmp_dir: Directory containing the ShaEP similarity output file.
            conformer: Conformer whose molecule receives the shape and ESP
                similarity properties.

        Raises:
            IndexError: if the output file has no data line or the data line
                has fewer than nine tab-separated columns.
        """
        output_path = os.path.join(tmp_dir, _SE.OUTPUT_SIMILARITY)
        with open(output_path, "r") as f:
            lines = f.readlines()

        # TODO: add support for multiple input structures; ignore the names (all will be in one line), but from
        #       position 8 (index 7 in python) onwards, the shape and esp similarities are reported in the same
        #       order as the input, i.e. <7 other values> mol1_shape mol1_esp mol2_shape ...
        try:
            # the first line is skipped, the second line holds the values
            parts = lines[1].split("\t")
            shape_similarity = parts[7]
            esp_similarity = parts[8]
        except IndexError as err:
            raise IndexError(
                f"Unexpected ShaEP output format in {output_path}: expected a second line "
                f"with at least 9 tab-separated columns."
            ) from err

        # strip the values: the ESP column is the last one on the line for a
        # single input, so it would otherwise carry the trailing line break
        molecule = conformer.get_molecule()
        molecule.SetProp(_SE.TAG_SHAPE_SIMILARITY, shape_similarity.strip())
        molecule.SetProp(_SE.TAG_ESP_SIMILARITY, esp_similarity.strip())

    def execute(self):
        """Run ShaEP for every conformer of every enumeration of every compound.

        Enumerations without conformers get their own molecule wrapped in a
        ``Conformer`` and attached to the enumeration, so that they are scored
        as well. Each conformer is processed in its own temporary directory,
        which is removed again even if ShaEP or the parsing fails.
        """
        number_rescored = 0
        for compound in self.get_compounds():
            for enumeration in compound.get_enumerations():
                if not enumeration.get_conformers():
                    self._logger.log(
                        f"Found no conformers for enumeration {enumeration} for compound {compound}.",
                        _LE.WARNING,
                    )
                    # we can still execute shaep at the enumeration level, if the compounds are correctly
                    # annotated they should be written out ok. Will be slower though.
                    # easiest for now is to add the enumeration mol object as a single conformer and run
                    # that through shaep
                    mol = enumeration.get_molecule()
                    conf = Conformer(conformer=mol)
                    enumeration.add_conformer(conf)

                # TODO: ShaEP allow batch execution for any number of compounds (parsing gets more difficult though)
                #       Implement that to avoid overhead from file system issues
                # TODO: Refactor and add comments
                for conformer in enumeration.get_conformers():
                    tmp_dir = self._prepare_tmp_input_dir()
                    try:
                        conf_path = os.path.join(tmp_dir, _SE.CONFORMER_PATH)
                        ni_file = self._get_reference_file()
                        ni_file.write(tmp_dir)
                        conformer.write(conf_path)
                        self._execute_backend(
                            conf_path, tmp_dir, ni_file.get_file_name()
                        )
                        self._parse_output(tmp_dir, conformer)
                        self._logger.log(
                            f"Finished shaep execution for conformer {enumeration.get_index_string()}.",
                            _LE.DEBUG,
                        )
                        number_rescored += 1
                    finally:
                        # always clean up, otherwise a failing conformer would
                        # leave its temporary directory behind
                        self._remove_temporary(tmp_dir)
        self._logger.log(f"Executed ShaEP for {number_rescored} conformers.", _LE.INFO)