Source code for icolos.core.workflow_steps.autodockvina.docking

import os
import shutil
import tempfile
from typing import List, Tuple

from pydantic import BaseModel, Field
from rdkit import Chem
from copy import deepcopy

from icolos.utils.enums.step_enums import StepAutoDockVinaEnum, StepBaseEnum
from icolos.utils.execute_external.autodockvina import AutoDockVinaExecutor
from icolos.utils.execute_external.openbabel import OpenBabelExecutor
from icolos.utils.general.icolos_exceptions import StepFailed
from icolos.utils.general.files_paths import gen_tmp_file
from icolos.core.containers.compound import Conformer
from icolos.utils.enums.program_parameters import AutoDockVinaEnum, OpenBabelEnum
from icolos.core.workflow_steps.step import _LE, StepBase
from icolos.utils.general.parallelization import Subtask, SubtaskContainer, Parallelizer

_SBE = StepBaseEnum
_ADE = AutoDockVinaEnum()
_OBE = OpenBabelEnum()
_SAE = StepAutoDockVinaEnum()


[docs]class ADVSearchSpace(BaseModel):
    center_x: float = Field(alias="--center_x", default=None)
    center_y: float = Field(alias="--center_y", default=None)
    center_z: float = Field(alias="--center_z", default=None)
    size_x: float = Field(alias="--size_x", default=15.0)
    size_y: float = Field(alias="--size_y", default=15.0)
    size_z: float = Field(alias="--size_z", default=15.0)


[docs]class ADVConfiguration(BaseModel):
    seed: int = 42
    number_poses: int = 1
    search_space: ADVSearchSpace = ADVSearchSpace()
    receptor_path: str = None


[docs]class ADVAdditional(BaseModel):
    configuration: ADVConfiguration = ADVConfiguration()
    grid_ids: List[str] = ["grid0"]


[docs]class StepAutoDockVina(StepBase, BaseModel):

    _openbabel_executor: OpenBabelExecutor = None
    adv_additional: ADVAdditional = None

    def __init__(self, **data):
        super().__init__(**data)

        # initialize the executor and test availability
        self._initialize_backend(executor=AutoDockVinaExecutor)
        self._check_backend_availability()

        # initialize the executor for all "OpenBabel"
        self._openbabel_executor = OpenBabelExecutor()
        if not self._openbabel_executor.is_available():
            raise StepFailed(
                "AutoDock Vina requires OpenBabel execution, initialization failed."
            )

        # set ADV specific settings and ensure that each molecule gets its own sublist
        self.adv_additional = ADVAdditional(**self.settings.additional)
        self.execution.parallelization.max_length_sublists = 1

    def _set_docking_score(self, conformer: Chem.Mol) -> bool:
        try:
            result_tag_lines = conformer.GetProp(_ADE.REMARK_TAG).split("\n")
            result_line = [
                line for line in result_tag_lines if _ADE.RESULT_LINE_IDENTIFIER in line
            ][0]
            parts = result_line.split()
            docking_score = parts[_ADE.RESULT_LINE_POS_SCORE]
        except KeyError:
            return False
        conformer.SetProp(_SBE.ANNOTATION_TAG_DOCKING_SCORE, str(docking_score))
        return True

    def _write_molecule_to_pdbqt(self, path: str, molecule: Chem.Mol) -> bool:
        # generate temporary copy as PDB
        _, tmp_pdb = gen_tmp_file(suffix=".pdb", dir=os.path.dirname(path))
        Chem.MolToPDBFile(molecule, filename=tmp_pdb)

        # translate the pdb into a pdbqt including partial charges
        # Note: In contrast to the target preparation,
        # we will use a tree-based flexibility treatment here -
        # thus, the option "-xr" is NOT used.
        arguments = [
            tmp_pdb,
            _OBE.OBABEL_OUTPUT_FORMAT_PDBQT,
            "".join([_OBE.OBABEL_O, path]),
            _OBE.OBABEL_PARTIALCHARGE,
            _OBE.OBABEL_PARTIALCHARGE_GASTEIGER,
        ]
        self._openbabel_executor.execute(
            command=_OBE.OBABEL, arguments=arguments, check=False
        )

        if os.path.exists(path):
            return True
        else:
            return False

    def _generate_temporary_input_output_files(
        self, batch: List[List[Subtask]]
    ) -> Tuple[List[str], List[str], List[str], List[str]]:
        tmp_output_dirs = []
        tmp_input_paths = []
        tmp_output_paths = []
        enumeration_ids = []

        for next_subtask_list in batch:
            # for "AutoDock Vina", only single molecules can be handled so every sublist is
            # guaranteed at this stage to have only one element
            if len(next_subtask_list) > 1:
                self._logger.log(
                    f"Subtask list length for ADV is > 1 ({len(next_subtask_list)}), only the first element will be processed.",
                    _LE.WARNING,
                )
            subtask = next_subtask_list[0]

            # generate temporary input files and output directory
            cur_tmp_output_dir = tempfile.mkdtemp()
            _, cur_tmp_input_pdbqt = gen_tmp_file(
                suffix=".pdbqt", dir=cur_tmp_output_dir
            )
            _, cur_tmp_output_sdf = gen_tmp_file(suffix=".sdf", dir=cur_tmp_output_dir)

            # try to write the enumeration molecules out as PDBQT files
            enumeration = subtask.data
            mol = deepcopy(enumeration.get_molecule())
            if mol is None:
                shutil.rmtree(cur_tmp_output_dir)
                self._logger.log(
                    f"Enumeration {enumeration.get_index_string()} did not hold a valid RDkit molecule - skipped.",
                    _LE.DEBUG,
                )
                continue
            if not self._write_molecule_to_pdbqt(cur_tmp_input_pdbqt, mol):
                self._logger.log(
                    f"Could not generate PDBQT intermediate file from enumeration {enumeration.get_index_string()} - skipped.",
                    _LE.DEBUG,
                )
                continue

            # also store all the paths in case it succeeded -> these will be used later, failures will be ignored
            tmp_output_dirs.append(cur_tmp_output_dir)
            tmp_input_paths.append(cur_tmp_input_pdbqt)
            tmp_output_paths.append(cur_tmp_output_sdf)
            enumeration_ids.append(enumeration.get_index_string())

        return tmp_output_dirs, tmp_input_paths, tmp_output_paths, enumeration_ids

    def _execute_autodockvina(self):
        # get number of sublists in batch and initialize Parallelizer
        adv_parallelizer = Parallelizer(func=self._run_subjob)

        # continue until everything is successfully done or number of retries have been exceeded
        while self._subtask_container.done() is False:
            next_batch = self._get_sublists(get_first_n_lists=self._get_number_cores())

            # generate paths and initialize molecules (so that if they fail, this can be covered)
            (
                tmp_output_dirs,
                tmp_input_paths,
                tmp_output_paths,
                enumeration_ids,
            ) = self._generate_temporary_input_output_files(next_batch)

            # execute the current batch in parallel; hand over lists of parameters (will be handled by Parallelizer)
            # also increment the tries and set the status to "failed" (don't do that inside subprocess, as data is
            # copied, not shared!)
            _ = [sub.increment_tries() for element in next_batch for sub in element]
            _ = [sub.set_status_failed() for element in next_batch for sub in element]
            adv_parallelizer.execute_parallel(
                input_path_pdbqt=tmp_input_paths, output_path_sdf=tmp_output_paths
            )

            # parse the output of that particular batch and remove temporary files
            self._parse_adv_output_batch(
                tmp_input_paths=tmp_input_paths,
                tmp_output_paths=tmp_output_paths,
                enumeration_ids=enumeration_ids,
                next_batch=next_batch,
            )

            # clean-up
            self._remove_temporary(tmp_output_dirs)

            # print the progress for this execution
            self._log_execution_progress()

    def _parse_adv_output_batch(
        self,
        tmp_input_paths: List[str],
        tmp_output_paths: List[str],
        enumeration_ids: List[str],
        next_batch: List[List[Subtask]],
    ):

        for i in range(len(next_batch)):
            subtask = next_batch[i][0]
            tmp_output_path = tmp_output_paths[i]
            enumeration_id = enumeration_ids[i]
            grid_id = self.adv_additional.grid_ids[0]
            grid_path = self.adv_additional.configuration.receptor_path

            # this is a protection against the case where empty (file size == 0 bytes) files are generated due to
            # a failure during docking
            if (
                not os.path.isfile(tmp_output_path)
                or os.path.getsize(tmp_output_path) == 0
            ):
                continue

            mol_supplier = Chem.SDMolSupplier(tmp_output_path, removeHs=False)
            for mol in mol_supplier:
                if mol is None:
                    continue

                # add the information on the actual grid used
                mol.SetProp(_SBE.ANNOTATION_GRID_ID, str(grid_id))
                mol.SetProp(_SBE.ANNOTATION_GRID_PATH, str(grid_path))
                mol.SetProp(_SBE.ANNOTATION_GRID_FILENAME, os.path.basename(grid_path))

                # if no docking score is attached (i.e. the molecule is a receptor or so, skip it)
                if self._set_docking_score(mol) is not True:
                    continue

                # add molecule to the appropriate ligand
                for compound in self.get_compounds():
                    for enumeration in compound:
                        if enumeration.get_index_string() == enumeration_id:
                            new_conformer = Conformer(
                                conformer=mol,
                                conformer_id=None,
                                enumeration_object=enumeration,
                            )
                            enumeration.add_conformer(new_conformer, auto_update=True)
                            subtask.set_status_success()
                            break

    def _delay_file_system(self, path) -> bool:
        return self._wait_until_file_generation(
            path=path, interval_sec=2, maximum_sec=10
        )

    def _run_subjob(self, input_path_pdbqt: str, output_path_sdf: str):

        config = self.adv_additional.configuration

        # set up arguments list and execute
        _, tmp_pdbqt_docked = gen_tmp_file(
            suffix=".pdbqt", dir=os.path.dirname(input_path_pdbqt)
        )
        arguments = [
            _ADE.VINA_RECEPTOR,
            config.receptor_path,
            _ADE.VINA_LIGAND,
            input_path_pdbqt,
            _ADE.VINA_CPU,
            str(1),
            _ADE.VINA_SEED,
            config.seed,
            _ADE.VINA_OUT,
            tmp_pdbqt_docked,
            _ADE.VINA_CENTER_X,
            str(config.search_space.center_x),
            _ADE.VINA_CENTER_Y,
            str(config.search_space.center_y),
            _ADE.VINA_CENTER_Z,
            str(config.search_space.center_z),
            _ADE.VINA_SIZE_X,
            str(config.search_space.size_x),
            _ADE.VINA_SIZE_Y,
            str(config.search_space.size_y),
            _ADE.VINA_SIZE_Z,
            str(config.search_space.size_z),
            _ADE.VINA_NUM_MODES,
            config.number_poses,
        ]

        self._backend_executor.execute(
            command=_ADE.VINA, arguments=arguments, check=True
        )
        self._delay_file_system(path=tmp_pdbqt_docked)

        # translate the parsed output PDBQT into an SDF, via regular pdb to retain hydrogens!
        # This appears to be a bug in obabel 3.1
        # TODO: move to a direct conversino once the obabel bug is fixed
        arguments = [
            tmp_pdbqt_docked,
            _OBE.OBABEL_INPUTFORMAT_PDBQT,
            _OBE.OBABEL_OUTPUT_FORMAT_PDB,
            # clip the extension to pdb
            "".join([_OBE.OBABEL_O, tmp_pdbqt_docked[:-2]]),
        ]
        self._openbabel_executor.execute(
            command=_OBE.OBABEL, arguments=arguments, check=True
        )
        # convert resulting pdb to sdf
        arguments = [
            _OBE.OBABEL_INPUTFORMAT_PDB,
            tmp_pdbqt_docked[:-2],
            _OBE.OBABEL_OUTPUT_FORMAT_SDF,
            # clip the extension to pdb
            "".join([_OBE.OBABEL_O, output_path_sdf]),
            "-h",
        ]
        self._openbabel_executor.execute(
            command=_OBE.OBABEL, arguments=arguments, check=True
        )
        self._delay_file_system(path=output_path_sdf)

[docs]    def execute(self):
        # Note: This step only supports one grid at a time, ensemble docking is taken care of at the workflow level!

        # in order to be able to efficiently execute ADV on the enumeration level, all of them have to be unrolled
        # Note: As they retain their respective Compound object, the attribution later on is simple
        all_enumerations = []
        for compound in self.get_compounds():
            all_enumerations = all_enumerations + compound.get_enumerations()
            for enumeration in compound:
                enumeration.clear_conformers()

        # split into sublists, according to the settings
        self._subtask_container = SubtaskContainer(
            max_tries=self.execution.failure_policy.n_tries
        )
        self._subtask_container.load_data(all_enumerations)

        # execute ADV
        self._execute_autodockvina()