Source code for icolos.core.workflow_steps.pmx.mutate

import os
import shutil
import tempfile
from icolos.core.containers.perturbation_map import Edge, Node, PerturbationMap
from icolos.core.workflow_steps.pmx.base import StepPMXBase
from pydantic import BaseModel
from icolos.utils.enums.logging_enums import LoggingConfigEnum
from icolos.utils.enums.program_parameters import (
    GromacsEnum,
    PMXAtomMappingEnum,
    PMXEnum,
)
from icolos.utils.enums.step_enums import StepGromacsEnum
from icolos.utils.execute_external.execute import Executor

from icolos.utils.execute_external.gromacs import GromacsExecutor

_SGE = StepGromacsEnum()
_GE = GromacsEnum()
_PE = PMXEnum()
_PAE = PMXAtomMappingEnum()
_LE = LoggingConfigEnum()


[docs]class StepPMXmutate(StepPMXBase, BaseModel):
    """This is the de facto entrypoint for a protein FEP workflow

    This step does the legwork for setting up the output dir structure, similar to pmx_setup for the small molecule case
    """

    def __init__(self, **data):
        super().__init__(**data)
        self._initialize_backend(Executor)

[docs]    def execute(self):
        # start with a pdb file, parametrize with pdb2gmx
        replicas = (
            self.settings.additional["replicas"]
            if "replicas" in self.settings.additional.keys()
            else 3
        )
        assert self.work_dir is not None

        # this is a bit awkward putting them in the ligand dir, we adopt the same structure as the small molecule approach
        os.makedirs(os.path.join(self.work_dir, "input"), exist_ok=True)
        for folder in ["ligands", "mdp", "protein"]:
            os.makedirs(os.path.join(self.work_dir, "input", folder), exist_ok=True)
        setup_dir = os.path.join(self.work_dir, _PAE.LIGAND_DIR)
        self.data.generic.write_out_all_files(setup_dir)

        mdp_dir = self.data.generic.get_argument_by_extension(
            ext="mdp", rtn_file_object=True
        )
        mdp_dir.write(os.path.join(self.work_dir, "input/mdp"))

        # run pdb2gmx on the protein, generate a good input file
        pdb2gmx_args = [
            "-f",
            self.data.generic.get_argument_by_extension("pdb"),
            "-water",
            self.settings.additional["water"],
            "-ff",
            self.settings.additional["forcefield"],
            "-o",
            # use pdb here to retain chain IDs for pmx mutate
            os.path.join(setup_dir, "wt.pdb"),
            "-ignh",
        ]
        # run the
        self._gromacs_executor.execute(
            command=_GE.PDB2GMX, arguments=pdb2gmx_args, check=True, location=setup_dir
        )
        # now we have the prepared structure to pass to pmx mutate

        # parse mutant file, note that pdb2gmx will remove chain IDs
        with open(
            os.path.join(setup_dir, self.data.generic.get_argument_by_extension("mut")),
            "r",
        ) as f:
            mut_lines = [l for l in f.readlines() if l.strip()]

        perturbation_map = PerturbationMap()
        wt_node = Node(node_id="wt", node_hash="wt")
        perturbation_map.add_node(wt_node)
        # all we need is the identities of the mutants

        # in the form "P 76 TYR\n X 45 GLY"
        for mut in mut_lines:
            chain, resid, new_res = mut.split()
            edge_dir = os.path.join(self.work_dir, f"wt_{chain}{resid}{new_res}")
            os.makedirs(
                edge_dir,
                exist_ok=True,
            )
            for branch in self.therm_cycle_branches:
                for state in self.states:
                    for r in range(1, replicas + 1):
                        for sim in self.sim_types:
                            os.makedirs(
                                os.path.join(
                                    self.work_dir,
                                    edge_dir,
                                    branch,
                                    state,
                                    f"run{r}",
                                    sim,
                                ),
                                exist_ok=True,
                            )
            mutate_args = [
                "-f",
                os.path.join(setup_dir, "wt.pdb"),
                "--ref",
                os.path.join(setup_dir, "wt.pdb"),
                "-o",
                f"{chain}{resid}{new_res}.pdb",
                "-ff",
                self._get_additional_setting(_SGE.FORCEFIELD),
            ]
            self._logger.log(
                f"Generating mutation {chain}{resid}->{new_res}", _LE.DEBUG
            )
            result = self._backend_executor.execute(
                command=_PE.MUTATE,
                arguments=mutate_args,
                location=os.path.join(edge_dir, "bound"),
                check=True,
                pipe_input=f'echo -e "{chain}\n{resid}\n{new_res}\nn"',
            )

            # run pdb2gmx on the resulting structure, resulting structures are for the bound states

            pdb2gmx_args = [
                "-f",
                f"{chain}{resid}{new_res}.pdb",
                "-water",
                self.settings.additional["water"],
                "-ff",
                self.settings.additional["forcefield"],
                "-o",
                "init.pdb",
            ]
            self._backend_executor.execute(
                command=_GE.PDB2GMX,
                arguments=pdb2gmx_args,
                location=os.path.join(edge_dir, "bound"),
                check=True,
            )
            # now make the directory in the main workdir to  for that mutation and copy the input files over

            # now we have a single mutated structure file per system.
            node = Node(
                node_id=f"{chain}{resid}{new_res}",
                node_hash=f"{chain}{resid}{new_res}",
            )
            perturbation_map.add_node(node)
            perturbation_map.add_edge(Edge(node_from=wt_node, node_to=node))

            # generate connectivity (just a star map), now we have an initialized perturbation map, relying on file paths only, not attached structures for now
            for node in perturbation_map.nodes:
                perturbation_map._attach_node_connectivity(node)

            self.get_workflow_object().set_perturbation_map(perturbation_map)

            # TODO: we need to generate mutatinos for the bound state and the unbound state, by separating the pdb file

            # we have the chain being mutated, extract this to its own file

            # create the index file
            make_ndx_args = ["-f", "init.pdb"]
            self._backend_executor.execute(
                command=_GE.MAKE_NDX,
                arguments=make_ndx_args,
                location=os.path.join(edge_dir, "bound"),
                pipe_input=f'echo -e "chain {chain}\nq"',
                check=True,
            )
            # index.ndx in the bound state directory

            editconf_args = [
                "-f",
                "init.pdb",
                "-n",
                "index.ndx",
                "-o",
                os.path.join(
                    edge_dir, "unbound", f"{chain}{resid}{new_res}_unbound.pdb"
                ),
            ]
            self._backend_executor.execute(
                command=_GE.EDITCONF,
                arguments=editconf_args,
                location=os.path.join(edge_dir, "bound"),
                check=True,
                pipe_input=f'echo -e "ch{chain}\n"',
            )

            # generate topology for the unbound state
            pdb2gmx_args = [
                "-f",
                f"{chain}{resid}{new_res}_unbound.pdb",
                "-water",
                self.settings.additional["water"],
                "-ff",
                self.settings.additional["forcefield"],
                "-o",
                "init.pdb",
            ]
            self._backend_executor.execute(
                command=_GE.PDB2GMX,
                arguments=pdb2gmx_args,
                location=os.path.join(edge_dir, "unbound"),
                check=True,
            )


help_string = """
pmx mutate -h
usage: pmx [-h] [-f infile] [-fB infileB] [-o outfile] [-ff ff]                                    
           [--script script] [--keep_resid | --ref ] [--resinfo]                                   
                                                                                                   
This script applies mutations of residues in a structure file for subsequent                                          
free energy calculations. It supports mutations to protein, DNA, and RNA                           
molecules.                                                                                         
                                                                          
The mutation information and dummy placements are taken from the hybrid residue
database "mutres.mtp". The best way to use this script is to take a pdb/gro file                                                                    
that has been written with pdb2gmx with all hydrogen atoms present.
                                                                          
By default, all residues are renumbered starting from 1, so to have unique
residue IDs. If you want to keep the original residue IDs, you can use the flag                                                                     
--keep_resid. In this case, you will also need to provide chain information                                                                         
in order to be able to mutate the desired residue. Alternatively, if you would                                                                      
like to use the original residue IDs but these have been changed, e.g. by gromacs,
you can provide a reference PDB file (with chain information too) using the --ref
flag. The input structure will be mutated according to the IDs chosen for the
reference structure after having mapped the two residue indices.

The program can either be executed interactively or via script. The script file
simply has to consist of "residue_id target_residue_name" pairs (just with some
space between the id and the name), or "chain_id residue_id target_residue_name"
if you are keeping the original residue IDs or providing a reference structure.

The script uses an extended one-letter code for amino acids to account for
different protonation states. Use the --resinfo flag to print the dictionary.

optional arguments:
  -h, --help       show this help message and exit
  -f infile        Input structure file in PDB or GRO format. Default is "protein.pdb"
  -fB infileB      Input structure file of the B state in PDB or GRO format (optional).
  -o outfile       Output structure file in PDB or GRO format. Default is "mutant.pdb"
  -ff ff           Force field to use. If none is provided, 
                   a list of available ff will be shown.
  --script script  Text file with list of mutations (optional).
  --keep_resid     Whether to renumber all residues or to keep the
                   original residue IDs. By default, all residues are
                   renumbered so to have unique IDs. With this flags set,
                   the original IDs are kept. Because the IDs might not
                   be unique anymore, you will also be asked to choose
                   the chain ID where the residue you want to mutate is.
  --ref            Provide a reference PDB structure from which to map
                   the chain and residue IDs onto the file to be mutated (-f).
                   This can be useful when wanting to mutate a file that
                   has had its residues renumbered or the chain information
                   removed (e.g. after gmx grompp). As in the --keep_resid
                   option, if --ref is chosen, you will need to provide chain
                   information either interactively or via the --script flag.
  --resinfo        Show the list of 3-letter -> 1-letter residues
"""