import os
import shutil
import tempfile
from icolos.core.containers.perturbation_map import Edge, Node, PerturbationMap
from icolos.core.workflow_steps.pmx.base import StepPMXBase
from pydantic import BaseModel
from icolos.utils.enums.logging_enums import LoggingConfigEnum
from icolos.utils.enums.program_parameters import (
GromacsEnum,
PMXAtomMappingEnum,
PMXEnum,
)
from icolos.utils.enums.step_enums import StepGromacsEnum
from icolos.utils.execute_external.execute import Executor
from icolos.utils.execute_external.gromacs import GromacsExecutor
_SGE = StepGromacsEnum()
_GE = GromacsEnum()
_PE = PMXEnum()
_PAE = PMXAtomMappingEnum()
_LE = LoggingConfigEnum()
[docs]class StepPMXmutate(StepPMXBase, BaseModel):
"""This is the de facto entrypoint for a protein FEP workflow
This step does the legwork for setting up the output dir structure, similar to pmx_setup for the small molecule case
"""
def __init__(self, **data):
super().__init__(**data)
self._initialize_backend(Executor)
[docs] def execute(self):
# start with a pdb file, parametrize with pdb2gmx
replicas = (
self.settings.additional["replicas"]
if "replicas" in self.settings.additional.keys()
else 3
)
assert self.work_dir is not None
# this is a bit awkward putting them in the ligand dir, we adopt the same structure as the small molecule approach
os.makedirs(os.path.join(self.work_dir, "input"), exist_ok=True)
for folder in ["ligands", "mdp", "protein"]:
os.makedirs(os.path.join(self.work_dir, "input", folder), exist_ok=True)
setup_dir = os.path.join(self.work_dir, _PAE.LIGAND_DIR)
self.data.generic.write_out_all_files(setup_dir)
mdp_dir = self.data.generic.get_argument_by_extension(
ext="mdp", rtn_file_object=True
)
mdp_dir.write(os.path.join(self.work_dir, "input/mdp"))
# run pdb2gmx on the protein, generate a good input file
pdb2gmx_args = [
"-f",
self.data.generic.get_argument_by_extension("pdb"),
"-water",
self.settings.additional["water"],
"-ff",
self.settings.additional["forcefield"],
"-o",
# use pdb here to retain chain IDs for pmx mutate
os.path.join(setup_dir, "wt.pdb"),
"-ignh",
]
# run the
self._gromacs_executor.execute(
command=_GE.PDB2GMX, arguments=pdb2gmx_args, check=True, location=setup_dir
)
# now we have the prepared structure to pass to pmx mutate
# parse mutant file, note that pdb2gmx will remove chain IDs
with open(
os.path.join(setup_dir, self.data.generic.get_argument_by_extension("mut")),
"r",
) as f:
mut_lines = [l for l in f.readlines() if l.strip()]
perturbation_map = PerturbationMap()
wt_node = Node(node_id="wt", node_hash="wt")
perturbation_map.add_node(wt_node)
# all we need is the identities of the mutants
# in the form "P 76 TYR\n X 45 GLY"
for mut in mut_lines:
chain, resid, new_res = mut.split()
edge_dir = os.path.join(self.work_dir, f"wt_{chain}{resid}{new_res}")
os.makedirs(
edge_dir,
exist_ok=True,
)
for branch in self.therm_cycle_branches:
for state in self.states:
for r in range(1, replicas + 1):
for sim in self.sim_types:
os.makedirs(
os.path.join(
self.work_dir,
edge_dir,
branch,
state,
f"run{r}",
sim,
),
exist_ok=True,
)
mutate_args = [
"-f",
os.path.join(setup_dir, "wt.pdb"),
"--ref",
os.path.join(setup_dir, "wt.pdb"),
"-o",
f"{chain}{resid}{new_res}.pdb",
"-ff",
self._get_additional_setting(_SGE.FORCEFIELD),
]
self._logger.log(
f"Generating mutation {chain}{resid}->{new_res}", _LE.DEBUG
)
result = self._backend_executor.execute(
command=_PE.MUTATE,
arguments=mutate_args,
location=os.path.join(edge_dir, "bound"),
check=True,
pipe_input=f'echo -e "{chain}\n{resid}\n{new_res}\nn"',
)
# run pdb2gmx on the resulting structure, resulting structures are for the bound states
pdb2gmx_args = [
"-f",
f"{chain}{resid}{new_res}.pdb",
"-water",
self.settings.additional["water"],
"-ff",
self.settings.additional["forcefield"],
"-o",
"init.pdb",
]
self._backend_executor.execute(
command=_GE.PDB2GMX,
arguments=pdb2gmx_args,
location=os.path.join(edge_dir, "bound"),
check=True,
)
# now make the directory in the main workdir to for that mutation and copy the input files over
# now we have a single mutated structure file per system.
node = Node(
node_id=f"{chain}{resid}{new_res}",
node_hash=f"{chain}{resid}{new_res}",
)
perturbation_map.add_node(node)
perturbation_map.add_edge(Edge(node_from=wt_node, node_to=node))
# generate connectivity (just a star map), now we have an initialized perturbation map, relying on file paths only, not attached structures for now
for node in perturbation_map.nodes:
perturbation_map._attach_node_connectivity(node)
self.get_workflow_object().set_perturbation_map(perturbation_map)
# TODO: we need to generate mutatinos for the bound state and the unbound state, by separating the pdb file
# we have the chain being mutated, extract this to its own file
# create the index file
make_ndx_args = ["-f", "init.pdb"]
self._backend_executor.execute(
command=_GE.MAKE_NDX,
arguments=make_ndx_args,
location=os.path.join(edge_dir, "bound"),
pipe_input=f'echo -e "chain {chain}\nq"',
check=True,
)
# index.ndx in the bound state directory
editconf_args = [
"-f",
"init.pdb",
"-n",
"index.ndx",
"-o",
os.path.join(
edge_dir, "unbound", f"{chain}{resid}{new_res}_unbound.pdb"
),
]
self._backend_executor.execute(
command=_GE.EDITCONF,
arguments=editconf_args,
location=os.path.join(edge_dir, "bound"),
check=True,
pipe_input=f'echo -e "ch{chain}\n"',
)
# generate topology for the unbound state
pdb2gmx_args = [
"-f",
f"{chain}{resid}{new_res}_unbound.pdb",
"-water",
self.settings.additional["water"],
"-ff",
self.settings.additional["forcefield"],
"-o",
"init.pdb",
]
self._backend_executor.execute(
command=_GE.PDB2GMX,
arguments=pdb2gmx_args,
location=os.path.join(edge_dir, "unbound"),
check=True,
)
help_string = """
pmx mutate -h
usage: pmx [-h] [-f infile] [-fB infileB] [-o outfile] [-ff ff]
[--script script] [--keep_resid | --ref ] [--resinfo]
This script applies mutations of residues in a structure file for subsequent
free energy calculations. It supports mutations to protein, DNA, and RNA
molecules.
The mutation information and dummy placements are taken from the hybrid residue
database "mutres.mtp". The best way to use this script is to take a pdb/gro file
that has been written with pdb2gmx with all hydrogen atoms present.
By default, all residues are renumbered starting from 1, so to have unique
residue IDs. If you want to keep the original residue IDs, you can use the flag
--keep_resid. In this case, you will also need to provide chain information
in order to be able to mutate the desired residue. Alternatively, if you would
like to use the original residue IDs but these have been changed, e.g. by gromacs,
you can provide a reference PDB file (with chain information too) using the --ref
flag. The input structure will be mutated according to the IDs chosen for the
reference structure after having mapped the two residue indices.
The program can either be executed interactively or via script. The script file
simply has to consist of "residue_id target_residue_name" pairs (just with some
space between the id and the name), or "chain_id residue_id target_residue_name"
if you are keeping the original residue IDs or providing a reference structure.
The script uses an extended one-letter code for amino acids to account for
different protonation states. Use the --resinfo flag to print the dictionary.
optional arguments:
-h, --help show this help message and exit
-f infile Input structure file in PDB or GRO format. Default is "protein.pdb"
-fB infileB Input structure file of the B state in PDB or GRO format (optional).
-o outfile Output structure file in PDB or GRO format. Default is "mutant.pdb"
-ff ff Force field to use. If none is provided,
a list of available ff will be shown.
--script script Text file with list of mutations (optional).
--keep_resid Whether to renumber all residues or to keep the
original residue IDs. By default, all residues are
renumbered so to have unique IDs. With this flags set,
the original IDs are kept. Because the IDs might not
be unique anymore, you will also be asked to choose
the chain ID where the residue you want to mutate is.
--ref Provide a reference PDB structure from which to map
the chain and residue IDs onto the file to be mutated (-f).
This can be useful when wanting to mutate a file that
has had its residues renumbered or the chain information
removed (e.g. after gmx grompp). As in the --keep_resid
option, if --ref is chosen, you will need to provide chain
information either interactively or via the --script flag.
--resinfo Show the list of 3-letter -> 1-letter residues
"""