"""Qptuna support"""
# pylint: disable=import-outside-toplevel, import-error
import json
import logging
from pathlib import Path
from typing import Annotated, Any, Callable, cast
import numpy as np
import pandas as pd
import pytest
from maize.core.node import Node
from maize.core.interface import Input, Output, FileParameter, Suffix, Parameter, Flag
from maize.utilities.testing import TestRig
from maize.utilities.chem import IsomerCollection, Isomer
# Module-level logger used by the free helper functions in this file
log = logging.getLogger("run")

# Column holding the SMILES strings in the CSV files written by this module
SMILES_COLUMN = "smiles"

# Column holding the score (response) values in the CSV files written by this module
TARGET_COLUMN = "score"

# Column read from the qptuna prediction output
PREDICTED_COLUMN = "Prediction"

# File the molecules to predict are written to (resolved against the working directory)
DEFAULT_DATAPATH = Path("data.csv")

# File the patched qptuna configuration is written to
DEFAULT_PATCHED_CONFIG = Path("config-patched.json")
def _mols_to_csv(path: Path, mols: list[IsomerCollection], max_size: int = 1000) -> None:
    """
    Save SMILES - score information to a CSV file, or update one if it exists.

    New molecules are appended to the rows already stored in ``path``, rows with
    a SMILES that is already present are dropped (the first occurrence wins), and
    only the last ``max_size`` rows are written back.

    Parameters
    ----------
    path
        CSV file to create or update
    mols
        Molecules providing the ``smiles`` and ``best_score`` values to store
    max_size
        Maximum number of (most recent) rows to keep in the file

    """
    new = pd.DataFrame(
        {
            SMILES_COLUMN: [mol.smiles for mol in mols],
            TARGET_COLUMN: [mol.best_score for mol in mols],
        }
    )
    if path.exists():
        old = pd.read_csv(path)
        # Files written with a serialised index come back with 'Unnamed: N'
        # columns; drop them so they do not pile up with every update.
        old = old[[col for col in old.columns if not str(col).startswith("Unnamed")]]
        data = pd.concat([old, new]).drop_duplicates(subset=SMILES_COLUMN)
        log.info("Updating pool with %s molecules, %s total", len(new), len(data))
    else:
        data = new

    # `tail` rather than `data[-max_size:]`: the slice returns *all* rows for
    # `max_size == 0` because `-0 == 0`.
    subset = data.tail(max_size)
    log.info("Sending %s molecules to Qptuna", len(subset))

    # The index carries no information, writing it would add a spurious
    # unnamed column on every read / write cycle.
    subset.to_csv(path, index=False)
def _parse_score(isomer: Isomer, value: Any) -> Isomer:
    """Store ``value`` as the only score of the isomer and tag it as a surrogate score"""
    surrogate_scores = np.array([value])
    isomer.scores = surrogate_scores
    isomer.set_tag("score_type", "surrogate")
    return isomer
def _parse_unc(isomer: Isomer, value: Any) -> Isomer:
    """Attach ``value`` to the isomer under the ``uncertainty`` tag"""
    tag_name = "uncertainty"
    isomer.set_tag(tag_name, value)
    return isomer
def _csv_to_mols(
    path: Path, mols: list[IsomerCollection], parser: Callable[[Isomer, Any], Isomer]
) -> list[IsomerCollection]:
    """
    Read qptuna CSV output into existing molecules.

    Parameters
    ----------
    path
        CSV file written by qptuna
    mols
        Molecules to update, in the same order as the rows of the file
    parser
        Function storing a single value from the prediction column in an isomer

    Returns
    -------
    list[IsomerCollection]
        The molecules passed in, updated in place

    Raises
    ------
    KeyError
        If the SMILES, score or prediction column is missing from the file

    """
    data = pd.read_csv(path)

    # Check every column we rely on, including the one that is actually read
    # below, and report exactly which ones are absent.
    required = (SMILES_COLUMN, TARGET_COLUMN, PREDICTED_COLUMN)
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise KeyError(f"Could not find column(s) {missing} in qptuna output")

    # Rows are matched to molecules by position only; `zip` stops at the
    # shorter of the two, and all isomers of a molecule get the same value.
    for mol, value in zip(mols, data[PREDICTED_COLUMN]):
        for isomer in mol.molecules:
            parser(isomer, value)
    return mols
def _patch_config(path: Path, training_data: Path) -> Path:
    """
    Write a copy of a qptuna config whose 'data' section refers to our training data.

    The copy is saved as ``DEFAULT_PATCHED_CONFIG`` and its path is returned;
    the file at ``path`` is only read.

    """
    with path.open() as source:
        patched = json.load(source)

    data_section = {
        "training_dataset_file": training_data.as_posix(),
        "input_column": SMILES_COLUMN,
        "response_column": TARGET_COLUMN,
    }
    patched["data"] = data_section

    target = DEFAULT_PATCHED_CONFIG
    with target.open("w") as sink:
        json.dump(patched, sink)
    return target
def _log_results(path: Path, logger: logging.Logger) -> None:
    """Log the best value, the model name and the model parameters stored in a JSON file"""
    with path.open() as handle:
        results = json.load(handle)

    logger.info("Best value: %s", results["metadata"]["best_value"])
    algorithm = results["algorithm"]
    logger.info("Best model: %s", algorithm["name"])
    for name, setting in algorithm["parameters"].items():
        logger.info(" %s = %s", name, setting)
# [docs] -- Sphinx "view source" link marker left over from the HTML extraction, not code
class QptunaTrain(Node):
    """
    Interface to Qptuna training.

    Notes
    -----
    See the `Qptuna repo <https://github.com/MolecularAI/Qptuna>`_
    for installation instructions. For maize to access it, specify the python interpreter
    and script location (most likely your python environment ``bin`` folder).

    """

    required_callables = ["qptuna-build"]
    """Requires the 'qptuna-build' callable"""

    inp: Input[list[IsomerCollection]] = Input()
    """List of molecules to train the model with"""

    inp_config: Input[Annotated[Path, Suffix("json")]] = Input(cached=True)
    """Qptuna model configuration (``buildconfig``)"""

    out: Output[bool] = Output()
    """Signal emitted for completed training"""

    model: FileParameter[Annotated[Path, Suffix(".pkl")]] = FileParameter(exist_required=False)
    """Merged model output"""

    pool: FileParameter[Annotated[Path, Suffix(".csv")]] = FileParameter(exist_required=False)
    """File to pool training molecules in"""

    n_train: Parameter[int] = Parameter(default=1000)
    """Number of molecules to train with"""

    def run(self) -> None:
        """Patch the received config, add the received molecules to the pool and build the model"""
        # The 'data' section of the config must refer to our pool file
        received_config = self.inp_config.receive()
        self.logger.debug("Received config")
        patched_config = _patch_config(received_config, self.pool.filepath)

        # Output written next to the patched config
        best_model = Path("best.pkl").absolute()

        molecules = self.inp.receive()
        self.logger.info("Updating pool with %s molecules", len(molecules))
        _mols_to_csv(self.pool.filepath, molecules, max_size=self.n_train.value)

        arguments = [
            f"{self.runnable['qptuna-build']}",
            f"--config {patched_config.as_posix()}",
            f"--best-model-outpath {best_model.as_posix()}",
            f"--merged-model-outpath {self.model.filepath.absolute().as_posix()}",
        ]
        self.run_command(" ".join(arguments), verbose=True)
        self.logger.info("Finished training")

        # Downstream nodes only need to know that training has completed
        self.out.send(True)
# [docs] -- Sphinx "view source" link marker left over from the HTML extraction, not code
class QptunaHyper(Node):
    """
    Interface to Qptuna hyperparameter optimisation.

    Notes
    -----
    See the `Qptuna repo <https://github.com/MolecularAI/Qptuna>`_
    for installation instructions. For maize to access it, specify the python interpreter
    and script location (most likely your python environment ``bin`` folder).

    """

    required_callables = ["qptuna-optimize"]
    """Requires the 'qptuna-optimize' callable"""

    inp: Input[list[IsomerCollection]] = Input()
    """List of molecules to train the model with"""

    out: Output[Annotated[Path, Suffix("json")]] = Output()
    """Optimised hyperparameter config"""

    configuration: FileParameter[Annotated[Path, Suffix(".json")]] = FileParameter()
    """Qptuna configuration template"""

    model: FileParameter[Annotated[Path, Suffix(".pkl")]] = FileParameter(exist_required=False)
    """Merged model output"""

    pool: FileParameter[Annotated[Path, Suffix(".csv")]] = FileParameter(exist_required=False)
    """File to pool training molecules in"""

    n_train: Parameter[int] = Parameter(default=1000)
    """Number of molecules to optimize with"""

    def run(self) -> None:
        """Add the received molecules to the pool, run the optimisation and send the best config"""
        # The 'data' section of the template must refer to our pool file
        patched_config = _patch_config(self.configuration.filepath, self.pool.filepath)

        # Files produced by the optimisation
        build_out = Path("build.json").absolute()
        best_model = Path("best.pkl").absolute()

        molecules = self.inp.receive()
        self.logger.info("Updating pool with %s molecules", len(molecules))
        _mols_to_csv(self.pool.filepath, molecules, max_size=self.n_train.value)

        arguments = [
            f"{self.runnable['qptuna-optimize']}",
            f"--config {patched_config.as_posix()}",
            f"--best-buildconfig-outpath {build_out.as_posix()}",
            f"--best-model-outpath {best_model.as_posix()}",
            f"--merged-model-outpath {self.model.filepath.absolute().as_posix()}",
        ]
        self.run_command(" ".join(arguments), verbose=True)
        self.logger.info("Finished hyperparameter optimization")

        _log_results(build_out, self.logger)
        self.out.send(build_out)
# TODO Add `SmilesAndSideInfoFromFile` and / or `PrecomputedDescriptorFromFile`
# [docs] -- Sphinx "view source" link marker left over from the HTML extraction, not code
class QptunaPredict(Node):
    """
    Interface to Qptuna prediction.

    Notes
    -----
    See the `Qptuna repo <https://github.com/MolecularAI/Qptuna>`_
    for installation instructions. For maize to access it, specify the python interpreter
    and script location (most likely your python environment ``bin`` folder).

    """

    required_callables = ["qptuna-predict"]
    """Requires the 'qptuna-predict' callable"""

    inp: Input[list[IsomerCollection]] = Input()
    """List of molecules to predict using the built model"""

    out: Output[list[IsomerCollection]] = Output()
    """List of tagged molecules with predicted values"""

    model: FileParameter[Annotated[Path, Suffix(".pkl")]] = FileParameter(exist_required=False)
    """Merged model output / reading location"""

    uncertainty: Flag = Flag(default=False)
    """Whether to additionally predict uncertainties (not available for all models)"""

    def run(self) -> None:
        """
        Predict values for the received molecules and send the updated molecules on.

        Nothing is received or sent if input is ready but the model file
        does not exist yet; a warning is logged instead.

        """
        data = DEFAULT_DATAPATH.absolute()

        # Outputs
        pred = Path("pred.csv").absolute()

        if self.inp.ready() and not self.model.filepath.exists():
            self.logger.warning("Received data for prediction, but model is not trained yet")
            return

        mols = self.inp.receive()

        # Predictions are matched to molecules by row position, so the input
        # file must hold exactly one row per received molecule, in order.
        # Going through the pooling helper would merge with a stale file from
        # an earlier run and drop repeated SMILES, shifting the rows.
        # The score column is kept so the file layout matches the one the
        # pooling helper writes.
        pd.DataFrame(
            {
                SMILES_COLUMN: [mol.smiles for mol in mols],
                TARGET_COLUMN: [mol.best_score for mol in mols],
            }
        ).to_csv(data, index=False)

        self.logger.info("Predicting %s molecules", len(mols))
        # The trailing space lets further options be appended below
        command = (
            f"{self.runnable['qptuna-predict']} "
            f"--model-file {self.model.filepath.absolute().as_posix()} "
            f"--input-smiles-csv-file {data.as_posix()} "
            f"--input-smiles-csv-column {SMILES_COLUMN} "
            f"--output-prediction-csv-file {pred.as_posix()} "
        )
        self.run_command(command, verbose=True)
        mols = _csv_to_mols(pred, mols, parser=_parse_score)

        if self.uncertainty.value:
            # Second run with the uncertainty option, overwriting the output file.
            # NOTE(review): the same prediction column is read for the
            # uncertainty - confirm this matches the installed qptuna version
            self.logger.info("Predicting uncertainty for %s molecules", len(mols))
            command += "--predict-uncertainty"
            self.run_command(command, verbose=True)
            mols = _csv_to_mols(pred, mols, parser=_parse_unc)

        self.out.send(mols)
@pytest.fixture
def qptuna_example_config(shared_datadir: Path) -> Path:
    """Path of ``qptuna.json`` in the shared test data directory"""
    config_file = shared_datadir / "qptuna.json"
    return config_file
@pytest.fixture
def qptuna_example_config_mapie(shared_datadir: Path) -> Path:
    """Path of ``build-mapie-best.json`` in the shared test data directory"""
    config_file = shared_datadir / "build-mapie-best.json"
    return config_file
@pytest.fixture
def qptuna_pool(tmp_path: Path) -> Path:
    """Path of a ``pool.csv`` file inside the per-test temporary directory"""
    pool_file = tmp_path / "pool.csv"
    return pool_file
# 1UYD ligands (IcolosData)
@pytest.fixture
def train_smiles() -> list[str]:
    """Ten SMILES strings used to build the training molecules"""
    ligands = [
        "Nc1ncnc(c12)n(CCCC#C)c(n2)Cc3cc(OC)c(OC)c(c3Cl)OC",
        "Nc1ncnc(c12)n(CCCC)c(n2)Cc3cc(OC)c(OC)c(c3)OC",
        "Nc1ncnc(c12)n(CCCC)c(n2)Cc3cc(OC)ccc3OC",
        "Nc1ncnc(c12)n(CCCC)c(n2)Cc3cccc(c3)OC",
        "Nc1nc(F)nc(c12)n(CCCC#C)c(n2)Cc3cc(OC)c(OC)c(c3Cl)OC",
        "Nc1ncnc(c12)n(CCCC)c(n2)Cc3ccc(OC)cc3",
        "Nc1ncnc(c12)n(CCCC)c(n2)Cc(cc3)cc(c34)OCO4",
        "Nc1nc(F)nc(c12)n(CCCC)c(n2)Cc3cc(OC)ccc3OC",
        "Nc1nc(F)nc(c12)n(CCCC)c(n2)Cc(cc3)cc(c34)OCO4",
        "Nc1nc(F)nc(c12)n(CCCC#C)c(n2)Cc3cc(OC)ccc3OC",
    ]
    return ligands
@pytest.fixture
def test_smiles() -> list[str]:
    """Five SMILES strings used to build the molecules to predict"""
    ligands = [
        "CC(C)NCCCn(c(c12)nc(F)nc2N)c(n1)Cc(c3)c(I)cc(c34)OCO4",
        "CC(C)NCCCn(c(c12)ncnc2N)c(n1)Sc(c3)c(Br)cc(c34)OCO4",
        "CC(C)NCCCn(c(c12)ncnc2N)c(n1)Sc(c3)c(I)cc(c34)OCO4",
        "COc1ccc(OC)c(c1)Cc(n2)[nH]c(c23)c(N)nc(n3)F",
        "O1COc(c12)cc(Br)c(c2)Cc(nc(n34)c(N)ncc3)c4NCc5ccccc5",
    ]
    return ligands
class TestSuiteQptuna:
    """Tests running the Qptuna nodes one after another on a small set of molecules"""

    @staticmethod
    def _scored_collections(smiles: Any) -> list[IsomerCollection]:
        """Create molecules from SMILES and give every isomer ten random scores"""
        collections = [IsomerCollection.from_smiles(smi) for smi in smiles]
        for collection in collections:
            for isomer in collection.molecules:
                isomer.scores = -10 * np.random.random(10)
        return collections

    @staticmethod
    def _check_predictions(mols: Any, with_uncertainty: bool = False) -> None:
        """Assert that five scored molecules came back, optionally with uncertainties"""
        assert mols is not None
        assert hasattr(mols, "__len__")
        assert len(mols) == 5
        for mol in mols:
            assert mol.scored
            for iso in mol.molecules:
                assert iso.scores is not None
                assert -12 < iso.scores[0] < 2
                if with_uncertainty:
                    assert iso.has_tag("uncertainty")
                    assert 0 < float(cast(float, iso.get_tag("uncertainty"))) < 10

    def test_qptuna(
        self,
        qptuna_example_config: Any,
        train_smiles: Any,
        test_smiles: Any,
        test_config: Any,
        temp_working_dir: Any,
        qptuna_pool: Any,
    ) -> None:
        """Run hyperparameter optimisation, training and prediction in sequence"""
        model_file = Path("model.pkl").absolute()
        training_set = self._scored_collections(train_smiles)
        prediction_set = [IsomerCollection.from_smiles(smi) for smi in test_smiles]

        # Hyperparameter optimisation
        hyper_rig = TestRig(QptunaHyper, config=test_config)
        hyper_result = hyper_rig.setup_run(
            inputs={"inp": [training_set]},
            parameters={
                "configuration": qptuna_example_config,
                "model": model_file,
                "pool": qptuna_pool,
            },
        )
        build_conf = hyper_result["out"].get()
        assert build_conf is not None and build_conf.exists()

        # Training with the optimised build config
        train_rig = TestRig(QptunaTrain, config=test_config)
        train_result = train_rig.setup_run(
            inputs={"inp": [training_set], "inp_config": [build_conf]},
            parameters={"model": model_file, "pool": qptuna_pool},
        )
        assert train_result["out"].get()

        # Prediction without uncertainties
        predict_rig = TestRig(QptunaPredict, config=test_config)
        predict_result = predict_rig.setup_run(
            inputs={"inp": [prediction_set]},
            parameters={"model": model_file, "uncertainty": False},
        )
        self._check_predictions(predict_result["out"].get())

    def test_qptuna_unc(
        self,
        qptuna_example_config_mapie: Any,
        train_smiles: Any,
        test_smiles: Any,
        test_config: Any,
        temp_working_dir: Any,
        qptuna_pool: Any,
    ) -> None:
        """Run training and prediction with uncertainties in sequence"""
        model_file = Path("model.pkl").absolute()
        training_set = self._scored_collections(train_smiles)
        prediction_set = [IsomerCollection.from_smiles(smi) for smi in test_smiles]

        # Training with the provided build config
        train_rig = TestRig(QptunaTrain, config=test_config)
        train_result = train_rig.setup_run(
            inputs={"inp": [training_set], "inp_config": [qptuna_example_config_mapie]},
            parameters={"model": model_file, "pool": qptuna_pool},
        )
        assert train_result["out"].get()

        # Prediction including uncertainties
        predict_rig = TestRig(QptunaPredict, config=test_config)
        predict_result = predict_rig.setup_run(
            inputs={"inp": [prediction_set]},
            parameters={"model": model_file, "uncertainty": True},
        )
        self._check_predictions(predict_result["out"].get(), with_uncertainty=True)