import abc
from dataclasses import dataclass, field
from typing import Optional, Union, Literal
import pickle
import numpy as np
import sklearn
import sklearn.cross_decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neighbors
import sklearn.svm
import xgboost
from apischema import schema
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
import optunaz
from optunaz import algorithms
from optunaz.algorithms import chem_prop
from optunaz.algorithms import chem_prop_hyperopt
from optunaz.algorithms import probabilistic_random_forest
from optunaz.algorithms import calibrated_cv
from optunaz.config import (
ModelMode,
OptimizationDirection,
Algorithm as GenericAlg,
Visualization,
)
from optunaz.config.optconfig import (
RegressionScore,
ClassificationScore,
)
from optunaz.datareader import Dataset
from optunaz.descriptors import AnyDescriptor
class Algorithm(GenericAlg):
    @abc.abstractmethod
    def estimator(self) -> BaseEstimator:
        pass
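
# Pattern sketch (illustrative, not part of the module's API surface): every
# concrete Algorithm below pairs a `name` Literal discriminator with a nested
# `parameters` dataclass, and `estimator()` materialises the configured
# scikit-learn-compatible estimator. Values here are invented for demonstration.
#
#   alg = Lasso(name="Lasso", parameters=Lasso.LassoParameters(alpha=0.5))
#   est = alg.estimator()      # -> sklearn.linear_model.Lasso(alpha=0.5, ...)
#   est.fit(X_train, y_train)  # ordinary scikit-learn workflow from here on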

@dataclass
class AdaBoostClassifier(Algorithm):
    @dataclass
    class AdaBoostClassifierParameters:
        n_estimators: int = field(default=1, metadata=schema(min=1))
        learning_rate: float = field(default=0.1, metadata=schema(min=0.0001))

    name: Literal["AdaBoostClassifier"]
    parameters: AdaBoostClassifierParameters

    def estimator(self):
        return sklearn.ensemble.AdaBoostClassifier(
            estimator=None,
            random_state=42,
            n_estimators=self.parameters.n_estimators,
            learning_rate=self.parameters.learning_rate,
            algorithm="SAMME",
        )
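
# The schema() metadata above drives validation when configs are
# (de)serialized with apischema. Sketch using apischema's documented
# deserialize/ValidationError API (the payload is invented):
#
#   from apischema import deserialize, ValidationError
#   try:
#       deserialize(
#           AdaBoostClassifier,
#           {"name": "AdaBoostClassifier",
#            "parameters": {"n_estimators": 0, "learning_rate": 0.1}},
#       )
#   except ValidationError:
#       pass  # rejected: n_estimators=0 violates schema(min=1)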

@dataclass
class Lasso(Algorithm):
    @dataclass
    class LassoParameters:
        alpha: float = field(default=1.0, metadata=schema(min=0))

    name: Literal["Lasso"]
    parameters: LassoParameters

    def estimator(self):
        return sklearn.linear_model.Lasso(alpha=self.parameters.alpha, random_state=42)

@dataclass
class KNeighborsClassifier(Algorithm):
    @dataclass
    class KNeighborsClassifierParameters:
        metric: str
        weights: str
        n_neighbors: int = field(default=5, metadata=schema(min=1))

    name: Literal["KNeighborsClassifier"]
    parameters: KNeighborsClassifierParameters

    def estimator(self):
        return sklearn.neighbors.KNeighborsClassifier(
            metric=self.parameters.metric,
            n_jobs=-1,
            n_neighbors=self.parameters.n_neighbors,
            weights=self.parameters.weights,
        )

@dataclass
class KNeighborsRegressor(Algorithm):
    @dataclass
    class KNeighborsRegressorParameters:
        metric: str
        weights: str
        n_neighbors: int = field(default=5, metadata=schema(min=1))

    name: Literal["KNeighborsRegressor"]
    parameters: KNeighborsRegressorParameters

    def estimator(self):
        return sklearn.neighbors.KNeighborsRegressor(
            metric=self.parameters.metric,
            n_jobs=-1,
            n_neighbors=self.parameters.n_neighbors,
            weights=self.parameters.weights,
        )

@dataclass
class LogisticRegression(Algorithm):
    @dataclass
    class LogisticRegressionParameters:
        solver: str
        C: float = field(default=1.0, metadata=schema(min=0.001, max=1000))

    name: Literal["LogisticRegression"]
    parameters: LogisticRegressionParameters

    def estimator(self):
        return sklearn.linear_model.LogisticRegression(
            penalty="l2",
            random_state=42,
            C=self.parameters.C,
            solver=self.parameters.solver,
            max_iter=100,
            n_jobs=-1,
            class_weight="balanced",
        )

@dataclass
class PLSRegression(Algorithm):
    @dataclass
    class PLSParameters:
        n_components: int = field(default=2, metadata=schema(min=1))

    name: Literal["PLSRegression"]
    parameters: PLSParameters

    def estimator(self):
        return sklearn.cross_decomposition.PLSRegression(
            n_components=self.parameters.n_components
        )

@dataclass
class RandomForestClassifier(Algorithm):
    @dataclass
    class RandomForestParameters:
        max_features: str
        max_depth: Optional[int] = field(default=None, metadata=schema(min=1))
        n_estimators: int = field(default=100, metadata=schema(min=1))

    name: Literal["RandomForestClassifier"]
    parameters: RandomForestParameters

    def estimator(self):
        # max_features="auto" was removed in scikit-learn 1.3; map legacy
        # configs to 1.0 (use all features).
        if self.parameters.max_features == "auto":
            max_features = 1.0
        else:
            max_features = self.parameters.max_features
        return sklearn.ensemble.RandomForestClassifier(
            max_depth=self.parameters.max_depth,
            max_features=max_features,
            n_estimators=self.parameters.n_estimators,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1,
        )

@dataclass
class RandomForestRegressor(Algorithm):
    @dataclass
    class RandomForestParameters:
        max_depth: int = field(metadata=schema(min=1))
        n_estimators: int = field(metadata=schema(min=1))
        max_features: str

    name: Literal["RandomForestRegressor"]
    parameters: RandomForestParameters

    def estimator(self):
        # Same legacy-config shim as RandomForestClassifier above.
        if self.parameters.max_features == "auto":
            max_features = 1.0
        else:
            max_features = self.parameters.max_features
        return sklearn.ensemble.RandomForestRegressor(
            max_depth=self.parameters.max_depth,
            max_features=max_features,
            n_estimators=self.parameters.n_estimators,
            random_state=42,
            n_jobs=-1,
        )

@dataclass
class Ridge(Algorithm):
    @dataclass
    class RidgeParameters:
        alpha: float = field(metadata=schema(min=0))

    name: Literal["Ridge"]
    parameters: RidgeParameters

    def estimator(self):
        return sklearn.linear_model.Ridge(alpha=self.parameters.alpha)

@dataclass
class SVC(Algorithm):
    @dataclass
    class SVCParameters:
        C: float = field(default=1.0, metadata=schema(min=1e-30, max=1e10))
        gamma: float = field(default=1e-4, metadata=schema(min=1e-9, max=1e3))

    name: Literal["SVC"]
    parameters: SVCParameters

    def estimator(self):
        return sklearn.svm.SVC(
            C=self.parameters.C,
            gamma=self.parameters.gamma,
            class_weight="balanced",
            probability=True,
            random_state=42,
        )

@dataclass
class SVR(Algorithm):
    @dataclass
    class SVRParameters:
        C: float = field(metadata=schema(min=1e-30, max=1e10))
        gamma: float = field(metadata=schema(min=1e-9, max=1e3))

    name: Literal["SVR"]
    parameters: SVRParameters

    def estimator(self):
        return sklearn.svm.SVR(C=self.parameters.C, gamma=self.parameters.gamma)

@dataclass
class XGBRegressor(Algorithm):
    @dataclass
    class XGBRegressorParameters:
        max_depth: int = field(metadata=schema(min=1))
        n_estimators: int = field(metadata=schema(min=1))
        learning_rate: float = field(metadata=schema(min=0.0001))

    name: Literal["XGBRegressor"]
    parameters: XGBRegressorParameters

    def estimator(self):
        return xgboost.XGBRegressor(
            max_depth=self.parameters.max_depth,
            n_estimators=self.parameters.n_estimators,
            learning_rate=self.parameters.learning_rate,
            random_state=42,
            reg_lambda=1,
            objective="reg:squarederror",
            subsample=1,
            booster="gbtree",
            verbosity=0,
            n_jobs=-1,
            gamma=1,
        )

@dataclass
class PRFClassifier(Algorithm):
    @dataclass
    class PRFClassifierParameters:
        max_depth: int = field(metadata=schema(min=1))
        n_estimators: int = field(metadata=schema(min=1))
        max_features: str
        use_py_gini: int = field(metadata=schema(default=1, min=0, max=1))
        use_py_leafs: int = field(metadata=schema(default=1, min=0, max=1))
        bootstrap: int = field(default=1, metadata=schema(min=0, max=1))
        new_syn_data_frac: float = field(default=0.0, metadata=schema(min=0))
        min_py_sum_leaf: int = field(default=1, metadata=schema(min=0))

    name: Literal["PRFClassifier"]
    parameters: PRFClassifierParameters

    def estimator(self):
        return optunaz.algorithms.probabilistic_random_forest.PRFClassifier(
            max_depth=self.parameters.max_depth,
            max_features=self.parameters.max_features,
            n_estimators=self.parameters.n_estimators,
            use_py_gini=self.parameters.use_py_gini,
            use_py_leafs=self.parameters.use_py_leafs,
            bootstrap=self.parameters.bootstrap,
            new_syn_data_frac=self.parameters.new_syn_data_frac,
            min_py_sum_leaf=self.parameters.min_py_sum_leaf,
        )

@dataclass
class ChemPropRegressor(Algorithm):
    @dataclass
    class ChemPropRegressorParameters:
        activation: str
        aggregation: str
        # The float-typed discrete fields below (aggregation_norm, batch_size,
        # depth, ffn_hidden_size, ffn_num_layers, hidden_size) work around
        # Optuna's suggest_discrete_uniform returning floats; estimator()
        # casts them back to int.
        aggregation_norm: float = field(metadata=schema(min=1))
        batch_size: float = field(metadata=schema(min=5))
        depth: float = field(metadata=schema(min=2))
        dropout: float = field(metadata=schema(min=0))
        ensemble_size: int = field(metadata=schema(default=1, min=1, max=5))
        epochs: int = field(metadata=schema(default=30, min=4, max=400))
        features_generator: str
        ffn_hidden_size: float = field(metadata=schema(min=300))
        ffn_num_layers: float = field(metadata=schema(min=1))
        final_lr_ratio_exp: int = field(metadata=schema(min=-4))
        hidden_size: float = field(metadata=schema(min=300))
        init_lr_ratio_exp: int = field(metadata=schema(min=-4))
        max_lr_exp: int = field(metadata=schema(min=-6))
        warmup_epochs_ratio: float = field(default=0.1, metadata=schema(min=0, max=0.2))
        aux_weight_pc: int = 100

    name: Literal["ChemPropRegressor"]
    parameters: ChemPropRegressorParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop.ChemPropRegressor(
            activation=self.parameters.activation,
            aggregation=self.parameters.aggregation,
            aggregation_norm=int(self.parameters.aggregation_norm),
            batch_size=int(self.parameters.batch_size),
            depth=int(self.parameters.depth),
            dropout=self.parameters.dropout,
            ensemble_size=self.parameters.ensemble_size,
            epochs=self.parameters.epochs,
            features_generator=self.parameters.features_generator,
            ffn_hidden_size=int(self.parameters.ffn_hidden_size),
            ffn_num_layers=int(self.parameters.ffn_num_layers),
            final_lr_ratio_exp=self.parameters.final_lr_ratio_exp,
            hidden_size=int(self.parameters.hidden_size),
            init_lr_ratio_exp=self.parameters.init_lr_ratio_exp,
            max_lr_exp=self.parameters.max_lr_exp,
            warmup_epochs_ratio=self.parameters.warmup_epochs_ratio,
            aux_weight_pc=self.parameters.aux_weight_pc,
        )
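
# Construction sketch for the float-to-int round trip described above (all
# values are invented; valid activation/aggregation/features_generator strings
# are whatever the underlying ChemProp wrapper accepts):
#
#   params = ChemPropRegressor.ChemPropRegressorParameters(
#       activation="ReLU", aggregation="mean", aggregation_norm=100.0,
#       batch_size=50.0, depth=3.0, dropout=0.0, ensemble_size=1, epochs=30,
#       features_generator="none", ffn_hidden_size=300.0, ffn_num_layers=2.0,
#       final_lr_ratio_exp=-1, hidden_size=300.0, init_lr_ratio_exp=-1,
#       max_lr_exp=-3,
#   )
#   alg = ChemPropRegressor(name="ChemPropRegressor", parameters=params)
#   alg.estimator()  # receives depth=int(3.0)=3, batch_size=50, ...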

@dataclass
class ChemPropClassifier(Algorithm):
    @dataclass
    class ChemPropClassifierParameters:
        activation: str
        aggregation: str
        # Float-typed discrete fields: same suggest_discrete_uniform
        # workaround as ChemPropRegressorParameters above.
        aggregation_norm: float = field(metadata=schema(min=1))
        batch_size: float = field(metadata=schema(min=5))
        depth: float = field(metadata=schema(min=2))
        dropout: float = field(metadata=schema(min=0))
        ensemble_size: int = field(metadata=schema(default=1, min=1, max=5))
        epochs: int = field(metadata=schema(default=30, min=4, max=400))
        features_generator: str
        ffn_hidden_size: float = field(metadata=schema(min=300))
        ffn_num_layers: float = field(metadata=schema(min=1))
        final_lr_ratio_exp: int = field(metadata=schema(min=-4))
        hidden_size: float = field(metadata=schema(min=300))
        init_lr_ratio_exp: int = field(metadata=schema(min=-4))
        max_lr_exp: int = field(metadata=schema(min=-6))
        warmup_epochs_ratio: float = field(default=0.1, metadata=schema(min=0, max=0.2))
        aux_weight_pc: int = 100

    name: Literal["ChemPropClassifier"]
    parameters: ChemPropClassifierParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop.ChemPropClassifier(
            activation=self.parameters.activation,
            aggregation=self.parameters.aggregation,
            aggregation_norm=int(self.parameters.aggregation_norm),
            batch_size=int(self.parameters.batch_size),
            depth=int(self.parameters.depth),
            dropout=self.parameters.dropout,
            ensemble_size=self.parameters.ensemble_size,
            epochs=self.parameters.epochs,
            features_generator=self.parameters.features_generator,
            ffn_hidden_size=int(self.parameters.ffn_hidden_size),
            ffn_num_layers=int(self.parameters.ffn_num_layers),
            final_lr_ratio_exp=self.parameters.final_lr_ratio_exp,
            hidden_size=int(self.parameters.hidden_size),
            init_lr_ratio_exp=self.parameters.init_lr_ratio_exp,
            max_lr_exp=self.parameters.max_lr_exp,
            warmup_epochs_ratio=self.parameters.warmup_epochs_ratio,
            aux_weight_pc=self.parameters.aux_weight_pc,
        )

@dataclass
class ChemPropRegressorPretrained(Algorithm):
    @dataclass
    class ChemPropRegressorPretrainedParameters:
        epochs: int = field(metadata=schema(default=30, min=0, max=400))
        frzn: str
        pretrained_model: str

    name: Literal["ChemPropRegressorPretrained"]
    parameters: ChemPropRegressorPretrainedParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop.ChemPropRegressorPretrained(
            epochs=self.parameters.epochs,
            frzn=self.parameters.frzn,
            pretrained_model=self.parameters.pretrained_model,
        )

@dataclass
class ChemPropHyperoptClassifier(Algorithm):
    @dataclass
    class ChemPropHyperoptClassifierParameters:
        ensemble_size: int = field(metadata=schema(default=1, min=1, max=5))
        epochs: int = field(metadata=schema(default=30, min=4, max=400))
        features_generator: str
        num_iters: int = field(metadata=schema(default=30, min=1, max=50))
        search_parameter_level: str
        aux_weight_pc: int = 100

    name: Literal["ChemPropHyperoptClassifier"]
    parameters: ChemPropHyperoptClassifierParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop_hyperopt.ChemPropHyperoptClassifier(
            ensemble_size=self.parameters.ensemble_size,
            epochs=self.parameters.epochs,
            features_generator=self.parameters.features_generator,
            num_iters=self.parameters.num_iters,
            search_parameter_level=self.parameters.search_parameter_level,
            aux_weight_pc=self.parameters.aux_weight_pc,
        )

@dataclass
class ChemPropHyperoptRegressor(Algorithm):
    @dataclass
    class ChemPropHyperoptRegressorParameters:
        ensemble_size: int = field(metadata=schema(default=1, min=1, max=5))
        epochs: int = field(metadata=schema(default=30, min=4, max=400))
        features_generator: str
        num_iters: int = field(metadata=schema(default=30, min=1, max=50))
        search_parameter_level: str
        aux_weight_pc: int = 100

    name: Literal["ChemPropHyperoptRegressor"]
    parameters: ChemPropHyperoptRegressorParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop_hyperopt.ChemPropHyperoptRegressor(
            ensemble_size=self.parameters.ensemble_size,
            epochs=self.parameters.epochs,
            features_generator=self.parameters.features_generator,
            num_iters=self.parameters.num_iters,
            search_parameter_level=self.parameters.search_parameter_level,
            aux_weight_pc=self.parameters.aux_weight_pc,
        )

@dataclass
class CustomClassificationModel(Algorithm):
    @dataclass
    class CustomClassificationModelParameters:
        preexisting_model: str
        refit_model: int = field(metadata=schema(default=0, min=0, max=1))

    class CustomClassificationEstimator(ClassifierMixin, BaseEstimator):
        def __init__(self, preexisting_model, refit_model):
            self.preexisting_model = preexisting_model
            self.refit_model = refit_model
            self.classes_ = np.unique([0, 1])  # binary labels assumed

        def fit(self, X, y):
            if self.refit_model:
                self.preexisting_model.fit(X, y)
            return self

        def predict(self, X):
            return self.preexisting_model.predict(X)

        def predict_proba(self, X):
            return self.preexisting_model.predict_proba(X)

    name: Literal["CustomClassificationModel"]
    parameters: CustomClassificationModelParameters

    def estimator(self):
        with open(self.parameters.preexisting_model, "rb") as fid:
            preexisting_model = pickle.load(fid)
        from optunaz.model_writer import QSARtunaModel

        # Unwrap QSARtuna model wrappers and previously wrapped custom models.
        if isinstance(preexisting_model, QSARtunaModel):
            preexisting_model = preexisting_model.predictor
        if isinstance(preexisting_model, self.CustomClassificationEstimator):
            preexisting_model = preexisting_model.preexisting_model
        for p in ["predict_proba", "predict"]:
            assert hasattr(
                preexisting_model, p
            ), f"an estimator with '{p}' method must be supplied to CustomClassificationModel"
        return self.CustomClassificationEstimator(
            preexisting_model=preexisting_model, refit_model=self.parameters.refit_model
        )
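
# Usage sketch (hedged: file path, model, and data are invented): wrap a
# previously pickled classifier so it can be reused, optionally refit, inside
# this framework.
#
#   import pickle
#   from sklearn.linear_model import LogisticRegression as SkLogReg
#   with open("my_model.pkl", "wb") as f:
#       pickle.dump(SkLogReg().fit(X_old, y_old), f)
#
#   custom = CustomClassificationModel(
#       name="CustomClassificationModel",
#       parameters=CustomClassificationModel.CustomClassificationModelParameters(
#           preexisting_model="my_model.pkl", refit_model=0,
#       ),
#   )
#   clf = custom.estimator()   # CustomClassificationEstimator wrapper
#   clf.fit(X_new, y_new)      # no-op refit because refit_model=0
#   clf.predict_proba(X_new)   # delegates to the pickled model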

@dataclass
class CustomRegressionModel(Algorithm):
    @dataclass
    class CustomRegressionModelParameters:
        preexisting_model: str
        refit_model: int = field(metadata=schema(default=0, min=0, max=1))

    class CustomRegressionEstimator(RegressorMixin, BaseEstimator):
        def __init__(self, preexisting_model, refit_model):
            self.preexisting_model = preexisting_model
            self.refit_model = refit_model

        def fit(self, X, y):
            if self.refit_model:
                self.preexisting_model.fit(X, y)
            return self

        def predict(self, X):
            return self.preexisting_model.predict(X)

    name: Literal["CustomRegressionModel"]
    parameters: CustomRegressionModelParameters

    def estimator(self):
        with open(self.parameters.preexisting_model, "rb") as fid:
            preexisting_model = pickle.load(fid)
        from optunaz.model_writer import QSARtunaModel

        # Unwrap QSARtuna model wrappers and previously wrapped custom models.
        if isinstance(preexisting_model, QSARtunaModel):
            preexisting_model = preexisting_model.predictor
        if isinstance(preexisting_model, self.CustomRegressionEstimator):
            preexisting_model = preexisting_model.preexisting_model
        assert hasattr(
            preexisting_model, "predict"
        ), "an estimator with 'predict' method must be supplied to CustomRegressionModel"
        return self.CustomRegressionEstimator(
            preexisting_model=preexisting_model, refit_model=self.parameters.refit_model
        )

AnyUncalibratedClassifier = Union[
    AdaBoostClassifier,
    KNeighborsClassifier,
    LogisticRegression,
    RandomForestClassifier,
    SVC,
    ChemPropClassifier,
    ChemPropRegressor,
    ChemPropRegressorPretrained,
    ChemPropHyperoptClassifier,
    ChemPropHyperoptRegressor,
    CustomClassificationModel,
]
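
# The `name` Literal on each class acts as a tagged-union discriminator, so a
# serialized config resolves to the right member of the Union. Sketch using
# apischema's documented deserialize (payload invented):
#
#   from apischema import deserialize
#   alg = deserialize(
#       AnyUncalibratedClassifier,
#       {"name": "LogisticRegression",
#        "parameters": {"solver": "lbfgs", "C": 1.0}},
#   )
#   assert isinstance(alg, LogisticRegression)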

@dataclass
class CalibratedClassifierCVWithVA(Algorithm):
    @dataclass
    class CalibratedClassifierCVParameters:
        n_folds: int = field(metadata=schema(default=2, min=2))
        ensemble: str
        method: str
        estimator: AnyUncalibratedClassifier

    name: Literal["CalibratedClassifierCVWithVA"]
    parameters: CalibratedClassifierCVParameters

    def estimator(self):
        estimator = self.parameters.estimator.estimator()
        # Estimators exposing num_workers (e.g. ChemProp networks) manage
        # their own parallelism, so avoid nesting parallel workers around them.
        if hasattr(estimator, "num_workers"):
            n_jobs = 1
        else:
            n_jobs = -1
        return optunaz.algorithms.calibrated_cv.CalibratedClassifierCVWithVA(
            estimator,
            n_folds=self.parameters.n_folds,
            ensemble=self.parameters.ensemble == "True",  # string flag -> bool
            method=self.parameters.method,
            n_jobs=n_jobs,
        )
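
# Nesting sketch (illustrative values; valid `method`/`ensemble` strings are
# whatever the calibrated_cv wrapper accepts): any AnyUncalibratedClassifier
# member can be wrapped for cross-validated probability calibration.
#
#   base = RandomForestClassifier(
#       name="RandomForestClassifier",
#       parameters=RandomForestClassifier.RandomForestParameters(
#           max_features="sqrt",
#       ),
#   )
#   calibrated = CalibratedClassifierCVWithVA(
#       name="CalibratedClassifierCVWithVA",
#       parameters=CalibratedClassifierCVWithVA.CalibratedClassifierCVParameters(
#           n_folds=5, ensemble="True", method="sigmoid", estimator=base,
#       ),
#   )
#   calibrated.estimator()  # calibrated wrapper around the random forest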

AnyRegression = Union[
    Lasso,
    PLSRegression,
    RandomForestRegressor,
    Ridge,
    KNeighborsRegressor,
    SVR,
    XGBRegressor,
    PRFClassifier,
    ChemPropRegressor,
    ChemPropHyperoptRegressor,
    ChemPropRegressorPretrained,
    CustomRegressionModel,
]

MapieCompatible = Union[
    Lasso,
    PLSRegression,
    RandomForestRegressor,
    KNeighborsRegressor,
    Ridge,
    SVR,
    XGBRegressor,
    PRFClassifier,
    CustomRegressionModel,
]

@dataclass
class Mapie(Algorithm):
    @dataclass
    class MapieParameters:
        mapie_alpha: float = field(metadata=schema(default=0.05, min=0.01, max=0.99))
        estimator: MapieCompatible

    name: Literal["Mapie"]
    parameters: MapieParameters

    def estimator(self):
        from optunaz.algorithms.mapie_uncertainty import MapieWithUncertainty

        return MapieWithUncertainty(
            mapie_alpha=self.parameters.mapie_alpha,
            estimator=self.parameters.estimator.estimator(),
            n_jobs=-1,
        )
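
# Nesting sketch (illustrative values): Mapie wraps a MapieCompatible
# regressor to add conformal-prediction uncertainty at the given alpha.
#
#   inner = Ridge(name="Ridge", parameters=Ridge.RidgeParameters(alpha=1.0))
#   mapie = Mapie(
#       name="Mapie",
#       parameters=Mapie.MapieParameters(mapie_alpha=0.05, estimator=inner),
#   )
#   mapie.estimator()  # MapieWithUncertainty around the Ridge regressor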

AnyAlgorithm = Union[
    AnyUncalibratedClassifier,
    AnyRegression,
    CalibratedClassifierCVWithVA,
    Mapie,
]

AnyChemPropAlgorithm = [
    ChemPropClassifier,
    ChemPropRegressor,
    ChemPropHyperoptClassifier,
    ChemPropHyperoptRegressor,
    ChemPropRegressorPretrained,
]
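
# Note that AnyChemPropAlgorithm is a plain list of classes rather than a
# Union, which makes runtime checks convenient. Hypothetical helper (not part
# of the module):
#
#   def is_chemprop(alg) -> bool:
#       return isinstance(alg, tuple(AnyChemPropAlgorithm))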

@dataclass
class BuildConfig:
    """Build configuration.

    This is the configuration to train a model,
    i.e. to fit a model's parameters given fixed hyperparameters.
    It roughly corresponds to an Optuna Trial.
    """

    @dataclass
    class Settings:
        mode: Optional[ModelMode] = None
        scoring: Union[RegressionScore, ClassificationScore, str, None] = None
        direction: Optional[OptimizationDirection] = None
        n_trials: Optional[int] = field(default=None, metadata=schema(min=0))
        tracking_rest_endpoint: Optional[str] = field(
            default=None,
            metadata=schema(title="URL to track build results using internal format"),
        )

    @dataclass
    class Metadata:
        cross_validation: Optional[int] = field(default=None, metadata=schema(min=0))
        shuffle: Optional[bool] = None
        best_trial: Optional[int] = field(default=None, metadata=schema(min=0))
        best_value: Optional[float] = None
        n_trials: Optional[int] = field(default=None, metadata=schema(min=0))
        visualization: Optional[Visualization] = None

    data: Dataset
    metadata: Optional[Metadata]
    descriptor: AnyDescriptor
    settings: Optional[Settings]
    algorithm: AnyAlgorithm
    task: Literal["building"] = "building"
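
# End-to-end sketch (hedged: the Dataset keyword arguments and descriptor are
# illustrative and must match optunaz.datareader.Dataset and a concrete
# AnyDescriptor in your installation):
#
#   config = BuildConfig(
#       data=Dataset(
#           input_column="canonical",
#           response_column="molwt",
#           training_dataset_file="train.csv",
#       ),
#       metadata=None,
#       descriptor=some_descriptor,  # any AnyDescriptor instance
#       settings=BuildConfig.Settings(mode=ModelMode.REGRESSION),
#       algorithm=Ridge(name="Ridge", parameters=Ridge.RidgeParameters(alpha=1.0)),
#   )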