Source code for optunaz.config.buildconfig

import abc
from dataclasses import dataclass, field
from typing import Optional, Union, Literal
import pickle

import numpy as np
import sklearn
import sklearn.cross_decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neighbors
import sklearn.svm
import xgboost
from apischema import schema
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin

import optunaz
from optunaz import algorithms
from optunaz.algorithms import chem_prop
from optunaz.algorithms import chem_prop_hyperopt
from optunaz.algorithms import probabilistic_random_forest
from optunaz.algorithms import calibrated_cv
from optunaz.config import (
    ModelMode,
    OptimizationDirection,
    Algorithm as GenericAlg,
    Visualization,
)
from optunaz.config.optconfig import (
    RegressionScore,
    ClassificationScore,
)
from optunaz.datareader import Dataset
from optunaz.descriptors import AnyDescriptor


class Algorithm(GenericAlg):
    @abc.abstractmethod
    def estimator(self) -> BaseEstimator:
        pass


@dataclass
class AdaBoostClassifier(Algorithm):
    @dataclass
    class AdaBoostClassifierParameters:
        n_estimators: int = field(default=1, metadata=schema(min=1))
        learning_rate: float = field(default=0.1, metadata=schema(min=0.0001))

    name: Literal["AdaBoostClassifier"]
    parameters: AdaBoostClassifierParameters

    def estimator(self):
        return sklearn.ensemble.AdaBoostClassifier(
            estimator=None,
            random_state=42,
            n_estimators=self.parameters.n_estimators,
            learning_rate=self.parameters.learning_rate,
            algorithm="SAMME",
        )


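# Example (illustrative): build a config object and obtain the unfitted
# scikit-learn estimator it describes. All wrapper classes in this module
# follow this same pattern:
#
#   config = AdaBoostClassifier(
#       name="AdaBoostClassifier",
#       parameters=AdaBoostClassifier.AdaBoostClassifierParameters(
#           n_estimators=10, learning_rate=0.5
#       ),
#   )
#   clf = config.estimator()  # sklearn.ensemble.AdaBoostClassifier instance

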
@dataclass
class Lasso(Algorithm):
    @dataclass
    class LassoParameters:
        alpha: float = field(default=1.0, metadata=schema(min=0))

    name: Literal["Lasso"]
    parameters: LassoParameters

    def estimator(self):
        return sklearn.linear_model.Lasso(alpha=self.parameters.alpha, random_state=42)


@dataclass
class KNeighborsClassifier(Algorithm):
    @dataclass
    class KNeighborsClassifierParameters:
        metric: str
        weights: str
        n_neighbors: int = field(default=5, metadata=schema(min=1))

    name: Literal["KNeighborsClassifier"]
    parameters: KNeighborsClassifierParameters

    def estimator(self):
        return sklearn.neighbors.KNeighborsClassifier(
            metric=self.parameters.metric,
            n_jobs=-1,
            n_neighbors=self.parameters.n_neighbors,
            weights=self.parameters.weights,
        )


@dataclass
class KNeighborsRegressor(Algorithm):
    @dataclass
    class KNeighborsRegressorParameters:
        metric: str
        weights: str
        n_neighbors: int = field(default=5, metadata=schema(min=1))

    name: Literal["KNeighborsRegressor"]
    parameters: KNeighborsRegressorParameters

    def estimator(self):
        return sklearn.neighbors.KNeighborsRegressor(
            metric=self.parameters.metric,
            n_jobs=-1,
            n_neighbors=self.parameters.n_neighbors,
            weights=self.parameters.weights,
        )


@dataclass
class LogisticRegression(Algorithm):
    @dataclass
    class LogisticRegressionParameters:
        solver: str
        C: float = field(default=1.0, metadata=schema(min=0.001, max=1000))

    name: Literal["LogisticRegression"]
    parameters: LogisticRegressionParameters

    def estimator(self):
        return sklearn.linear_model.LogisticRegression(
            penalty="l2",
            random_state=42,
            C=self.parameters.C,
            solver=self.parameters.solver,
            max_iter=100,
            n_jobs=-1,
            class_weight="balanced",
        )


@dataclass
class PLSRegression(Algorithm):
    @dataclass
    class PLSParameters:
        n_components: int = field(default=2, metadata=schema(min=1))

    name: Literal["PLSRegression"]
    parameters: PLSParameters

    def estimator(self):
        return sklearn.cross_decomposition.PLSRegression(
            n_components=self.parameters.n_components
        )


@dataclass
class RandomForestClassifier(Algorithm):
    @dataclass
    class RandomForestParameters:
        max_features: str
        max_depth: int = field(default=None, metadata=schema(min=1))
        n_estimators: int = field(default=100, metadata=schema(min=1))

    name: Literal["RandomForestClassifier"]
    parameters: RandomForestParameters

    def estimator(self):
        # scikit-learn no longer accepts max_features="auto"; map it to 1.0
        # (consider all features) and pass any other value through unchanged.
        if self.parameters.max_features == "auto":
            max_features = 1.0
        else:
            max_features = self.parameters.max_features
        return sklearn.ensemble.RandomForestClassifier(
            max_depth=self.parameters.max_depth,
            max_features=max_features,
            n_estimators=self.parameters.n_estimators,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1,
        )


@dataclass
class RandomForestRegressor(Algorithm):
    @dataclass
    class RandomForestParameters:
        max_depth: int = field(metadata=schema(min=1))
        n_estimators: int = field(metadata=schema(min=1))
        max_features: str

    name: Literal["RandomForestRegressor"]
    parameters: RandomForestParameters

    def estimator(self):
        # scikit-learn no longer accepts max_features="auto"; map it to 1.0.
        if self.parameters.max_features == "auto":
            max_features = 1.0
        else:
            max_features = self.parameters.max_features
        return sklearn.ensemble.RandomForestRegressor(
            max_depth=self.parameters.max_depth,
            max_features=max_features,
            n_estimators=self.parameters.n_estimators,
            random_state=42,
            n_jobs=-1,
        )


@dataclass
class Ridge(Algorithm):
    @dataclass
    class RidgeParameters:
        alpha: float = field(metadata=schema(min=0))

    name: Literal["Ridge"]
    parameters: RidgeParameters

    def estimator(self):
        return sklearn.linear_model.Ridge(alpha=self.parameters.alpha)


@dataclass
class SVC(Algorithm):
    @dataclass
    class SVCParameters:
        C: float = field(default=1.0, metadata=schema(min=1e-30, max=1e10))
        gamma: float = field(default=1e-4, metadata=schema(min=1e-9, max=1e3))

    name: Literal["SVC"]
    parameters: SVCParameters

    def estimator(self):
        return sklearn.svm.SVC(
            C=self.parameters.C,
            gamma=self.parameters.gamma,
            class_weight="balanced",
            probability=True,
            random_state=42,
        )


@dataclass
class SVR(Algorithm):
    @dataclass
    class SVRParameters:
        C: float = field(metadata=schema(min=1e-30, max=1e10))
        gamma: float = field(metadata=schema(min=1e-9, max=1e3))

    name: Literal["SVR"]
    parameters: SVRParameters

    def estimator(self):
        return sklearn.svm.SVR(C=self.parameters.C, gamma=self.parameters.gamma)


@dataclass
class XGBRegressor(Algorithm):
    @dataclass
    class XGBRegressorParameters:
        max_depth: int = field(metadata=schema(min=1))
        n_estimators: int = field(metadata=schema(min=1))
        learning_rate: float = field(metadata=schema(min=0.0001))

    name: Literal["XGBRegressor"]
    parameters: XGBRegressorParameters

    def estimator(self):
        return xgboost.XGBRegressor(
            max_depth=self.parameters.max_depth,
            n_estimators=self.parameters.n_estimators,
            learning_rate=self.parameters.learning_rate,
            random_state=42,
            reg_lambda=1,
            objective="reg:squarederror",
            subsample=1,
            booster="gbtree",
            verbosity=0,
            n_jobs=-1,
            gamma=1,
        )


@dataclass
class PRFClassifier(Algorithm):
    @dataclass
    class PRFClassifierParameters:
        max_depth: int = field(metadata=schema(min=1))
        n_estimators: int = field(metadata=schema(min=1))
        max_features: str
        use_py_gini: int = field(metadata=schema(default=1, min=0, max=1))
        use_py_leafs: int = field(metadata=schema(default=1, min=0, max=1))
        bootstrap: int = field(default=1, metadata=schema(min=0, max=1))
        new_syn_data_frac: float = field(default=0.0, metadata=schema(min=0))
        min_py_sum_leaf: int = field(default=1, metadata=schema(min=0))

    name: Literal["PRFClassifier"]
    parameters: PRFClassifierParameters

    def estimator(self):
        return optunaz.algorithms.probabilistic_random_forest.PRFClassifier(
            max_depth=self.parameters.max_depth,
            max_features=self.parameters.max_features,
            n_estimators=self.parameters.n_estimators,
            use_py_gini=self.parameters.use_py_gini,
            use_py_leafs=self.parameters.use_py_leafs,
            bootstrap=self.parameters.bootstrap,
            new_syn_data_frac=self.parameters.new_syn_data_frac,
            min_py_sum_leaf=self.parameters.min_py_sum_leaf,
        )


@dataclass
class ChemPropRegressor(Algorithm):
    @dataclass
    class ChemPropRegressorParameters:
        activation: str
        aggregation: str
        aggregation_norm: float = field(
            metadata=schema(min=1)
        )  # float: suggest_discrete_uniform fix
        batch_size: float = field(
            metadata=schema(min=5)
        )  # float: suggest_discrete_uniform fix
        depth: float = field(
            metadata=schema(min=2)
        )  # float: suggest_discrete_uniform fix
        dropout: float = field(metadata=schema(min=0))
        ensemble_size: int = field(metadata=schema(default=1, min=1, max=5))
        epochs: int = field(metadata=schema(default=30, min=4, max=400))
        features_generator: str
        ffn_hidden_size: float = field(
            metadata=schema(min=300)
        )  # float: suggest_discrete_uniform fix
        ffn_num_layers: float = field(
            metadata=schema(min=1)
        )  # float: suggest_discrete_uniform fix
        final_lr_ratio_exp: int = field(metadata=schema(min=-4))
        hidden_size: float = field(
            metadata=schema(min=300)
        )  # float: suggest_discrete_uniform fix
        init_lr_ratio_exp: int = field(metadata=schema(min=-4))
        max_lr_exp: int = field(metadata=schema(min=-6))
        warmup_epochs_ratio: float = field(default=0.1, metadata=schema(min=0, max=0.2))
        aux_weight_pc: int = 100

    name: Literal["ChemPropRegressor"]
    parameters: ChemPropRegressorParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop.ChemPropRegressor(
            activation=self.parameters.activation,
            aggregation=self.parameters.aggregation,
            aggregation_norm=int(self.parameters.aggregation_norm),
            batch_size=int(self.parameters.batch_size),
            depth=int(self.parameters.depth),
            dropout=self.parameters.dropout,
            ensemble_size=self.parameters.ensemble_size,
            epochs=self.parameters.epochs,
            features_generator=self.parameters.features_generator,
            ffn_hidden_size=int(self.parameters.ffn_hidden_size),
            ffn_num_layers=int(self.parameters.ffn_num_layers),
            final_lr_ratio_exp=self.parameters.final_lr_ratio_exp,
            hidden_size=int(self.parameters.hidden_size),
            init_lr_ratio_exp=self.parameters.init_lr_ratio_exp,
            max_lr_exp=self.parameters.max_lr_exp,
            warmup_epochs_ratio=self.parameters.warmup_epochs_ratio,
            aux_weight_pc=self.parameters.aux_weight_pc,
        )


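# Note: aggregation_norm, batch_size, depth, ffn_hidden_size, ffn_num_layers
# and hidden_size are declared as floats because Optuna's (deprecated)
# suggest_discrete_uniform returns floats; estimator() casts them back to int
# before handing them to ChemProp. The same convention applies to
# ChemPropClassifier below.

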
@dataclass
class ChemPropClassifier(Algorithm):
    @dataclass
    class ChemPropClassifierParameters:
        activation: str
        aggregation: str
        aggregation_norm: float = field(
            metadata=schema(min=1)
        )  # float: suggest_discrete_uniform fix
        batch_size: float = field(
            metadata=schema(min=5)
        )  # float: suggest_discrete_uniform fix
        depth: float = field(
            metadata=schema(min=2)
        )  # float: suggest_discrete_uniform fix
        dropout: float = field(metadata=schema(min=0))
        ensemble_size: int = field(metadata=schema(default=1, min=1, max=5))
        epochs: int = field(metadata=schema(default=30, min=4, max=400))
        features_generator: str
        ffn_hidden_size: float = field(
            metadata=schema(min=300)
        )  # float: suggest_discrete_uniform fix
        ffn_num_layers: float = field(
            metadata=schema(min=1)
        )  # float: suggest_discrete_uniform fix
        final_lr_ratio_exp: int = field(metadata=schema(min=-4))
        hidden_size: float = field(
            metadata=schema(min=300)
        )  # float: suggest_discrete_uniform fix
        init_lr_ratio_exp: int = field(metadata=schema(min=-4))
        max_lr_exp: int = field(metadata=schema(min=-6))
        warmup_epochs_ratio: float = field(default=0.1, metadata=schema(min=0, max=0.2))
        aux_weight_pc: int = 100

    name: Literal["ChemPropClassifier"]
    parameters: ChemPropClassifierParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop.ChemPropClassifier(
            activation=self.parameters.activation,
            aggregation=self.parameters.aggregation,
            aggregation_norm=int(self.parameters.aggregation_norm),
            batch_size=int(self.parameters.batch_size),
            depth=int(self.parameters.depth),
            dropout=self.parameters.dropout,
            ensemble_size=self.parameters.ensemble_size,
            epochs=self.parameters.epochs,
            features_generator=self.parameters.features_generator,
            ffn_hidden_size=int(self.parameters.ffn_hidden_size),
            ffn_num_layers=int(self.parameters.ffn_num_layers),
            final_lr_ratio_exp=self.parameters.final_lr_ratio_exp,
            hidden_size=int(self.parameters.hidden_size),
            init_lr_ratio_exp=self.parameters.init_lr_ratio_exp,
            max_lr_exp=self.parameters.max_lr_exp,
            warmup_epochs_ratio=self.parameters.warmup_epochs_ratio,
            aux_weight_pc=self.parameters.aux_weight_pc,
        )


@dataclass
class ChemPropRegressorPretrained(Algorithm):
    @dataclass
    class ChemPropRegressorPretrainedParameters:
        epochs: int = field(metadata=schema(default=30, min=0, max=400))
        frzn: str
        pretrained_model: str

    name: Literal["ChemPropRegressorPretrained"]
    parameters: ChemPropRegressorPretrainedParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop.ChemPropRegressorPretrained(
            epochs=self.parameters.epochs,
            frzn=self.parameters.frzn,
            pretrained_model=self.parameters.pretrained_model,
        )


@dataclass
class ChemPropHyperoptClassifier(Algorithm):
    @dataclass
    class ChemPropHyperoptClassifierParameters:
        ensemble_size: int = field(metadata=schema(default=1, min=1, max=5))
        epochs: int = field(metadata=schema(default=30, min=4, max=400))
        features_generator: str
        num_iters: int = field(metadata=schema(default=30, min=1, max=50))
        search_parameter_level: str
        aux_weight_pc: int = 100

    name: Literal["ChemPropHyperoptClassifier"]
    parameters: ChemPropHyperoptClassifierParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop_hyperopt.ChemPropHyperoptClassifier(
            ensemble_size=self.parameters.ensemble_size,
            epochs=self.parameters.epochs,
            features_generator=self.parameters.features_generator,
            num_iters=self.parameters.num_iters,
            search_parameter_level=self.parameters.search_parameter_level,
            aux_weight_pc=self.parameters.aux_weight_pc,
        )


@dataclass
class ChemPropHyperoptRegressor(Algorithm):
    @dataclass
    class ChemPropHyperoptRegressorParameters:
        ensemble_size: int = field(metadata=schema(default=1, min=1, max=5))
        epochs: int = field(metadata=schema(default=30, min=4, max=400))
        features_generator: str
        num_iters: int = field(metadata=schema(default=30, min=1, max=50))
        search_parameter_level: str
        aux_weight_pc: int = 100

    name: Literal["ChemPropHyperoptRegressor"]
    parameters: ChemPropHyperoptRegressorParameters

    def estimator(self):
        return optunaz.algorithms.chem_prop_hyperopt.ChemPropHyperoptRegressor(
            ensemble_size=self.parameters.ensemble_size,
            epochs=self.parameters.epochs,
            features_generator=self.parameters.features_generator,
            num_iters=self.parameters.num_iters,
            search_parameter_level=self.parameters.search_parameter_level,
            aux_weight_pc=self.parameters.aux_weight_pc,
        )


@dataclass
class CustomClassificationModel(Algorithm):
    @dataclass
    class CustomClassificationModelParameters:
        preexisting_model: str
        refit_model: int = field(metadata=schema(default=0, min=0, max=1))

    class CustomClassificationEstimator(ClassifierMixin, BaseEstimator):
        def __init__(self, preexisting_model, refit_model):
            self.preexisting_model = preexisting_model
            self.refit_model = refit_model
            # Binary classification labels are assumed.
            self.classes_ = np.unique([0, 1])

        def fit(self, X, y):
            # Only refit the wrapped model when explicitly requested.
            if self.refit_model:
                self.preexisting_model.fit(X, y)
            return self

        def predict(self, X):
            return self.preexisting_model.predict(X)

        def predict_proba(self, X):
            return self.preexisting_model.predict_proba(X)

    name: Literal["CustomClassificationModel"]
    parameters: CustomClassificationModelParameters

    def estimator(self):
        with open(self.parameters.preexisting_model, "rb") as fid:
            preexisting_model = pickle.load(fid)

        from optunaz.model_writer import QSARtunaModel

        # Unwrap QSARtuna model files and previously wrapped custom estimators
        # so that the raw underlying predictor is used.
        if isinstance(preexisting_model, QSARtunaModel):
            preexisting_model = preexisting_model.predictor
        if isinstance(preexisting_model, self.CustomClassificationEstimator):
            preexisting_model = preexisting_model.preexisting_model
        for p in ["predict_proba", "predict"]:
            assert hasattr(
                preexisting_model, p
            ), f"an estimator with '{p}' method must be supplied to CustomClassificationModel"
        return self.CustomClassificationEstimator(
            preexisting_model=preexisting_model,
            refit_model=self.parameters.refit_model,
        )


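# Example (illustrative; the pickle path is hypothetical): reuse a previously
# trained classifier inside QSARtuna without refitting it:
#
#   config = CustomClassificationModel(
#       name="CustomClassificationModel",
#       parameters=CustomClassificationModel.CustomClassificationModelParameters(
#           preexisting_model="path/to/model.pkl", refit_model=0
#       ),
#   )
#   clf = config.estimator()  # unpickles, validates and wraps the stored model

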
@dataclass
class CustomRegressionModel(Algorithm):
    @dataclass
    class CustomRegressionModelParameters:
        preexisting_model: str
        refit_model: int = field(metadata=schema(default=0, min=0, max=1))

    class CustomRegressionEstimator(RegressorMixin, BaseEstimator):
        def __init__(self, preexisting_model, refit_model):
            self.preexisting_model = preexisting_model
            self.refit_model = refit_model

        def fit(self, X, y):
            # Only refit the wrapped model when explicitly requested.
            if self.refit_model:
                self.preexisting_model.fit(X, y)
            return self

        def predict(self, X):
            return self.preexisting_model.predict(X)

    name: Literal["CustomRegressionModel"]
    parameters: CustomRegressionModelParameters

    def estimator(self):
        with open(self.parameters.preexisting_model, "rb") as fid:
            preexisting_model = pickle.load(fid)

        from optunaz.model_writer import QSARtunaModel

        # Unwrap QSARtuna model files and previously wrapped custom estimators
        # so that the raw underlying predictor is used.
        if isinstance(preexisting_model, QSARtunaModel):
            preexisting_model = preexisting_model.predictor
        if isinstance(preexisting_model, self.CustomRegressionEstimator):
            preexisting_model = preexisting_model.preexisting_model
        assert hasattr(
            preexisting_model, "predict"
        ), "an estimator with 'predict' method must be supplied to CustomRegressionModel"
        return self.CustomRegressionEstimator(
            preexisting_model=preexisting_model,
            refit_model=self.parameters.refit_model,
        )


AnyUncalibratedClassifier = Union[
    AdaBoostClassifier,
    KNeighborsClassifier,
    LogisticRegression,
    RandomForestClassifier,
    SVC,
    ChemPropClassifier,
    ChemPropRegressor,
    ChemPropRegressorPretrained,
    ChemPropHyperoptClassifier,
    ChemPropHyperoptRegressor,
    CustomClassificationModel,
]


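# The ``name`` Literal on each class acts as the tag that lets apischema pick
# the right member of these Unions when deserializing JSON. A minimal sketch
# (parameter values are illustrative):
#
#   from apischema import deserialize
#
#   alg = deserialize(
#       AnyUncalibratedClassifier,
#       {
#           "name": "RandomForestClassifier",
#           "parameters": {
#               "max_features": "sqrt",
#               "max_depth": 10,
#               "n_estimators": 100,
#           },
#       },
#   )
#   assert isinstance(alg, RandomForestClassifier)

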
@dataclass
class CalibratedClassifierCVWithVA(Algorithm):
    @dataclass
    class CalibratedClassifierCVParameters:
        n_folds: int = field(metadata=schema(default=2, min=2))
        ensemble: str
        method: str
        estimator: AnyUncalibratedClassifier

    name: Literal["CalibratedClassifierCVWithVA"]
    parameters: CalibratedClassifierCVParameters

    def estimator(self):
        estimator = self.parameters.estimator.estimator()
        # Estimators that manage their own worker pool (those exposing
        # num_workers) must not be parallelised again by the wrapper.
        if hasattr(estimator, "num_workers"):
            n_jobs = 1
        else:
            n_jobs = -1
        return optunaz.algorithms.calibrated_cv.CalibratedClassifierCVWithVA(
            estimator,
            n_folds=self.parameters.n_folds,
            ensemble=self.parameters.ensemble == "True",
            method=self.parameters.method,
            n_jobs=n_jobs,
        )


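# Example (illustrative; the method string is an assumption, check
# optunaz.algorithms.calibrated_cv for the supported values): wrap an
# uncalibrated classifier config, here a hypothetical ``rf_config``, in
# cross-validated calibration. Note that ``ensemble`` is a string that
# estimator() compares against "True":
#
#   calibrated = CalibratedClassifierCVWithVA(
#       name="CalibratedClassifierCVWithVA",
#       parameters=CalibratedClassifierCVWithVA.CalibratedClassifierCVParameters(
#           n_folds=5, ensemble="True", method="sigmoid", estimator=rf_config
#       ),
#   )

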
AnyRegression = Union[
    Lasso,
    PLSRegression,
    RandomForestRegressor,
    Ridge,
    KNeighborsRegressor,
    SVR,
    XGBRegressor,
    PRFClassifier,
    ChemPropRegressor,
    ChemPropHyperoptRegressor,
    ChemPropRegressorPretrained,
    CustomRegressionModel,
]

MapieCompatible = Union[
    Lasso,
    PLSRegression,
    RandomForestRegressor,
    KNeighborsRegressor,
    Ridge,
    SVR,
    XGBRegressor,
    PRFClassifier,
    CustomRegressionModel,
]


@dataclass
class Mapie(Algorithm):
    @dataclass
    class MapieParameters:
        mapie_alpha: float = field(metadata=schema(default=0.05, min=0.01, max=0.99))
        estimator: MapieCompatible

    name: Literal["Mapie"]
    parameters: MapieParameters

    def estimator(self):
        from optunaz.algorithms.mapie_uncertainty import MapieWithUncertainty

        return MapieWithUncertainty(
            mapie_alpha=self.parameters.mapie_alpha,
            estimator=self.parameters.estimator.estimator(),
            n_jobs=-1,
        )


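# Example (illustrative; ``ridge_config`` is a hypothetical MapieCompatible
# config such as a Ridge instance): wrap a regressor to obtain MAPIE
# uncertainty estimates at alpha=0.05:
#
#   mapie = Mapie(
#       name="Mapie",
#       parameters=Mapie.MapieParameters(
#           mapie_alpha=0.05, estimator=ridge_config
#       ),
#   )

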
AnyAlgorithm = Union[
    AnyUncalibratedClassifier,
    AnyRegression,
    CalibratedClassifierCVWithVA,
    Mapie,
]

AnyChemPropAlgorithm = [
    ChemPropClassifier,
    ChemPropRegressor,
    ChemPropHyperoptClassifier,
    ChemPropHyperoptRegressor,
    ChemPropRegressorPretrained,
]


@dataclass
class BuildConfig:
    """Build configuration.

    This is the configuration to train a model,
    i.e. to fit the parameters of a model given fixed hyperparameters.
    It roughly corresponds to an Optuna Trial.
    """
    @dataclass
    class Metadata:
        name: Optional[str] = None
        cross_validation: Optional[int] = field(default=None, metadata=schema(min=1))
        shuffle: Optional[bool] = None
        best_trial: Optional[int] = field(default=None, metadata=schema(min=0))
        best_value: Optional[float] = None
        n_trials: Optional[int] = field(default=None, metadata=schema(min=0))
        visualization: Optional[Visualization] = None

    @dataclass
    class Settings:
        mode: Optional[ModelMode] = None
        scoring: Union[RegressionScore, ClassificationScore, str, None] = None
        direction: Optional[OptimizationDirection] = None
        n_trials: Optional[int] = field(default=None, metadata=schema(min=0))
        tracking_rest_endpoint: Optional[str] = field(
            default=None,
            metadata=schema(title="URL to track build results using internal format"),
        )

    data: Dataset
    metadata: Optional[Metadata]
    descriptor: AnyDescriptor
    settings: Optional[Settings]
    algorithm: AnyAlgorithm
    task: Literal["building"] = "building"
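

# A minimal end-to-end sketch (the file path, column names and descriptor
# fields below are illustrative assumptions and must match the actual Dataset
# and AnyDescriptor schemas in use):
#
#   from apischema import deserialize
#
#   build = deserialize(
#       BuildConfig,
#       {
#           "data": {
#               "training_dataset_file": "train.csv",
#               "input_column": "smiles",
#               "response_column": "activity",
#           },
#           "metadata": None,
#           "descriptor": {"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048}},
#           "settings": {"mode": "regression"},
#           "algorithm": {"name": "Ridge", "parameters": {"alpha": 1.0}},
#       },
#   )
#   model = build.algorithm.estimator()  # unfitted estimator, ready for .fit()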