import json
from joblib import effective_n_jobs
from typing import Union
from functools import partial
from apischema import deserialize, serialize
from optuna import Study
from optuna.trial import FrozenTrial
from optunaz.config.buildconfig import BuildConfig
from optunaz.config.optconfig import OptimizationConfig
from optunaz.descriptors import MolDescriptor
from joblib import Memory
from optunaz.utils import mkdict
from optunaz.utils.enums import StudyUserAttrs, TrialParams
import optunaz.config.optconfig as opt
import optunaz.config.buildconfig as build
import optunaz.descriptors as descriptors
from optunaz.utils.enums.configuration_enum import ConfigurationEnum
# Shared ConfigurationEnum instance; its attributes supply the parameter and
# key names used by the functions in this module.
_CE = ConfigurationEnum()
def set_build_cache(study: Study, optconfig: OptimizationConfig) -> Memory | None:
    """Return a joblib cache for the build step, or ``None`` when not applicable.

    A ``Memory`` is created from the location stored under the ``"cache"``
    user attribute of ``study`` (i.e. a cache left over from optimisation),
    but only when the effective number of jobs derived from
    ``optconfig.settings.n_jobs`` is greater than one.

    Args:
        study: Study whose user attributes may hold a ``"cache"`` entry.
        optconfig: Optimisation configuration providing ``settings.n_jobs``.

    Returns:
        A ``Memory`` bound to the stored cache location, otherwise ``None``.
    """
    runs_in_parallel = effective_n_jobs(optconfig.settings.n_jobs) > 1
    # The user attributes are only inspected when more than one job is used.
    if not runs_in_parallel or "cache" not in study.user_attrs:
        return None
    return Memory(study.user_attrs["cache"], verbose=0)
def remove_algo_hash(trial: FrozenTrial) -> FrozenTrial:
    """Strip the ``__<hash>`` suffix from every parameter name of ``trial``.

    Each key of ``trial.params`` is cut at its first ``"__"``; keys without
    that separator are kept unchanged. The parameters are replaced on the
    given trial object itself, which is then returned.

    Args:
        trial: Trial whose ``params`` mapping is rewritten in place.

    Returns:
        The same ``trial`` object, now carrying the un-hashed parameter names.
    """
    unhashed = {}
    for hashed_name, value in trial.params.items():
        # Everything before the first "__" is the plain parameter name.
        plain_name, _, _ = hashed_name.partition("__")
        unhashed[plain_name] = value
    trial.params = unhashed
    return trial
def buildconfig_from_trial(study: Study, trial: FrozenTrial) -> BuildConfig:
    """Assemble a ``BuildConfig`` from an optimisation study and one of its trials.

    The optimisation configuration is read from the study's user attributes,
    the descriptor and algorithm are rebuilt from the trial's parameters and
    user attributes, and the best trial number/value are taken from the study.

    Note that ``trial`` is passed through ``remove_algo_hash``, which rewrites
    its parameter names in place.

    Args:
        study: Study holding the serialized optimisation configuration under
            ``StudyUserAttrs.OPTCONFIG``.
        trial: Trial whose parameters describe the descriptor and algorithm.

    Returns:
        The build configuration for the given trial.

    Raises:
        ValueError: If the study has no optimisation configuration attribute.
    """
    serialized_optconfig = study.user_attrs.get(StudyUserAttrs.OPTCONFIG)
    if serialized_optconfig is None:
        raise ValueError(
            "Study does not have a user attribute with Optimization Configuration."
        )
    optconfig = deserialize(OptimizationConfig, serialized_optconfig)

    # Parameter names carry an algorithm hash suffix; drop it before lookups.
    trial = remove_algo_hash(trial)
    params = trial.params
    attrs = trial.user_attrs

    # The descriptor is stored as a JSON string among the trial parameters.
    descriptor = deserialize(
        MolDescriptor, json.loads(params[TrialParams.DESCRIPTOR])
    )

    # Aux weight for side information (defaults to 100 when not suggested).
    aux_weight_pc = params.get(_CE.DESCRIPTORS_SMILES_AND_SI_AUX_WEIGHT_PC, 100)
    # Base estimator for calibrated methods.
    base_estimator = attrs.get(_CE.ALGORITHMS_CALIBRATEDCLASSIFIERCV_ESTIMATOR)
    # Pretrained model for pretrained ChemProp methods.
    pretrained_model = attrs.get(_CE.ALGORITHMS_CHEMPROP_PRETRAINED_MODEL, {})
    # Parameter dictionary for calibrated CV methods.
    calibrated_params = attrs.get(_CE.ALGORITHMS_CALIBRATEDCLASSIFIERCV_PARAMS, {})

    if base_estimator:
        # Propagate the aux weight into the wrapped estimator's parameters.
        estimator_params = base_estimator[_CE.GENERAL_PARAMETERS]
        estimator_params[_CE.DESCRIPTORS_SMILES_AND_SI_AUX_WEIGHT_PC] = aux_weight_pc

    # Later entries override earlier ones: trial params, then calibrated
    # params, then the explicitly prepared values.
    merged_params = dict(params)
    merged_params.update(calibrated_params)
    merged_params[_CE.ALGORITHMS_ESTIMATOR] = base_estimator
    merged_params[_CE.DESCRIPTORS_SMILES_AND_SI_AUX_WEIGHT_PC] = aux_weight_pc
    merged_params[_CE.ALGORITHMS_CHEMPROP_PRETRAINED_MODEL] = pretrained_model

    algorithm = deserialize(
        build.AnyAlgorithm,
        {
            _CE.GENERAL_NAME: params.get(_CE.GENERAL_ALGORITHM_NAME),
            _CE.GENERAL_PARAMETERS: mkdict(merged_params),
        },
        additional_properties=True,
    )

    settings = optconfig.settings
    if settings.minimise_std_dev:
        # NOTE(review): presumably a multi-objective study here, hence the
        # first entry of best_trials is used rather than best_trial - confirm.
        best_trial = study.best_trials[0].number
        best_value = study.best_trials[0].values[0]
    else:
        best_trial = study.best_trial.number
        best_value = study.best_value

    metadata = BuildConfig.Metadata(
        name=optconfig.name,
        cross_validation=settings.cross_validation,
        shuffle=settings.shuffle,
        best_trial=best_trial,
        best_value=best_value,
        n_trials=settings.n_trials,
    )
    build_settings = BuildConfig.Settings(
        mode=settings.mode,
        scoring=settings.scoring,
        direction=settings.direction,
        n_trials=settings.n_trials,
    )
    return BuildConfig(
        data=optconfig.data,
        descriptor=descriptor,
        algorithm=algorithm,
        metadata=metadata,
        settings=build_settings,
    )
def encode_name(CEname, hash=hash):
    """Append ``hash`` to a parameter name, separated by a double underscore.

    Encoding parameter names with a hash enables multi-parameter optimisation.

    Args:
        CEname: Parameter name to encode.
        hash: Value appended after ``"__"``. When omitted, the default is the
            builtin ``hash`` function object itself, so callers are expected
            to pass a value explicitly.

    Returns:
        The string ``"<CEname>__<hash>"``.
    """
    return "{}__{}".format(CEname, hash)
def suggest_alg_params(trial: FrozenTrial, alg: opt.AnyAlgorithm) -> build.AnyAlgorithm:
    """Suggest hyper-parameters for ``alg`` on ``trial`` and build the algorithm.

    Each parameter is suggested under a name produced by ``encode_name`` with
    ``alg.hash``, using the search space found in ``alg.parameters``. The
    suggested values are passed to the ``new`` constructor of the ``build``
    class that corresponds to the optimisation class of ``alg``.

    Algorithm families with identical search spaces (KNeighbors, RandomForest,
    ChemProp, ChemPropHyperopt and Custom models) share private helpers; the
    order of the ``isinstance`` checks and of the ``suggest_*`` calls is kept
    as in the per-class branches they replace.

    Wrapping algorithms (``CalibratedClassifierCVWithVA`` and ``Mapie``)
    recurse into their inner estimator and store the serialized estimator in
    the trial's user attributes; ``ChemPropRegressorPretrained`` stores its
    pretrained model there as well.

    Args:
        trial: Trial on which the parameters are suggested.
        alg: Optimisation-time algorithm description (search space and hash).

    Returns:
        The build-time algorithm configured with the suggested values.

    Raises:
        ValueError: If ``alg`` is not one of the recognised algorithm classes.
    """
    para = alg.parameters
    _encode_name = partial(encode_name, hash=alg.hash)

    if isinstance(alg, opt.AdaBoostClassifier):
        n_estimators = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_ADABOOSTCLASSIFIER_N_ESTIMATORS),
            low=para.n_estimators.low,
            high=para.n_estimators.high,
        )
        learning_rate = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_ADABOOSTCLASSIFIER_LEARNING_RATE),
            low=para.learning_rate.low,
            high=para.learning_rate.high,
        )
        return build.AdaBoostClassifier.new(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
        )
    elif isinstance(alg, opt.Lasso):
        alpha = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_LASSO_ALPHA),
            low=para.alpha.low,
            high=para.alpha.high,
        )
        return build.Lasso.new(alpha=alpha)
    elif isinstance(alg, opt.KNeighborsClassifier):
        return build.KNeighborsClassifier.new(
            **_suggest_kneighbors_params(trial, para, _encode_name)
        )
    elif isinstance(alg, opt.KNeighborsRegressor):
        return build.KNeighborsRegressor.new(
            **_suggest_kneighbors_params(trial, para, _encode_name)
        )
    elif isinstance(alg, opt.LogisticRegression):
        solver = trial.suggest_categorical(
            name=_encode_name(_CE.ALGORITHMS_LOGISTICREGRESSION_SOLVER),
            choices=para.solver,
        )
        lg_c = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_LOGISTICREGRESSION_C),
            low=para.C.low,
            high=para.C.high,
            log=True,
        )
        return build.LogisticRegression.new(solver=solver, C=lg_c)
    elif isinstance(alg, opt.PLSRegression):
        n_components = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_PLSREGRESSION_N_COMPONENTS),
            low=para.n_components.low,
            high=para.n_components.high,
        )
        return build.PLSRegression.new(n_components=n_components)
    elif isinstance(alg, opt.RandomForestClassifier):
        return build.RandomForestClassifier.new(
            **_suggest_random_forest_params(trial, para, _encode_name)
        )
    elif isinstance(alg, opt.RandomForestRegressor):
        return build.RandomForestRegressor.new(
            **_suggest_random_forest_params(trial, para, _encode_name)
        )
    elif isinstance(alg, opt.Ridge):
        alpha = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_RIDGE_ALPHA),
            low=para.alpha.low,
            high=para.alpha.high,
        )
        return build.Ridge.new(alpha=alpha)
    elif isinstance(alg, opt.SVC):
        gamma = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_SVC_GAMMA),
            low=para.gamma.low,
            high=para.gamma.high,
            log=True,
        )
        svc_c = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_SVC_C),
            low=para.C.low,
            high=para.C.high,
            log=True,
        )
        return build.SVC.new(gamma=gamma, C=svc_c)
    elif isinstance(alg, opt.SVR):
        gamma = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_SVR_GAMMA),
            low=para.gamma.low,
            high=para.gamma.high,
            log=True,
        )
        svr_c = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_SVR_C),
            low=para.C.low,
            high=para.C.high,
            log=True,
        )
        return build.SVR.new(C=svr_c, gamma=gamma)
    elif isinstance(alg, opt.XGBRegressor):
        max_depth = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_XGBREGRESSOR_MAX_DEPTH),
            low=para.max_depth.low,
            high=para.max_depth.high,
        )
        n_estimators = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_XGBREGRESSOR_N_ESTIMATORS),
            low=para.n_estimators.low,
            high=para.n_estimators.high,
        )
        learning_rate = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_XGBREGRESSOR_LEARNING_RATE),
            low=para.learning_rate.low,
            high=para.learning_rate.high,
        )
        return build.XGBRegressor.new(
            max_depth=max_depth,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
        )
    elif isinstance(alg, opt.PRFClassifier):
        max_depth = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_PRF_MAX_DEPTH),
            low=para.max_depth.low,
            high=para.max_depth.high,
        )
        n_estimators = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_PRF_N_ESTIMATORS),
            low=para.n_estimators.low,
            high=para.n_estimators.high,
        )
        max_features = trial.suggest_categorical(
            name=_encode_name(_CE.ALGORITHMS_PRF_MAX_FEATURES),
            choices=para.max_features,
        )
        min_py_sum_leaf = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_PRF_MINPYSUMLEAF),
            low=para.min_py_sum_leaf.low,
            high=para.min_py_sum_leaf.high,
        )
        # Fixed values are recorded as degenerate ranges (low == high).
        use_py_gini = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_PRF_USE_PY_GINI),
            low=para.use_py_gini,
            high=para.use_py_gini,
        )
        use_py_leafs = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_PRF_USE_PY_LEAFS),
            low=para.use_py_leafs,
            high=para.use_py_leafs,
        )
        return build.PRFClassifier.new(
            max_depth=max_depth,
            n_estimators=n_estimators,
            max_features=max_features,
            min_py_sum_leaf=min_py_sum_leaf,
            use_py_gini=use_py_gini,
            use_py_leafs=use_py_leafs,
        )
    elif isinstance(alg, opt.ChemPropRegressor):
        return build.ChemPropRegressor.new(
            **_suggest_chemprop_params(trial, para, _encode_name)
        )
    elif isinstance(alg, opt.ChemPropClassifier):
        return build.ChemPropClassifier.new(
            **_suggest_chemprop_params(trial, para, _encode_name)
        )
    elif isinstance(alg, opt.ChemPropHyperoptRegressor):
        return build.ChemPropHyperoptRegressor.new(
            **_suggest_chemprop_hyperopt_params(trial, para, _encode_name)
        )
    elif isinstance(alg, opt.ChemPropHyperoptClassifier):
        return build.ChemPropHyperoptClassifier.new(
            **_suggest_chemprop_hyperopt_params(trial, para, _encode_name)
        )
    elif isinstance(alg, opt.ChemPropRegressorPretrained):
        frzn = trial.suggest_categorical(
            name=_encode_name(_CE.ALGORITHMS_CHEMPROP_FRZN), choices=para.frzn
        )
        epochs = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_CHEMPROP_EPOCHS),
            low=para.epochs.low,
            high=para.epochs.high,
        )
        # The pretrained model is not a suggested parameter; keep it on the
        # trial as a user attribute instead.
        trial.set_user_attr(
            key=_CE.ALGORITHMS_CHEMPROP_PRETRAINED_MODEL, value=para.pretrained_model
        )
        return build.ChemPropRegressorPretrained.new(
            epochs=epochs,
            frzn=frzn,
            pretrained_model=para.pretrained_model,
        )
    elif isinstance(alg, opt.CalibratedClassifierCVWithVA):
        n_folds = trial.suggest_int(
            name=_encode_name(_CE.ALGORITHMS_CALIBRATEDCLASSIFIERCV_N_FOLDS),
            low=para.n_folds,
            high=para.n_folds,
        )
        # Suggest the wrapped estimator's parameters on the same trial.
        estimator = suggest_alg_params(trial, para.estimator)
        trial.set_user_attr(
            key=_CE.ALGORITHMS_CALIBRATEDCLASSIFIERCV_ESTIMATOR,
            value=serialize(estimator),
        )
        calibrated_params = {
            _CE.ALGORITHMS_CALIBRATEDCLASSIFIERCV_ENSEMBLE: para.ensemble,
            _CE.ALGORITHMS_CALIBRATEDCLASSIFIERCV_METHOD: para.method,
        }
        trial.set_user_attr(
            key=_CE.ALGORITHMS_CALIBRATEDCLASSIFIERCV_PARAMS, value=calibrated_params
        )
        return build.CalibratedClassifierCVWithVA.new(
            ensemble=para.ensemble,
            estimator=estimator,
            method=para.method,
            n_folds=n_folds,
        )
    elif isinstance(alg, opt.Mapie):
        mapie_alpha = trial.suggest_float(
            name=_encode_name(_CE.ALGORITHMS_MAPIE_ALPHA),
            low=para.mapie_alpha,
            high=para.mapie_alpha,
        )
        # Suggest the wrapped estimator's parameters on the same trial.
        estimator = suggest_alg_params(trial, para.estimator)
        trial.set_user_attr(
            key=_CE.ALGORITHMS_MAPIE_ESTIMATOR, value=serialize(estimator)
        )
        return build.Mapie.new(
            estimator=estimator,
            mapie_alpha=mapie_alpha,
        )
    elif isinstance(alg, opt.CustomRegressionModel):
        return build.CustomRegressionModel.new(
            **_suggest_custom_model_params(trial, para, _encode_name)
        )
    elif isinstance(alg, opt.CustomClassificationModel):
        return build.CustomClassificationModel.new(
            **_suggest_custom_model_params(trial, para, _encode_name)
        )
    else:
        raise ValueError(f"Unrecognized algorithm: {alg.__class__}")


def _suggest_kneighbors_params(trial, para, encode) -> dict:
    """Suggest the parameters shared by the KNeighbors classifier and regressor."""
    return {
        "metric": trial.suggest_categorical(
            name=encode(_CE.ALGORITHMS_KNEIGHBORS_METRIC),
            choices=para.metric,
        ),
        "n_neighbors": trial.suggest_int(
            name=encode(_CE.ALGORITHMS_KNEIGHBORS_N_NEIGHBORS),
            low=para.n_neighbors.low,
            high=para.n_neighbors.high,
        ),
        "weights": trial.suggest_categorical(
            name=encode(_CE.ALGORITHMS_KNEIGHBORS_WEIGHTS),
            choices=para.weights,
        ),
    }


def _suggest_random_forest_params(trial, para, encode) -> dict:
    """Suggest the parameters shared by the RandomForest classifier and regressor."""
    return {
        "max_depth": trial.suggest_int(
            name=encode(_CE.ALGORITHMS_RF_MAX_DEPTH),
            low=para.max_depth.low,
            high=para.max_depth.high,
        ),
        "n_estimators": trial.suggest_int(
            name=encode(_CE.ALGORITHMS_RF_N_ESTIMATORS),
            low=para.n_estimators.low,
            high=para.n_estimators.high,
        ),
        "max_features": trial.suggest_categorical(
            name=encode(_CE.ALGORITHMS_RF_MAX_FEATURES),
            choices=para.max_features,
        ),
    }


def _suggest_chemprop_params(trial, para, encode) -> dict:
    """Suggest the parameters shared by the ChemProp regressor and classifier."""

    def categorical(key, choices):
        return trial.suggest_categorical(name=encode(key), choices=choices)

    def stepped_float(key, bounds):
        return trial.suggest_float(
            name=encode(key), low=bounds.low, high=bounds.high, step=bounds.q
        )

    def ranged_int(key, bounds):
        return trial.suggest_int(name=encode(key), low=bounds.low, high=bounds.high)

    def fixed_int(key, value):
        # A fixed value is recorded as a degenerate range (low == high).
        return trial.suggest_int(name=encode(key), low=value, high=value)

    # Dict values are evaluated top to bottom, which fixes the suggest order.
    return {
        "activation": categorical(
            _CE.ALGORITHMS_CHEMPROP_ACTIVATION, para.activation
        ),
        "aggregation": categorical(
            _CE.ALGORITHMS_CHEMPROP_AGGREGATION, para.aggregation
        ),
        "aggregation_norm": stepped_float(
            _CE.ALGORITHMS_CHEMPROP_AGGREGATION_NORM, para.aggregation_norm
        ),
        "batch_size": stepped_float(
            _CE.ALGORITHMS_CHEMPROP_BATCH_SIZE, para.batch_size
        ),
        "depth": stepped_float(_CE.ALGORITHMS_CHEMPROP_DEPTH, para.depth),
        "dropout": stepped_float(_CE.ALGORITHMS_CHEMPROP_DROPOUT, para.dropout),
        "ensemble_size": fixed_int(
            _CE.ALGORITHMS_CHEMPROP_ENSEMBLE_SIZE, para.ensemble_size
        ),
        "epochs": fixed_int(_CE.ALGORITHMS_CHEMPROP_EPOCHS, para.epochs),
        "features_generator": categorical(
            _CE.ALGORITHMS_CHEMPROP_FEATURES_GENERATOR, para.features_generator
        ),
        "ffn_hidden_size": stepped_float(
            _CE.ALGORITHMS_CHEMPROP_FFN_HIDDEN_SIZE, para.ffn_hidden_size
        ),
        "ffn_num_layers": stepped_float(
            _CE.ALGORITHMS_CHEMPROP_FFN_NUM_LAYERS, para.ffn_num_layers
        ),
        "final_lr_ratio_exp": ranged_int(
            _CE.ALGORITHMS_CHEMPROP_FINAL_LR_RATIO_EXP, para.final_lr_ratio_exp
        ),
        "hidden_size": stepped_float(
            _CE.ALGORITHMS_CHEMPROP_HIDDEN_SIZE, para.hidden_size
        ),
        "init_lr_ratio_exp": ranged_int(
            _CE.ALGORITHMS_CHEMPROP_INIT_LR_RATIO_EXP, para.init_lr_ratio_exp
        ),
        "max_lr_exp": ranged_int(
            _CE.ALGORITHMS_CHEMPROP_MAX_LR_EXP, para.max_lr_exp
        ),
        "warmup_epochs_ratio": stepped_float(
            _CE.ALGORITHMS_CHEMPROP_WARMUP_EPOCHS_RATIO, para.warmup_epochs_ratio
        ),
    }


def _suggest_chemprop_hyperopt_params(trial, para, encode) -> dict:
    """Suggest the parameters shared by the ChemPropHyperopt regressor and classifier."""
    return {
        "ensemble_size": trial.suggest_int(
            name=encode(_CE.ALGORITHMS_CHEMPROP_ENSEMBLE_SIZE),
            low=para.ensemble_size,
            high=para.ensemble_size,
        ),
        "epochs": trial.suggest_int(
            name=encode(_CE.ALGORITHMS_CHEMPROP_EPOCHS),
            low=para.epochs,
            high=para.epochs,
        ),
        "features_generator": trial.suggest_categorical(
            name=encode(_CE.ALGORITHMS_CHEMPROP_FEATURES_GENERATOR),
            choices=para.features_generator,
        ),
        "num_iters": trial.suggest_int(
            name=encode(_CE.ALGORITHMS_CHEMPROP_NUM_ITERS),
            low=para.num_iters,
            high=para.num_iters,
        ),
        "search_parameter_level": trial.suggest_categorical(
            name=encode(_CE.ALGORITHMS_CHEMPROP_SEARCH_PARAMETER_LEVEL),
            choices=para.search_parameter_level,
        ),
    }


def _suggest_custom_model_params(trial, para, encode) -> dict:
    """Record the custom-model settings on the trial and return them as kwargs."""
    # The suggestions only register the fixed settings on the trial; the
    # values handed to the build class come straight from ``para``.
    trial.suggest_categorical(
        name=encode(_CE.ALGORITHMS_CUSTOM_FILE),
        choices=[para.preexisting_model],
    )
    trial.suggest_int(
        name=encode(_CE.ALGORITHMS_CUSTOM_REFIT_MODEL),
        low=para.refit_model,
        high=para.refit_model,
    )
    return {
        "preexisting_model": para.preexisting_model,
        "refit_model": para.refit_model,
    }
def suggest_aux_params(trial: FrozenTrial, desc: descriptors.AnyDescriptor):
    """Suggest the auxiliary weight for descriptors that support it.

    The parameter name is encoded with the ``"alg_hash"`` user attribute of
    ``trial``, which therefore must be set before this function is called.

    Args:
        trial: Trial on which the auxiliary parameter is suggested.
        desc: Descriptor whose ``parameters`` may define ``aux_weight_pc``.

    Returns:
        The suggested auxiliary weight for ``SmilesAndSideInfoFromFile``
        descriptors, and ``None`` for every other descriptor.
    """
    para = desc.parameters
    alg_hash = trial.user_attrs["alg_hash"]

    # SmilesAndSideInfoFromFile is the only descriptor currently supporting
    # aux params; all other descriptors pass through.
    if not isinstance(desc, descriptors.SmilesAndSideInfoFromFile):
        return None

    weight_range = para.aux_weight_pc
    return trial.suggest_int(
        name=encode_name(_CE.DESCRIPTORS_SMILES_AND_SI_AUX_WEIGHT_PC, hash=alg_hash),
        low=weight_range.low,
        high=weight_range.high,
        step=weight_range.q,
    )
def check_invalid_descriptor_param(alg: build.AnyAlgorithm) -> list:
    """Return the descriptor types that go with the given algorithm.

    For wrapping algorithms (``Mapie`` and ``CalibratedClassifierCVWithVA``)
    the decision is made on the wrapped estimator, since the base estimator
    is the one that has to be compatible with the descriptor.

    Args:
        alg: Build-time algorithm to inspect.

    Returns:
        ``descriptors.SmilesBasedDescriptor.__args__`` when the (inner)
        algorithm is a ChemProp algorithm, otherwise
        ``descriptors.AnyChemPropIncompatible.__args__``.
    """
    wrapper_types = (build.Mapie, build.CalibratedClassifierCVWithVA)
    # If calibration is performed, the base estimator is what gets checked.
    inner = alg.parameters.estimator if isinstance(alg, wrapper_types) else alg

    # ChemProp should have only ChemProp descriptors.
    if opt.isanyof(inner, build.AnyChemPropAlgorithm):
        return descriptors.SmilesBasedDescriptor.__args__
    # All others should have non-ChemProp descriptors.
    return descriptors.AnyChemPropIncompatible.__args__