Source code for optunaz.utils.preprocessing.transform
import abc
import inspect
import logging
from dataclasses import field, dataclass
import apischema
from scipy.stats import norm
from apischema import schema, deserializer, serializer, identity
from apischema.conversions import Conversion
from typing import Union, Any, Literal, Annotated, Optional
from enum import Enum
import pandas as pd
import numpy as np
from optunaz.config import NameParameterDataclass
class DataTransformError(Exception):
    """Raised when there are insufficient molecules for an UnfittedSklearnScaler to fit."""

    pass
class DataTransform(NameParameterDataclass, abc.ABC):
    """Base class for data transformers.

    Each data transformer should provide a method `transform`,
    which takes raw input data and returns numpy arrays with the
    transformed output data.
    """
    _union: Any = None

    # Use __init_subclass__ to register new subclasses automatically.
    def __init_subclass__(cls, **kwargs):
        if inspect.isabstract(cls):
            return  # Do not register abstract classes, such as intermediate base classes.
        # Deserializers stack directly as a Union.
        deserializer(Conversion(identity, source=cls, target=DataTransform))
        # Only the base serializer must be registered (and updated for each
        # subclass) as a Union, and it must not be inherited.
        DataTransform._union = (
            cls if DataTransform._union is None else Union[DataTransform._union, cls]
        )
        serializer(
            Conversion(
                identity,
                source=DataTransform,
                target=DataTransform._union,
                inherited=False,
            )
        )
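
# Illustrative sketch (not part of the module): with the registrations above,
# apischema can round-trip any registered transform through the DataTransform
# base type. The parameter values below are hypothetical.
#
#     import apischema
#     t = apischema.deserialize(
#         DataTransform,
#         {"name": "VectorFromColumn", "parameters": {"delimiter": ";"}},
#     )
#     assert isinstance(t, VectorFromColumn)
#     assert apischema.serialize(DataTransform, t)["name"] == "VectorFromColumn"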
@dataclass
class PTRTransform(DataTransform):
    """Transform model input/output with PTR (probabilistic threshold representation)."""
    @apischema.type_name("PTRTransformParams")
    @dataclass
    class Parameters:
        threshold: Annotated[
            Optional[float],
            schema(
                title="PTR threshold",
                description="The decision boundary for discretising active or inactive classes used by PTR.",
            ),
        ] = field(
            default=None,
        )
        std: Annotated[
            Optional[float],
            schema(
                title="PTR standard deviation",
                description="The standard deviation used by PTR, e.g. experimental reproducibility/uncertainty.",
            ),
        ] = field(
            default=None,
        )
name: Literal["PTRTransform"] = "PTRTransform"
parameters: Parameters = Parameters()
    def transform(self, y_) -> np.ndarray:
        assert self.parameters.threshold is not None, "Must define a PTR threshold"
        assert self.parameters.std is not None, "Must define a PTR Std. Dev."
        y_ = np.array(y_)
        mask = np.isfinite(y_)
        ret = np.zeros(y_.shape)
        # Clip to a tiny positive floor, then map values to cumulative
        # probabilities around the threshold.
        ret[mask] = norm.cdf(
            y_[mask].clip(1e-16),
            self.parameters.threshold,
            self.parameters.std,
        )
        ret[~mask] = np.nan
        return ret
    def reverse_transform(self, y_) -> np.ndarray:
        assert self.parameters.threshold is not None, "Must define a PTR threshold"
        assert not np.isnan(
            float(self.parameters.threshold)
        ), "PTR threshold must not be NaN"
        assert self.parameters.std is not None, "Must define a PTR Std. Dev."
        y_ = np.array(y_)
        mask = np.isfinite(y_)
        ret = np.zeros(y_.shape)
        # Clip probabilities into the open interval (0, 1), then invert the CDF.
        ret[mask] = norm.ppf(
            y_[mask].clip(1e-16, 1 - 1e-16),
            self.parameters.threshold,
            self.parameters.std,
        )
        ret[~mask] = np.nan
        return ret
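
# Usage sketch (illustrative; the threshold and std values below are hypothetical):
#
#     ptr = PTRTransform(parameters=PTRTransform.Parameters(threshold=6.5, std=0.3))
#     probs = ptr.transform([5.9, 7.1, np.nan])    # probabilities in (0, 1), NaN kept
#     back = ptr.reverse_transform(probs)          # approximately [5.9, 7.1, nan]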
class LogBase(str, Enum):
    """Base for the numpy log transform in ModelDataTransform."""

    LOG2 = "log2"
    LOG10 = "log10"
    LOG = "log"


class LogNegative(str, Enum):
    """Negation flag for the log transform in ModelDataTransform."""

    # Members inferred from their usage below; the string values are assumed.
    TRUE = "true"
    FALSE = "false"
@dataclass
class ModelDataTransform(DataTransform):
    """Data transformer that applies and reverses logarithmic functions to user data."""

    @apischema.type_name("ModelDataTransformParams")
    @dataclass
    class Parameters:
        base: Annotated[
            Optional[LogBase],
            schema(
                title="Base",
                description="The log, log2 or log10 base to use in the log transformation.",
            ),
        ] = field(
            default=None,
        )
        negation: Annotated[
            Optional[LogNegative],
            schema(
                title="Negation",
                description="Whether to negate (-) the result of the log transformation.",
            ),
        ] = field(
            default=None,
        )
        conversion: Annotated[
            Optional[int],
            schema(
                title="Conversion power",
                description="The power of 10 by which the data are divided before the log transformation.",
            ),
        ] = field(
            default=None,
        )
name: Literal["ModelDataTransform"] = "ModelDataTransform"
parameters: Parameters = Parameters()
base_dict = {
LogBase.LOG2: np.log2,
LogBase.LOG10: np.log10,
LogBase.LOG: np.log,
}
base_negation = {
LogNegative.TRUE: True,
LogNegative.FALSE: False,
}
reverse_dict = {
LogBase.LOG2: lambda x: 2**x,
LogBase.LOG10: lambda x: 10**x,
LogBase.LOG: np.exp,
}
    def transform_df(self, df: pd.Series) -> pd.Series:
        return self.base_dict[self.parameters.base](df)

    def transform_one(self, value: float) -> np.float64:
        return self.base_dict[self.parameters.base](value)

    def reverse_transform_df(self, df: pd.Series) -> pd.Series:
        return self.reverse_dict[self.parameters.base](df)

    def reverse_transform_one(self, value: float) -> np.float64:
        return self.reverse_dict[self.parameters.base](value)
    def transform(self, y_):
        # Optionally rescale by a power of 10, apply the chosen log base,
        # mask non-finite results, and negate if configured.
if self.parameters.conversion is not None:
y_ = y_ / np.power(10, self.parameters.conversion)
if isinstance(y_, pd.Series):
transformed = self.transform_df(y_)
else:
transformed = self.transform_one(y_)
if len(transformed.shape) >= 1:
transformed[~np.isfinite(transformed)] = float("nan")
if self.base_negation[self.parameters.negation]:
return -transformed
else:
return transformed
    def reverse_transform(self, y_):
if self.base_negation[self.parameters.negation]:
y_ = -y_.astype(float)
if isinstance(y_, pd.Series):
transformed = self.reverse_transform_df(y_)
else:
transformed = self.reverse_transform_one(y_)
if self.parameters.conversion is not None:
transformed = transformed * np.power(10, self.parameters.conversion)
return transformed
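
# Usage sketch (illustrative): a negated log10 transform of micromolar data,
# e.g. converting values to a pXC50-like scale. Parameter values are hypothetical.
#
#     tr = ModelDataTransform(
#         parameters=ModelDataTransform.Parameters(
#             base=LogBase.LOG10,
#             negation=LogNegative.TRUE,
#             conversion=6,
#         )
#     )
#     y = pd.Series([1000.0, 10.0])
#     tr.transform(y)                         # -log10(y / 10**6) -> [3.0, 5.0]
#     tr.reverse_transform(tr.transform(y))   # back to [1000.0, 10.0]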
class AuxTransformer(DataTransform):
    """Base class for auxiliary transformation classes.

    Each auxiliary data transformation provides the method `transform`,
    which takes raw auxiliary data and returns numpy arrays with the
    transformed auxiliary data."""
@dataclass
class VectorFromColumn(AuxTransformer):
    """Vector from column.

    Splits delimited values in the inputs into usable vectors."""
    @apischema.type_name("VectorFromColumnParams")
    @dataclass
    class Parameters:
delimiter: Annotated[
str,
schema(
title="Delimiter",
description="String used to split the auxiliary column into a vector",
),
] = field(
default=",",
)
name: Literal["VectorFromColumn"] = "VectorFromColumn"
parameters: Parameters = Parameters()
    def transform(self, auxiliary_data: np.ndarray) -> np.ndarray:
        # Parse each delimited string into a numeric vector.
        return np.array(
            [
                np.fromstring(val, sep=self.parameters.delimiter)
                for val in auxiliary_data
            ]
        )
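
# Usage sketch (illustrative):
#
#     vfc = VectorFromColumn()  # default delimiter is ","
#     vfc.transform(np.array(["1,2,3", "4,5,6"]))
#     # -> array([[1., 2., 3.], [4., 5., 6.]])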
@dataclass
class ZScales(AuxTransformer):
    """Z-scales from column.

    Calculates Z-scales for sequences or a predefined list of peptide/protein targets."""

    @apischema.type_name("ZScalesParams")
    @dataclass
    class Parameters:
        # This transform takes no parameters.
        pass

    name: Literal["ZScales"] = "ZScales"
    parameters: Parameters = Parameters()
    def transform(self, auxiliary_data: np.ndarray) -> np.ndarray:
        try:
            from peptides import Peptide
        except ImportError:
            logging.critical(
                "The peptides package must be installed to use the Z-scales transform"
            )
            raise
        return np.array([list(Peptide(val).z_scales()) for val in auxiliary_data])
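
# Usage sketch (illustrative; requires the optional `peptides` package):
#
#     zs = ZScales()
#     zs.transform(np.array(["ACDE", "KLMN"]))  # one Z-scale vector per sequence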
@dataclass
class AmorProt(AuxTransformer):
    """AmorProt from column.

    Calculates AmorProt fingerprints for sequences or a predefined list of peptide/protein targets."""

    @apischema.type_name("AmorProtParams")
    @dataclass
    class Parameters:
        # This transform takes no parameters.
        pass

    name: Literal["AmorProt"] = "AmorProt"
    parameters: Parameters = Parameters()

    def __post_init__(self):
        try:
            from amorprot import AmorProt
        except ImportError:
            logging.critical("amorprot must be installed to use the AmorProt transform")
            raise
        self.ap = AmorProt()
    def transform(self, auxiliary_data: np.ndarray) -> np.ndarray:
aux_array = []
for val_idx, val in enumerate(auxiliary_data):
try:
aux_array.append(self.ap.fingerprint(val))
except KeyError:
raise DataTransformError(
f"AmorProt transform failed on line {val_idx}, for seq: {val}"
)
return np.array(aux_array)
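
# Usage sketch (illustrative; requires the optional `amorprot` package):
#
#     amor = AmorProt()
#     fps = amor.transform(np.array(["ACDE", "KLMN"]))  # one fingerprint per sequence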
AnyAuxTransformer = Union[VectorFromColumn, ZScales, AmorProt]