Source code for bonafide.utils.custom_featurizer_input_validation
"""Type and format validation of the dictionary provided by the user for custom featurizers."""
import logging
from typing import Any, Dict, Tuple
from bonafide.utils.constants import (
DATA_TYPES,
DIMENSIONALITIES,
FEATURE_TYPES,
)
from bonafide.utils.helper_functions import standardize_string
[docs]
def custom_featurizer_data_validator(
custom_metadata: Dict[str, Any],
feature_info: Dict[int, Dict[str, Any]],
feature_config: Dict[str, Any],
namespace: str,
loc: str,
) -> Tuple[str, Dict[str, Any]]:
"""Validate the user input for introducing a custom featurizer to BONAFIDE.
Parameters
----------
custom_metadata : Dict[str, Any]
The dictionary with the required metadata for the custom featurizer.
feature_info : Dict[int, Dict[str, Any]]
The metadata of all implemented atom and bond features, e.g., the name of the feature, its
dimensionality requirements (either 2D or 3D), or the program it is calculated with
(origin).
feature_config : Dict[str, Any]
The configuration settings for the individual programs used for feature calculation.
namespace : str
The namespace for the molecule as defined by the user when reading in the molecule.
loc : str
The location string representing the current class and method for logging purposes.
Returns
-------
Tuple[str, Dict[str, Any]]
A tuple containing the origin string of the custom featurizer and the validated metadata
dictionary.
"""
# Check if required keys were provided
required_keys = list(feature_info[list(feature_info.keys())[0]].keys())
required_keys_ = [x for x in required_keys]
required_keys.sort()
provided_keys = list(custom_metadata.keys())
provided_keys.sort()
if required_keys != provided_keys:
_errmsg = (
f"Invalid input to 'custom_metadata': provided data format is incorrect. "
f"The (only) required keys of the input dictionary are {required_keys_}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise KeyError(f"{loc}(): {_errmsg}")
# Check name
_inpt = type(custom_metadata["name"])
if _inpt != str:
_errmsg = (
"Invalid input to 'name' in 'custom_metadata': must be of type str but obtained "
f"{_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
name_ = standardize_string(inp_data=custom_metadata["name"])
if len(name_) == 0:
_errmsg = "Invalid input to 'name' in 'custom_metadata': must not be an empty string."
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise ValueError(f"{loc}(): {_errmsg}")
custom_metadata["name"] = custom_metadata["name"].strip()
# Check origin
_inpt = type(custom_metadata["origin"])
if _inpt != str:
_errmsg = (
"Invalid input to 'origin' in 'custom_metadata': must be of type str but obtained "
f"{_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
origin_ = standardize_string(inp_data=custom_metadata["origin"])
if origin_ in list(feature_config.keys()):
_errmsg = f"Invalid input to 'origin' in 'custom_metadata': '{origin_}' is already in use."
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise ValueError(f"{loc}(): {_errmsg}")
custom_metadata["origin"] = origin_
# Check feature_type
_inpt = type(custom_metadata["feature_type"])
if _inpt != str:
_errmsg = (
"Invalid input to 'feature_type' in 'custom_metadata': must be of type str but "
f"obtained {_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
feature_type_ = standardize_string(inp_data=custom_metadata["feature_type"])
if feature_type_ not in FEATURE_TYPES:
_errmsg = (
f"Invalid input to 'feature_type' in 'custom_metadata': '{feature_type_}' is "
f"not supported, available: {FEATURE_TYPES}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise ValueError(f"{loc}(): {_errmsg}")
custom_metadata["feature_type"] = feature_type_
# Check dimensionality
_inpt = type(custom_metadata["dimensionality"])
if _inpt != str:
_errmsg = (
"Invalid input to 'dimensionality' in 'custom_metadata': must be of type str but "
f"obtained {_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
dimensionality_ = standardize_string(inp_data=custom_metadata["dimensionality"], case="upper")
if dimensionality_ not in DIMENSIONALITIES:
_errmsg = (
f"Invalid input to 'dimensionality' in 'custom_metadata': '{dimensionality_}' is "
f"not supported, available: {DIMENSIONALITIES}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise ValueError(f"{loc}(): {_errmsg}")
custom_metadata["dimensionality"] = dimensionality_
# Check data_type
_inpt = type(custom_metadata["data_type"])
if _inpt != str:
_errmsg = (
"Invalid input to 'data_type' in 'custom_metadata': must be of type str but obtained "
f"{_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
data_type_ = standardize_string(inp_data=custom_metadata["data_type"])
if data_type_ not in DATA_TYPES:
_errmsg = (
f"Invalid input to 'data_type' in 'custom_metadata': '{data_type_}' is not supported, "
f"available: {DATA_TYPES}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise ValueError(f"{loc}(): {_errmsg}")
custom_metadata["data_type"] = data_type_
# Check requires_electronic_structure_data
_inpt = type(custom_metadata["requires_electronic_structure_data"])
if _inpt != bool:
_errmsg = (
"Invalid input to 'requires_electronic_structure_data' in 'custom_metadata': must be "
f"of type bool but obtained {_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
# Check requires_bond_data
_inpt = type(custom_metadata["requires_bond_data"])
if _inpt != bool:
_errmsg = (
"Invalid input to 'requires_bond_data' in 'custom_metadata': must be "
f"of type bool but obtained {_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
# Check requires_charge
_inpt = type(custom_metadata["requires_charge"])
if _inpt != bool:
_errmsg = (
"Invalid input to 'requires_charge' in 'custom_metadata': must be "
f"of type bool but obtained {_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
# Check requires_multiplicity
_inpt = type(custom_metadata["requires_multiplicity"])
if _inpt != bool:
_errmsg = (
"Invalid input to 'requires_multiplicity' in 'custom_metadata': must be "
f"of type bool but obtained {_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
# Check config_path
_inpt = type(custom_metadata["config_path"])
if _inpt != dict:
_errmsg = (
"Invalid input to 'config_path' in 'custom_metadata': must be of type dict but "
f"obtained {_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
# Check factory
_inpt = type(custom_metadata["factory"])
if _inpt != type:
_errmsg = (
"Invalid input to 'factory' in 'custom_metadata': must be pointing to a class but "
f"obtained {_inpt.__name__}."
)
logging.error(f"'{namespace}' | {loc}()\n{_errmsg}")
raise TypeError(f"{loc}(): {_errmsg}")
return origin_, custom_metadata