# Source code for bonafide.log_file_analysis
"""Utility methods for analyzing log files from BONAFIDE after or during feature generation."""
from datetime import datetime
from typing import List, Optional
import pandas as pd
# [docs]
class LogFileAnalyzer:
    """Analyze a log file from the Bond and Atom Featurizer and Descriptor Extractor (BONAFIDE).

    The parsing methods of this class rely on the following layout of the log file:

    * the text before the first ``|`` of a log line is a time stamp in the format
      ``YYYY-MM-DD HH:MM:SS``,
    * the logging level appears as ``| <LEVEL> |`` within a log line,
    * lines starting with a space are treated as continuation lines of the preceding
      log message.

    Parameters
    ----------
    log_file_path : str
        The path to the log file to analyze.

    Attributes
    ----------
    log_file_path : str
        The path to the log file that was read.
    log_file_lines : List[str]
        A list of the lines of the log file (trailing blank lines removed).
    """

    # Substrings identifying log lines written by the atom/bond featurization routines.
    _FEATURIZER_MARKERS = (
        "AtomBondFeaturizer.featurize_atoms()",
        "AtomBondFeaturizer.featurize_bonds()",
    )

    # Column names of the data frame returned by ``get_time_for_individual_features()``.
    _TIME_COLUMNS = ("elapsed_time [s]", "start_time", "end_time", "feature_type")

    def __init__(self, log_file_path: str) -> None:
        self.log_file_path: str = log_file_path
        self.log_file_lines: List[str] = []
        self._read_file()

    def _read_file(self) -> None:
        """Read the log file and drop trailing blank lines.

        Returns
        -------
        None

        Raises
        ------
        IOError
            If the file cannot be opened or read.
        """
        try:
            with open(self.log_file_path, "r") as f:
                self.log_file_lines = f.readlines()
        except Exception as e:
            # Kept deliberately broad so that callers keep seeing a single IOError for any
            # read failure; the original exception is chained for debugging.
            raise IOError(f"Error reading log file: {e}") from e

        # Trailing blank lines would otherwise be mistaken for the last log message.
        while self.log_file_lines and self.log_file_lines[-1].strip() == "":
            self.log_file_lines.pop()

    def _get_time_stamp(self, time_string: str) -> datetime:
        """Convert a time string to a datetime object.

        Parameters
        ----------
        time_string : str
            The time string to convert, expected format: "YYYY-MM-DD HH:MM:SS".

        Returns
        -------
        datetime
            The corresponding datetime object if the conversion was successful.

        Raises
        ------
        ValueError
            If the string cannot be parsed with the expected format.
        """
        try:
            return datetime.strptime(time_string, "%Y-%m-%d %H:%M:%S")
        except (TypeError, ValueError) as e:
            raise ValueError(f"Error parsing time string: {e}") from e

    def _get_line_time_stamp(self, line: str) -> datetime:
        """Parse the time stamp found before the first ``|`` of a log line.

        Parameters
        ----------
        line : str
            A line of the log file.

        Returns
        -------
        datetime
            The time stamp of the line.

        Raises
        ------
        ValueError
            If the leading field of the line is not a valid time stamp.
        """
        return self._get_time_stamp(line.split("|")[0].strip())

    def get_level_log_messages(self, log_level: str = "ERROR") -> str:
        """Get all log messages of a specific logging level.

        Parameters
        ----------
        log_level : str, optional
            The desired logging level (case-insensitive), by default "ERROR".

        Returns
        -------
        str
            A string containing all log messages of the specified logging level, including
            any indented lines that follow each log message. Empty if there is no match.
        """
        marker = f"| {log_level.upper()} |"
        lines = self.log_file_lines
        n_lines = len(lines)

        collected: List[str] = []
        for line_idx, line in enumerate(lines):
            if marker not in line:
                continue
            collected.append(line)

            # Append the continuation lines (starting with a space) that belong to this message.
            follow_idx = line_idx + 1
            while follow_idx < n_lines and lines[follow_idx].startswith(" "):
                collected.append(lines[follow_idx])
                follow_idx += 1

        return "".join(collected)

    def check_string_in_last_line(self, target_string: str) -> bool:
        """Check if a specific string is present in the last line of the log file.

        Parameters
        ----------
        target_string : str
            The string to check for in the last line of the log file.

        Returns
        -------
        bool
            ``True`` if the target string is found in the last line, ``False`` otherwise
            (including the case of an empty log file).
        """
        if not self.log_file_lines:
            return False
        return target_string in self.log_file_lines[-1]

    def _find_feature_name(self, start_idx: int) -> Optional[str]:
        """Find the name of the first feature mentioned at or after a given line.

        A line mentions a feature if it contains ``-atom-`` or ``-bond-`` and does not
        contain ``configuration settings:``. The name is the text between the first two
        single quotes of that line.

        Parameters
        ----------
        start_idx : int
            The index of the line at which the search starts.

        Returns
        -------
        Optional[str]
            The feature name, or ``None`` if no line mentions a feature.

        Raises
        ------
        ValueError
            If the matching line does not contain a quoted name.
        """
        lines = self.log_file_lines
        for idx in range(start_idx, len(lines)):
            candidate = lines[idx]
            if "configuration settings:" in candidate:
                continue
            if "-atom-" in candidate or "-bond-" in candidate:
                parts = candidate.split("'")
                if len(parts) < 2:
                    raise ValueError("Could not extract feature name from log line.")
                return parts[1]
        return None

    def _find_feature_end_time(self, feature_name: str) -> datetime:
        """Find the end time of a feature from its last mention in the log file.

        The last line containing the quoted feature name (and not containing
        ``configuration settings:``) is located. The time stamp is then taken from the
        line directly before it or, failing that, the line two before it, whichever is a
        line written by the atom/bond featurization routines.

        Parameters
        ----------
        feature_name : str
            The name of the feature.

        Returns
        -------
        datetime
            The end time of the feature.

        Raises
        ------
        ValueError
            If no suitable line carrying the end time is found.
        """
        lines = self.log_file_lines
        quoted_name = f"'{feature_name}'"

        # Walk backwards by index instead of building a reversed copy of the whole log.
        for idx in range(len(lines) - 1, -1, -1):
            line = lines[idx]
            if quoted_name not in line or "configuration settings:" in line:
                continue
            for prev_idx in (idx - 1, idx - 2):
                # Guard against negative indices, which would wrap to the end of the list.
                if prev_idx < 0:
                    continue
                if any(marker in lines[prev_idx] for marker in self._FEATURIZER_MARKERS):
                    return self._get_line_time_stamp(lines[prev_idx])
            raise ValueError("Could not find end time for feature.")

        raise ValueError("Could not find end time for feature.")

    def get_time_for_individual_features(self) -> pd.DataFrame:
        """Get the elapsed time for each individual feature.

        For every line containing ``Validated configuration settings: {``, the start time
        is read from the preceding line and the feature name from the next line mentioning
        a feature. The end time is derived from the last mention of that feature in the
        whole log file; if a feature occurs in several configuration blocks, the entry of
        the last block is kept.

        Returns
        -------
        pd.DataFrame
            DataFrame with feature names as index and columns for elapsed time, start time,
            end time, and feature type, sorted by elapsed time in descending order. The
            elapsed time is truncated to whole seconds. If no feature is found, an empty
            DataFrame with the same columns is returned.

        Raises
        ------
        ValueError
            If a start or end time cannot be found or parsed.
        """
        lines = self.log_file_lines
        time_dict = {}

        for line_idx, line in enumerate(lines):
            if "Validated configuration settings: {" not in line:
                continue

            # The start time is carried by the line before the settings line; without such a
            # line, index -1 would silently pick the last line of the file.
            if line_idx == 0:
                raise ValueError("Could not find start time for feature.")
            start_time = self._get_line_time_stamp(lines[line_idx - 1])

            feature_name = self._find_feature_name(line_idx)
            if feature_name is None:
                continue

            end_time = self._find_feature_end_time(feature_name)
            elapsed_time = end_time - start_time
            time_dict[feature_name] = {
                "elapsed_time [s]": elapsed_time.total_seconds(),
                "start_time": start_time.strftime("%H:%M:%S"),
                "end_time": end_time.strftime("%H:%M:%S"),
                "feature_type": "atom" if "-atom-" in feature_name else "bond",
            }

        if not time_dict:
            # Sorting an empty frame by a missing column would raise a KeyError.
            return pd.DataFrame(columns=list(self._TIME_COLUMNS))

        df = pd.DataFrame(time_dict).T.sort_values("elapsed_time [s]", ascending=False)
        df["elapsed_time [s]"] = df["elapsed_time [s]"].astype(int)
        return df

    def _get_total_time_for_featurization(self, feature_type: str) -> float:
        """Get the time between the START and DONE messages of a featurization routine.

        If several START or DONE messages are present, the last one of each is used.

        Parameters
        ----------
        feature_type : str
            Either "atom" or "bond".

        Returns
        -------
        float
            The time between the START and DONE messages in seconds.

        Raises
        ------
        ValueError
            If a time stamp cannot be parsed or the START and/or DONE message is missing.
        """
        prefix = f"| AtomBondFeaturizer.featurize_{feature_type}s() |"
        start_marker = f"{prefix} START"
        done_marker = f"{prefix} DONE"

        start_time: Optional[datetime] = None
        end_time: Optional[datetime] = None

        for line in self.log_file_lines:
            if start_marker in line:
                try:
                    start_time = self._get_line_time_stamp(line)
                except Exception as e:
                    raise ValueError(
                        f"Error parsing start time for {feature_type} featurization: {e}"
                    ) from e
            if done_marker in line:
                try:
                    end_time = self._get_line_time_stamp(line)
                except Exception as e:
                    raise ValueError(
                        f"Error parsing end time for {feature_type} featurization: {e}"
                    ) from e

        if start_time is None or end_time is None:
            raise ValueError(
                f"Could not find start and/or end time for {feature_type} featurization "
                "in log file."
            )

        return (end_time - start_time).total_seconds()

    def get_total_time_for_atom_featurization(self) -> float:
        """Get the total time taken for atom featurization.

        Returns
        -------
        float
            The total time taken for atom featurization in seconds.

        Raises
        ------
        ValueError
            If a time stamp cannot be parsed or the START and/or DONE message is missing.
        """
        return self._get_total_time_for_featurization("atom")

    def get_total_time_for_bond_featurization(self) -> float:
        """Get the total time taken for bond featurization.

        Returns
        -------
        float
            The total time taken for bond featurization in seconds.

        Raises
        ------
        ValueError
            If a time stamp cannot be parsed or the START and/or DONE message is missing.
        """
        return self._get_total_time_for_featurization("bond")

    def get_total_runtime(self) -> float:
        """Get the total runtime.

        The runtime is the difference between the time stamp of the first line and the
        time stamp of the last line that does not start with a space.

        Returns
        -------
        float
            The total runtime in seconds.

        Raises
        ------
        ValueError
            If the log file is empty, has no non-indented line, or a time stamp cannot be
            parsed.
        """
        if not self.log_file_lines:
            raise ValueError("Log file is empty.")

        first_line = self.log_file_lines[0]

        # Indented lines are continuation lines without a time stamp, so skip them.
        last_line = next(
            (line for line in reversed(self.log_file_lines) if not line.startswith(" ")),
            None,
        )
        if last_line is None:
            raise ValueError("Could not find a valid last line in log file.")

        try:
            start_time = self._get_line_time_stamp(first_line)
        except Exception as e:
            raise ValueError(f"Error parsing start time from log file: {e}") from e

        try:
            end_time = self._get_line_time_stamp(last_line)
        except Exception as e:
            raise ValueError(f"Error parsing end time from log file: {e}") from e

        return (end_time - start_time).total_seconds()