# Source code for bonafide.log_file_analysis

"""Utility methods for analyzing log files from BONAFIDE after or during feature generation."""

from datetime import datetime
from typing import List, Optional

import pandas as pd


class LogFileAnalyzer:
    """Analyze a log file from the Bond and Atom Featurizer and Descriptor Extractor (BONAFIDE).

    The file is read once at construction time; all analysis methods operate on the
    in-memory list of lines.

    Parameters
    ----------
    log_file_path : str
        The path to the log file to analyze.

    Attributes
    ----------
    log_file_path : str
        The path the log file was read from.
    log_file_lines : List[str]
        A list of the lines of the log file (trailing blank lines removed).
    """

    # Substrings identifying log header lines written by the featurization entry points.
    _FEATURIZER_MARKERS = (
        "AtomBondFeaturizer.featurize_atoms()",
        "AtomBondFeaturizer.featurize_bonds()",
    )

    # Column layout of the per-feature timing table.
    _TIMING_COLUMNS = ("elapsed_time [s]", "start_time", "end_time", "feature_type")

    def __init__(self, log_file_path: str) -> None:
        self.log_file_path: str = log_file_path
        self.log_file_lines: List[str] = []
        self._read_file()

    def _read_file(self) -> None:
        """Read the log file into ``self.log_file_lines`` and drop trailing blank lines.

        Returns
        -------
        None

        Raises
        ------
        IOError
            If the file cannot be opened or read for any reason.
        """
        try:
            with open(self.log_file_path, "r") as f:
                self.log_file_lines = f.readlines()
        except Exception as e:
            # Deliberately broad: every read failure is normalized to IOError for callers.
            raise IOError(f"Error reading log file: {e}") from e

        # Trailing empty lines would break the "last line" based checks below.
        while self.log_file_lines and self.log_file_lines[-1].strip() == "":
            self.log_file_lines.pop()

    def _get_time_stamp(self, time_string: str) -> datetime:
        """Convert a time string to a datetime object.

        Parameters
        ----------
        time_string : str
            The time string to convert, expected format: "YYYY-MM-DD HH:MM:SS".

        Returns
        -------
        datetime
            The corresponding datetime object if the conversion was successful.

        Raises
        ------
        ValueError
            If the string does not match the expected format.
        """
        try:
            return datetime.strptime(time_string, "%Y-%m-%d %H:%M:%S")
        except (TypeError, ValueError) as e:
            raise ValueError(f"Error parsing time string: {e}") from e

    def get_level_log_messages(self, log_level: str = "ERROR") -> str:
        """Get all log messages of a specific logging level.

        Parameters
        ----------
        log_level : str, optional
            The desired logging level (case-insensitive), by default "ERROR".

        Returns
        -------
        str
            A string containing all log messages of the specified logging level,
            including any indented lines that follow each log message. Empty if no
            message of that level exists.
        """
        marker = f"| {log_level.upper()} |"
        collected: List[str] = []
        in_message = False
        for line in self.log_file_lines:
            if in_message and line.startswith(" "):
                # Indented continuation of the current message. It is emitted exactly
                # once, even if it happens to contain the level marker itself.
                collected.append(line)
            elif marker in line:
                collected.append(line)
                in_message = True
            else:
                in_message = False
        return "".join(collected)

    def check_string_in_last_line(self, target_string: str) -> bool:
        """Check if a specific string is present in the last line of the log file.

        Parameters
        ----------
        target_string : str
            The string to check for in the last line of the log file.

        Returns
        -------
        bool
            ``True`` if the target string is found in the last line, ``False`` otherwise
            (including when the log file has no lines).
        """
        if not self.log_file_lines:
            return False
        return target_string in self.log_file_lines[-1]

    def _find_feature_name(self, lines: List[str]) -> Optional[str]:
        """Return the first quoted feature name mentioned in ``lines``, or ``None``."""
        for line in lines:
            if ("-atom-" in line or "-bond-" in line) and "configuration settings:" not in line:
                return line.split("'")[1]
        return None

    def _find_feature_end_time(self, lines_inverted: List[str], feature_name: str) -> datetime:
        """Return the time stamp of the featurizer header next to the last feature mention.

        ``lines_inverted`` is the log in reverse order, so the one or two entries that
        follow a match are the lines written just before it in the file.
        """
        quoted_name = f"'{feature_name}'"
        for idx, line in enumerate(lines_inverted):
            if quoted_name not in line or "configuration settings:" in line:
                continue
            # Slicing keeps this in bounds when the match is near the top of the file.
            for candidate in lines_inverted[idx + 1 : idx + 3]:
                if any(marker in candidate for marker in self._FEATURIZER_MARKERS):
                    return self._get_time_stamp(candidate.split("|")[0].strip())
            raise ValueError("Could not find end time for feature.")
        # No line mentions the feature at all: fail explicitly rather than reuse a
        # stale end time from a previous feature.
        raise ValueError("Could not find end time for feature.")

    def get_time_for_individual_features(self) -> pd.DataFrame:
        """Get the elapsed time for each individual feature.

        Returns
        -------
        pd.DataFrame
            DataFrame with feature names as index and columns for elapsed time
            (whole seconds), start time, end time, and feature type, sorted by
            elapsed time in descending order. An empty DataFrame with the same
            columns is returned if the log contains no feature blocks.

        Raises
        ------
        ValueError
            If a start or end time stamp for a feature cannot be located or parsed.
        """
        lines = self.log_file_lines
        # Reverse once; the list is reused for every feature's end-time lookup.
        lines_inverted = lines[::-1]

        time_dict = {}
        for line_idx, line in enumerate(lines):
            if "Validated configuration settings: {" not in line:
                continue
            if line_idx == 0:
                # The time stamp is read from the preceding line; index -1 would
                # silently wrap around to the last line of the file.
                raise ValueError("Could not find start time for feature.")
            start_time = self._get_time_stamp(lines[line_idx - 1].split("|")[0].strip())

            feature_name = self._find_feature_name(lines[line_idx:])
            if feature_name is None:
                continue

            end_time = self._find_feature_end_time(lines_inverted, feature_name)
            elapsed_time = end_time - start_time
            time_dict[feature_name] = {
                "elapsed_time [s]": elapsed_time.total_seconds(),
                "start_time": start_time.strftime("%H:%M:%S"),
                "end_time": end_time.strftime("%H:%M:%S"),
                "feature_type": "atom" if "-atom-" in feature_name else "bond",
            }

        if not time_dict:
            # Sorting an empty frame by a missing column would raise KeyError.
            return pd.DataFrame(columns=list(self._TIMING_COLUMNS))

        df = pd.DataFrame(time_dict).T.sort_values("elapsed_time [s]", ascending=False)
        df["elapsed_time [s]"] = df["elapsed_time [s]"].astype(int)
        return df

    def _get_total_time_for_featurization(self, target: str) -> float:
        """Return the seconds between the last START and last DONE line for ``target``.

        ``target`` is ``"atom"`` or ``"bond"``.
        """
        prefix = f"| AtomBondFeaturizer.featurize_{target}s() | "
        start_marker = prefix + "START"
        done_marker = prefix + "DONE"

        start_time: Optional[datetime] = None
        end_time: Optional[datetime] = None
        for line in self.log_file_lines:
            if start_marker in line:
                try:
                    start_time = self._get_time_stamp(line.split("|")[0].strip())
                except ValueError as e:
                    raise ValueError(
                        f"Error parsing start time for {target} featurization: {e}"
                    ) from e
            if done_marker in line:
                try:
                    end_time = self._get_time_stamp(line.split("|")[0].strip())
                except ValueError as e:
                    raise ValueError(
                        f"Error parsing end time for {target} featurization: {e}"
                    ) from e

        if start_time is None or end_time is None:
            raise ValueError(
                f"Could not find start and/or end time for {target} featurization in log file."
            )
        return (end_time - start_time).total_seconds()

    def get_total_time_for_atom_featurization(self) -> float:
        """Get the total time taken for atom featurization.

        Returns
        -------
        float
            The total time taken for atom featurization in seconds.

        Raises
        ------
        ValueError
            If the START or DONE line is missing or its time stamp cannot be parsed.
        """
        return self._get_total_time_for_featurization("atom")

    def get_total_time_for_bond_featurization(self) -> float:
        """Get the total time taken for bond featurization.

        Returns
        -------
        float
            The total time taken for bond featurization in seconds.

        Raises
        ------
        ValueError
            If the START or DONE line is missing or its time stamp cannot be parsed.
        """
        return self._get_total_time_for_featurization("bond")

    def get_total_runtime(self) -> float:
        """Get the total runtime.

        The runtime is the difference between the time stamp of the first line and the
        time stamp of the last line that does not start with a space.

        Returns
        -------
        float
            The total runtime in seconds.

        Raises
        ------
        ValueError
            If the log is empty, has no non-indented line, or a time stamp cannot be
            parsed.
        """
        if not self.log_file_lines:
            raise ValueError("Log file is empty.")

        first_line = self.log_file_lines[0]
        # Indented lines are continuations without a time stamp; skip them from the end.
        last_line = next(
            (line for line in reversed(self.log_file_lines) if not line.startswith(" ")),
            None,
        )
        if last_line is None:
            raise ValueError("Could not find a valid last line in log file.")

        try:
            start_time = self._get_time_stamp(first_line.split("|")[0].strip())
        except ValueError as e:
            raise ValueError(f"Error parsing start time from log file: {e}") from e
        try:
            end_time = self._get_time_stamp(last_line.split("|")[0].strip())
        except ValueError as e:
            raise ValueError(f"Error parsing end time from log file: {e}") from e

        return (end_time - start_time).total_seconds()