from typing import Any, Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.metrics import log_loss, roc_auc_score

from llm_studio.python_configs.base import DefaultConfigProblemBase


def accuracy_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate accuracy score.

    Only considers the predicted value (results["predictions"]) and target value
    (results["target_text"]). Supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, ignored
        results: Dict, model results including 'predictions' and 'target_text'
        val_df: pd.DataFrame, validation dataframe
        raw_results: bool, ignored

    Returns:
        Numpy array of 0.0 or 1.0 for each sample

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["predictions"])
    # Targets arrive as comma-separated strings, e.g. "1" or "1,0,1" (multilabel).
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predicted ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate accuracy score.")

    return (predictions == target).mean(axis=1).reshape(-1).astype("float")


def auc_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate Area Under the ROC Curve (AUC) score.

    Computes the AUC score from the predicted logits and target values.
    Supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'logits' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        float: AUC score (macro-averaged one-vs-rest for multiclass)

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    logits = np.array(results["logits"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(logits):
        raise ValueError(
            f"Length of target ({len(target)}) and logits ({len(logits)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate AUC score.")

    # Expand single-column class indices to one-hot targets for multiclass AUC.
    if target.shape[1] == 1 and cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]

    return roc_auc_score(target, logits, multi_class="ovr")
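# Note on the one-hot expansion above: for targets stored as one class index
# per row, `np.eye(n)[idx]` builds the indicator matrix sklearn expects, e.g.
#
#     >>> np.eye(3)[np.array([0, 2])]
#     array([[1., 0., 0.],
#            [0., 0., 1.]])
#
# which matches the (n_samples, n_classes) target shape that
# `roc_auc_score(..., multi_class="ovr")` requires.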
def logloss_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate the Log Loss (Cross-Entropy Loss) score.

    Computes the log loss from the predicted probabilities and target values.
    Supports binary, multiclass, and multilabel classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'probabilities' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        float: Log Loss score

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["probabilities"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predictions ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate log loss.")

    # Multilabel case: average the per-label binary log losses.
    if len(cfg.dataset.answer_column) > 1:
        log_losses = []
        for col in range(len(cfg.dataset.answer_column)):
            log_losses.append(log_loss(target[:, col], predictions[:, col]))
        return np.mean(log_losses)

    # Binary and multiclass cases: one-hot encode multiclass targets.
    if cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]
    return log_loss(target, predictions)


class Metrics:
    """Metrics factory.

    Each entry maps a metric name to:
        - the metric function
        - whether the metric should be maximized or minimized
        - a reduce function to aggregate per-sample values, usually "mean" or "none"

    Maximize vs. minimize is needed for early stopping (saving the best
    checkpoint).
    """

    _metrics = {
        "AUC": (auc_score, "max", "mean"),
        "Accuracy": (accuracy_score, "max", "mean"),
        "LogLoss": (logloss_score, "min", "mean"),
    }

    @classmethod
    def names(cls) -> List[str]:
        return sorted(cls._metrics.keys())

    @classmethod
    def get(cls, name: str) -> Any:
        """Access to Metrics.

        Args:
            name: metric name

        Returns:
            The (metric function, optimization mode, reduce function) tuple,
            falling back to "LogLoss" for unknown names.
        """
        return cls._metrics.get(name, cls._metrics["LogLoss"])
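# Usage sketch (illustrative only, not part of the module): how a caller might
# resolve and apply a metric. The `results` payload below is an assumption;
# accuracy_score ignores `cfg`, but auc_score and logloss_score need a real
# DefaultConfigProblemBase with `dataset.num_classes` / `dataset.answer_column`
# populated.
#
#     score_fn, mode, reduce = Metrics.get("Accuracy")
#     results = {
#         "predictions": [[1], [0], [1]],
#         "target_text": ["1", "1", "1"],
#     }
#     per_sample = score_fn(None, results, pd.DataFrame())
#     # per_sample -> array([1., 0., 1.]); with reduce == "mean" -> 0.666...
#     # mode == "max" tells early stopping that larger is better.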