from typing import Any, Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.metrics import log_loss, roc_auc_score

from llm_studio.python_configs.base import DefaultConfigProblemBase


def accuracy_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate accuracy score.

    Only considers the predicted value (results["predictions"]) and target
    value (results["target_text"]).
    It supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, ignored
        results: Dict, model results including 'predictions' and 'target_text'
        val_df: pd.DataFrame, validation dataframe
        raw_results: bool, ignored

    Returns:
        Numpy array of 0.0 or 1.0 for each sample

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["predictions"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predicted ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate accuracy score")

    return (predictions == target).mean(axis=1).reshape(-1).astype("float")
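

# Minimal usage sketch with illustrative values: ``accuracy_score`` ignores
# ``cfg`` and ``val_df``, so ``None`` and an empty frame suffice for a
# standalone call.
if __name__ == "__main__":
    _acc_results = {
        "predictions": [[1], [0], [1]],
        "target_text": ["1", "0", "0"],
    }
    # Per-sample accuracy: 1.0 where prediction matches target, else 0.0.
    assert accuracy_score(None, _acc_results, pd.DataFrame()).tolist() == [
        1.0,
        1.0,
        0.0,
    ]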


def auc_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate Area Under the ROC Curve (AUC) score.

    This function computes the AUC score using the predicted logits and target
    values. It supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'logits' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        float: AUC score for binary classification
        NDArray: AUC scores for multiclass classification (one-vs-rest)

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    logits = np.array(results["logits"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(logits):
        raise ValueError(
            f"Length of target ({len(target)}) and logits ({len(logits)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate AUC score.")

    # One-hot encode integer targets for the multiclass (one-vs-rest) case.
    if target.shape[1] == 1 and cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]

    return roc_auc_score(target, logits, multi_class="ovr")
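

# Minimal usage sketch with illustrative values: only ``dataset.num_classes``
# is read from the config here, so a ``SimpleNamespace`` stub stands in for a
# full ``DefaultConfigProblemBase``.
if __name__ == "__main__":
    from types import SimpleNamespace

    _auc_cfg = SimpleNamespace(dataset=SimpleNamespace(num_classes=3))
    _auc_results = {
        "logits": [[0.8, 0.1, 0.1], [0.1, 0.7, 0.2], [0.2, 0.1, 0.7]],
        "target_text": ["0", "1", "2"],
    }
    # Perfectly separable demo data yields an AUC of 1.0.
    assert auc_score(_auc_cfg, _auc_results, pd.DataFrame()) == 1.0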


def logloss_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate the Log Loss (Cross-Entropy Loss) score.

    This function computes the log loss using the predicted probabilities and
    target. It supports binary, multiclass, and multilabel classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'probabilities' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        float: Log Loss score

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["probabilities"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predictions ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate log loss.")

    # Handle multilabel case
    if len(cfg.dataset.answer_column) > 1:
        log_losses = []
        for col in range(len(cfg.dataset.answer_column)):
            log_losses.append(log_loss(target[:, col], predictions[:, col]))
        return np.mean(log_losses)

    # Handle binary and multiclass cases
    if cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]

    return log_loss(target, predictions)
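

# Minimal usage sketch with illustrative values: a single-label, two-class
# setup, so the stub config carries ``answer_column`` (length 1) and
# ``num_classes``.
if __name__ == "__main__":
    from types import SimpleNamespace

    _ll_cfg = SimpleNamespace(
        dataset=SimpleNamespace(answer_column=["label"], num_classes=2)
    )
    _ll_results = {
        "probabilities": [[0.9, 0.1], [0.2, 0.8]],
        "target_text": ["0", "1"],
    }
    # Mean cross-entropy over the two demo samples (about 0.164 here).
    print(logloss_score(_ll_cfg, _ll_results, pd.DataFrame()))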


class Metrics:
    """
    Metrics factory. Returns:
        - metric value
        - should it be maximized or minimized
        - reduce function

    Maximized or minimized is needed for early stopping (saving best checkpoint).
    Reduce function to generate a single metric value, usually "mean" or "none".
    """

    _metrics = {
        "AUC": (auc_score, "max", "mean"),
        "Accuracy": (accuracy_score, "max", "mean"),
        "LogLoss": (logloss_score, "min", "mean"),
    }

    @classmethod
    def names(cls) -> List[str]:
        return sorted(cls._metrics.keys())

    @classmethod
    def get(cls, name: str) -> Any:
        """Access to Metrics.

        Args:
            name: metrics name

        Returns:
            A (metric function, direction, reduce function) tuple;
            unknown names fall back to the LogLoss entry.
        """
        return cls._metrics.get(name, cls._metrics["LogLoss"])
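

# Minimal usage sketch: the factory maps a metric name to a
# (function, direction, reduce) triple and falls back to LogLoss for
# unknown names.
if __name__ == "__main__":
    _metric_fn, _direction, _reduce = Metrics.get("Accuracy")
    assert _metric_fn is accuracy_score and _direction == "max"
    assert Metrics.get("Unknown")[0] is logloss_score
    print(Metrics.names())  # ['AUC', 'Accuracy', 'LogLoss']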