from typing import Any, Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.metrics import log_loss, roc_auc_score

from llm_studio.python_configs.base import DefaultConfigProblemBase

def accuracy_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate accuracy score.

    Only considers the predicted value (results["predictions"]) and target value
    (results["target_text"]).
    It supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, ignored
        results: Dict, model results including 'predictions' and 'target_text'
        val_df: pd.DataFrame, validation dataframe
        raw_results: bool, ignored

    Returns:
        Numpy array of 0.0 or 1.0 for each sample

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["predictions"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predicted ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate accuracy score")

    return (predictions == target).mean(axis=1).reshape(-1).astype("float")
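
# Illustrative sketch of the expected input format (not part of the library API):
# for a single-label binary task, ``results`` could look like
#     {"predictions": [[1], [0], [1]], "target_text": ["1", "1", "1"]}
# and accuracy_score would return array([1., 0., 1.]), one value per sample.
# For multilabel targets, each target_text entry is comma-separated, e.g. "0,1".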

def auc_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate the Area Under the ROC Curve (AUC) score.

    This function computes the AUC score using the predicted logits and target values.
    It supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'logits' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        A single float: the AUC score for binary classification, or the
        macro-averaged one-vs-rest AUC for multiclass classification

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    logits = np.array(results["logits"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(logits):
        raise ValueError(
            f"Length of target ({len(target)}) and logits ({len(logits)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate AUC score.")

    # Expand single-column integer targets to a one-hot matrix so each class
    # can be scored one-vs-rest.
    if target.shape[1] == 1 and cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]

    return roc_auc_score(target, logits, multi_class="ovr")
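
# Illustrative note on the one-hot expansion above: with num_classes == 3 and
# target == [[2], [0]], np.eye(3)[target.reshape(-1)] yields
#     [[0., 0., 1.],
#      [1., 0., 0.]]
# so each class column is scored one-vs-rest by roc_auc_score.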

def logloss_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate the Log Loss (Cross-Entropy Loss) score.

    This function computes the log loss using the predicted probabilities and target.
    It supports binary, multiclass, and multilabel classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'probabilities' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        float: Log Loss score

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["probabilities"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predictions ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate log loss.")

    # Handle multilabel case
    if len(cfg.dataset.answer_column) > 1:
        log_losses = []
        for col in range(len(cfg.dataset.answer_column)):
            log_losses.append(log_loss(target[:, col], predictions[:, col]))
        return np.mean(log_losses)

    # Handle binary and multiclass cases
    if cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]
    return log_loss(target, predictions)
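
# Illustrative note on the multilabel branch above: with two answer columns,
# each label column is scored separately, e.g.
#     log_loss(target[:, 0], predictions[:, 0]) and
#     log_loss(target[:, 1], predictions[:, 1]),
# and the per-label log losses are averaged into a single float.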

class Metrics:
    """
    Metrics factory. Returns:
        - metric value
        - should it be maximized or minimized
        - reduce function

    Maximized or minimized is needed for early stopping (saving best checkpoint)
    Reduce function to generate a single metric value, usually "mean" or "none"
    """

    _metrics = {
        "AUC": (auc_score, "max", "mean"),
        "Accuracy": (accuracy_score, "max", "mean"),
        "LogLoss": (logloss_score, "min", "mean"),
    }

    @classmethod
    def names(cls) -> List[str]:
        return sorted(cls._metrics.keys())

    @classmethod
    def get(cls, name: str) -> Any:
        """Access to Metrics.

        Args:
            name: metrics name

        Returns:
            A tuple of (metric function, optimization direction, reduce function);
            falls back to the "LogLoss" entry if the name is unknown.
        """
        return cls._metrics.get(name, cls._metrics["LogLoss"])
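

if __name__ == "__main__":
    # Minimal usage sketch (not part of the library): exercises the Metrics
    # factory on a tiny synthetic single-label example. ``accuracy_score``
    # ignores ``cfg`` and ``val_df``, so placeholders are passed here.
    metric_fn, direction, reduce_fn = Metrics.get("Accuracy")
    demo_results = {
        "predictions": [[1], [0], [1]],
        "target_text": ["1", "1", "1"],
    }
    per_sample = metric_fn(None, demo_results, pd.DataFrame())
    print(Metrics.names())       # ['AUC', 'Accuracy', 'LogLoss']
    print(per_sample)            # [1. 0. 1.]
    print(direction, reduce_fn)  # max mean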