from typing import Any, Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.metrics import log_loss, roc_auc_score

from llm_studio.python_configs.base import DefaultConfigProblemBase

def accuracy_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate accuracy score.

    Only considers the predicted value (results["predictions"]) and target value
    (results["target_text"]).
    It supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, ignored
        results: Dict, model results including 'predictions' and 'target_text'
        val_df: pd.DataFrame, validation dataframe
        raw_results: bool, ignored

    Returns:
        Numpy array of 0.0 or 1.0 for each sample

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["predictions"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predicted ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate accuracy score")

    return (predictions == target).mean(axis=1).reshape(-1).astype("float")
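
# Illustrative sketch of the expected input format (not part of the library API):
# for a single-label binary task, ``results`` could look like
#     {"predictions": [[1], [0], [1]], "target_text": ["1", "1", "1"]}
# and accuracy_score would return array([1., 0., 1.]), one value per sample.
# For multilabel targets, each target_text entry is comma-separated, e.g. "0,1".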

def auc_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate the Area Under the ROC Curve (AUC) score.

    This function computes the AUC score using the predicted logits and target values.
    It supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'logits' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        A single float: the AUC score for binary classification, or the
        macro-averaged one-vs-rest AUC for multiclass classification

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    logits = np.array(results["logits"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(logits):
        raise ValueError(
            f"Length of target ({len(target)}) and logits ({len(logits)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate AUC score.")

    # Expand single-column integer targets to a one-hot matrix so each class
    # can be scored one-vs-rest.
    if target.shape[1] == 1 and cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]

    return roc_auc_score(target, logits, multi_class="ovr")
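
# Illustrative note on the one-hot expansion above: with num_classes == 3 and
# target == [[2], [0]], np.eye(3)[target.reshape(-1)] yields
#     [[0., 0., 1.],
#      [1., 0., 0.]]
# so each class column is scored one-vs-rest by roc_auc_score.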

def logloss_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate the Log Loss (Cross-Entropy Loss) score.

    This function computes the log loss using the predicted probabilities and target.
    It supports binary, multiclass, and multilabel classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'probabilities' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        float: Log Loss score

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["probabilities"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predictions ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate log loss.")

    # Handle multilabel case
    if len(cfg.dataset.answer_column) > 1:
        log_losses = []
        for col in range(len(cfg.dataset.answer_column)):
            log_losses.append(log_loss(target[:, col], predictions[:, col]))
        return np.mean(log_losses)

    # Handle binary and multiclass cases
    if cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]
    return log_loss(target, predictions)
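
# Illustrative note on the multilabel branch above: with two answer columns,
# each label column is scored separately, e.g.
#     log_loss(target[:, 0], predictions[:, 0]) and
#     log_loss(target[:, 1], predictions[:, 1]),
# and the per-label log losses are averaged into a single float.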

class Metrics:
    """
    Metrics factory. Returns:
        - metric value
        - should it be maximized or minimized
        - reduce function

    Maximized or minimized is needed for early stopping (saving best checkpoint)
    Reduce function to generate a single metric value, usually "mean" or "none"
    """

    _metrics = {
        "AUC": (auc_score, "max", "mean"),
        "Accuracy": (accuracy_score, "max", "mean"),
        "LogLoss": (logloss_score, "min", "mean"),
    }

    @classmethod
    def names(cls) -> List[str]:
        return sorted(cls._metrics.keys())

    @classmethod
    def get(cls, name: str) -> Any:
        """Access to Metrics.

        Args:
            name: metrics name

        Returns:
            A tuple of (metric function, optimization direction, reduce function);
            falls back to the "LogLoss" entry if the name is unknown.
        """
        return cls._metrics.get(name, cls._metrics["LogLoss"])
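

if __name__ == "__main__":
    # Minimal usage sketch (not part of the library): exercises the Metrics
    # factory on a tiny synthetic single-label example. ``accuracy_score``
    # ignores ``cfg`` and ``val_df``, so placeholders are passed here.
    metric_fn, direction, reduce_fn = Metrics.get("Accuracy")
    demo_results = {
        "predictions": [[1], [0], [1]],
        "target_text": ["1", "1", "1"],
    }
    per_sample = metric_fn(None, demo_results, pd.DataFrame())
    print(Metrics.names())       # ['AUC', 'Accuracy', 'LogLoss']
    print(per_sample)            # [1. 0. 1.]
    print(direction, reduce_fn)  # max mean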