# llm_studio/src/metrics/text_causal_classification_modeling_metrics.py
from typing import Any, Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.metrics import log_loss, roc_auc_score

from llm_studio.python_configs.base import DefaultConfigProblemBase

def accuracy_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate accuracy score.

    Only considers the predicted value (results["predictions"]) and target value
    (results["target_text"]).
    It supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, ignored
        results: Dict, model results including 'predictions' and 'target_text'
        val_df: pd.DataFrame, validation dataframe
        raw_results: bool, ignored

    Returns:
        Numpy array of 0.0 or 1.0 for each sample

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["predictions"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predicted ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate accuracy score.")

    return (predictions == target).mean(axis=1).reshape(-1).astype("float")
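
# A minimal usage sketch for `accuracy_score` (hypothetical values; `cfg` and
# `val_df` are not used by the computation, so placeholders are passed):
#
#   results = {"predictions": [[1], [0], [2]], "target_text": ["1", "0", "1"]}
#   accuracy_score(None, results, val_df=pd.DataFrame())
#   # -> array([1., 1., 0.])  # per-sample correctness, reduced later via "mean"
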
def auc_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate Area Under the ROC Curve (AUC) score.

    This function computes the AUC score using the predicted logits and target values.
    It supports both binary and multiclass classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'logits' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        float: AUC score for binary classification, or the macro-averaged
            one-vs-rest AUC for multiclass classification

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    logits = np.array(results["logits"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(logits):
        raise ValueError(
            f"Length of target ({len(target)}) and logits ({len(logits)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate AUC score.")
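
    # For multiclass runs where the target arrives as a single label column,
    # expand it to a one-hot matrix so `roc_auc_score` can compute a
    # one-vs-rest score per class.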
    if target.shape[1] == 1 and cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]

    return roc_auc_score(target, logits, multi_class="ovr")
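
# A minimal usage sketch for `auc_score`, assuming a binary setup where
# `cfg.dataset.num_classes == 1` (the SimpleNamespace cfg is a stand-in for a
# real DefaultConfigProblemBase, used only for illustration):
#
#   from types import SimpleNamespace
#   cfg = SimpleNamespace(dataset=SimpleNamespace(num_classes=1))
#   results = {"logits": [[0.9], [0.2], [0.7], [0.1]],
#              "target_text": ["1", "0", "1", "0"]}
#   auc_score(cfg, results, val_df=pd.DataFrame())  # -> 1.0 (perfect ranking)
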
def logloss_score(
    cfg: DefaultConfigProblemBase,
    results: Dict,
    val_df: pd.DataFrame,
    raw_results: bool = False,
) -> Union[NDArray, Tuple[NDArray, List[str]]]:
    """Calculate the Log Loss (Cross-Entropy Loss) score.

    This function computes the log loss using the predicted probabilities and target.
    It supports binary, multiclass, and multilabel classification.

    Args:
        cfg: DefaultConfigProblemBase, configuration
        results: Dict, model results including 'probabilities' and 'target_text'
        val_df: pd.DataFrame, ignored
        raw_results: bool, ignored

    Returns:
        float: Log Loss score

    Raises:
        ValueError: If input data is invalid or inconsistent
    """
    predictions = np.array(results["probabilities"])
    target = np.array(
        [[int(t) for t in text.split(",")] for text in results["target_text"]]
    )

    # Input validation
    if len(target) != len(predictions):
        raise ValueError(
            f"Length of target ({len(target)}) and predictions ({len(predictions)}) "
            "should be the same."
        )
    if len(target) == 0:
        raise ValueError("No data to calculate log loss.")

    # Handle multilabel case: average the per-column binary log losses
    if len(cfg.dataset.answer_column) > 1:
        log_losses = []
        for col in range(len(cfg.dataset.answer_column)):
            log_losses.append(log_loss(target[:, col], predictions[:, col]))
        return np.mean(log_losses)

    # Handle binary and multiclass cases: one-hot encode the labels first
    if cfg.dataset.num_classes > 1:
        target = np.eye(cfg.dataset.num_classes)[target.reshape(-1)]
    return log_loss(target, predictions)
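
# A minimal usage sketch for `logloss_score`, assuming a 3-class single-label
# setup (SimpleNamespace stands in for a real DefaultConfigProblemBase):
#
#   from types import SimpleNamespace
#   cfg = SimpleNamespace(
#       dataset=SimpleNamespace(num_classes=3, answer_column=["label"])
#   )
#   results = {"probabilities": [[0.8, 0.1, 0.1], [0.1, 0.7, 0.2]],
#              "target_text": ["0", "1"]}
#   logloss_score(cfg, results, val_df=pd.DataFrame())
#   # -> mean(-log(0.8), -log(0.7)) ≈ 0.290
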
class Metrics:
    """
    Metrics factory. Returns:
        - metric value
        - should it be maximized or minimized
        - reduce function

    Maximized or minimized is needed for early stopping (saving best checkpoint).
    Reduce function to generate a single metric value, usually "mean" or "none".
    """

    _metrics = {
        "AUC": (auc_score, "max", "mean"),
        "Accuracy": (accuracy_score, "max", "mean"),
        "LogLoss": (logloss_score, "min", "mean"),
    }

    @classmethod
    def names(cls) -> List[str]:
        return sorted(cls._metrics.keys())

    @classmethod
    def get(cls, name: str) -> Any:
        """Access to Metrics.

        Args:
            name: metrics name

        Returns:
            Tuple of (metric function, optimization direction, reduce function);
            unknown names fall back to the "LogLoss" entry
        """
        return cls._metrics.get(name, cls._metrics["LogLoss"])
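
# A minimal usage sketch for the factory (the unpacked names are illustrative):
#
#   metric_fn, direction, reduce_fn = Metrics.get("Accuracy")
#   per_sample = metric_fn(cfg, results, val_df)  # NDArray of 0.0/1.0
#   # direction == "max"  -> higher is better for checkpoint selection;
#   # reduce_fn == "mean" -> average per-sample values into one score.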