# Hugging Face Space: running on ZeroGPU hardware
import gradio as gr
import random
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
import torch
import spaces
import os
from huggingface_hub import HfApi
# Load Hugging Face token from the environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("HF_TOKEN environment variable is not set. Please set it before running the script.")

hf_api = HfApi(
    token=HF_TOKEN,  # Token is not persisted on the machine.
)
# Check for GPU support and configure the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
zero = torch.Tensor([0]).to(device)  # small probe tensor, used only to report the active device
print(f"Device being used: {zero.device}")
# Define evaluation metrics
def mean_reciprocal_rank(relevance_labels, scores):
    """Return 1 / rank of the first relevant document, or 0 if none is relevant."""
    sorted_indices = np.argsort(scores)[::-1]
    for rank, idx in enumerate(sorted_indices, start=1):
        if relevance_labels[idx] == 1:
            return 1 / rank
    return 0

def mean_average_precision(relevance_labels, scores):
    """Average precision for a single query (averaged over queries later)."""
    return average_precision_score(relevance_labels, scores)

def ndcg_at_k(relevance_labels, scores, k=10):
    """Normalized discounted cumulative gain over the top-k ranked documents."""
    sorted_indices = np.argsort(scores)[::-1]
    relevance_sorted = np.take(relevance_labels, sorted_indices[:k])
    dcg = sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevance_sorted))
    idcg = sum(1 / np.log2(rank + 2) for rank in range(min(k, sum(relevance_labels))))
    return dcg / idcg if idcg > 0 else 0
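
# Worked example for the metrics above: with relevance labels [0, 1, 0] and scores
# [0.2, 0.9, 0.1], the relevant document is ranked first, so mean_reciprocal_rank,
# mean_average_precision, and ndcg_at_k all evaluate to 1.0.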
# Load datasets. As used below, Relevance_Labels_Dataset provides "query",
# "candidate_document", and "relevance_label" columns, while Positive_Negatives_Dataset
# provides "query", "positive", and "negative1" through "negative4". Only the first
# 300 rows of each split are evaluated.
datasets = {
    "Relevance_Labels_Dataset": load_dataset("Omartificial-Intelligence-Space/re-ar-query-candidate", token=HF_TOKEN)["train"].select(range(300)),
    "Positive_Negatives_Dataset": load_dataset("Omartificial-Intelligence-Space/re-ar-test-triplet", token=HF_TOKEN)["train"].select(range(300))
}
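
# The Space runs on ZeroGPU and `spaces` is imported above; decorating the evaluation
# entry point with `spaces.GPU` is the usual ZeroGPU pattern and is assumed here so a
# GPU is attached while the function executes.
@spaces.GPU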
def evaluate_model_with_insights(model_name):
    model = CrossEncoder(model_name, device=device)
    results = []
    sample_outputs = []

    for dataset_name, dataset in datasets.items():
        all_mrr, all_map, all_ndcg = [], [], []
        dataset_samples = []

        if 'candidate_document' in dataset.column_names:
            # Relevance-labels format: candidate documents are grouped by query.
            grouped_data = dataset.to_pandas().groupby("query")
            for query, group in grouped_data:
                # Skip invalid queries
                if query is None or not isinstance(query, str) or query.strip() == "":
                    continue
                candidate_texts = group['candidate_document'].dropna().tolist()
                relevance_labels = group['relevance_label'].tolist()
                # Skip if there are no valid candidate documents or the labels are misaligned
                if not candidate_texts or len(candidate_texts) != len(relevance_labels):
                    continue
                # Keep only well-formed string documents; skip the group if any document was
                # dropped so scores stay aligned with candidate_texts and relevance_labels.
                pairs = [(query, doc) for doc in candidate_texts if isinstance(doc, str) and doc.strip() != ""]
                if len(pairs) != len(candidate_texts):
                    continue
                scores = model.predict(pairs)
                # Collect the top-5 ranked candidates for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })
                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        else:
            # Positive/negatives (triplet) format: one positive and up to four negatives per query.
            for entry in dataset:
                query = entry['query']
                positive = entry.get('positive')
                # Validate the query and the positive document
                if query is None or not isinstance(query, str) or query.strip() == "":
                    continue
                if positive is None or not isinstance(positive, str) or positive.strip() == "":
                    continue
                negatives = [
                    doc for doc in [entry.get('negative1'), entry.get('negative2'), entry.get('negative3'), entry.get('negative4')]
                    if doc is not None and isinstance(doc, str) and doc.strip() != ""
                ]
                # The positive is always first, so the relevance labels stay aligned with the documents.
                candidate_texts = [positive] + negatives
                relevance_labels = [1] + [0] * len(negatives)
                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)
                # Collect the top-5 ranked candidates for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })
                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        # Metrics for this dataset
        results.append({
            "Dataset": dataset_name,
            "MRR": np.mean(all_mrr),
            "MAP": np.mean(all_map),
            "nDCG@10": np.mean(all_ndcg)
        })
        # Collect sample outputs for inspection
        sample_outputs.extend(dataset_samples)

    results_df = pd.DataFrame(results)

    # Plot results as a bar chart
    fig, ax = plt.subplots(figsize=(8, 6))
    results_df.plot(kind='bar', x='Dataset', y=['MRR', 'MAP', 'nDCG@10'], ax=ax)
    ax.set_title(f"Evaluation Results for {model_name}")
    ax.set_ylabel("Score")
    plt.xticks(rotation=0)

    return results_df, fig, sample_outputs
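
# Quick sanity check (illustrative; the model id matches the placeholder in the UI below,
# and any CrossEncoder-compatible reranker id should work):
# results_df, fig, samples = evaluate_model_with_insights("NAMAA-Space/GATE-Reranker-V1")
# print(results_df)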
# Gradio app interface
def gradio_app_with_insights(model_name):
    results_df, chart, samples = evaluate_model_with_insights(model_name)
    sample_display = []
    for sample in samples:
        sample_display.append(f"Query: {sample['Query']}")
        for doc, score, label in sample["Top 5 Candidates"]:
            sample_display.append(f"  Doc: {doc[:50]}... | Score: {score:.2f} | Relevance: {label}")
        sample_display.append("\n")
    return results_df, chart, "\n".join(sample_display)
interface = gr.Interface(
    fn=gradio_app_with_insights,
    inputs=gr.Textbox(label="Enter Model Name", placeholder="e.g., NAMAA-Space/GATE-Reranker-V1"),
    outputs=[
        gr.Dataframe(label="Evaluation Results"),
        gr.Plot(label="Evaluation Metrics Chart"),
        gr.Textbox(label="Sample Reranking Insights", lines=15)
    ],
    title="Arabic Reranking Model Evaluation and Insights",
    description=(
        "This app evaluates Arabic reranking models on two datasets:\n"
        "1. **Relevance Labels Dataset**\n"
        "2. **Positive-Negatives Dataset**\n\n"
        "### Metrics Used:\n"
        "- **MRR (Mean Reciprocal Rank)**: Measures how quickly the first relevant document appears.\n"
        "- **MAP (Mean Average Precision)**: Reflects ranking quality across all relevant documents.\n"
        "- **nDCG@10 (Normalized Discounted Cumulative Gain)**: Focuses on the ranking of relevant documents in the top-10.\n\n"
        "Input a model name to evaluate its performance, view metrics, and examine sample reranking results."
    )
)

interface.launch(debug=True)
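
# To try this locally (assuming the script is saved as app.py and your token can access
# both datasets): export HF_TOKEN in the environment, then run `python app.py`.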