import gradio as gr
import random
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
import torch
import spaces
import os
from huggingface_hub import HfApi


# Hugging Face token used to authenticate dataset downloads and Hub API calls.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("HF_TOKEN environment variable is not set. Please set it before running the script.")

hf_api = HfApi(token=HF_TOKEN)


# Use the GPU when available; the small tensor allocation confirms the active device.
device = "cuda" if torch.cuda.is_available() else "cpu"
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")


def mean_reciprocal_rank(relevance_labels, scores):
    """Reciprocal rank of the first relevant document for one query (0 if none is relevant)."""
    sorted_indices = np.argsort(scores)[::-1]
    for rank, idx in enumerate(sorted_indices, start=1):
        if relevance_labels[idx] == 1:
            return 1 / rank
    return 0
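# Example for mean_reciprocal_rank: with scores [0.2, 0.7, 0.4] and labels [0, 0, 1],
# the relevant document lands at rank 2, so the function returns 1/2.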


def mean_average_precision(relevance_labels, scores):
    """Average precision of the ranking induced by the scores for one query."""
    return average_precision_score(relevance_labels, scores)
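# Note: average_precision_score evaluates a single query's candidate list here;
# the mean over all queries (MAP proper) is taken later with np.mean.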


def ndcg_at_k(relevance_labels, scores, k=10):
    """Normalized discounted cumulative gain over the top-k ranked documents."""
    sorted_indices = np.argsort(scores)[::-1]
    relevance_sorted = np.take(relevance_labels, sorted_indices[:k])
    dcg = sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevance_sorted))
    idcg = sum(1 / np.log2(rank + 2) for rank in range(min(k, sum(relevance_labels))))
    return dcg / idcg if idcg > 0 else 0
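# Example for ndcg_at_k: with labels [0, 1, 1] and scores [0.9, 0.8, 0.1], the ranked
# relevance is [0, 1, 1], so DCG = 1/log2(3) + 1/log2(4) ≈ 1.13, IDCG = 1/log2(2) + 1/log2(3) ≈ 1.63,
# and nDCG@10 ≈ 0.69.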


# Evaluation data: the first 100 rows of each dataset's train split.
datasets = {
    "Relevance_Labels_Dataset": load_dataset("Omartificial-Intelligence-Space/re-ar-test-triplet", token=HF_TOKEN)["train"].select(range(100)),
    "Positive_Negatives_Dataset": load_dataset("Omartificial-Intelligence-Space/re-ar-query-candidate", token=HF_TOKEN)["train"].select(range(100))
}
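# The two datasets use different schemas: one lists labelled candidate documents per query,
# the other provides one positive and four negative documents. The evaluation function
# below branches on the presence of a 'candidate_document' column to handle both.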


# On ZeroGPU Spaces, the decorator below requests a GPU for up to 120 seconds per call.
@spaces.GPU(duration=120)
def evaluate_model_with_insights(model_name):
    """Score each query's candidates with the given CrossEncoder and report MRR, MAP and nDCG@10 per dataset."""
    model = CrossEncoder(model_name, device=device)
    results = []
    sample_outputs = []

    for dataset_name, dataset in datasets.items():
        all_mrr, all_map, all_ndcg = [], [], []
        dataset_samples = []

        # Datasets with a 'candidate_document' column carry explicit relevance labels.
        if 'candidate_document' in dataset.column_names:
            grouped_data = dataset.to_pandas().groupby("query")
            for query, group in grouped_data:
                candidate_texts = group['candidate_document'].tolist()
                relevance_labels = group['relevance_label'].tolist()
                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)

                # Keep the five highest-scoring candidates for the insight display.
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })

                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        else:
            # Positive/negative datasets: the positive document is the only relevant candidate.
            for entry in dataset:
                query = entry['query']
                candidate_texts = [entry['positive'], entry['negative1'], entry['negative2'], entry['negative3'], entry['negative4']]
                relevance_labels = [1, 0, 0, 0, 0]
                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)

                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })

                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))

        # Average the per-query scores to get dataset-level metrics.
        results.append({
            "Dataset": dataset_name,
            "MRR": np.mean(all_mrr),
            "MAP": np.mean(all_map),
            "nDCG@10": np.mean(all_ndcg)
        })

        sample_outputs.extend(dataset_samples)

    results_df = pd.DataFrame(results)

    # Bar chart comparing the three metrics across both datasets.
    fig, ax = plt.subplots(figsize=(8, 6))
    results_df.plot(kind='bar', x='Dataset', y=['MRR', 'MAP', 'nDCG@10'], ax=ax)
    ax.set_title(f"Evaluation Results for {model_name}")
    ax.set_ylabel("Score")
    plt.xticks(rotation=0)

    return results_df, fig, sample_outputs
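# Illustrative call (the model id is just the placeholder suggested in the UI below):
#   results_df, fig, samples = evaluate_model_with_insights("NAMAA-Space/GATE-Reranker-V1")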


def gradio_app_with_insights(model_name):
    """Gradio wrapper: runs the evaluation and formats the sample rankings as plain text."""
    results_df, chart, samples = evaluate_model_with_insights(model_name)
    sample_display = []
    for sample in samples:
        sample_display.append(f"Query: {sample['Query']}")
        for doc, score, label in sample["Top 5 Candidates"]:
            sample_display.append(f" Doc: {doc[:50]}... | Score: {score:.2f} | Relevance: {label}")
        sample_display.append("\n")
    return results_df, chart, "\n".join(sample_display)


interface = gr.Interface(
    fn=gradio_app_with_insights,
    inputs=gr.Textbox(label="Enter Model Name", placeholder="e.g., NAMAA-Space/GATE-Reranker-V1"),
    outputs=[
        gr.Dataframe(label="Evaluation Results"),
        gr.Plot(label="Evaluation Metrics Chart"),
        gr.Textbox(label="Sample Reranking Insights", lines=15)
    ],
    title="Arabic Reranking Model Evaluation and Insights",
    description=(
        "This app evaluates Arabic reranking models on two datasets:\n"
        "1. **Relevance Labels Dataset**\n"
        "2. **Positive-Negatives Dataset**\n\n"
        "### Metrics Used:\n"
        "- **MRR (Mean Reciprocal Rank)**: Measures how quickly the first relevant document appears.\n"
        "- **MAP (Mean Average Precision)**: Reflects ranking quality across all relevant documents.\n"
        "- **nDCG@10 (Normalized Discounted Cumulative Gain)**: Focuses on the ranking of relevant documents in the top-10.\n\n"
        "Input a model name to evaluate its performance, view metrics, and examine sample reranking results."
    )
)

interface.launch(debug=True)