import gradio as gr
import random
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
import torch
import spaces
import os
from huggingface_hub import HfApi
# Load the Hugging Face token from the environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("HF_TOKEN environment variable is not set. Please set it before running the script.")

hf_api = HfApi(
    token=HF_TOKEN,  # Token is not persisted on the machine.
)
# Check for GPU support and configure appropriately
device = "cuda" if torch.cuda.is_available() else "cpu"
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")
# Define evaluation metrics
def mean_reciprocal_rank(relevance_labels, scores):
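    """Reciprocal rank of the first relevant document in the score-sorted list.

    Illustrative example: relevance [0, 1, 0] with scores [0.2, 0.9, 0.1]
    ranks the relevant document first, so the value is 1.0.
    """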
    sorted_indices = np.argsort(scores)[::-1]
    for rank, idx in enumerate(sorted_indices, start=1):
        if relevance_labels[idx] == 1:
            return 1 / rank
    return 0
def mean_average_precision(relevance_labels, scores):
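    """Average precision (AP) for a single query; the mean over queries gives MAP."""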
    return average_precision_score(relevance_labels, scores)
def ndcg_at_k(relevance_labels, scores, k=10):
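    """Normalized DCG over the top-k scored documents (binary relevance).

    Illustrative example: relevance [1, 0, 1] with scores [0.9, 0.8, 0.7]
    gives DCG = 1/log2(2) + 1/log2(4) = 1.5 and IDCG = 1/log2(2) + 1/log2(3)
    ≈ 1.631, so nDCG ≈ 0.92.
    """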
    sorted_indices = np.argsort(scores)[::-1]
    relevance_sorted = np.take(relevance_labels, sorted_indices[:k])
    dcg = sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevance_sorted))
    idcg = sum(1 / np.log2(rank + 2) for rank in range(min(k, sum(relevance_labels))))
    return dcg / idcg if idcg > 0 else 0
# Load datasets
datasets = {
    "Relevance_Labels_Dataset": load_dataset("Omartificial-Intelligence-Space/re-ar-query-candidate", token=HF_TOKEN)["train"].select(range(300)),
    "Positive_Negatives_Dataset": load_dataset("Omartificial-Intelligence-Space/re-ar-test-triplet", token=HF_TOKEN)["train"].select(range(300)),
}
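# Only the first 300 rows of each split are evaluated, presumably to keep runs
# short within the ZeroGPU time budget; the token is passed in case the
# datasets are gated or private.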
@spaces.GPU(duration=120)
def evaluate_model_with_insights(model_name):
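    """Evaluate a CrossEncoder reranker on both datasets.

    Returns a DataFrame of per-dataset MRR / MAP / nDCG@10 scores, a bar chart
    of those scores, and a list of per-query samples showing the top-5 ranked
    candidates for inspection.
    """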
    model = CrossEncoder(model_name, device=device)
    results = []
    sample_outputs = []

    for dataset_name, dataset in datasets.items():
        all_mrr, all_map, all_ndcg = [], [], []
        dataset_samples = []

        if 'candidate_document' in dataset.column_names:
            grouped_data = dataset.to_pandas().groupby("query")
            for query, group in grouped_data:
                # Skip invalid queries
                if query is None or not isinstance(query, str) or query.strip() == "":
                    continue
                candidate_texts = group['candidate_document'].dropna().tolist()
                relevance_labels = group['relevance_label'].tolist()
                # Skip groups with no candidates, or where dropna removed documents
                # and left the texts and labels misaligned
                if not candidate_texts or len(candidate_texts) != len(relevance_labels):
                    continue
                # Score every (query, document) pair; texts, labels, and scores stay index-aligned
                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)
                # Collect the top-5 results for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })
                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        else:
            for entry in dataset:
                query = entry['query']
                # Validate the query
                if query is None or not isinstance(query, str) or query.strip() == "":
                    continue
                # The positive document must be present, since label 1 is assigned to position 0
                positive = entry.get('positive')
                if not isinstance(positive, str) or positive.strip() == "":
                    continue
                negatives = [
                    doc for doc in [entry.get('negative1'), entry.get('negative2'), entry.get('negative3'), entry.get('negative4')]
                    if isinstance(doc, str) and doc.strip() != ""
                ]
                candidate_texts = [positive] + negatives
                relevance_labels = [1] + [0] * len(negatives)
                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)
                # Collect the top-5 results for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })
                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        # Aggregate metrics for this dataset
        results.append({
            "Dataset": dataset_name,
            "MRR": np.mean(all_mrr),
            "MAP": np.mean(all_map),
            "nDCG@10": np.mean(all_ndcg)
        })
        # Collect sample outputs for inspection
        sample_outputs.extend(dataset_samples)

    results_df = pd.DataFrame(results)

    # Plot results as a bar chart
    fig, ax = plt.subplots(figsize=(8, 6))
    results_df.plot(kind='bar', x='Dataset', y=['MRR', 'MAP', 'nDCG@10'], ax=ax)
    ax.set_title(f"Evaluation Results for {model_name}")
    ax.set_ylabel("Score")
    plt.xticks(rotation=0)

    return results_df, fig, sample_outputs
# Gradio app interface
def gradio_app_with_insights(model_name):
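    """Run the evaluation and format the per-query samples as plain text for Gradio."""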
    results_df, chart, samples = evaluate_model_with_insights(model_name)
    sample_display = []
    for sample in samples:
        sample_display.append(f"Query: {sample['Query']}")
        for doc, score, label in sample["Top 5 Candidates"]:
            sample_display.append(f"  Doc: {doc[:50]}... | Score: {score:.2f} | Relevance: {label}")
        sample_display.append("\n")
    return results_df, chart, "\n".join(sample_display)
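# Local sanity check (illustrative; any CrossEncoder-compatible reranker works,
# e.g. the placeholder model shown in the UI below):
# df, fig, text = gradio_app_with_insights("NAMAA-Space/GATE-Reranker-V1")
# print(df)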
interface = gr.Interface(
    fn=gradio_app_with_insights,
    inputs=gr.Textbox(label="Enter Model Name", placeholder="e.g., NAMAA-Space/GATE-Reranker-V1"),
    outputs=[
        gr.Dataframe(label="Evaluation Results"),
        gr.Plot(label="Evaluation Metrics Chart"),
        gr.Textbox(label="Sample Reranking Insights", lines=15)
    ],
    title="Arabic Reranking Model Evaluation and Insights",
    description=(
        "This app evaluates Arabic reranking models on two datasets:\n"
        "1. **Relevance Labels Dataset**\n"
        "2. **Positive-Negatives Dataset**\n\n"
        "### Metrics Used:\n"
        "- **MRR (Mean Reciprocal Rank)**: Measures how quickly the first relevant document appears.\n"
        "- **MAP (Mean Average Precision)**: Reflects ranking quality across all relevant documents.\n"
        "- **nDCG@10 (Normalized Discounted Cumulative Gain)**: Focuses on the ranking of relevant documents in the top-10.\n\n"
        "Input a model name to evaluate its performance, view metrics, and examine sample reranking results."
    )
)
interface.launch(debug=True)