import gradio as gr
import random
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
import torch
import spaces
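# `spaces` (Hugging Face) provides the @spaces.GPU decorator used below to request GPU time on ZeroGPU Spaces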
# Check for GPU support and configure appropriately
device = "cuda" if torch.cuda.is_available() else "cpu"
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")
# Define evaluation metrics
def mean_reciprocal_rank(relevance_labels, scores):
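    """Reciprocal rank of the first relevant document for a single query.

    Candidates are ranked by descending score; the per-query values are
    averaged into MRR by the caller.
    """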
    sorted_indices = np.argsort(scores)[::-1]
    for rank, idx in enumerate(sorted_indices, start=1):
        if relevance_labels[idx] == 1:
            return 1 / rank
    return 0
def mean_average_precision(relevance_labels, scores):
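    """Average precision for a single query (via scikit-learn).

    The per-query values are averaged into MAP by the caller.
    """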
    return average_precision_score(relevance_labels, scores)
def ndcg_at_k(relevance_labels, scores, k=10):
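    """Normalized DCG at rank k for binary relevance labels.

    DCG sums rel_i / log2(i + 1) over the top-k ranked documents (i starting
    at 1); IDCG is the same sum with all relevant documents ranked first.
    """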
    sorted_indices = np.argsort(scores)[::-1]
    relevance_sorted = np.take(relevance_labels, sorted_indices[:k])
    dcg = sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevance_sorted))
    idcg = sum(1 / np.log2(rank + 2) for rank in range(min(k, sum(relevance_labels))))
    return dcg / idcg if idcg > 0 else 0
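# Worked example (illustrative only, not part of the evaluation): with
# relevance_labels = [0, 1, 0] and scores = [0.1, 0.9, 0.2], the relevant
# document ranks first, so mean_reciprocal_rank, mean_average_precision and
# ndcg_at_k all return 1.0 for that query.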
# Load datasets
datasets = {
"Relevance_Labels_Dataset": load_dataset("NAMAA-Space/Ar-Reranking-Eval")["train"],
"Positive_Negatives_Dataset": load_dataset("NAMAA-Space/Arabic-Reranking-Triplet-5-Eval")["train"]
}
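# Expected schemas, as accessed in evaluate_model_with_insights below: the
# relevance-labels dataset exposes "query", "candidate_document" and
# "relevance_label"; the triplet dataset exposes "query", "positive" and
# "negative1".."negative4".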
@spaces.GPU
def evaluate_model_with_insights(model_name):
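    """Score every query/candidate pair with the given cross-encoder and
    aggregate MRR, MAP and nDCG@10 per dataset.

    Returns a results DataFrame, a matplotlib bar chart of the metrics, and
    the top-5 reranked candidates per query for inspection.
    """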
    model = CrossEncoder(model_name, device=device)
    results = []
    sample_outputs = []
    for dataset_name, dataset in datasets.items():
        all_mrr, all_map, all_ndcg = [], [], []
        dataset_samples = []
        if 'candidate_document' in dataset.column_names:
            grouped_data = dataset.to_pandas().groupby("query")
            for query, group in grouped_data:
                candidate_texts = group['candidate_document'].tolist()
                relevance_labels = group['relevance_label'].tolist()
                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)
                # Collecting top-5 results for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })
                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        else:
            for entry in dataset:
                query = entry['query']
                candidate_texts = [entry['positive'], entry['negative1'], entry['negative2'], entry['negative3'], entry['negative4']]
                relevance_labels = [1, 0, 0, 0, 0]
                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)
                # Collecting top-5 results for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })
                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        # Metrics for this dataset
        results.append({
            "Dataset": dataset_name,
            "MRR": np.mean(all_mrr),
            "MAP": np.mean(all_map),
            "nDCG@10": np.mean(all_ndcg)
        })
        # Collect sample outputs for inspection
        sample_outputs.extend(dataset_samples)
    results_df = pd.DataFrame(results)
    # Plot results as a bar chart
    fig, ax = plt.subplots(figsize=(8, 6))
    results_df.plot(kind='bar', x='Dataset', y=['MRR', 'MAP', 'nDCG@10'], ax=ax)
    ax.set_title(f"Evaluation Results for {model_name}")
    ax.set_ylabel("Score")
    plt.xticks(rotation=0)
    return results_df, fig, sample_outputs
# Gradio app interface
def gradio_app_with_insights(model_name):
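    """Run the evaluation and format the per-query top-5 candidates as text
    for the Gradio Textbox output.
    """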
    results_df, chart, samples = evaluate_model_with_insights(model_name)
    sample_display = []
    for sample in samples:
        sample_display.append(f"Query: {sample['Query']}")
        for doc, score, label in sample["Top 5 Candidates"]:
            sample_display.append(f"  Doc: {doc[:50]}... | Score: {score:.2f} | Relevance: {label}")
        sample_display.append("\n")
    return results_df, chart, "\n".join(sample_display)
interface = gr.Interface(
    fn=gradio_app_with_insights,
    inputs=gr.Textbox(label="Enter Model Name", placeholder="e.g., NAMAA-Space/GATE-Reranker-V1"),
    outputs=[
        gr.Dataframe(label="Evaluation Results"),
        gr.Plot(label="Evaluation Metrics Chart"),
        gr.Textbox(label="Sample Reranking Insights", lines=15)
    ],
    title="Arabic Reranking Model Evaluation and Insights",
    description=(
        "This app evaluates Arabic reranking models on two datasets:\n"
        "1. **Relevance Labels Dataset**\n"
        "2. **Positive-Negatives Dataset**\n\n"
        "### Metrics Used:\n"
        "- **MRR (Mean Reciprocal Rank)**: Measures how quickly the first relevant document appears.\n"
        "- **MAP (Mean Average Precision)**: Reflects ranking quality across all relevant documents.\n"
        "- **nDCG@10 (Normalized Discounted Cumulative Gain)**: Focuses on the ranking of relevant documents in the top-10.\n\n"
        "Input a model name to evaluate its performance, view metrics, and examine sample reranking results."
    )
)
interface.launch(debug=True)