import gradio as gr
import random
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
import torch
import spaces
import os
from huggingface_hub import HfApi

# Load the Hugging Face token from the environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("HF_TOKEN environment variable is not set. Please set it before running the script.")

hf_api = HfApi(
    token=HF_TOKEN,  # Token is not persisted on the machine.
)

# Check for GPU support and configure appropriately
device = "cuda" if torch.cuda.is_available() else "cpu"
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")


# Define evaluation metrics
def mean_reciprocal_rank(relevance_labels, scores):
    """Reciprocal rank of the first relevant document in the score-sorted list."""
    sorted_indices = np.argsort(scores)[::-1]
    for rank, idx in enumerate(sorted_indices, start=1):
        if relevance_labels[idx] == 1:
            return 1 / rank
    return 0


def mean_average_precision(relevance_labels, scores):
    """Average precision for a single query; 0 if the query has no relevant document."""
    if sum(relevance_labels) == 0:
        return 0.0
    return average_precision_score(relevance_labels, scores)


def ndcg_at_k(relevance_labels, scores, k=10):
    """Normalized discounted cumulative gain over the top-k ranked documents."""
    sorted_indices = np.argsort(scores)[::-1]
    relevance_sorted = np.take(relevance_labels, sorted_indices[:k])
    dcg = sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevance_sorted))
    idcg = sum(1 / np.log2(rank + 2) for rank in range(min(k, sum(relevance_labels))))
    return dcg / idcg if idcg > 0 else 0


# Load datasets (first 100 rows of each)
datasets = {
    "Relevance_Labels_Dataset": load_dataset(
        "Omartificial-Intelligence-Space/re-ar-test-triplet", token=HF_TOKEN
    )["train"].select(range(100)),
    "Positive_Negatives_Dataset": load_dataset(
        "Omartificial-Intelligence-Space/re-ar-query-candidate", token=HF_TOKEN
    )["train"].select(range(100)),
}


@spaces.GPU(duration=120)
def evaluate_model_with_insights(model_name):
    model = CrossEncoder(model_name, device=device)
    results = []
    sample_outputs = []

    for dataset_name, dataset in datasets.items():
        all_mrr, all_map, all_ndcg = [], [], []
        dataset_samples = []

        if 'candidate_document' in dataset.column_names:
            # Relevance-labels format: one row per (query, candidate_document, relevance_label)
            grouped_data = dataset.to_pandas().groupby("query")
            for query, group in grouped_data:
                # Skip invalid queries
                if query is None or not isinstance(query, str) or query.strip() == "":
                    continue

                # Filter documents and labels together so texts, labels and scores stay aligned
                candidates = [
                    (doc, label)
                    for doc, label in zip(group['candidate_document'], group['relevance_label'])
                    if isinstance(doc, str) and doc.strip() != ""
                ]
                if not candidates:
                    continue
                candidate_texts = [doc for doc, _ in candidates]
                relevance_labels = [label for _, label in candidates]

                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)

                # Collect top-5 results for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })

                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        else:
            # Positive-negatives format: one row per query with one positive and four negatives
            for entry in dataset:
                query = entry['query']

                # Validate the query
                if query is None or not isinstance(query, str) or query.strip() == "":
                    continue

                # The positive must be present, since it carries the single relevance label of 1
                positive = entry.get('positive')
                if not isinstance(positive, str) or positive.strip() == "":
                    continue

                negatives = [
                    doc for doc in [entry.get('negative1'), entry.get('negative2'),
                                    entry.get('negative3'), entry.get('negative4')]
                    if isinstance(doc, str) and doc.strip() != ""
                ]
                # Skip entries with no usable negatives; they would score trivially perfect
                if not negatives:
                    continue

                candidate_texts = [positive] + negatives
                relevance_labels = [1] + [0] * len(negatives)

                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)

                # Collect top-5 results for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })

                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))

        # Aggregate metrics for this dataset
        results.append({
            "Dataset": dataset_name,
            "MRR": np.mean(all_mrr),
            "MAP": np.mean(all_map),
            "nDCG@10": np.mean(all_ndcg)
        })

        # Collect sample outputs for inspection
        sample_outputs.extend(dataset_samples)

    results_df = pd.DataFrame(results)

    # Plot results as a bar chart
    fig, ax = plt.subplots(figsize=(8, 6))
    results_df.plot(kind='bar', x='Dataset', y=['MRR', 'MAP', 'nDCG@10'], ax=ax)
    ax.set_title(f"Evaluation Results for {model_name}")
    ax.set_ylabel("Score")
    plt.xticks(rotation=0)

    return results_df, fig, sample_outputs


# Gradio app interface
def gradio_app_with_insights(model_name):
    results_df, chart, samples = evaluate_model_with_insights(model_name)
    sample_display = []
    for sample in samples:
        sample_display.append(f"Query: {sample['Query']}")
        for doc, score, label in sample["Top 5 Candidates"]:
            sample_display.append(f"  Doc: {doc[:50]}... | Score: {score:.2f} | Relevance: {label}")
        sample_display.append("")
    return results_df, chart, "\n".join(sample_display)


interface = gr.Interface(
    fn=gradio_app_with_insights,
    inputs=gr.Textbox(label="Enter Model Name", placeholder="e.g., NAMAA-Space/GATE-Reranker-V1"),
    outputs=[
        gr.Dataframe(label="Evaluation Results"),
        gr.Plot(label="Evaluation Metrics Chart"),
        gr.Textbox(label="Sample Reranking Insights", lines=15)
    ],
    title="Arabic Reranking Model Evaluation and Insights",
    description=(
        "This app evaluates Arabic reranking models on two datasets:\n"
        "1. **Relevance Labels Dataset**\n"
        "2. **Positive-Negatives Dataset**\n\n"
        "### Metrics Used:\n"
        "- **MRR (Mean Reciprocal Rank)**: Measures how quickly the first relevant document appears.\n"
        "- **MAP (Mean Average Precision)**: Reflects ranking quality across all relevant documents.\n"
        "- **nDCG@10 (Normalized Discounted Cumulative Gain)**: Focuses on the ranking of relevant documents in the top-10.\n\n"
        "Input a model name to evaluate its performance, view metrics, and examine sample reranking results."
    )
)

interface.launch(debug=True)
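
# A minimal sketch (kept commented out) of running the evaluation directly, without the
# Gradio UI, assuming HF_TOKEN is set and the two datasets above load successfully. The
# model name is only the placeholder example from the textbox, not a required choice,
# and the output filename is hypothetical.
#
#   results_df, fig, samples = evaluate_model_with_insights("NAMAA-Space/GATE-Reranker-V1")
#   print(results_df)
#   fig.savefig("evaluation_metrics.png")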