from pathlib import Path

from utils import load_json_results
from leaderboard_tab import search_leaderboard, update_columns_to_show, create_leaderboard_tab

# Constants
RETRIEVAL_ABOUT_SECTION = """
## About Retrieval Evaluation

The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:

### Web Search Dataset Metrics

- **MRR (Mean Reciprocal Rank)**: Measures ranking quality by the position of the first relevant result
- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates ranking quality across all relevant results
- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
- **Overall Score**: The average of MRR, nDCG, and Recall@5 (see the example below)
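
A quick worked example of the averaging (the metric values here are made up for illustration):

```python
# Hypothetical metric values for one model on the Web Search dataset
mrr, ndcg, recall_at_5 = 0.72, 0.68, 0.81
overall_score = (mrr + ndcg + recall_at_5) / 3  # = 0.7366...
```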
### Model Requirements

- Must support Arabic text embeddings
- Should handle queries of at least 512 tokens
- Must work with the `sentence-transformers` library (a quick compatibility check follows)
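
A minimal sketch of such a check, assuming a placeholder model ID (`your-org/your-arabic-embedder` is hypothetical):

```python
from sentence_transformers import SentenceTransformer

# Placeholder model ID, replace with your own public HuggingFace Hub model
model = SentenceTransformer("your-org/your-arabic-embedder")

print(model.max_seq_length)  # should be >= 512 to satisfy the query-length requirement
embeddings = model.encode(["نص تجريبي للاسترجاع", "مثال آخر"], batch_size=32)
print(embeddings.shape)      # (2, embedding_dim): fixed-dimension vectors
```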
### Evaluation Process

1. Models process Arabic web search queries
2. Retrieved documents are evaluated using:
   - MRR for the position of the first relevant result
   - nDCG for overall ranking quality
   - Recall@5 for top-results accuracy
3. The three metrics are averaged to calculate the overall score
4. Models are ranked by their overall score (the sketch below walks through these computations)
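
A sketch of the per-query metric computations, assuming binary relevance judgments (the function names and document IDs below are illustrative, not the leaderboard's actual code):

```python
import math

def mrr(ranked_ids, relevant_ids):
    # Reciprocal rank of the first relevant document (0 if none is retrieved)
    for rank, doc_id in enumerate(ranked_ids, start=1):
        if doc_id in relevant_ids:
            return 1.0 / rank
    return 0.0

def recall_at_k(ranked_ids, relevant_ids, k=5):
    # Fraction of relevant documents appearing in the top-k results
    return len(set(ranked_ids[:k]) & relevant_ids) / len(relevant_ids)

def ndcg(ranked_ids, relevant_ids):
    # Binary-relevance nDCG: DCG of the ranking divided by the ideal DCG
    dcg = sum(1.0 / math.log2(rank + 1)
              for rank, doc_id in enumerate(ranked_ids, start=1)
              if doc_id in relevant_ids)
    ideal = sum(1.0 / math.log2(rank + 1)
                for rank in range(1, min(len(relevant_ids), len(ranked_ids)) + 1))
    return dcg / ideal if ideal else 0.0

ranked = ["d3", "d1", "d7", "d2", "d9"]  # hypothetical ranked retrieval output
relevant = {"d1", "d2"}                  # hypothetical gold relevance labels
scores = [mrr(ranked, relevant), ndcg(ranked, relevant), recall_at_k(ranked, relevant)]
overall = sum(scores) / len(scores)      # averaged into the overall score
```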
### How to Prepare Your Model

- Ensure your model is publicly available on the HuggingFace Hub (we don't support private model evaluations yet)
- The model should output fixed-dimension embeddings for text
- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`)
""" | |
# Global variables
retrieval_df = None

def load_retrieval_results(prepare_for_display=False, sort_col=None, drop_cols=None):
    """Load the retrieval results JSON into a dataframe via the shared loader"""
    dataframe_path = Path(__file__).parent / "results" / "retrieval_results.json"
    return load_json_results(
        dataframe_path,
        prepare_for_display=prepare_for_display,
        sort_col=sort_col,
        drop_cols=drop_cols,
    )
def load_retrieval_leaderboard():
    """Load and prepare the retrieval leaderboard data"""
    global retrieval_df
    # Prepare the retrieval dataframe, sorted by average score
    retrieval_df = load_retrieval_results(
        prepare_for_display=True,
        sort_col="Average Score",
        drop_cols=["Revision", "Precision", "Task"],
    )
    retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))
    return retrieval_df
def retrieval_search_leaderboard(model_name, columns_to_show):
    """Search function for the retrieval leaderboard"""
    return search_leaderboard(retrieval_df, model_name, columns_to_show)

def update_retrieval_columns_to_show(columns_to_show):
    """Update the displayed columns for the retrieval leaderboard"""
    return update_columns_to_show(retrieval_df, columns_to_show)
def create_retrieval_tab():
    """Create the complete retrieval leaderboard tab"""
    global retrieval_df

    # Load data if not already loaded
    if retrieval_df is None:
        retrieval_df = load_retrieval_leaderboard()

    # Define the default columns to show
    default_columns = [
        "Rank", "Model", "Average Score", "Model Size (MB)", "Context Length",
        "Embedding Dimension", "Web Search Dataset", "Islamic Knowledge Dataset",
    ]

    # Create and return the tab
    return create_leaderboard_tab(
        df=retrieval_df,
        initial_columns_to_show=default_columns,
        search_function=retrieval_search_leaderboard,
        update_function=update_retrieval_columns_to_show,
        about_section=RETRIEVAL_ABOUT_SECTION,
        task_type="Retriever",
    )
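
# A minimal usage sketch, and an assumption rather than part of the original
# module: create_leaderboard_tab() presumably builds Gradio components, so the
# tab would be mounted inside a gr.Blocks() context by the surrounding app.
if __name__ == "__main__":
    import gradio as gr

    with gr.Blocks() as demo:
        create_retrieval_tab()
    demo.launch()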