import gradio as gr
from utils import submit_gradio_module, load_retrieval_results, load_reranking_results
from fuzzywuzzy import fuzz
HEADER = """<div style="text-align: center; margin-bottom: 20px;">
<h1>The Arabic RAG Leaderboard</h1>
<p style="font-size: 16px; color: #888;">The only leaderboard you will require for your RAG needs</p>
</div>
This leaderboard presents the first comprehensive benchmark for Arabic RAG systems, evaluating both retrieval and re-ranking components. Our framework combines real-world queries with synthetic contexts in a dynamic evaluation cycle, ensuring fair and robust assessment of Arabic information retrieval systems.
<br>
<br>
For technical details, check our blog post <a href="https://huggingface.co/blog/Navid-AI/arabic-rag-leaderboard">here</a>.
"""
RETRIEVAL_ABOUT_SECTION = """
## About Retrieval Evaluation
The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:
### Web Search Dataset Metrics
- **MRR (Mean Reciprocal Rank)**: Measures the ranking quality by focusing on the position of the first relevant result
- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates the ranking quality considering all relevant results
- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
- **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5
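To make these metrics concrete, here is a small illustrative sketch for a single toy query (this is not the leaderboard's evaluation code; the document ids and relevance judgments are made up):

```python
import math

# Toy ranking returned by a retriever for one query, plus the truly relevant docs.
ranked = ["d3", "d7", "d1", "d9", "d4"]
relevant = {"d1", "d4"}

# MRR: reciprocal rank of the first relevant document.
mrr = next((1 / (i + 1) for i, d in enumerate(ranked) if d in relevant), 0.0)

# nDCG: discounted gain of the relevant documents, normalized by the ideal ranking.
dcg = sum(1 / math.log2(i + 2) for i, d in enumerate(ranked) if d in relevant)
idcg = sum(1 / math.log2(i + 2) for i in range(min(len(relevant), len(ranked))))
ndcg = dcg / idcg

# Recall@5: fraction of relevant documents that appear in the top 5 results.
recall_at_5 = len(set(ranked[:5]) & relevant) / len(relevant)

# Overall Score: average of the three metrics.
overall = (mrr + ndcg + recall_at_5) / 3
```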
### Model Requirements
- Must support Arabic text embeddings
- Should handle queries of at least 512 tokens
- Must work with `sentence-transformers` library
### Evaluation Process
1. Models process Arabic web search queries
2. Retrieved documents are evaluated using:
- MRR for first relevant result positioning
- nDCG for overall ranking quality
- Recall@5 for top results accuracy
3. Metrics are averaged to calculate the overall score
4. Models are ranked based on their overall performance
### How to Prepare Your Model
- Ensure your model is publicly available on the HuggingFace Hub (we don't support private model evaluations yet)
- Model should output fixed-dimension embeddings for text
- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`); see the example below
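As a quick sanity check, a retriever that meets the requirements above can typically be loaded and queried like this (the model id and texts below are placeholders, not part of the benchmark):

```python
from sentence_transformers import SentenceTransformer, util

# Placeholder model id; substitute your own public HuggingFace Hub checkpoint.
model = SentenceTransformer("your-username/your-arabic-embedding-model")

query = "ما هي عاصمة مصر؟"
documents = [
    "القاهرة هي عاصمة جمهورية مصر العربية.",
    "الرياض هي عاصمة المملكة العربية السعودية.",
]

# Fixed-dimension embeddings; encoding is batched by default.
query_emb = model.encode(query, convert_to_tensor=True)
doc_embs = model.encode(documents, convert_to_tensor=True, batch_size=32)

# Cosine similarity is used here only for illustration.
scores = util.cos_sim(query_emb, doc_embs)
```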
"""
RERANKER_ABOUT_SECTION = """
## About Reranking Evaluation
The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance.
### Evaluation Metrics
- **MRR@10 (Mean Reciprocal Rank at 10)**: Measures ranking quality by focusing on the position of the first relevant result within the top 10
- **nDCG@10 (Normalized Discounted Cumulative Gain at 10)**: Evaluates the ranking quality of all relevant results within the top 10
- **MAP (Mean Average Precision)**: Measures the overall precision across all relevant documents
All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance.
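MRR@10 and nDCG@10 follow the same definitions as in the retrieval section, restricted to the top 10 reranked results. MAP can be illustrated for a single toy query as follows (illustrative only, not the benchmark's evaluation code):

```python
# Toy reranked order and relevance judgments for one query.
ranked = ["d2", "d5", "d1", "d8", "d4"]
relevant = {"d5", "d4"}

# Average precision (AP): precision computed at each relevant hit, averaged
# over all relevant documents. MAP is the mean of AP across queries.
hits, precisions = 0, []
for i, doc in enumerate(ranked, start=1):
    if doc in relevant:
        hits += 1
        precisions.append(hits / i)
ap = sum(precisions) / len(relevant)  # 0.45 for this toy example
```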
### Model Requirements
- Must accept query-document pairs as input
- Should output a relevance score for each query-document pair (e.g., via cross-attention or a similar query-document matching mechanism)
- Support for Arabic text processing
### Evaluation Process
1. Models are tested on multiple unseen Arabic datasets
2. For each dataset:
- Initial candidate documents are provided
- Model reranks the candidates
- MRR@10, NDCG@10, and MAP are calculated
3. Final scores are averaged across all datasets
4. Models are ranked based on overall performance
### How to Prepare Your Model
- Model should be public on HuggingFace Hub (private models are not supported yet)
- Make sure it works correctly with the `sentence-transformers` library; see the example below
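For instance, a typical cross-encoder style reranker can be exercised like this (the model id and texts are placeholders; the benchmark's exact loading code may differ):

```python
from sentence_transformers import CrossEncoder

# Placeholder model id; substitute your own public HuggingFace Hub checkpoint.
reranker = CrossEncoder("your-username/your-arabic-reranker")

query = "ما هي عاصمة مصر؟"
candidates = [
    "الرياض هي عاصمة المملكة العربية السعودية.",
    "القاهرة هي عاصمة جمهورية مصر العربية.",
]

# Score each (query, document) pair; higher means more relevant.
scores = reranker.predict([(query, doc) for doc in candidates])

# Reorder the candidates by descending score.
reranked = [doc for _, doc in sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)]
```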
"""
CITATION_BUTTON_LABEL = """
Copy the following snippet to cite these results
"""
CITATION_BUTTON_TEXT = """
@misc{TARL,
    author = {Mohaned A. Rashad and Hamza Shahid},
    title = {The Arabic RAG Leaderboard},
    year = {2025},
    publisher = {Navid-AI},
    howpublished = "\\url{https://huggingface.co/spaces/Navid-AI/The-Arabic-Rag-Leaderboard}"
}
"""
retrieval_df = None
reranking_df = None
def search_leaderboard(df, model_name, columns_to_show, threshold=95):
    if not model_name.strip():
        return df.loc[:, columns_to_show]

    search_name = model_name.lower()  # compute once for efficiency

    def calculate_similarity(row):
        return fuzz.partial_ratio(search_name, row["Model"].lower())

    filtered_df = df.copy()
    filtered_df["similarity"] = filtered_df.apply(calculate_similarity, axis=1)
    filtered_df = filtered_df[filtered_df["similarity"] >= threshold].sort_values('similarity', ascending=False)
    filtered_df = filtered_df.drop('similarity', axis=1).loc[:, columns_to_show]
    return filtered_df
def retrieval_search_leaderboard(model_name, columns_to_show):
    return search_leaderboard(retrieval_df, model_name, columns_to_show)

def reranking_search_leaderboard(model_name, columns_to_show):
    return search_leaderboard(reranking_df, model_name, columns_to_show)
def update_retrieval_columns_to_show(columns_to_show):
    global retrieval_df
    dummy_df = retrieval_df.loc[:, [col for col in retrieval_df.columns if col in columns_to_show]]
    columns_widths = []
    for col in dummy_df.columns:
        if col == "Rank":
            columns_widths.append(80)
        elif col == "Model":
            columns_widths.append(400)
        else:
            columns_widths.append(150)
    return gr.update(value=dummy_df, column_widths=columns_widths)
def update_reranker_columns_to_show(columns_to_show):
    global reranking_df
    dummy_df = reranking_df.loc[:, [col for col in reranking_df.columns if col in columns_to_show]]
    columns_widths = []
    for col in dummy_df.columns:
        if col == "Rank":
            columns_widths.append(80)
        elif col == "Model":
            columns_widths.append(400)
        else:
            columns_widths.append(150)
    return gr.update(value=dummy_df, column_widths=columns_widths)
def main():
    global retrieval_df, reranking_df

    # Prepare retrieval dataframe
    retrieval_df = load_retrieval_results(True, "Web Search Dataset (Overall Score)", ["Revision", "Precision", "Task"])
    retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))
    retrieval_df = retrieval_df[['Rank', 'Model', 'Web Search Dataset (Overall Score)', 'Model Size (MB)', 'Embedding Dimension', 'Max Tokens', 'Num Likes', 'Downloads Last Month', 'Web Search Dataset (MRR)', 'Web Search Dataset (nDCG@k=None)', 'Web Search Dataset (Recall@5)', 'License']]
    retrieval_columns_to_show = ["Rank", "Model", "Web Search Dataset (Overall Score)", "Model Size (MB)", "Embedding Dimension", "Max Tokens", "Num Likes"]
    retrieval_columns_widths = [80, 400, 150, 150, 150, 150, 150]
    retrieval_cols = retrieval_df.columns.tolist()  # cache columns

    # Prepare reranking dataframe
    reranking_df = load_reranking_results(True, sort_col="Overall Score", drop_cols=["Revision", "Precision", "Task"])
    reranking_df.insert(0, "Rank", range(1, 1 + len(reranking_df)))
    reranking_df.rename(columns={"nDCG": "nDCG@10", "MRR": "MRR@10"}, inplace=True)
    reranking_columns_to_show = ["Rank", "Model", "Overall Score", "Model Parameters (in Millions)", "Embedding Dimensions", "Downloads Last Month", "MRR@10", "nDCG@10", "MAP"]
    reranking_columns_widths = [80, 400, 150, 150, 150, 150, 150, 150, 150]
    reranking_cols = reranking_df.columns.tolist()  # cache columns

    with gr.Blocks() as demo:
        gr.HTML(HEADER)
        with gr.Tabs():
            with gr.Tab("Retrieval"):
                with gr.Tabs():
                    with gr.Tab("Leaderboard"):
                        with gr.Row():
                            search_box_retrieval = gr.Textbox(
                                placeholder="Search for models...",
                                label="Search",
                                scale=5
                            )
                            retrieval_columns_to_show_input = gr.CheckboxGroup(
                                label="Columns to Show",
                                choices=retrieval_cols,  # use cached list
                                value=retrieval_columns_to_show,
                                scale=4
                            )
                        retrieval_leaderboard = gr.Dataframe(
                            value=retrieval_df.loc[:, retrieval_columns_to_show],
                            datatype="markdown",
                            wrap=False,
                            show_fullscreen_button=True,
                            interactive=False,
                            column_widths=retrieval_columns_widths
                        )

                        # Wire the search box and column selector to update the leaderboard
                        search_box_retrieval.input(
                            retrieval_search_leaderboard,
                            inputs=[search_box_retrieval, retrieval_columns_to_show_input],
                            outputs=retrieval_leaderboard
                        )
                        retrieval_columns_to_show_input.select(
                            update_retrieval_columns_to_show,
                            inputs=retrieval_columns_to_show_input,
                            outputs=retrieval_leaderboard
                        )
                    with gr.Tab("Submit Retriever"):
                        submit_gradio_module("Retriever")
                    with gr.Tab("About"):
                        gr.Markdown(RETRIEVAL_ABOUT_SECTION)
            with gr.Tab("Reranking"):
                with gr.Tabs():
                    with gr.Tab("Leaderboard"):
                        with gr.Row():
                            search_box_reranker = gr.Textbox(
                                placeholder="Search for models...",
                                label="Search",
                                scale=5
                            )
                            reranking_columns_to_show_input = gr.CheckboxGroup(
                                label="Columns to Show",
                                choices=reranking_cols,  # use cached list
                                value=reranking_columns_to_show,
                                scale=4
                            )
                        reranker_leaderboard = gr.Dataframe(
                            value=reranking_df[reranking_columns_to_show],
                            datatype="markdown",
                            wrap=False,
                            show_fullscreen_button=True,
                            interactive=False,
                            column_widths=reranking_columns_widths
                        )

                        # Wire the search box and column selector to update the leaderboard
                        search_box_reranker.input(
                            reranking_search_leaderboard,
                            inputs=[search_box_reranker, reranking_columns_to_show_input],
                            outputs=reranker_leaderboard
                        )
                        reranking_columns_to_show_input.select(
                            update_reranker_columns_to_show,
                            inputs=reranking_columns_to_show_input,
                            outputs=reranker_leaderboard
                        )
                    with gr.Tab("Submit Reranker"):
                        submit_gradio_module("Reranker")
                    with gr.Tab("About"):
                        gr.Markdown(RERANKER_ABOUT_SECTION)
        with gr.Row():
            with gr.Accordion("Citation", open=False):
                gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    lines=20,
                    elem_id="citation-button",
                    show_copy_button=True,
                )

    demo.launch()
if __name__ == "__main__":
    main()