alielfilali01 committed
Commit dfa1e52 · verified · 1 Parent(s): ad45142

Update app.py

Files changed (1)
  1. app.py +571 -406
app.py CHANGED
@@ -9,105 +9,107 @@ from huggingface_hub import HfApi, hf_hub_download
9
  OWNER = "inceptionai"
10
  DATASET_REPO_ID = f"{OWNER}/requests-dataset"
11
 
 
12
  HEADER = """
13
  <center>
14
- <h1>AraGen Leaderboard: Generative Tasks Evaluation of Arabic LLMs</h1>
15
- </center>
16
-
17
  <br></br>
18
-
19
- <p>This leaderboard introduces generative tasks evaluation for Arabic Large Language Models (LLMs). Powered by the new <strong>3C3H</strong> evaluation measure, this framework delivers a transparent, robust, and holistic evaluation system that balances factual accuracy and usability assessment for a production ready setting.</p>
20
-
21
- <p>For more details, please consider going through the technical blogpost <a href="https://huggingface.co/blog/leaderboard-3c3h-aragen">here</a>.</p>
 
22
  """
23
 
24
  ABOUT_SECTION = """
25
  ## About
26
 
27
- The AraGen Leaderboard is designed to evaluate and compare the performance of Chat Arabic Large Language Models (LLMs) on a set of generative tasks. By leveraging the new **3C3H** evaluation measure which evaluate the model's output across six dimensions —Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness— the leaderboard provides a comprehensive and holistic evaluation of a model's performance in generating human-like and ethically responsible content.
28
 
29
- ### Why Focus on Chat Models?
 
 
30
 
31
- AraGen Leaderboard —And 3C3H in general— is specifically designed to assess **chat models**, which interact in conversational settings, intended for end user interaction and require a blend of factual accuracy and user-centric dialogue capabilities. While it is technically possible to submit foundational models, we kindly ask users to refrain from doing so. For evaluations of foundational models using likelihood accuracy based benchmarks, please refer to the [Open Arabic LLM Leaderboard (OALL)](https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard).
32
 
33
- ### How to Submit Your Model?
34
 
35
- Navigate to the submission section below to submit your open chat model from the HuggingFace Hub for evaluation. Ensure that your model is public and the submmited metadata (precision, revision, #params) is accurate.
36
 
37
  ### Contact
38
 
39
- For any inquiries or assistance, feel free to reach out through the community tab at [Inception AraGen Community](https://huggingface.co/spaces/inceptionai/AraGen-Leaderboard/discussions) or via [email](mailto:[email protected]).
40
  """
41
 
42
- CITATION_BUTTON_LABEL = """
43
- Copy the following snippet to cite these results
44
- """
45
 
46
  CITATION_BUTTON_TEXT = """
47
- @misc{AraGen,
48
- author = {El Filali, Ali and Sengupta, Neha and Abouelseoud, Arwa and Nakov, Preslav and Fourrier, Clémentine},
49
- title = {Rethinking LLM Evaluation with 3C3H: AraGen Benchmark and Leaderboard},
50
- year = {2024},
51
  publisher = {Inception},
52
- howpublished = "url{https://huggingface.co/spaces/inceptionai/AraGen-Leaderboard}"
53
  }
54
  """
55
 
56
 
57
  def load_results():
58
- # Get the current directory of the script and construct the path to results.json
59
  current_dir = os.path.dirname(os.path.abspath(__file__))
60
- results_file = os.path.join(current_dir, "assets", "results", "results.json")
61
 
62
- # Load the JSON data from the specified file
63
  with open(results_file, 'r') as f:
64
  data = json.load(f)
65
 
66
  # Filter out any entries that only contain '_last_sync_timestamp'
67
  filtered_data = []
68
  for entry in data:
69
- # If '_last_sync_timestamp' is the only key, skip it
70
  if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
71
  continue
72
  filtered_data.append(entry)
73
 
74
  data = filtered_data
75
 
76
- # Lists to collect data
77
  data_3c3h = []
78
  data_tasks = []
79
 
80
  for model_data in data:
81
- # Extract model meta data
82
  meta = model_data.get('Meta', {})
83
  model_name = meta.get('Model Name', 'UNK')
84
  revision = meta.get('Revision', 'UNK')
85
  precision = meta.get('Precision', 'UNK')
86
  params = meta.get('Params', 'UNK')
87
- license = meta.get('License', 'UNK')
88
 
89
- # Convert "Model Size" to numeric, treating "UNK" as infinity
90
  try:
91
  model_size_numeric = float(params)
92
  except (ValueError, TypeError):
93
  model_size_numeric = np.inf
94
 
95
- # 3C3H Scores
96
  scores_data = model_data.get('claude-3.5-sonnet Scores', {})
97
  scores_3c3h = scores_data.get('3C3H Scores', {})
98
  scores_tasks = scores_data.get('Tasks Scores', {})
99
 
100
- # Multiply scores by 100 to get percentages (keep them as numeric values)
101
  formatted_scores_3c3h = {k: v*100 for k, v in scores_3c3h.items()}
102
  formatted_scores_tasks = {k: v*100 for k, v in scores_tasks.items()}
103
 
104
- # For 3C3H Scores DataFrame
105
  data_entry_3c3h = {
106
  'Model Name': model_name,
107
  'Revision': revision,
108
- 'License': license,
109
  'Precision': precision,
110
- 'Model Size': model_size_numeric, # Numeric value for sorting
111
  '3C3H Score': formatted_scores_3c3h.get("3C3H Score", np.nan),
112
  'Correctness': formatted_scores_3c3h.get("Correctness", np.nan),
113
  'Completeness': formatted_scores_3c3h.get("Completeness", np.nan),
@@ -118,13 +120,12 @@ def load_results():
118
  }
119
  data_3c3h.append(data_entry_3c3h)
120
 
121
- # For Tasks Scores DataFrame
122
  data_entry_tasks = {
123
  'Model Name': model_name,
124
  'Revision': revision,
125
- 'License': license,
126
  'Precision': precision,
127
- 'Model Size': model_size_numeric, # Numeric value for sorting
128
  **formatted_scores_tasks
129
  }
130
  data_tasks.append(data_entry_tasks)
@@ -132,147 +133,148 @@ def load_results():
132
  df_3c3h = pd.DataFrame(data_3c3h)
133
  df_tasks = pd.DataFrame(data_tasks)
134
 
135
- # Round the numeric score columns to 4 decimal places
136
  score_columns_3c3h = ['3C3H Score', 'Correctness', 'Completeness', 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness']
137
  df_3c3h[score_columns_3c3h] = df_3c3h[score_columns_3c3h].round(4)
138
 
139
- # Replace np.inf with a large number in 'Model Size Filter' for filtering
140
- max_model_size_value = 1000 # Define a maximum value
141
  df_3c3h['Model Size Filter'] = df_3c3h['Model Size'].replace(np.inf, max_model_size_value)
142
 
143
- # Sort df_3c3h by '3C3H Score' descending if column exists
144
  if '3C3H Score' in df_3c3h.columns:
145
  df_3c3h = df_3c3h.sort_values(by='3C3H Score', ascending=False)
146
- df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1)) # Add Rank column starting from 1
147
  else:
148
  df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
149
 
150
- # Extract task columns
151
  task_columns = [col for col in df_tasks.columns if col not in ['Model Name', 'Revision', 'License', 'Precision', 'Model Size', 'Model Size Filter']]
152
-
153
- # Round the task score columns to 4 decimal places
154
  if task_columns:
155
  df_tasks[task_columns] = df_tasks[task_columns].round(4)
156
 
157
- # Replace np.inf with a large number in 'Model Size Filter' for filtering
158
  df_tasks['Model Size Filter'] = df_tasks['Model Size'].replace(np.inf, max_model_size_value)
159
 
160
- # Sort df_tasks by the first task column if it exists
161
  if task_columns:
162
  first_task = task_columns[0]
163
  df_tasks = df_tasks.sort_values(by=first_task, ascending=False)
164
- df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1)) # Add Rank column starting from 1
165
  else:
166
  df_tasks = df_tasks.sort_values(by='Model Name', ascending=True)
167
  df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
168
 
169
  return df_3c3h, df_tasks, task_columns
170
 
171
- def load_requests(status_folder):
172
- api = HfApi()
173
- requests_data = []
174
- folder_path_in_repo = status_folder # 'pending', 'finished', or 'failed'
175
-
176
- hf_api_token = os.environ.get('HF_API_TOKEN', None)
177
 
178
- try:
179
- # List files in the dataset repository
180
- files_info = api.list_repo_files(
181
- repo_id=DATASET_REPO_ID,
182
- repo_type="dataset",
183
- token=hf_api_token
184
- )
185
- except Exception as e:
186
- print(f"Error accessing dataset repository: {e}")
187
- return pd.DataFrame() # Return empty DataFrame if repository not found or inaccessible
188
-
189
- # Filter files in the desired folder
190
- files_in_folder = [f for f in files_info if f.startswith(f"{folder_path_in_repo}/") and f.endswith('.json')]
191
-
192
- for file_path in files_in_folder:
 
193
  try:
194
- # Download the JSON file
195
- local_file_path = hf_hub_download(
196
- repo_id=DATASET_REPO_ID,
197
- filename=file_path,
198
- repo_type="dataset",
199
- token=hf_api_token
200
- )
201
- # Load JSON data
202
- with open(local_file_path, 'r') as f:
203
- request = json.load(f)
204
- requests_data.append(request)
205
- except Exception as e:
206
- print(f"Error loading file {file_path}: {e}")
207
- continue # Skip files that can't be loaded
208
-
209
- df = pd.DataFrame(requests_data)
210
  return df
211
 
212
- def submit_model(model_name, revision, precision, params, license):
213
- # Load existing evaluations
214
  df_3c3h, df_tasks, _ = load_results()
215
  existing_models_results = df_3c3h[['Model Name', 'Revision', 'Precision']]
216
 
217
- # Handle 'Missing' precision
218
  if precision == 'Missing':
219
  precision = None
220
  else:
221
  precision = precision.strip().lower()
222
 
223
- # Load pending and finished requests from the dataset repository
224
  df_pending = load_requests('pending')
225
  df_finished = load_requests('finished')
226
 
227
- # Check if model is already evaluated
228
- model_exists_in_results = ((existing_models_results['Model Name'] == model_name) &
229
- (existing_models_results['Revision'] == revision) &
230
- (existing_models_results['Precision'] == precision)).any()
 
231
  if model_exists_in_results:
232
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
233
 
234
- # Check if model is in pending requests
235
  if not df_pending.empty:
236
  existing_models_pending = df_pending[['model_name', 'revision', 'precision']]
237
- model_exists_in_pending = ((existing_models_pending['model_name'] == model_name) &
238
- (existing_models_pending['revision'] == revision) &
239
- (existing_models_pending['precision'] == precision)).any()
 
 
240
  if model_exists_in_pending:
241
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' is already in the pending evaluations.**"
242
 
243
- # Check if model is in finished requests
244
  if not df_finished.empty:
245
  existing_models_finished = df_finished[['model_name', 'revision', 'precision']]
246
- model_exists_in_finished = ((existing_models_finished['model_name'] == model_name) &
247
- (existing_models_finished['revision'] == revision) &
248
- (existing_models_finished['precision'] == precision)).any()
 
 
249
  if model_exists_in_finished:
250
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
251
 
252
- # Check if model exists on HuggingFace Hub
253
  api = HfApi()
254
  try:
255
- model_info = api.model_info(model_name)
256
- except Exception as e:
257
  return f"**Error: Could not find model '{model_name}' on HuggingFace Hub. Please ensure the model name is correct and the model is public.**"
258
 
259
- # Proceed with submission
260
  status = "PENDING"
261
-
262
- # Prepare the submission data
263
  submission = {
264
  "model_name": model_name,
265
  "license": license,
266
  "revision": revision,
267
  "precision": precision,
 
268
  "status": status,
269
- "params": params
270
  }
271
-
272
- # Serialize the submission to JSON
273
  submission_json = json.dumps(submission, indent=2)
274
 
275
- # Define the file path in the repository
276
  org_model = model_name.split('/')
277
  if len(org_model) != 2:
278
  return "**Please enter the full model name including the organization or username, e.g., 'inceptionai/jais-family-30b-8k'**"
@@ -280,7 +282,6 @@ def submit_model(model_name, revision, precision, params, license):
280
  precision_str = precision if precision else 'Missing'
281
  file_path_in_repo = f"pending/{org}/{model_id}_eval_request_{revision}_{precision_str}.json"
282
 
283
- # Upload the submission to the dataset repository
284
  try:
285
  hf_api_token = os.environ.get('HF_API_TOKEN', None)
286
  api.upload_file(
@@ -295,10 +296,207 @@ def submit_model(model_name, revision, precision, params, license):
295
 
296
  return f"**Model '{model_name}' has been submitted for evaluation.**"
297
298
  def main():
299
  df_3c3h, df_tasks, task_columns = load_results()
 
300
 
301
- # Extract unique Precision and License values for filters
302
  precision_options_3c3h = sorted(df_3c3h['Precision'].dropna().unique().tolist())
303
  precision_options_3c3h = [p for p in precision_options_3c3h if p != 'UNK']
304
  precision_options_3c3h.append('Missing')
@@ -307,6 +505,7 @@ def main():
307
  license_options_3c3h = [l for l in license_options_3c3h if l != 'UNK']
308
  license_options_3c3h.append('Missing')
309
 
 
310
  precision_options_tasks = sorted(df_tasks['Precision'].dropna().unique().tolist())
311
  precision_options_tasks = [p for p in precision_options_tasks if p != 'UNK']
312
  precision_options_tasks.append('Missing')
@@ -315,361 +514,327 @@ def main():
315
  license_options_tasks = [l for l in license_options_tasks if l != 'UNK']
316
  license_options_tasks.append('Missing')
317
 
318
- # Get min and max model sizes for sliders, handling 'inf' values
319
  min_model_size_3c3h = int(df_3c3h['Model Size Filter'].min())
320
  max_model_size_3c3h = int(df_3c3h['Model Size Filter'].max())
321
 
 
322
  min_model_size_tasks = int(df_tasks['Model Size Filter'].min())
323
  max_model_size_tasks = int(df_tasks['Model Size Filter'].max())
324
 
325
- # Exclude 'Model Size Filter' from column selectors
326
- column_choices_3c3h = [col for col in df_3c3h.columns if col != 'Model Size Filter']
327
- column_choices_tasks = [col for col in df_tasks.columns if col != 'Model Size Filter']
328
-
329
  with gr.Blocks() as demo:
330
  gr.HTML(HEADER)
331
-
332
  with gr.Tabs():
333
- with gr.Tab("Leaderboard"):
334
- with gr.Tabs():
335
- with gr.Tab("3C3H Scores"):
336
- with gr.Row():
337
- search_box_3c3h = gr.Textbox(
338
- placeholder="Search for models...",
339
- label="Search",
340
- interactive=True
341
- )
342
- with gr.Row():
343
- column_selector_3c3h = gr.CheckboxGroup(
344
- choices=column_choices_3c3h,
345
- value=[
346
  'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
347
  'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
348
- ],
349
- label="Select columns to display",
350
- )
351
- with gr.Row():
352
- license_filter_3c3h = gr.CheckboxGroup(
353
- choices=license_options_3c3h,
354
- value=license_options_3c3h.copy(), # Default all selected
355
- label="Filter by License",
356
- )
357
- precision_filter_3c3h = gr.CheckboxGroup(
358
- choices=precision_options_3c3h,
359
- value=precision_options_3c3h.copy(), # Default all selected
360
- label="Filter by Precision",
361
- )
362
- with gr.Row():
363
- model_size_min_filter_3c3h = gr.Slider(
364
- minimum=min_model_size_3c3h,
365
- maximum=max_model_size_3c3h,
366
- value=min_model_size_3c3h,
367
- step=1,
368
- label="Minimum Model Size",
369
- interactive=True
370
  )
371
- model_size_max_filter_3c3h = gr.Slider(
372
- minimum=min_model_size_3c3h,
373
- maximum=max_model_size_3c3h,
374
- value=max_model_size_3c3h,
375
- step=1,
376
- label="Maximum Model Size",
377
- interactive=True
378
  )
379
-
380
- leaderboard_3c3h = gr.Dataframe(
381
- df_3c3h[['Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
382
- 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness']],
383
- interactive=False
384
- )
385
-
386
- def filter_df_3c3h(search_query, selected_cols, precision_filters, license_filters, min_size, max_size):
387
- filtered_df = df_3c3h.copy()
388
-
389
- # Ensure min_size <= max_size
390
- if min_size > max_size:
391
- min_size, max_size = max_size, min_size
392
-
393
- # Apply search filter
394
- if search_query:
395
- filtered_df = filtered_df[filtered_df['Model Name'].str.contains(search_query, case=False, na=False)]
396
-
397
- # Apply Precision filter
398
- if precision_filters:
399
- include_missing = 'Missing' in precision_filters
400
- selected_precisions = [p for p in precision_filters if p != 'Missing']
401
- if include_missing:
402
- filtered_df = filtered_df[
403
- (filtered_df['Precision'].isin(selected_precisions)) |
404
- (filtered_df['Precision'] == 'UNK') |
405
- (filtered_df['Precision'].isna())
406
- ]
407
- else:
408
- filtered_df = filtered_df[filtered_df['Precision'].isin(selected_precisions)]
409
-
410
- # Apply License filter
411
- if license_filters:
412
- include_missing = 'Missing' in license_filters
413
- selected_licenses = [l for l in license_filters if l != 'Missing']
414
- if include_missing:
415
- filtered_df = filtered_df[
416
- (filtered_df['License'].isin(selected_licenses)) |
417
- (filtered_df['License'] == 'UNK') |
418
- (filtered_df['License'].isna())
419
- ]
420
- else:
421
- filtered_df = filtered_df[filtered_df['License'].isin(selected_licenses)]
422
-
423
- # Apply Model Size filter
424
- filtered_df = filtered_df[
425
- (filtered_df['Model Size Filter'] >= min_size) &
426
- (filtered_df['Model Size Filter'] <= max_size)
427
  ]
428
-
429
- # Remove existing 'Rank' column if present
430
- if 'Rank' in filtered_df.columns:
431
- filtered_df = filtered_df.drop(columns=['Rank'])
432
-
433
- # Recalculate Rank after filtering
434
- filtered_df = filtered_df.reset_index(drop=True)
435
- filtered_df.insert(0, 'Rank', range(1, len(filtered_df) + 1))
436
-
437
- # Ensure selected columns are present
438
- selected_cols = [col for col in selected_cols if col in filtered_df.columns]
439
-
440
- return filtered_df[selected_cols]
441
-
442
- # Bind the filter function to the appropriate events
443
- filter_inputs_3c3h = [
444
- search_box_3c3h,
445
- column_selector_3c3h,
446
- precision_filter_3c3h,
447
- license_filter_3c3h,
448
- model_size_min_filter_3c3h,
449
- model_size_max_filter_3c3h
450
- ]
451
- search_box_3c3h.submit(
452
- filter_df_3c3h,
453
- inputs=filter_inputs_3c3h,
454
- outputs=leaderboard_3c3h
455
- )
456
-
457
- # Bind change events for CheckboxGroups and sliders
458
- for component in filter_inputs_3c3h:
459
- component.change(
460
- filter_df_3c3h,
461
- inputs=filter_inputs_3c3h,
462
- outputs=leaderboard_3c3h
463
  )
464
-
465
- with gr.Tab("Tasks Scores"):
466
- gr.Markdown("""
467
- Note: This Table is sorted based on the First Task (Question Answering)
468
- """)
469
-
470
  with gr.Row():
471
- search_box_tasks = gr.Textbox(
472
  placeholder="Search for models...",
473
  label="Search",
474
  interactive=True
475
  )
476
  with gr.Row():
477
- column_selector_tasks = gr.CheckboxGroup(
478
- choices=column_choices_tasks,
479
- value=['Rank', 'Model Name'] + task_columns,
480
- label="Select columns to display",
481
  )
482
  with gr.Row():
483
- license_filter_tasks = gr.CheckboxGroup(
484
- choices=license_options_tasks,
485
- value=license_options_tasks.copy(), # Default all selected
486
- label="Filter by License",
487
- )
488
- precision_filter_tasks = gr.CheckboxGroup(
489
- choices=precision_options_tasks,
490
- value=precision_options_tasks.copy(), # Default all selected
491
- label="Filter by Precision",
492
  )
493
  with gr.Row():
494
- model_size_min_filter_tasks = gr.Slider(
495
- minimum=min_model_size_tasks,
496
- maximum=max_model_size_tasks,
497
- value=min_model_size_tasks,
498
  step=1,
499
  label="Minimum Model Size",
500
  interactive=True
501
  )
502
- model_size_max_filter_tasks = gr.Slider(
503
- minimum=min_model_size_tasks,
504
- maximum=max_model_size_tasks,
505
- value=max_model_size_tasks,
506
  step=1,
507
  label="Maximum Model Size",
508
  interactive=True
509
  )
510
-
511
- leaderboard_tasks = gr.Dataframe(
512
- df_tasks[['Rank', 'Model Name'] + task_columns],
513
- interactive=False
514
- )
515
-
516
- def filter_df_tasks(search_query, selected_cols, precision_filters, license_filters, min_size, max_size):
517
- filtered_df = df_tasks.copy()
518
-
519
- # Ensure min_size <= max_size
520
- if min_size > max_size:
521
- min_size, max_size = max_size, min_size
522
-
523
- # Apply search filter
524
- if search_query:
525
- filtered_df = filtered_df[filtered_df['Model Name'].str.contains(search_query, case=False, na=False)]
526
-
527
- # Apply Precision filter
528
- if precision_filters:
529
- include_missing = 'Missing' in precision_filters
530
- selected_precisions = [p for p in precision_filters if p != 'Missing']
531
- if include_missing:
532
- filtered_df = filtered_df[
533
- (filtered_df['Precision'].isin(selected_precisions)) |
534
- (filtered_df['Precision'] == 'UNK') |
535
- (filtered_df['Precision'].isna())
536
- ]
537
- else:
538
- filtered_df = filtered_df[filtered_df['Precision'].isin(selected_precisions)]
539
-
540
- # Apply License filter
541
- if license_filters:
542
- include_missing = 'Missing' in license_filters
543
- selected_licenses = [l for l in license_filters if l != 'Missing']
544
- if include_missing:
545
- filtered_df = filtered_df[
546
- (filtered_df['License'].isin(selected_licenses)) |
547
- (filtered_df['License'] == 'UNK') |
548
- (filtered_df['License'].isna())
549
- ]
550
- else:
551
- filtered_df = filtered_df[filtered_df['License'].isin(selected_licenses)]
552
-
553
- # Apply Model Size filter
554
- filtered_df = filtered_df[
555
- (filtered_df['Model Size Filter'] >= min_size) &
556
- (filtered_df['Model Size Filter'] <= max_size)
557
- ]
558
-
559
- # Remove existing 'Rank' column if present
560
- if 'Rank' in filtered_df.columns:
561
- filtered_df = filtered_df.drop(columns=['Rank'])
562
-
563
- # Sort by the first task column if it exists
564
- if task_columns:
565
- first_task = task_columns[0]
566
- filtered_df = filtered_df.sort_values(by=first_task, ascending=False)
567
- else:
568
- filtered_df = filtered_df.sort_values(by='Model Name', ascending=True)
569
-
570
- # Recalculate Rank after filtering
571
- filtered_df = filtered_df.reset_index(drop=True)
572
- filtered_df.insert(0, 'Rank', range(1, len(filtered_df) + 1))
573
-
574
- # Ensure selected columns are present
575
- selected_cols = [col for col in selected_cols if col in filtered_df.columns]
576
-
577
- return filtered_df[selected_cols]
578
-
579
- # Bind the filter function to the appropriate events
580
- filter_inputs_tasks = [
581
- search_box_tasks,
582
- column_selector_tasks,
583
- precision_filter_tasks,
584
- license_filter_tasks,
585
- model_size_min_filter_tasks,
586
- model_size_max_filter_tasks
587
- ]
588
- search_box_tasks.submit(
589
- filter_df_tasks,
590
- inputs=filter_inputs_tasks,
591
- outputs=leaderboard_tasks
592
- )
593
-
594
- # Bind change events for CheckboxGroups and sliders
595
- for component in filter_inputs_tasks:
596
- component.change(
597
- filter_df_tasks,
598
- inputs=filter_inputs_tasks,
599
- outputs=leaderboard_tasks
600
- )
601
-
602
- with gr.Tab("Submit Here"):
603
  gr.Markdown(ABOUT_SECTION)
604
- gr.Markdown("---")
605
- gr.Markdown("# Submit Your Model for Evaluation")
606
  with gr.Column():
607
  model_name_input = gr.Textbox(
608
- label="Model Name",
609
  placeholder="Enter the full model name from HuggingFace Hub (e.g., inceptionai/jais-family-30b-8k)"
610
  )
611
- revision_input = gr.Textbox(
612
- label="Revision",
613
- placeholder="main",
614
- value="main"
615
- )
616
  precision_input = gr.Dropdown(
617
- choices=["float16", "float32", "bfloat16", "8bit", "4bit"],
618
  label="Precision",
619
  value="float16"
620
  )
621
  params_input = gr.Textbox(
622
- label="Params",
623
  placeholder="Enter the approximate number of parameters as Integer (e.g., 7, 13, 30, 70 ...)"
624
  )
625
- # Changed from Dropdown to Textbox with default value "Open"
626
  license_input = gr.Textbox(
627
- label="License",
628
- placeholder="Enter the license type (Generic one is 'Open' in case no License is provided)",
629
  value="Open"
630
  )
631
  submit_button = gr.Button("Submit Model")
632
  submission_result = gr.Markdown()
633
-
634
  submit_button.click(
635
  submit_model,
636
- inputs=[model_name_input, revision_input, precision_input, params_input, license_input],
637
  outputs=submission_result
638
  )
639
-
640
- # Load pending, finished, and failed requests
641
- df_pending = load_requests('pending')
642
- df_finished = load_requests('finished')
643
- df_failed = load_requests('failed')
644
-
645
- # Display the tables
646
- gr.Markdown("## Evaluation Status of Open Models from the 🤗 Hub")
647
- with gr.Accordion(f"Pending Evaluations ({len(df_pending)})", open=False):
648
- if not df_pending.empty:
649
- gr.Dataframe(df_pending)
650
- else:
651
- gr.Markdown("No pending evaluations.")
652
- with gr.Accordion(f"Finished Evaluations ({len(df_finished)})", open=False):
653
- if not df_finished.empty:
654
- gr.Dataframe(df_finished)
655
- else:
656
- gr.Markdown("No finished evaluations.")
657
- with gr.Accordion(f"Failed Evaluations ({len(df_failed)})", open=False):
658
- if not df_failed.empty:
659
- gr.Dataframe(df_failed)
660
- else:
661
- gr.Markdown("No failed evaluations.")
662
  with gr.Row():
663
  with gr.Accordion("📙 Citation", open=False):
664
  citation_button = gr.Textbox(
665
  value=CITATION_BUTTON_TEXT,
666
  label=CITATION_BUTTON_LABEL,
667
- lines=20,
668
  elem_id="citation-button",
669
- show_copy_button=True,
670
  )
671
 
672
- demo.launch()
673
 
674
  if __name__ == "__main__":
675
  main()
 
9
  OWNER = "inceptionai"
10
  DATASET_REPO_ID = f"{OWNER}/requests-dataset"
11
 
12
+
13
  HEADER = """
14
  <center>
 
 
 
15
  <br></br>
16
+ <h1>Arabic Leaderboards</h1>
17
+ <h2>Comprehensive Evaluation of Arabic Large Language Models</h2>
18
+ <br></br>
19
+ <br></br>
20
+ </center>
21
  """
22
 
23
  ABOUT_SECTION = """
24
  ## About
25
 
26
+ In our `12-24` release, we introduced the `AraGen Benchmark` along with the `3C3H` evaluation measure (aka the 3C3H Score). You can find more details about AraGen and 3C3H [here](https://huggingface.co/blog/leaderboard-3c3h-aragen), and the first version of the benchmark, `AraGen-12-24`, [here](https://huggingface.co/datasets/inceptionai/AraGen). Building on that foundation, this new release expands the space to incorporate additional tasks and evaluation metrics.
27
 
28
+ In this release, we present two leaderboards:
29
+
30
+ **AraGen-03-25 (v2):**
31
 
32
+ - The AraGen Benchmark is designed to evaluate and compare the performance of Chat/Instruct Arabic Large Language Models on a suite of generative tasks that are culturally relevant to the Arab region (history, politics, cuisine, etc.). By leveraging **3C3H** as an evaluation metric, which assesses a model's output across six dimensions (Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness), the leaderboard offers a comprehensive and holistic evaluation of a model's chat capabilities and its ability to generate human-like and ethically responsible content.
33
 
34
+ **Instruction Following:**
35
 
36
+ - We have established a robust leaderboard that benchmarks models on Arabic and English instruction following, offering an open and comparative performance landscape for the research community. Concurrently, we released the first publicly available Arabic [dataset](https://huggingface.co/datasets/inceptionai/Arabic_IFEval) aimed at evaluating LLMs' ability to follow instructions. The Arabic IFEval samples are meticulously curated to capture the language’s unique nuances—such as diacritization and distinctive phonetic features—often overlooked in generic datasets. Our dedicated linguistic team generated original samples and adapted selections from the IFEval English dataset, ensuring that the material resonates with Arabic cultural contexts and meets the highest standards of authenticity and quality.
37
+
38
+ ### Why Focus on Chat Models?
39
+
40
+ Our evaluations are conducted in a generative mode, meaning that we expect models to produce complete, context-rich responses rather than simply predicting the next token as base models do. This approach not only yields results that are more explainable and nuanced compared to logit-based measurements, but it also captures elements like creativity, coherence, and ethical considerations—providing deeper insights into overall model performance.
41
 
42
  ### Contact
43
 
44
+ For inquiries or assistance, please join the conversation on our [Discussions Tab](https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/discussions) or reach out via [email](mailto:[email protected]).
45
  """
46
 
47
+ BOTTOM_LOGO = """<img src="https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/resolve/main/assets/pictures/03-25/arabic-leaderboards-colab-march-preview-free-3.png" style="width:50%;display:block;margin-left:auto;margin-right:auto;border-radius:15px;">"""
 
 
48
 
49
  CITATION_BUTTON_TEXT = """
50
+ @misc{Arabic-Leaderboards,
51
+ author = {El Filali, Ali and Albarri, Sarah and Abouelseoud, Arwa and Kamboj, Samta and Sengupta, Neha and Nakov, Preslav},
52
+ title = {Arabic-Leaderboards: Comprehensive Evaluation of Arabic Large Language Models},
53
+ year = {2025},
54
  publisher = {Inception},
55
+ howpublished = "url{https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards}"
56
  }
57
  """
58
 
59
+ CITATION_BUTTON_LABEL = """
60
+ Copy the following snippet to cite the results from all Arabic Leaderboards in this Space.
61
+ """
62
+
63
 
64
  def load_results():
65
+ """
66
+ Loads the AraGen v2 results from aragen_v2_results.json and returns two dataframes:
67
+ 1) df_3c3h with columns for 3C3H scores
68
+ 2) df_tasks with columns for tasks scores
69
+ """
70
  current_dir = os.path.dirname(os.path.abspath(__file__))
71
+ results_file = os.path.join(current_dir, "assets", "results", "aragen_v2_results.json")
72
 
 
73
  with open(results_file, 'r') as f:
74
  data = json.load(f)
75
 
76
  # Filter out any entries that only contain '_last_sync_timestamp'
77
  filtered_data = []
78
  for entry in data:
 
79
  if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
80
  continue
81
  filtered_data.append(entry)
82
 
83
  data = filtered_data
84
 
 
85
  data_3c3h = []
86
  data_tasks = []
87
 
88
  for model_data in data:
 
89
  meta = model_data.get('Meta', {})
90
  model_name = meta.get('Model Name', 'UNK')
91
  revision = meta.get('Revision', 'UNK')
92
  precision = meta.get('Precision', 'UNK')
93
  params = meta.get('Params', 'UNK')
 
94
 
 
95
  try:
96
  model_size_numeric = float(params)
97
  except (ValueError, TypeError):
98
  model_size_numeric = np.inf
99
 
 
100
  scores_data = model_data.get('claude-3.5-sonnet Scores', {})
101
  scores_3c3h = scores_data.get('3C3H Scores', {})
102
  scores_tasks = scores_data.get('Tasks Scores', {})
103
 
 
104
  formatted_scores_3c3h = {k: v*100 for k, v in scores_3c3h.items()}
105
  formatted_scores_tasks = {k: v*100 for k, v in scores_tasks.items()}
106
 
 
107
  data_entry_3c3h = {
108
  'Model Name': model_name,
109
  'Revision': revision,
110
+ 'License': meta.get('License', 'UNK'),
111
  'Precision': precision,
112
+ 'Model Size': model_size_numeric,
113
  '3C3H Score': formatted_scores_3c3h.get("3C3H Score", np.nan),
114
  'Correctness': formatted_scores_3c3h.get("Correctness", np.nan),
115
  'Completeness': formatted_scores_3c3h.get("Completeness", np.nan),
 
120
  }
121
  data_3c3h.append(data_entry_3c3h)
122
 
 
123
  data_entry_tasks = {
124
  'Model Name': model_name,
125
  'Revision': revision,
126
+ 'License': meta.get('License', 'UNK'),
127
  'Precision': precision,
128
+ 'Model Size': model_size_numeric,
129
  **formatted_scores_tasks
130
  }
131
  data_tasks.append(data_entry_tasks)
 
133
  df_3c3h = pd.DataFrame(data_3c3h)
134
  df_tasks = pd.DataFrame(data_tasks)
135
 
 
136
  score_columns_3c3h = ['3C3H Score', 'Correctness', 'Completeness', 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness']
137
  df_3c3h[score_columns_3c3h] = df_3c3h[score_columns_3c3h].round(4)
138
 
139
+ max_model_size_value = 1000
 
140
  df_3c3h['Model Size Filter'] = df_3c3h['Model Size'].replace(np.inf, max_model_size_value)
141
 
 
142
  if '3C3H Score' in df_3c3h.columns:
143
  df_3c3h = df_3c3h.sort_values(by='3C3H Score', ascending=False)
144
+ df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
145
  else:
146
  df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
147
 
 
148
  task_columns = [col for col in df_tasks.columns if col not in ['Model Name', 'Revision', 'License', 'Precision', 'Model Size', 'Model Size Filter']]
 
 
149
  if task_columns:
150
  df_tasks[task_columns] = df_tasks[task_columns].round(4)
151
 
 
152
  df_tasks['Model Size Filter'] = df_tasks['Model Size'].replace(np.inf, max_model_size_value)
153
 
 
154
  if task_columns:
155
  first_task = task_columns[0]
156
  df_tasks = df_tasks.sort_values(by=first_task, ascending=False)
157
+ df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
158
  else:
159
  df_tasks = df_tasks.sort_values(by='Model Name', ascending=True)
160
  df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
161
 
162
  return df_3c3h, df_tasks, task_columns
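
load_results() only relies on the JSON keys shown above. A hypothetical entry of assets/results/aragen_v2_results.json illustrating that assumed shape (the model name and all score values are placeholders; the task names match the fixed column order used later in filter_df_tasks):

```python
# Hypothetical record shape assumed by load_results(); values are placeholders.
sample_entry = {
    "Meta": {
        "Model Name": "org-name/model-name",
        "Revision": "main",
        "Precision": "bfloat16",
        "Params": "7",
        "License": "Apache-2.0",
    },
    "claude-3.5-sonnet Scores": {
        "3C3H Scores": {
            "3C3H Score": 0.55, "Correctness": 0.60, "Completeness": 0.58,
            "Conciseness": 0.40, "Helpfulness": 0.52, "Honesty": 0.65,
            "Harmlessness": 0.95,
        },
        "Tasks Scores": {
            "Question Answering (QA)": 0.57,
            "Orthographic and Grammatical Analysis": 0.41,
            "Safety": 0.90,
            "Reasoning": 0.38,
        },
    },
}
```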
163
164
 
165
+ def load_if_data():
166
+ """
167
+ Loads the instruction-following data from ifeval_results.jsonl
168
+ and returns a dataframe with relevant columns,
169
+ converting decimal values to percentage format.
170
+ """
171
+ current_dir = os.path.dirname(os.path.abspath(__file__))
172
+ results_file = os.path.join(current_dir, "assets", "results", "ifeval_results.jsonl")
173
+
174
+ data = []
175
+ with open(results_file, "r", encoding="utf-8") as f:
176
+ for line in f:
177
+ line = line.strip()
178
+ if not line:
179
+ continue
180
+ data.append(json.loads(line))
181
+
182
+ df = pd.DataFrame(data)
183
+
184
+ # Convert numeric columns
185
+ numeric_cols = ["En Prompt-lvl", "En Instruction-lvl", "Ar Prompt-lvl", "Ar Instruction-lvl"]
186
+ for col in numeric_cols:
187
+ df[col] = pd.to_numeric(df[col], errors="coerce")
188
+
189
+ # Compute average accuracy for En and Ar
190
+ df["Average Accuracy (En)"] = (df["En Prompt-lvl"] + df["En Instruction-lvl"]) / 2
191
+ df["Average Accuracy (Ar)"] = (df["Ar Prompt-lvl"] + df["Ar Instruction-lvl"]) / 2
192
+
193
+ # Convert them to percentage format (e.g., 0.871 -> 87.1)
194
+ for col in numeric_cols:
195
+ df[col] = (df[col] * 100).round(1)
196
+ df["Average Accuracy (En)"] = (df["Average Accuracy (En)"] * 100).round(1)
197
+ df["Average Accuracy (Ar)"] = (df["Average Accuracy (Ar)"] * 100).round(1)
198
+
199
+ # Handle size as numeric
200
+ def parse_size(x):
201
  try:
202
+ return float(x)
203
+ except:
204
+ return np.inf
205
+
206
+ df["Model Size"] = df["Size (B)"].apply(parse_size)
207
+
208
+ # Add a filter column for size
209
+ max_model_size_value = 1000
210
+ df["Model Size Filter"] = df["Model Size"].replace(np.inf, max_model_size_value)
211
+
212
+ # Sort by "Average Accuracy (Ar)" as an example
213
+ df = df.sort_values(by="Average Accuracy (Ar)", ascending=False)
214
+ df = df.reset_index(drop=True)
215
+ df.insert(0, "Rank", range(1, len(df) + 1))
216
+
 
217
  return df
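
load_if_data() expects one JSON object per line in assets/results/ifeval_results.jsonl. A hypothetical record using the field names the function reads (the model and all values are placeholders), with the conversion it then applies:

```python
import json

# Hypothetical single line of ifeval_results.jsonl; field names follow the code
# above, the model and numbers are placeholders.
sample_line = json.dumps({
    "Model Name": "org-name/model-name",
    "Creator": "Org Name",
    "Family": "Example-Family",
    "Type": "Instruct",
    "Size (B)": "7",
    "Base Model": "org-name/base-model",
    "Context Window": "8k",
    "Lang.": "ar/en",
    "En Prompt-lvl": 0.86, "En Instruction-lvl": 0.90,
    "Ar Prompt-lvl": 0.64, "Ar Instruction-lvl": 0.72,
})
# load_if_data() averages prompt- and instruction-level accuracy per language
# and renders percentages, e.g. (0.64 + 0.72) / 2 -> 68.0 for "Average Accuracy (Ar)".
```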
218
 
219
+
220
+ def submit_model(model_name, revision, precision, params, license, modality):
221
  df_3c3h, df_tasks, _ = load_results()
222
  existing_models_results = df_3c3h[['Model Name', 'Revision', 'Precision']]
223
 
 
224
  if precision == 'Missing':
225
  precision = None
226
  else:
227
  precision = precision.strip().lower()
228
 
 
229
  df_pending = load_requests('pending')
230
  df_finished = load_requests('finished')
231
 
232
+ model_exists_in_results = (
233
+ (existing_models_results['Model Name'] == model_name) &
234
+ (existing_models_results['Revision'] == revision) &
235
+ (existing_models_results['Precision'] == precision)
236
+ ).any()
237
  if model_exists_in_results:
238
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
239
 
 
240
  if not df_pending.empty:
241
  existing_models_pending = df_pending[['model_name', 'revision', 'precision']]
242
+ model_exists_in_pending = (
243
+ (existing_models_pending['model_name'] == model_name) &
244
+ (existing_models_pending['revision'] == revision) &
245
+ (existing_models_pending['precision'] == precision)
246
+ ).any()
247
  if model_exists_in_pending:
248
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' is already in the pending evaluations.**"
249
 
 
250
  if not df_finished.empty:
251
  existing_models_finished = df_finished[['model_name', 'revision', 'precision']]
252
+ model_exists_in_finished = (
253
+ (existing_models_finished['model_name'] == model_name) &
254
+ (existing_models_finished['revision'] == revision) &
255
+ (existing_models_finished['precision'] == precision)
256
+ ).any()
257
  if model_exists_in_finished:
258
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
259
 
 
260
  api = HfApi()
261
  try:
262
+ _ = api.model_info(model_name)
263
+ except Exception:
264
  return f"**Error: Could not find model '{model_name}' on HuggingFace Hub. Please ensure the model name is correct and the model is public.**"
265
 
 
266
  status = "PENDING"
 
 
267
  submission = {
268
  "model_name": model_name,
269
  "license": license,
270
  "revision": revision,
271
  "precision": precision,
272
+ "params": params,
273
  "status": status,
274
+ "modality": modality
275
  }
 
 
276
  submission_json = json.dumps(submission, indent=2)
277
 
 
278
  org_model = model_name.split('/')
279
  if len(org_model) != 2:
280
  return "**Please enter the full model name including the organization or username, e.g., 'inceptionai/jais-family-30b-8k'**"
 
282
  precision_str = precision if precision else 'Missing'
283
  file_path_in_repo = f"pending/{org}/{model_id}_eval_request_{revision}_{precision_str}.json"
284
 
 
285
  try:
286
  hf_api_token = os.environ.get('HF_API_TOKEN', None)
287
  api.upload_file(
 
296
 
297
  return f"**Model '{model_name}' has been submitted for evaluation.**"
298
 
299
+
300
+ def load_requests(status_folder):
301
+ api = HfApi()
302
+ requests_data = []
303
+ folder_path_in_repo = status_folder
304
+
305
+ hf_api_token = os.environ.get('HF_API_TOKEN', None)
306
+
307
+ try:
308
+ files_info = api.list_repo_files(
309
+ repo_id=DATASET_REPO_ID,
310
+ repo_type="dataset",
311
+ token=hf_api_token
312
+ )
313
+ except Exception as e:
314
+ print(f"Error accessing dataset repository: {e}")
315
+ return pd.DataFrame()
316
+
317
+ files_in_folder = [f for f in files_info if f.startswith(f"{folder_path_in_repo}/") and f.endswith('.json')]
318
+
319
+ for file_path in files_in_folder:
320
+ try:
321
+ local_file_path = hf_hub_download(
322
+ repo_id=DATASET_REPO_ID,
323
+ filename=file_path,
324
+ repo_type="dataset",
325
+ token=hf_api_token
326
+ )
327
+ with open(local_file_path, 'r') as f:
328
+ request = json.load(f)
329
+ requests_data.append(request)
330
+ except Exception as e:
331
+ print(f"Error loading file {file_path}: {e}")
332
+ continue
333
+
334
+ df = pd.DataFrame(requests_data)
335
+ return df
336
+
337
+
338
+ def filter_df_3c3h(search_query, selected_cols, precision_filters, license_filters, min_size, max_size):
339
+ df_ = load_results()[0].copy()
340
+ if min_size > max_size:
341
+ min_size, max_size = max_size, min_size
342
+ if search_query:
343
+ df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
344
+ if precision_filters:
345
+ include_missing = 'Missing' in precision_filters
346
+ selected_precisions = [p for p in precision_filters if p != 'Missing']
347
+ if include_missing:
348
+ df_ = df_[
349
+ (df_['Precision'].isin(selected_precisions)) |
350
+ (df_['Precision'] == 'UNK') |
351
+ (df_['Precision'].isna())
352
+ ]
353
+ else:
354
+ df_ = df_[df_['Precision'].isin(selected_precisions)]
355
+ if license_filters:
356
+ include_missing = 'Missing' in license_filters
357
+ selected_licenses = [l for l in license_filters if l != 'Missing']
358
+ if include_missing:
359
+ df_ = df_[
360
+ (df_['License'].isin(selected_licenses)) |
361
+ (df_['License'] == 'UNK') |
362
+ (df_['License'].isna())
363
+ ]
364
+ else:
365
+ df_ = df_[df_['License'].isin(selected_licenses)]
366
+ df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
367
+ if 'Rank' in df_.columns:
368
+ df_ = df_.drop(columns=['Rank'])
369
+ df_ = df_.reset_index(drop=True)
370
+ df_.insert(0, 'Rank', range(1, len(df_)+1))
371
+ fixed_column_order = [
372
+ "Rank",
373
+ "Model Name",
374
+ "3C3H Score",
375
+ "Correctness",
376
+ "Completeness",
377
+ "Conciseness",
378
+ "Helpfulness",
379
+ "Honesty",
380
+ "Harmlessness",
381
+ "Revision",
382
+ "License",
383
+ "Precision",
384
+ "Model Size"
385
+ ]
386
+
387
+ selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
388
+
389
+ return df_[selected_cols]
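
For reference, a hypothetical direct call with the same argument order the Gradio event handlers use (the column and filter values below are examples):

```python
# Hypothetical call mirroring how the UI events invoke filter_df_3c3h().
view = filter_df_3c3h(
    search_query="jais",
    selected_cols=["Rank", "Model Name", "3C3H Score"],
    precision_filters=["float16", "Missing"],
    license_filters=["Open", "Missing"],
    min_size=0,
    max_size=1000,
)
# `view` is re-ranked after filtering and restored to the fixed display order
# defined inside the function.
```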
390
+
391
+
392
+ def filter_df_tasks(search_query, selected_cols, precision_filters, license_filters, min_size, max_size, task_columns):
393
+ df_ = load_results()[1].copy()
394
+ if min_size > max_size:
395
+ min_size, max_size = max_size, min_size
396
+ if search_query:
397
+ df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
398
+ if precision_filters:
399
+ include_missing = 'Missing' in precision_filters
400
+ selected_precisions = [p for p in precision_filters if p != 'Missing']
401
+ if include_missing:
402
+ df_ = df_[
403
+ (df_['Precision'].isin(selected_precisions)) |
404
+ (df_['Precision'] == 'UNK') |
405
+ (df_['Precision'].isna())
406
+ ]
407
+ else:
408
+ df_ = df_[df_['Precision'].isin(selected_precisions)]
409
+ if license_filters:
410
+ include_missing = 'Missing' in license_filters
411
+ selected_licenses = [l for l in license_filters if l != 'Missing']
412
+ if include_missing:
413
+ df_ = df_[
414
+ (df_['License'].isin(selected_licenses)) |
415
+ (df_['License'] == 'UNK') |
416
+ (df_['License'].isna())
417
+ ]
418
+ else:
419
+ df_ = df_[df_['License'].isin(selected_licenses)]
420
+ df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
421
+ if 'Rank' in df_.columns:
422
+ df_ = df_.drop(columns=['Rank'])
423
+ if task_columns:
424
+ first_task = task_columns[0]
425
+ df_ = df_.sort_values(by=first_task, ascending=False)
426
+ else:
427
+ df_ = df_.sort_values(by='Model Name', ascending=True)
428
+ df_ = df_.reset_index(drop=True)
429
+ df_.insert(0, 'Rank', range(1, len(df_)+1))
430
+ fixed_column_order = [
431
+ "Rank",
432
+ "Model Name",
433
+ "Question Answering (QA)",
434
+ "Orthographic and Grammatical Analysis",
435
+ "Safety",
436
+ "Reasoning",
437
+ "Revision",
438
+ "License",
439
+ "Precision",
440
+ "Model Size"
441
+ ]
442
+
443
+ selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
444
+ return df_[selected_cols]
445
+
446
+
447
+ def filter_if_df(search_query, selected_cols, family_filters, min_size, max_size):
448
+ """
449
+ Filters the instruction-following dataframe based on various criteria.
450
+ We have removed 'Filter by Type' and 'Filter by Creator'.
451
+ """
452
+ df_ = load_if_data().copy()
453
+ if min_size > max_size:
454
+ min_size, max_size = max_size, min_size
455
+
456
+ # Search by model name
457
+ if search_query:
458
+ df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
459
+
460
+ # Filter by Family only (Creator and Type filters removed)
461
+ if family_filters:
462
+ df_ = df_[df_['Family'].isin(family_filters)]
463
+
464
+ # Filter by Model Size
465
+ df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
466
+
467
+ # Re-rank
468
+ if 'Rank' in df_.columns:
469
+ df_ = df_.drop(columns=['Rank'])
470
+ df_ = df_.reset_index(drop=True)
471
+ df_.insert(0, 'Rank', range(1, len(df_)+1))
472
+
473
+ fixed_column_order = [
474
+ "Rank",
475
+ "Model Name",
476
+ "Creator",
477
+ "Family",
478
+ "Type",
479
+ "Average Accuracy (Ar)",
480
+ "Ar Prompt-lvl",
481
+ "Ar Instruction-lvl",
482
+ "Average Accuracy (En)",
483
+ "En Prompt-lvl",
484
+ "En Instruction-lvl",
485
+ "Size (B)",
486
+ "Base Model",
487
+ "Context Window",
488
+ "Lang."
489
+ ]
490
+
491
+ selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
492
+ return df_[selected_cols]
493
+
494
+
495
  def main():
496
  df_3c3h, df_tasks, task_columns = load_results()
497
+ df_if = load_if_data() # Instruction Following DF
498
 
499
+ # Setup precision/license options for the 3C3H scoreboard
500
  precision_options_3c3h = sorted(df_3c3h['Precision'].dropna().unique().tolist())
501
  precision_options_3c3h = [p for p in precision_options_3c3h if p != 'UNK']
502
  precision_options_3c3h.append('Missing')
 
505
  license_options_3c3h = [l for l in license_options_3c3h if l != 'UNK']
506
  license_options_3c3h.append('Missing')
507
 
508
+ # Setup precision/license options for tasks scoreboard
509
  precision_options_tasks = sorted(df_tasks['Precision'].dropna().unique().tolist())
510
  precision_options_tasks = [p for p in precision_options_tasks if p != 'UNK']
511
  precision_options_tasks.append('Missing')
 
514
  license_options_tasks = [l for l in license_options_tasks if l != 'UNK']
515
  license_options_tasks.append('Missing')
516
 
517
+ # Model size range for 3C3H scoreboard
518
  min_model_size_3c3h = int(df_3c3h['Model Size Filter'].min())
519
  max_model_size_3c3h = int(df_3c3h['Model Size Filter'].max())
520
 
521
+ # Model size range for tasks scoreboard
522
  min_model_size_tasks = int(df_tasks['Model Size Filter'].min())
523
  max_model_size_tasks = int(df_tasks['Model Size Filter'].max())
524
 
525
+ # Column choices for 3C3H
526
+ column_choices_3c3h = [col for col in df_3c3h.columns.tolist() if col != 'Model Size Filter']
527
+
528
+ # Column choices for tasks
529
+ column_choices_tasks = [col for col in df_tasks.columns.tolist() if col != 'Model Size Filter']
530
+
531
+ # Now for instruction-following
532
+ family_options_if = sorted(df_if['Family'].dropna().unique().tolist())
533
+ min_model_size_if = int(df_if['Model Size Filter'].min())
534
+ max_model_size_if = int(df_if['Model Size Filter'].max())
535
+
536
+ #
537
+ # IMPORTANT: Reorder the columns for the Instruction-Following leaderboard
538
+ # Define the full order and the default visible columns separately.
539
+ #
540
+ all_if_columns = [
541
+ "Rank",
542
+ "Model Name",
543
+ "Average Accuracy (Ar)",
544
+ "Ar Prompt-lvl",
545
+ "Ar Instruction-lvl",
546
+ "Average Accuracy (En)",
547
+ "En Prompt-lvl",
548
+ "En Instruction-lvl",
549
+ "Type",
550
+ "Creator",
551
+ "Family",
552
+ "Size (B)",
553
+ "Base Model",
554
+ "Context Window",
555
+ "Lang."
556
+ ]
557
+ default_if_columns = [
558
+ "Rank",
559
+ "Model Name",
560
+ "Average Accuracy (Ar)",
561
+ #"Ar Prompt-lvl",
562
+ #"Ar Instruction-lvl",
563
+ "Average Accuracy (En)"
564
+ ]
565
+
566
  with gr.Blocks() as demo:
567
  gr.HTML(HEADER)
568
+
569
  with gr.Tabs():
570
+ #
571
+ # AL Leaderboards Tab
572
+ #
573
+ with gr.Tab("AL Leaderboards 🏅"):
574
+ # -------------------------
575
+ # Sub-Tab: AraGen Leaderboards
576
+ # -------------------------
577
+ with gr.Tab("🐪 AraGen Leaderboards"):
578
+ with gr.Tabs():
579
+ # 3C3H Scores
580
+ with gr.Tab("3C3H Scores"):
581
+ with gr.Accordion("⚙️ Filters", open=False):
582
+ with gr.Row():
583
+ search_box_3c3h = gr.Textbox(
584
+ placeholder="Search for models...",
585
+ label="Search",
586
+ interactive=True
587
+ )
588
+ with gr.Row():
589
+ column_selector_3c3h = gr.CheckboxGroup(
590
+ choices=column_choices_3c3h,
591
+ value=[
592
+ 'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
593
+ 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
594
+ ],
595
+ label="Select columns to display"
596
+ )
597
+ with gr.Row():
598
+ license_filter_3c3h = gr.CheckboxGroup(
599
+ choices=license_options_3c3h,
600
+ value=license_options_3c3h.copy(),
601
+ label="Filter by License"
602
+ )
603
+ precision_filter_3c3h = gr.CheckboxGroup(
604
+ choices=precision_options_3c3h,
605
+ value=precision_options_3c3h.copy(),
606
+ label="Filter by Precision"
607
+ )
608
+ with gr.Row():
609
+ model_size_min_filter_3c3h = gr.Slider(
610
+ minimum=min_model_size_3c3h,
611
+ maximum=max_model_size_3c3h,
612
+ value=min_model_size_3c3h,
613
+ step=1,
614
+ label="Minimum Model Size",
615
+ interactive=True
616
+ )
617
+ model_size_max_filter_3c3h = gr.Slider(
618
+ minimum=min_model_size_3c3h,
619
+ maximum=max_model_size_3c3h,
620
+ value=max_model_size_3c3h,
621
+ step=1,
622
+ label="Maximum Model Size",
623
+ interactive=True
624
+ )
625
+ leaderboard_3c3h = gr.Dataframe(
626
+ df_3c3h[[
627
  'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
628
  'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
629
+ ]],
630
+ interactive=False
631
  )
632
+ filter_inputs_3c3h = [
633
+ search_box_3c3h, column_selector_3c3h,
634
+ precision_filter_3c3h, license_filter_3c3h,
635
+ model_size_min_filter_3c3h, model_size_max_filter_3c3h
636
+ ]
637
+ search_box_3c3h.submit(filter_df_3c3h, inputs=filter_inputs_3c3h, outputs=leaderboard_3c3h)
638
+ for component in filter_inputs_3c3h:
639
+ component.change(filter_df_3c3h, inputs=filter_inputs_3c3h, outputs=leaderboard_3c3h)
640
+
641
+ # Tasks Scores
642
+ with gr.Tab("Tasks Scores"):
643
+ gr.Markdown("This Table is sorted based on the First Task (Question Answering)")
644
+ with gr.Accordion("⚙️ Filters", open=False):
645
+ with gr.Row():
646
+ search_box_tasks = gr.Textbox(
647
+ placeholder="Search for models...",
648
+ label="Search",
649
+ interactive=True
650
+ )
651
+ with gr.Row():
652
+ column_selector_tasks = gr.CheckboxGroup(
653
+ choices=column_choices_tasks,
654
+ value=['Rank', 'Model Name'] + task_columns,
655
+ label="Select columns to display"
656
+ )
657
+ with gr.Row():
658
+ license_filter_tasks = gr.CheckboxGroup(
659
+ choices=license_options_tasks,
660
+ value=license_options_tasks.copy(),
661
+ label="Filter by License"
662
+ )
663
+ precision_filter_tasks = gr.CheckboxGroup(
664
+ choices=precision_options_tasks,
665
+ value=precision_options_tasks.copy(),
666
+ label="Filter by Precision"
667
+ )
668
+ with gr.Row():
669
+ model_size_min_filter_tasks = gr.Slider(
670
+ minimum=min_model_size_tasks,
671
+ maximum=max_model_size_tasks,
672
+ value=min_model_size_tasks,
673
+ step=1,
674
+ label="Minimum Model Size",
675
+ interactive=True
676
+ )
677
+ model_size_max_filter_tasks = gr.Slider(
678
+ minimum=min_model_size_tasks,
679
+ maximum=max_model_size_tasks,
680
+ value=max_model_size_tasks,
681
+ step=1,
682
+ label="Maximum Model Size",
683
+ interactive=True
684
+ )
685
+ leaderboard_tasks = gr.Dataframe(
686
+ df_tasks[['Rank', 'Model Name'] + task_columns],
687
+ interactive=False
688
  )
689
+ filter_inputs_tasks = [
690
+ search_box_tasks, column_selector_tasks,
691
+ precision_filter_tasks, license_filter_tasks,
692
+ model_size_min_filter_tasks, model_size_max_filter_tasks
693
  ]
694
+ search_box_tasks.submit(
695
+ lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(sq, cols, pf, lf, min_val, max_val, task_columns),
696
+ inputs=filter_inputs_tasks,
697
+ outputs=leaderboard_tasks
698
  )
699
+ for component in filter_inputs_tasks:
700
+ component.change(
701
+ lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(sq, cols, pf, lf, min_val, max_val, task_columns),
702
+ inputs=filter_inputs_tasks,
703
+ outputs=leaderboard_tasks
704
+ )
705
+
706
+ # -------------------------
707
+ # Sub-Tab: Instruction Following Leaderboard
708
+ # -------------------------
709
+ with gr.Tab("🗡️ Instruction Following Leaderboard"):
710
+ with gr.Accordion("⚙️ Filters", open=False):
711
  with gr.Row():
712
+ search_box_if = gr.Textbox(
713
  placeholder="Search for models...",
714
  label="Search",
715
  interactive=True
716
  )
717
  with gr.Row():
718
+ column_selector_if = gr.CheckboxGroup(
719
+ choices=all_if_columns,
720
+ value=default_if_columns,
721
+ label="Select columns to display"
722
  )
723
  with gr.Row():
724
+ family_filter_if = gr.CheckboxGroup(
725
+ choices=family_options_if,
726
+ value=family_options_if.copy(),
727
+ label="Filter by Family"
728
  )
729
  with gr.Row():
730
+ model_size_min_filter_if = gr.Slider(
731
+ minimum=min_model_size_if,
732
+ maximum=max_model_size_if,
733
+ value=min_model_size_if,
734
  step=1,
735
  label="Minimum Model Size",
736
  interactive=True
737
  )
738
+ model_size_max_filter_if = gr.Slider(
739
+ minimum=min_model_size_if,
740
+ maximum=max_model_size_if,
741
+ value=max_model_size_if,
742
  step=1,
743
  label="Maximum Model Size",
744
  interactive=True
745
  )
746
+ leaderboard_if = gr.Dataframe(
747
+ df_if[default_if_columns],
748
+ interactive=False
749
+ )
750
+ filter_inputs_if = [
751
+ search_box_if, column_selector_if,
752
+ family_filter_if,
753
+ model_size_min_filter_if, model_size_max_filter_if
754
+ ]
755
+ search_box_if.submit(filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if)
756
+ for component in filter_inputs_if:
757
+ component.change(filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if)
758
+
759
+ #
760
+ # Submit Tab
761
+ #
762
+ with gr.Tab("Submit Here 📝"):
763
+ df_pending = load_requests('pending')
764
+ df_finished = load_requests('finished')
765
+ df_failed = load_requests('failed')
767
  gr.Markdown(ABOUT_SECTION)
768
+
769
+ gr.Markdown("## Submit Your Model for Evaluation")
770
  with gr.Column():
771
  model_name_input = gr.Textbox(
772
+ label="Model Name",
773
  placeholder="Enter the full model name from HuggingFace Hub (e.g., inceptionai/jais-family-30b-8k)"
774
  )
775
+ revision_input = gr.Textbox(label="Revision", placeholder="main", value="main")
776
  precision_input = gr.Dropdown(
777
+ choices=["float16", "float32", "bfloat16", "8bit", "4bit"],
778
  label="Precision",
779
  value="float16"
780
  )
781
  params_input = gr.Textbox(
782
+ label="Params",
783
  placeholder="Enter the approximate number of parameters as Integer (e.g., 7, 13, 30, 70 ...)"
784
  )
 
785
  license_input = gr.Textbox(
786
+ label="License",
787
+ placeholder="Enter the license type (Generic one is 'Open' in case no License is provided)",
788
  value="Open"
789
  )
790
+ modality_input = gr.Radio(
791
+ choices=["Text"],
792
+ label="Modality",
793
+ value="Text"
794
+ )
795
  submit_button = gr.Button("Submit Model")
796
  submission_result = gr.Markdown()
 
797
  submit_button.click(
798
  submit_model,
799
+ inputs=[
800
+ model_name_input, revision_input, precision_input,
801
+ params_input, license_input, modality_input
802
+ ],
803
  outputs=submission_result
804
  )
805
+
806
+ gr.Markdown("## Evaluation Status")
807
+ with gr.Accordion(f"Pending Evaluations ({len(df_pending)})", open=False):
808
+ if not df_pending.empty:
809
+ gr.Dataframe(df_pending)
810
+ else:
811
+ gr.Markdown("No pending evaluations.")
812
+ with gr.Accordion(f"Finished Evaluations ({len(df_finished)})", open=False):
813
+ if not df_finished.empty:
814
+ gr.Dataframe(df_finished)
815
+ else:
816
+ gr.Markdown("No finished evaluations.")
817
+ with gr.Accordion(f"Failed Evaluations ({len(df_failed)})", open=False):
818
+ if not df_failed.empty:
819
+ gr.Dataframe(df_failed)
820
+ else:
821
+ gr.Markdown("No failed evaluations.")
822
+
823
+ # Citation Section
824
  with gr.Row():
825
  with gr.Accordion("📙 Citation", open=False):
826
  citation_button = gr.Textbox(
827
  value=CITATION_BUTTON_TEXT,
828
  label=CITATION_BUTTON_LABEL,
829
+ lines=8,
830
  elem_id="citation-button",
831
+ show_copy_button=True
832
  )
833
+
834
+ gr.HTML(BOTTOM_LOGO)
835
+
836
+ demo.launch()
837
 
 
838
 
839
  if __name__ == "__main__":
840
  main()