File size: 37,232 Bytes
24a5f16
 
 
 
 
 
 
 
 
 
 
dfa1e52
24a5f16
 
 
dfa1e52
 
 
 
24a5f16
 
 
ed120b4
24a5f16
dfa1e52
24a5f16
dfa1e52
 
 
24a5f16
dfa1e52
24a5f16
dfa1e52
24a5f16
dfa1e52
 
 
 
 
24a5f16
 
 
dfa1e52
24a5f16
 
dfa1e52
24a5f16
ed120b4
dfa1e52
 
 
 
ed120b4
dfa1e52
ed120b4
7206088
dbbe798
dfa1e52
 
 
 
ed120b4
24a5f16
dfa1e52
 
 
 
 
24a5f16
dfa1e52
24a5f16
 
 
 
7206088
 
 
 
 
 
 
 
 
24a5f16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfa1e52
24a5f16
dfa1e52
24a5f16
 
 
 
 
 
 
 
 
 
 
 
 
dfa1e52
24a5f16
dfa1e52
24a5f16
 
 
 
 
 
 
 
 
 
dfa1e52
24a5f16
 
 
 
dfa1e52
24a5f16
 
 
 
 
 
 
 
 
 
 
 
dfa1e52
24a5f16
 
 
 
 
 
 
dfa1e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a5f16
dfa1e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a5f16
 
dfa1e52
 
24a5f16
 
 
 
 
 
 
 
 
 
 
dfa1e52
 
 
 
 
24a5f16
 
 
 
 
dfa1e52
 
 
 
 
24a5f16
 
 
 
 
dfa1e52
 
 
 
 
24a5f16
 
 
 
 
dfa1e52
 
24a5f16
 
 
 
 
 
 
 
dfa1e52
24a5f16
dfa1e52
24a5f16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfa1e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a5f16
 
dfa1e52
24a5f16
dfa1e52
24a5f16
 
 
 
 
 
 
 
dfa1e52
24a5f16
 
 
 
 
 
 
 
dfa1e52
24a5f16
 
 
dfa1e52
24a5f16
 
 
dfa1e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269e9b8
 
dfa1e52
 
 
24a5f16
 
dfa1e52
24a5f16
dfa1e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a5f16
 
dfa1e52
 
24a5f16
dfa1e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a5f16
dfa1e52
 
 
 
24a5f16
dfa1e52
 
 
 
24a5f16
dfa1e52
 
 
 
 
 
 
 
 
 
 
 
24a5f16
dfa1e52
24a5f16
 
 
 
 
dfa1e52
 
 
 
24a5f16
 
dfa1e52
 
 
 
24a5f16
 
dfa1e52
 
 
 
24a5f16
 
 
 
dfa1e52
 
 
 
24a5f16
 
 
 
dfa1e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a5f16
dfa1e52
 
24a5f16
 
dfa1e52
24a5f16
 
dfa1e52
24a5f16
dfa1e52
24a5f16
 
 
 
dfa1e52
24a5f16
 
 
dfa1e52
 
24a5f16
 
dfa1e52
 
 
 
 
24a5f16
 
 
 
dfa1e52
 
 
 
24a5f16
 
dfa1e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24a5f16
 
 
 
 
dfa1e52
24a5f16
dfa1e52
24a5f16
dfa1e52
 
 
 
24a5f16
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
import json
import os
import re

import gradio as gr
import numpy as np
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download


# Hugging Face namespace used to build DATASET_REPO_ID.
OWNER = "inceptionai"
# Dataset repository id: submit_model uploads evaluation-request files to it and
# load_requests lists and downloads them from it.
DATASET_REPO_ID = f"{OWNER}/requests-dataset"


# HTML snippet with the centered page title and subtitle.
HEADER = """
<center>
<br></br>
<h1>Arabic Leaderboards</h1>
<h2>Comprehensive Evaluation of Arabic Large Language Models</h2>
<br></br>
</center>
"""

# Markdown text of the "About" section.
# NOTE(review): the mailto target at the end reads "[email protected]", which looks
# like a placeholder left behind by e-mail obfuscation rather than a real
# address; confirm and restore the intended address.
ABOUT_SECTION = """
## About

In our `12-24` release, we introduced the `AraGen Benchmark`, along with the `3C3H` evaluation measure (aka the 3C3H Score). You can find more details about AraGen and 3C3H, [here](https://huggingface.co/blog/leaderboard-3c3h-aragen). And you can find the first version of the benchmark, `AraGen-12-24` [here](https://huggingface.co/datasets/inceptionai/AraGen). Building on that foundation, and as part of this new release, we have expanded this space to incorporate additional tasks and evaluation metrics.

In this release, we present two leaderboards:

**AraGen-03-25 (v2):**

- The AraGen Benchmark is designed to evaluate and compare the performance of Chat/Instruct Arabic Large Language Models on a suite of generative tasks that are culturally relevant to the Arab region, history, politics, cuisine ... etc. By leveraging **3C3H** as an evaluation metric—which assesses a model's output across six dimensions: Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness—the leaderboard offers a comprehensive and holistic evaluation of a model’s chat capabilities and its ability to generate human-like and ethically responsible content.

**Instruction Following:**

- We have established a robust leaderboard that benchmarks models on Arabic and English instruction following, offering an open and comparative performance landscape for the research community. Concurrently, we released the first publicly available Arabic [dataset](https://huggingface.co/datasets/inceptionai/Arabic_IFEval) aimed at evaluating LLMs' ability to follow instructions. The Arabic IFEval samples are meticulously curated to capture the language’s unique nuances—such as diacritization and distinctive phonetic features—often overlooked in generic datasets. Our dedicated linguistic team generated original samples and adapted selections from the IFEval English dataset, ensuring that the material resonates with Arabic cultural contexts and meets the highest standards of authenticity and quality.

### Why Focus on Chat Models?

Our evaluations are conducted in a generative mode, meaning that we expect models to produce complete, context-rich responses rather than simply predicting the next token as base models do. This approach not only yields results that are more explainable and nuanced compared to logit-based measurements, but it also captures elements like creativity, coherence, and ethical considerations—providing deeper insights into overall model performance.

### Contact

For inquiries or assistance, please join the conversation on our [Discussions Tab](https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/discussions) or reach out via [email](mailto:[email protected]).
"""

# HTML <img> tag: a half-width, horizontally centered image with rounded corners.
BOTTOM_LOGO = """<img src="https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/resolve/main/assets/pictures/03-25/arabic-leaderboards-colab-march-preview-free-3.png" style="width:50%;display:block;margin-left:auto;margin-right:auto;border-radius:15px;">"""

# BibTeX entry for citing the Arabic-Leaderboards Space.
# NOTE(review): `howpublished` contains "url{...}" without a leading backslash;
# the usual BibTeX macro is \url, so confirm whether the backslash was lost.
CITATION_BUTTON_TEXT = """
@misc{Arabic-Leaderboards,
  author = {El Filali, Ali and Albarri, Sarah and Abouelseoud, Arwa and Kamboj, Samta and Sengupta, Neha and Nakov, Preslav},
  title = {Arabic-Leaderboards: Comprehensive Evaluation of Arabic Large Language Models},
  year = {2025},
  publisher = {Inception},
  howpublished = "url{https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards}"
}
"""

# Caption telling the reader what the citation snippet is for.
CITATION_BUTTON_LABEL = """
Copy the following snippet to cite the results from all Arabic Leaderboards in this Space.
"""


def _to_percent(value):
    """Scale a 0-1 score to 0-100; anything non-numeric (e.g. a JSON null) becomes NaN."""
    if isinstance(value, (int, float)):
        return value * 100
    return np.nan


def load_results():
    """
    Loads the AraGen v2 results from assets/results/aragen_v2_results.json
    (resolved relative to this file) and builds the two leaderboard tables.

    Entries that consist solely of a '_last_sync_timestamp' key are ignored.
    Scores are multiplied by 100 and rounded to 4 decimals. A 'Params' value
    that cannot be converted to float is stored as infinity in 'Model Size'
    and as 1000 in 'Model Size Filter'.

    Returns a tuple (df_3c3h, df_tasks, task_columns):
    1) df_3c3h with columns for 3C3H scores, sorted by '3C3H Score' (descending)
    2) df_tasks with columns for tasks scores, sorted by the first task column
       (descending), or by 'Model Name' when there are no task columns
    3) task_columns, the list of task score column names found in df_tasks

    Both dataframes carry a 1-based 'Rank' column reflecting their sort order.
    An empty results file yields empty dataframes that still have the
    expected columns.
    """
    meta_columns = ['Model Name', 'Revision', 'License', 'Precision', 'Model Size']
    score_columns_3c3h = ['3C3H Score', 'Correctness', 'Completeness', 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness']
    max_model_size_value = 1000

    current_dir = os.path.dirname(os.path.abspath(__file__))
    results_file = os.path.join(current_dir, "assets", "results", "aragen_v2_results.json")

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(results_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Filter out any entries that only contain '_last_sync_timestamp'
    data = [
        entry for entry in data
        if not (len(entry) == 1 and "_last_sync_timestamp" in entry)
    ]

    data_3c3h = []
    data_tasks = []
    for model_data in data:
        # `or {}` also covers keys that are present but null.
        meta = model_data.get('Meta') or {}
        try:
            model_size_numeric = float(meta.get('Params', 'UNK'))
        except (ValueError, TypeError):
            model_size_numeric = np.inf

        scores_data = model_data.get('claude-3.5-sonnet Scores') or {}
        scores_3c3h = {k: _to_percent(v) for k, v in (scores_data.get('3C3H Scores') or {}).items()}
        scores_tasks = {k: _to_percent(v) for k, v in (scores_data.get('Tasks Scores') or {}).items()}

        shared = {
            'Model Name': meta.get('Model Name', 'UNK'),
            'Revision': meta.get('Revision', 'UNK'),
            'License': meta.get('License', 'UNK'),
            'Precision': meta.get('Precision', 'UNK'),
            'Model Size': model_size_numeric,
        }
        data_3c3h.append({**shared, **{col: scores_3c3h.get(col, np.nan) for col in score_columns_3c3h}})
        data_tasks.append({**shared, **scores_tasks})

    # Passing the column list keeps the schema stable even when there are no rows.
    df_3c3h = pd.DataFrame(data_3c3h, columns=meta_columns + score_columns_3c3h)
    df_tasks = pd.DataFrame(data_tasks)
    if df_tasks.empty:
        df_tasks = pd.DataFrame(columns=meta_columns)

    df_3c3h[score_columns_3c3h] = df_3c3h[score_columns_3c3h].round(4)
    df_3c3h['Model Size Filter'] = df_3c3h['Model Size'].replace(np.inf, max_model_size_value)
    df_3c3h = df_3c3h.sort_values(by='3C3H Score', ascending=False)
    df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))

    # Every column that is not metadata is a task score column.
    non_task_columns = set(meta_columns) | {'Model Size Filter'}
    task_columns = [col for col in df_tasks.columns if col not in non_task_columns]
    if task_columns:
        df_tasks[task_columns] = df_tasks[task_columns].round(4)

    df_tasks['Model Size Filter'] = df_tasks['Model Size'].replace(np.inf, max_model_size_value)

    if task_columns:
        df_tasks = df_tasks.sort_values(by=task_columns[0], ascending=False)
    else:
        df_tasks = df_tasks.sort_values(by='Model Name', ascending=True)
    df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))

    return df_3c3h, df_tasks, task_columns


def load_if_data():
    """
    Loads the instruction-following data from assets/results/ifeval_results.jsonl
    (resolved relative to this file) and returns a dataframe with relevant columns,
    converting decimal values to percentage format.

    The four "<En|Ar> <Prompt|Instruction>-lvl" columns are coerced to numbers
    (unparseable values become NaN), averaged per language into
    "Average Accuracy (En)" / "Average Accuracy (Ar)", then all six are
    multiplied by 100 and rounded to one decimal.

    "Model Size" is "Size (B)" as a float, or infinity when it cannot be
    converted; "Model Size Filter" replaces that infinity with 1000.

    Rows are sorted by "Average Accuracy (Ar)" (descending) and numbered in a
    1-based "Rank" column.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    results_file = os.path.join(current_dir, "assets", "results", "ifeval_results.jsonl")

    # One JSON object per line; blank lines are skipped.
    with open(results_file, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        data = [json.loads(line) for line in stripped_lines if line]

    df = pd.DataFrame(data)

    # Convert numeric columns
    numeric_cols = ["En Prompt-lvl", "En Instruction-lvl", "Ar Prompt-lvl", "Ar Instruction-lvl"]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Compute average accuracy for En and Ar (on the raw 0-1 values)
    df["Average Accuracy (En)"] = (df["En Prompt-lvl"] + df["En Instruction-lvl"]) / 2
    df["Average Accuracy (Ar)"] = (df["Ar Prompt-lvl"] + df["Ar Instruction-lvl"]) / 2

    # Convert them to percentage format (e.g., 0.871 -> 87.1)
    for col in numeric_cols + ["Average Accuracy (En)", "Average Accuracy (Ar)"]:
        df[col] = (df[col] * 100).round(1)

    # Handle size as numeric
    def parse_size(x):
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            # Not a usable number (e.g. a placeholder string or null): treat as unbounded.
            return np.inf

    df["Model Size"] = df["Size (B)"].apply(parse_size)

    # Add a filter column for size
    max_model_size_value = 1000
    df["Model Size Filter"] = df["Model Size"].replace(np.inf, max_model_size_value)

    # Sort by "Average Accuracy (Ar)" and rank in that order
    df = df.sort_values(by="Average Accuracy (Ar)", ascending=False)
    df = df.reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))

    return df


def _matches_submission(df, columns, model_name, revision, precision):
    """
    Tells whether `df` already has a row for this model/revision/precision.

    `columns` is the (model name, revision, precision) column-name triple to
    compare against. A dataframe that is empty or lacks any of those columns
    has no match. A `precision` of None matches null cells and the 'UNK'
    placeholder, the same way the leaderboard filters treat 'Missing'.
    """
    name_col, revision_col, precision_col = columns
    if df.empty or not set(columns).issubset(df.columns):
        return False
    if precision is None:
        precision_match = df[precision_col].isna() | (df[precision_col] == 'UNK')
    else:
        precision_match = df[precision_col] == precision
    same_model = (df[name_col] == model_name) & (df[revision_col] == revision)
    return bool((same_model & precision_match).any())


def submit_model(model_name, revision, precision, params, license, modality):
    """
    Queues a model for evaluation by uploading a request file to the requests dataset.

    Steps, each of which can end the submission early with a message:
    1) `model_name` must look like '<org>/<model>' (checked first, before any
       file or network access).
    2) The model/revision/precision triple must not already be in the AraGen
       results, nor in the 'pending' or 'finished' request folders.
    3) The model must be retrievable through `HfApi.model_info`.
    4) A JSON request with status "PENDING" is uploaded to
       pending/<org>/<model>_eval_request_<revision>_<precision>.json,
       authenticating with the HF_API_TOKEN environment variable if set.

    Args:
        model_name: Hub model id including the organization or username.
        revision: Model revision, stored as given.
        precision: Precision label; 'Missing' (or None) is stored as null,
            anything else is stripped and lower-cased.
        params, license, modality: Stored verbatim in the request.

    Returns:
        A Markdown (bold) status message describing the outcome. Failures are
        reported through the message rather than raised.
    """
    # Validate the name format up front so a malformed id costs no I/O.
    org_model = (model_name or '').split('/')
    if len(org_model) != 2 or not all(org_model):
        return "**Please enter the full model name including the organization or username, e.g., 'inceptionai/jais-family-30b-8k'**"
    org, model_id = org_model

    if precision is None or precision == 'Missing':
        precision = None
    else:
        precision = precision.strip().lower()

    already_evaluated = f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"

    df_3c3h, _, _ = load_results()
    if _matches_submission(df_3c3h, ('Model Name', 'Revision', 'Precision'), model_name, revision, precision):
        return already_evaluated

    # Request folders are only fetched when the cheaper checks have passed.
    request_columns = ('model_name', 'revision', 'precision')
    if _matches_submission(load_requests('pending'), request_columns, model_name, revision, precision):
        return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' is already in the pending evaluations.**"
    if _matches_submission(load_requests('finished'), request_columns, model_name, revision, precision):
        return already_evaluated

    api = HfApi()
    try:
        api.model_info(model_name)
    except Exception:
        return f"**Error: Could not find model '{model_name}' on HuggingFace Hub. Please ensure the model name is correct and the model is public.**"

    submission = {
        "model_name": model_name,
        "license": license,
        "revision": revision,
        "precision": precision,
        "params": params,
        "status": "PENDING",
        "modality": modality
    }
    submission_json = json.dumps(submission, indent=2)

    precision_str = precision if precision else 'Missing'
    file_path_in_repo = f"pending/{org}/{model_id}_eval_request_{revision}_{precision_str}.json"

    try:
        hf_api_token = os.environ.get('HF_API_TOKEN', None)
        api.upload_file(
            path_or_fileobj=submission_json.encode('utf-8'),
            path_in_repo=file_path_in_repo,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            token=hf_api_token
        )
    except Exception as e:
        return f"**Error: Could not submit the model. {str(e)}**"

    return f"**Model '{model_name}' has been submitted for evaluation.**"


def load_requests(status_folder):
    """
    Gathers the evaluation requests stored as .json files under
    `status_folder` (e.g. 'pending' or 'finished') in the requests dataset.

    The repository is accessed with the HF_API_TOKEN environment variable when
    it is set. Returns a dataframe with one row per request file that could be
    downloaded and parsed; files that fail are reported on stdout and skipped.
    If the repository listing itself fails, the error is printed and an empty
    dataframe is returned.
    """
    hub = HfApi()
    token = os.environ.get('HF_API_TOKEN', None)

    try:
        repo_files = hub.list_repo_files(
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            token=token
        )
    except Exception as e:
        print(f"Error accessing dataset repository: {e}")
        return pd.DataFrame()

    prefix = f"{status_folder}/"
    wanted = [path for path in repo_files if path.startswith(prefix) and path.endswith('.json')]

    collected = []
    for file_path in wanted:
        try:
            cached_copy = hf_hub_download(
                repo_id=DATASET_REPO_ID,
                filename=file_path,
                repo_type="dataset",
                token=token
            )
            with open(cached_copy, 'r') as handle:
                payload = json.load(handle)
        except Exception as e:
            print(f"Error loading file {file_path}: {e}")
        else:
            collected.append(payload)

    return pd.DataFrame(collected)


def filter_df_3c3h(search_query, selected_cols, precision_filters, license_filters, min_size, max_size):
    """
    Returns the 3C3H leaderboard restricted to the rows and columns requested.

    Args:
        search_query: Text matched case-insensitively against 'Model Name'.
            It is used as a regular expression when it is a valid one and as
            literal text otherwise (so inputs like "c++" or "(" do not fail).
        selected_cols: Column names to show.
        precision_filters / license_filters: Accepted values; the special
            entry 'Missing' also accepts 'UNK' and null cells. An empty
            selection disables that filter.
        min_size, max_size: Inclusive bounds on 'Model Size Filter'; swapped
            if given in the wrong order.

    Returns:
        The filtered dataframe, re-ranked from 1 in its existing order, with
        the selected columns arranged in a fixed display order.
    """
    df_ = load_results()[0].copy()
    if min_size > max_size:
        min_size, max_size = max_size, min_size

    if search_query:
        try:
            re.compile(search_query)
            as_regex = True
        except re.error:
            # Not a valid pattern: fall back to a plain substring search.
            as_regex = False
        df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False, regex=as_regex)]

    for column, chosen in (('Precision', precision_filters), ('License', license_filters)):
        if not chosen:
            continue
        keep = df_[column].isin([value for value in chosen if value != 'Missing'])
        if 'Missing' in chosen:
            keep = keep | (df_[column] == 'UNK') | df_[column].isna()
        df_ = df_[keep]

    df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]

    # Ranks are recomputed over the rows that survived the filters.
    if 'Rank' in df_.columns:
        df_ = df_.drop(columns=['Rank'])
    df_ = df_.reset_index(drop=True)
    df_.insert(0, 'Rank', range(1, len(df_)+1))
    fixed_column_order = [
        "Rank",
        "Model Name",
        "3C3H Score",
        "Correctness",
        "Completeness",
        "Conciseness",
        "Helpfulness",
        "Honesty",
        "Harmlessness",
        "Revision",
        "License",
        "Precision",
        "Model Size"
    ]

    selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]

    return df_[selected_cols]


def filter_df_tasks(search_query, selected_cols, precision_filters, license_filters, min_size, max_size, task_columns):
    """
    Returns the tasks leaderboard restricted to the rows and columns requested.

    Args:
        search_query: Text matched case-insensitively against 'Model Name'.
            It is used as a regular expression when it is a valid one and as
            literal text otherwise (so inputs like "c++" or "(" do not fail).
        selected_cols: Column names to show.
        precision_filters / license_filters: Accepted values; the special
            entry 'Missing' also accepts 'UNK' and null cells. An empty
            selection disables that filter.
        min_size, max_size: Inclusive bounds on 'Model Size Filter'; swapped
            if given in the wrong order.
        task_columns: Task score column names; the first one is the sort key
            (descending). When empty, rows are sorted by 'Model Name'.

    Returns:
        The filtered, sorted dataframe, re-ranked from 1, with the selected
        columns arranged in a fixed display order.
    """
    df_ = load_results()[1].copy()
    if min_size > max_size:
        min_size, max_size = max_size, min_size

    if search_query:
        try:
            re.compile(search_query)
            as_regex = True
        except re.error:
            # Not a valid pattern: fall back to a plain substring search.
            as_regex = False
        df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False, regex=as_regex)]

    for column, chosen in (('Precision', precision_filters), ('License', license_filters)):
        if not chosen:
            continue
        keep = df_[column].isin([value for value in chosen if value != 'Missing'])
        if 'Missing' in chosen:
            keep = keep | (df_[column] == 'UNK') | df_[column].isna()
        df_ = df_[keep]

    df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]

    # Ranks are recomputed over the rows that survived the filters.
    if 'Rank' in df_.columns:
        df_ = df_.drop(columns=['Rank'])
    if task_columns:
        first_task = task_columns[0]
        df_ = df_.sort_values(by=first_task, ascending=False)
    else:
        df_ = df_.sort_values(by='Model Name', ascending=True)
    df_ = df_.reset_index(drop=True)
    df_.insert(0, 'Rank', range(1, len(df_)+1))
    fixed_column_order = [
        "Rank",
        "Model Name",
        "Question Answering (QA)",
        "Orthographic and Grammatical Analysis",
        "Safety",
        "Reasoning",
        "Revision",
        "License",
        "Precision",
        "Model Size"
    ]

    selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
    return df_[selected_cols]


def filter_if_df(search_query, selected_cols, family_filters, min_size, max_size):
    """
    Filters the instruction-following dataframe based on various criteria.
    We have removed 'Filter by Type' and 'Filter by Creator'.

    Args:
        search_query: Text matched case-insensitively against 'Model Name'.
            It is used as a regular expression when it is a valid one and as
            literal text otherwise (so inputs like "c++" or "(" do not fail).
        selected_cols: Column names to show.
        family_filters: Accepted 'Family' values; an empty selection disables
            the filter.
        min_size, max_size: Inclusive bounds on 'Model Size Filter'; swapped
            if given in the wrong order.

    Returns:
        The filtered dataframe, re-ranked from 1 in its existing order, with
        the selected columns arranged in a fixed display order.
    """
    df_ = load_if_data().copy()
    if min_size > max_size:
        min_size, max_size = max_size, min_size

    # Search by model name
    if search_query:
        try:
            re.compile(search_query)
            as_regex = True
        except re.error:
            # Not a valid pattern: fall back to a plain substring search.
            as_regex = False
        df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False, regex=as_regex)]

    # Filter by Family only (Creator and Type filters removed)
    if family_filters:
        df_ = df_[df_['Family'].isin(family_filters)]

    # Filter by Model Size
    df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]

    # Re-rank over the rows that survived the filters
    if 'Rank' in df_.columns:
        df_ = df_.drop(columns=['Rank'])
    df_ = df_.reset_index(drop=True)
    df_.insert(0, 'Rank', range(1, len(df_)+1))

    fixed_column_order = [
        "Rank",
        "Model Name",
        "Creator",
        "Family",
        "Type",
        "Average Accuracy (Ar)",
        "Ar Prompt-lvl",
        "Ar Instruction-lvl",
        "Average Accuracy (En)",
        "En Prompt-lvl",
        "En Instruction-lvl",
        "Size (B)",
        "Base Model",
        "Context Window",
        "Lang."
    ]

    selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
    return df_[selected_cols]


def _filter_choices(df, column):
    """Return the sorted distinct non-null values of ``column`` without 'UNK', followed by 'Missing'."""
    choices = [
        value
        for value in sorted(df[column].dropna().unique().tolist())
        if value != 'UNK'
    ]
    choices.append('Missing')
    return choices


def _size_bounds(df):
    """Return integer ``(lower, upper)`` slider bounds covering 'Model Size Filter'."""
    import math

    sizes = df['Model Size Filter']
    # Round outward: the sliders step by 1 and filter_if_df compares sizes
    # inclusively, so truncating a fractional maximum would push the
    # largest model outside the default slider range. Whole-number sizes
    # give the same bounds as a plain int() conversion.
    return math.floor(sizes.min()), math.ceil(sizes.max())


def _search_box():
    """Create the model search textbox in the current layout context."""
    return gr.Textbox(
        placeholder="Search for models...",
        label="Search",
        interactive=True
    )


def _license_precision_filters(license_options, precision_options):
    """Create the license and precision checkbox groups, all options pre-selected."""
    license_filter = gr.CheckboxGroup(
        choices=license_options,
        value=license_options.copy(),
        label="Filter by License"
    )
    precision_filter = gr.CheckboxGroup(
        choices=precision_options,
        value=precision_options.copy(),
        label="Filter by Precision"
    )
    return license_filter, precision_filter


def _size_range_sliders(lower, upper):
    """Create the minimum/maximum model size sliders, initially spanning the full range."""
    min_slider = gr.Slider(
        minimum=lower,
        maximum=upper,
        value=lower,
        step=1,
        label="Minimum Model Size",
        interactive=True
    )
    max_slider = gr.Slider(
        minimum=lower,
        maximum=upper,
        value=upper,
        step=1,
        label="Maximum Model Size",
        interactive=True
    )
    return min_slider, max_slider


def _bind_filter_events(search_box, inputs, handler, output):
    """Run ``handler`` on search submit and on a change of any input, writing to ``output``."""
    search_box.submit(handler, inputs=inputs, outputs=output)
    for component in inputs:
        component.change(handler, inputs=inputs, outputs=output)


def _request_status_accordion(title, df_requests, empty_message):
    """Create a collapsed accordion listing ``df_requests``, or ``empty_message`` if it is empty."""
    with gr.Accordion(f"{title} ({len(df_requests)})", open=False):
        if not df_requests.empty:
            gr.Dataframe(df_requests)
        else:
            gr.Markdown(empty_message)


def main():
    """
    Build the Gradio leaderboard app and launch it.

    Loads the 3C3H, tasks and instruction-following dataframes, derives the
    filter options and slider ranges from them, then lays out:

    * the leaderboards tab (3C3H scores, tasks scores, instruction following),
      each with a collapsed filter accordion wired to its filter function;
    * the submission tab with the submission form and the pending / finished /
      failed request tables;
    * a citation accordion and the bottom logo.

    The emoji in the tab and accordion labels are written as real Unicode
    characters; they had been stored as mis-decoded byte sequences.
    """
    df_3c3h, df_tasks, task_columns = load_results()
    df_if = load_if_data()  # Instruction Following DF

    # Precision/license options for the 3C3H and tasks scoreboards
    precision_options_3c3h = _filter_choices(df_3c3h, 'Precision')
    license_options_3c3h = _filter_choices(df_3c3h, 'License')
    precision_options_tasks = _filter_choices(df_tasks, 'Precision')
    license_options_tasks = _filter_choices(df_tasks, 'License')

    # Model size slider ranges
    min_model_size_3c3h, max_model_size_3c3h = _size_bounds(df_3c3h)
    min_model_size_tasks, max_model_size_tasks = _size_bounds(df_tasks)

    # Column choices: every column except the internal size-filter helper column
    column_choices_3c3h = [col for col in df_3c3h.columns.tolist() if col != 'Model Size Filter']
    column_choices_tasks = [col for col in df_tasks.columns.tolist() if col != 'Model Size Filter']

    # Columns shown before any filter is touched
    default_3c3h_columns = [
        'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
        'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
    ]
    default_tasks_columns = ['Rank', 'Model Name'] + task_columns

    # Now for instruction-following
    family_options_if = sorted(df_if['Family'].dropna().unique().tolist())
    min_model_size_if, max_model_size_if = _size_bounds(df_if)

    #
    # IMPORTANT: Reorder the columns for the Instruction-Following leaderboard
    # Define the full order and the default visible columns separately.
    #
    all_if_columns = [
        "Rank",
        "Model Name",
        "Average Accuracy (Ar)",
        "Ar Prompt-lvl",
        "Ar Instruction-lvl",
        "Average Accuracy (En)",
        "En Prompt-lvl",
        "En Instruction-lvl",
        "Type",
        "Creator",
        "Family",
        "Size (B)",
        "Base Model",
        "Context Window",
        "Lang."
    ]
    default_if_columns = [
        "Rank",
        "Model Name",
        "Average Accuracy (Ar)",
        "Ar Prompt-lvl",
        "Ar Instruction-lvl",
        "Average Accuracy (En)"
    ]

    with gr.Blocks() as demo:
        gr.HTML(HEADER)

        with gr.Tabs():
            #
            # AL Leaderboards Tab
            #
            with gr.Tab("AL Leaderboards 🏅"):
                # -------------------------
                # Sub-Tab: AraGen Leaderboards
                # -------------------------
                with gr.Tab("🐪 AraGen Leaderboards"):
                    with gr.Tabs():
                        # 3C3H Scores
                        with gr.Tab("3C3H Scores"):
                            with gr.Accordion("⚙️ Filters", open=False):
                                with gr.Row():
                                    search_box_3c3h = _search_box()
                                with gr.Row():
                                    column_selector_3c3h = gr.CheckboxGroup(
                                        choices=column_choices_3c3h,
                                        value=default_3c3h_columns,
                                        label="Select columns to display"
                                    )
                                with gr.Row():
                                    license_filter_3c3h, precision_filter_3c3h = _license_precision_filters(
                                        license_options_3c3h, precision_options_3c3h
                                    )
                                with gr.Row():
                                    model_size_min_filter_3c3h, model_size_max_filter_3c3h = _size_range_sliders(
                                        min_model_size_3c3h, max_model_size_3c3h
                                    )
                            leaderboard_3c3h = gr.Dataframe(
                                df_3c3h[default_3c3h_columns],
                                interactive=False
                            )
                            # Order matches the positional parameters of filter_df_3c3h.
                            filter_inputs_3c3h = [
                                search_box_3c3h, column_selector_3c3h,
                                precision_filter_3c3h, license_filter_3c3h,
                                model_size_min_filter_3c3h, model_size_max_filter_3c3h
                            ]
                            _bind_filter_events(
                                search_box_3c3h, filter_inputs_3c3h,
                                filter_df_3c3h, leaderboard_3c3h
                            )

                        # Tasks Scores
                        with gr.Tab("Tasks Scores"):
                            gr.Markdown("This Table is sorted based on the First Task (Question Answering)")
                            with gr.Accordion("⚙️ Filters", open=False):
                                with gr.Row():
                                    search_box_tasks = _search_box()
                                with gr.Row():
                                    column_selector_tasks = gr.CheckboxGroup(
                                        choices=column_choices_tasks,
                                        value=default_tasks_columns,
                                        label="Select columns to display"
                                    )
                                with gr.Row():
                                    license_filter_tasks, precision_filter_tasks = _license_precision_filters(
                                        license_options_tasks, precision_options_tasks
                                    )
                                with gr.Row():
                                    model_size_min_filter_tasks, model_size_max_filter_tasks = _size_range_sliders(
                                        min_model_size_tasks, max_model_size_tasks
                                    )
                            leaderboard_tasks = gr.Dataframe(
                                df_tasks[default_tasks_columns],
                                interactive=False
                            )
                            filter_inputs_tasks = [
                                search_box_tasks, column_selector_tasks,
                                precision_filter_tasks, license_filter_tasks,
                                model_size_min_filter_tasks, model_size_max_filter_tasks
                            ]
                            # filter_df_tasks takes task_columns as an extra trailing
                            # argument that is not a UI input, so it is bound here.
                            _bind_filter_events(
                                search_box_tasks, filter_inputs_tasks,
                                lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(
                                    sq, cols, pf, lf, min_val, max_val, task_columns
                                ),
                                leaderboard_tasks
                            )

                # -------------------------
                # Sub-Tab: Instruction Following Leaderboard
                # -------------------------
                with gr.Tab("🗡️ Instruction Following Leaderboard"):
                    with gr.Accordion("⚙️ Filters", open=False):
                        with gr.Row():
                            search_box_if = _search_box()
                        with gr.Row():
                            column_selector_if = gr.CheckboxGroup(
                                choices=all_if_columns,
                                value=default_if_columns,
                                label="Select columns to display"
                            )
                        with gr.Row():
                            family_filter_if = gr.CheckboxGroup(
                                choices=family_options_if,
                                value=family_options_if.copy(),
                                label="Filter by Family"
                            )
                        with gr.Row():
                            model_size_min_filter_if, model_size_max_filter_if = _size_range_sliders(
                                min_model_size_if, max_model_size_if
                            )
                    leaderboard_if = gr.Dataframe(
                        df_if[default_if_columns],
                        interactive=False
                    )
                    # Order matches the positional parameters of filter_if_df.
                    filter_inputs_if = [
                        search_box_if, column_selector_if,
                        family_filter_if,
                        model_size_min_filter_if, model_size_max_filter_if
                    ]
                    _bind_filter_events(
                        search_box_if, filter_inputs_if,
                        filter_if_df, leaderboard_if
                    )

            #
            # Submit Tab
            #
            with gr.Tab("Submit Here 📝"):
                # Request tables are loaded once, while the UI is being built.
                df_pending = load_requests('pending')
                df_finished = load_requests('finished')
                df_failed = load_requests('failed')

                gr.Markdown(ABOUT_SECTION)

                gr.Markdown("## Submit Your Model for Evaluation")
                with gr.Column():
                    model_name_input = gr.Textbox(
                        label="Model Name",
                        placeholder="Enter the full model name from HuggingFace Hub (e.g., inceptionai/jais-family-30b-8k)"
                    )
                    revision_input = gr.Textbox(label="Revision", placeholder="main", value="main")
                    precision_input = gr.Dropdown(
                        choices=["float16", "float32", "bfloat16", "8bit", "4bit"],
                        label="Precision",
                        value="float16"
                    )
                    params_input = gr.Textbox(
                        label="Params",
                        placeholder="Enter the approximate number of parameters as Integer (e.g., 7, 13, 30, 70 ...)"
                    )
                    license_input = gr.Textbox(
                        label="License",
                        placeholder="Enter the license type (Generic one is 'Open' in case no License is provided)",
                        value="Open"
                    )
                    modality_input = gr.Radio(
                        choices=["Text"],
                        label="Modality",
                        value="Text"
                    )
                    submit_button = gr.Button("Submit Model")
                    submission_result = gr.Markdown()
                    submit_button.click(
                        submit_model,
                        inputs=[
                            model_name_input, revision_input, precision_input,
                            params_input, license_input, modality_input
                        ],
                        outputs=submission_result
                    )

                gr.Markdown("## Evaluation Status")
                _request_status_accordion("Pending Evaluations", df_pending, "No pending evaluations.")
                _request_status_accordion("Finished Evaluations", df_finished, "No finished evaluations.")
                _request_status_accordion("Failed Evaluations", df_failed, "No failed evaluations.")

            # Citation Section
            with gr.Row():
                with gr.Accordion("📙 Citation", open=False):
                    gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        lines=8,
                        elem_id="citation-button",
                        show_copy_button=True
                    )

        gr.HTML(BOTTOM_LOGO)

    # Launch once the layout is complete and the Blocks context has closed.
    demo.launch()


# Script entry point: build and launch the app only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()