from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # Each Task: the task key in the results JSON, the metric key in the results JSON, and the column name to display in the leaderboard
    task0 = Task("Math", "acc", "Math")
    task1 = Task("Chemistry", "acc", "Chemistry")
    task2 = Task("Physics", "acc", "Physics")
    task3 = Task("Arabic", "acc", "Arabic")
    task4 = Task("English", "acc", "English")
    task5 = Task("Religion", "acc", "Religion")
    task6 = Task("Persian Literature", "acc", "Persian Literature")
NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------
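# Illustrative helper, not part of the original template (assumption: result
# files follow the common Hugging Face leaderboard layout, e.g.
# {"results": {"Math": {"acc": 0.87}, ...}}). It shows how the Task fields
# above map a results JSON onto leaderboard columns.
def extract_scores(results_path: str) -> dict[str, float]:
    """Map each leaderboard column name to its accuracy from a results file."""
    import json

    with open(results_path) as f:
        data = json.load(f)
    scores = {}
    for task in Tasks:
        entry = data.get("results", {}).get(task.value.benchmark, {})
        # Missing metrics are reported as NaN rather than silently dropped.
        scores[task.value.col_name] = entry.get(task.value.metric, float("nan"))
    return scores
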



# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">IRUEX Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
**Welcome to the IRUEX Leaderboard!**  
This platform evaluates large language models on subjects from Iran's University Entrance Exam.  
[Explore the IRUEX Dataset](https://github.com/hamedkhaledi/IRUEX-dataset) on GitHub.
"""

# Which evaluations are you running? How can people reproduce them?
LLM_BENCHMARKS_TEXT = """
## Evaluation Process

We assess models on seven subjects: Math, Chemistry, Physics, Arabic, English, Religion, and Persian Literature. Each model's performance is reported as per-subject accuracy.

## Reproducibility

To reproduce our results, run the following command:

```bash
# Example command to run evaluations
python evaluate_model.py --model_name your_model_name --task Math --num_fewshot 0
```
"""

EVALUATION_QUEUE_TEXT = ""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@inproceedings{khademi-khaledi-faili-2025-iruex,
    title = "{IRUEX}: A Study on Large Language Models' Problem-Solving Skills in Iran's University Entrance Exam",
    author = "Khademi Khaledi, Hamed and Faili, Heshaam",
    editor = "Rambow, Owen and Wanner, Leo and Apidianaki, Marianna and Al-Khalifa, Hend and Di Eugenio, Barbara and Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.434/",
    pages = "6505--6519",
}
"""