from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("Math", "acc", "Math")
    task1 = Task("Chemistry", "acc", "Chemistry")
    task2 = Task("Physics", "acc", "Physics")
    task3 = Task("Arabic", "acc", "Arabic")
    task4 = Task("English", "acc", "English")
    task5 = Task("Religion", "acc", "Religion")
    task6 = Task("Persian Literature", "acc", "Persian Literature")
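# For illustration only: an assumed (not verified) layout for a per-model results JSON,
# in which each Task's benchmark is the task_key and metric is the metric_key, e.g.
#
#   {
#       "results": {
#           "Math":      {"acc": 0.41},
#           "Chemistry": {"acc": 0.37}
#       }
#   }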
NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------
# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">IRUEX Leaderboard</h1>"""
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
**Welcome to the IRUEX Leaderboard!**
This platform evaluates large language models on subjects from Iran's University Entrance Exam.
[Explore the IRUEX Dataset](https://github.com/hamedkhaledi/IRUEX-dataset) on GitHub.
"""
# Which evaluations are you running? How can people reproduce them?
LLM_BENCHMARKS_TEXT = """
## Evaluation Process
We assess models across various subjects, including Math, Chemistry, Physics, Arabic, English, Religion, and Persian Literature. Each model's performance is measured using accuracy metrics specific to each subject.
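
As a rough illustration (not the exact evaluation code), per-subject accuracy is simply the share of questions a model answers correctly:

```python
def subject_accuracy(predictions, references):
    # Fraction of exam questions answered correctly for one subject.
    correct = sum(p == r for p, r in zip(predictions, references))
    return correct / len(references)
```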
## Reproducibility
To reproduce our results, execute the following commands:
```bash
# Example command to run evaluations
python evaluate_model.py --model_name your_model_name --task Math --num_fewshot 0
```
"""
EVALUATION_QUEUE_TEXT = ""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@inproceedings{khademi-khaledi-faili-2025-iruex,
    title = "{IRUEX}: A Study on Large Language Models' Problem-Solving Skills in Iran's University Entrance Exam",
    author = "Khademi Khaledi, Hamed and Faili, Heshaam",
    editor = "Rambow, Owen and Wanner, Leo and Apidianaki, Marianna and Al-Khalifa, Hend and Di Eugenio, Barbara and Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.434/",
    pages = "6505--6519",
}
"""