tohid.abedini committed on
Commit 2c3fe6c · 1 Parent(s): 3cc9970

[Add] about

Files changed (2)
  1. app.py +2 -94
  2. utils.py +90 -5
app.py CHANGED
@@ -1,100 +1,8 @@
  import gradio as gr
  from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter
  from pathlib import Path
- import pandas as pd
-
- import os
-
- import json
-
- import requests
-
- from envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
- from utils import LLM_BENCHMARKS_ABOUT_TEXT, LLM_BENCHMARKS_SUBMIT_TEXT, custom_css, jsonl_to_dataframe, add_average_column_to_df, apply_clickable_model
-
-
-
- def fill_form(model_name, model_id, contact_email, challenge, submission_id, paper_link, architecture, license):
-     value = {
-         # Model name
-         "entry.1591601824": model_name,
-         # username/space
-         "entry.1171388028": model_id,
-         # Submission ID on CMT
-         "entry.171528970": submission_id,
-         # Preprint or paper link
-         "entry.1284338508": paper_link,
-         # Model architecture
-         "entry.1291571256": architecture,
-         # License
-         # Option: any text
-         "entry.272554778": license,
-         # Challenge
-         # Option: any text
-         "entry.1908975677": challenge,
-         # Email
-         # Option: any text
-         "entry.964644151": contact_email
-     }
-
-     return value
-
- def sendForm(url, data):
-     try:
-         requests.post(url, data=data)
-         print("Submitted successfully!")
-     except:
-         print("Error!")
-
- def submit(model_name, model_id, contact_email, challenge, submission_id, paper_link, architecture, license):
-
-     if model_name == "" or model_id == "" or challenge == "" or architecture == "" or license == "":
-         gr.Error("Please fill all the fields")
-         return
-     if submission_id == "" and paper_link =="":
-         gr.Error("Provide either a link to a paper describing the method or a submission ID for the MLSB workshop.")
-         return
-     try:
-         user_name = ""
-         if "/" in model_id:
-             user_name = model_id.split("/")[0]
-             model_path = model_id.split("/")[1]
-
-         eval_entry = {
-             "model_name": model_name,
-             "model_id": model_id,
-             "challenge": challenge,
-             "submission_id": submission_id,
-             "architecture": architecture,
-             "license": license
-         }
-         OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-         os.makedirs(OUT_DIR, exist_ok=True)
-         out_path = f"{OUT_DIR}/{user_name}_{model_path}.json"
-
-         with open(out_path, "w") as f:
-             f.write(json.dumps(eval_entry))
-
-         print("Sending form")
-         formData = fill_form(model_name, model_id, contact_email, challenge, submission_id, paper_link, architecture, license)
-         sendForm("https://docs.google.com/forms/d/e/1FAIpQLSf1zP7RAFC5RLlva03xm0eIAPLKXOmMvNUzirbm82kdCUFKNw/formResponse", formData)
-
-         print("Uploading eval file")
-         API.upload_file(
-             path_or_fileobj=out_path,
-             path_in_repo=out_path.split("eval-queue/")[1],
-             repo_id=QUEUE_REPO,
-             repo_type="dataset",
-             commit_message=f"Add {model_name} to eval queue",
-         )
-
-         gr.Info("Successfully submitted", duration=10)
-         # Remove the local file
-         os.remove(out_path)
-     except:
-         gr.Error("Error submitting the model")
-
+ from utils import LLM_BENCHMARKS_ABOUT_TEXT, LLM_BENCHMARKS_SUBMIT_TEXT, custom_css, jsonl_to_dataframe, add_average_column_to_df, apply_clickable_model, submit

@@ -180,4 +88,4 @@ with gr.Blocks(css=custom_css) as demo:
      Please find more information about the challenges on [mlsb.io/#challenge](https://mlsb.io/#challenge)""")

  if __name__ == "__main__":
-     demo.launch()
+     demo.launch()
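
Editorial note (not part of the commit): the net effect of this change is that `submit` now lives in utils.py and app.py only imports it. Below is a minimal sketch of how the imported helper could be wired into the existing `gr.Blocks` UI; the textbox names and layout are assumptions for illustration, since the unchanged portion of app.py is not shown in this diff.

```python
# Hedged sketch, not the actual app.py: wiring utils.submit into a gr.Blocks form.
# Component names are hypothetical; only the changed import line is confirmed by the diff.
import gradio as gr
from utils import LLM_BENCHMARKS_SUBMIT_TEXT, custom_css, submit

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown(LLM_BENCHMARKS_SUBMIT_TEXT)
    model_name_box = gr.Textbox(label="Model name")
    model_id_box = gr.Textbox(label="username/space")
    contact_email_box = gr.Textbox(label="Contact e-mail")
    challenge_box = gr.Textbox(label="Challenge")
    submission_id_box = gr.Textbox(label="Submission ID on CMT")
    paper_link_box = gr.Textbox(label="Preprint or paper link")
    architecture_box = gr.Textbox(label="Model architecture")
    license_box = gr.Textbox(label="License")
    submit_btn = gr.Button("Submit")
    # submit() reports status via gr.Info / gr.Error, so no outputs are wired.
    submit_btn.click(
        fn=submit,
        inputs=[model_name_box, model_id_box, contact_email_box, challenge_box,
                submission_id_box, paper_link_box, architecture_box, license_box],
        outputs=None,
    )

if __name__ == "__main__":
    demo.launch()
```
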
utils.py CHANGED
@@ -1,5 +1,10 @@
- import pandas as pd
  import json
+ import os
+
+ import gradio as gr
+ import pandas as pd
+
+ from envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO

  custom_css = """
  @import url('https://fonts.googleapis.com/css2?family=Vazirmatn&display=swap');

@@ -104,10 +109,46 @@ body, .gradio-container, .gr-button, .gr-input, .gr-slider, .gr-dropdown, .gr-ma
  """

  LLM_BENCHMARKS_ABOUT_TEXT = f"""
- ## How it works
-
- ## Reproducibility
- To reproduce our results, here is the commands you can run:
+ ## Persian LLM Evaluation Leaderboard (v1)
+
+ The Persian LLM Evaluation Leaderboard, developed by **Part DP AI** in collaboration with the **AUT (Amirkabir University of Technology) NLP Lab**, provides a comprehensive benchmarking system designed specifically for Persian language models. Built on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), it offers a platform for evaluating the performance of large language models (LLMs) on tasks that demand linguistic proficiency and technical skill in Persian.
+
+ ## Key Features
+
+ 1. **Open Evaluation Access**
+    The leaderboard allows open participation: developers and researchers working with open-source models can submit evaluation requests for their models. This accessibility encourages the development and testing of Persian LLMs within the broader AI ecosystem.
+
+ 2. **Task Diversity**
+    Six specialized tasks have been curated for this leaderboard, each tailored to challenge a different aspect of a model’s capabilities:
+    - **Part Multiple Choice**
+    - **ARC Easy**
+    - **ARC Challenging**
+    - **MMLU Pro**
+    - **GSM8k Persian**
+    - **Multiple Choice Persian**
+
+    Each dataset is available in Persian, providing a robust testing ground for models in a non-English setting.
+
+ 3. **Open-Source Dataset Sample**
+    A sample of the evaluation dataset is hosted on [Hugging Face Datasets](https://huggingface.co/datasets/PartAI/llm-leaderboard-datasets-sample), giving the AI community a glimpse of the benchmark content and format. This sample allows developers to pre-assess their models against representative data before a full leaderboard evaluation.
+
+ 4. **Collaborative Development**
+    The leaderboard is a collaboration between Part AI and Professor Saeedeh Momtazi of Amirkabir University of Technology, combining academic research with industrial expertise to create a high-quality, open benchmarking tool. The partnership underscores a shared commitment to advancing Persian-language AI technologies.
+
+ 5. **Comprehensive Evaluation Pipeline**
+    A standardized evaluation pipeline assesses models across a variety of data types, including text, mathematical formulas, and numerical data. This multi-faceted approach improves the evaluation’s reliability and allows for a precise, nuanced assessment of model performance across multiple dimensions.
+
+ ## Background and Goals
+
+ Recent months have seen a notable increase in the development of Persian language models by research centers and AI companies in Iran. However, the lack of reliable, standardized benchmarks for Persian models has made it difficult to evaluate their quality comprehensively: global benchmarks typically do not support Persian, resulting in skewed or unreliable results for Persian-language AI.
+
+ This leaderboard addresses that gap by providing a locally focused, transparent system that enables consistent, fair comparisons of Persian models. It is expected to be a valuable tool for Persian-speaking businesses and developers, allowing them to select the models best suited to their needs. Researchers and model developers also benefit from the competitive environment, with opportunities to showcase and improve their models based on benchmark rankings.
+
+ ## Data Privacy and Integrity
+
+ To maintain evaluation integrity and prevent overfitting or data leakage, only part of the benchmark dataset is openly available. This limited-access approach upholds the reliability of the evaluation, ensuring that results genuinely reflect each model’s capabilities on unseen data.
+
+ The leaderboard represents a significant milestone in Persian-language AI and is positioned to become the leading standard for LLM evaluation in the Persian-speaking world.

  """

@@ -168,4 +209,48 @@ def make_clickable_model(model_name):

  def apply_clickable_model(df, column_name):
      df[column_name] = df[column_name].apply(make_clickable_model)
-     return df
+     return df
+
+
+ def submit(model_name, model_id, contact_email, challenge, submission_id, paper_link, architecture, license):
+     if model_name == "" or model_id == "" or challenge == "" or architecture == "" or license == "":
+         gr.Error("Please fill all the fields")
+         return
+     if submission_id == "" and paper_link == "":
+         gr.Error("Provide either a link to a paper describing the method or a submission ID for the MLSB workshop.")
+         return
+     try:
+         user_name = ""
+         if "/" in model_id:
+             user_name = model_id.split("/")[0]
+             model_path = model_id.split("/")[1]
+
+         eval_entry = {
+             "model_name": model_name,
+             "model_id": model_id,
+             "challenge": challenge,
+             "submission_id": submission_id,
+             "architecture": architecture,
+             "license": license
+         }
+         OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+         os.makedirs(OUT_DIR, exist_ok=True)
+         out_path = f"{OUT_DIR}/{user_name}_{model_path}.json"
+
+         with open(out_path, "w") as f:
+             f.write(json.dumps(eval_entry))
+
+         print("Uploading eval file")
+         API.upload_file(
+             path_or_fileobj=out_path,
+             path_in_repo=out_path.split("eval-queue/")[1],
+             repo_id=QUEUE_REPO,
+             repo_type="dataset",
+             commit_message=f"Add {model_name} to eval queue",
+         )
+
+         gr.Info("Successfully submitted", duration=10)
+         # Remove the local file
+         os.remove(out_path)
+     except:
+         gr.Error("Error submitting the model")
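
Editorial note (not part of the commit): the new LLM_BENCHMARKS_ABOUT_TEXT points readers at a public sample of the benchmark data and at the LM Evaluation Harness the leaderboard is built on. Below is a minimal sketch of how a developer might pre-assess a model, assuming the sample dataset loads with its default configuration and that the Persian tasks are registered under hypothetical names in a local lm-evaluation-harness install.

```python
# Hedged sketch: inspecting the published dataset sample and running a rough local
# lm-evaluation-harness check. Split/config names and task names are assumptions.
from datasets import load_dataset

# Peek at the public sample of the leaderboard's evaluation data.
sample = load_dataset("PartAI/llm-leaderboard-datasets-sample")
print(sample)  # shows the available splits and features

# Rough local evaluation with lm-evaluation-harness (>= 0.4), assuming the Persian
# tasks have been registered locally; the task names below are hypothetical.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-persian-llm",  # hypothetical model id
    tasks=["arc_easy_persian", "gsm8k_persian"],         # hypothetical task names
    num_fewshot=0,
)
print(results["results"])
```
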