lukecq committed
Commit 0e06db3 · 1 Parent(s): 8c8300c

update results and display

app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import pandas as pd
 import os
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, login
 from apscheduler.schedulers.background import BackgroundScheduler
 
 from src.display.about import (
@@ -20,6 +20,7 @@ from src.leaderboard.load_results import load_data
 
 # clone / pull the lmeh eval data
 TOKEN = os.environ.get("TOKEN", None)
+login(token=TOKEN)
 RESULTS_REPO = f"SeaLLMs/SeaExam-results"
 CACHE_PATH=os.getenv("HF_HOME", ".")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
@@ -33,7 +34,7 @@ def restart_space():
     API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
 
 # Load the data from the csv file
-csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results.csv'
+csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240425.csv'
 df_m3exam, df_mmlu, df_avg = load_data(csv_path)
 
 # Searching and filtering
@@ -112,6 +113,7 @@ with demo:
                 # datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
+                datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
                 visible=True,
                 # column_widths=["20%", "6%", "8%", "6%", "8%", "8%", "6%", "6%", "6%", "6%", "6%"],
             )
@@ -149,6 +151,7 @@ with demo:
                 value=df_m3exam,
                 interactive=False,
                 visible=True,
+                datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
             )
 
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
@@ -184,6 +187,7 @@ with demo:
                 value=df_mmlu,
                 interactive=False,
                 visible=True,
+                datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
             )
 
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
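
The new `datatype` argument matters because the Model column now holds raw HTML links (see the `make_clickable_model` helper added in `src/leaderboard/load_results.py` below); declaring that column as `markdown` lets Gradio render the link instead of printing the tag as text. A minimal, self-contained sketch of the idea — the dataframe values here are made up for illustration, not actual SeaExam results:

```python
import gradio as gr
import pandas as pd

# Toy leaderboard slice: rank, model type, clickable model link, one score column.
# Values are illustrative only.
toy_df = pd.DataFrame({
    "R": [1],
    "type": ["🟢 base"],
    "Model": ['<a target="_blank" href="https://huggingface.co/SeaLLMs/SeaLLM-7B-v2">SeaLLMs/SeaLLM-7B-v2</a>'],
    "avg_sea": [64.5],
})

with gr.Blocks() as demo:
    gr.components.Dataframe(
        value=toy_df,
        # "markdown" on the Model column renders the <a> tag as a clickable link;
        # "number" / "str" keep the other columns as plain cells.
        datatype=["number", "str", "markdown", "number"],
        interactive=False,
        visible=True,
    )

# demo.launch()
```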
src/display/about.py CHANGED
@@ -53,7 +53,9 @@ How to interpret the leaderboard?
 * Each numerical value represents the accuracy (%).
 * The "M3Exam" and "MMLU" pages show the performance of each model for that dataset.
 * The "🏅 Overall" page shows the average results of "M3Exam" and "MMLU".
-* The leaderboard is sorted by avg_sea, the average score across SEA languages (id, th, and vi).
+* The leaderboard is ranked by avg_sea, the average score across SEA languages (id, th, and vi).
+* The rank is shown in the "R" column.
+* The "params(B)" column shows the number of parameters of the model in billions.
 
 ## Reproducibility
 To reproduce our results, use the script in [this repo](https://github.com/DAMO-NLP-SG/SeaExam/tree/main). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
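
For reference, a small pandas sketch of the ranking rule described above: avg_sea is treated as the mean accuracy over the SEA languages (id, th, vi), and "R" is its descending rank, the same `rank(ascending=False)` call used in `load_results.py`. The scores below are hypothetical:

```python
import pandas as pd

# Hypothetical accuracies (%) for two models; real values come from the results CSV.
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "id": [55.0, 61.2],
    "th": [48.3, 52.9],
    "vi": [57.1, 60.4],
})

# avg_sea: average accuracy across the SEA languages (id, th, vi)
df["avg_sea"] = df[["id", "th", "vi"]].mean(axis=1).round(2)

# "R": rank by avg_sea, best model first (same call as in load_results.py)
df["R"] = df["avg_sea"].rank(ascending=False).astype(int)

print(df.sort_values("R"))
```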
src/leaderboard/load_results.py CHANGED
@@ -4,16 +4,19 @@ from huggingface_hub import HfApi
 api = HfApi()
 
 def get_model_size(model_name, precision: str = "BF16", revision: str = "main"):
+    if len(model_name.split("/")) == 1:
+        return None
+
     model_info = api.model_info(repo_id=model_name, revision=revision)
     # model_size = get_model_size(model_info=model_info, precision=precision)
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+        model_size = round(model_info.safetensors["total"] / 1e9, 1)
     except (AttributeError, TypeError):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 1)
         except AttributeError:
             return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
@@ -21,11 +24,19 @@ def get_model_size(model_name, precision: str = "BF16", revision: str = "main"):
     model_size = size_factor * model_size
     return model_size
 
+def make_clickable_model(model_name, link=None):
+    if len(model_name.split("/")) == 2:
+        link = "https://huggingface.co/" + model_name
+        return (
+            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
+        )
+    return model_name
+
 def load_data(data_path):
     df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
 
     columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
-    columns_sorted = ['rank','type', 'Model', 'open?', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
+    columns_sorted = ['R','type', 'Model', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
 
     # Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
     df_m3exam = df.iloc[:, :11] # M3Exam columns
@@ -40,9 +51,9 @@ def load_data(data_path):
     df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']] = df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']].round(2)
 
     # rank the DataFrames by the 'avg_sea' column
-    df_m3exam['rank'] = df_m3exam['avg_sea'].rank(ascending=False).astype(int)
-    df_mmlu['rank'] = df_mmlu['avg_sea'].rank(ascending=False).astype(int)
-    df_avg['rank'] = df_avg['avg_sea'].rank(ascending=False).astype(int)
+    df_m3exam['R'] = df_m3exam['avg_sea'].rank(ascending=False).astype(int)
+    df_mmlu['R'] = df_mmlu['avg_sea'].rank(ascending=False).astype(int)
+    df_avg['R'] = df_avg['avg_sea'].rank(ascending=False).astype(int)
 
     # reorder the columns
     df_m3exam = df_m3exam[columns_sorted]
@@ -64,6 +75,16 @@ def load_data(data_path):
     df_mmlu['type'] = df_mmlu['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
     df_avg['type'] = df_avg['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
 
+    # get the parameters of the models
+    df_m3exam['params(B)'] = df_m3exam['Model'].apply(get_model_size)
+    df_mmlu['params(B)'] = df_mmlu['Model'].apply(get_model_size)
+    df_avg['params(B)'] = df_avg['Model'].apply(get_model_size)
+
+    # make the 'Model' column clickable
+    df_m3exam['Model'] = df_m3exam['Model'].apply(make_clickable_model)
+    df_mmlu['Model'] = df_mmlu['Model'].apply(make_clickable_model)
+    df_avg['Model'] = df_avg['Model'].apply(make_clickable_model)
+
     return df_m3exam, df_mmlu, df_avg
 
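
A quick usage sketch of the two helpers introduced in this file — it assumes the Space's `src` package is importable and, for `get_model_size`, access to the Hugging Face Hub; the model names are only examples:

```python
from src.leaderboard.load_results import get_model_size, make_clickable_model

# "org/name" ids get wrapped in an underlined link to the Hub model page;
# anything without a "/" (e.g. a closed API model) is returned unchanged.
print(make_clickable_model("SeaLLMs/SeaLLM-7B-v2"))
# <a target="_blank" style="text-decoration: underline"
#    href="https://huggingface.co/SeaLLMs/SeaLLM-7B-v2">SeaLLMs/SeaLLM-7B-v2</a>
print(make_clickable_model("GPT-4"))  # GPT-4

# get_model_size now returns None for names without an org prefix; otherwise it reads
# the safetensors parameter count (or falls back to parsing a "7b"-style id),
# rounded to one decimal place, in billions.
print(get_model_size("GPT-4"))                 # None
print(get_model_size("SeaLLMs/SeaLLM-7B-v2"))  # roughly 7.x (requires Hub access)
```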