Xueqing committed on
Commit ac1edfa · 1 Parent(s): d13cc61
Files changed (5)
  1. app.py +13 -13
  2. model_performance.csv +19 -19
  3. src/about.py +3 -3
  4. src/display/utils.py +10 -10
  5. src/populate.py +7 -10
app.py CHANGED
@@ -68,21 +68,21 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(
+            #     AutoEvalColumn.params.name,
+            #     type="slider",
+            #     min=0.01,
+            #     max=150,
+            #     label="Select the number of parameters (B)",
+            # ),
+            # ColumnFilter(
+            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            # ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -201,4 +201,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
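Note: the search and filter columns removed here (license, precision, params, still_on_hub) correspond to the AutoEvalColumn fields commented out in src/display/utils.py in the same commit, so keeping them would reference fields that no longer exist on AutoEvalColumn. A minimal sanity check one could run to keep the leaderboard config in sync with the data (hypothetical helper, not part of this commit):

# Hypothetical helper, not part of this commit: verify that every column the
# leaderboard searches or filters on is actually present in the dataframe.
import pandas as pd

def check_leaderboard_columns(df: pd.DataFrame, search_cols, filter_cols):
    missing = [c for c in list(search_cols) + list(filter_cols) if c not in df.columns]
    if missing:
        raise ValueError(f"Leaderboard refers to missing columns: {missing}")

df = pd.read_csv("model_performance.csv")  # columns: Model, Type, finqa, dm-simplong, xbrl-math
check_leaderboard_columns(df, search_cols=["Model"], filter_cols=["Type"])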
model_performance.csv CHANGED
@@ -1,19 +1,19 @@
-models,finqa,dm-simplong,xbrl-math
-4o,72.49,60.0,72.22
-o1,49.07,56.0,74.44
-o3-mini,60.87,59.0,76.67
-v3,73.2,53.0,76.67
-r1,65.13,53.0,86.67
-deepseek-70b,66.73,53.0,86.67
-llama3-70B-instruct,58.92,41.0,56.67
-llama31-70B-instruct,63.18,48.0,63.33
-llama33-70B-instruct,68.15,54.0,70.0
-deepseek-32b,65.48,55.0,84.44
-deepseek-14b,63.27,44.0,84.44
-deepseek-8b,45.96,33.0,81.11
-llama3 8b-instruct,41.97,29.0,48.89
-llama31 8b-instruct,54.13,34.0,62.22
-Qwen2.5-32B-Instruct,,,
-Qwen2.5-72B-Instruct,73.38,59.0,67.78
-Qwen2.5-72B-Instruct-math,69.74,42.0,83.33
-Fino1-8B,60.87,40.0,82.22
+Model,Type,finqa,dm-simplong,xbrl-math
+4o,instruction-tuned,72.49,60.0,72.22
+o1,instruction-tuned,49.07,56.0,74.44
+o3-mini,instruction-tuned,60.87,59.0,76.67
+v3,instruction-tuned,73.2,53.0,76.67
+r1,instruction-tuned,65.13,53.0,86.67
+deepseek-70b,instruction-tuned,66.73,53.0,86.67
+llama3-70B-instruct,instruction-tuned,58.92,41.0,56.67
+llama31-70B-instruct,instruction-tuned,63.18,48.0,63.33
+llama33-70B-instruct,instruction-tuned,68.15,54.0,70.0
+deepseek-32b,instruction-tuned,65.48,55.0,84.44
+deepseek-14b,instruction-tuned,63.27,44.0,84.44
+deepseek-8b,instruction-tuned,45.96,33.0,81.11
+llama3 8b-instruct,instruction-tuned,41.97,29.0,48.89
+llama31 8b-instruct,instruction-tuned,54.13,34.0,62.22
+Qwen2.5-32B-Instruct,instruction-tuned,,,
+Qwen2.5-72B-Instruct,instruction-tuned,73.38,59.0,67.78
+Qwen2.5-72B-Instruct-math,instruction-tuned,69.74,42.0,83.33
+Fino1-8B,instruction-tuned,60.87,40.0,82.22
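The data changes in two ways: the header column is renamed from models to Model, and a new Type column (currently "instruction-tuned" for every row) feeds the "Model types" filter. Qwen2.5-32B-Instruct still has no scores, so it is removed by the dropna() call added in src/populate.py. A quick check, assuming the CSV sits in the working directory:

# Quick check of the reshaped CSV (assumes it is read from the current working directory).
import pandas as pd

df = pd.read_csv("model_performance.csv")
print(df.columns.tolist())  # ['Model', 'Type', 'finqa', 'dm-simplong', 'xbrl-math']
print(len(df), "->", len(df.dropna()), "rows after dropping models with missing scores")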
src/about.py CHANGED
@@ -12,9 +12,9 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
+    task0 = Task("FinQA", "acc", "finqa")
+    task1 = Task("DM-SimpLong", "acc", "dm-simplong")
+    task2 = Task("XBRL-math", "acc", "xbrl-math")
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
src/display/utils.py CHANGED
@@ -23,22 +23,22 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
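After this change, the make_dataclass call builds an AutoEvalColumn that carries only the Model column, one score field per task, and Type. A minimal, self-contained illustration of that pattern (simplified field types, names invented for the example):

# Simplified illustration of the make_dataclass pattern above (plain types instead
# of ColumnContent; field names invented for the example).
from dataclasses import make_dataclass, fields

Row = make_dataclass("Row", [("model", str), ("finqa", float), ("model_type", str)], frozen=True)
r = Row(model="Fino1-8B", finqa=60.87, model_type="instruction-tuned")
print([f.name for f in fields(Row)])  # ['model', 'finqa', 'model_type']
print(r.model, r.finqa)               # Fino1-8B 60.87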
src/populate.py CHANGED
@@ -10,18 +10,15 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
+    # raw_data = get_raw_eval_results(results_path, requests_path)
+    # all_data_json = [v.to_dict() for v in raw_data]
 
-    df = pd.DataFrame.from_records(all_data_json)
-    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    # df = df[cols].round(decimals=2)
+    # df = pd.DataFrame.from_records(all_data_json)
+    df = pd.read_csv('model_performance.csv')
+    df = df.dropna()
 
-    #filter out if any of the benchmarks have not been produced
-    import os
-    print(os.getcwd())  # get and print the current working directory
-
-    df = df[has_no_nan_values(df, benchmark_cols)] #pd.read_csv('model_performance.csv')#
+    # filter out if any of the benchmarks have not been produced
+    # df = df[has_no_nan_values(df, benchmark_cols)] #pd.read_csv('model_performance.csv')#
     print(df)
     return df
 
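The removed import os / print(os.getcwd()) debug lines were there because pd.read_csv('model_performance.csv') resolves against the current working directory. A hedged alternative, not part of this commit, that makes the read independent of where the Space is launched from:

# Suggested variant, not part of this commit: resolve the CSV relative to the repo
# root (src/populate.py -> parent of src/) instead of the current working directory.
from pathlib import Path
import pandas as pd

CSV_PATH = Path(__file__).resolve().parent.parent / "model_performance.csv"
df = pd.read_csv(CSV_PATH).dropna()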