fix: bugs
- app.py +13 -13
- model_performance.csv +19 -19
- src/about.py +3 -3
- src/display/utils.py +10 -10
- src/populate.py +7 -10
app.py
CHANGED
@@ -68,21 +68,21 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(
+            #     AutoEvalColumn.params.name,
+            #     type="slider",
+            #     min=0.01,
+            #     max=150,
+            #     label="Select the number of parameters (B)",
+            # ),
+            # ColumnFilter(
+            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            # ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -201,4 +201,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
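After this hunk, search is restricted to the model-name column and only the model-type checkbox remains as a filter; the precision, parameter-count, and still_on_hub filters are commented out rather than deleted. Below is a sketch of how `init_leaderboard` reads once the change is applied. It is a sketch only: the `Leaderboard(...)` constructor shape, the `gradio_leaderboard` imports, and the `value`/`datatype`/`default_selection` arguments are assumed from the stock Hugging Face leaderboard template and are not part of this diff; only the keyword arguments visible in the hunk are taken from this commit.

    # Hedged sketch of init_leaderboard after this commit (assumptions noted above).
    from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
    from src.display.utils import AutoEvalColumn, fields  # repo-local helpers, as imported in app.py

    def init_leaderboard(dataframe):
        return Leaderboard(
            value=dataframe,
            datatype=[c.type for c in fields(AutoEvalColumn)],
            select_columns=SelectColumns(
                default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
                cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
                label="Select Columns to Display:",
            ),
            search_columns=[AutoEvalColumn.model.name],  # search by model name only
            hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
            filter_columns=[
                # only the model-type checkbox filter survives this commit; the
                # precision, #Params, and still_on_hub filters are commented out
                ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ],
            bool_checkboxgroup_label="Hide models",
            interactive=False,
        )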
model_performance.csv
CHANGED
@@ -1,19 +1,19 @@
-
-4o,72.49,60.0,72.22
-o1,49.07,56.0,74.44
-o3-mini,60.87,59.0,76.67
-v3,73.2,53.0,76.67
-r1,65.13,53.0,86.67
-deepseek-70b,66.73,53.0,86.67
-llama3-70B-instruct,58.92,41.0,56.67
-llama31-70B-instruct,63.18,48.0,63.33
-llama33-70B-instruct,68.15,54.0,70.0
-deepseek-32b,65.48,55.0,84.44
-deepseek-14b,63.27,44.0,84.44
-deepseek-8b,45.96,33.0,81.11
-llama3 8b-instruct,41.97,29.0,48.89
-llama31 8b-instruct,54.13,34.0,62.22
-Qwen2.5-32B-Instruct,,,
-Qwen2.5-72B-Instruct,73.38,59.0,67.78
-Qwen2.5-72B-Instruct-math,69.74,42.0,83.33
-Fino1-8B,60.87,40.0,82.22
+Model,Type,finqa,dm-simplong,xbrl-math
+4o,instruction-tuned,72.49,60.0,72.22
+o1,instruction-tuned,49.07,56.0,74.44
+o3-mini,instruction-tuned,60.87,59.0,76.67
+v3,instruction-tuned,73.2,53.0,76.67
+r1,instruction-tuned,65.13,53.0,86.67
+deepseek-70b,instruction-tuned,66.73,53.0,86.67
+llama3-70B-instruct,instruction-tuned,58.92,41.0,56.67
+llama31-70B-instruct,instruction-tuned,63.18,48.0,63.33
+llama33-70B-instruct,instruction-tuned,68.15,54.0,70.0
+deepseek-32b,instruction-tuned,65.48,55.0,84.44
+deepseek-14b,instruction-tuned,63.27,44.0,84.44
+deepseek-8b,instruction-tuned,45.96,33.0,81.11
+llama3 8b-instruct,instruction-tuned,41.97,29.0,48.89
+llama31 8b-instruct,instruction-tuned,54.13,34.0,62.22
+Qwen2.5-32B-Instruct,instruction-tuned,,,
+Qwen2.5-72B-Instruct,instruction-tuned,73.38,59.0,67.78
+Qwen2.5-72B-Instruct-math,instruction-tuned,69.74,42.0,83.33
+Fino1-8B,instruction-tuned,60.87,40.0,82.22
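The rewritten CSV gains a header row and a `Type` column, so `src/populate.py` can load it directly with pandas. A quick sanity check (run from the repository root, where populate.py resolves the relative path) confirms that the header matches the task `col_name` values in `src/about.py` and shows the one row that still has no scores:

    import pandas as pd

    df = pd.read_csv("model_performance.csv")
    print(df.columns.tolist())
    # ['Model', 'Type', 'finqa', 'dm-simplong', 'xbrl-math']
    print(df[df.isna().any(axis=1)]["Model"].tolist())
    # ['Qwen2.5-32B-Instruct'] -- its score cells are still empty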
src/about.py
CHANGED
@@ -12,9 +12,9 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
-
+    task0 = Task("FinQA", "acc", "finqa")
+    task1 = Task("DM-SimpLong", "acc", "dm-simplong")
+    task2 = Task("XBRL-math", "acc", "xbrl-math")
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
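These three entries drive the per-task columns built in `src/display/utils.py` (via `task.value.col_name`), so the third argument of each must match the lowercase score headers in `model_performance.csv`. A minimal, hedged sketch of the enum follows; the `Task` field names (`benchmark`, `metric`, `col_name`) are assumed from the stock leaderboard template, since the `class Task:` body sits above this hunk.

    from dataclasses import dataclass
    from enum import Enum

    @dataclass
    class Task:
        benchmark: str  # task key in the results json
        metric: str     # metric key in the results json
        col_name: str   # column header shown in the leaderboard and in the CSV

    class Tasks(Enum):
        task0 = Task("FinQA", "acc", "finqa")
        task1 = Task("DM-SimpLong", "acc", "dm-simplong")
        task2 = Task("XBRL-math", "acc", "xbrl-math")

    # utils.py adds one numeric leaderboard column per entry via task.value.col_name
    print([t.value.col_name for t in Tasks])  # ['finqa', 'dm-simplong', 'xbrl-math']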
src/display/utils.py
CHANGED
@@ -23,22 +23,22 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
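With the type symbol, average, and Hub-metadata columns commented out, `AutoEvalColumn` is left with the model link, one numeric column per task, and the `Type` string. A hedged way to confirm that, using the repo's own `fields` helper (which, in the stock template this file is based on, returns the `ColumnContent` defaults rather than `dataclasses.Field` objects):

    from src.display.utils import AutoEvalColumn, fields

    # Expected after this commit: Model (markdown), finqa / dm-simplong / xbrl-math
    # (number, one per Tasks entry), and Type (str); everything else is commented out.
    for c in fields(AutoEvalColumn):
        print(c.name, c.type, c.hidden)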
src/populate.py
CHANGED
@@ -10,18 +10,15 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
+    # raw_data = get_raw_eval_results(results_path, requests_path)
+    # all_data_json = [v.to_dict() for v in raw_data]
 
-    df = pd.DataFrame.from_records(all_data_json)
-
-
+    # df = pd.DataFrame.from_records(all_data_json)
+    df = pd.read_csv('model_performance.csv')
+    df = df.dropna()
 
-    #filter out if any of the benchmarks have not been produced
-
-    print(os.getcwd())  # get and print the current working directory
-
-    df = df[has_no_nan_values(df, benchmark_cols)] #pd.read_csv('model_performance.csv')#
+    # filter out if any of the benchmarks have not been produced
+    # df = df[has_no_nan_values(df, benchmark_cols)] #pd.read_csv('model_performance.csv')#
     print(df)
     return df
 
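The loader no longer aggregates per-run eval results; it reads the committed CSV and drops incomplete rows. Note that `df.dropna()` removes a row if any cell is empty, which is what filters out the score-less Qwen2.5-32B-Instruct entry and stands in for the old `has_no_nan_values(df, benchmark_cols)` check. A commented sketch of the function as it now behaves (the unused `results_path`/`requests_path`/`cols`/`benchmark_cols` parameters are kept only for call-site compatibility):

    import pandas as pd

    def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
        """Builds the leaderboard dataframe from the committed CSV instead of per-run results."""
        # Relative path: assumes the working directory is the repo root, which the
        # removed print(os.getcwd()) debug line was checking.
        df = pd.read_csv("model_performance.csv")
        # Drop any row with a missing value in any column (e.g. Qwen2.5-32B-Instruct,
        # whose scores are still empty), mirroring the old has_no_nan_values filter.
        df = df.dropna()
        print(df)
        return df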