Update
- app.py +1 -2
- src/display/utils.py +17 -64
- src/populate.py +2 -6
app.py
@@ -14,7 +14,6 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
     COLS,
     AutoEvalColumn,
     fields,
@@ -42,7 +41,7 @@ except Exception:
     restart_space()
 
 total_issues = load_dataset("dtcxzyw/llvm-apr-benchmark").num_rows["test"]
-LEADERBOARD_DF = get_leaderboard_df(EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(EVAL_REQUESTS_PATH, COLS)
 
 
 def init_leaderboard(dataframe):
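A note on the unchanged total_issues line above: load_dataset() without a split name returns a DatasetDict, and its num_rows property maps split names to row counts. A minimal sketch (assuming the 🤗 datasets package is installed; downloading the dataset needs network access):

```python
from datasets import load_dataset

# DatasetDict.num_rows maps each split name to its row count,
# so this is the number of issues in the benchmark's "test" split.
benchmark = load_dataset("dtcxzyw/llvm-apr-benchmark")
total_issues = benchmark.num_rows["test"]
print(total_issues)
```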
src/display/utils.py
@@ -3,7 +3,6 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -20,75 +19,29 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(
-
-
-auto_eval_column_dict.append(["
-#
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(
-
-
-auto_eval_column_dict.append(
-
+auto_eval_column_dict.append(
+    ["method_name", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]
+)
+auto_eval_column_dict.append(["model_name", ColumnContent, ColumnContent("Model", "markdown", True)])
+# Scores
+auto_eval_column_dict.append(["full_pass_count", ColumnContent, ColumnContent("Repaired ⬆️", "number", True)])
+auto_eval_column_dict.append(["fast_pass_count", ColumnContent, ColumnContent("Repaired (Fast)", "number", True)])
+auto_eval_column_dict.append(["with_hint", ColumnContent, ColumnContent("Repair with hint", "bool", True)])
+auto_eval_column_dict.append(["attempts", ColumnContent, ColumnContent("Number of attempts", "number", True)])
+auto_eval_column_dict.append(
+    ["full_pass_count_crash", ColumnContent, ColumnContent("Repaired (Crash)", "number", True)]
+)
+auto_eval_column_dict.append(
+    ["full_pass_count_miscompilation", ColumnContent, ColumnContent("Repaired (Miscompilation)", "number", True)]
+)
+auto_eval_column_dict.append(["full_pass_count_hang", ColumnContent, ColumnContent("Repaired (Hang)", "number", True)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
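For readers unfamiliar with the leaderboard template's dynamic column setup, the sketch below shows how entries like the ones added above become the AutoEvalColumn dataclass and the COLS list. It is a self-contained approximation: the ColumnContent fields other than hidden/never_hidden and the frozen=True decorator are assumptions, since the full class body is not part of this diff; only the two example columns are copied from it.

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)  # frozen=True is an assumption made for this sketch
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    # Same helper as in src/display/utils.py: collect every non-dunder class
    # attribute, i.e. the ColumnContent defaults baked into the generated class.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


# Two of the columns added by this commit, in the same [attr_name, type, default] form.
auto_eval_column_dict = [
    ["method_name", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)],
    ["full_pass_count", ColumnContent, ColumnContent("Repaired ⬆️", "number", True)],
]

# make_dataclass turns each entry into a field whose default is its ColumnContent instance.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.full_pass_count.name)                       # Repaired ⬆️
print([c.name for c in fields(AutoEvalColumn) if not c.hidden])  # ['Method', 'Repaired ⬆️']
```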
src/populate.py
@@ -3,20 +3,16 @@ import os
 
 import pandas as pd
 
-from src.display.formatting import has_no_nan_values
 from src.display.utils import AutoEvalColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(requests_path: str, cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.
+    df = df.sort_values(by=[AutoEvalColumn.full_pass_count.name], ascending=False)
     df = df[cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
     return df
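To illustrate the reworked get_leaderboard_df, here is a small sketch with stand-in records. In the Space the records come from get_raw_eval_results(EVAL_REQUESTS_PATH); the method names and counts below are invented, and the column labels match the new AutoEvalColumn definitions.

```python
import pandas as pd

# Stand-in for [v.to_dict() for v in get_raw_eval_results(requests_path)].
all_data_json = [
    {"Method": "fixer-a", "Repaired ⬆️": 42, "Repaired (Fast)": 30},
    {"Method": "fixer-b", "Repaired ⬆️": 57, "Repaired (Fast)": 41},
]
cols = ["Method", "Repaired ⬆️", "Repaired (Fast)"]  # what COLS would contain here

df = pd.DataFrame.from_records(all_data_json)
# Same steps as the new function: best-performing methods first, then keep the visible columns.
df = df.sort_values(by=["Repaired ⬆️"], ascending=False)
df = df[cols].round(decimals=2)
print(df)
```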