Add BSR (Build Success Rate)
Browse files
- src/display/utils.py +3 -0
- src/leaderboard/read_evals.py +11 -0
src/display/utils.py
CHANGED
@@ -40,6 +40,9 @@ auto_eval_column_dict.append(
|
|
40 |
["full_pass_count_miscompilation", ColumnContent, ColumnContent("Repaired (Miscompilation)", "number", True)]
|
41 |
)
|
42 |
auto_eval_column_dict.append(["full_pass_count_hang", ColumnContent, ColumnContent("Repaired (Hang)", "number", True)])
|
|
|
|
|
|
|
43 |
|
44 |
# We use make dataclass to dynamically fill the scores from Tasks
|
45 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
|
|
40 |
["full_pass_count_miscompilation", ColumnContent, ColumnContent("Repaired (Miscompilation)", "number", True)]
|
41 |
)
|
42 |
auto_eval_column_dict.append(["full_pass_count_hang", ColumnContent, ColumnContent("Repaired (Hang)", "number", True)])
|
43 |
+
auto_eval_column_dict.append(
|
44 |
+
["build_success_rate", ColumnContent, ColumnContent("Build Success Rate", "number", True)]
|
45 |
+
)
|
46 |
|
47 |
# We use make dataclass to dynamically fill the scores from Tasks
|
48 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
src/leaderboard/read_evals.py
CHANGED
@@ -22,6 +22,8 @@ class EvalResult:
|
|
22 |
full_pass_count_crash: int
|
23 |
full_pass_count_hang: int
|
24 |
full_pass_count_miscompilation: int
|
|
|
|
|
25 |
|
26 |
@classmethod
|
27 |
def init_from_json_file(self, json_filepath):
|
@@ -39,6 +41,8 @@ class EvalResult:
|
|
39 |
fast_pass_count = 0
|
40 |
full_pass_count = 0
|
41 |
full_pass_count_cat = {}
|
|
|
|
|
42 |
for fix in fixes:
|
43 |
bug_type = fix.get("bug_type", "")
|
44 |
if fix.get("fast_check_pass", False):
|
@@ -46,6 +50,8 @@ class EvalResult:
|
|
46 |
if fix.get("full_check_pass", False):
|
47 |
full_pass_count += 1
|
48 |
full_pass_count_cat[bug_type] = full_pass_count_cat.get(bug_type, 0) + 1
|
|
|
|
|
49 |
|
50 |
return self(
|
51 |
method_name=method_name,
|
@@ -59,6 +65,8 @@ class EvalResult:
|
|
59 |
full_pass_count_crash=full_pass_count_cat.get("crash", 0),
|
60 |
full_pass_count_hang=full_pass_count_cat.get("hang", 0),
|
61 |
full_pass_count_miscompilation=full_pass_count_cat.get("miscompilation", 0),
|
|
|
|
|
62 |
)
|
63 |
|
64 |
def to_dict(self, total_issues):
|
@@ -74,6 +82,9 @@ class EvalResult:
|
|
74 |
AutoEvalColumn.full_pass_count_crash.name: self.full_pass_count_crash,
|
75 |
AutoEvalColumn.full_pass_count_hang.name: self.full_pass_count_hang,
|
76 |
AutoEvalColumn.full_pass_count_miscompilation.name: self.full_pass_count_miscompilation,
|
|
|
|
|
|
|
77 |
}
|
78 |
|
79 |
return data_dict
|
|
|
22 |
full_pass_count_crash: int
|
23 |
full_pass_count_hang: int
|
24 |
full_pass_count_miscompilation: int
|
25 |
+
build_count: int
|
26 |
+
build_failure_count: int
|
27 |
|
28 |
@classmethod
|
29 |
def init_from_json_file(self, json_filepath):
|
|
|
41 |
fast_pass_count = 0
|
42 |
full_pass_count = 0
|
43 |
full_pass_count_cat = {}
|
44 |
+
build_count = 0
|
45 |
+
build_failure_count = 0
|
46 |
for fix in fixes:
|
47 |
bug_type = fix.get("bug_type", "")
|
48 |
if fix.get("fast_check_pass", False):
|
|
|
50 |
if fix.get("full_check_pass", False):
|
51 |
full_pass_count += 1
|
52 |
full_pass_count_cat[bug_type] = full_pass_count_cat.get(bug_type, 0) + 1
|
53 |
+
build_count += fix.get("build_count", 0)
|
54 |
+
build_failure_count += fix.get("build_failure_count", 0)
|
55 |
|
56 |
return self(
|
57 |
method_name=method_name,
|
|
|
65 |
full_pass_count_crash=full_pass_count_cat.get("crash", 0),
|
66 |
full_pass_count_hang=full_pass_count_cat.get("hang", 0),
|
67 |
full_pass_count_miscompilation=full_pass_count_cat.get("miscompilation", 0),
|
68 |
+
build_count=build_count,
|
69 |
+
build_failure_count=build_failure_count,
|
70 |
)
|
71 |
|
72 |
def to_dict(self, total_issues):
|
|
|
82 |
AutoEvalColumn.full_pass_count_crash.name: self.full_pass_count_crash,
|
83 |
AutoEvalColumn.full_pass_count_hang.name: self.full_pass_count_hang,
|
84 |
AutoEvalColumn.full_pass_count_miscompilation.name: self.full_pass_count_miscompilation,
|
85 |
+
AutoEvalColumn.build_success_rate.name: round(
|
86 |
+
# No builds recorded (build_count == 0): report 0.0 instead of raising ZeroDivisionError.
((self.build_count - self.build_failure_count) * 100.0 / self.build_count) if self.build_count else 0.0, 1
|
87 |
+
),
|
88 |
}
|
89 |
|
90 |
return data_dict
|