Update
- app.py +36 -1
- src/leaderboard/read_evals.py +6 -0
app.py
CHANGED
@@ -40,8 +40,33 @@ try:
 except Exception:
     restart_space()
 
-
+dataset = load_dataset("dtcxzyw/llvm-apr-benchmark")
+total_issues = dataset.num_rows["test"]
+bug_id_to_time = dict()
+for issue in dataset["test"]:
+    bug_id_to_time[issue["bug_id"]] = pd.to_datetime(issue["knowledge_cutoff"])
+timeline_xs = []
+timeline_ys = []
+timeline_cols = []
+model_cnt = 0
+for bug_id, time in bug_id_to_time.items():
+    timeline_xs.append(time)
+    timeline_ys.append(0)
+    timeline_cols.append("Baseline")
 LEADERBOARD_DF = get_leaderboard_df(EVAL_REQUESTS_PATH, COLS, total_issues)
+for row in LEADERBOARD_DF.itertuples():
+    model_cnt += 1
+    for fix in row.fixed_bug_ids:
+        timeline_xs.append(bug_id_to_time[fix])
+        timeline_ys.append(model_cnt)
+        timeline_cols.append(row.method_name)
+timeline_df = pd.DataFrame(
+    {
+        "time": timeline_xs,
+        "model": timeline_ys,
+        "method_name": timeline_cols,
+    }
+)
 
 
 def init_leaderboard(dataframe):
@@ -73,6 +98,16 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
+            gr.ScatterPlot(
+                timeline_df,
+                x="time",
+                y="model",
+                color="method_name",
+                x_label="Time",
+                y_label="Model",
+                title="Timeline",
+                elem_id="timeline",
+            )
 
         with gr.TabItem("🚀 Submission", elem_id="llm-benchmark-tab-table", id=1):
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
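For context, a minimal standalone sketch of the timeline data the new app.py code builds and the scatter plot it feeds. The bug IDs, timestamps, and method name below are made up, and only the core gr.ScatterPlot arguments are shown; the commit additionally sets axis labels, a title, and an element id, which may depend on the Gradio version pinned by the Space.

import gradio as gr
import pandas as pd

# Knowledge-cutoff times keyed by bug id (hypothetical values, not the benchmark's real data).
bug_id_to_time = {
    "issue-1": pd.to_datetime("2024-01-05"),
    "issue-2": pd.to_datetime("2024-02-10"),
}

# Row 0 ("Baseline") gets one point per benchmark bug; each evaluated method
# gets its own y level with one point per bug it fixed.
timeline_df = pd.DataFrame(
    {
        "time": [bug_id_to_time["issue-1"], bug_id_to_time["issue-2"], bug_id_to_time["issue-2"]],
        "model": [0, 0, 1],
        "method_name": ["Baseline", "Baseline", "hypothetical-method"],
    }
)

with gr.Blocks() as demo:
    # Core arguments only; see the diff above for the full call used by the Space.
    gr.ScatterPlot(timeline_df, x="time", y="model", color="method_name")

if __name__ == "__main__":
    demo.launch()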
src/leaderboard/read_evals.py
CHANGED
@@ -25,6 +25,7 @@ class EvalResult:
     build_count: int
     build_failure_count: int
     mttr: float
+    fixed_bug_ids: list[str]
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -45,6 +46,7 @@ class EvalResult:
         build_count = 0
         build_failure_count = 0
         ttr_sum = 0
+        fixed_bug_ids = []
         for fix in fixes:
             bug_type = fix.get("bug_type", "")
             if fix.get("fast_check_pass", False):
@@ -53,6 +55,7 @@ class EvalResult:
                     full_pass_count += 1
                     full_pass_count_cat[bug_type] = full_pass_count_cat.get(bug_type, 0) + 1
                     ttr_sum += fix.get("wall_time", 0)
+                    fixed_bug_ids.append(fix.get("bug_id", ""))
             build_count += fix.get("build_count", 0)
             build_failure_count += fix.get("build_failure_count", 0)
 
@@ -71,6 +74,7 @@ class EvalResult:
             build_count=build_count,
             build_failure_count=build_failure_count,
             mttr=round(ttr_sum / full_pass_count / 60, 1) if full_pass_count > 0 else 0,
+            fixed_bug_ids=fixed_bug_ids,
         )
 
     def to_dict(self, total_issues):
@@ -90,6 +94,8 @@ class EvalResult:
                 (self.build_count - self.build_failure_count) * 100.0 / self.build_count, 1
             ),
             AutoEvalColumn.mttr.name: self.mttr,
+            "fixed_bug_ids": self.fixed_bug_ids,
+            "method_id": self.method_name + "(" + self.model_name + ")",
         }
 
         return data_dict
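For reference, a small sketch (hypothetical fix records and method/model names, not real benchmark data) of what the new fixed_bug_ids list and method_id string end up holding:

# Hypothetical fix records; the field names mirror the ones read in the diff.
fixes = [
    {"bug_id": "issue-1", "fast_check_pass": True, "wall_time": 300},
    {"bug_id": "issue-2", "fast_check_pass": False},
]

# Simplified gating: the real code appends inside the same branch that counts
# a full pass, so only fixes that pass the checks contribute a bug id.
fixed_bug_ids = []
for fix in fixes:
    if fix.get("fast_check_pass", False):
        fixed_bug_ids.append(fix.get("bug_id", ""))

# The two new to_dict() entries (method/model names are made up here).
method_name, model_name = "my-agent", "some-llm"
extra_entries = {
    "fixed_bug_ids": fixed_bug_ids,                        # ["issue-1"]
    "method_id": method_name + "(" + model_name + ")",     # "my-agent(some-llm)"
}
print(extra_entries)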