Update
- app.py +36 -1
- src/leaderboard/read_evals.py +6 -0
app.py
CHANGED
@@ -40,8 +40,33 @@ try:
 except Exception:
     restart_space()
 
-
+dataset = load_dataset("dtcxzyw/llvm-apr-benchmark")
+total_issues = dataset.num_rows["test"]
+bug_id_to_time = dict()
+for issue in dataset["test"]:
+    bug_id_to_time[issue["bug_id"]] = pd.to_datetime(issue["knowledge_cutoff"])
+timeline_xs = []
+timeline_ys = []
+timeline_cols = []
+model_cnt = 0
+for bug_id, time in bug_id_to_time.items():
+    timeline_xs.append(time)
+    timeline_ys.append(0)
+    timeline_cols.append("Baseline")
 LEADERBOARD_DF = get_leaderboard_df(EVAL_REQUESTS_PATH, COLS, total_issues)
+for row in LEADERBOARD_DF.itertuples():
+    model_cnt += 1
+    for fix in row.fixed_bug_ids:
+        timeline_xs.append(bug_id_to_time[fix])
+        timeline_ys.append(model_cnt)
+        timeline_cols.append(row.method_name)
+timeline_df = pd.DataFrame(
+    {
+        "time": timeline_xs,
+        "model": timeline_ys,
+        "method_name": timeline_cols,
+    }
+)
 
 
 def init_leaderboard(dataframe):
@@ -73,6 +98,16 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
+            gr.ScatterPlot(
+                timeline_df,
+                x="time",
+                y="model",
+                color="method_name",
+                x_label="Time",
+                y_label="Model",
+                title="Timeline",
+                elem_id="timeline",
+            )
 
         with gr.TabItem("🚀 Submission", elem_id="llm-benchmark-tab-table", id=1):
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
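For context, a minimal standalone sketch of the timeline data the new app.py code builds and the scatter plot it feeds. The bug IDs, timestamps, and method name below are made up, and only the core gr.ScatterPlot arguments are shown; the commit additionally sets axis labels, a title, and an element id, which may depend on the Gradio version pinned by the Space.

import gradio as gr
import pandas as pd

# Knowledge-cutoff times keyed by bug id (hypothetical values, not the benchmark's real data).
bug_id_to_time = {
    "issue-1": pd.to_datetime("2024-01-05"),
    "issue-2": pd.to_datetime("2024-02-10"),
}

# Row 0 ("Baseline") gets one point per benchmark bug; each evaluated method
# gets its own y level with one point per bug it fixed.
timeline_df = pd.DataFrame(
    {
        "time": [bug_id_to_time["issue-1"], bug_id_to_time["issue-2"], bug_id_to_time["issue-2"]],
        "model": [0, 0, 1],
        "method_name": ["Baseline", "Baseline", "hypothetical-method"],
    }
)

with gr.Blocks() as demo:
    # Core arguments only; see the diff above for the full call used by the Space.
    gr.ScatterPlot(timeline_df, x="time", y="model", color="method_name")

if __name__ == "__main__":
    demo.launch()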
src/leaderboard/read_evals.py
CHANGED
@@ -25,6 +25,7 @@ class EvalResult:
     build_count: int
     build_failure_count: int
     mttr: float
+    fixed_bug_ids: list[str]
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -45,6 +46,7 @@ class EvalResult:
         build_count = 0
         build_failure_count = 0
         ttr_sum = 0
+        fixed_bug_ids = []
         for fix in fixes:
             bug_type = fix.get("bug_type", "")
             if fix.get("fast_check_pass", False):
@@ -53,6 +55,7 @@ class EvalResult:
                     full_pass_count += 1
                     full_pass_count_cat[bug_type] = full_pass_count_cat.get(bug_type, 0) + 1
                     ttr_sum += fix.get("wall_time", 0)
+                    fixed_bug_ids.append(fix.get("bug_id", ""))
             build_count += fix.get("build_count", 0)
             build_failure_count += fix.get("build_failure_count", 0)
 
@@ -71,6 +74,7 @@ class EvalResult:
             build_count=build_count,
             build_failure_count=build_failure_count,
             mttr=round(ttr_sum / full_pass_count / 60, 1) if full_pass_count > 0 else 0,
+            fixed_bug_ids=fixed_bug_ids,
         )
 
     def to_dict(self, total_issues):
@@ -90,6 +94,8 @@ class EvalResult:
                 (self.build_count - self.build_failure_count) * 100.0 / self.build_count, 1
             ),
             AutoEvalColumn.mttr.name: self.mttr,
+            "fixed_bug_ids": self.fixed_bug_ids,
+            "method_id": self.method_name + "(" + self.model_name + ")",
         }
 
         return data_dict
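For reference, a small sketch (hypothetical fix records and method/model names, not real benchmark data) of what the new fixed_bug_ids list and method_id string end up holding:

# Hypothetical fix records; the field names mirror the ones read in the diff.
fixes = [
    {"bug_id": "issue-1", "fast_check_pass": True, "wall_time": 300},
    {"bug_id": "issue-2", "fast_check_pass": False},
]

# Simplified gating: the real code appends inside the same branch that counts
# a full pass, so only fixes that pass the checks contribute a bug id.
fixed_bug_ids = []
for fix in fixes:
    if fix.get("fast_check_pass", False):
        fixed_bug_ids.append(fix.get("bug_id", ""))

# The two new to_dict() entries (method/model names are made up here).
method_name, model_name = "my-agent", "some-llm"
extra_entries = {
    "fixed_bug_ids": fixed_bug_ids,                        # ["issue-1"]
    "method_id": method_name + "(" + model_name + ")",     # "my-agent(some-llm)"
}
print(extra_entries)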