dtcxzyw committed
Commit 22a227d · unverified · 1 Parent(s): c629528
Files changed (2)
  1. app.py +35 -1
  2. src/leaderboard/read_evals.py +6 -0
app.py CHANGED
@@ -40,8 +40,33 @@ try:
 except Exception:
     restart_space()
 
-total_issues = load_dataset("dtcxzyw/llvm-apr-benchmark").num_rows["test"]
+dataset = load_dataset("dtcxzyw/llvm-apr-benchmark")
+total_issues = dataset.num_rows["test"]
+bug_id_to_time = dict()
+for issue in dataset["test"]:
+    bug_id_to_time[issue["bug_id"]] = pd.to_datetime(issue["knowledge_cutoff"])
+timeline_xs = []
+timeline_ys = []
+timeline_cols = []
+model_cnt = 0
+for bug_id, time in bug_id_to_time.items():
+    timeline_xs.append(time)
+    timeline_ys.append(0)
+    timeline_cols.append("Baseline")
 LEADERBOARD_DF = get_leaderboard_df(EVAL_REQUESTS_PATH, COLS, total_issues)
+for row in LEADERBOARD_DF.itertuples():
+    model_cnt += 1
+    for fix in row.fixed_bug_ids:
+        timeline_xs.append(bug_id_to_time[fix])
+        timeline_ys.append(model_cnt)
+        timeline_cols.append(row.method_name)
+timeline_df = pd.DataFrame(
+    {
+        "time": timeline_xs,
+        "model": timeline_ys,
+        "method_name": timeline_cols,
+    }
+)
 
 
 def init_leaderboard(dataframe):
@@ -73,6 +98,15 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
+            gr.ScatterPlot(
+                timeline_df,
+                x="time",
+                y="model",
+                color="method_name",
+                x_label="Time",
+                y_label="Model",
+                title="Timeline",
+            )
 
         with gr.TabItem("🚀 Submission", elem_id="llm-benchmark-tab-table", id=1):
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
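The added block builds a long-format table with one point per (issue, method) pair: row y=0 is a "Baseline" of all benchmark issues plotted at their knowledge_cutoff dates, and each leaderboard method occupies its own y value, with a point at the cutoff date of every bug it fixed; gr.ScatterPlot then colors points by method_name. A standalone sketch of that construction follows, using a hypothetical two-issue dataset in place of dtcxzyw/llvm-apr-benchmark (the bug_id and knowledge_cutoff field names come from the diff; the sample values and the "SomeAgent" method are invented):

```python
import pandas as pd

# Hypothetical stand-in for dataset["test"]; field names match the diff.
fake_test_split = [
    {"bug_id": "llvm-1", "knowledge_cutoff": "2024-01-15"},
    {"bug_id": "llvm-2", "knowledge_cutoff": "2024-03-02"},
]
bug_id_to_time = {i["bug_id"]: pd.to_datetime(i["knowledge_cutoff"]) for i in fake_test_split}

# Row 0 is the "Baseline": one point per benchmark issue at its cutoff date.
xs = list(bug_id_to_time.values())
ys = [0] * len(xs)
labels = ["Baseline"] * len(xs)

# Each method gets its own y value, with a point for every bug it fixed.
fixed = {"SomeAgent": ["llvm-2"]}  # invented leaderboard contents
for y, (method, bug_ids) in enumerate(fixed.items(), start=1):
    for bug_id in bug_ids:
        xs.append(bug_id_to_time[bug_id])
        ys.append(y)
        labels.append(method)

timeline_df = pd.DataFrame({"time": xs, "model": ys, "method_name": labels})
print(timeline_df)
```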
src/leaderboard/read_evals.py CHANGED
@@ -25,6 +25,7 @@ class EvalResult:
     build_count: int
     build_failure_count: int
     mttr: float
+    fixed_bug_ids: list[str]
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -45,6 +46,7 @@ class EvalResult:
         build_count = 0
         build_failure_count = 0
         ttr_sum = 0
+        fixed_bug_ids = []
         for fix in fixes:
             bug_type = fix.get("bug_type", "")
             if fix.get("fast_check_pass", False):
@@ -53,6 +55,7 @@ class EvalResult:
                 full_pass_count += 1
                 full_pass_count_cat[bug_type] = full_pass_count_cat.get(bug_type, 0) + 1
                 ttr_sum += fix.get("wall_time", 0)
+                fixed_bug_ids.append(fix.get("bug_id", ""))
             build_count += fix.get("build_count", 0)
             build_failure_count += fix.get("build_failure_count", 0)
 
@@ -71,6 +74,7 @@ class EvalResult:
             build_count=build_count,
             build_failure_count=build_failure_count,
             mttr=round(ttr_sum / full_pass_count / 60, 1) if full_pass_count > 0 else 0,
+            fixed_bug_ids=fixed_bug_ids,
         )
 
     def to_dict(self, total_issues):
@@ -90,6 +94,8 @@ class EvalResult:
                 (self.build_count - self.build_failure_count) * 100.0 / self.build_count, 1
             ),
             AutoEvalColumn.mttr.name: self.mttr,
+            "fixed_bug_ids": self.fixed_bug_ids,
+            "method_id": self.method_name + "(" + self.model_name + ")",
         }
 
         return data_dict
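This change threads the list of fixed bugs from each submission's JSON through EvalResult into the leaderboard DataFrame, where app.py reads row.fixed_bug_ids to place the timeline points. A minimal sketch of that accumulation, under stated assumptions: the full-pass condition is elided in the hunk, so a hypothetical full_check_pass flag stands in for it, and the sample fixes are invented.

```python
from dataclasses import dataclass


@dataclass
class EvalResultSketch:
    method_name: str
    model_name: str
    mttr: float              # mean time to repair, in minutes
    fixed_bug_ids: list[str]

    @classmethod
    def from_fixes(cls, method_name: str, model_name: str, fixes: list[dict]):
        ttr_sum = 0.0
        fixed_bug_ids = []
        for fix in fixes:
            # Hypothetical flag; the real full-pass predicate is outside the hunk.
            if fix.get("full_check_pass", False):
                ttr_sum += fix.get("wall_time", 0)
                fixed_bug_ids.append(fix.get("bug_id", ""))
        n = len(fixed_bug_ids)
        return cls(method_name, model_name,
                   round(ttr_sum / n / 60, 1) if n else 0, fixed_bug_ids)


# Hypothetical submission with two fixes, one fully passing:
fixes = [
    {"bug_id": "llvm-1", "full_check_pass": True, "wall_time": 300},
    {"bug_id": "llvm-2", "full_check_pass": False, "wall_time": 120},
]
r = EvalResultSketch.from_fixes("SomeAgent", "some-model", fixes)
print(r.fixed_bug_ids)                           # ['llvm-1']
print(r.method_name + "(" + r.model_name + ")")  # "method_id" format from to_dict
```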