terryyz committed on
Commit
9d72e5c
·
verified ·
1 Parent(s): 5d7ffc1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -6
app.py CHANGED
@@ -106,10 +106,12 @@ def evaluate(
106
  max_as_limit: int = 30 * 1024,
107
  max_data_limit: int = 30 * 1024,
108
  max_stack_limit: int = 10,
 
109
  check_gt_only: bool = False,
110
  no_gt: bool = False,
 
111
  ):
112
- pass_k = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]
113
  if parallel < 1:
114
  n_workers = max(1, multiprocessing.cpu_count() // 2)
115
  else:
@@ -121,6 +123,14 @@ def evaluate(
121
  extra = subset + "_" if subset != "full" else ""
122
 
123
  problems = get_bigcodebench(subset=subset)
 
 
 
 
 
 
 
 
124
  dataset_hash = get_bigcodebench_hash(subset=subset)
125
 
126
  if not no_gt:
@@ -156,7 +166,7 @@ def evaluate(
156
  if "solution" in sample
157
  else problems[task_id]["complete_prompt"] + sample["completion"]
158
  )
159
- if "sanitized_calibrated" in samples:
160
  solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
161
  remainings.add(sample["_identifier"])
162
  args = (
@@ -213,7 +223,7 @@ def evaluate(
213
 
214
  pass_at_k.update({
215
  f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
216
- for k in pass_k
217
  if total.min() >= k
218
  })
219
 
@@ -223,7 +233,7 @@ def evaluate(
223
  pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
224
  pass_at_k["split"] = split
225
  pass_at_k["subset"] = subset
226
- pass_at_k["calibrated"] = "sanitized_calibrated" in samples
227
  pass_at_k["gt_pass_rate"] = gt_pass_rate
228
  pass_at_k["failed_tasks"] = failed_tasks
229
 
@@ -243,8 +253,10 @@ interface = gr.Interface(
243
  gr.Slider(1, 100 * 1024, step=1024, label="Max AS Limit", value=30 * 1024),
244
  gr.Slider(1, 100 * 1024, step=1024, label="Max Data Limit", value=30 * 1024),
245
  gr.Slider(1, 100, step=1, label="Max Stack Limit", value=10),
 
246
  gr.Checkbox(label="Check GT Only"),
247
  gr.Checkbox(label="No GT"),
 
248
  ],
249
  outputs=[
250
  gr.JSON(label="Results"),
@@ -271,8 +283,14 @@ def restart_space():
271
 
272
 
273
  # if __name__ == "__main__":
274
- preload_gt()
 
 
 
 
 
 
275
  scheduler = BackgroundScheduler()
276
- scheduler.add_job(restart_space, "interval", hours=5) # Restart every 5hs
277
  scheduler.start()
278
  interface.launch(show_error=True)
 
106
  max_as_limit: int = 30 * 1024,
107
  max_data_limit: int = 30 * 1024,
108
  max_stack_limit: int = 10,
109
+ calibrated: bool = True,
110
  check_gt_only: bool = False,
111
  no_gt: bool = False,
112
+ selective_evaluate: str = "",
113
  ):
114
+ passk = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]
115
  if parallel < 1:
116
  n_workers = max(1, multiprocessing.cpu_count() // 2)
117
  else:
 
123
  extra = subset + "_" if subset != "full" else ""
124
 
125
  problems = get_bigcodebench(subset=subset)
126
+
127
+ # Add selective evaluation logic
128
+ if selective_evaluate:
129
+ selected_ids = ["BigCodeBench/" + id for id in sorted(set(selective_evaluate.split(",")))]
130
+ problems = {k: v for k, v in problems.items() if k in selected_ids}
131
+ if not problems:
132
+ raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset")
133
+
134
  dataset_hash = get_bigcodebench_hash(subset=subset)
135
 
136
  if not no_gt:
 
166
  if "solution" in sample
167
  else problems[task_id]["complete_prompt"] + sample["completion"]
168
  )
169
+ if calibrated:
170
  solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
171
  remainings.add(sample["_identifier"])
172
  args = (
 
223
 
224
  pass_at_k.update({
225
  f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
226
+ for k in passk
227
  if total.min() >= k
228
  })
229
 
 
233
  pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
234
  pass_at_k["split"] = split
235
  pass_at_k["subset"] = subset
236
+ pass_at_k["calibrated"] = calibrated
237
  pass_at_k["gt_pass_rate"] = gt_pass_rate
238
  pass_at_k["failed_tasks"] = failed_tasks
239
 
 
253
  gr.Slider(1, 100 * 1024, step=1024, label="Max AS Limit", value=30 * 1024),
254
  gr.Slider(1, 100 * 1024, step=1024, label="Max Data Limit", value=30 * 1024),
255
  gr.Slider(1, 100, step=1, label="Max Stack Limit", value=10),
256
+ gr.Checkbox(label="Calibrated", value=True),
257
  gr.Checkbox(label="Check GT Only"),
258
  gr.Checkbox(label="No GT"),
259
+ gr.Textbox(label="Selective Evaluated Task IDs (comma-separated, e.g. '0,1,2')", value=""),
260
  ],
261
  outputs=[
262
  gr.JSON(label="Results"),
 
283
 
284
 
285
  # if __name__ == "__main__":
286
+ while True:
287
+ try:
288
+ preload_gt()
289
+ break
290
+ except:
291
+ continue
292
+
293
  scheduler = BackgroundScheduler()
294
+ scheduler.add_job(restart_space, "interval", hours=1) # Restart every hour
295
  scheduler.start()
296
  interface.launch(show_error=True)