yujinyujin9393 committed on
Commit
3704b12
·
verified ·
1 Parent(s): 9be313e

Add bountybench

Browse files
Files changed (4) hide show
  1. app.py +3 -3
  2. gen_table.py +3 -3
  3. meta_data.py +6 -0
  4. results.json +23 -0
app.py CHANGED
@@ -110,7 +110,7 @@ head_style) as demo:
110
  )
111
  s.headers = s.check_box['essential'] + s.checkbox_group.value
112
 
113
- if benchmark not in ["SWE-bench-verified", "CyberGym"]:
114
  with gr.Row():
115
  s.model_name = gr.Textbox(
116
  value='Input the Model Name (fuzzy, case insensitive)',
@@ -137,7 +137,7 @@ head_style) as demo:
137
  s = structs[benchmark_list.index(dataset_name)]
138
  headers = s.check_box['essential'] + fields
139
  df = cp.deepcopy(s.table)
140
- if dataset_name not in ["SWE-bench-verified", "CyberGym"]:
141
  default_val = 'Input the Model Name (fuzzy, case insensitive)'
142
  else:
143
  default_val = 'Input the Agent Name (fuzzy, case insensitive)'
@@ -145,7 +145,7 @@ head_style) as demo:
145
  if model_name != default_val:
146
  print(model_name)
147
  model_name = model_name.lower()
148
- if dataset_name not in ["SWE-bench-verified", "CyberGym"]:
149
  method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
150
  else:
151
  method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Agent']]
 
110
  )
111
  s.headers = s.check_box['essential'] + s.checkbox_group.value
112
 
113
+ if benchmark not in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
114
  with gr.Row():
115
  s.model_name = gr.Textbox(
116
  value='Input the Model Name (fuzzy, case insensitive)',
 
137
  s = structs[benchmark_list.index(dataset_name)]
138
  headers = s.check_box['essential'] + fields
139
  df = cp.deepcopy(s.table)
140
+ if dataset_name not in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
141
  default_val = 'Input the Model Name (fuzzy, case insensitive)'
142
  else:
143
  default_val = 'Input the Agent Name (fuzzy, case insensitive)'
 
145
  if model_name != default_val:
146
  print(model_name)
147
  model_name = model_name.lower()
148
+ if dataset_name not in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
149
  method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
150
  else:
151
  method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Agent']]
gen_table.py CHANGED
@@ -54,9 +54,9 @@ def BUILD_L2_DF(results, benchmark):
54
  model_list=list(set(model_list))
55
 
56
  res = defaultdict(list)
57
- if benchmark not in ["RedCode","NYU CTF Bench","PrimeVul","SWE-bench-verified","CyberGym"]:
58
  res['Model']=model_list
59
- elif benchmark=="SWE-bench-verified" or benchmark=="CyberGym":
60
  res['Agent']=model_list
61
  elif benchmark == "PrimeVul":
62
  used=[]
@@ -104,7 +104,7 @@ def BUILD_L2_DF(results, benchmark):
104
  required_fields = all_fields
105
 
106
  check_box = {}
107
- if benchmark in ["SWE-bench-verified", "CyberGym"]:
108
  check_box['essential'] = ['Agent']
109
  elif benchmark=='PrimeVul':
110
  check_box['essential'] = ['Model','Method']
 
54
  model_list=list(set(model_list))
55
 
56
  res = defaultdict(list)
57
+ if benchmark not in ["RedCode","NYU CTF Bench","PrimeVul","SWE-bench-verified","CyberGym", "BountyBench"]:
58
  res['Model']=model_list
59
+ elif benchmark in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
60
  res['Agent']=model_list
61
  elif benchmark == "PrimeVul":
62
  used=[]
 
104
  required_fields = all_fields
105
 
106
  check_box = {}
107
+ if benchmark in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
108
  check_box['essential'] = ['Agent']
109
  elif benchmark=='PrimeVul':
110
  check_box['essential'] = ['Model','Method']
meta_data.py CHANGED
@@ -91,4 +91,10 @@ LEADERBOARD_MD['CyberGym'] = """This is a large-scale and high-quality cybersecu
91
 
92
  Paper: https://arxiv.org/abs/2506.02548
93
  Code: https://github.com/sunblaze-ucb/cybergym
 
 
 
 
 
 
94
  """
 
91
 
92
  Paper: https://arxiv.org/abs/2506.02548
93
  Code: https://github.com/sunblaze-ucb/cybergym
94
+ """
95
+
96
+ LEADERBOARD_MD['BountyBench'] = """This is a benchmark with 25 systems with complex, real-world codebases, and includes 40 bug bounties that cover 9 of the OWASP Top 10 Risks.
97
+
98
+ Paper: https://arxiv.org/abs/2505.15216
99
+ Code: https://github.com/bountybench/bountybench
100
  """
results.json CHANGED
@@ -829,6 +829,29 @@
829
  "OpenHands + OpenHands-LM-32B": 0.33,
830
  "OpenHands + SWE-Gym-32B": 0.07
831
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
832
  }
833
  }
834
  }
 
829
  "OpenHands + OpenHands-LM-32B": 0.33,
830
  "OpenHands + SWE-Gym-32B": 0.07
831
  }
832
+ },
833
+ "BountyBench": {
834
+ "Detect Success Rate": {
835
+ "Claude Code": 5,
836
+ "OpenAI Codex CLI": 5,
837
+ "C-Agent: Claude 3.7": 5,
838
+ "C-Agent: Gemini 2.5": 2.5,
839
+ "C-Agent: GPT-4.1": 0
840
+ },
841
+ "Exploit Success Rate": {
842
+ "Claude Code": 57.5,
843
+ "OpenAI Codex CLI": 32.5,
844
+ "C-Agent: Claude 3.7": 67.5,
845
+ "C-Agent: Gemini 2.5": 40,
846
+ "C-Agent: GPT-4.1": 55
847
+ },
848
+ "Patch Success Rate": {
849
+ "Claude Code": 87.5,
850
+ "OpenAI Codex CLI": 90,
851
+ "C-Agent: Claude 3.7": 60,
852
+ "C-Agent: Gemini 2.5": 45,
853
+ "C-Agent: GPT-4.1": 50
854
+ }
855
  }
856
  }
857
  }