Add BountyBench
- app.py +3 -3
- gen_table.py +3 -3
- meta_data.py +6 -0
- results.json +23 -0
app.py
CHANGED
@@ -110,7 +110,7 @@ head_style) as demo:
             )
             s.headers = s.check_box['essential'] + s.checkbox_group.value

-            if benchmark not in ["SWE-bench-verified", "CyberGym"]:
+            if benchmark not in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
                 with gr.Row():
                     s.model_name = gr.Textbox(
                         value='Input the Model Name (fuzzy, case insensitive)',
@@ -137,7 +137,7 @@ head_style) as demo:
         s = structs[benchmark_list.index(dataset_name)]
         headers = s.check_box['essential'] + fields
         df = cp.deepcopy(s.table)
-        if dataset_name not in ["SWE-bench-verified", "CyberGym"]:
+        if dataset_name not in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
             default_val = 'Input the Model Name (fuzzy, case insensitive)'
         else:
             default_val = 'Input the Agent Name (fuzzy, case insensitive)'
@@ -145,7 +145,7 @@ head_style) as demo:
         if model_name != default_val:
             print(model_name)
             model_name = model_name.lower()
-            if dataset_name not in ["SWE-bench-verified", "CyberGym"]:
+            if dataset_name not in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
                 method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
             else:
                 method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Agent']]
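For context on what these branches control: benchmarks on this list key their rows by an 'Agent' column rather than 'Model', and the search box matches names fuzzily and case-insensitively after stripping the HTML anchor that wraps each name. A minimal sketch of that extraction and match, using a made-up two-row table (the substring test is an assumption; the actual filter code sits outside these hunks):

```python
import pandas as pd

# Hypothetical mini-table with HTML-wrapped names, as on the leaderboard.
df = pd.DataFrame({'Agent': [
    '<a href="https://example.com/cc">Claude Code</a>',
    '<a href="https://example.com/codex">OpenAI Codex CLI</a>',
]})

query = 'Codex'.lower()
# Same extraction as the changed lines: text between the last '>' and '</a>'.
method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Agent']]
# Fuzzy, case-insensitive match = substring test on the lowercased names.
print(df[[query in name for name in method_names]])  # keeps the Codex row
```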
gen_table.py
CHANGED
@@ -54,9 +54,9 @@ def BUILD_L2_DF(results, benchmark):
     model_list=list(set(model_list))

     res = defaultdict(list)
-    if benchmark not in ["RedCode","NYU CTF Bench","PrimeVul","SWE-bench-verified","CyberGym"]:
+    if benchmark not in ["RedCode","NYU CTF Bench","PrimeVul","SWE-bench-verified","CyberGym", "BountyBench"]:
         res['Model']=model_list
-    elif benchmark in ["SWE-bench-verified", "CyberGym"]:
+    elif benchmark in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
         res['Agent']=model_list
     elif benchmark == "PrimeVul":
         used=[]
@@ -104,7 +104,7 @@ def BUILD_L2_DF(results, benchmark):
     required_fields = all_fields

     check_box = {}
-    if benchmark in ["SWE-bench-verified", "CyberGym"]:
+    if benchmark in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
         check_box['essential'] = ['Agent']
     elif benchmark=='PrimeVul':
         check_box['essential'] = ['Model','Method']
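The two hunks have to stay in sync: the first decides which identifier column ('Model' vs 'Agent') receives the rows, and the second marks that same column as the table's essential field. A condensed sketch of the pattern, with the surrounding BUILD_L2_DF scaffolding assumed rather than copied from the repo:

```python
from collections import defaultdict

# Agent-style benchmarks per this diff; PrimeVul has its own branch, omitted.
AGENT_BENCHMARKS = {"SWE-bench-verified", "CyberGym", "BountyBench"}

def build_rows(model_list, benchmark):
    res = defaultdict(list)
    # Choose the identifier column once...
    key = 'Agent' if benchmark in AGENT_BENCHMARKS else 'Model'
    res[key] = model_list
    # ...and keep the essential check-box column pointing at the same name,
    # mirroring the two branches changed in gen_table.py.
    check_box = {'essential': [key]}
    return res, check_box

_, boxes = build_rows(["Claude Code", "OpenAI Codex CLI"], "BountyBench")
print(boxes['essential'])  # ['Agent']
```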
meta_data.py
CHANGED
@@ -91,4 +91,10 @@ LEADERBOARD_MD['CyberGym'] = """This is a large-scale and high-quality cybersecu

 Paper: https://arxiv.org/abs/2506.02548
 Code: https://github.com/sunblaze-ucb/cybergym
 """
+
+LEADERBOARD_MD['BountyBench'] = """This benchmark covers 25 systems with complex, real-world codebases and includes 40 bug bounties spanning 9 of the OWASP Top 10 Risks.
+
+Paper: https://arxiv.org/abs/2505.15216
+Code: https://github.com/bountybench/bountybench
+"""
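The new entry follows the file's convention of one markdown blurb per benchmark, keyed by the tab name. How app.py consumes it is not shown in this diff; a plausible rendering pattern, sketched with gradio's standard Blocks/Tab/Markdown API:

```python
import gradio as gr

# Trimmed stand-in for meta_data.LEADERBOARD_MD; keys match tab names.
LEADERBOARD_MD = {
    'BountyBench': "Paper: https://arxiv.org/abs/2505.15216\n"
                   "Code: https://github.com/bountybench/bountybench",
}

with gr.Blocks() as demo:
    for benchmark, blurb in LEADERBOARD_MD.items():
        with gr.Tab(benchmark):
            gr.Markdown(blurb)  # blurb rendered above that benchmark's table
```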
results.json
CHANGED
@@ -829,6 +829,29 @@
         "OpenHands + OpenHands-LM-32B": 0.33,
         "OpenHands + SWE-Gym-32B": 0.07
       }
+    },
+    "BountyBench": {
+      "Detect Success Rate": {
+        "Claude Code": 5,
+        "OpenAI Codex CLI": 5,
+        "C-Agent: Claude 3.7": 5,
+        "C-Agent: Gemini 2.5": 2.5,
+        "C-Agent: GPT-4.1": 0
+      },
+      "Exploit Success Rate": {
+        "Claude Code": 57.5,
+        "OpenAI Codex CLI": 32.5,
+        "C-Agent: Claude 3.7": 67.5,
+        "C-Agent: Gemini 2.5": 40,
+        "C-Agent: GPT-4.1": 55
+      },
+      "Patch Success Rate": {
+        "Claude Code": 87.5,
+        "OpenAI Codex CLI": 90,
+        "C-Agent: Claude 3.7": 60,
+        "C-Agent: Gemini 2.5": 45,
+        "C-Agent: GPT-4.1": 50
+      }
     }
   }
 }
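The added block reuses the two-level shape of the other benchmarks (metric -> agent -> score), so the existing loader needs no schema change. A hedged sketch of pivoting it into one row per agent, assuming results.json is readable from the working directory:

```python
import json
import pandas as pd

with open('results.json') as f:          # assumed path
    results = json.load(f)

bb = results['BountyBench']              # {metric: {agent: score}}
agents = sorted({a for scores in bb.values() for a in scores})
# One row per agent, one column per metric; missing scores become None.
table = pd.DataFrame(
    [[bb[metric].get(agent) for metric in bb] for agent in agents],
    index=agents, columns=list(bb),
)
print(table.loc['Claude Code', 'Patch Success Rate'])  # 87.5
```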