yujinyujin9393 committed on
Commit ea5e3a1 · verified · 1 Parent(s): bb8ff6c

Upload website codes

Files changed (4):
  1. app.py +90 -116
  2. gen_table.py +67 -34
  3. meta_data.py +51 -17
  4. results.json +766 -100
app.py CHANGED
@@ -18,145 +18,112 @@ head_style = """
 </style>
 """
 
-TAB_CSS = """
-/* 1. Target the real tab-list container (old & new class names + role attr) */
-#leaderboard_tabs [role="tablist"],
-#leaderboard_tabs .gradio-tabs-tablist,
-#leaderboard_tabs .tab-container[role="tablist"] {
-  display: flex !important;
-  flex-wrap: wrap !important;       /* allow multi-row */
-  white-space: normal !important;   /* cancel nowrap */
-  overflow-x: visible !important;   /* don't clip off */
-  height: auto !important;          /* grow as tall as needed */
-  max-width: none !important;       /* cancel any max-width */
-}
-
-/* 2. Stop each button from flexing */
-#leaderboard_tabs [role="tab"],
-#leaderboard_tabs .tab-container[role="tablist"] .tab-button,
-#leaderboard_tabs .gradio-tabs-tab {
-  flex: none !important;
-}
-
-/* 3. Hide every possible "more/overflow" toggle */
-#leaderboard_tabs .overflow-menu,
-#leaderboard_tabs [class*="overflow-button"],
-#leaderboard_tabs button[aria-label*="More"],
-#leaderboard_tabs .gradio-tabs-overflow,
-#leaderboard_tabs .gradio-tabs-overflow-button {
-  display: none !important;
-}
-"""
-
 with gr.Blocks(title="Cybersecurity Leaderboard", head=
 head_style) as demo:
     struct = load_results()
     timestamp = struct['time']
     EVAL_TIME = format_timestamp(timestamp)
     results = struct['results']
-    model_list=[]
-    task_list=[]
-    benchmark_list=[]
-    for task in results:
-        task_list+=[task]
-        for benchmark in results[task]:
-            if benchmark!='category':
-                benchmark_list+=[benchmark]
-                model_list+=list(results[task][benchmark].keys())
-
-    model_list=list(set(model_list))
-    N_MODEL=len(model_list)
-    N_TASK=len(task_list)
-    N_DATA = len(list(set(benchmark_list)))
     DATASETS = benchmark_list
 
-    gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_DATA,N_TASK,EVAL_TIME))
-    structs = [abc.abstractproperty() for _ in range(N_TASK)] #N_DATA
 
     with gr.Tabs(elem_id="leaderboard_tabs", elem_classes='tab-buttons') as tabs:
-        with gr.TabItem('🏅 Cybersecurity Main Leaderboard', elem_id='main', id=0):
-            gr.Markdown(LEADERBOARD_MD['MAIN'].format(N_DATA,N_DATA))
-            _, check_box = BUILD_L1_DF(results, DEFAULT_TASK)
-            table = generate_table(results, DEFAULT_TASK)
-
-            type_map = check_box['type_map']
-
-            checkbox_group = gr.CheckboxGroup(
-                choices=check_box['all'],
-                value=check_box['required'],
-                label='Aspects of Cybersecurity Work',
-                interactive=True,
-            )
-
-            headers = check_box['essential'] + checkbox_group.value
-            with gr.Row():
-                model_name = gr.Textbox(
-                    value='Input the Model Name (fuzzy, case insensitive)',
-                    label='Model Name',
-                    interactive=True,
-                    visible=True)
-            data_component = gr.components.DataFrame(
-                value=table[headers],
-                type='pandas',
-                datatype=[type_map[x] for x in headers],
-                interactive=False,
-                wrap=True,
-                visible=True)
-
-            def filter_df(fields, model_name):
-                headers = check_box['essential'] + fields
-                df = generate_table(results, fields)
 
-                default_val = 'Input the Model Name (fuzzy, case insensitive)'
-                if model_name != default_val:
-                    print(model_name)
-                    model_name = model_name.lower()
-                    method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
-                    flag = [model_name in name for name in method_names]
-                    df['TEMP_FLAG'] = flag
-                    df = df[df['TEMP_FLAG'] == True]
-                    df.pop('TEMP_FLAG')
-
-                comp = gr.components.DataFrame(
-                    value=df[headers],
-                    type='pandas',
-                    datatype=[type_map[x] for x in headers],
-                    interactive=False,
-                    wrap=True,
-                    visible=True)
-                return comp
-
-            for cbox in [checkbox_group]:
-                cbox.change(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)
-            model_name.submit(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)
 
         with gr.TabItem('🔍 About', elem_id='about', id=1):
             with open("about.md", 'r', encoding="utf-8") as file:
                 gr.Markdown(file.read())
 
-        for i, task in enumerate(task_list):
-            with gr.TabItem(f'📊 {task} Leaderboard', elem_id=task, id=i + 2):
-                if task in LEADERBOARD_MD:
-                    gr.Markdown(LEADERBOARD_MD[task])
 
                 s = structs[i]
-                s.table, s.check_box = BUILD_L2_DF(results, task)
                 s.type_map = s.check_box['type_map']
 
                 s.checkbox_group = gr.CheckboxGroup(
                     choices=s.check_box['all'],
                     value=s.check_box['required'],
-                    label=f'{task} CheckBoxes',
                     interactive=True,
                 )
                 s.headers = s.check_box['essential'] + s.checkbox_group.value
 
-                with gr.Row():
-                    s.model_name = gr.Textbox(
-                        value='Input the Model Name (fuzzy, case insensitive)',
-                        label='Model Name',
-                        interactive=True,
-                        visible=True)
                 s.data_component = gr.components.DataFrame(
                     value=s.table[s.headers],
                     type='pandas',
@@ -164,17 +131,24 @@ head_style) as demo:
                     interactive=False,
                     wrap=True,
                     visible=True)
-                s.dataset = gr.Textbox(value=task, label=task, visible=False)
 
                 def filter_df_l2(dataset_name, fields, model_name):
-                    s = structs[task_list.index(dataset_name)]
                     headers = s.check_box['essential'] + fields
                     df = cp.deepcopy(s.table)
-                    default_val = 'Input the Model Name (fuzzy, case insensitive)'
                     if model_name != default_val:
                         print(model_name)
                         model_name = model_name.lower()
-                        method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Method']]
                         flag = [model_name in name for name in method_names]
                         df['TEMP_FLAG'] = flag
                         df = df[df['TEMP_FLAG'] == True]
 
 </style>
 """
 
 with gr.Blocks(title="Cybersecurity Leaderboard", head=
 head_style) as demo:
     struct = load_results()
     timestamp = struct['time']
     EVAL_TIME = format_timestamp(timestamp)
     results = struct['results']
+    benchmark_list=list(results.keys())
+
+    N_DATA = len(benchmark_list)
     DATASETS = benchmark_list
 
+    gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_DATA,EVAL_TIME))
+    structs = [abc.abstractproperty() for _ in range(N_DATA)]
 
    with gr.Tabs(elem_id="leaderboard_tabs", elem_classes='tab-buttons') as tabs:
+        # with gr.TabItem('🏅 Cybersecurity Main Leaderboard', elem_id='main', id=0):
+        #     gr.Markdown(LEADERBOARD_MD['MAIN'].format(N_DATA,N_DATA))
+        #     _, check_box = BUILD_L1_DF(results, DEFAULT_TASK)
+        #     table = generate_table(results, DEFAULT_TASK)
+
+        #     type_map = check_box['type_map']
+
+        #     checkbox_group = gr.CheckboxGroup(
+        #         choices=check_box['all'],
+        #         value=check_box['required'],
+        #         label='Aspects of Cybersecurity Work',
+        #         interactive=True,
+        #     )
+
+        #     headers = check_box['essential'] + checkbox_group.value
+        #     with gr.Row():
+        #         model_name = gr.Textbox(
+        #             value='Input the Model Name (fuzzy, case insensitive)',
+        #             label='Model Name',
+        #             interactive=True,
+        #             visible=True)
+        #     data_component = gr.components.DataFrame(
+        #         value=table[headers],
+        #         type='pandas',
+        #         datatype=[type_map[x] for x in headers],
+        #         interactive=False,
+        #         wrap=True,
+        #         visible=True)
+
+        #     def filter_df(fields, model_name):
+        #         headers = check_box['essential'] + fields
+        #         df = generate_table(results, fields)
 
+        #         default_val = 'Input the Model Name (fuzzy, case insensitive)'
+        #         if model_name != default_val:
+        #             print(model_name)
+        #             model_name = model_name.lower()
+        #             method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
+        #             flag = [model_name in name for name in method_names]
+        #             df['TEMP_FLAG'] = flag
+        #             df = df[df['TEMP_FLAG'] == True]
+        #             df.pop('TEMP_FLAG')
+
+        #         comp = gr.components.DataFrame(
+        #             value=df[headers],
+        #             type='pandas',
+        #             datatype=[type_map[x] for x in headers],
+        #             interactive=False,
+        #             wrap=True,
+        #             visible=True)
+        #         return comp
+
+        #     for cbox in [checkbox_group]:
+        #         cbox.change(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)
+        #     model_name.submit(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)
 
         with gr.TabItem('🔍 About', elem_id='about', id=1):
             with open("about.md", 'r', encoding="utf-8") as file:
                 gr.Markdown(file.read())
 
+        for i, benchmark in enumerate(benchmark_list):
+            with gr.TabItem(f'📊 {benchmark} Leaderboard', elem_id=benchmark, id=i + 2):
+                if benchmark in LEADERBOARD_MD:
+                    gr.Markdown(LEADERBOARD_MD[benchmark])
 
                 s = structs[i]
+                s.table, s.check_box = BUILD_L2_DF(results, benchmark)
                 s.type_map = s.check_box['type_map']
 
                 s.checkbox_group = gr.CheckboxGroup(
                     choices=s.check_box['all'],
                     value=s.check_box['required'],
+                    label=f'{benchmark} CheckBoxes',
                     interactive=True,
                 )
                 s.headers = s.check_box['essential'] + s.checkbox_group.value
 
+                if benchmark!='SWE-bench-verified':
+                    with gr.Row():
+                        s.model_name = gr.Textbox(
+                            value='Input the Model Name (fuzzy, case insensitive)',
+                            label='Model Name',
+                            interactive=True,
+                            visible=True)
+                else:
+                    with gr.Row():
+                        s.model_name = gr.Textbox(
+                            value='Input the Agent Name (fuzzy, case insensitive)',
+                            label='Agent Name',
+                            interactive=True,
+                            visible=True)
                 s.data_component = gr.components.DataFrame(
                     value=s.table[s.headers],
                     type='pandas',
                     interactive=False,
                     wrap=True,
                     visible=True)
+                s.dataset = gr.Textbox(value=benchmark, label=benchmark, visible=False)
 
                 def filter_df_l2(dataset_name, fields, model_name):
+                    s = structs[benchmark_list.index(dataset_name)]
                     headers = s.check_box['essential'] + fields
                     df = cp.deepcopy(s.table)
+                    if dataset_name!="SWE-bench-verified":
+                        default_val = 'Input the Model Name (fuzzy, case insensitive)'
+                    else:
+                        default_val = 'Input the Agent Name (fuzzy, case insensitive)'
+
                     if model_name != default_val:
                         print(model_name)
                         model_name = model_name.lower()
+                        if dataset_name!="SWE-bench-verified":
+                            method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
+                        else:
+                            method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Agent']]
                         flag = [model_name in name for name in method_names]
                         df['TEMP_FLAG'] = flag
                         df = df[df['TEMP_FLAG'] == True]
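For context, the fuzzy search above matches the user's query against the link text of the HTML anchor in each Model/Agent cell. A minimal standalone sketch of that matching step (not part of the commit; the cell values below are hypothetical):

```python
# Sketch of the fuzzy, case-insensitive name matching used by
# filter_df / filter_df_l2 above. Cell values are hypothetical examples.
cells = [
    '<a href="https://example.com/gpt-4o">GPT-4o</a>',
    '<a href="https://example.com/llama">Llama-3.1-70B</a>',
]
query = 'GPT'  # user input from the Textbox

# Same extraction the app uses: keep the text between the last '>' and '</a>'.
names = [cell.split('</a>')[0].split('>')[-1].lower() for cell in cells]
flags = [query.lower() in name for name in names]
print(flags)  # [True, False]
```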
gen_table.py CHANGED
@@ -32,54 +32,87 @@ def format_timestamp(timestamp):
     date = timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]
     return date
 
-def BUILD_L1_DF(results, fields):
-    check_box = {}
-    check_box['essential'] = ['Model']
-    # revise there to set default dataset
-    check_box['required'] = DEFAULT_TASK
-    check_box['all'] = DEFAULT_TASK
-    type_map = defaultdict(lambda: 'number')
-    check_box['type_map'] = type_map
 
-    df = generate_table(results, fields)
-    return df, check_box
 
 
-def BUILD_L2_DF(results, task):
-    results=results[task]
     model_list=[]
-    benchmark_list=[]
-    all_fields=[]
-    for benchmark in results:
-        if benchmark!='category':
-            benchmark_list+=[benchmark]
-            if benchmark not in ["CRUXEval","AutoPenBench"]:
-                all_fields+=[benchmark]
-            else:
-                all_fields+=[benchmark+' (autonomous)', benchmark+' (assisted)']
-            model_list+=list(results[benchmark].keys())
     model_list=list(set(model_list))
 
     res = defaultdict(list)
-    res['Model']=model_list
-
-    for benchmark in benchmark_list:
-        if benchmark not in ["CRUXEval","AutoPenBench"]:
             for model in model_list:
-                if model in results[benchmark]:
-                    res[benchmark].append(results[benchmark][model])
                 else:
-                    res[benchmark].append(None)
-        else:
-            for model in model_list:
-                res[benchmark+' (autonomous)'].append(results[benchmark][model]['autonomous'])
-                res[benchmark+' (assisted)'].append(results[benchmark][model]['assisted'])
 
     df = pd.DataFrame(res)
     required_fields = all_fields
 
     check_box = {}
-    check_box['essential'] = ['Model']
     check_box['required'] = required_fields
     check_box['all'] = all_fields
     type_map = defaultdict(lambda: 'number')
 
     date = timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]
     return date
 
+# def BUILD_L1_DF(results, fields):
+#     check_box = {}
+#     check_box['essential'] = ['Model']
+#     # revise there to set default dataset
+#     check_box['required'] = DEFAULT_TASK
+#     check_box['all'] = DEFAULT_TASK
+#     type_map = defaultdict(lambda: 'number')
+#     check_box['type_map'] = type_map
 
+#     df = generate_table(results, fields)
+#     return df, check_box
 
 
+def BUILD_L2_DF(results, benchmark):
+    results=results[benchmark]
     model_list=[]
+    all_fields=list(results.keys())
+    for task in results:
+        model_list+=list(results[task].keys())
     model_list=list(set(model_list))
 
     res = defaultdict(list)
+    if benchmark not in ["RedCode","NYU CTF Bench","PrimeVul","SWE-bench-verified"]:
+        res['Model']=model_list
+    elif benchmark=="SWE-bench-verified":
+        res['Agent']=model_list
+    elif benchmark == "PrimeVul":
+        used=[]
+        for task in all_fields:
+            for model in results[task]:
+                for extra in results[task][model]:
+                    if [model,extra] not in used:
+                        res['Model'].append(model)
+                        res['Method'].append(extra)
+                        used.append([model,extra])
+    else:
+        used=[]
+        for task in all_fields:
+            for model in results[task]:
+                for extra in results[task][model]:
+                    if [model,extra] not in used:
+                        res['Model'].append(model)
+                        res['Agent'].append(extra)
+                        used.append([model,extra])
+
+    if benchmark not in ["RedCode","NYU CTF Bench",'PrimeVul']:
+        for task in all_fields:
             for model in model_list:
+                if model in results[task]:
+                    res[task].append(results[task][model])
                 else:
+                    res[task].append(None)
+    else:
+        for task in all_fields:
+            for model, extra in used:
+                if model in results[task] and extra in results[task][model]:
+                    res[task].append(results[task][model][extra])
+                else:
+                    res[task].append(None)
 
     df = pd.DataFrame(res)
+    rank_criteria=all_fields[0]
+    valid, missing = df[~pd.isna(df[rank_criteria])], df[pd.isna(df[rank_criteria])]
+    valid = valid.sort_values(rank_criteria)
+    valid = valid.iloc[::-1]
+    if len(all_fields):
+        missing = missing.iloc[::-1]
+    df = pd.concat([valid, missing])
+
     required_fields = all_fields
 
     check_box = {}
+    if benchmark=="SWE-bench-verified":
+        check_box['essential'] = ['Agent']
+    elif benchmark=='PrimeVul':
+        check_box['essential'] = ['Model','Method']
+    elif benchmark in ["RedCode","NYU CTF Bench"]:
+        check_box['essential'] = ['Model','Agent']
+    else:
+        check_box['essential'] = ['Model']
+
    check_box['required'] = required_fields
    check_box['all'] = all_fields
    type_map = defaultdict(lambda: 'number')
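The new BUILD_L2_DF handles two input shapes. An illustrative sketch (values copied from results.json in this commit): flat benchmarks map task -> model -> score and yield one row per model, while RedCode, NYU CTF Bench, and PrimeVul map task -> model -> method/agent -> score and yield one row per (model, method) pair, with an extra essential column; SWE-bench-verified stays flat but its rows are labeled Agent rather than Model.

```python
# Flat shape: one row per model, 'Model' column only.
flat = {
    "Unguided % solved": {"GPT-4o": 12.5, "Claude-3.5-Sonnet": 17.5},
}

# Nested shape (e.g. NYU CTF Bench): one row per (model, agent) pair,
# so the table gets essential columns ['Model', 'Agent'].
nested = {
    "Pass@1": {
        "Claude-3.5-Sonnet": {"D-CIPHER": 19.00, "EnIGMA": 13.50},
        "GPT-4": {"EnIGMA": 7.00},
    },
}
```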
meta_data.py CHANGED
@@ -11,7 +11,7 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 # CONSTANTS-TEXT
 LEADERBORAD_INTRODUCTION = """# Cybersecurity Leaderboard
 ### Welcome to the Cybersecurity Leaderboard! This leaderboard is a collection of benchmarks relevant to cybersecurity capabilities.
-This leaderboard covers {} benchmarks across {} aspects of cybersecurity work.
 
 This leaderboard was last updated: {} """
 # CONSTANTS-FIELDS
@@ -20,35 +20,69 @@ This leaderboard was last updated: {} """
 # ]
 
 DEFAULT_TASK = [
-    'Vulnerable code generation', 'Attack generation', 'CTF', 'Cyber knowledge', 'Pen test', 'Vulnerability detection', 'PoC generation', 'Patching'
 ]
-MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
 
 # The README file for each benchmark
 LEADERBOARD_MD = {}
 
-LEADERBOARD_MD['MAIN'] = """
-## Main Evaluation Results
 
-- Metrics:
-  - Avg Score: The average score on {} Cybersecurity Benchmarks (normalized to 0 - 100, the higher the better).
-  - Avg Rank: The average rank on {} Cybersecurity Benchmarks (the lower the better).
-  - Avg Score & Rank are calculated based on selected benchmark. **When results for some selected benchmarks are missing, Avg Score / Rank will be None!!!**
 """
-LEADERBOARD_MD['Vulnerable code generation'] = """Need to add a description
 """
-LEADERBOARD_MD['Attack generation'] = """Need to add a description
 """
-LEADERBOARD_MD['CTF'] = """Need to add a description
 """
-LEADERBOARD_MD['Cyber knowledge'] = """Need to add a description
 """
-LEADERBOARD_MD['Pen test'] = """Need to add a description
 """
-LEADERBOARD_MD['Vulnerability detection'] = """Need to add a description
 """
-LEADERBOARD_MD['PoC generation'] = """Need to add a description
 """
-LEADERBOARD_MD['Patching'] = """Need to add a description
 """
 
 # CONSTANTS-TEXT
 LEADERBORAD_INTRODUCTION = """# Cybersecurity Leaderboard
 ### Welcome to the Cybersecurity Leaderboard! This leaderboard is a collection of benchmarks relevant to cybersecurity capabilities.
+This leaderboard covers {} benchmarks.
 
 This leaderboard was last updated: {} """
 # CONSTANTS-FIELDS
 # ]
 
 DEFAULT_TASK = [
+    'Vulnerable Code Generation', 'Attack Generation', 'CTF', 'Cyber Knowledge', 'Pen Test', 'Vulnerability Detection', 'PoC Generation', 'Patching'
 ]
 
 # The README file for each benchmark
 LEADERBOARD_MD = {}
 
+LEADERBOARD_MD['CyberSecEval-3'] = """CyberSecEval-3 is a suite of security benchmarks for LLMs. It assesses 8 different risks across two broad categories: risk to third parties, and risk to application developers and end users.
+
+Paper: https://arxiv.org/abs/2408.01605
+Code: https://github.com/meta-llama/PurpleLlama/tree/main/CybersecurityBenchmarks
"""
+LEADERBOARD_MD['SecCodePLT'] = """SecCodePLT is a unified and comprehensive evaluation platform for code GenAIs' risks. The benchmark consists of insecure coding tasks and cyberattack helpfulness tasks; the helpfulness tasks are designed around five attack steps: reconnaissance, weaponization & infiltration, C2 & execution, discovery, and collection.
+
+Paper: https://arxiv.org/abs/2410.11096
+Code: https://github.com/CodeSecPLT/CodeSecPLT
"""
+LEADERBOARD_MD['RedCode'] = """RedCode is a benchmark for risky code execution and generation: (1) RedCode-Exec provides challenging prompts that could lead to risky code execution, aiming to evaluate code agents' ability to recognize and handle unsafe code. (2) RedCode-Gen provides 160 prompts with function signatures and docstrings as input, to assess whether code agents will follow instructions to generate harmful code or software.
+
+Paper: https://arxiv.org/abs/2411.07781
+Code: https://github.com/AI-secure/RedCode
"""
+LEADERBOARD_MD['CyBench'] = """Cybench is a framework for specifying cybersecurity tasks and evaluating agents on those tasks. It includes 40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties.
+
+Paper: https://arxiv.org/abs/2408.08926
+Code: https://github.com/andyzorigin/cybench
"""
+LEADERBOARD_MD['NYU CTF Bench'] = """NYU CTF Bench assesses LLMs on solving CTF challenges, covering a diverse range of challenges from popular competitions.
+
+Paper: https://arxiv.org/abs/2406.05590
+Code: https://github.com/NYU-LLM-CTF/NYU_CTF_Bench
"""
+LEADERBOARD_MD['CyberBench'] = """CyberBench is a multi-task benchmark for evaluating model knowledge in cybersecurity.
+
+Paper: https://zefang-liu.github.io/files/liu2024cyberbench_paper.pdf
+Code: https://github.com/jpmorganchase/CyberBench
"""
+LEADERBOARD_MD['CyberMetric'] = """CyberMetric is designed to accurately test the general knowledge of LLMs in cybersecurity. CyberMetric-80, CyberMetric-500, CyberMetric-2000, and CyberMetric-10000 are multiple-choice Q&A benchmark datasets comprising 80, 500, 2,000, and 10,000 questions, respectively.
+
+Paper: https://arxiv.org/abs/2402.07688
+Code: https://github.com/cybermetric/CyberMetric/tree/main
"""
+LEADERBOARD_MD['TACTL'] = """Threat Actor Competency Test for LLMs (TACTL) is a challenging multiple-choice benchmark of offensive cyber knowledge.
+
+Paper: https://arxiv.org/abs/2502.15797
+Code: the authors plan to open-source TACTL (https://gbhackers.com/mitre-releases-occult-framework/).
"""
+LEADERBOARD_MD['AutoPenBench'] = """AutoPenBench is an open benchmark for evaluating generative agents in automated penetration testing.
+
+Paper: https://arxiv.org/abs/2410.03225
+Code: https://github.com/lucagioacchini/auto-pen-bench
"""
+LEADERBOARD_MD['PrimeVul'] = """PrimeVul is a dataset for training and evaluating code LMs for vulnerability detection.
+
+Paper: https://arxiv.org/abs/2403.18624
+Code: https://github.com/DLVulDet/PrimeVul
"""
+LEADERBOARD_MD['CRUXEval'] = """CRUXEval (Code Reasoning, Understanding, and eXecution Evaluation) is a benchmark consisting of 800 Python functions (3-13 lines each).
+
+Paper: https://arxiv.org/abs/2401.03065
+Code: https://github.com/facebookresearch/cruxeval
"""
+LEADERBOARD_MD['SWE-bench-verified'] = """SWE-bench Verified is a human-validated subset of SWE-bench that more reliably evaluates AI models' ability to solve real-world software issues.
+
+Paper: https://openai.com/index/introducing-swe-bench-verified/
+Code: https://github.com/swe-bench/SWE-bench
"""
results.json CHANGED
@@ -1,135 +1,801 @@
 {
-    "time": "20250418",
     "results": {
-        "Vulnerable code generation": {
-            "category": "attack",
-            "CyberSecEval-3":{
-                "GPT-4": 35,
-                "Llama-3.1-405B": 39,
-                "Llama-3.1-70B": 35
-            },
-            "SecCodePLT":{
-                "GPT-4o": 44,
-                "Llama-3.1-70B": 47
             }
         },
-        "Attack generation": {
-            "category": "attack",
-            "CyberSecEval-3":{
-                "GPT-4": 40,
-                "Llama-3.1-405B": 49,
-                "Llama-3.1-70B": 41
-            },
-            "SecCodePLT":{
-                "GPT-4o": 0.2,
-                "Claude-3.5-Sonnet": 0.2,
-                "Llama-3.1-70B": 0
-            },
-            "RedCode-Gen": {
-                "GPT-3.5": 32.5,
-                "GPT-4": 66.9,
-                "GPT-4o": 72.5,
-                "Llama-2-7B": 20.7
-            },
-            "RedCode-Exec": {
-                "GPT-4": 64.5,
-                "GPT-4o": 77.23,
-                "Claude-3.5-Sonnet": 67.63,
-                "Llama-3.1-70B": 76.70,
-                "Llama-3.1-8B": 62.87
             }
         },
-        "CTF": {
-            "category": "attack",
-            "CyBench": {
             "GPT-4o": 12.5,
             "GPT-4.5-preview": 17.5,
             "o1-preview": 10.0,
             "o3-mini": 22.5,
             "Claude-3.5-Sonnet": 17.5,
             "Claude-3.7-Sonnet": 20,
             "Gemini-1.5-pro": 7.5,
             "Llama-3.1-405B": 7.5,
-            "Llama-3.1-70B": 5.0
             },
-            "NYU": {
-                "GPT-4": 7.00,
-                "GPT-4o": 9.50,
-                "Claude-3.5-Sonnet": 13.50
             }
         },
-        "Cyber knowledge": {
-            "category": "attack",
-            "CyberBench": {
-                "GPT-3.5": 62.6,
-                "GPT-4": 69.9,
-                "Llama-2-7B": 50.6
-            },
-            "CyberMetric": {
-                "GPT-3.5": 88.10,
-                "GPT-4": 91.00,
             "GPT-4o": 91.25,
-            "Gemini-1.0-pro": 84.00,
-            "Llama-3-8B": 73.05,
-            "Llama-2-70B": 72.60
             },
-            "TACTL": {
-                "GPT-4o": 85.2,
             "DeepSeek-R1": 91.8,
             "DeepSeek-V3": 86.3,
             "Llama-3.1-405B": 88.5,
-            "Llama-3.3-70B": 78.7
             }
         },
-        "Pen test": {
-            "category": "defense",
-            "AutoPenBench": {
-                "GPT-4o": {
-                    "autonomous": 21.00,
-                    "assisted": 64.00
-                }
-            }
-        },
-        "Vulnerability detection": {
-            "category": "defense",
-            "PrimeVul": {
-                "GPT-3.5": 6.21,
-                "GPT-4": 12.94
             }
         },
-        "PoC generation": {
-            "category": "defense",
-            "CRUXEval": {
             "GPT-3.5": {
-                "autonomous": 49.1,
-                "assisted": 63.3
             },
             "GPT-4": {
-                "autonomous": 74.8,
-                "assisted": 81.9
-            },
-            "Code-Llama-13B": {
-                "autonomous": 39.1,
-                "assisted": 39.3
-            },
-            "Code-Llama-34B": {
-                "autonomous": 50.4,
-                "assisted": 46.0
             }
         },
-        "Patching": {
-            "category": "defense",
-            "SWE-bench-verified": {
-                "GPT-3.5": 0.4,
-                "GPT-4": 22.4,
-                "GPT-4o": 38.8,
-                "o1": 48.9,
-                "o3-mini": 49.3,
-                "Claude-3.5-Sonnet": 49.0,
-                "Claude-3.7-Sonnet": 70.3,
-                "DeepSeek-V3": 42.0,
-                "DeepSeek-R1": 49.2
             }
         }
     }
 }
 
 {
+    "time": "20250422",
     "results": {
+        "CyberSecEval-3":{
+            "Social engineering":{
+                "GPT-4-Turbo": 79.6,
+                "Qwen2-72B-Instruct": 70.4,
+                "Llama-3-70B": 59,
+                "Llama-3-405B": 52,
+                "Mixtral-8x22B": 33.6
+            },
+            "Software vulnerability exploitation":{
+                "GPT-4-Turbo": 40,
+                "Gemini Pro 1.0": 29,
+                "Llama-3-70B": 41,
+                "Llama-3-405B": 49,
+                "Mixtral-8x22B": 35
+            },
+            "Prompt injection attack success rates": {
+                "GPT-4-Turbo": 17,
+                "Gemini Pro 1.0": 18,
+                "Llama-3-70B": 26,
+                "Llama-3-405B": 22,
+                "Mixtral-8x22B": 35,
+                "Qwen2-72B-Instruct": 20
+            },
+            "Autocomplete-based insecure code generation": {
+                "GPT-4-Turbo": 30,
+                "Gemini Pro 1.0": 25,
+                "Llama-3-70B": 28,
+                "Llama-3-405B": 31,
+                "Mixtral-8x22B": 25,
+                "Qwen2-72B-Instruct": 30
+            },
+            "Instruction-based insecure code generation": {
+                "GPT-4-Turbo": 35,
+                "Gemini Pro 1.0": 32,
+                "Llama-3-70B": 35,
+                "Llama-3-405B": 39,
+                "Mixtral-8x22B": 34,
+                "Qwen2-72B-Instruct": 34
+            },
+            "Code interpreter abuse compliance rates":{
+                "GPT-4-Turbo": 1,
+                "Gemini Pro 1.0": 11,
+                "Llama-3-70B": 42,
+                "Llama-3-405B": 1,
+                "Mixtral-8x22B": 20,
+                "Qwen2-72B-Instruct": 5
+            },
+            "Cyber attack helpfulness compliance rates": {
+                "GPT-4-Turbo": 42,
+                "Gemini Pro 1.0": 92,
+                "Llama-3-70B": 78,
+                "Llama-3-405B": 38,
+                "Mixtral-8x22B": 80,
+                "Qwen2-72B-Instruct": 47
             }
         },
+        "SecCodePLT": {
+            "Secure instruction generation failure rates w/o security policy (rule-based metric)": {
+                "CodeLlama-34B-Instruct": 66,
+                "Llama-3.1-70B": 47,
+                "Mixtral-8x22B": 58,
+                "GPT-4o": 44
+            },
+            "Secure instruction generation failure rates w/ security policy (rule-based metric)": {
+                "CodeLlama-34B-Instruct": 41,
+                "Llama-3.1-70B": 17,
+                "Mixtral-8x22B": 30,
+                "GPT-4o": 16
+            },
+            "Secure instruction generation failure rates w/o security policy (Pass@1)": {
+                "CodeLlama-34B-Instruct": 77,
+                "Llama-3.1-70B": 62,
+                "Mixtral-8x22B": 66,
+                "GPT-4o": 47
+            },
+            "Secure instruction generation failure rates w/ security policy (Pass@1)": {
+                "CodeLlama-34B-Instruct": 70,
+                "Llama-3.1-70B": 53,
+                "Mixtral-8x22B": 58,
+                "GPT-4o": 38
+            },
+            "Secure code completion failure rates w/o security policy (rule-based metric)": {
+                "CodeLlama-34B-Instruct": 78,
+                "Llama-3.1-70B": 54,
+                "Mixtral-8x22B": 66,
+                "GPT-4o": 48
+            },
+            "Secure code completion failure rates w/ security policy (rule-based metric)": {
+                "CodeLlama-34B-Instruct": 59,
+                "Llama-3.1-70B": 23,
+                "Mixtral-8x22B": 61,
+                "GPT-4o": 21
+            },
+            "Secure code completion failure rates w/o security policy (Pass@1)": {
+                "CodeLlama-34B-Instruct": 77,
+                "Llama-3.1-70B": 57,
+                "Mixtral-8x22B": 69,
+                "GPT-4o": 44
+            },
+            "Secure code completion failure rates w/ security policy (Pass@1)": {
+                "CodeLlama-34B-Instruct": 75,
+                "Llama-3.1-70B": 45,
+                "Mixtral-8x22B": 56,
+                "GPT-4o": 34
+            },
+            "Reconnaissance": {
+                "GPT-4o": 52,
+                "Claude-3.5-Sonnet": 46,
+                "LLaMA-3.1-70B": 10
+            },
+            "Weaponization & Infiltration": {
+                "GPT-4o": 10,
+                "Claude-3.5-Sonnet": 0,
+                "LLaMA-3.1-70B": 0
+            },
+            "C2 & Execution": {
+                "GPT-4o": 52,
+                "Claude-3.5-Sonnet": 46,
+                "LLaMA-3.1-70B": 10
+            },
+            "Discovery": {
+                "GPT-4o": 82,
+                "Claude-3.5-Sonnet": 88,
+                "LLaMA-3.1-70B": 60
+            },
+            "Collection": {
+                "GPT-4o": 86,
+                "Claude-3.5-Sonnet": 92,
+                "LLaMA-3.1-70B": 28
             }
         },
+        "RedCode": {
+            "RedCode-Gen (Accuracy)": {
+                "GPT-4o": {
+                    "Base LLM": 69.4,
+                    "Code Agent": 72.5
+                },
+                "GPT-4": {
+                    "Base LLM": 65.0,
+                    "Code Agent": 66.9
+                },
+                "GPT-3.5": {
+                    "Base LLM": 0.0,
+                    "Code Agent": 32.5
+                },
+                "Claude-3-Opus": {
+                    "Base LLM": 1.3,
+                    "Code Agent": 3.1
+                },
+                "DeepSeekCoder-6.7B": {
+                    "Base LLM": 49.4,
+                    "Code Agent": 79.4
+                },
+                "CodeLlama-7B": {
+                    "Base LLM": 40.0,
+                    "Code Agent": 42.0
+                },
+                "CodeLlama-13B": {
+                    "Base LLM": 49.4,
+                    "Code Agent": 66.3
+                },
+                "Llama-2-7B": {
+                    "Base LLM": 16.9,
+                    "Code Agent": 20.7
+                },
+                "Mistral-7B": {
+                    "Base LLM": 46.3,
+                    "Code Agent": 75.3
+                }
+            },
+            "RedCode-Exec: Python (Attack success rate)": {
+                "GPT-4o": {
+                    "ReAct": 77.23
+                },
+                "GPT-4": {
+                    "ReAct": 64.50
+                },
+                "GPT-3.5": {
+                    "ReAct": 76.23
+                },
+                "Claude-3.5-Sonnet": {
+                    "ReAct": 67.63
+                },
+                "DeepSeekCoder-6.7B": {
+                    "ReAct": 80.23,
+                    "OCI": 48.87
+                },
+                "DeepSeekCoder-v2-lite": {
+                    "ReAct": 79.77
+                },
+                "CodeQwen1.5-7B-Chat": {
+                    "ReAct": 77.57
+                },
+                "Llama-3.1-70B-Instruct": {
+                    "ReAct": 76.7
+                },
+                "Llama-3.1-8B-Instruct": {
+                    "ReAct": 62.87
+                },
+                "Llama-3-8B-Instruct": {
+                    "ReAct": 42.50
+                },
+                "CodeLlama-13B": {
+                    "CodeAct": 71.87,
+                    "ReAct": 60.13,
+                    "OCI": 49.07
+                },
+                "CodeLlama-7B": {
+                    "CodeAct": 61.83,
+                    "ReAct": 58.43,
+                    "OCI": 46.80
+                },
+                "Llama-2-7B": {
+                    "CodeAct": 69.95
+                },
+                "Mistral-7B": {
+                    "CodeAct": 62.60
+                }
+            },
+            "RedCode-Exec: Bash (Attack success rate)": {
+                "GPT-4o": {
+                    "ReAct": 72.83
+                },
+                "GPT-4": {
+                    "ReAct": 61.96
+                },
+                "GPT-3.5": {
+                    "ReAct": 70.38
+                },
+                "Claude-3.5-Sonnet": {
+                    "ReAct": 62.67
+                },
+                "DeepSeekCoder-6.7B": {
+                    "ReAct": 73.17
+                },
+                "DeepSeekCoder-v2-lite": {
+                    "ReAct": 68.42
+                },
+                "CodeQwen1.5-7B-Chat": {
+                    "ReAct": 71.92
+                },
+                "Llama-3.1-70B-Instruct": {
+                    "ReAct": 74.38
+                },
+                "Llama-3-8B-Instruct": {
+                    "ReAct": 62.25
+                },
+                "Llama-3.1-8B-Instruct": {
+                    "ReAct": 59.83
+                },
+                "CodeLlama-13B": {
+                    "ReAct": 65.25
+                },
+                "CodeLlama-7B": {
+                    "ReAct": 56.21
+                }
+            }
+        },
+        "CyBench": {
+            "Unguided % solved":{
             "GPT-4o": 12.5,
             "GPT-4.5-preview": 17.5,
             "o1-preview": 10.0,
+            "o1-mini": 10.0,
             "o3-mini": 22.5,
+            "Claude-3-Opus": 10.0,
             "Claude-3.5-Sonnet": 17.5,
             "Claude-3.7-Sonnet": 20,
             "Gemini-1.5-pro": 7.5,
             "Llama-3.1-405B": 7.5,
+            "Mixtral-8x22B": 7.5,
+            "Gemini 1.5 Pro": 7.5,
+            "Llama-3-70B": 5.0
            },
+            "Subtask-guided % solved": {
+                "Claude-3.5-Sonnet": 15.0,
+                "GPT-4o": 17.5,
+                "Claude-3-Opus": 12.5,
+                "o1-preview": 10.0,
+                "Llama-3.1-405B": 15.0,
+                "Mixtral-8x22B": 5.0,
+                "Gemini 1.5 Pro": 5.0,
+                "Llama-3-70B": 7.5
+            },
+            "Subtasks % solved": {
+                "Claude-3.5-Sonnet": 43.9,
+                "GPT-4o": 28.7,
+                "Claude-3-Opus": 36.8,
+                "o1-preview": 46.8,
+                "Llama-3.1-405B": 20.5,
+                "Mixtral-8x22B": 15.2,
+                "Gemini 1.5 Pro": 11.7,
+                "Llama-3-70B": 8.2
+            }
+        },
+        "NYU CTF Bench": {
+            "Pass@1": {
+                "Claude-3.5-Sonnet": {
+                    "D-CIPHER": 19.00,
+                    "EnIGMA": 13.50
+                },
+                "GPT-4o": {
+                    "D-CIPHER": 10.50,
+                    "EnIGMA": 9.50
+                },
+                "GPT-4": {
+                    "EnIGMA": 7.00
+                }
+            }
+        },
+        "CyberBench": {
+            "Average": {
+                "Falcon-7B": 39.4,
+                "Falcon-7B-Instruct": 37.5,
+                "Vicuna-7B-v1.5": 53.0,
+                "Mistral-7B-v0.1": 58.1,
+                "Mistral-7B-Instruct-v0.1": 55.0,
+                "Zephyr-7B-beta": 57.7,
+                "Llama-2-7B": 50.6,
+                "Llama-2-7B-Chat": 44.6,
+                "Vicuna-13B-v1.5": 57.3,
+                "Llama-2-13B": 54.1,
+                "Llama-2-13B-Chat": 45.0,
+                "GPT-3.5-Turbo": 62.6,
+                "GPT-4": 69.6
+            },
+            "CyNER (F1)": {
+                "Falcon-7B": 24.1,
+                "Falcon-7B-Instruct": 20.4,
+                "Vicuna-7B-v1.5": 25.8,
+                "Mistral-7B-v0.1": 36.7,
+                "Mistral-7B-Instruct-v0.1": 32.3,
+                "Zephyr-7B-beta": 30.0,
+                "Llama-2-7B": 26.3,
+                "Llama-2-7B-Chat": 22.7,
+                "Vicuna-13B-v1.5": 26.2,
+                "Llama-2-13B": 28.6,
+                "Llama-2-13B-Chat": 27.5,
+                "GPT-3.5-Turbo": 33.4,
+                "GPT-4": 55.4
+            },
+            "APTNER (F1)": {
+                "Falcon-7B": 17.7,
+                "Falcon-7B-Instruct": 19.1,
+                "Vicuna-7B-v1.5": 27.5,
+                "Mistral-7B-v0.1": 33.0,
+                "Mistral-7B-Instruct-v0.1": 26.2,
+                "Zephyr-7B-beta": 30.5,
+                "Llama-2-7B": 28.0,
+                "Llama-2-7B-Chat": 25.4,
+                "Vicuna-13B-v1.5": 28.1,
+                "Llama-2-13B": 29.9,
+                "Llama-2-13B-Chat": 28.2,
+                "GPT-3.5-Turbo": 40.9,
+                "GPT-4": 50.0
+            },
+            "CyNews (R-1/2/L)": {
+                "Falcon-7B": "1.0/0.8/1.0",
+                "Falcon-7B-Instruct": "7.2/2.7/6.0",
+                "Vicuna-7B-v1.5": "36.1/15.9/31.2",
+                "Mistral-7B-v0.1": "3.4/1.7/3.0",
+                "Mistral-7B-Instruct-v0.1": "28.7/11.8/24.5",
+                "Zephyr-7B-beta": "32.0/12.8/27.4",
+                "Llama-2-7B": "0.3/0.3/0.3",
+                "Llama-2-7B-Chat": "25.2/9.6/21.6",
+                "Vicuna-13B-v1.5": "35.6/15.6/30.9",
+                "Llama-2-13B": "0.6/0.5/0.6",
+                "Llama-2-13B-Chat": "3.5/1.3/2.9",
+                "GPT-3.5-Turbo": "35.5/15.4/30.3",
+                "GPT-4": "35.9/15.5/31.2"
            },
+            "SecMMLU (Accuracy)": {
+                "Falcon-7B": 27.0,
+                "Falcon-7B-Instruct": 25.0,
+                "Vicuna-7B-v1.5": 64.0,
+                "Mistral-7B-v0.1": 76.0,
+                "Mistral-7B-Instruct-v0.1": 72.0,
+                "Zephyr-7B-beta": 74.0,
+                "Llama-2-7B": 63.0,
+                "Llama-2-7B-Chat": 60.0,
+                "Vicuna-13B-v1.5": 66.0,
+                "Llama-2-13B": 67.0,
+                "Llama-2-13B-Chat": 64.0,
+                "GPT-3.5-Turbo": 78.0,
+                "GPT-4": 83.0
+            },
+            "CyQuiz (Accuracy)": {
+                "Falcon-7B": 27.0,
+                "Falcon-7B-Instruct": 21.0,
+                "Vicuna-7B-v1.5": 66.0,
+                "Mistral-7B-v0.1": 77.0,
+                "Mistral-7B-Instruct-v0.1": 69.0,
+                "Zephyr-7B-beta": 75.0,
+                "Llama-2-7B": 62.0,
+                "Llama-2-7B-Chat": 56.0,
+                "Vicuna-13B-v1.5": 74.0,
+                "Llama-2-13B": 67.0,
+                "Llama-2-13B-Chat": 65.0,
+                "GPT-3.5-Turbo": 83.0,
+                "GPT-4": 81.0
+            },
+            "MITRE (Accuracy)": {
+                "Falcon-7B": 34.9,
+                "Falcon-7B-Instruct": 30.4,
+                "Vicuna-7B-v1.5": 43.5,
+                "Mistral-7B-v0.1": 50.2,
+                "Mistral-7B-Instruct-v0.1": 47.3,
+                "Zephyr-7B-beta": 43.5,
+                "Llama-2-7B": 44.6,
+                "Llama-2-7B-Chat": 41.6,
+                "Vicuna-13B-v1.5": 47.3,
+                "Llama-2-13B": 47.5,
+                "Llama-2-13B-Chat": 42.7,
+                "GPT-3.5-Turbo": 54.5,
+                "GPT-4": 64.9
+            },
+            "CVE (Accuracy)": {
+                "Falcon-7B": 54.6,
+                "Falcon-7B-Instruct": 52.9,
+                "Vicuna-7B-v1.5": 60.0,
+                "Mistral-7B-v0.1": 64.6,
+                "Mistral-7B-Instruct-v0.1": 58.7,
+                "Zephyr-7B-beta": 61.9,
+                "Llama-2-7B": 64.7,
+                "Llama-2-7B-Chat": 52.5,
+                "Vicuna-13B-v1.5": 62.3,
+                "Llama-2-13B": 62.1,
+                "Llama-2-13B-Chat": 42.0,
+                "GPT-3.5-Turbo": 58.0,
+                "GPT-4": 63.0
+            },
+            "Web (F1)": {
+                "Falcon-7B": 68.9,
+                "Falcon-7B-Instruct": 59.5,
+                "Vicuna-7B-v1.5": 75.3,
+                "Mistral-7B-v0.1": 91.9,
+                "Mistral-7B-Instruct-v0.1": 87.2,
+                "Zephyr-7B-beta": 85.2,
+                "Llama-2-7B": 79.9,
+                "Llama-2-7B-Chat": 48.4,
+                "Vicuna-13B-v1.5": 82.6,
+                "Llama-2-13B": 89.3,
+                "Llama-2-13B-Chat": 58.8,
+                "GPT-3.5-Turbo": 89.2,
+                "GPT-4": 95.4
+            },
+            "Email (F1)": {
+                "Falcon-7B": 93.3,
+                "Falcon-7B-Instruct": 93.5,
+                "Vicuna-7B-v1.5": 86.4,
+                "Mistral-7B-v0.1": 96.4,
+                "Mistral-7B-Instruct-v0.1": 88.9,
+                "Zephyr-7B-beta": 86.7,
+                "Llama-2-7B": 94.2,
+                "Llama-2-7B-Chat": 79.4,
+                "Vicuna-13B-v1.5": 86.5,
+                "Llama-2-13B": 96.4,
+                "Llama-2-13B-Chat": 70.3,
+                "GPT-3.5-Turbo": 78.9,
+                "GPT-4": 93.9
+            },
+            "HTTP (F1)": {
+                "Falcon-7B": 45.2,
+                "Falcon-7B-Instruct": 48.3,
+                "Vicuna-7B-v1.5": 53.7,
+                "Mistral-7B-v0.1": 52.6,
+                "Mistral-7B-Instruct-v0.1": 47.2,
+                "Zephyr-7B-beta": 66.2,
+                "Llama-2-7B": 42.8,
+                "Llama-2-7B-Chat": 41.0,
+                "Vicuna-13B-v1.5": 72.3,
+                "Llama-2-13B": 52.5,
+                "Llama-2-13B-Chat": 48.5,
+                "GPT-3.5-Turbo": 83.1,
+                "GPT-4": 84.1
            }
        },
+        "CyberMetric":{
+            "80 Q (Accuracy)": {
+                "GPT-4o": 96.25,
+                "Mixtral-8x7B-Instruct": 92.50,
+                "GPT-4-Turbo": 96.25,
+                "Falcon-180B-Chat": 90.00,
+                "GPT-3.5-Turbo": 90.00,
+                "Gemini Pro 1.0": 90.00,
+                "Mistral-7B-Instruct-v0.2": 78.75,
+                "Gemma-1.1-7B": 82.50,
+                "Llama-3-8B-Instruct": 81.25,
+                "Flan-T5-XXL": 81.94,
+                "Llama 2-70B": 75.00,
+                "Zephyr-7B-beta": 80.94,
+                "Qwen1.5-MoE-A2.7B": 62.50,
+                "Qwen1.5-7B": 73.75,
+                "Qwen-7B": 43.75,
+                "Phi-2": 53.75,
+                "Llama3-ChatQA-1.5-8B": 53.75,
+                "DeciLM-7B": 52.50,
+                "Qwen1.5-4B": 36.25,
+                "Genstruct-7B": 38.75,
+                "Llama-3-8B": 38.75,
+                "Gemma-7B": 42.50,
+                "Dolly V2 12b BF16": 33.75,
+                "Gemma-2B": 25.00,
+                "Phi-3-mini-4k-Instruct": 5.00
+            },
+            "500 Q (Accuracy)": {
+                "GPT-4o": 93.40,
+                "Mixtral-8x7B-Instruct": 91.80,
+                "GPT-4-Turbo": 93.30,
+                "Falcon-180B-Chat": 87.80,
+                "GPT-3.5-Turbo": 87.30,
+                "Gemini Pro 1.0": 85.05,
+                "Mistral-7B-Instruct-v0.2": 78.40,
+                "Gemma-1.1-7B": 75.40,
+                "Llama-3-8B-Instruct": 76.20,
+                "Flan-T5-XXL": 71.10,
+                "Llama 2-70B": 73.40,
+                "Zephyr-7B-beta": 76.40,
+                "Qwen1.5-MoE-A2.7B": 64.60,
+                "Qwen1.5-7B": 60.60,
+                "Qwen-7B": 58.00,
+                "Phi-2": 48.00,
+                "Llama3-ChatQA-1.5-8B": 52.80,
+                "DeciLM-7B": 47.20,
+                "Qwen1.5-4B": 41.20,
+                "Genstruct-7B": 40.60,
+                "Llama-3-8B": 35.80,
+                "Gemma-7B": 37.20,
+                "Dolly V2 12b BF16": 30.00,
+                "Gemma-2B": 23.20,
+                "Phi-3-mini-4k-Instruct": 5.00
+            },
+            "2k Q (Accuracy)": {
             "GPT-4o": 91.25,
+            "Mixtral-8x7B-Instruct": 91.10,
+            "GPT-4-Turbo": 91.00,
+            "Falcon-180B-Chat": 87.10,
+            "GPT-3.5-Turbo": 88.10,
+            "Gemini Pro 1.0": 84.00,
+            "Mistral-7B-Instruct-v0.2": 76.40,
+            "Gemma-1.1-7B": 75.75,
+            "Llama-3-8B-Instruct": 73.75,
+            "Flan-T5-XXL": 69.00,
+            "Llama 2-70B": 71.60,
+            "Zephyr-7B-beta": 72.50,
+            "Qwen1.5-MoE-A2.7B": 61.65,
+            "Qwen1.5-7B": 61.35,
+            "Qwen-7B": 55.75,
+            "Phi-2": 52.90,
+            "Llama3-ChatQA-1.5-8B": 49.45,
+            "DeciLM-7B": 50.44,
+            "Qwen1.5-4B": 40.50,
+            "Genstruct-7B": 37.55,
+            "Llama-3-8B": 37.00,
+            "Gemma-7B": 36.00,
+            "Dolly V2 12b BF16": 28.75,
+            "Gemma-2B": 18.20,
+            "Phi-3-mini-4k-Instruct": 4.41
            },
+            "10k Q (Accuracy)": {
+                "GPT-4o": 88.89,
+                "Mixtral-8x7B-Instruct": 87.00,
+                "GPT-4-Turbo": 88.50,
+                "Falcon-180B-Chat": 87.00,
+                "GPT-3.5-Turbo": 80.30,
+                "Gemini Pro 1.0": 87.50,
+                "Mistral-7B-Instruct-v0.2": 74.82,
+                "Gemma-1.1-7B": 73.32,
+                "Llama-3-8B-Instruct": 71.25,
+                "Flan-T5-XXL": 67.50,
+                "Llama 2-70B": 66.10,
+                "Zephyr-7B-beta": 65.00,
+                "Qwen1.5-MoE-A2.7B": 60.73,
+                "Qwen1.5-7B": 59.79,
+                "Qwen-7B": 54.09,
+                "Phi-2": 52.13,
+                "Llama3-ChatQA-1.5-8B": 49.64,
+                "DeciLM-7B": 50.75,
+                "Qwen1.5-4B": 40.29,
+                "Genstruct-7B": 36.93,
+                "Llama-3-8B": 36.00,
+                "Gemma-7B": 34.28,
+                "Dolly V2 12b BF16": 27.00,
+                "Gemma-2B": 19.18,
+                "Phi-3-mini-4k-Instruct": 4.80
+            }
+        },
+        "TACTL": {
+            "Ground2Crown": {
+                "DeepSeek-R1": 100,
+                "DeepSeek-V3": 100,
+                "GPT-4o": 93.3,
+                "Llama-3.1-405B": 93.3,
+                "Qwen2.5-72B-Instruct": 93.3,
+                "Llama-3.1-Tulu-3-70B": 83.3,
+                "Llama-3.3-70B": 80.0,
+                "Mixtral-8x22B": 60.0
+            },
+            "TACTL-183": {
             "DeepSeek-R1": 91.8,
             "DeepSeek-V3": 86.3,
+            "GPT-4o": 85.2,
             "Llama-3.1-405B": 88.5,
+            "Qwen2.5-72B-Instruct": 84.2,
+            "Llama-3.1-Tulu-3-70B": 81.4,
+            "Llama-3.3-70B": 78.7,
+            "Mixtral-8x22B": 65.0
            }
        },
+        "AutoPenBench": {
+            "Autonomous (Success rate)": {
+                "GPT-4o": 21
+            },
+            "Autonomous (Progress rate)": {
+                "GPT-4o": 39
+            },
+            "Assisted (Success rate)": {
+                "GPT-4o": 64
+            },
+            "Assisted (Progress rate)": {
+                "GPT-4o": 53
+            }
+        },
+        "PrimeVul": {
+            "Pair-wise Correct Prediction": {
             "GPT-3.5": {
+                "Two-shot": 5.67,
+                "CoT": 6.21,
+                "Fine-tune": 1.24
            },
             "GPT-4": {
+                "Two-shot": 5.14,
+                "CoT": 12.94
            }
+            }
+        },
+        "CRUXEval": {
+            "Input Prediction (Pass@1)": {
+                "CodeLlama-7B": 36.6,
+                "CodeLlama-13B": 39.0,
+                "CodeLlama-34B": 46.5,
+                "CodeLlama-7B-Python": 36.3,
+                "CodeLlama-13B-Python": 40.5,
+                "CodeLlama-34B-Python": 41.5,
+                "StarCoderBase-7B": 30.0,
+                "StarCoderBase-15.5B": 31.6,
+                "WizardCoder-13B": 39.2,
+                "WizardCoder-34B": 42.8,
+                "Phi-1": 13.9,
+                "Phi-1.5": 24.1,
+                "Phind v2": 47.9,
+                "DeepSeek-Coder-6.7B-Base": 41.1,
+                "DeepSeek-Coder-33B-Base": 46.6,
+                "DeepSeek-Coder-6.7B-Instruct": 36.6,
+                "DeepSeek-Coder-33B-Instruct": 47.4,
+                "Mistral-7B": 36.0,
+                "GPT-3.5": 49.2,
+                "GPT-4": 67.1
+            },
+            "Input Prediction (Pass@5)": {
+                "CodeLlama-7B": 55.2,
+                "CodeLlama-13B": 58.2,
+                "CodeLlama-34B": 64.7,
+                "CodeLlama-7B-Python": 56.0,
+                "CodeLlama-13B-Python": 58.0,
+                "CodeLlama-34B-Python": 59.2,
+                "StarCoderBase-7B": 48.9,
+                "StarCoderBase-15.5B": 49.5,
+                "WizardCoder-13B": 54.8,
+                "WizardCoder-34B": 57.3,
+                "Phi-1": 22.6,
+                "Phi-1.5": 38.9,
+                "Phind v2": 64.9,
+                "DeepSeek-Coder-6.7B-Base": 61.7,
+                "DeepSeek-Coder-33B-Base": 65.1,
+                "DeepSeek-Coder-6.7B-Instruct": 54.4,
+                "DeepSeek-Coder-33B-Instruct": 64.2,
+                "Mistral-7B": 54.2,
+                "GPT-3.5": 66.5,
+                "GPT-4": 76.8
+            },
+            "Output Prediction (Pass@1)": {
+                "CodeLlama-7B": 36.4,
+                "CodeLlama-13B": 38.4,
+                "CodeLlama-34B": 41.1,
+                "CodeLlama-7B-Python": 36.4,
+                "CodeLlama-13B-Python": 37.8,
+                "CodeLlama-34B-Python": 40.7,
+                "StarCoderBase-7B": 31.1,
+                "StarCoderBase-15.5B": 33.3,
+                "WizardCoder-13B": 37.9,
+                "WizardCoder-34B": 41.2,
+                "Phi-1": 23.3,
+                "Phi-1.5": 27.1,
+                "Phind v2": 38.3,
+                "DeepSeek-Coder-6.7B-Base": 39.8,
+                "DeepSeek-Coder-33B-Base": 43.6,
+                "DeepSeek-Coder-6.7B-Instruct": 41.0,
+                "DeepSeek-Coder-33B-Instruct": 44.0,
+                "Mistral-7B": 31.7,
+                "GPT-3.5": 50.0,
+                "GPT-4": 63.4
+            },
+            "Output Prediction (Pass@5)": {
+                "CodeLlama-7B": 49.6,
+                "CodeLlama-13B": 53.2,
+                "CodeLlama-34B": 56.1,
+                "CodeLlama-7B-Python": 49.7,
+                "CodeLlama-13B-Python": 50.8,
+                "CodeLlama-34B-Python": 53.7,
+                "StarCoderBase-7B": 43.8,
+                "StarCoderBase-15.5B": 47.7,
+                "WizardCoder-13B": 51.6,
+                "WizardCoder-34B": 52.2,
+                "Phi-1": 34.0,
+                "Phi-1.5": 39.4,
+                "Phind v2": 49.2,
+                "DeepSeek-Coder-6.7B-Base": 53.9,
+                "DeepSeek-Coder-33B-Base": 57.5,
+                "DeepSeek-Coder-6.7B-Instruct": 52.5,
+                "DeepSeek-Coder-33B-Instruct": 58.0,
+                "Mistral-7B": 48.2,
+                "GPT-3.5": 60.1,
+                "GPT-4": 68.7
            }
        },
+        "SWE-bench-verified": {
+            "% Resolved": {
+                "Claude 3.7 Sonnet (No extended thinking + scaffolding)": 70.30,
+                "Augment Agent v0": 65.40,
+                "W&B Programmer O1 crosscheck5": 64.60,
+                "AgentScope": 63.40,
+                "Tools + Claude 3.7 Sonnet (2025-02-24)": 63.20,
+                "EPAM AI/Run Developer Agent v20250219 + Anthopic Claude 3.5 Sonnet": 62.80,
+                "CodeStory Midwit Agent + swe-search": 62.20,
+                "OpenHands + 4x Scaled (2024-02-03)": 60.80,
+                "Learn-by-interact": 60.20,
+                "devlo": 58.20,
+                "Emergent E1 (v2024-12-23)": 57.20,
+                "Gru(2024-12-08)": 57.00,
+                "EPAM AI/Run Developer Agent v20241212 + Anthopic Claude 3.5 Sonnet": 55.40,
+                "Amazon Q Developer Agent (v20241202-dev)": 55.00,
+                "Bracket.sh": 53.20,
+                "OpenHands + CodeAct v2.1 (claude-3-5-sonnet-20241022)": 53.00,
+                "Google Jules + Gemini 2.0 Flash (v20241212-experimental)": 52.20,
+                "Engine Labs (2024-11-25)": 51.80,
+                "AutoCodeRover-v2.1 (Claude-3.5-Sonnet-20241022)": 51.60,
+                "Agentless-1.5 + Claude-3.5 Sonnet (20241022)": 50.80,
+                "Solver (2024-10-28)": 50.00,
+                "Bytedance MarsCode Agent": 50.00,
+                "nFactorial (2024-11-05)": 49.20,
+                "Tools + Claude 3.5 Sonnet (2024-10-22)": 49.00,
+                "Composio SWE-Kit (2024-10-25)": 48.60,
+                "AppMap Navie v2": 47.20,
+                "Emergent E1 (v2024-10-12)": 46.60,
+                "AutoCodeRover-v2.0 (Claude-3.5-Sonnet-20241022)": 46.20,
+                "Solver (2024-09-12)": 45.40,
+                "Gru(2024-08-24)": 45.20,
+                "CodeShellAgent + Gemini 2.0 Flash (Experimental)": 44.20,
+                "Agentless Lite + O3 Mini (20250214)": 42.40,
+                "ugaiforge": 41.60,
+                "nFactorial (2024-10-30)": 41.60,
+                "SWE-RL (Llama3-SWE-RL-70B + Agentless Mini) (20250226)": 41.20,
+                "Nebius AI Qwen 2.5 72B Generator + LLama 3.1 70B Critic": 40.60,
+                "Tools + Claude 3.5 Haiku": 40.60,
+                "Honeycomb": 40.60,
+                "Composio SWEkit + Claude 3.5 Sonnet (2024-10-16)": 40.60,
+                "EPAM AI/Run Developer Agent v20241029 + Anthopic Claude 3.5 Sonnet": 39.60,
+                "Amazon Q Developer Agent (v20240719-dev)": 38.80,
+                "Agentless-1.5 + GPT 4o (2024-05-13)": 38.80,
+                "AutoCodeRover (v20240620) + GPT 4o (2024-05-13)": 38.40,
+                "SWE-agent + Claude 3.5 Sonnet": 33.60,
+                "MASAI + GPT 4o (2024-06-12)": 32.60,
+                "Artemis Agent v1 (2024-11-20)": 32.00,
+                "nFactorial (2024-10-07)": 31.60,
+                "SWE-Fixer (Qwen2.5-7b retriever + Qwen2.5-72b editor) 20241128": 30.20,
+                "Lingma Agent + Lingma SWE-GPT 72b (v0925)": 28.80,
+                "EPAM AI/Run Developer Agent + GPT4o": 27.00,
+                "AppMap Navie + GPT 4o (2024-05-13)": 26.20,
+                "nFactorial (2024-10-01)": 25.80,
+                "Amazon Q Developer Agent (v20240430-dev)": 25.60,
+                "Lingma Agent + Lingma SWE-GPT 72b (v0918)": 25.00,
+                "SWE-agent + GPT 4o (2024-05-13)": 23.20,
+                "SWE-agent + GPT 4 (1106)": 22.40,
+                "SWE-agent + Claude 3 Opus": 18.20,
+                "Lingma Agent + Lingma SWE-GPT 7b (v0925)": 18.20,
+                "Lingma Agent + Lingma SWE-GPT 7b (v0918)": 10.20,
+                "RAG + Claude 3 Opus": 7.00,
+                "RAG + Claude 2": 4.40,
+                "RAG + GPT 4 (1106)": 2.80,
+                "RAG + SWE-Llama 7B": 1.40,
+                "RAG + SWE-Llama 13B": 1.20,
+                "RAG + ChatGPT 3.5": 0.40
            }
        }
    }
 }
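app.py consumes the file above through load_results(); its implementation is not part of this diff, so the sketch below is a hypothetical stand-in that simply parses results.json into the {"time": ..., "results": {...}} structure shown:

```python
import json

def load_results(path="results.json"):
    # Hypothetical equivalent of the loader app.py imports (assumed, not
    # from this commit): parse results.json into {"time": ..., "results": ...}.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

struct = load_results()
print(struct["time"])          # "20250422"
print(len(struct["results"]))  # 12 benchmarks -> N_DATA in app.py
```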