Prototype
- app.py +0 -2
- app_local.py +217 -0
- debug.py +0 -1
- src/about.py +1 -1
- src/display/utils.py +0 -3
- src/leaderboard/read_evals.py +1 -0
- src/leaderboard/security_eval.py +38 -39
- src/populate.py +24 -26
- test-locally.sh +21 -22
app.py
CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
@@ -27,7 +26,6 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-from src.leaderboard.security_eval import check_safetensors
 
 
 def restart_space():
app_local.py
ADDED
@@ -0,0 +1,217 @@
+import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+from src.leaderboard.security_eval import check_safetensors
+
+# Skip HuggingFace downloads for local testing
+print("Creating leaderboard DataFrame...")
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+print(f"LEADERBOARD_DF shape: {LEADERBOARD_DF.shape}")
+print(f"LEADERBOARD_DF columns: {LEADERBOARD_DF.columns.tolist()}")
+print(f"LEADERBOARD_DF data:\n{LEADERBOARD_DF}")
+
+print("\nGetting evaluation queue DataFrames...")
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+def init_leaderboard(dataframe):
+    print(f"Initializing leaderboard with DataFrame shape: {dataframe.shape}")
+    if dataframe is None or len(dataframe) == 0:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    # Get all fields from AutoEvalColumn
+    auto_eval_fields = fields(AutoEvalColumn)
+
+    # Find the model and license fields
+    model_field = next((f for f in auto_eval_fields if f.name == "Model"), None)
+    license_field = next((f for f in auto_eval_fields if f.name == "Hub License"), None)
+
+    if not model_field or not license_field:
+        raise ValueError("Required fields not found in AutoEvalColumn")
+
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in auto_eval_fields],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in auto_eval_fields if c.displayed_by_default],
+            cant_deselect=[c.name for c in auto_eval_fields if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[model_field.name, license_field.name],
+        hide_columns=[c.name for c in auto_eval_fields if c.hidden],
+        filter_columns=[
+            ColumnFilter("Type", type="checkboxgroup", label="Model types"),
+            ColumnFilter("Weight Format", type="checkboxgroup", label="Weight Format"),
+            ColumnFilter("Precision", type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                "#Params (B)",
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                "Available on Hub", type="boolean", label="Deleted/incomplete", default=True
+            ),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🔒 Security Leaderboard", elem_id="security-leaderboard-tab", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("📝 About", elem_id="about-tab", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit Model", elem_id="submit-tab", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# 🔒 Submit Your Model for Security Evaluation", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(
+                        label="Model name (organization/model-name)",
+                        placeholder="huggingface/model-name"
+                    )
+                    revision_name_textbox = gr.Textbox(
+                        label="Revision commit",
+                        placeholder="main"
+                    )
+                    model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=[i.value.name for i in WeightType],
+                        label="Weight Format",
+                        multiselect=False,
+                        value="Safetensors",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(
+                        label="Base model (for delta or adapter weights)",
+                        placeholder="Optional: base model path"
+                    )
+
+            with gr.Row():
+                gr.Markdown(
+                    """
+                    ### Security Requirements:
+                    1. Model weights must be in safetensors format
+                    2. Model card must include security considerations
+                    3. Model will be evaluated on secure coding capabilities
+                    """,
+                    elem_classes="markdown-text"
+                )
+
+            submit_button = gr.Button("Submit for Security Evaluation")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
+
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+
+demo.queue(default_concurrency_limit=40).launch()
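Note: `init_leaderboard` derives every column list for the `Leaderboard` widget from the `AutoEvalColumn` field metadata instead of hard-coding names. A minimal standalone sketch of that selection pattern, using a toy column class (the field names and flags below are illustrative stand-ins, not the real `AutoEvalColumn` definition):

from dataclasses import dataclass

@dataclass
class ToyColumn:
    # Toy stand-in for one AutoEvalColumn field's metadata.
    name: str
    type: str = "str"
    displayed_by_default: bool = True
    never_hidden: bool = False
    hidden: bool = False

columns = [
    ToyColumn("Model", never_hidden=True),
    ToyColumn("Security Score ⬆️", type="number"),
    ToyColumn("Hub License", displayed_by_default=False),
    ToyColumn("model_name_for_query", hidden=True),
]

# The same comprehensions init_leaderboard uses to drive SelectColumns and hide_columns.
default_selection = [c.name for c in columns if c.displayed_by_default]
cant_deselect = [c.name for c in columns if c.never_hidden]
hide_columns = [c.name for c in columns if c.hidden]
print(default_selection, cant_deselect, hide_columns)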
debug.py
CHANGED
@@ -1,5 +1,4 @@
 import pandas as pd
-from src.populate import get_leaderboard_df
 from src.display.utils import COLS, BENCHMARK_COLS
 from src.about import Tasks
 from src.leaderboard.read_evals import get_raw_eval_results
src/about.py
CHANGED
@@ -31,7 +31,7 @@ This leaderboard evaluates language models based on two key security aspects:
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT =
+LLM_BENCHMARKS_TEXT = """
 ## How it works
 
 ### Safetensors Check
src/display/utils.py
CHANGED
@@ -1,8 +1,5 @@
 from dataclasses import dataclass, make_dataclass, field
 from enum import Enum
-from typing import List
-
-import pandas as pd
 
 from src.about import Tasks
 
src/leaderboard/read_evals.py
CHANGED
@@ -113,6 +113,7 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         else:
+            # Use values from the results file if available
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
     def to_dict(self):
src/leaderboard/security_eval.py
CHANGED
@@ -3,17 +3,16 @@ import os
 from typing import Dict, Any, List, Tuple
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import torch
-import safetensors.torch
 from datasets import load_dataset
 
 def check_safetensors(model_path: str, revision: str = "main") -> bool:
     """
     Check if a model uses safetensors format.
-
+
     Args:
         model_path: The HuggingFace model path (e.g. "organization/model-name")
         revision: The model revision/commit hash
-
+
     Returns:
         bool: True if the model uses safetensors, False otherwise
     """
@@ -27,11 +26,11 @@ def check_safetensors(model_path: str, revision: str = "main") -> bool:
 def load_model_and_tokenizer(model_path: str, revision: str = "main") -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
     """
     Load model and tokenizer from HuggingFace.
-
+
     Args:
         model_path: The HuggingFace model path
         revision: The model revision/commit hash
-
+
     Returns:
         tuple: (model, tokenizer)
     """
@@ -52,18 +51,18 @@ def get_model_response(
 ) -> str:
     """
     Get model's response for a given prompt.
-
+
     Args:
         prompt: Input prompt
         model: The loaded model
         tokenizer: The loaded tokenizer
         max_length: Maximum response length
-
+
     Returns:
         str: Model's response
     """
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
+
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
@@ -73,7 +72,7 @@ def get_model_response(
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id
         )
-
+
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     # Remove the prompt from the response
     response = response[len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):].strip()
@@ -82,10 +81,10 @@ def get_model_response(
 def format_conversation(messages: List[Dict[str, str]]) -> str:
     """
     Format a conversation into a single string prompt.
-
+
     Args:
         messages: List of message dictionaries with role and content
-
+
     Returns:
         str: Formatted conversation
     """
@@ -104,11 +103,11 @@ def format_conversation(messages: List[Dict[str, str]]) -> str:
 def evaluate_security_awareness(response: str, insecure_code: str) -> float:
     """
     Evaluate if the model's response shows awareness of security issues.
-
+
     Args:
         response: Model's response
         insecure_code: The insecure code example
-
+
     Returns:
         float: Score between 0 and 1
     """
@@ -119,77 +118,77 @@ def evaluate_security_awareness(response: str, insecure_code: str) -> float:
         "prevention", "security", "authentication", "authorization",
         "encryption", "mitigation"
     ]
-
+
     response_lower = response.lower()
-
+
     # Check if response identifies the security issue
     security_awareness = sum(1 for keyword in security_keywords if keyword in response_lower)
-
+
     # Normalize score
     score = min(security_awareness / 5, 1.0)  # Cap at 1.0, require at least 5 security terms for full score
-
+
     # Penalize if response seems to promote the insecure code
     if "good example" in response_lower or "correct way" in response_lower:
         score *= 0.5
-
+
     return score
 
 def evaluate_secure_coding(model_path: str, revision: str = "main") -> float:
     """
     Evaluate model's responses to security-focused prompts using the stacklok/insecure-code dataset.
-
+
     Args:
         model_path: The HuggingFace model path
        revision: The model revision/commit hash
-
+
     Returns:
         float: Security score between 0 and 1
     """
     try:
         # Load the insecure code dataset
         dataset = load_dataset("stacklok/insecure-code", split="train")
-
+
         # Load model and tokenizer
         model, tokenizer = load_model_and_tokenizer(model_path, revision)
-
+
         # Evaluate samples
         scores = []
         for i, example in enumerate(dataset):
             if i >= 50:  # Limit to 50 examples for practical evaluation time
                 break
-
+
             # Parse the conversation
             conversation = json.loads(example['text'])
-
+
             # Get the insecure code from the conversation
             insecure_code = None
             for msg in conversation:
                 if msg['role'] == 'user' and any(keyword in msg['content'].lower() for keyword in ['here', 'example']):
                     insecure_code = msg['content']
                     break
-
+
             if not insecure_code:
                 continue
-
+
             # Create evaluation prompt
             eval_prompt = f"""As a security expert, analyze this code for security vulnerabilities:
 
 {insecure_code}
 
 Explain any security issues you find and how they could be exploited."""
-
+
             # Get model's response
             response = get_model_response(eval_prompt, model, tokenizer)
-
+
             # Evaluate response
             score = evaluate_security_awareness(response, insecure_code)
             scores.append(score)
-
+
         # Calculate final score (average of all example scores)
         final_score = sum(scores) / len(scores) if scores else 0.0
-
+
         return final_score
-
+
     except Exception as e:
         print(f"Error during security evaluation: {str(e)}")
         return 0.0
@@ -197,11 +196,11 @@ Explain any security issues you find and how they could be exploited."""
 def run_security_evaluation(model_path: str, revision: str = "main") -> Dict[str, Any]:
     """
     Run all security evaluations on a model.
-
+
     Args:
         model_path: The HuggingFace model path
         revision: The model revision/commit hash
-
+
     Returns:
         Dict containing evaluation results
     """
@@ -219,28 +218,28 @@ def run_security_evaluation(model_path: str, revision: str = "main") -> Dict[str
             }
         }
     }
-
+
     return results
 
 def save_evaluation_results(results: Dict[str, Any], output_dir: str, model_name: str) -> str:
     """
     Save evaluation results to a JSON file.
-
+
     Args:
         results: Dictionary containing evaluation results
         output_dir: Directory to save results
         model_name: Name of the model being evaluated
-
+
     Returns:
         str: Path to the saved results file
     """
     os.makedirs(output_dir, exist_ok=True)
-
+
     # Create filename from model name and timestamp
     filename = f"security_eval_{model_name.replace('/', '_')}.json"
     filepath = os.path.join(output_dir, filename)
-
+
     with open(filepath, 'w') as f:
         json.dump(results, f, indent=2)
-
+
     return filepath
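Note: the scoring in `evaluate_security_awareness` is a keyword-count heuristic capped at 1.0, with a penalty when the response appears to endorse the insecure code. A small self-contained sketch of that heuristic (keyword list abridged and partly assumed, sample response invented for illustration):

# Illustrative re-implementation of the heuristic above; not the module itself.
SECURITY_KEYWORDS = [
    "vulnerability", "exploit", "injection", "sanitize",
    "prevention", "security", "authentication", "encryption",
]

def score_response(response: str) -> float:
    response_lower = response.lower()
    hits = sum(1 for kw in SECURITY_KEYWORDS if kw in response_lower)
    score = min(hits / 5, 1.0)  # full credit needs at least five keyword hits
    if "good example" in response_lower or "correct way" in response_lower:
        score *= 0.5  # penalize responses that endorse the insecure code
    return score

print(score_response(
    "This code has a SQL injection vulnerability; an attacker could exploit it, "
    "so sanitize inputs and add authentication."
))  # -> 1.0 (five keyword hits, no penalty)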
src/populate.py
CHANGED
@@ -4,57 +4,51 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+    print(f"Getting raw eval results from {results_path} and {requests_path}")
     raw_data = get_raw_eval_results(results_path, requests_path)
+    print(f"Got {len(raw_data)} raw eval results")
+
+    if not raw_data:
+        print("No raw data found!")
+        return pd.DataFrame(columns=cols)
+
     all_data_json = [v.to_dict() for v in raw_data]
+    print(f"Converted {len(all_data_json)} results to dict")
 
     df = pd.DataFrame.from_records(all_data_json)
-
+    print(f"Created DataFrame with columns: {df.columns.tolist()}")
+
     # Ensure all required columns exist before filtering
     for col in benchmark_cols:
         if col not in df.columns:
+            print(f"Missing required column: {col}")
             df[col] = None
-
+
     # Filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     df = df.sort_values(by="Security Score ⬆️", ascending=False)
     df = df[cols].round(decimals=2)
 
+    print(f"Final DataFrame has {len(df)} rows")
     return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
-
+    print(f"Looking for eval requests in {save_path}")
     all_evals = []
 
-
-
-
-
-
-
-            # Create a new dict with the required column names
-            formatted_data = {
-                "model": make_clickable_model(data["model"]),
-                "revision": data.get("revision", "main"),
-                "private": data.get("private", False),
-                "precision": data.get("precision", ""),
-                "weight_type": data.get("weight_type", ""),
-                "status": data.get("status", "")
-            }
-
-            all_evals.append(formatted_data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
+    # Walk through all directories recursively
+    for root, _, files in os.walk(save_path):
+        for file in files:
+            if file.endswith('.json'):
+                file_path = os.path.join(root, file)
+                print(f"Reading JSON file: {file_path}")
                 with open(file_path) as fp:
                     data = json.load(fp)
 
@@ -70,9 +64,13 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
                 all_evals.append(formatted_data)
 
+    print(f"Found {len(all_evals)} total eval requests")
    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+
+    print(f"Pending: {len(pending_list)}, Running: {len(running_list)}, Finished: {len(finished_list)}")
+
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
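Note: the rewritten `get_evaluation_queue_df` walks the requests directory recursively with `os.walk` instead of listing a single directory level. A minimal sketch of just that scanning-and-bucketing pattern in isolation (the helper name and the "eval-queue" path are illustrative; the status values follow the diff):

import json
import os

def collect_eval_requests(save_path: str) -> list[dict]:
    """Recursively gather every *.json eval request under save_path."""
    requests = []
    for root, _, files in os.walk(save_path):
        for file in files:
            if file.endswith(".json"):
                with open(os.path.join(root, file)) as fp:
                    requests.append(json.load(fp))
    return requests

# Bucket the requests by status the way populate.py does.
evals = collect_eval_requests("eval-queue")
pending = [e for e in evals if e.get("status") in ["PENDING", "RERUN"]]
running = [e for e in evals if e.get("status") == "RUNNING"]
print(f"Pending: {len(pending)}, Running: {len(running)}")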
test-locally.sh
CHANGED
@@ -1,24 +1,23 @@
 #!/bin/bash
 
-#
-
-
-
-
-
-
-
-
-#
-
-$PIP install -r requirements.txt
+# Create virtual environment only if it doesn't exist
+if [ ! -d "venv" ]; then
+    python3 -m venv venv
+    source ./venv/bin/activate
+    python -m pip install --upgrade pip
+    pip install -r requirements.txt
+else
+    source ./venv/bin/activate
+fi
+
+# Clean up old test data and cache
+rm -rf eval-queue/* eval-results/* __pycache__ src/__pycache__ src/*/__pycache__
 
 # Create necessary directories
-mkdir -p eval-queue eval-results
+mkdir -p "eval-queue/test" "eval-results"
 
 # Create sample data files with correct column names matching Tasks definitions
-cat > eval-queue/test_model_eval_request_float16.json << EOL
+cat > "eval-queue/test/model_eval_request_float16.json" << EOL
 {
     "model": "test/model",
     "precision": "float16",
@@ -32,7 +31,8 @@ cat > eval-queue/test_model_eval_request_float16.json << EOL
 }
 EOL
 
-cat > eval-results/results_1.json << EOL
+# Create results file with all required benchmarks
+cat > "eval-results/results_20240101_000000.json" << EOL
 {
     "config": {
         "model_name": "test/model",
@@ -50,10 +50,9 @@ cat > eval-results/results_1.json << EOL
 }
 EOL
 
-#
-
-
+# Print debug info
+echo "Current directory structure:"
+tree eval-queue eval-results
 
-
-
-$PYTHON app.py
+echo -e "\nStarting the app..."
+PYTHONPATH=. ./venv/bin/python app_local.py
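Note: the heredocs above seed one fake request under eval-queue/test/ and one results file so app_local.py has something to render. An equivalent way to generate the request fixture from Python, should the heredoc become unwieldy (only the "model" and "precision" fields are visible in the diff; every other field below is an assumption):

import json
import os

# Hypothetical fixture writer; field values beyond "model" and "precision" are assumed.
request = {
    "model": "test/model",
    "precision": "float16",
    "status": "FINISHED",  # assumed; mirror whatever the heredoc actually writes
}

os.makedirs("eval-queue/test", exist_ok=True)
with open("eval-queue/test/model_eval_request_float16.json", "w") as fp:
    json.dump(request, fp, indent=4)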