lukehinds commited on
Commit
99b815f
·
1 Parent(s): b257b3e
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import gradio as gr
2
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
- import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
6
 
@@ -27,7 +26,6 @@ from src.display.utils import (
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
30
- from src.leaderboard.security_eval import check_safetensors
31
 
32
 
33
  def restart_space():
 
1
  import gradio as gr
2
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
 
 
26
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
27
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
28
  from src.submission.submit import add_new_eval
 
29
 
30
 
31
  def restart_space():
app_local.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+
4
+ from src.about import (
5
+ CITATION_BUTTON_LABEL,
6
+ CITATION_BUTTON_TEXT,
7
+ EVALUATION_QUEUE_TEXT,
8
+ INTRODUCTION_TEXT,
9
+ LLM_BENCHMARKS_TEXT,
10
+ TITLE,
11
+ )
12
+ from src.display.css_html_js import custom_css
13
+ from src.display.utils import (
14
+ BENCHMARK_COLS,
15
+ COLS,
16
+ EVAL_COLS,
17
+ EVAL_TYPES,
18
+ AutoEvalColumn,
19
+ ModelType,
20
+ fields,
21
+ WeightType,
22
+ Precision
23
+ )
24
+ from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
25
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
26
+ from src.submission.submit import add_new_eval
27
+ from src.leaderboard.security_eval import check_safetensors
28
+
29
+ # Skip HuggingFace downloads for local testing
30
+ print("Creating leaderboard DataFrame...")
31
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
32
+ print(f"LEADERBOARD_DF shape: {LEADERBOARD_DF.shape}")
33
+ print(f"LEADERBOARD_DF columns: {LEADERBOARD_DF.columns.tolist()}")
34
+ print(f"LEADERBOARD_DF data:\n{LEADERBOARD_DF}")
35
+
36
+ print("\nGetting evaluation queue DataFrames...")
37
+ (
38
+ finished_eval_queue_df,
39
+ running_eval_queue_df,
40
+ pending_eval_queue_df,
41
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
42
+
43
+ def init_leaderboard(dataframe):
44
+ print(f"Initializing leaderboard with DataFrame shape: {dataframe.shape}")
45
+ if dataframe is None or len(dataframe) == 0:
46
+ raise ValueError("Leaderboard DataFrame is empty or None.")
47
+
48
+ # Get all fields from AutoEvalColumn
49
+ auto_eval_fields = fields(AutoEvalColumn)
50
+
51
+ # Find the model and license fields
52
+ model_field = next((f for f in auto_eval_fields if f.name == "Model"), None)
53
+ license_field = next((f for f in auto_eval_fields if f.name == "Hub License"), None)
54
+
55
+ if not model_field or not license_field:
56
+ raise ValueError("Required fields not found in AutoEvalColumn")
57
+
58
+ return Leaderboard(
59
+ value=dataframe,
60
+ datatype=[c.type for c in auto_eval_fields],
61
+ select_columns=SelectColumns(
62
+ default_selection=[c.name for c in auto_eval_fields if c.displayed_by_default],
63
+ cant_deselect=[c.name for c in auto_eval_fields if c.never_hidden],
64
+ label="Select Columns to Display:",
65
+ ),
66
+ search_columns=[model_field.name, license_field.name],
67
+ hide_columns=[c.name for c in auto_eval_fields if c.hidden],
68
+ filter_columns=[
69
+ ColumnFilter("Type", type="checkboxgroup", label="Model types"),
70
+ ColumnFilter("Weight Format", type="checkboxgroup", label="Weight Format"),
71
+ ColumnFilter("Precision", type="checkboxgroup", label="Precision"),
72
+ ColumnFilter(
73
+ "#Params (B)",
74
+ type="slider",
75
+ min=0.01,
76
+ max=150,
77
+ label="Select the number of parameters (B)",
78
+ ),
79
+ ColumnFilter(
80
+ "Available on Hub", type="boolean", label="Deleted/incomplete", default=True
81
+ ),
82
+ ],
83
+ bool_checkboxgroup_label="Hide models",
84
+ interactive=False,
85
+ )
86
+
87
+
88
+ demo = gr.Blocks(css=custom_css)
89
+ with demo:
90
+ gr.HTML(TITLE)
91
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
92
+
93
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
94
+ with gr.TabItem("🔒 Security Leaderboard", elem_id="security-leaderboard-tab", id=0):
95
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
96
+
97
+ with gr.TabItem("📝 About", elem_id="about-tab", id=2):
98
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
99
+
100
+ with gr.TabItem("🚀 Submit Model", elem_id="submit-tab", id=3):
101
+ with gr.Column():
102
+ with gr.Row():
103
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
104
+
105
+ with gr.Column():
106
+ with gr.Accordion(
107
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
108
+ open=False,
109
+ ):
110
+ with gr.Row():
111
+ finished_eval_table = gr.components.Dataframe(
112
+ value=finished_eval_queue_df,
113
+ headers=EVAL_COLS,
114
+ datatype=EVAL_TYPES,
115
+ row_count=5,
116
+ )
117
+ with gr.Accordion(
118
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
119
+ open=False,
120
+ ):
121
+ with gr.Row():
122
+ running_eval_table = gr.components.Dataframe(
123
+ value=running_eval_queue_df,
124
+ headers=EVAL_COLS,
125
+ datatype=EVAL_TYPES,
126
+ row_count=5,
127
+ )
128
+
129
+ with gr.Accordion(
130
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
131
+ open=False,
132
+ ):
133
+ with gr.Row():
134
+ pending_eval_table = gr.components.Dataframe(
135
+ value=pending_eval_queue_df,
136
+ headers=EVAL_COLS,
137
+ datatype=EVAL_TYPES,
138
+ row_count=5,
139
+ )
140
+ with gr.Row():
141
+ gr.Markdown("# 🔒 Submit Your Model for Security Evaluation", elem_classes="markdown-text")
142
+
143
+ with gr.Row():
144
+ with gr.Column():
145
+ model_name_textbox = gr.Textbox(
146
+ label="Model name (organization/model-name)",
147
+ placeholder="huggingface/model-name"
148
+ )
149
+ revision_name_textbox = gr.Textbox(
150
+ label="Revision commit",
151
+ placeholder="main"
152
+ )
153
+ model_type = gr.Dropdown(
154
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
155
+ label="Model type",
156
+ multiselect=False,
157
+ value=None,
158
+ interactive=True,
159
+ )
160
+
161
+ with gr.Column():
162
+ precision = gr.Dropdown(
163
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
164
+ label="Precision",
165
+ multiselect=False,
166
+ value="float16",
167
+ interactive=True,
168
+ )
169
+ weight_type = gr.Dropdown(
170
+ choices=[i.value.name for i in WeightType],
171
+ label="Weight Format",
172
+ multiselect=False,
173
+ value="Safetensors",
174
+ interactive=True,
175
+ )
176
+ base_model_name_textbox = gr.Textbox(
177
+ label="Base model (for delta or adapter weights)",
178
+ placeholder="Optional: base model path"
179
+ )
180
+
181
+ with gr.Row():
182
+ gr.Markdown(
183
+ """
184
+ ### Security Requirements:
185
+ 1. Model weights must be in safetensors format
186
+ 2. Model card must include security considerations
187
+ 3. Model will be evaluated on secure coding capabilities
188
+ """,
189
+ elem_classes="markdown-text"
190
+ )
191
+
192
+ submit_button = gr.Button("Submit for Security Evaluation")
193
+ submission_result = gr.Markdown()
194
+ submit_button.click(
195
+ add_new_eval,
196
+ [
197
+ model_name_textbox,
198
+ base_model_name_textbox,
199
+ revision_name_textbox,
200
+ precision,
201
+ weight_type,
202
+ model_type,
203
+ ],
204
+ submission_result,
205
+ )
206
+
207
+ with gr.Row():
208
+ with gr.Accordion("📙 Citation", open=False):
209
+ citation_button = gr.Textbox(
210
+ value=CITATION_BUTTON_TEXT,
211
+ label=CITATION_BUTTON_LABEL,
212
+ lines=20,
213
+ elem_id="citation-button",
214
+ show_copy_button=True,
215
+ )
216
+
217
+ demo.queue(default_concurrency_limit=40).launch()
debug.py CHANGED
@@ -1,5 +1,4 @@
1
  import pandas as pd
2
- from src.populate import get_leaderboard_df
3
  from src.display.utils import COLS, BENCHMARK_COLS
4
  from src.about import Tasks
5
  from src.leaderboard.read_evals import get_raw_eval_results
 
1
  import pandas as pd
 
2
  from src.display.utils import COLS, BENCHMARK_COLS
3
  from src.about import Tasks
4
  from src.leaderboard.read_evals import get_raw_eval_results
src/about.py CHANGED
@@ -31,7 +31,7 @@ This leaderboard evaluates language models based on two key security aspects:
31
  """
32
 
33
  # Which evaluations are you running? how can people reproduce what you have?
34
- LLM_BENCHMARKS_TEXT = f"""
35
  ## How it works
36
 
37
  ### Safetensors Check
 
31
  """
32
 
33
  # Which evaluations are you running? how can people reproduce what you have?
34
+ LLM_BENCHMARKS_TEXT = """
35
  ## How it works
36
 
37
  ### Safetensors Check
src/display/utils.py CHANGED
@@ -1,8 +1,5 @@
1
  from dataclasses import dataclass, make_dataclass, field
2
  from enum import Enum
3
- from typing import List
4
-
5
- import pandas as pd
6
 
7
  from src.about import Tasks
8
 
 
1
  from dataclasses import dataclass, make_dataclass, field
2
  from enum import Enum
 
 
 
3
 
4
  from src.about import Tasks
5
 
src/leaderboard/read_evals.py CHANGED
@@ -113,6 +113,7 @@ class EvalResult:
113
  self.num_params = request.get("params", 0)
114
  self.date = request.get("submitted_time", "")
115
  else:
 
116
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
117
 
118
  def to_dict(self):
 
113
  self.num_params = request.get("params", 0)
114
  self.date = request.get("submitted_time", "")
115
  else:
116
+ # Use values from the results file if available
117
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
118
 
119
  def to_dict(self):
src/leaderboard/security_eval.py CHANGED
@@ -3,17 +3,16 @@ import os
3
  from typing import Dict, Any, List, Tuple
4
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
5
  import torch
6
- import safetensors.torch
7
  from datasets import load_dataset
8
 
9
  def check_safetensors(model_path: str, revision: str = "main") -> bool:
10
  """
11
  Check if a model uses safetensors format.
12
-
13
  Args:
14
  model_path: The HuggingFace model path (e.g. "organization/model-name")
15
  revision: The model revision/commit hash
16
-
17
  Returns:
18
  bool: True if the model uses safetensors, False otherwise
19
  """
@@ -27,11 +26,11 @@ def check_safetensors(model_path: str, revision: str = "main") -> bool:
27
  def load_model_and_tokenizer(model_path: str, revision: str = "main") -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
28
  """
29
  Load model and tokenizer from HuggingFace.
30
-
31
  Args:
32
  model_path: The HuggingFace model path
33
  revision: The model revision/commit hash
34
-
35
  Returns:
36
  tuple: (model, tokenizer)
37
  """
@@ -52,18 +51,18 @@ def get_model_response(
52
  ) -> str:
53
  """
54
  Get model's response for a given prompt.
55
-
56
  Args:
57
  prompt: Input prompt
58
  model: The loaded model
59
  tokenizer: The loaded tokenizer
60
  max_length: Maximum response length
61
-
62
  Returns:
63
  str: Model's response
64
  """
65
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
66
-
67
  with torch.no_grad():
68
  outputs = model.generate(
69
  **inputs,
@@ -73,7 +72,7 @@ def get_model_response(
73
  do_sample=True,
74
  pad_token_id=tokenizer.eos_token_id
75
  )
76
-
77
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
78
  # Remove the prompt from the response
79
  response = response[len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):].strip()
@@ -82,10 +81,10 @@ def get_model_response(
82
  def format_conversation(messages: List[Dict[str, str]]) -> str:
83
  """
84
  Format a conversation into a single string prompt.
85
-
86
  Args:
87
  messages: List of message dictionaries with role and content
88
-
89
  Returns:
90
  str: Formatted conversation
91
  """
@@ -104,11 +103,11 @@ def format_conversation(messages: List[Dict[str, str]]) -> str:
104
  def evaluate_security_awareness(response: str, insecure_code: str) -> float:
105
  """
106
  Evaluate if the model's response shows awareness of security issues.
107
-
108
  Args:
109
  response: Model's response
110
  insecure_code: The insecure code example
111
-
112
  Returns:
113
  float: Score between 0 and 1
114
  """
@@ -119,77 +118,77 @@ def evaluate_security_awareness(response: str, insecure_code: str) -> float:
119
  "prevention", "security", "authentication", "authorization",
120
  "encryption", "mitigation"
121
  ]
122
-
123
  response_lower = response.lower()
124
-
125
  # Check if response identifies the security issue
126
  security_awareness = sum(1 for keyword in security_keywords if keyword in response_lower)
127
-
128
  # Normalize score
129
  score = min(security_awareness / 5, 1.0) # Cap at 1.0, require at least 5 security terms for full score
130
-
131
  # Penalize if response seems to promote the insecure code
132
  if "good example" in response_lower or "correct way" in response_lower:
133
  score *= 0.5
134
-
135
  return score
136
 
137
  def evaluate_secure_coding(model_path: str, revision: str = "main") -> float:
138
  """
139
  Evaluate model's responses to security-focused prompts using the stacklok/insecure-code dataset.
140
-
141
  Args:
142
  model_path: The HuggingFace model path
143
  revision: The model revision/commit hash
144
-
145
  Returns:
146
  float: Security score between 0 and 1
147
  """
148
  try:
149
  # Load the insecure code dataset
150
  dataset = load_dataset("stacklok/insecure-code", split="train")
151
-
152
  # Load model and tokenizer
153
  model, tokenizer = load_model_and_tokenizer(model_path, revision)
154
-
155
  # Evaluate samples
156
  scores = []
157
  for i, example in enumerate(dataset):
158
  if i >= 50: # Limit to 50 examples for practical evaluation time
159
  break
160
-
161
  # Parse the conversation
162
  conversation = json.loads(example['text'])
163
-
164
  # Get the insecure code from the conversation
165
  insecure_code = None
166
  for msg in conversation:
167
  if msg['role'] == 'user' and any(keyword in msg['content'].lower() for keyword in ['here', 'example']):
168
  insecure_code = msg['content']
169
  break
170
-
171
  if not insecure_code:
172
  continue
173
-
174
  # Create evaluation prompt
175
  eval_prompt = f"""As a security expert, analyze this code for security vulnerabilities:
176
 
177
  {insecure_code}
178
 
179
  Explain any security issues you find and how they could be exploited."""
180
-
181
  # Get model's response
182
  response = get_model_response(eval_prompt, model, tokenizer)
183
-
184
  # Evaluate response
185
  score = evaluate_security_awareness(response, insecure_code)
186
  scores.append(score)
187
-
188
  # Calculate final score (average of all example scores)
189
  final_score = sum(scores) / len(scores) if scores else 0.0
190
-
191
  return final_score
192
-
193
  except Exception as e:
194
  print(f"Error during security evaluation: {str(e)}")
195
  return 0.0
@@ -197,11 +196,11 @@ Explain any security issues you find and how they could be exploited."""
197
  def run_security_evaluation(model_path: str, revision: str = "main") -> Dict[str, Any]:
198
  """
199
  Run all security evaluations on a model.
200
-
201
  Args:
202
  model_path: The HuggingFace model path
203
  revision: The model revision/commit hash
204
-
205
  Returns:
206
  Dict containing evaluation results
207
  """
@@ -219,28 +218,28 @@ def run_security_evaluation(model_path: str, revision: str = "main") -> Dict[str
219
  }
220
  }
221
  }
222
-
223
  return results
224
 
225
  def save_evaluation_results(results: Dict[str, Any], output_dir: str, model_name: str) -> str:
226
  """
227
  Save evaluation results to a JSON file.
228
-
229
  Args:
230
  results: Dictionary containing evaluation results
231
  output_dir: Directory to save results
232
  model_name: Name of the model being evaluated
233
-
234
  Returns:
235
  str: Path to the saved results file
236
  """
237
  os.makedirs(output_dir, exist_ok=True)
238
-
239
  # Create filename from model name and timestamp
240
  filename = f"security_eval_{model_name.replace('/', '_')}.json"
241
  filepath = os.path.join(output_dir, filename)
242
-
243
  with open(filepath, 'w') as f:
244
  json.dump(results, f, indent=2)
245
-
246
  return filepath
 
3
  from typing import Dict, Any, List, Tuple
4
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
5
  import torch
 
6
  from datasets import load_dataset
7
 
8
  def check_safetensors(model_path: str, revision: str = "main") -> bool:
9
  """
10
  Check if a model uses safetensors format.
11
+
12
  Args:
13
  model_path: The HuggingFace model path (e.g. "organization/model-name")
14
  revision: The model revision/commit hash
15
+
16
  Returns:
17
  bool: True if the model uses safetensors, False otherwise
18
  """
 
26
  def load_model_and_tokenizer(model_path: str, revision: str = "main") -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
27
  """
28
  Load model and tokenizer from HuggingFace.
29
+
30
  Args:
31
  model_path: The HuggingFace model path
32
  revision: The model revision/commit hash
33
+
34
  Returns:
35
  tuple: (model, tokenizer)
36
  """
 
51
  ) -> str:
52
  """
53
  Get model's response for a given prompt.
54
+
55
  Args:
56
  prompt: Input prompt
57
  model: The loaded model
58
  tokenizer: The loaded tokenizer
59
  max_length: Maximum response length
60
+
61
  Returns:
62
  str: Model's response
63
  """
64
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
65
+
66
  with torch.no_grad():
67
  outputs = model.generate(
68
  **inputs,
 
72
  do_sample=True,
73
  pad_token_id=tokenizer.eos_token_id
74
  )
75
+
76
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
77
  # Remove the prompt from the response
78
  response = response[len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):].strip()
 
81
  def format_conversation(messages: List[Dict[str, str]]) -> str:
82
  """
83
  Format a conversation into a single string prompt.
84
+
85
  Args:
86
  messages: List of message dictionaries with role and content
87
+
88
  Returns:
89
  str: Formatted conversation
90
  """
 
103
  def evaluate_security_awareness(response: str, insecure_code: str) -> float:
104
  """
105
  Evaluate if the model's response shows awareness of security issues.
106
+
107
  Args:
108
  response: Model's response
109
  insecure_code: The insecure code example
110
+
111
  Returns:
112
  float: Score between 0 and 1
113
  """
 
118
  "prevention", "security", "authentication", "authorization",
119
  "encryption", "mitigation"
120
  ]
121
+
122
  response_lower = response.lower()
123
+
124
  # Check if response identifies the security issue
125
  security_awareness = sum(1 for keyword in security_keywords if keyword in response_lower)
126
+
127
  # Normalize score
128
  score = min(security_awareness / 5, 1.0) # Cap at 1.0, require at least 5 security terms for full score
129
+
130
  # Penalize if response seems to promote the insecure code
131
  if "good example" in response_lower or "correct way" in response_lower:
132
  score *= 0.5
133
+
134
  return score
135
 
136
  def evaluate_secure_coding(model_path: str, revision: str = "main") -> float:
137
  """
138
  Evaluate model's responses to security-focused prompts using the stacklok/insecure-code dataset.
139
+
140
  Args:
141
  model_path: The HuggingFace model path
142
  revision: The model revision/commit hash
143
+
144
  Returns:
145
  float: Security score between 0 and 1
146
  """
147
  try:
148
  # Load the insecure code dataset
149
  dataset = load_dataset("stacklok/insecure-code", split="train")
150
+
151
  # Load model and tokenizer
152
  model, tokenizer = load_model_and_tokenizer(model_path, revision)
153
+
154
  # Evaluate samples
155
  scores = []
156
  for i, example in enumerate(dataset):
157
  if i >= 50: # Limit to 50 examples for practical evaluation time
158
  break
159
+
160
  # Parse the conversation
161
  conversation = json.loads(example['text'])
162
+
163
  # Get the insecure code from the conversation
164
  insecure_code = None
165
  for msg in conversation:
166
  if msg['role'] == 'user' and any(keyword in msg['content'].lower() for keyword in ['here', 'example']):
167
  insecure_code = msg['content']
168
  break
169
+
170
  if not insecure_code:
171
  continue
172
+
173
  # Create evaluation prompt
174
  eval_prompt = f"""As a security expert, analyze this code for security vulnerabilities:
175
 
176
  {insecure_code}
177
 
178
  Explain any security issues you find and how they could be exploited."""
179
+
180
  # Get model's response
181
  response = get_model_response(eval_prompt, model, tokenizer)
182
+
183
  # Evaluate response
184
  score = evaluate_security_awareness(response, insecure_code)
185
  scores.append(score)
186
+
187
  # Calculate final score (average of all example scores)
188
  final_score = sum(scores) / len(scores) if scores else 0.0
189
+
190
  return final_score
191
+
192
  except Exception as e:
193
  print(f"Error during security evaluation: {str(e)}")
194
  return 0.0
 
196
  def run_security_evaluation(model_path: str, revision: str = "main") -> Dict[str, Any]:
197
  """
198
  Run all security evaluations on a model.
199
+
200
  Args:
201
  model_path: The HuggingFace model path
202
  revision: The model revision/commit hash
203
+
204
  Returns:
205
  Dict containing evaluation results
206
  """
 
218
  }
219
  }
220
  }
221
+
222
  return results
223
 
224
  def save_evaluation_results(results: Dict[str, Any], output_dir: str, model_name: str) -> str:
225
  """
226
  Save evaluation results to a JSON file.
227
+
228
  Args:
229
  results: Dictionary containing evaluation results
230
  output_dir: Directory to save results
231
  model_name: Name of the model being evaluated
232
+
233
  Returns:
234
  str: Path to the saved results file
235
  """
236
  os.makedirs(output_dir, exist_ok=True)
237
+
238
  # Create filename from model name and timestamp
239
  filename = f"security_eval_{model_name.replace('/', '_')}.json"
240
  filepath = os.path.join(output_dir, filename)
241
+
242
  with open(filepath, 'w') as f:
243
  json.dump(results, f, indent=2)
244
+
245
  return filepath
src/populate.py CHANGED
@@ -4,57 +4,51 @@ import os
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
 
13
  raw_data = get_raw_eval_results(results_path, requests_path)
 
 
 
 
 
 
14
  all_data_json = [v.to_dict() for v in raw_data]
 
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
-
 
18
  # Ensure all required columns exist before filtering
19
  for col in benchmark_cols:
20
  if col not in df.columns:
 
21
  df[col] = None
22
-
23
  # Filter out if any of the benchmarks have not been produced
24
  df = df[has_no_nan_values(df, benchmark_cols)]
25
  df = df.sort_values(by="Security Score ⬆️", ascending=False)
26
  df = df[cols].round(decimals=2)
27
 
 
28
  return df
29
 
30
 
31
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
32
  """Creates the different dataframes for the evaluation queues requestes"""
33
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
34
  all_evals = []
35
 
36
- for entry in entries:
37
- if ".json" in entry:
38
- file_path = os.path.join(save_path, entry)
39
- with open(file_path) as fp:
40
- data = json.load(fp)
41
-
42
- # Create a new dict with the required column names
43
- formatted_data = {
44
- "model": make_clickable_model(data["model"]),
45
- "revision": data.get("revision", "main"),
46
- "private": data.get("private", False),
47
- "precision": data.get("precision", ""),
48
- "weight_type": data.get("weight_type", ""),
49
- "status": data.get("status", "")
50
- }
51
-
52
- all_evals.append(formatted_data)
53
- elif ".md" not in entry:
54
- # this is a folder
55
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
56
- for sub_entry in sub_entries:
57
- file_path = os.path.join(save_path, entry, sub_entry)
58
  with open(file_path) as fp:
59
  data = json.load(fp)
60
 
@@ -70,9 +64,13 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
70
 
71
  all_evals.append(formatted_data)
72
 
 
73
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
74
  running_list = [e for e in all_evals if e["status"] == "RUNNING"]
75
  finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
 
 
 
76
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
77
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
78
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
 
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
 
7
  from src.leaderboard.read_evals import get_raw_eval_results
8
 
9
 
10
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
11
  """Creates a dataframe from all the individual experiment results"""
12
+ print(f"Getting raw eval results from {results_path} and {requests_path}")
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
+ print(f"Got {len(raw_data)} raw eval results")
15
+
16
+ if not raw_data:
17
+ print("No raw data found!")
18
+ return pd.DataFrame(columns=cols)
19
+
20
  all_data_json = [v.to_dict() for v in raw_data]
21
+ print(f"Converted {len(all_data_json)} results to dict")
22
 
23
  df = pd.DataFrame.from_records(all_data_json)
24
+ print(f"Created DataFrame with columns: {df.columns.tolist()}")
25
+
26
  # Ensure all required columns exist before filtering
27
  for col in benchmark_cols:
28
  if col not in df.columns:
29
+ print(f"Missing required column: {col}")
30
  df[col] = None
31
+
32
  # Filter out if any of the benchmarks have not been produced
33
  df = df[has_no_nan_values(df, benchmark_cols)]
34
  df = df.sort_values(by="Security Score ⬆️", ascending=False)
35
  df = df[cols].round(decimals=2)
36
 
37
+ print(f"Final DataFrame has {len(df)} rows")
38
  return df
39
 
40
 
41
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
42
  """Creates the different dataframes for the evaluation queues requestes"""
43
+ print(f"Looking for eval requests in {save_path}")
44
  all_evals = []
45
 
46
+ # Walk through all directories recursively
47
+ for root, _, files in os.walk(save_path):
48
+ for file in files:
49
+ if file.endswith('.json'):
50
+ file_path = os.path.join(root, file)
51
+ print(f"Reading JSON file: {file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  with open(file_path) as fp:
53
  data = json.load(fp)
54
 
 
64
 
65
  all_evals.append(formatted_data)
66
 
67
+ print(f"Found {len(all_evals)} total eval requests")
68
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
69
  running_list = [e for e in all_evals if e["status"] == "RUNNING"]
70
  finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
71
+
72
+ print(f"Pending: {len(pending_list)}, Running: {len(running_list)}, Finished: {len(finished_list)}")
73
+
74
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
75
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
76
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
test-locally.sh CHANGED
@@ -1,24 +1,23 @@
1
  #!/bin/bash
2
 
3
- # Clean up any previous runs
4
- rm -rf venv eval-queue/* eval-results/* __pycache__ src/__pycache__ src/*/__pycache__
5
-
6
- # Create virtual environment
7
- python3 -m venv venv
8
-
9
- # Ensure we're using the virtual environment's Python and pip
10
- PYTHON="./venv/bin/python3"
11
- PIP="./venv/bin/pip"
12
-
13
- # Install dependencies
14
- $PYTHON -m pip install --upgrade pip
15
- $PIP install -r requirements.txt
16
 
17
  # Create necessary directories
18
- mkdir -p eval-queue eval-results
19
 
20
  # Create sample data files with correct column names matching Tasks definitions
21
- cat > eval-queue/test_model_eval_request_float16.json << EOL
22
  {
23
  "model": "test/model",
24
  "precision": "float16",
@@ -32,7 +31,8 @@ cat > eval-queue/test_model_eval_request_float16.json << EOL
32
  }
33
  EOL
34
 
35
- cat > eval-results/results_1.json << EOL
 
36
  {
37
  "config": {
38
  "model_name": "test/model",
@@ -50,10 +50,9 @@ cat > eval-results/results_1.json << EOL
50
  }
51
  EOL
52
 
53
- # Set environment variables
54
- export HF_HOME="."
55
- export HF_TOKEN="dummy-token" # The app will work locally without a real token
56
 
57
- # Run the app
58
- echo "Starting the app..."
59
- $PYTHON app.py
 
1
  #!/bin/bash
2
 
3
+ # Create virtual environment only if it doesn't exist
4
+ if [ ! -d "venv" ]; then
5
+ python3 -m venv venv
6
+ source ./venv/bin/activate
7
+ python -m pip install --upgrade pip
8
+ pip install -r requirements.txt
9
+ else
10
+ source ./venv/bin/activate
11
+ fi
12
+
13
+ # Clean up old test data and cache
14
+ rm -rf eval-queue/* eval-results/* __pycache__ src/__pycache__ src/*/__pycache__
 
15
 
16
  # Create necessary directories
17
+ mkdir -p "eval-queue/test" "eval-results"
18
 
19
  # Create sample data files with correct column names matching Tasks definitions
20
+ cat > "eval-queue/test/model_eval_request_float16.json" << EOL
21
  {
22
  "model": "test/model",
23
  "precision": "float16",
 
31
  }
32
  EOL
33
 
34
+ # Create results file with all required benchmarks
35
+ cat > "eval-results/results_20240101_000000.json" << EOL
36
  {
37
  "config": {
38
  "model_name": "test/model",
 
50
  }
51
  EOL
52
 
53
+ # Print debug info
54
+ echo "Current directory structure:"
55
+ tree eval-queue eval-results
56
 
57
+ echo -e "\nStarting the app..."
58
+ PYTHONPATH=. ./venv/bin/python app_local.py