wu981526092 committed
Commit f03f82b · 1 Parent(s): fbd403a
app.py CHANGED
@@ -12,6 +12,7 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    Tasks
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
@@ -91,7 +92,8 @@ def init_leaderboard(dataframe):
     """Initialize the leaderboard component"""
     if dataframe is None or dataframe.empty:
         # Create an empty dataframe with the expected columns
-        empty_df = pd.DataFrame(columns=COLS)
+        all_columns = COLS + [task.name for task in Tasks]
+        empty_df = pd.DataFrame(columns=all_columns)
         print("Warning: Leaderboard DataFrame is empty. Using empty dataframe.")
         dataframe = empty_df
 
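For context, the empty-leaderboard fallback now appends one column per `Tasks` member. The snippet below is a minimal sketch of what the `Tasks` enum in `src.about` presumably looks like, based only on the names used in this commit (`task.name`, `task.value.benchmark`, the "license_validation"/"license" example in the comments); the exact task list and field names are assumptions, not code from the repository.

# Hypothetical sketch of src/about.py; only the "license" example and the
# field names referenced elsewhere in this commit come from the source.
from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class Task:
    benchmark: str  # key in the raw results JSON, e.g. "license_validation"
    metric: str     # metric reported for the task (assumed field)
    col_name: str   # human-readable column label (used by the old BENCHMARK_COLS)

class Tasks(Enum):
    # task.name ("license") becomes the DataFrame column name;
    # task.value.benchmark ("license_validation") is the lookup key in results.
    license = Task("license_validation", "risk_score", "License Risk")

With a layout like that, `COLS + [task.name for task in Tasks]` gives the empty leaderboard the same column set a populated one would have.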
assessment-queue/{langchain-ai_langchain_request.json → langchain-ai_langchain_eval_request_timestamp_abc123.json} RENAMED
File without changes
assessment-queue/{microsoft_autogen_request.json → microsoft_autogen_eval_request_timestamp_ghi789.json} RENAMED
File without changes
assessment-queue/{pytorch_pytorch_request.json → pytorch_pytorch_eval_request_timestamp_def456.json} RENAMED
File without changes
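The three queue files are renamed so they match the `*_eval_request_*.json` pattern that `get_request_file_for_library` (see the read_evals.py diff below) tries first. A quick illustrative check, using only the filenames and pattern shown in this commit:

# Illustrative check that the renamed queue files match the primary glob pattern
# used in src/leaderboard/read_evals.py.
from fnmatch import fnmatch

renamed = [
    "langchain-ai_langchain_eval_request_timestamp_abc123.json",
    "microsoft_autogen_eval_request_timestamp_ghi789.json",
    "pytorch_pytorch_eval_request_timestamp_def456.json",
]
for name in renamed:
    library = name.split("_eval_request_")[0]    # e.g. "pytorch_pytorch"
    pattern = f"{library}_eval_request_*.json"   # primary pattern in the code
    assert fnmatch(name, pattern), f"{name} does not match {pattern}"
print("all renamed queue files match the primary request-file pattern")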
src/display/utils.py CHANGED
@@ -105,5 +105,6 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+# Task columns for benchmarking - use the task names from the Tasks enum
+BENCHMARK_COLS = [task.name for task in Tasks]
 
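The point of this change is that `BENCHMARK_COLS` now uses the enum member names, which is also what `AssessmentResult.to_dict` (next file) writes into the row dictionaries. Using the hypothetical `Tasks` sketch given after the app.py diff (values are assumptions):

# Contrast of the two column sources, assuming the Tasks sketch above:
old_benchmark_cols = [t.value.col_name for t in Tasks]  # ["License Risk", ...] - display labels
new_benchmark_cols = [task.name for task in Tasks]      # ["license", ...] - enum member names
# to_dict() keys the per-task scores by task.name, so only the new list lines up
# with the columns that actually appear in the leaderboard DataFrame.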
src/leaderboard/read_evals.py CHANGED
@@ -138,31 +138,53 @@ class AssessmentResult:
             AutoEvalColumn.availability.name: self.availability,
         }
 
+        # Add task-specific risk scores - map each task to its column name
         for task in Tasks:
-            data_dict[task.name] = self.results.get(task.value.benchmark, 10)  # Default to highest risk
+            task_enum = task.value  # Task dataclass instance
+            benchmark_key = task_enum.benchmark  # e.g., "license_validation"
+            col_name = task.name  # The field name in AutoEvalColumn, e.g., "license"
+            risk_score = self.results.get(benchmark_key, 10)  # Default to highest risk
+            data_dict[col_name] = risk_score
 
         return data_dict
 
 
 def get_request_file_for_library(requests_path, library_name, version):
     """Selects the correct request file for a given library. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{library_name.replace('/', '_')}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
+    # Try multiple naming patterns for flexibility
+    possible_patterns = [
+        f"{library_name.replace('/', '_')}_eval_request_*.json",  # Original pattern
+        f"{library_name.replace('/', '_')}_request.json",  # Simple pattern
+        f"{library_name.replace('/', '_')}*.json"  # Fallback pattern
+    ]
+
+    request_files = []
+    for pattern in possible_patterns:
+        pattern_path = os.path.join(requests_path, pattern)
+        found_files = glob.glob(pattern_path)
+        request_files.extend(found_files)
+
+    if not request_files:
+        print(f"Warning: No request files found matching {library_name}")
+        return ""
 
     # Select correct request file (version)
     request_file = ""
     request_files = sorted(request_files, reverse=True)
     for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["version"] == version
-            ):
-                request_file = tmp_request_file
+        try:
+            with open(tmp_request_file, "r") as f:
+                req_content = json.load(f)
+                if (
+                    req_content.get("status", "") in ["FINISHED"] and
+                    req_content.get("version", "") == version
+                ):
+                    request_file = tmp_request_file
+                    break
+        except Exception as e:
+            print(f"Error reading {tmp_request_file}: {e}")
+            continue
+
     return request_file
 
 
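A hypothetical call showing how the reworked lookup behaves with the renamed queue files; the queue path and the `version` string are made up for illustration and must match the `version` field stored in the request JSON.

# Hypothetical usage; "assessment-queue" and "1.0" are assumptions.
request_file = get_request_file_for_library(
    requests_path="assessment-queue",
    library_name="pytorch/pytorch",  # '/' is replaced by '_' before globbing
    version="1.0",                   # must equal req_content["version"]
)
if not request_file:
    print("no FINISHED request found for this library/version")

Because the three patterns overlap (the wildcard fallback matches everything the first two do), the same file can appear more than once in `request_files`; that only costs a few redundant reads, since the loop breaks on the first FINISHED match.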
src/populate.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 
 from src.display.utils import AutoEvalColumn
 from src.leaderboard.read_evals import get_raw_assessment_results
+from src.about import Tasks
 
 
 def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
@@ -26,6 +27,9 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
         # Create dataframe from assessment results
         all_df = pd.DataFrame.from_records([r.to_dict() for r in assessment_results])
 
+        # Ensure the task columns are included
+        task_cols = [task.name for task in Tasks]
+
         # Sort by overall risk score (ascending - lower is better)
         if AutoEvalColumn.overall_risk.name in all_df.columns:
             all_df = all_df.sort_values(by=[AutoEvalColumn.overall_risk.name])
@@ -35,6 +39,8 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
             return pd.DataFrame(columns=cols)  # Empty dataframe with columns
     except Exception as e:
         print(f"Error reading evaluation results: {e}")
+        import traceback
+        traceback.print_exc()
         return pd.DataFrame(columns=cols)  # Return empty dataframe
 
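`task_cols` is computed in this hunk but not used in the lines shown. If the rest of the function does not already consume it, one way to put it to work would be to backfill any missing task columns before sorting; the sketch below is an assumption about a possible follow-up, not code from this commit.

# Hypothetical continuation inside get_leaderboard_df, after building all_df:
for col in task_cols:
    if col not in all_df.columns:
        all_df[col] = 10  # assumed default, mirroring to_dict()'s highest-risk fallback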