wu981526092 committed
Commit 5fc842f · 1 Parent(s): f03f82b
app.py CHANGED
@@ -92,7 +92,7 @@ def init_leaderboard(dataframe):
     """Initialize the leaderboard component"""
     if dataframe is None or dataframe.empty:
         # Create an empty dataframe with the expected columns
-        all_columns = COLS + [task.name for task in Tasks]
+        all_columns = COLS + [task.value.col_name for task in Tasks]
         empty_df = pd.DataFrame(columns=all_columns)
         print("Warning: Leaderboard DataFrame is empty. Using empty dataframe.")
         dataframe = empty_df
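For context, a minimal sketch of what the `Tasks` enum presumably looks like (its real definition is elsewhere in the repo and not part of this diff; the member names, the `metric` field, and the security entry are assumptions). It shows why `task.name` and `task.value.col_name` yield different strings, and why only the latter matches the leaderboard's display columns:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # key used in raw assessment results, e.g. "license_validation"
    metric: str     # assumed metric field
    col_name: str   # display column name shown on the leaderboard, e.g. "License Risk"

class Tasks(Enum):
    # Illustrative members only; the real enum defines the project's actual risk categories.
    license = Task("license_validation", "risk_score", "License Risk")
    security = Task("security_scan", "risk_score", "Security Risk")

print([task.name for task in Tasks])            # ['license', 'security'] - enum member names
print([task.value.col_name for task in Tasks])  # ['License Risk', 'Security Risk'] - display names
```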
assessment-queue/langchain-ai_langchain_eval_request_FINISHED_v0.1.0.json ADDED
@@ -0,0 +1,14 @@
+{
+    "library": "langchain-ai/langchain",
+    "version": "v0.1.0",
+    "repository_url": "https://github.com/langchain-ai/langchain",
+    "language": "Python",
+    "framework": "Python SDK",
+    "library_type": "llm framework",
+    "license": "MIT",
+    "stars": 74500,
+    "status": "FINISHED",
+    "submitted_time": "2025-04-30T10:00:00Z",
+    "last_updated": "2025-05-01T12:00:00Z",
+    "assessment_id": "abc123"
+}
assessment-queue/microsoft_autogen_eval_request_FINISHED_v0.2.0.json ADDED
@@ -0,0 +1,14 @@
+{
+    "library": "microsoft/autogen",
+    "version": "v0.2.0",
+    "repository_url": "https://github.com/microsoft/autogen",
+    "language": "Python",
+    "framework": "Agent Framework",
+    "library_type": "agent framework",
+    "license": "MIT",
+    "stars": 48700,
+    "status": "FINISHED",
+    "submitted_time": "2025-05-02T08:15:00Z",
+    "last_updated": "2025-05-03T09:15:00Z",
+    "assessment_id": "ghi789"
+}
assessment-queue/pytorch_pytorch_eval_request_FINISHED_v2.1.0.json ADDED
@@ -0,0 +1,14 @@
+{
+    "library": "pytorch/pytorch",
+    "version": "v2.1.0",
+    "repository_url": "https://github.com/pytorch/pytorch",
+    "language": "Python",
+    "framework": "Machine Learning",
+    "library_type": "machine learning",
+    "license": "BSD-3",
+    "stars": 72300,
+    "status": "FINISHED",
+    "submitted_time": "2025-05-01T16:30:00Z",
+    "last_updated": "2025-05-02T14:30:00Z",
+    "assessment_id": "def456"
+}
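The three request files above share one flat JSON schema, and their file names encode the library, status, and version. As a rough sketch of how such files could be named and consumed (both helpers below are hypothetical, not code from this repo):

```python
import json
from pathlib import Path

def request_filename(library: str, status: str, version: str) -> str:
    """Hypothetical helper reproducing the naming pattern seen above, e.g.
    ("langchain-ai/langchain", "FINISHED", "v0.1.0")
      -> "langchain-ai_langchain_eval_request_FINISHED_v0.1.0.json"."""
    return f"{library.replace('/', '_')}_eval_request_{status}_{version}.json"

def load_finished_requests(queue_dir: str = "assessment-queue") -> list[dict]:
    """Hypothetical helper: collect request files whose status is FINISHED."""
    requests = []
    for path in sorted(Path(queue_dir).glob("*_eval_request_*.json")):
        with open(path) as f:
            request = json.load(f)
        if request.get("status") == "FINISHED":
            requests.append(request)
    return requests
```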
src/display/utils.py CHANGED
@@ -105,6 +105,6 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-# Task columns for benchmarking - use the task names from the Tasks enum
-BENCHMARK_COLS = [task.name for task in Tasks]
+# Task columns for benchmarking - use the display column names from the Tasks enum
+BENCHMARK_COLS = [task.value.col_name for task in Tasks]
 
src/leaderboard/read_evals.py CHANGED
@@ -138,11 +138,11 @@ class AssessmentResult:
             AutoEvalColumn.availability.name: self.availability,
         }
 
-        # Add task-specific risk scores - map each task to its column name
+        # Add task-specific risk scores - map to display column names
         for task in Tasks:
             task_enum = task.value  # Task dataclass instance
             benchmark_key = task_enum.benchmark  # e.g., "license_validation"
-            col_name = task.name  # The field name in AutoEvalColumn, e.g., "license"
+            col_name = task_enum.col_name  # Use the display name, e.g., "License Risk"
             risk_score = self.results.get(benchmark_key, 10)  # Default to highest risk
             data_dict[col_name] = risk_score
 
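Using the `Tasks` sketch after the app.py hunk, the mapping loop behaves roughly like this after the change (values illustrative; `results` stands in for `AssessmentResult.results`):

```python
results = {"license_validation": 3}      # raw scores keyed by benchmark name; security not assessed

data_dict = {}
for task in Tasks:                       # Tasks enum from the earlier sketch
    task_enum = task.value
    benchmark_key = task_enum.benchmark  # e.g. "license_validation"
    col_name = task_enum.col_name        # e.g. "License Risk"
    data_dict[col_name] = results.get(benchmark_key, 10)  # missing benchmarks default to highest risk

print(data_dict)  # {'License Risk': 3, 'Security Risk': 10}

# These keys are exactly what BENCHMARK_COLS now contains, so the leaderboard
# DataFrame columns and the per-row keys line up.
BENCHMARK_COLS = [task.value.col_name for task in Tasks]
assert all(col in data_dict for col in BENCHMARK_COLS)
```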
 
src/populate.py CHANGED
@@ -14,7 +14,7 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
         eval_results_path: Path to the assessment result files
         eval_requests_path: Path to the assessment request files
         cols: Columns names to include in the dataframe
-        benchmark_cols: Risk categories column names
+        benchmark_cols: Risk categories column names (display names)
 
     Returns:
         Pandas dataframe for the leaderboard
@@ -27,21 +27,25 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
         # Create dataframe from assessment results
         all_df = pd.DataFrame.from_records([r.to_dict() for r in assessment_results])
 
-        # Ensure the task columns are included
-        task_cols = [task.name for task in Tasks]
-
+        # Ensure we have all the needed display columns
+        all_columns = set(all_df.columns)
+        for col in benchmark_cols:
+            if col not in all_columns:
+                print(f"Warning: Column '{col}' missing, adding empty column")
+                all_df[col] = 10.0  # Default to highest risk
+
         # Sort by overall risk score (ascending - lower is better)
         if AutoEvalColumn.overall_risk.name in all_df.columns:
             all_df = all_df.sort_values(by=[AutoEvalColumn.overall_risk.name])
 
         return all_df
 
-        return pd.DataFrame(columns=cols)  # Empty dataframe with columns
+        return pd.DataFrame(columns=cols + benchmark_cols)  # Empty dataframe with all columns
     except Exception as e:
         print(f"Error reading evaluation results: {e}")
         import traceback
         traceback.print_exc()
-        return pd.DataFrame(columns=cols)  # Return empty dataframe
+        return pd.DataFrame(columns=cols + benchmark_cols)  # Return empty dataframe with all columns
 
 
 def get_evaluation_queue_df(eval_requests_path, eval_cols):
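A toy run of the new fill-and-sort behaviour in `get_leaderboard_df` (column names and values are assumed; the real code sorts by `AutoEvalColumn.overall_risk.name` rather than the literal string used here):

```python
import pandas as pd

benchmark_cols = ["License Risk", "Security Risk"]  # assumed display names
all_df = pd.DataFrame([
    {"library": "a/lib", "Overall Risk": 4.0, "License Risk": 2.0},
    {"library": "b/lib", "Overall Risk": 1.5, "License Risk": 1.0},
])  # neither toy row reports "Security Risk"

all_columns = set(all_df.columns)
for col in benchmark_cols:
    if col not in all_columns:
        print(f"Warning: Column '{col}' missing, adding empty column")
        all_df[col] = 10.0  # default to highest risk, as in the patch

all_df = all_df.sort_values(by=["Overall Risk"])  # lower overall risk ranks first
print(all_df)
```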