Commit 5fc842f
Parent(s): f03f82b
update
Files changed:
- app.py +1 -1
- assessment-queue/langchain-ai_langchain_eval_request_FINISHED_v0.1.0.json +14 -0
- assessment-queue/microsoft_autogen_eval_request_FINISHED_v0.2.0.json +14 -0
- assessment-queue/pytorch_pytorch_eval_request_FINISHED_v2.1.0.json +14 -0
- src/display/utils.py +2 -2
- src/leaderboard/read_evals.py +2 -2
- src/populate.py +10 -6
app.py
CHANGED
@@ -92,7 +92,7 @@ def init_leaderboard(dataframe):
     """Initialize the leaderboard component"""
     if dataframe is None or dataframe.empty:
         # Create an empty dataframe with the expected columns
-        all_columns = COLS + [task.
+        all_columns = COLS + [task.value.col_name for task in Tasks]
         empty_df = pd.DataFrame(columns=all_columns)
         print("Warning: Leaderboard DataFrame is empty. Using empty dataframe.")
         dataframe = empty_df
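For context, the task.value.col_name access used in the fixed line assumes the usual leaderboard-template layout, where Tasks is an Enum whose members wrap a Task dataclass. A minimal sketch of that assumption (the task entries and COLS values here are illustrative, not taken from this repo):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # key in the results JSON, e.g. "license_validation"
    metric: str     # metric stored for that benchmark
    col_name: str   # display name shown in the leaderboard, e.g. "License Risk"

class Tasks(Enum):
    # task.value is a Task instance, so task.value.col_name is the display name
    task0 = Task("license_validation", "risk_score", "License Risk")
    task1 = Task("security_scan", "risk_score", "Security Risk")

COLS = ["Library", "Version", "Stars"]  # placeholder column names
all_columns = COLS + [task.value.col_name for task in Tasks]
# -> ["Library", "Version", "Stars", "License Risk", "Security Risk"]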
assessment-queue/langchain-ai_langchain_eval_request_FINISHED_v0.1.0.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "library": "langchain-ai/langchain",
+    "version": "v0.1.0",
+    "repository_url": "https://github.com/langchain-ai/langchain",
+    "language": "Python",
+    "framework": "Python SDK",
+    "library_type": "llm framework",
+    "license": "MIT",
+    "stars": 74500,
+    "status": "FINISHED",
+    "submitted_time": "2025-04-30T10:00:00Z",
+    "last_updated": "2025-05-01T12:00:00Z",
+    "assessment_id": "abc123"
+}
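The three request files added in this commit share the same schema. As an illustration of how such queue entries might be consumed (this loader is hypothetical; the Space's actual reader lives elsewhere, e.g. in src/populate.py, and may differ), filtering on the FINISHED status could look like:

import glob
import json
import os

def load_finished_requests(queue_dir="assessment-queue"):
    """Collect all eval request files in the queue that are marked FINISHED."""
    finished = []
    for path in sorted(glob.glob(os.path.join(queue_dir, "*_eval_request_*.json"))):
        with open(path, encoding="utf-8") as fp:
            request = json.load(fp)
        if request.get("status") == "FINISHED":
            finished.append(request)
    return finished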
assessment-queue/microsoft_autogen_eval_request_FINISHED_v0.2.0.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "library": "microsoft/autogen",
+    "version": "v0.2.0",
+    "repository_url": "https://github.com/microsoft/autogen",
+    "language": "Python",
+    "framework": "Agent Framework",
+    "library_type": "agent framework",
+    "license": "MIT",
+    "stars": 48700,
+    "status": "FINISHED",
+    "submitted_time": "2025-05-02T08:15:00Z",
+    "last_updated": "2025-05-03T09:15:00Z",
+    "assessment_id": "ghi789"
+}
assessment-queue/pytorch_pytorch_eval_request_FINISHED_v2.1.0.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "library": "pytorch/pytorch",
+    "version": "v2.1.0",
+    "repository_url": "https://github.com/pytorch/pytorch",
+    "language": "Python",
+    "framework": "Machine Learning",
+    "library_type": "machine learning",
+    "license": "BSD-3",
+    "stars": 72300,
+    "status": "FINISHED",
+    "submitted_time": "2025-05-01T16:30:00Z",
+    "last_updated": "2025-05-02T14:30:00Z",
+    "assessment_id": "def456"
+}
src/display/utils.py
CHANGED
@@ -105,6 +105,6 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-# Task columns for benchmarking - use the
-BENCHMARK_COLS = [task.
+# Task columns for benchmarking - use the display column names from the Tasks enum
+BENCHMARK_COLS = [task.value.col_name for task in Tasks]
 
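Since AssessmentResult.to_dict (see src/leaderboard/read_evals.py below) now writes task scores under the same col_name values, BENCHMARK_COLS and the per-row keys stay aligned. A one-line consistency check along these lines (illustrative only, not part of the commit) would catch future drift:

# Illustrative sanity check: both sides now derive their keys from col_name.
assert set(BENCHMARK_COLS) == {task.value.col_name for task in Tasks}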
src/leaderboard/read_evals.py
CHANGED
@@ -138,11 +138,11 @@ class AssessmentResult:
             AutoEvalColumn.availability.name: self.availability,
         }
 
-        # Add task-specific risk scores - map
+        # Add task-specific risk scores - map to display column names
         for task in Tasks:
             task_enum = task.value  # Task dataclass instance
             benchmark_key = task_enum.benchmark  # e.g., "license_validation"
-            col_name =
+            col_name = task_enum.col_name  # Use the display name, e.g., "License Risk"
             risk_score = self.results.get(benchmark_key, 10)  # Default to highest risk
             data_dict[col_name] = risk_score
 
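The self.results.get(benchmark_key, 10) fallback means any benchmark missing from a result file is reported at the highest risk score. A small standalone illustration (the values here are made up):

results = {"license_validation": 2.5}           # "security_scan" not assessed
print(results.get("license_validation", 10))    # 2.5
print(results.get("security_scan", 10))         # 10 -> treated as highest risk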
src/populate.py
CHANGED
@@ -14,7 +14,7 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
         eval_results_path: Path to the assessment result files
         eval_requests_path: Path to the assessment request files
         cols: Columns names to include in the dataframe
-        benchmark_cols: Risk categories column names
+        benchmark_cols: Risk categories column names (display names)
 
     Returns:
         Pandas dataframe for the leaderboard
@@ -27,21 +27,25 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
         # Create dataframe from assessment results
         all_df = pd.DataFrame.from_records([r.to_dict() for r in assessment_results])
 
-        # Ensure the
-
-
+        # Ensure we have all the needed display columns
+        all_columns = set(all_df.columns)
+        for col in benchmark_cols:
+            if col not in all_columns:
+                print(f"Warning: Column '{col}' missing, adding empty column")
+                all_df[col] = 10.0  # Default to highest risk
+
         # Sort by overall risk score (ascending - lower is better)
         if AutoEvalColumn.overall_risk.name in all_df.columns:
             all_df = all_df.sort_values(by=[AutoEvalColumn.overall_risk.name])
 
         return all_df
 
-        return pd.DataFrame(columns=cols)  # Empty dataframe with columns
+        return pd.DataFrame(columns=cols + benchmark_cols)  # Empty dataframe with all columns
     except Exception as e:
         print(f"Error reading evaluation results: {e}")
         import traceback
         traceback.print_exc()
-        return pd.DataFrame(columns=cols)  # Return empty dataframe
+        return pd.DataFrame(columns=cols + benchmark_cols)  # Return empty dataframe with all columns
 
 
 def get_evaluation_queue_df(eval_requests_path, eval_cols):
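A sketch of how the updated function might be called, assuming COLS and BENCHMARK_COLS are imported from src/display/utils.py; the results directory name is a placeholder, only assessment-queue is confirmed by this commit:

leaderboard_df = get_leaderboard_df(
    eval_results_path="assessment-results",   # placeholder path
    eval_requests_path="assessment-queue",
    cols=COLS,
    benchmark_cols=BENCHMARK_COLS,
)
print(leaderboard_df.head())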