Commit · f03f82b
1 Parent(s): fbd403a

update
- app.py +3 -1
- assessment-queue/{langchain-ai_langchain_request.json → langchain-ai_langchain_eval_request_timestamp_abc123.json} +0 -0
- assessment-queue/{microsoft_autogen_request.json → microsoft_autogen_eval_request_timestamp_ghi789.json} +0 -0
- assessment-queue/{pytorch_pytorch_request.json → pytorch_pytorch_eval_request_timestamp_def456.json} +0 -0
- src/display/utils.py +2 -1
- src/leaderboard/read_evals.py +35 -13
- src/populate.py +6 -0
app.py
CHANGED
@@ -12,6 +12,7 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    Tasks
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
@@ -91,7 +92,8 @@ def init_leaderboard(dataframe):
     """Initialize the leaderboard component"""
     if dataframe is None or dataframe.empty:
         # Create an empty dataframe with the expected columns
-
+        all_columns = COLS + [task.name for task in Tasks]
+        empty_df = pd.DataFrame(columns=all_columns)
         print("Warning: Leaderboard DataFrame is empty. Using empty dataframe.")
         dataframe = empty_df
 
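Note: both hunks assume that src/about.py exposes a Tasks enum whose member values carry at least a benchmark field (the read_evals.py hunk below references task.value.benchmark and task.name). A minimal sketch of what such a definition could look like; the member names, the "security_scan" key, and the col_name field are illustrative assumptions, not the actual contents of src/about.py:

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key looked up in the results dict, e.g. "license_validation"
    col_name: str   # display header (assumed field; only `benchmark` is used in this commit)


class Tasks(Enum):
    # Hypothetical members for illustration only
    license = Task("license_validation", "License Risk")
    security = Task("security_scan", "Security Risk")

With members like these, [task.name for task in Tasks] evaluates to ["license", "security"], which is what init_leaderboard appends to COLS when building the empty fallback dataframe, and what BENCHMARK_COLS and AssessmentResult.to_dict() rely on further down.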
assessment-queue/{langchain-ai_langchain_request.json → langchain-ai_langchain_eval_request_timestamp_abc123.json}
RENAMED
File without changes

assessment-queue/{microsoft_autogen_request.json → microsoft_autogen_eval_request_timestamp_ghi789.json}
RENAMED
File without changes

assessment-queue/{pytorch_pytorch_request.json → pytorch_pytorch_eval_request_timestamp_def456.json}
RENAMED
File without changes
src/display/utils.py
CHANGED
@@ -105,5 +105,6 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-
+# Task columns for benchmarking - use the task names from the Tasks enum
+BENCHMARK_COLS = [task.name for task in Tasks]
 
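Given a Tasks enum shaped like the sketch above, BENCHMARK_COLS is simply the list of enum member names, i.e. the same strings app.py concatenates onto COLS and that to_dict() uses as column keys. A hypothetical REPL check (member names come from the illustrative sketch, not the real src/about.py):

>>> [task.name for task in Tasks]
['license', 'security']
>>> BENCHMARK_COLS == [task.name for task in Tasks]
True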
src/leaderboard/read_evals.py
CHANGED
@@ -138,31 +138,53 @@ class AssessmentResult:
             AutoEvalColumn.availability.name: self.availability,
         }
 
+        # Add task-specific risk scores - map each task to its column name
         for task in Tasks:
-
+            task_enum = task.value  # Task dataclass instance
+            benchmark_key = task_enum.benchmark  # e.g., "license_validation"
+            col_name = task.name  # The field name in AutoEvalColumn, e.g., "license"
+            risk_score = self.results.get(benchmark_key, 10)  # Default to highest risk
+            data_dict[col_name] = risk_score
 
         return data_dict
 
 
 def get_request_file_for_library(requests_path, library_name, version):
     """Selects the correct request file for a given library. Only keeps runs tagged as FINISHED"""
-
-
-        f"{library_name.replace('/', '_')}_eval_request_*.json",
-
-
+    # Try multiple naming patterns for flexibility
+    possible_patterns = [
+        f"{library_name.replace('/', '_')}_eval_request_*.json",  # Original pattern
+        f"{library_name.replace('/', '_')}_request.json",  # Simple pattern
+        f"{library_name.replace('/', '_')}*.json"  # Fallback pattern
+    ]
+
+    request_files = []
+    for pattern in possible_patterns:
+        pattern_path = os.path.join(requests_path, pattern)
+        found_files = glob.glob(pattern_path)
+        request_files.extend(found_files)
+
+    if not request_files:
+        print(f"Warning: No request files found matching {library_name}")
+        return ""
 
     # Select correct request file (version)
     request_file = ""
     request_files = sorted(request_files, reverse=True)
     for tmp_request_file in request_files:
-
-
-
-
-
-
-
+        try:
+            with open(tmp_request_file, "r") as f:
+                req_content = json.load(f)
+            if (
+                req_content.get("status", "") in ["FINISHED"] and
+                req_content.get("version", "") == version
+            ):
+                request_file = tmp_request_file
+                break
+        except Exception as e:
+            print(f"Error reading {tmp_request_file}: {e}")
+            continue
+
     return request_file
 
 
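Note: the rewritten lookup is purely glob-based, so it now matches the renamed queue files above (e.g. pytorch_pytorch_eval_request_timestamp_def456.json) as well as the older <library>_request.json layout, and it skips unreadable or non-matching JSON instead of raising. A small self-contained sketch of that matching behaviour; the temporary directory, file name, and JSON fields are assumptions for illustration:

import glob
import json
import os
import tempfile

# Fake queue entry mirroring the renamed files in this commit
queue = tempfile.mkdtemp()
prefix = "pytorch/pytorch".replace("/", "_")
path = os.path.join(queue, f"{prefix}_eval_request_timestamp_def456.json")
with open(path, "w") as f:
    json.dump({"status": "FINISHED", "version": "1.0"}, f)

# The first pattern used by get_request_file_for_library picks it up...
print(glob.glob(os.path.join(queue, f"{prefix}_eval_request_*.json")))  # [path]
# ...and so does the broad fallback pattern, which is why the collected list
# can hold the same path more than once before sorting.
print(glob.glob(os.path.join(queue, f"{prefix}*.json")))  # [path]

Two details worth keeping in mind: the overlapping patterns mean request_files can contain duplicates (harmless here, though sorted(set(request_files), reverse=True) would deduplicate), and the to_dict() change follows the same defensive style, with self.results.get(benchmark_key, 10) falling back to the highest-risk score instead of raising a KeyError when a benchmark key is missing.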
src/populate.py
CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 
 from src.display.utils import AutoEvalColumn
 from src.leaderboard.read_evals import get_raw_assessment_results
+from src.about import Tasks
 
 
 def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
@@ -26,6 +27,9 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
         # Create dataframe from assessment results
         all_df = pd.DataFrame.from_records([r.to_dict() for r in assessment_results])
 
+        # Ensure the task columns are included
+        task_cols = [task.name for task in Tasks]
+
         # Sort by overall risk score (ascending - lower is better)
         if AutoEvalColumn.overall_risk.name in all_df.columns:
             all_df = all_df.sort_values(by=[AutoEvalColumn.overall_risk.name])
@@ -35,6 +39,8 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
             return pd.DataFrame(columns=cols)  # Empty dataframe with columns
     except Exception as e:
         print(f"Error reading evaluation results: {e}")
+        import traceback
+        traceback.print_exc()
         return pd.DataFrame(columns=cols)  # Return empty dataframe
 
 
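For reference, a hedged sketch of how get_leaderboard_df is presumably called from app.py; the path strings are assumptions, while COLS and BENCHMARK_COLS come from src/display/utils.py as shown above:

from src.display.utils import BENCHMARK_COLS, COLS
from src.populate import get_leaderboard_df

leaderboard_df = get_leaderboard_df(
    "assessment-results",   # eval_results_path (assumed local folder name)
    "assessment-queue",     # eval_requests_path (matches the queue folder in this repo)
    COLS,
    BENCHMARK_COLS,
)

If task_cols is meant to guarantee that every task column exists even when some records lack them, a follow-up inside the try block along these lines would do it (sketch only, not part of this commit):

all_df = all_df.reindex(columns=list(dict.fromkeys(list(all_df.columns) + task_cols)))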