Fix 'AutoEvalColumn' has no attribute
- debug.py +39 -0
- hub/version.txt +1 -0
- src/about.py +1 -1
- src/leaderboard/read_evals.py +29 -32
- src/populate.py +30 -9
- test-locally.sh +59 -0
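
Summary: EvalResult.to_dict() was still keying the leaderboard dataframe on AutoEvalColumn attribute lookups (e.g. AutoEvalColumn.still_on_hub.name) that no longer resolve in this Space, which is what raised the "'AutoEvalColumn' has no attribute" error. The dict is now built with literal column names, missing request files and missing benchmark columns are handled explicitly, the Security Score column gains the ⬆️ suffix that the sort relies on, and a debug script plus a local test harness are added.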
debug.py
ADDED
@@ -0,0 +1,39 @@
+import pandas as pd
+from src.populate import get_leaderboard_df
+from src.display.utils import COLS, BENCHMARK_COLS
+from src.about import Tasks
+from src.leaderboard.read_evals import get_raw_eval_results
+
+print("Tasks definitions:")
+for task in Tasks:
+    print(f"- {task.name}: benchmark={task.value.benchmark}, metric={task.value.metric}, col_name={task.value.col_name}")
+
+print("\nBenchmark columns:", BENCHMARK_COLS)
+
+try:
+    # Get raw results first
+    raw_results = get_raw_eval_results("eval-results", "eval-queue")
+    print("\nRaw results:")
+    for result in raw_results:
+        print("\nResult:")
+        print("- eval_name:", result.eval_name)
+        print("- results:", result.results)
+        data_dict = result.to_dict()
+        print("- data_dict:", data_dict)
+
+    # Convert to DataFrame
+    all_data_json = [v.to_dict() for v in raw_results]
+    df = pd.DataFrame.from_records(all_data_json)
+    print("\nDataFrame columns:", df.columns.tolist())
+    print("\nDataFrame contents:")
+    print(df)
+except Exception as e:
+    print("\nError:", str(e))
+    import traceback
+    traceback.print_exc()
+
+# Print raw data for debugging
+print("\nRaw data from results file:")
+import json
+with open("eval-results/results_1.json") as f:
+    print(json.dumps(json.load(f), indent=2))
hub/version.txt
ADDED
@@ -0,0 +1 @@
+1
src/about.py
CHANGED
@@ -15,7 +15,7 @@ class Tasks(Enum):
     safetensors = Task("safetensors_check", "compliant", "Safetensors")

     # Security prompts evaluation
-    secure_coding = Task("secure_coding", "security_score", "Security Score")
+    secure_coding = Task("secure_coding", "security_score", "Security Score ⬆️")

 NUM_FEWSHOT = 0
 # ---------------------------------------------------
src/leaderboard/read_evals.py
CHANGED
@@ -70,14 +70,14 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
-
-
-
-
-
-
-
+            if task.benchmark in data["results"]:
+                result = data["results"][task.benchmark]
+                metric_value = result.get(task.metric)
+                if metric_value is not None:
+                    if isinstance(metric_value, bool):
+                        results[task.col_name] = metric_value
+                    else:
+                        results[task.col_name] = metric_value * 100.0

         return self(
             eval_name=result_key,
@@ -95,7 +95,7 @@ class EvalResult:
         """Finds the relevant request file for the current model and updates info with it"""
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

-
+        if request_file:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
@@ -112,30 +112,28 @@ class EvalResult:
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
-
+        else:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name
-
-
-
-
-
-
-
-
-
-
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            "eval_name": self.eval_name,  # not a column, just a save name
+            "Precision": self.precision.value.name,
+            "Type": self.model_type.value.name,
+            "T": self.model_type.value.symbol,
+            "Weight Format": self.weight_type.value.name,
+            "Architecture": self.architecture,
+            "Model": make_clickable_model(self.full_model),
+            "Model SHA": self.revision,
+            "Hub License": self.license,
+            "Hub ❤️": self.likes,
+            "#Params (B)": self.num_params,
+            "Available on Hub": self.still_on_hub,
         }
-
-
-
+
+        # Add benchmark results
+        data_dict.update(self.results)

         return data_dict

@@ -159,6 +157,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
             and req_content["precision"] == precision.split(".")[-1]
         ):
             request_file = tmp_request_file
+            break
     return request_file


@@ -189,16 +188,14 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update(
+            eval_results[eval_name].results.update(eval_result.results)
         else:
             eval_results[eval_name] = eval_result

     results = []
     for v in eval_results.values():
-
-
+        # Only include results that have all required benchmark values
+        if all(task.value.col_name in v.results for task in Tasks):
             results.append(v)
-        except KeyError:  # not all eval values present
-            continue

     return results
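
Since to_dict() now hard-codes the display column names instead of reading them from AutoEvalColumn, a quick way to confirm the two stay in sync is to compare its output against the expected column lists. A rough check along the lines of debug.py, assuming the same COLS/BENCHMARK_COLS exports and the sample data under eval-results/ and eval-queue/:

from src.display.utils import COLS, BENCHMARK_COLS
from src.leaderboard.read_evals import get_raw_eval_results

raw_results = get_raw_eval_results("eval-results", "eval-queue")

# Collect every key that the fixed to_dict() actually emits.
produced = set()
for result in raw_results:
    produced.update(result.to_dict().keys())

# Any expected column that is never produced would reintroduce the missing-column failure.
missing = [col for col in list(COLS) + list(BENCHMARK_COLS) if col not in produced]
print("Expected columns not produced by to_dict():", missing)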
src/populate.py
CHANGED
@@ -14,11 +14,17 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     all_data_json = [v.to_dict() for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)
-
+
+    # Ensure all required columns exist before filtering
+    for col in benchmark_cols:
+        if col not in df.columns:
+            df[col] = None
+
+    # Filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    df = df.sort_values(by="Security Score ⬆️", ascending=False)
     df = df[cols].round(decimals=2)

-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
     return df


@@ -33,10 +39,17 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)

-
-
+            # Create a new dict with the required column names
+            formatted_data = {
+                "model": make_clickable_model(data["model"]),
+                "revision": data.get("revision", "main"),
+                "private": data.get("private", False),
+                "precision": data.get("precision", ""),
+                "weight_type": data.get("weight_type", ""),
+                "status": data.get("status", "")
+            }

-            all_evals.append(
+            all_evals.append(formatted_data)
         elif ".md" not in entry:
             # this is a folder
             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
@@ -45,9 +58,17 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)

-
-
-
+                # Create a new dict with the required column names
+                formatted_data = {
+                    "model": make_clickable_model(data["model"]),
+                    "revision": data.get("revision", "main"),
+                    "private": data.get("private", False),
+                    "precision": data.get("precision", ""),
+                    "weight_type": data.get("weight_type", ""),
+                    "status": data.get("status", "")
+                }
+
+                all_evals.append(formatted_data)

     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
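
Per the hunk header, get_leaderboard_df takes the results path, the requests path, the full column list and the benchmark columns. A minimal local call under the same assumptions as debug.py (the paths match what test-locally.sh creates):

from src.display.utils import COLS, BENCHMARK_COLS
from src.populate import get_leaderboard_df

# The new guard adds any benchmark column absent from every result as None, so rows
# lacking a benchmark value are filtered out by has_no_nan_values instead of the
# later column selection raising a KeyError.
df = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)
print(df.head())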
test-locally.sh
ADDED
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Clean up any previous runs
+rm -rf venv eval-queue/* eval-results/* __pycache__ src/__pycache__ src/*/__pycache__
+
+# Create virtual environment
+python3 -m venv venv
+
+# Ensure we're using the virtual environment's Python and pip
+PYTHON="./venv/bin/python3"
+PIP="./venv/bin/pip"
+
+# Install dependencies
+$PYTHON -m pip install --upgrade pip
+$PIP install -r requirements.txt
+
+# Create necessary directories
+mkdir -p eval-queue eval-results
+
+# Create sample data files with correct column names matching Tasks definitions
+cat > eval-queue/test_model_eval_request_float16.json << EOL
+{
+    "model": "test/model",
+    "precision": "float16",
+    "model_type": "pretrained 🟢",
+    "weight_type": "Safetensors",
+    "license": "MIT",
+    "likes": 100,
+    "params": 7,
+    "submitted_time": "2024-01-01",
+    "status": "FINISHED"
+}
+EOL
+
+cat > eval-results/results_1.json << EOL
+{
+    "config": {
+        "model_name": "test/model",
+        "model_dtype": "float16",
+        "model_sha": "main"
+    },
+    "results": {
+        "secure_coding": {
+            "security_score": 0.85
+        },
+        "safetensors_check": {
+            "compliant": true
+        }
+    }
+}
+EOL
+
+# Set environment variables
+export HF_HOME="."
+export HF_TOKEN="dummy-token"  # The app will work locally without a real token
+
+# Run the app
+echo "Starting the app..."
+$PYTHON app.py