lukehinds committed
Commit b257b3e · 1 Parent(s): b908a37

Fix 'AutoEvalColumn' has no attribute

Files changed (6)
  1. debug.py +39 -0
  2. hub/version.txt +1 -0
  3. src/about.py +1 -1
  4. src/leaderboard/read_evals.py +29 -32
  5. src/populate.py +30 -9
  6. test-locally.sh +59 -0
debug.py ADDED
@@ -0,0 +1,39 @@
+import pandas as pd
+from src.populate import get_leaderboard_df
+from src.display.utils import COLS, BENCHMARK_COLS
+from src.about import Tasks
+from src.leaderboard.read_evals import get_raw_eval_results
+
+print("Tasks definitions:")
+for task in Tasks:
+    print(f"- {task.name}: benchmark={task.value.benchmark}, metric={task.value.metric}, col_name={task.value.col_name}")
+
+print("\nBenchmark columns:", BENCHMARK_COLS)
+
+try:
+    # Get raw results first
+    raw_results = get_raw_eval_results("eval-results", "eval-queue")
+    print("\nRaw results:")
+    for result in raw_results:
+        print("\nResult:")
+        print("- eval_name:", result.eval_name)
+        print("- results:", result.results)
+        data_dict = result.to_dict()
+        print("- data_dict:", data_dict)
+
+    # Convert to DataFrame
+    all_data_json = [v.to_dict() for v in raw_results]
+    df = pd.DataFrame.from_records(all_data_json)
+    print("\nDataFrame columns:", df.columns.tolist())
+    print("\nDataFrame contents:")
+    print(df)
+except Exception as e:
+    print("\nError:", str(e))
+    import traceback
+    traceback.print_exc()
+
+# Print raw data for debugging
+print("\nRaw data from results file:")
+import json
+with open("eval-results/results_1.json") as f:
+    print(json.dumps(json.load(f), indent=2))
hub/version.txt ADDED
@@ -0,0 +1 @@
+1
src/about.py CHANGED
@@ -15,7 +15,7 @@ class Tasks(Enum):
     safetensors = Task("safetensors_check", "compliant", "Safetensors")
 
     # Security prompts evaluation
-    secure_coding = Task("secure_coding", "security_score", "Security Score")
+    secure_coding = Task("secure_coding", "security_score", "Security Score ⬆️")
 
 NUM_FEWSHOT = 0
 # ---------------------------------------------------
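For context, a minimal sketch of why the display name now carries the arrow, assuming the stock leaderboard template's Task dataclass and the BENCHMARK_COLS list in src/display/utils.py (imported by debug.py above): the sort key used later in populate.py must match col_name exactly.

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # key in the results JSON
    metric: str      # metric key inside that benchmark entry
    col_name: str    # display name, used as the DataFrame column

class Tasks(Enum):
    safetensors = Task("safetensors_check", "compliant", "Safetensors")
    secure_coding = Task("secure_coding", "security_score", "Security Score ⬆️")

# In the stock template the benchmark columns are derived from col_name,
# so sorting by "Security Score ⬆️" in populate.py only works if the arrow
# is part of the Task definition as well.
BENCHMARK_COLS = [task.value.col_name for task in Tasks]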
src/leaderboard/read_evals.py CHANGED
@@ -70,14 +70,14 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+            if task.benchmark in data["results"]:
+                result = data["results"][task.benchmark]
+                metric_value = result.get(task.metric)
+                if metric_value is not None:
+                    if isinstance(metric_value, bool):
+                        results[task.col_name] = metric_value
+                    else:
+                        results[task.col_name] = metric_value * 100.0
 
         return self(
             eval_name=result_key,
@@ -95,7 +95,7 @@ class EvalResult:
         """Finds the relevant request file for the current model and updates info with it"""
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
-        try:
+        if request_file:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
@@ -112,30 +112,28 @@ class EvalResult:
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
-        except Exception:
+        else:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
-            "eval_name": self.eval_name, # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            "eval_name": self.eval_name, # not a column, just a save name
+            "Precision": self.precision.value.name,
+            "Type": self.model_type.value.name,
+            "T": self.model_type.value.symbol,
+            "Weight Format": self.weight_type.value.name,
+            "Architecture": self.architecture,
+            "Model": make_clickable_model(self.full_model),
+            "Model SHA": self.revision,
+            "Hub License": self.license,
+            "Hub ❤️": self.likes,
+            "#Params (B)": self.num_params,
+            "Available on Hub": self.still_on_hub,
         }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+
+        # Add benchmark results
+        data_dict.update(self.results)
 
         return data_dict
 
@@ -159,6 +157,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
                 and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
+                break
     return request_file
 
 
@@ -189,16 +188,14 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            eval_results[eval_name].results.update(eval_result.results)
         else:
             eval_results[eval_name] = eval_result
 
     results = []
    for v in eval_results.values():
-        try:
-            v.to_dict() # we test if the dict version is complete
+        # Only include results that have all required benchmark values
+        if all(task.value.col_name in v.results for task in Tasks):
            results.append(v)
-        except KeyError: # not all eval values present
-            continue
 
     return results
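A minimal standalone sketch of the new parsing path, run against the sample payload that test-locally.sh writes below; the inline task triples are illustrative only and mirror the Tasks enum in src/about.py.

sample = {
    "results": {
        "secure_coding": {"security_score": 0.85},
        "safetensors_check": {"compliant": True},
    }
}

# (benchmark, metric, col_name) triples mirroring src/about.py
tasks = [
    ("secure_coding", "security_score", "Security Score ⬆️"),
    ("safetensors_check", "compliant", "Safetensors"),
]

results = {}
for benchmark, metric, col_name in tasks:
    if benchmark in sample["results"]:
        metric_value = sample["results"][benchmark].get(metric)
        if metric_value is not None:
            # booleans pass through unchanged, numeric scores are rescaled to 0-100
            results[col_name] = metric_value if isinstance(metric_value, bool) else metric_value * 100.0

print(results)  # security score scaled to ~85.0, compliance kept as boolean True

Because results is now keyed by col_name rather than benchmark, to_dict() can simply merge it into data_dict with data_dict.update(self.results), which is what removes the 'AutoEvalColumn' lookups that triggered the attribute error.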
src/populate.py CHANGED
@@ -14,11 +14,17 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by="Security Score ⬆️", ascending=False) # Using the display name directly
+
+    # Ensure all required columns exist before filtering
+    for col in benchmark_cols:
+        if col not in df.columns:
+            df[col] = None
+
+    # Filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    df = df.sort_values(by="Security Score ⬆️", ascending=False)
     df = df[cols].round(decimals=2)
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
@@ -33,10 +39,17 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # Create a new dict with the required column names
+            formatted_data = {
+                "model": make_clickable_model(data["model"]),
+                "revision": data.get("revision", "main"),
+                "private": data.get("private", False),
+                "precision": data.get("precision", ""),
+                "weight_type": data.get("weight_type", ""),
+                "status": data.get("status", "")
+            }
 
-            all_evals.append(data)
+            all_evals.append(formatted_data)
         elif ".md" not in entry:
             # this is a folder
             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
@@ -45,9 +58,17 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)
 
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
+                # Create a new dict with the required column names
+                formatted_data = {
+                    "model": make_clickable_model(data["model"]),
+                    "revision": data.get("revision", "main"),
+                    "private": data.get("private", False),
+                    "precision": data.get("precision", ""),
+                    "weight_type": data.get("weight_type", ""),
+                    "status": data.get("status", "")
+                }
+
+                all_evals.append(formatted_data)
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
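For reference, a sketch of the filtering helper this hunk depends on, assuming the stock template's has_no_nan_values (rows survive only if every benchmark column is populated, which is why missing columns are pre-created as None before the filter):

import pandas as pd

def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
    # Boolean mask: True for rows where every listed column has a value
    return df[columns].notna().all(axis=1)

df = pd.DataFrame.from_records([
    {"Model": "test/model", "Security Score ⬆️": 85.0, "Safetensors": True},
    {"Model": "other/model", "Security Score ⬆️": None, "Safetensors": True},
])
mask = has_no_nan_values(df, ["Security Score ⬆️", "Safetensors"])
print(df[mask])  # only the fully populated test/model row remains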
test-locally.sh ADDED
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Clean up any previous runs
+rm -rf venv eval-queue/* eval-results/* __pycache__ src/__pycache__ src/*/__pycache__
+
+# Create virtual environment
+python3 -m venv venv
+
+# Ensure we're using the virtual environment's Python and pip
+PYTHON="./venv/bin/python3"
+PIP="./venv/bin/pip"
+
+# Install dependencies
+$PYTHON -m pip install --upgrade pip
+$PIP install -r requirements.txt
+
+# Create necessary directories
+mkdir -p eval-queue eval-results
+
+# Create sample data files with correct column names matching Tasks definitions
+cat > eval-queue/test_model_eval_request_float16.json << EOL
+{
+    "model": "test/model",
+    "precision": "float16",
+    "model_type": "pretrained 🟢",
+    "weight_type": "Safetensors",
+    "license": "MIT",
+    "likes": 100,
+    "params": 7,
+    "submitted_time": "2024-01-01",
+    "status": "FINISHED"
+}
+EOL
+
+cat > eval-results/results_1.json << EOL
+{
+    "config": {
+        "model_name": "test/model",
+        "model_dtype": "float16",
+        "model_sha": "main"
+    },
+    "results": {
+        "secure_coding": {
+            "security_score": 0.85
+        },
+        "safetensors_check": {
+            "compliant": true
+        }
+    }
+}
+EOL
+
+# Set environment variables
+export HF_HOME="."
+export HF_TOKEN="dummy-token" # The app will work locally without a real token
+
+# Run the app
+echo "Starting the app..."
+$PYTHON app.py
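As a quick end-to-end check of the sample data above, the leaderboard DataFrame can also be built directly from a Python shell in the repo root (a sketch mirroring what debug.py does, assuming the COLS and BENCHMARK_COLS lists from src.display.utils):

from src.populate import get_leaderboard_df
from src.display.utils import COLS, BENCHMARK_COLS

# With the fix applied, the single test/model row should survive the
# has_no_nan_values filter and sort on "Security Score ⬆️".
df = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)
print(df)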