import json
import traceback

import pandas as pd

from src.display.utils import BENCHMARK_COLS
from src.about import Tasks
from src.leaderboard.read_evals import get_raw_eval_results

print("Tasks definitions:")
for task in Tasks:
    print(f"- {task.name}: benchmark={task.value.benchmark}, metric={task.value.metric}, col_name={task.value.col_name}")

print("\nBenchmark columns:", BENCHMARK_COLS)

try:
    # Get raw results first
    raw_results = get_raw_eval_results("eval-results", "eval-queue")
    print("\nRaw results:")
    for result in raw_results:
        print("\nResult:")
        print("- eval_name:", result.eval_name)
        print("- results:", result.results)
        data_dict = result.to_dict()
        print("- data_dict:", data_dict)

    # Convert to DataFrame
    all_data_json = [v.to_dict() for v in raw_results]
    df = pd.DataFrame.from_records(all_data_json)
    print("\nDataFrame columns:", df.columns.tolist())
    print("\nDataFrame contents:")
    print(df)
except Exception as e:
    print("\nError:", str(e))
    traceback.print_exc()

    # Print raw data for debugging
    print("\nRaw data from results file:")
    with open("eval-results/results_1.json") as f:
        print(json.dumps(json.load(f), indent=2))