"""Debug script: inspect task definitions, raw eval results, and the resulting DataFrame."""

import json
import traceback

import pandas as pd

from src.about import Tasks
from src.display.utils import BENCHMARK_COLS
from src.leaderboard.read_evals import get_raw_eval_results

# Show how each task is defined (benchmark name, metric, display column).
print("Tasks definitions:")
for task in Tasks:
    print(f"- {task.name}: benchmark={task.value.benchmark}, metric={task.value.metric}, col_name={task.value.col_name}")

print("\nBenchmark columns:", BENCHMARK_COLS)

try:
    # Get raw results first
    raw_results = get_raw_eval_results("eval-results", "eval-queue")
    print("\nRaw results:")
    for result in raw_results:
        print("\nResult:")
        print("- eval_name:", result.eval_name)
        print("- results:", result.results)
        data_dict = result.to_dict()
        print("- data_dict:", data_dict)

    # Convert to DataFrame
    all_data_json = [v.to_dict() for v in raw_results]
    df = pd.DataFrame.from_records(all_data_json)
    print("\nDataFrame columns:", df.columns.tolist())
    print("\nDataFrame contents:")
    print(df)
except Exception as e:
    print("\nError:", str(e))
    traceback.print_exc()

    # Print raw data for debugging
    print("\nRaw data from results file:")
    with open("eval-results/results_1.json") as f:
        print(json.dumps(json.load(f), indent=2))