""" Data loading and processing utilities for the leaderboard application. """ import pandas as pd import json from src.utils.config import model_categories def load_metric_data(file_path): """ Load metric data from a JSON file Args: file_path (str): Path to the JSON file containing metric data Returns: dict: Dictionary containing the loaded metric data """ try: with open(file_path, "r") as f: return json.load(f) except FileNotFoundError: print(f"Error: File {file_path} not found.") return {} except json.JSONDecodeError: print(f"Error: File {file_path} is not a valid JSON file.") return {} def process_data(metric_data): """ Process the metric data into a pandas DataFrame Args: metric_data (dict): Dictionary containing the metric data Returns: pandas.DataFrame: DataFrame containing the processed data """ # Create a DataFrame to store the model metric data tasks = list(metric_data.keys()) models = [] model_data = {} # Extract model names and their metric values for each task for task in tasks: for model in metric_data[task]: if model not in models: models.append(model) model_data[model] = {} # Store the metric value for this task model_data[model][task] = metric_data[task][model] # Create DataFrame from the model_data dictionary df = pd.DataFrame.from_dict(model_data, orient='index') # Replace NaN values with '-' df.fillna('-', inplace=True) # Rename the columns to more readable format df.columns = [task.replace("-", " ").replace("_", " ").title() for task in df.columns] # Add a model type column to the dataframe df['Model Type'] = df.index.map(lambda x: model_categories.get(x, "Unknown")) return df def calculate_selected_overall(row, selected_tasks): """ Calculate overall average for selected tasks Args: row (pandas.Series): Row of data selected_tasks (list): List of task names to include in the average Returns: float or str: The calculated average or '-' if no numeric values """ numeric_values = [] for task in selected_tasks: value = row[task] # Check if the value is numeric (could be float or string representing float) if isinstance(value, (int, float)) or (isinstance(value, str) and value.replace('.', '', 1).replace('-', '', 1).isdigit()): numeric_values.append(float(value)) # Calculate average if there are numeric values if numeric_values: return sum(numeric_values) / len(numeric_values) else: return '-' def filter_and_prepare_data(df, selected_tasks, selected_model_types): """ Filter and prepare data based on selections Args: df (pandas.DataFrame): The original DataFrame selected_tasks (list): List of selected task names selected_model_types (list): List of selected model types Returns: pandas.DataFrame: Filtered and prepared DataFrame """ # Filter the dataframe based on selected model types filtered_df = df[df['Model Type'].isin(selected_model_types)] # Calculate the average for selected tasks only selected_tasks_df = filtered_df[selected_tasks] filtered_df['Selected Overall'] = selected_tasks_df.mean(axis=1) # Sort by Selected Overall and add rank filtered_df = filtered_df.sort_values('Selected Overall', ascending=False) filtered_df.insert(0, 'Rank', range(1, len(filtered_df) + 1)) # Add a Model Name column that shows the index (actual model name) filtered_df['Model Name'] = filtered_df.index return filtered_df def format_display_dataframe(filtered_df, selected_tasks): """ Create and format the display DataFrame for the leaderboard table Args: filtered_df (pandas.DataFrame): The filtered DataFrame selected_tasks (list): List of selected task names Returns: tuple: (pandas.DataFrame, list) - The display DataFrame and the metric columns """ # Create a fixed display DataFrame with only the model info display_df = filtered_df[['Rank', 'Model Name', 'Model Type']].copy() # Format the rank column with medals medal_ranks = {1: "🥇 1", 2: "🥈 2", 3: "🥉 3"} display_df['Rank'] = display_df['Rank'].apply(lambda x: medal_ranks.get(x, str(x))) # Add metrics columns (Selected Overall and individual tasks) metric_columns = ['Selected Overall'] + selected_tasks for col in metric_columns: if col in filtered_df.columns: # Format numeric columns to 3 decimal places if filtered_df[col].dtype in ['float64', 'float32']: display_df[col] = filtered_df[col].apply(lambda x: f"{x:.3f}" if isinstance(x, (int, float)) else x) else: display_df[col] = filtered_df[col] # Rename "Selected Overall" to "Metric Average" in display_df if "Selected Overall" in display_df.columns: display_df = display_df.rename(columns={"Selected Overall": "Metric Average"}) # Also update the metric_columns list to reflect the rename metric_columns = ['Metric Average'] + selected_tasks return display_df, metric_columns