import os
import tempfile

import gradio as gr
import numpy as np
import pandas as pd


def handle_analysis(df_state, model_selection_group, analyze_results_button):
    with gr.Group(visible=False) as analysis_group:
        gr.Markdown("## Analysis")

        # Dropdown to select the accuracy measurement
        accuracy_measurement_dropdown = gr.Dropdown(
            choices=['Accuracy', 'Pearson Correlation'],
            label='Select Evaluation Metric'
        )

        # Only a ground-truth selector is needed; the judge score columns
        # are detected from the DataFrame automatically.
        with gr.Row():
            ground_truth_dropdown = gr.Dropdown(
                choices=[],
                label='Select True Label Column'
            )

        # Two side-by-side boxes for the per-judge results
        with gr.Row():
            judge_a_result = gr.Textbox(
                label="Judge A Results",
                lines=10,
                interactive=False,
                visible=False
            )
            judge_b_result = gr.Textbox(
                label="Judge B Results",
                lines=10,
                interactive=False,
                visible=False
            )

        # JSON download target, placed below the result textboxes
        json_output = gr.File(label="Results .json", interactive=False, visible=False)

        # Action buttons, placed after the JSON output
        with gr.Row():
            back_to_results_button = gr.Button("← Back to Results")
            calculate_button = gr.Button("Calculate")
            download_button = gr.Button("Download Results as JSON")

    # Show the analysis view and populate the ground-truth dropdown.
    # Gradio passes the current value of df_state (a DataFrame or None).
    def show_analysis_group(df):
        if df is not None:
            columns = df.columns.tolist()
        else:
            columns = []
        return (
            gr.update(visible=True),      # analysis_group
            gr.update(visible=False),     # model_selection_group
            gr.update(choices=columns),   # ground_truth_dropdown
        )

    analyze_results_button.click(
        fn=show_analysis_group,
        inputs=[df_state],
        outputs=[
            analysis_group,
            model_selection_group,
            ground_truth_dropdown
        ]
    )

    # Return to the model-selection view
    def back_to_results():
        return (
            gr.update(visible=False),  # Hide analysis_group
            gr.update(visible=True),   # Show model_selection_group
        )

    back_to_results_button.click(
        fn=back_to_results,
        inputs=[],
        outputs=[analysis_group, model_selection_group]
    )

    def calculate_multiple_accuracies(measurement, ground_truth_col, df):
        if df is None:
            return (
                gr.update(value="No DataFrame available.", visible=True),
                gr.update(value="No DataFrame available.", visible=True)
            )

        # The Selene score column has a fixed name; the second judge is
        # whichever other 'score_*' column is present.
        col2_name = "score_selene"
        score_columns = [
            col for col in df.columns
            if col.startswith('score_') and col != 'score_selene'
        ]
        col3_name = score_columns[0] if score_columns else None
        if col3_name is None:
            msg = "No second 'score_*' column found in the DataFrame."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        # Check that the selected ground-truth column and both score columns exist
        missing_columns = [
            col for col in [ground_truth_col, col2_name, col3_name]
            if col not in df.columns
        ]
        if missing_columns:
            msg = f"Selected columns not found in DataFrame: {', '.join(missing_columns)}."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        # Compare ground_truth_col with score_selene
        result1 = calculate_accuracy(
            measurement, ground_truth_col, col2_name, df,
            compare_to_ground_truth=True
        )
        text_a = f"Comparison: '{ground_truth_col}' vs. 'Selene'\n{result1}"

        # Compare ground_truth_col with the other model's score
        result2 = calculate_accuracy(
            measurement, ground_truth_col, col3_name, df,
            compare_to_ground_truth=True
        )
        # Derive a display name for the second judge from its column name
        model_name = col3_name.replace('score_', '').replace('_', ' ').title()
        text_b = f"Comparison: '{ground_truth_col}' vs. '{model_name}'\n{result2}"

        return (
            gr.update(value=text_a, visible=True),
            gr.update(value=text_b, visible=True)
        )

    # The calculate button needs only the metric, the ground-truth column
    # and the DataFrame held in df_state.
    calculate_button.click(
        fn=calculate_multiple_accuracies,
        inputs=[
            accuracy_measurement_dropdown,
            ground_truth_dropdown,
            df_state
        ],
        outputs=[judge_a_result, judge_b_result]
    )

    def create_json_download(df):
        if df is None:
            return gr.update(value=None, visible=True)

        # Serialise the results DataFrame and write it to a temp file for download
        json_str = df.to_json(orient='records', indent=2)
        temp_dir = tempfile.gettempdir()
        file_path = os.path.join(temp_dir, 'atla_custom_eval_results.json')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(json_str)
        return gr.update(value=file_path, visible=True)

    download_button.click(
        fn=create_json_download,
        inputs=[df_state],
        outputs=[json_output]
    )


# Helper functions
def calculate_accuracy(measurement, col1, col2, df, compare_to_ground_truth=False):
    if df is None:
        return "No DataFrame available."
    if col1 not in df.columns or col2 not in df.columns:
        return "Selected columns not found in DataFrame."

    results_df = pd.DataFrame()
    if compare_to_ground_truth:
        results_df['ground_truth'] = df[col1]
        results_df['predicted'] = df[col2]
    else:
        results_df['extracted_winner'] = df[col1]
        results_df['truth_result'] = df[col2]

    if measurement == 'Accuracy':
        result = process_pairwise_accuracy(results_df, compare_to_ground_truth)
        output_text = (
            f"Overall Accuracy: {result['overall_accuracy']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    elif measurement == 'Pearson Correlation':
        result = process_single_rating_pearson_correlation(results_df, compare_to_ground_truth)
        output_text = (
            f"Pearson Correlation: {result['overall_pearson_correlation']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    else:
        output_text = "Unknown measurement selected."
    return output_text


def process_pairwise_accuracy(results_df: pd.DataFrame, compare_to_ground_truth=False) -> dict:
    # Compute the 'results' column based on whether we compare to ground truth
    if compare_to_ground_truth:
        # Coerce both columns to floats so numeric labels compare correctly
        results_df['ground_truth'] = results_df['ground_truth'].apply(convert_to_float_or_nan)
        results_df['predicted'] = results_df['predicted'].apply(convert_to_float_or_nan)
        results_df['results'] = results_df['ground_truth'] == results_df['predicted']
        num_extracted_nan = int(results_df['predicted'].isna().sum())
    else:
        results_df['results'] = results_df['extracted_winner'] == results_df['truth_result']
        num_extracted_nan = int(results_df['extracted_winner'].isna().sum())

    overall_accuracy = results_df['results'].mean()

    return {
        "overall_accuracy": overall_accuracy,
        "num_extracted_nan": num_extracted_nan,
    }


def process_single_rating_pearson_correlation(
    results_df: pd.DataFrame, compare_to_ground_truth=False
) -> dict:
    if compare_to_ground_truth:
        pred_col = 'predicted'
        truth_col = 'ground_truth'
    else:
        pred_col = 'extracted_winner'
        truth_col = 'truth_result'

    results_df[pred_col] = results_df[pred_col].apply(convert_to_float_or_nan)
    results_df[truth_col] = results_df[truth_col].apply(convert_to_float_or_nan)

    # Correlate only the rows where both sides parsed as numbers
    numerical_results = results_df.dropna(subset=[pred_col, truth_col])
    if len(numerical_results) == 0:
        pearson_corr = np.nan
    else:
        pearson_corr = numerical_results[pred_col].corr(numerical_results[truth_col])

    num_extracted_nan = int(results_df[pred_col].isna().sum())

    return {
        "overall_pearson_correlation": pearson_corr if not pd.isna(pearson_corr) else 0.0,
        "num_extracted_nan": num_extracted_nan,
    }


def convert_to_float_or_nan(extracted_input):
    # Treat missing values and unparseable inputs as NaN
    if extracted_input is None or pd.isna(extracted_input):
        return np.nan
    try:
        return float(extracted_input)
    except (ValueError, TypeError):
        return np.nan
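

# ---------------------------------------------------------------------------
# Minimal usage sketch. This only illustrates how handle_analysis() expects to
# be wired into a larger Blocks app: the sample DataFrame, the 'score_gpt4o'
# column name, and the surrounding component names are hypothetical and not
# part of the original application.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_df = pd.DataFrame({
        "ground_truth": [1, 0, 1, 1],
        "score_selene": [1, 0, 0, 1],
        "score_gpt4o": [1, 1, 1, 1],
    })

    with gr.Blocks() as demo:
        # State holding the evaluation results DataFrame
        df_state = gr.State(value=sample_df)

        # Stand-in for the results / model-selection view that the
        # analysis view toggles against
        with gr.Group() as model_selection_group:
            gr.Markdown("## Results")
            analyze_results_button = gr.Button("Analyze Results")

        handle_analysis(df_state, model_selection_group, analyze_results_button)

    demo.launch()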