import gradio as gr
import pandas as pd
import numpy as np
import tempfile
import os

def handle_analysis(df_state, model_selection_group, analyze_results_button):
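    """Build the Analysis view and wire up its event handlers.

    df_state is the gr.State holding the results DataFrame;
    model_selection_group is the prior view toggled during navigation.
    """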
    with gr.Group(visible=False) as analysis_group:
        gr.Markdown("## Analysis")

        # Dropdown to select the accuracy measurement
        accuracy_measurement_dropdown = gr.Dropdown(
            choices=['Accuracy', 'Pearson Correlation'],
            label='Select Evaluation Metric'
        )

        # Dropdown for selecting the ground-truth (true label) column
        with gr.Row():
            ground_truth_dropdown = gr.Dropdown(
                choices=[],
                label='Select True Label Column'
            )

        # Define two side-by-side boxes for results
        with gr.Row():
            judge_a_result = gr.Textbox(
                label="Judge A Results",
                lines=10,
                interactive=False,
                visible=False
            )
            judge_b_result = gr.Textbox(
                label="Judge B Results",
                lines=10,
                interactive=False,
                visible=False
            )

        # File component for downloading the results as JSON
        json_output = gr.File(label="Results .json", interactive=False, visible=False)

        # Navigation and action buttons
        with gr.Row():
            back_to_results_button = gr.Button("← Back to Results")
            calculate_button = gr.Button("Calculate")
            download_button = gr.Button("Download Results as JSON")

    # Reveal the analysis view and populate the ground-truth dropdown from
    # the current DataFrame's columns. The state is passed as an input so the
    # callback sees the session's DataFrame rather than only the initial value.
    def show_analysis_group(df):
        columns = df.columns.tolist() if df is not None else []
        return (
            gr.update(visible=True),         # analysis_group
            gr.update(visible=False),        # model_selection_group
            gr.update(choices=columns),      # ground_truth_dropdown
        )

    analyze_results_button.click(
        fn=show_analysis_group,
        inputs=[df_state],
        outputs=[
            analysis_group,
            model_selection_group,
            ground_truth_dropdown,
        ]
    )

    def back_to_results():
        return (
            gr.update(visible=False),  # Hide analysis_group
            gr.update(visible=True),   # Show model_selection_group
        )

    back_to_results_button.click(
        fn=back_to_results,
        inputs=[],
        outputs=[analysis_group, model_selection_group]
    )

    def calculate_multiple_accuracies(measurement, ground_truth_col, df):
        # Guard against a missing DataFrame before touching its columns.
        if df is None:
            return (
                gr.update(value="No DataFrame available.", visible=True),
                gr.update(value="No DataFrame available.", visible=True)
            )

        # Judge A is always the Selene score; Judge B is whichever other
        # score_* column is present in the DataFrame.
        col2_name = "score_selene"
        score_columns = [
            col for col in df.columns
            if col.startswith('score_') and col != col2_name
        ]
        col3_name = score_columns[0] if score_columns else None
        if col3_name is None:
            msg = "No second score_* column found in the DataFrame."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        # Check that the user-selected ground-truth column and the Selene
        # score column are both present.
        missing_columns = [
            col for col in (ground_truth_col, col2_name) if col not in df.columns
        ]
        if missing_columns:
            msg = f"Selected columns not found in DataFrame: {', '.join(str(c) for c in missing_columns)}."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        # Compare ground_truth_col with score_selene
        result1 = calculate_accuracy(
            measurement, ground_truth_col, col2_name,
            df, compare_to_ground_truth=True
        )
        text_a = f"Comparison: '{ground_truth_col}' vs. 'Selene'\n{result1}"

        # Compare ground_truth_col with the other model's score
        result2 = calculate_accuracy(
            measurement, ground_truth_col, col3_name,
            df, compare_to_ground_truth=True
        )
        # Derive a display name from the column, e.g. 'score_other' -> 'Other'
        model_name = col3_name.replace('score_', '').replace('_', ' ').title()
        text_b = f"Comparison: '{ground_truth_col}' vs. '{model_name}'\n{result2}"

        return (
            gr.update(value=text_a, visible=True),
            gr.update(value=text_b, visible=True)
        )

    # Wire the Calculate button: compute both comparisons in one callback.
    calculate_button.click(
        fn=calculate_multiple_accuracies,
        inputs=[
            accuracy_measurement_dropdown,
            ground_truth_dropdown,
            df_state
        ],
        outputs=[judge_a_result, judge_b_result]
    )

    def create_json_download(df):
        if df is None:
            return gr.update(value=None, visible=True)

        # Serialize the DataFrame and write it to a temp file Gradio can serve.
        json_str = df.to_json(orient='records', indent=2)
        temp_dir = tempfile.gettempdir()
        file_path = os.path.join(temp_dir, 'atla_custom_eval_results.json')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(json_str)
        return gr.update(value=file_path, visible=True)

    download_button.click(
        fn=create_json_download,
        inputs=[df_state],
        outputs=[json_output]
    )

# Helper functions

def calculate_accuracy(measurement, col1, col2, df, compare_to_ground_truth=False):
    if df is None:
        return "No DataFrame available."
    if col1 not in df.columns or col2 not in df.columns:
        return "Selected columns not found in DataFrame."

    results_df = pd.DataFrame()
    if compare_to_ground_truth:
        results_df['ground_truth'] = df[col1]
        results_df['predicted'] = df[col2]
    else:
        results_df['extracted_winner'] = df[col1]
        results_df['truth_result'] = df[col2]

    if measurement == 'Accuracy':
        result = process_pairwise_accuracy(results_df, compare_to_ground_truth)
        output_text = (
            f"Overall Accuracy: {result['overall_accuracy']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    elif measurement == 'Pearson Correlation':
        result = process_single_rating_pearson_correlation(results_df, compare_to_ground_truth)
        output_text = (
            f"Pearson Correlation: {result['overall_pearson_correlation']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    else:
        output_text = "Unknown measurement selected."

    return output_text

def process_pairwise_accuracy(results_df: pd.DataFrame, compare_to_ground_truth=False) -> dict:
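    """Return exact-match accuracy between the two columns, plus the
    number of predictions that could not be extracted (NaN)."""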
    # Compute 'results' column based on whether comparing to ground truth
    if compare_to_ground_truth:
        # Coerce both columns to float so equality is compared on numbers,
        # not mixed string/numeric types.
        results_df['ground_truth'] = results_df['ground_truth'].apply(convert_to_float_or_nan)
        results_df['predicted'] = results_df['predicted'].apply(convert_to_float_or_nan)

        results_df['results'] = results_df['ground_truth'] == results_df['predicted']
        num_extracted_nan = int(results_df['predicted'].isna().sum())
    else:
        results_df['results'] = results_df['extracted_winner'] == results_df['truth_result']
        num_extracted_nan = int(results_df['extracted_winner'].isna().sum())

    overall_accuracy = results_df['results'].mean()

    return {
        "overall_accuracy": overall_accuracy,
        "num_extracted_nan": num_extracted_nan,
    }

def process_single_rating_pearson_correlation(
    results_df: pd.DataFrame, compare_to_ground_truth=False
) -> dict:
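    """Return the Pearson correlation between predicted and true ratings,
    plus the number of predictions that failed float conversion."""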
    if compare_to_ground_truth:
        pred_col = 'predicted'
        truth_col = 'ground_truth'
    else:
        pred_col = 'extracted_winner'
        truth_col = 'truth_result'

    results_df[pred_col] = results_df[pred_col].apply(convert_to_float_or_nan)
    results_df[truth_col] = results_df[truth_col].apply(convert_to_float_or_nan)

    numerical_results = results_df.dropna(subset=[pred_col, truth_col])

    if len(numerical_results) == 0:
        pearson_corr = np.nan
    else:
        pearson_corr = numerical_results[pred_col].corr(numerical_results[truth_col])

    num_extracted_nan = int(results_df[pred_col].isna().sum())

    return {
        "overall_pearson_correlation": pearson_corr if not pd.isna(pearson_corr) else 0.0,
        "num_extracted_nan": num_extracted_nan,
    }

def convert_to_float_or_nan(extracted_input):
    """Coerce a raw score to float, mapping missing or unparseable values to NaN."""
    if extracted_input is None or pd.isna(extracted_input):
        return np.nan
    try:
        return float(extracted_input)
    except (ValueError, TypeError):
        return np.nan
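

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): one plausible way
# to wire handle_analysis into a gr.Blocks app. The sample column names and
# data below are hypothetical stand-ins for the real results DataFrame.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        # Hypothetical results: a ground-truth column plus two score_* columns.
        sample_df = pd.DataFrame({
            "label": [1, 0, 1, 1],
            "score_selene": [1, 0, 0, 1],
            "score_other_model": [1, 1, 1, 1],
        })
        df_state = gr.State(sample_df)

        # Stand-in for the upstream results view that handle_analysis toggles.
        with gr.Group() as model_selection_group:
            gr.Markdown("## Results")
            analyze_results_button = gr.Button("Analyze Results")

        handle_analysis(df_state, model_selection_group, analyze_results_button)

    demo.launch()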