# criteria_handler.py

import gradio as gr
from eval_criteria_library import EXAMPLE_METRICS

SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""

EVALUATION_TEMPLATE = '''You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric. Provide comprehensive feedback on the response quality, strictly adhering to the scoring rubric and without any general evaluation. Follow this with a score, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.

Here are the rules of the evaluation:
(1) You should prioritize evaluating whether the response satisfies the provided rubric; the basis of your score should be exactly the rubric. The response does not need to explicitly address the points raised in the rubric; rather, evaluate it against the criteria the rubric outlines.

Your reply should strictly adhere to JSON as follows: {% raw %}{"feedback": "<write feedback>", "result": <numerical score>}{% endraw %}. Ensure the output is valid JSON, without additional formatting or explanations.

Here is the data.

{% if model_context is defined and model_context %}Context:
```
{{ model_context }}
```

{% endif %}Instruction:
```
{{ model_input }}
```

Response:
```
{{ model_output }}
```

Score Rubrics:
{{ evaluation_criteria }}

{% if expected_model_output is defined and expected_model_output %}Reference answer:
{{ expected_model_output }}{% endif %}'''
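
# Illustrative helper (not wired into the UI below): renders the Jinja template
# above for a single dataset row. Assumes the jinja2 package is installed; the
# keyword arguments mirror the template variables.
def render_evaluation_prompt(model_input, model_output, evaluation_criteria,
                             model_context=None, expected_model_output=None):
    from jinja2 import Template
    variables = {
        'model_input': model_input,
        'model_output': model_output,
        'evaluation_criteria': evaluation_criteria,
    }
    # Pass the optional fields only when present, so the template's
    # "is defined" guards skip the corresponding sections.
    if model_context:
        variables['model_context'] = model_context
    if expected_model_output:
        variables['expected_model_output'] = expected_model_output
    return Template(EVALUATION_TEMPLATE).render(**variables)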

def select_evaluation_criteria(data_upload_group, df_state, prompt_state):
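    """Build the evaluation-criteria selection UI and wire up its events.

    Args:
        data_upload_group: The upstream data-upload gr.Group; hidden while
            this step is active and shown again by the back button.
        df_state: gr.State holding the uploaded DataFrame (or None).
        prompt_state: gr.State that receives the evaluation template and
            column mappings when "Select Evaluators" is clicked.
    """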
    with gr.Group(visible=True) as criteria_group:
        select_eval_criteria_button = gr.Button("Select Evaluation Criteria", visible=False)
        
        criteria_dropdown = gr.Dropdown(
            choices=list(EXAMPLE_METRICS.keys()),
            label="Choose Evaluation Criteria",
            value=list(EXAMPLE_METRICS.keys())[0],
            visible=False
        )

        with gr.Row(visible=False) as mapping_row:
            with gr.Column():
                # Left column - Evaluation Criteria Editor
                prompt_editor = gr.Textbox(
                    label="Evaluation Criteria",
                    lines=15,
                    visible=False,
                    placeholder="Enter the evaluation criteria/rubric here..."
                )
            with gr.Column():
                # Right column - Required and Optional Variable Mapping
                # Required mappings
                input_mapping = gr.Dropdown(
                    choices=[], 
                    label="Map 'model_input' to column (Required)",
                    interactive=True, 
                    visible=False
                )
                output_mapping = gr.Dropdown(
                    choices=[], 
                    label="Map 'model_output' to column (Required)",
                    interactive=True, 
                    visible=False
                )
                # Optional mappings
                context_mapping = gr.Dropdown(
                    choices=[], 
                    label="Map 'model_context' to column (Optional)",
                    interactive=True, 
                    visible=False
                )
                expected_output_mapping = gr.Dropdown(
                    choices=[], 
                    label="Map 'expected_model_output' to column (Optional)",
                    interactive=True, 
                    visible=False
                )
        # We'll place the "Back to Data" and "Select Evaluators" buttons in the same row:
        with gr.Row(visible=False) as nav_row:
            back_to_data_button = gr.Button("← Back to Data", visible=False)
            save_prompt_button = gr.Button("Select Evaluators", visible=False)

        # The event handlers below return dicts keyed by component; Gradio
        # applies each gr.update to the matching component in `outputs`.
        def update_column_choices(df):
            # `df` may be the gr.State component itself (when called with the
            # closure variable below) or the unwrapped DataFrame (when Gradio
            # passes the state's value through an event's `inputs`).
            if hasattr(df, 'value'):
                df = df.value
            columns = df.columns.tolist() if df is not None else []
            return {
                input_mapping: gr.update(choices=columns, visible=True),
                output_mapping: gr.update(choices=columns, visible=True),
                context_mapping: gr.update(choices=['None'] + columns, visible=True),
                expected_output_mapping: gr.update(choices=['None'] + columns, visible=True)
            }

        def update_prompt(selected_criteria, df):
            if selected_criteria in EXAMPLE_METRICS:
                evaluation_criteria = EXAMPLE_METRICS[selected_criteria]['prompt']
            else:
                evaluation_criteria = ""
            updates = {prompt_editor: gr.update(value=evaluation_criteria, visible=True)}
            updates.update(update_column_choices(df))
            return updates

        def show_criteria_selection():
            default_criterion = list(EXAMPLE_METRICS.keys())[0]
            evaluation_criteria = EXAMPLE_METRICS[default_criterion]['prompt']
            updates = {
                select_eval_criteria_button: gr.update(visible=False),
                criteria_dropdown: gr.update(visible=True),
                prompt_editor: gr.update(value=evaluation_criteria, visible=True),
                data_upload_group: gr.update(visible=False),
                mapping_row: gr.update(visible=True),
                # Show the nav row and buttons
                nav_row: gr.update(visible=True),
                back_to_data_button: gr.update(visible=True),
                save_prompt_button: gr.update(visible=True),
            }
            updates.update(update_column_choices(df_state))
            return updates

        def save_prompt(evaluation_criteria, input_col, output_col, context_col, expected_output_col):
            # Store the Jinja template together with the chosen column mappings.
            template = EVALUATION_TEMPLATE

            # Required mappings
            mapping_dict = {
                'model_input': input_col,
                'model_output': output_col,
                'evaluation_criteria': evaluation_criteria
            }

            # Optional mappings: the dropdowns start unselected (None) and also
            # offer an explicit 'None' choice, so guard against both.
            if context_col and context_col != 'None':
                mapping_dict['model_context'] = context_col
            if expected_output_col and expected_output_col != 'None':
                mapping_dict['expected_model_output'] = expected_output_col

            prompt_state.value = {
                'template': template,
                'mappings': mapping_dict
            }
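            # After saving, prompt_state.value holds, e.g. (hypothetical column
            # names from the uploaded data):
            #   {'template': EVALUATION_TEMPLATE,
            #    'mappings': {'model_input': 'question',
            #                 'model_output': 'answer',
            #                 'evaluation_criteria': '<rubric text>'}}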

        # Event handlers
        select_eval_criteria_button.click(
            fn=show_criteria_selection,
            inputs=[],
            outputs=[
                select_eval_criteria_button,
                criteria_dropdown,
                prompt_editor,
                data_upload_group,
                mapping_row,
                nav_row,
                back_to_data_button,
                save_prompt_button,
                input_mapping,
                output_mapping,
                context_mapping,
                expected_output_mapping
            ]
        )

        criteria_dropdown.change(
            fn=update_prompt,
            inputs=[criteria_dropdown, df_state],
            outputs=[prompt_editor, input_mapping, output_mapping, context_mapping, expected_output_mapping]
        )

        def make_select_button_visible(df_value):
            if df_value is not None:
                return gr.update(visible=True)
            else:
                return gr.update(visible=False)

        df_state.change(
            fn=make_select_button_visible,
            inputs=df_state,
            outputs=select_eval_criteria_button
        )

        save_prompt_button.click(
            fn=save_prompt,
            inputs=[
                prompt_editor, input_mapping, output_mapping,
                context_mapping, expected_output_mapping
            ],
            outputs=[]
        )

        # BACK BUTTON: Hide the criteria UI, show the data upload UI
        def back_to_data():
            return {
                # show data upload group again
                data_upload_group: gr.update(visible=True),
                # hide the criteria group
                criteria_dropdown: gr.update(visible=False),
                prompt_editor: gr.update(visible=False),
                mapping_row: gr.update(visible=False),
                nav_row: gr.update(visible=False),
                # make "Select Evaluation Criteria" button visible again
                select_eval_criteria_button: gr.update(visible=True),
            }

        back_to_data_button.click(
            fn=back_to_data,
            inputs=[],
            outputs=[
                data_upload_group,
                criteria_dropdown,
                prompt_editor,
                mapping_row,
                nav_row,
                select_eval_criteria_button
            ]
        )

    # Return the criteria group along with df_state, prompt_state, and the save button
    return criteria_group, df_state, prompt_state, save_prompt_button
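
# Minimal usage sketch (illustrative only; the hosting app is assumed, not
# shown in this file): wires the criteria step into a standalone gr.Blocks
# demo. The data-upload step is a stand-in; in the real app, df_state would
# be populated by a file-upload flow.
if __name__ == "__main__":
    import pandas as pd

    with gr.Blocks() as demo:
        df_state = gr.State(value=None)
        prompt_state = gr.State(value=None)

        with gr.Group() as data_upload_group:
            load_sample_button = gr.Button("Load sample data")

        def load_sample():
            # A tiny stand-in DataFrame so the column-mapping dropdowns
            # have columns to offer.
            return pd.DataFrame({
                'question': ['What is 2+2?'],
                'answer': ['4'],
            })

        load_sample_button.click(fn=load_sample, inputs=[], outputs=df_state)

        select_evaluation_criteria(data_upload_group, df_state, prompt_state)

    demo.launch()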