Synced repo using 'sync_with_huggingface' Github Action
- data_handler.py +22 -3
- model_handler.py +40 -45
data_handler.py CHANGED
@@ -34,7 +34,23 @@ def upload_test_data(df_state):
 def import_data(file):
     if file is not None:
         try:
-
+            loaded_json = json.load(open(file.name))
+
+            # Handle various common JSON structures
+            if isinstance(loaded_json, list):
+                # Top-level list
+                df = pd.json_normalize(loaded_json, sep=".")
+            elif isinstance(loaded_json, dict):
+                # Dictionary could contain a "data" key or not
+                if "data" in loaded_json and isinstance(loaded_json["data"], list):
+                    df = pd.json_normalize(loaded_json["data"], sep=".")
+                else:
+                    # Flatten the top-level dictionary
+                    df = pd.json_normalize(loaded_json, sep=".")
+            else:
+                raise ValueError("Unsupported JSON structure. Please provide a list or object.")
+
+            df_state.value = df
 
             return {
                 df_display: gr.update(value=df_state.value, visible=True),
@@ -42,10 +58,13 @@ def upload_test_data(df_state):
                 df_state: df_state,
                 error_display: gr.update(visible=False)  # Hide previous errors
             }
-        except json.JSONDecodeError
+        except json.JSONDecodeError:
             return {
                 df_display: gr.update(visible=False),
-                error_display: gr.update(
+                error_display: gr.update(
+                    value="**Error:** Invalid JSON file. Please upload a valid JSON file.",
+                    visible=True
+                ),
                 import_button: gr.update(visible=True),
                 df_state: None
             }
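
For context on the new import logic: pd.json_normalize flattens nested keys with the given separator, so all three accepted shapes end up as a flat DataFrame. A minimal standalone sketch of that behavior (the sample records are hypothetical, not from the repo):

    import pandas as pd

    records = [{"model_input": "What is 2+2?", "meta": {"source": "unit-test"}}]

    # Top-level list -> columns: model_input, meta.source
    print(pd.json_normalize(records, sep="."))

    # {"data": [...]} wrapper -> same result, via the "data" branch
    print(pd.json_normalize({"data": records}["data"], sep="."))

    # Flat top-level dict -> a single-row DataFrame
    print(pd.json_normalize({"model_input": "What is 2+2?"}, sep="."))

Note that the except clause still catches only json.JSONDecodeError, so the new ValueError for unsupported structures propagates to the caller, and json.load(open(file.name)) leaves closing the handle to the garbage collector; a with open(...) block would make that explicit.
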
model_handler.py CHANGED
@@ -30,11 +30,9 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             return {}
         return model_data
 
-
     model_data = load_model_data()
     model_choices = list(model_data.keys())
 
-    # Define dropdowns using model choices
     with gr.Row(visible=False) as evaluator_row:
         judge_a_dropdown = gr.Dropdown(
             choices=["Selene"], label="Judge A", value="Selene", interactive=False
@@ -43,26 +41,20 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             choices=model_choices, label="Judge B", value="Claude 3.5 Sonnet"
         )
 
-    # A Markdown for "Evaluation in progress..." and final heading
     loading_spinner = gr.Markdown("Evaluation in progress...", visible=False)
 
-    # NEW: define a Dataframe to show final evaluation results, like in data_handler
     evaluation_result_df = gr.Dataframe(
         visible=False,
         label="Evaluation Results",
         elem_classes=["truncate_cells"]
     )
 
-    # Define the three-button row AFTER the markdown,
-    # so it appears *below* the "Evaluation Complete" message.
     with gr.Row(visible=False) as evaluation_nav_row:
         back_to_criteria_button = gr.Button("← Back to Criteria", visible=False)
         run_evaluation_button = gr.Button("Run Evaluation", visible=False)
         analyze_results_button = gr.Button("Analyze Results", visible=False)
 
-    # Show evaluator selection UI
     def show_evaluator_selection(current_df):
-        # Hide Criteria UI and show Evaluator UI
         updates = {
             criteria_group: gr.update(visible=False),
             save_prompt_button: gr.update(visible=False),
@@ -70,7 +62,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             evaluation_nav_row: gr.update(visible=True),
             run_evaluation_button: gr.update(visible=True),
             back_to_criteria_button: gr.update(visible=True),
-            # By default, hide "Analyze Results" and the result dataframe
             analyze_results_button: gr.update(visible=False),
             evaluation_result_df: gr.update(visible=False),
         }
@@ -79,14 +70,12 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             and hasattr(current_df.value, "attrs")
             and current_df.value.attrs.get("eval_done")
         ):
-            # If a previous evaluation was completed, show the heading + dataframe
             updates[loading_spinner] = gr.update(value="### Evaluation Complete", visible=True)
             updates[evaluation_result_df] = gr.update(value=current_df.value, visible=True)
             updates[analyze_results_button] = gr.update(visible=True)
 
         return updates
 
-    # Note that we pass df_state to show_evaluator_selection
     save_prompt_button.click(
         fn=show_evaluator_selection,
         inputs=[df_state],
@@ -103,7 +92,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
         ],
     )
 
-    # Back to Criteria
     def back_to_criteria():
         return {
             save_prompt_button: gr.update(visible=True),
@@ -111,7 +99,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             evaluator_row: gr.update(visible=False),
             evaluation_nav_row: gr.update(visible=False),
             run_evaluation_button: gr.update(visible=False),
-            # Hide the "Evaluation Complete" markdown
             loading_spinner: gr.update(visible=False),
             analyze_results_button: gr.update(visible=False),
             evaluation_result_df: gr.update(visible=False),
@@ -134,37 +121,39 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
 
     # Run evaluation
     def run_evaluation(judge_a, judge_b):
-        #
-        yield {
-
-
+        # 1) Immediately hide old results and disable navigation while running
+        yield {
+            loading_spinner: gr.update(value="Evaluation in progress...", visible=True),
+            evaluation_result_df: gr.update(visible=False),
+            analyze_results_button: gr.update(visible=False),
+            run_evaluation_button: gr.update(interactive=False),
+            back_to_criteria_button: gr.update(interactive=False),
+        }
+
+        # Perform the actual evaluation
         template_str = prompt_state.value['template']
         mappings = prompt_state.value['mappings']
         evaluation_criteria = mappings.get('evaluation_criteria')
-
-        # Create Jinja template for Judge B only
+
         template = Template(template_str)
-
-        # Submit prompt to chosen models
+
        for index, row in df_state.value.iterrows():
-            # Create a context dictionary for this row
            context = {}
            model_context = None
            expected_output = None
-
+
            for key, column in mappings.items():
                if key == 'evaluation_criteria':
-                    continue
+                    continue
                elif column and column != 'None':
                    context[key] = str(row[column])
                    if column == 'model_context':
                        model_context = str(row[column])
                    elif column == 'expected_model_output':
                        expected_output = str(row[column])
-
-            #
+
+            # Render the template for Judge B
            current_prompt = template.render(**context)
-            # For Judge A (Atla Selene), call get_atla_response directly
            response_a = get_atla_response(
                "atla-selene",
                model_input=context.get('model_input'),
@@ -174,47 +163,53 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
                evaluation_criteria=evaluation_criteria
            )
            response_b = get_model_response(
-                judge_b,
-                model_data.get(judge_b),
+                judge_b,
+                model_data.get(judge_b),
                current_prompt
            )
-
-            # Parse
-            if isinstance(response_a, dict):
+
+            # Parse ATLA response
+            if isinstance(response_a, dict):
                score_a, critique_a = response_a['score'], response_a['critique']
-            else:
+            else:
                score_a, critique_a = "Error", response_a
-
+
            score_b, critique_b = parse_model_response(response_b)
-
+
            df_state.value.loc[index, 'score_a'] = score_a
            df_state.value.loc[index, 'critique_a'] = critique_a
            df_state.value.loc[index, 'score_b'] = score_b
            df_state.value.loc[index, 'critique_b'] = critique_b
-
+
            import time
-            time.sleep(2)
-
-        # Hide
+            time.sleep(2)  # simulating time-consuming operations
+
+        # 2) Hide spinner
        yield {loading_spinner: gr.update(visible=False)}
-
-        # Show
+
+        # 3) Show final results and re-enable buttons
        yield {
            loading_spinner: gr.update(value="### Evaluation Complete", visible=True),
            evaluation_result_df: gr.update(value=df_state.value, visible=True),
            analyze_results_button: gr.update(visible=True),
+            run_evaluation_button: gr.update(interactive=True),
+            back_to_criteria_button: gr.update(interactive=True),
        }
 
-        # Store the "already run evaluation" flag safely in .attrs
        if hasattr(df_state.value, "attrs"):
            df_state.value.attrs["eval_done"] = True
 
+    # Include back_to_criteria_button & run_evaluation_button in outputs so we can update them
    run_evaluation_button.click(
        fn=run_evaluation,
        inputs=[judge_a_dropdown, judge_b_dropdown],
-        outputs=[
+        outputs=[
+            loading_spinner,
+            evaluation_result_df,
+            analyze_results_button,
+            run_evaluation_button,
+            back_to_criteria_button,
+        ],
    )
 
-
-
    return model_selection_group, df_state, analyze_results_button
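
The core of this change is Gradio's generator-handler pattern: run_evaluation yields dictionaries keyed by component, and every component it touches must be listed in the click outputs, which is why back_to_criteria_button and run_evaluation_button were added there. A minimal self-contained sketch of the pattern (the demo components are hypothetical, not the repo's UI):

    import time

    import gradio as gr

    with gr.Blocks() as demo:
        status = gr.Markdown(visible=False)
        run_button = gr.Button("Run")

        def run():
            # First yield: show progress and lock the button while work happens
            yield {
                status: gr.update(value="Evaluation in progress...", visible=True),
                run_button: gr.update(interactive=False),
            }
            time.sleep(1)  # stand-in for the per-row evaluation loop
            # Final yield: report completion and unlock the button
            yield {
                status: gr.update(value="### Evaluation Complete", visible=True),
                run_button: gr.update(interactive=True),
            }

        # A component can only be updated from a yielded dict if it is listed here
        run_button.click(fn=run, inputs=[], outputs=[status, run_button])

    demo.launch()

Each yield pushes an intermediate UI state to the browser, which is what lets the spinner appear, the buttons lock, and the results table show up in sequence rather than all at once when the handler returns.
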