kaikaidai committed
Commit 25afc99 · verified · 1 Parent(s): 94cc8e3

Synced repo using 'sync_with_huggingface' Github Action

Files changed (2):
  1. data_handler.py +22 -3
  2. model_handler.py +40 -45
data_handler.py CHANGED
@@ -34,7 +34,23 @@ def upload_test_data(df_state):
     def import_data(file):
         if file is not None:
             try:
-                df_state.value = pd.json_normalize(json.load(open(file.name)))
+                loaded_json = json.load(open(file.name))
+
+                # Handle various common JSON structures
+                if isinstance(loaded_json, list):
+                    # Top-level list
+                    df = pd.json_normalize(loaded_json, sep=".")
+                elif isinstance(loaded_json, dict):
+                    # Dictionary could contain a "data" key or not
+                    if "data" in loaded_json and isinstance(loaded_json["data"], list):
+                        df = pd.json_normalize(loaded_json["data"], sep=".")
+                    else:
+                        # Flatten the top-level dictionary
+                        df = pd.json_normalize(loaded_json, sep=".")
+                else:
+                    raise ValueError("Unsupported JSON structure. Please provide a list or object.")
+
+                df_state.value = df
 
                 return {
                     df_display: gr.update(value=df_state.value, visible=True),
@@ -42,10 +58,13 @@ def upload_test_data(df_state):
                     df_state: df_state,
                     error_display: gr.update(visible=False)  # Hide previous errors
                 }
-            except json.JSONDecodeError as e:
+            except json.JSONDecodeError:
                 return {
                     df_display: gr.update(visible=False),
-                    error_display: gr.update(value="**Error:** Invalid JSON file. Please upload a valid JSON file.", visible=True),
+                    error_display: gr.update(
+                        value="**Error:** Invalid JSON file. Please upload a valid JSON file.",
+                        visible=True
+                    ),
                     import_button: gr.update(visible=True),
                     df_state: None
                 }
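
The import_data change makes the loader tolerant of the three common shapes of uploaded JSON: a top-level list of records, a {"data": [...]} wrapper, and a single object. A minimal standalone sketch of the same branching, outside Gradio (the helper name load_records is illustrative, not part of the repo):

import json
import pandas as pd

def load_records(path: str) -> pd.DataFrame:
    # Illustrative stand-in for the new import_data branching.
    with open(path) as f:  # context manager also closes the file handle
        loaded_json = json.load(f)

    if isinstance(loaded_json, list):
        # Top-level list of records: [{"q": ...}, {"q": ...}]
        return pd.json_normalize(loaded_json, sep=".")
    if isinstance(loaded_json, dict):
        if isinstance(loaded_json.get("data"), list):
            # Records wrapped in a "data" key: {"data": [...]}
            return pd.json_normalize(loaded_json["data"], sep=".")
        # Single object: flatten it into a one-row frame
        return pd.json_normalize(loaded_json, sep=".")
    raise ValueError("Unsupported JSON structure. Please provide a list or object.")

With sep=".", nested objects flatten into dotted column names, e.g. {"meta": {"id": 1}} produces a meta.id column.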
model_handler.py CHANGED
@@ -30,11 +30,9 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             return {}
         return model_data
 
-
     model_data = load_model_data()
     model_choices = list(model_data.keys())
 
-    # Define dropdowns using model choices
     with gr.Row(visible=False) as evaluator_row:
         judge_a_dropdown = gr.Dropdown(
             choices=["Selene"], label="Judge A", value="Selene", interactive=False
@@ -43,26 +41,20 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             choices=model_choices, label="Judge B", value="Claude 3.5 Sonnet"
         )
 
-    # A Markdown for "Evaluation in progress..." and final heading
     loading_spinner = gr.Markdown("Evaluation in progress...", visible=False)
 
-    # NEW: define a Dataframe to show final evaluation results, like in data_handler
     evaluation_result_df = gr.Dataframe(
         visible=False,
         label="Evaluation Results",
         elem_classes=["truncate_cells"]
     )
 
-    # Define the three-button row AFTER the markdown,
-    # so it appears *below* the "Evaluation Complete" message.
     with gr.Row(visible=False) as evaluation_nav_row:
         back_to_criteria_button = gr.Button("← Back to Criteria", visible=False)
         run_evaluation_button = gr.Button("Run Evaluation", visible=False)
         analyze_results_button = gr.Button("Analyze Results", visible=False)
 
-    # Show evaluator selection UI
     def show_evaluator_selection(current_df):
-        # Hide Criteria UI and show Evaluator UI
         updates = {
             criteria_group: gr.update(visible=False),
             save_prompt_button: gr.update(visible=False),
@@ -70,7 +62,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             evaluation_nav_row: gr.update(visible=True),
             run_evaluation_button: gr.update(visible=True),
             back_to_criteria_button: gr.update(visible=True),
-            # By default, hide "Analyze Results" and the result dataframe
             analyze_results_button: gr.update(visible=False),
             evaluation_result_df: gr.update(visible=False),
         }
@@ -79,14 +70,12 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             and hasattr(current_df.value, "attrs")
             and current_df.value.attrs.get("eval_done")
         ):
-            # If a previous evaluation was completed, show the heading + dataframe
             updates[loading_spinner] = gr.update(value="### Evaluation Complete", visible=True)
             updates[evaluation_result_df] = gr.update(value=current_df.value, visible=True)
             updates[analyze_results_button] = gr.update(visible=True)
 
         return updates
 
-    # Note that we pass df_state to show_evaluator_selection
     save_prompt_button.click(
         fn=show_evaluator_selection,
         inputs=[df_state],
@@ -103,7 +92,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
         ],
     )
 
-    # Back to Criteria
     def back_to_criteria():
         return {
             save_prompt_button: gr.update(visible=True),
@@ -111,7 +99,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             evaluator_row: gr.update(visible=False),
             evaluation_nav_row: gr.update(visible=False),
             run_evaluation_button: gr.update(visible=False),
-            # Hide the "Evaluation Complete" markdown
             loading_spinner: gr.update(visible=False),
             analyze_results_button: gr.update(visible=False),
             evaluation_result_df: gr.update(visible=False),
@@ -134,37 +121,39 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
 
     # Run evaluation
     def run_evaluation(judge_a, judge_b):
-        # Show loading spinner
-        yield {loading_spinner: gr.update(visible=True)}
-
-        # Get template and mappings from prompt state
+        # 1) Immediately hide old results and disable navigation while running
+        yield {
+            loading_spinner: gr.update(value="Evaluation in progress...", visible=True),
+            evaluation_result_df: gr.update(visible=False),
+            analyze_results_button: gr.update(visible=False),
+            run_evaluation_button: gr.update(interactive=False),
+            back_to_criteria_button: gr.update(interactive=False),
+        }
+
+        # Perform the actual evaluation
         template_str = prompt_state.value['template']
         mappings = prompt_state.value['mappings']
        evaluation_criteria = mappings.get('evaluation_criteria')
-
-        # Create Jinja template for Judge B only
+
         template = Template(template_str)
-
-        # Submit prompt to chosen models
+
         for index, row in df_state.value.iterrows():
-            # Create a context dictionary for this row
             context = {}
             model_context = None
             expected_output = None
-
+
             for key, column in mappings.items():
                 if key == 'evaluation_criteria':
-                    continue  # Skip as we handle it separately
+                    continue
                 elif column and column != 'None':
                     context[key] = str(row[column])
                     if column == 'model_context':
                         model_context = str(row[column])
                     elif column == 'expected_model_output':
                         expected_output = str(row[column])
-
-            # For Judge B, render the template using Jinja
+
+            # Render the template for Judge B
             current_prompt = template.render(**context)
-            # For Judge A (Atla Selene), call get_atla_response directly
             response_a = get_atla_response(
                 "atla-selene",
                 model_input=context.get('model_input'),
@@ -174,47 +163,53 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
                 evaluation_criteria=evaluation_criteria
             )
             response_b = get_model_response(
-                judge_b,
-                model_data.get(judge_b),
+                judge_b,
+                model_data.get(judge_b),
                 current_prompt
             )
-
-            # Parse the responses - handle Atla response differently
-            if isinstance(response_a, dict):  # Atla response
+
+            # Parse ATLA response
+            if isinstance(response_a, dict):
                 score_a, critique_a = response_a['score'], response_a['critique']
-            else:  # Error case
+            else:
                 score_a, critique_a = "Error", response_a
-
+
             score_b, critique_b = parse_model_response(response_b)
-
+
             df_state.value.loc[index, 'score_a'] = score_a
             df_state.value.loc[index, 'critique_a'] = critique_a
             df_state.value.loc[index, 'score_b'] = score_b
             df_state.value.loc[index, 'critique_b'] = critique_b
-
+
             import time
-            time.sleep(2)
-
-        # Hide loading spinner
+            time.sleep(2)  # simulating time-consuming operations
+
+        # 2) Hide spinner
         yield {loading_spinner: gr.update(visible=False)}
-
-        # Show "Evaluation Complete" heading and the final DataFrame
+
+        # 3) Show final results and re-enable buttons
         yield {
             loading_spinner: gr.update(value="### Evaluation Complete", visible=True),
             evaluation_result_df: gr.update(value=df_state.value, visible=True),
             analyze_results_button: gr.update(visible=True),
+            run_evaluation_button: gr.update(interactive=True),
+            back_to_criteria_button: gr.update(interactive=True),
         }
 
-        # Store the "already run evaluation" flag safely in .attrs
         if hasattr(df_state.value, "attrs"):
             df_state.value.attrs["eval_done"] = True
 
+    # Include back_to_criteria_button & run_evaluation_button in outputs so we can update them
     run_evaluation_button.click(
         fn=run_evaluation,
        inputs=[judge_a_dropdown, judge_b_dropdown],
-        outputs=[loading_spinner, evaluation_result_df, analyze_results_button],
+        outputs=[
+            loading_spinner,
+            evaluation_result_df,
+            analyze_results_button,
+            run_evaluation_button,
+            back_to_criteria_button,
+        ],
     )
 
-
-
     return model_selection_group, df_state, analyze_results_button
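
Two Gradio behaviors drive the reworked run_evaluation: an event handler written as a generator can yield several update dicts in sequence, and a handler may only update components listed in its outputs, which is why run_evaluation_button and back_to_criteria_button were added to the outputs list. A minimal self-contained sketch of the pattern, with illustrative component names (status, results, run_button) rather than the repo's:

import time
import gradio as gr

with gr.Blocks() as demo:
    status = gr.Markdown(visible=False)
    results = gr.Dataframe(visible=False)
    run_button = gr.Button("Run")

    def run_job():
        # Stage 1: show progress and lock the button while the job runs.
        yield {
            status: gr.update(value="Working...", visible=True),
            results: gr.update(visible=False),
            run_button: gr.update(interactive=False),
        }
        time.sleep(2)  # stand-in for the per-row judge calls
        # Stage 2: reveal results and unlock the button.
        yield {
            status: gr.update(value="### Done", visible=True),
            results: gr.update(value=[["ok"]], visible=True),
            run_button: gr.update(interactive=True),
        }

    # Every component a yield updates must be declared in outputs.
    run_button.click(fn=run_job, inputs=[], outputs=[status, results, run_button])

if __name__ == "__main__":
    demo.launch()

The eval_done flag uses pandas' DataFrame.attrs, a plain metadata dict that travels with the frame, so show_evaluator_selection can detect a completed run and restore the results view when the user navigates back.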