bergr7f committed on
Commit
06d4d61
·
1 Parent(s): 06826f1

upgrade UI and functionality

Browse files
Files changed (1) hide show
  1. app.py +181 -154
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import spaces
3
  import pandas as pd
4
- from typing import List, Dict
5
  from flow_judge import Hf, FlowJudge, EvalInput
6
  from flow_judge.metrics import CustomMetric, RubricItem
7
  from huggingface_hub import snapshot_download
@@ -17,205 +17,232 @@ def download_model():
17
  return True
18
  except Exception as e:
19
  raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}")
20
-
21
- EXAMPLES = [
22
- {
23
- "example_description": "Faithfulness of a answer",
24
- "emoji": "🏈",
25
- "task_inputs": [{"name": "Question", "value": "What is the capital of France?"}, {"name": "Context", "value": "Paris is the capital of Spain."}],
26
- "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
27
- "evaluation_criteria": "Based on the provided context, does the response contain only information that is supported by or \
28
- directly inferable from the context?",
29
- "rubric": ['The response contains statements or claims that cannot be directly found in or logically inferred \
30
- from the provided context. There is hallucinated or fabricated information present in the response \
31
- that does not have support in the given context.', 'The response contains only statements and claims that are directly stated in or logically \
32
- inferable from the provided context. There is no hallucinated or fabricated information present in \
33
- the response that cannot be traced back to or deduced from the context.']
34
- }
35
- ]
36
-
37
- def populate_fields(example_index: int):
38
- example = EXAMPLES[example_index]
39
- return (
40
- [[input["name"], input["value"]] for input in example["task_inputs"]],
41
- [[example["task_output"]["name"], example["task_output"]["value"]]],
42
- example["evaluation_criteria"],
43
- [[str(i), description] for i, description in enumerate(example["rubric"])]
44
- )
45
 
46
  @spaces.GPU
47
- def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
48
- # Load the model
 
 
 
 
 
 
 
49
  try:
50
  model = Hf(flash_attn=False)
51
  except Exception as e:
52
  raise RuntimeError(f"Failed to initialize Hf Model: {e}")
53
-
54
- # Convert inputs to the expected format
55
  eval_input = EvalInput(
56
- inputs=[{row['Name']: row['Value']} for _, row in task_inputs.iterrows()],
57
- output={row['Name']: row['Value'] for _, row in task_output.iterrows()}
58
  )
59
 
60
- # Parse the rubric into RubricItems
61
- rubric_items = [
62
- RubricItem(score=int(row['Score']), description=row['Description'])
63
- for _, row in rubric.iterrows()
 
 
64
  ]
65
 
66
- # Create the CustomMetric
67
  custom_metric = CustomMetric(
68
  name="custom-metric",
69
  criteria=evaluation_criteria,
70
- rubric=rubric_items,
71
- required_inputs=[input_row['Name'] for _, input_row in task_inputs.iterrows()],
72
- required_output=task_output.iloc[0]['Name']
73
  )
74
 
75
- # Create a FlowJudge instance
76
  judge = FlowJudge(model=model, metric=custom_metric)
77
 
78
- # Evaluate using FlowJudge
79
  try:
80
  result = judge.evaluate(eval_input)
81
  except Exception as e:
82
  raise RuntimeError(f"Failed to evaluate: {e}")
83
 
84
- # Extract feedback and score from the result
85
- feedback = result.feedback
86
- score = result.score
87
-
88
- return feedback, score
89
-
90
- def reset_fields():
91
- return (
92
- [["", ""]], # task_inputs
93
- [["", ""]], # task_output
94
- "", # evaluation_criteria
95
- [["", ""]], # rubric
96
- "", # feedback
97
- "" # score
98
- )
99
 
100
- def reset_task():
101
  return (
102
- [["", ""]], # task_inputs
103
- [["", ""]] # task_output
104
  )
105
 
106
- def reset_evaluation_criteria():
107
- return (
108
- "", # evaluation_criteria
109
- [["", ""]] # rubric
110
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  with gr.Blocks() as demo:
113
  model_downloaded = download_model()
114
- with gr.Row():
115
- example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]
116
 
117
  with gr.Row(equal_height=False):
118
  with gr.Column(scale=1):
119
  gr.Markdown("**Inputs**")
120
- task_inputs = gr.Dataframe(
121
- headers=["Name", "Value"],
122
- col_count=(2, "fixed"),
123
- datatype=["str", "str"],
124
- row_count=1,
125
- column_widths=["30%", "70%"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  )
127
- add_input_btn = gr.Button("Add Input")
128
-
129
  gr.Markdown("**Output**")
130
- task_output = gr.Dataframe(
131
- headers=["Name", "Value"],
132
- col_count=(2, "fixed"),
133
- datatype=["str", "str"],
134
- row_count=1,
135
- column_widths=["30%", "70%"]
136
- )
137
-
138
- reset_task_btn = gr.Button("Clear Inputs and Output")
139
-
140
  with gr.Column(scale=1):
141
  gr.Markdown("**Evaluation criteria and rubric**")
142
  evaluation_criteria = gr.Textbox(label="Evaluation criteria")
143
- rubric = gr.Dataframe(
144
- headers=["Score", "Description"],
145
- col_count=(2, "fixed"),
146
- datatype=["str", "str"],
147
- row_count=1,
148
- column_widths=["10%", "90%"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  )
150
- add_score_btn = gr.Button("Add Score")
151
- reset_criteria_btn = gr.Button("Clear Evaluation Criteria")
152
-
153
  with gr.Row():
154
- with gr.Column(scale=1, variant="compact"):
155
  gr.Markdown("**Evaluation**")
156
- feedback = gr.Textbox(label="Feedback")
157
- score = gr.Textbox(label="Score")
158
- evaluate_btn = gr.Button("Evaluate")
159
-
160
- with gr.Row():
161
- # Add the reset buttons
162
- reset_all_btn = gr.Button("Clear All")
163
-
164
-
165
- # Event handlers
166
- add_input_btn.click(
167
- lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
168
- headers=["Name", "Value"],
169
- col_count=(2, "fixed"),
170
- datatype=["str", "str"],
171
- row_count=1,
172
- column_widths=["30%", "70%"]),
173
- inputs=task_inputs,
174
- outputs=task_inputs
175
- )
176
-
177
- add_score_btn.click(
178
- lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
179
- headers=["Score", "Description"],
180
- col_count=(2, "fixed"),
181
- datatype=["str", "str"],
182
- row_count=1,
183
- column_widths=["10%", "90%"]),
184
- inputs=rubric,
185
- outputs=rubric
186
- )
187
-
188
- for i, button in enumerate(example_buttons):
189
- button.click(
190
- populate_fields,
191
- inputs=[gr.State(i)], # Pass the example index as a state
192
- outputs=[task_inputs, task_output, evaluation_criteria, rubric]
193
- )
194
-
195
  evaluate_btn.click(
196
  evaluate,
197
- inputs=[task_inputs, task_output, evaluation_criteria, rubric],
198
  outputs=[feedback, score]
199
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- reset_task_btn.click(
202
- reset_task,
203
- inputs=[],
204
- outputs=[task_inputs, task_output]
205
- )
206
-
207
- reset_criteria_btn.click(
208
- reset_evaluation_criteria,
209
- inputs=[],
210
- outputs=[evaluation_criteria, rubric]
211
- )
212
-
213
- reset_all_btn.click(
214
- reset_fields,
215
- inputs=[],
216
- outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score]
217
  )
218
 
219
- if __name__ == "__main__":
220
- demo.launch(debug=True)
221
-
 
1
  import gradio as gr
2
  import spaces
3
  import pandas as pd
4
+ from typing import List, Dict, Tuple
5
  from flow_judge import Hf, FlowJudge, EvalInput
6
  from flow_judge.metrics import CustomMetric, RubricItem
7
  from huggingface_hub import snapshot_download
 
17
  return True
18
  except Exception as e:
19
  raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}")
20
+
21
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  @spaces.GPU
24
def evaluate(
    inputs_task: List[Dict[str, str]],
    output_name: str,
    output_value: str,
    evaluation_criteria: str,
    rubric_items: List[Dict[str, str]]
) -> Tuple[str, int]:
    """Judge one task output against a user-defined criteria and rubric.

    Args:
        inputs_task: Task inputs, each shaped like
            ``{'name': 'Question', 'value': '...'}``.
        output_name: Name of the output being judged.
        output_value: Text of the output being judged.
        evaluation_criteria: Criteria handed to the custom metric.
        rubric_items: Rubric rows shaped like
            ``{'name': '<score>', 'value': '<description>'}``; ``name`` must
            be convertible with ``int()``.

    Returns:
        ``(result.feedback, result.score)`` as produced by
        ``FlowJudge.evaluate``.

    Raises:
        ValueError: If a rubric score cannot be converted to an integer.
        RuntimeError: If the model cannot be initialised or the evaluation
            call fails.
    """
    # Validate the rubric before loading the model, so a typo in a score fails
    # immediately instead of after the model has been initialised.
    try:
        scores = [int(item['name']) for item in rubric_items]
    except (TypeError, ValueError) as e:
        raise ValueError(f"Rubric scores must be whole numbers: {e}") from e

    try:
        model = Hf(flash_attn=False)
    except Exception as e:
        raise RuntimeError(f"Failed to initialize Hf Model: {e}") from e

    # Each task input becomes its own single-entry mapping of name -> value.
    eval_input = EvalInput(
        inputs=[{entry['name']: entry['value']} for entry in inputs_task],
        output={output_name: output_value}
    )

    score_rubric_items = [
        RubricItem(score=score, description=item['value'])
        for score, item in zip(scores, rubric_items)
    ]

    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=score_rubric_items,
        required_inputs=[entry['name'] for entry in inputs_task],
        required_output=output_name
    )

    judge = FlowJudge(model=model, metric=custom_metric)

    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e

    return result.feedback, result.score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
def reset_all():
    """Return blank values for every component cleared by "Clear All".

    The tuple is positional and must stay in the same order as the
    ``outputs`` list passed to ``reset_all_btn.click``. Fresh list objects
    are created on every call.
    """
    return (
        [],  # inputs_task
        "",  # new_input_name
        "",  # new_input_value
        [],  # rubric_items
        "",  # new_rubric_name
        "",  # new_rubric_value
        "",  # evaluation_criteria
        "",  # output_name
        "",  # output_value
        "",  # feedback
        "",  # score
    )
73
 
74
+ # Define presets
75
# Presets offered as one-click examples. Each entry provides the task inputs,
# the output to judge, the evaluation criteria and the scoring rubric. Rubric
# "name" holds the score as a string and "value" holds its description.
EXAMPLES = [
    {
        "description": "Example 1: Basic Evaluation",
        "inputs_task": [
            {"name": "Question", "value": "What is the capital of France?"},
        ],
        "output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": "Ensure the answer is accurate and based on the input question.",
        "rubric": [
            {"name": "1", "value": "Incorrect answer."},
            {"name": "2", "value": "Partially correct answer."},
            {"name": "3", "value": "Completely correct answer."},
        ],
    },
    {
        "description": "Example 2: Contextual Understanding",
        "inputs_task": [
            {"name": "Statement", "value": "All swans are white."},
        ],
        "output": {"name": "Conclusion", "value": "There are no black swans."},
        "evaluation_criteria": "Verify the conclusion logically follows from the statement.",
        "rubric": [
            {"name": "1", "value": "Conclusion does not follow."},
            {"name": "2", "value": "Conclusion somewhat follows."},
            {"name": "3", "value": "Conclusion logically follows."},
        ],
    },
]
101
 
102
  with gr.Blocks() as demo:
103
  model_downloaded = download_model()
104
+ # with gr.Row():
105
+ # example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]
106
 
107
  with gr.Row(equal_height=False):
108
  with gr.Column(scale=1):
109
  gr.Markdown("**Inputs**")
110
+ inputs_task = gr.State([])
111
+ new_input_name = gr.Textbox(label="Name")
112
+ new_input_value = gr.Textbox(label="Value")
113
+
114
+ def add_input(inputs_task, new_input_name, new_input_value):
115
+ return inputs_task + [{"name": new_input_name, "value": new_input_value}], "", ""
116
+
117
+ @gr.render(inputs=inputs_task) # You have to pass the state here
118
+ def render_inputs(inputs_list): # Use different name than the state variable
119
+
120
+ for input in inputs_list:
121
+ with gr.Group():
122
+ with gr.Row(equal_height=True):
123
+ with gr.Column(min_width=60, scale=2):
124
+ gr.Textbox(input['name'], label="Name", show_label=True, interactive=False, autoscroll=False, max_lines=1)
125
+ with gr.Column(scale=8):
126
+ gr.Textbox(input['value'], label="Value", show_label=True, interactive=False, autoscroll=False, max_lines=3)
127
+ with gr.Column(min_width=15, scale=1):
128
+ delete_btn = gr.Button("X", size="lg", variant="secondary")
129
+ def delete(input=input):
130
+ inputs_list.remove(input)
131
+ return inputs_list
132
+ delete_btn.click(delete, None, [inputs_task]) # This is the state variable
133
+
134
+ gr.Button("Add Input").click(
135
+ add_input,
136
+ [inputs_task, new_input_name, new_input_value],
137
+ [inputs_task, new_input_name, new_input_value]
138
  )
139
+
 
140
  gr.Markdown("**Output**")
141
+ with gr.Group():
142
+ with gr.Row(equal_height=True):
143
+ with gr.Column(min_width=60, scale=1):
144
+ output_name = gr.Textbox(label="Name", show_label=True, interactive=True, autoscroll=False, max_lines=1)
145
+ with gr.Column(scale=6):
146
+ output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)
147
+
148
+
 
 
149
  with gr.Column(scale=1):
150
  gr.Markdown("**Evaluation criteria and rubric**")
151
  evaluation_criteria = gr.Textbox(label="Evaluation criteria")
152
+ gr.Markdown("**Score rubrics**")
153
+ rubric_items = gr.State([])
154
+ new_rubric_name = gr.Textbox(label="Score", show_label=True, interactive=True, autoscroll=False, max_lines=1)
155
+ new_rubric_value = gr.Textbox(label="Description", show_label=True, interactive=True, autoscroll=False, max_lines=3)
156
+
157
+ def add_rubric_item(rubric_items, new_rubric_name, new_rubric_value):
158
+ return rubric_items + [{"name": new_rubric_name, "value": new_rubric_value}], "", ""
159
+
160
+ @gr.render(inputs=rubric_items) # You have to pass the state here
161
+ def render_rubrics(rubric_items_list): # Use different name than the state variable
162
+
163
+ for rubric_item in rubric_items_list:
164
+ with gr.Group():
165
+ with gr.Row(equal_height=True):
166
+ with gr.Column(min_width=30, scale=1):
167
+ gr.Textbox(rubric_item['name'], label="Score", show_label=True, interactive=False)
168
+ with gr.Column(scale=8):
169
+ gr.Textbox(rubric_item['value'], label="Description", show_label=True, interactive=False)
170
+ with gr.Column(min_width=15, scale=1):
171
+ delete_btn = gr.Button("X", size="lg", variant="secondary")
172
+ def delete(rubric_item=rubric_item):
173
+ rubric_items_list.remove(rubric_item)
174
+ return rubric_items_list
175
+ delete_btn.click(delete, None, [rubric_items]) # This is the state variable
176
+
177
+ gr.Button("Add Rubric Item").click(
178
+ add_rubric_item,
179
+ [rubric_items, new_rubric_name, new_rubric_value],
180
+ [rubric_items, new_rubric_name, new_rubric_value]
181
  )
182
+
 
 
183
  with gr.Row():
184
+ with gr.Column(scale=1, variant="panel"):
185
  gr.Markdown("**Evaluation**")
186
+ with gr.Group():
187
+ with gr.Row(equal_height=True):
188
+ with gr.Column(min_width=15, scale=1):
189
+ score = gr.Textbox(label="Score", interactive=False, autoscroll=False, max_lines=1)
190
+ with gr.Column(scale=5):
191
+ feedback = gr.Textbox(label="Feedback", interactive=False, autoscroll=False, max_lines=6)
192
+ with gr.Column(min_width=15, scale=1):
193
+ evaluate_btn = gr.Button("Evaluate", variant="primary")
194
+
195
+ reset_all_btn = gr.Button("Clear All", variant="stop") # Add Reset All button
196
+ reset_all_btn.click(
197
+ reset_all,
198
+ inputs=[],
199
+ outputs=[
200
+ inputs_task,
201
+ new_input_name,
202
+ new_input_value,
203
+ rubric_items,
204
+ new_rubric_name,
205
+ new_rubric_value,
206
+ evaluation_criteria, # Reset evaluation criteria
207
+ output_name, # Reset output name
208
+ output_value, # Reset output value
209
+ feedback, # Reset feedback
210
+ score # Reset score
211
+ ]
212
+ )
213
+
 
 
 
 
 
 
 
 
 
 
 
214
  evaluate_btn.click(
215
  evaluate,
216
+ inputs=[inputs_task, output_name, output_value, evaluation_criteria, rubric_items],
217
  outputs=[feedback, score]
218
  )
219
+
220
+ preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES]
221
+
222
+ for i, button in enumerate(preset_buttons):
223
+ def populate_preset(ex_i=i):
224
+ return populate_fields(ex_i)
225
+
226
+ button.click(
227
+ populate_preset, # Use the closure to pass the current index
228
+ inputs=[], # No direct inputs needed
229
+ outputs=[
230
+ inputs_task,
231
+ output_name,
232
+ output_value,
233
+ evaluation_criteria,
234
+ rubric_items
235
+ ]
236
+ )
237
 
238
def populate_fields(example_index: int):
    """Return the field values for the preset at ``example_index``.

    The tuple order is: task inputs, output name, output value, evaluation
    criteria, rubric items.

    The input and rubric lists are returned as copies. The row delete handlers
    in this file call ``.remove()`` on the list they are given, so handing out
    the lists stored in ``EXAMPLES`` would let a delete in the UI alter the
    preset itself.
    NOTE(review): assumes the state component keeps the returned object
    without copying it - confirm against the Gradio version in use.

    Raises:
        IndexError: If ``example_index`` is outside ``EXAMPLES``.
    """
    example = EXAMPLES[example_index]
    return (
        [dict(entry) for entry in example["inputs_task"]],
        example["output"]["name"],
        example["output"]["value"],
        example["evaluation_criteria"],
        [dict(entry) for entry in example["rubric"]]
    )
247
 
248
+ demo.launch(debug=True)