bergr7f committed on
Commit
31fda98
·
1 Parent(s): e871e90

Add WIP application file and dependencies

Browse files
Files changed (2) hide show
  1. app.py +204 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from typing import List, Dict
4
+ from flow_judge import Vllm, FlowJudge, EvalInput
5
+ from flow_judge.metrics import CustomMetric, RubricItem
6
+
7
# Load the judge model once at import time; `evaluate` reuses this module-level instance.
try:
    model = Vllm(quantized=False)
except Exception as e:
    # Chain the original exception explicitly so the underlying cause stays attached.
    raise RuntimeError(f"Failed to initialize Vllm: {e}") from e
11
+
12
# Preset examples offered through the example buttons.
#
# Long texts are assembled from adjacent string literals instead of using a
# backslash-newline inside the quotes: a continuation inside a string literal
# keeps the following line's leading whitespace as part of the text, which
# would leak indentation into the criteria and rubric.
#
# NOTE(review): the "Context" value contradicts the answer; presumably this is
# deliberate, to demonstrate an answer the context does not support. Confirm.
EXAMPLES = [
    {
        "example_description": "Faithfulness of a answer",
        "emoji": "🏈",
        "task_inputs": [
            {"name": "Question", "value": "What is the capital of France?"},
            {"name": "Context", "value": "Paris is the capital of Spain."},
        ],
        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": (
            "Based on the provided context, does the response contain only information that is supported by or "
            "directly inferable from the context?"
        ),
        # Rubric entries are ordered by score: index 0 is score 0, index 1 is score 1.
        "rubric": [
            (
                "The response contains statements or claims that cannot be directly found in or logically inferred "
                "from the provided context. There is hallucinated or fabricated information present in the response "
                "that does not have support in the given context."
            ),
            (
                "The response contains only statements and claims that are directly stated in or logically "
                "inferable from the provided context. There is no hallucinated or fabricated information present in "
                "the response that cannot be traced back to or deduced from the context."
            ),
        ],
    }
]
27
+
28
def populate_fields(example_index: int):
    """Return the field values of the preset example at ``example_index``.

    The result is a 4-tuple: the input rows as ``[name, value]`` pairs, a
    single ``[name, value]`` output row, the evaluation criteria text, and the
    rubric rows as ``[score, description]`` pairs, where the score is the
    rubric entry's position rendered as a string.
    """
    chosen = EXAMPLES[example_index]

    input_rows = [[field["name"], field["value"]] for field in chosen["task_inputs"]]

    output_spec = chosen["task_output"]
    output_rows = [[output_spec["name"], output_spec["value"]]]

    criteria_text = chosen["evaluation_criteria"]

    rubric_rows = []
    for position, text in enumerate(chosen["rubric"]):
        rubric_rows.append([str(position), text])

    return input_rows, output_rows, criteria_text, rubric_rows
36
+
37
def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
    """Judge a task output against a user-defined metric with FlowJudge.

    Args:
        task_inputs: Table with "Name" and "Value" columns, one row per input.
        task_output: Table with "Name" and "Value" columns for the output.
        evaluation_criteria: Criteria text passed to the custom metric.
        rubric: Table with "Score" and "Description" columns.

    Returns:
        A ``(feedback, score)`` tuple taken from the judge's result.

    Raises:
        RuntimeError: If the judge fails while evaluating; the original
            exception is attached as the cause.
    """
    # Keep only rows the user filled in. The "Add Input" / "Add Score" buttons
    # append blank rows, which would otherwise be sent to the judge as an
    # empty-named input or make int("") fail for the rubric score.
    input_rows = [
        (row['Name'], row['Value'])
        for _, row in task_inputs.iterrows()
        if str(row['Name']).strip()
    ]
    rubric_rows = [
        (row['Score'], row['Description'])
        for _, row in rubric.iterrows()
        if str(row['Score']).strip()
    ]

    # Convert inputs to the expected format
    eval_input = EvalInput(
        inputs=[{name: value} for name, value in input_rows],
        output={row['Name']: row['Value'] for _, row in task_output.iterrows()}
    )

    # Parse the rubric into RubricItems
    rubric_items = [
        RubricItem(score=int(score), description=description)
        for score, description in rubric_rows
    ]

    # Create the CustomMetric
    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=rubric_items,
        required_inputs=[name for name, _ in input_rows],
        required_output=task_output.iloc[0]['Name']
    )

    # Create a FlowJudge instance around the shared module-level model
    judge = FlowJudge(model=model, metric=custom_metric)

    # Evaluate using FlowJudge
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        # Chain explicitly so the underlying failure stays attached as the cause.
        raise RuntimeError(f"Failed to evaluate: {e}") from e

    # Extract feedback and score from the result
    return result.feedback, result.score
73
+
74
def reset_fields():
    """Return blank values for every field in the UI.

    Tuple order: task inputs, task output, evaluation criteria, rubric,
    feedback, score. Each table value is a fresh single row of two empty cells.
    """
    blank_inputs = [["", ""]]
    blank_output = [["", ""]]
    blank_rubric = [["", ""]]
    blank_text = ""
    return blank_inputs, blank_output, blank_text, blank_rubric, blank_text, blank_text
83
+
84
def reset_task():
    """Return a blank single-row value for the task inputs and the task output tables."""
    cleared_inputs = [["", ""]]
    cleared_output = [["", ""]]
    return cleared_inputs, cleared_output
89
+
90
def reset_evaluation_criteria():
    """Return an empty criteria string and a blank single-row rubric table value."""
    cleared_criteria = ""
    cleared_rubric = [["", ""]]
    return cleared_criteria, cleared_rubric
95
+
96
# Build the demo UI: example buttons, task/criteria editors, evaluation output,
# and the event wiring between them.
# NOTE(review): container nesting below should be confirmed against the rendered layout.
with gr.Blocks() as demo:
    # One button per entry in EXAMPLES
    with gr.Row():
        example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]

    with gr.Row(equal_height=False):
        # Left column: task inputs and output tables
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            task_inputs = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            add_input_btn = gr.Button("Add Input")

            gr.Markdown("**Output**")
            task_output = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )

            reset_task_btn = gr.Button("Clear Inputs and Output")

        # Right column: evaluation criteria text and rubric table
        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
            rubric = gr.Dataframe(
                headers=["Score", "Description"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["10%", "90%"]
            )
            add_score_btn = gr.Button("Add Score")
            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")

    # Evaluation results and the button that triggers `evaluate`
    with gr.Row():
        with gr.Column(scale=1, variant="compact"):
            gr.Markdown("**Evaluation**")
            feedback = gr.Textbox(label="Feedback")
            score = gr.Textbox(label="Score")
            evaluate_btn = gr.Button("Evaluate")

    with gr.Row():
        # Add the reset buttons
        reset_all_btn = gr.Button("Clear All")


    # Event handlers
    # Append one blank row to the inputs table, re-declaring the table's configuration.
    add_input_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Name", "Value"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["30%", "70%"]),
        inputs=task_inputs,
        outputs=task_inputs
    )

    # Append one blank row to the rubric table, re-declaring the table's configuration.
    add_score_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Score", "Description"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["10%", "90%"]),
        inputs=rubric,
        outputs=rubric
    )

    # Each example button fills the four editable fields from its EXAMPLES entry.
    for i, button in enumerate(example_buttons):
        button.click(
            populate_fields,
            inputs=[gr.State(i)],  # Pass the example index as a state
            outputs=[task_inputs, task_output, evaluation_criteria, rubric]
        )

    # Run the judge and show its feedback and score.
    evaluate_btn.click(
        evaluate,
        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
        outputs=[feedback, score]
    )

    # The output lists below must match the order of the tuples the reset functions return.
    reset_task_btn.click(
        reset_task,
        inputs=[],
        outputs=[task_inputs, task_output]
    )

    reset_criteria_btn.click(
        reset_evaluation_criteria,
        inputs=[],
        outputs=[evaluation_criteria, rubric]
    )

    reset_all_btn.click(
        reset_fields,
        inputs=[],
        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score]
    )
201
+
202
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)
204
+
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ vllm-flash-attn==2.6.2
2
+ flow-judge[vllm]==0.1.0
3
+ flash_attn>=2.6.3