import gradio as gr
import pandas as pd
from flow_judge import Vllm, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem

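# Initialize the judge model once at startup; fail fast if vLLM cannot be loaded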
try:
    model = Vllm(quantized=False)
except Exception as e:
    raise RuntimeError(f"Failed to initialize Vllm: {e}")

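# Pre-built examples that the example buttons load into the UI fields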
EXAMPLES = [
    {
        "example_description": "Faithfulness of an answer",
        "emoji": "🏈",
        "task_inputs": [
            {"name": "Question", "value": "What is the capital of France?"},
            {"name": "Context", "value": "Paris is the capital of Spain."},
        ],
        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": (
            "Based on the provided context, does the response contain only information that is "
            "supported by or directly inferable from the context?"
        ),
        "rubric": [
            "The response contains statements or claims that cannot be directly found in or "
            "logically inferred from the provided context. There is hallucinated or fabricated "
            "information present in the response that does not have support in the given context.",
            "The response contains only statements and claims that are directly stated in or "
            "logically inferable from the provided context. There is no hallucinated or fabricated "
            "information present in the response that cannot be traced back to or deduced from the context.",
        ],
    }
]

def populate_fields(example_index: int):
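    """Return the dataframe and textbox values that pre-fill the UI for the selected example."""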
    example = EXAMPLES[example_index]
    return (
        [[input["name"], input["value"]] for input in example["task_inputs"]],
        [[example["task_output"]["name"], example["task_output"]["value"]]],
        example["evaluation_criteria"],
        [[str(i), description] for i, description in enumerate(example["rubric"])]
    )

def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
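    """Evaluate the task output against the criteria and rubric, returning (feedback, score)."""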
    # Convert inputs to the expected format
    eval_input = EvalInput(
        inputs=[{row['Name']: row['Value']} for _, row in task_inputs.iterrows()],
        output={row['Name']: row['Value'] for _, row in task_output.iterrows()}
    )
    
    # Parse the rubric into RubricItems
    rubric_items = [
        RubricItem(score=int(row['Score']), description=row['Description'])
        for _, row in rubric.iterrows()
    ]
    
    # Create the CustomMetric
    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=rubric_items,
        required_inputs=[input_row['Name'] for _, input_row in task_inputs.iterrows()],
        required_output=task_output.iloc[0]['Name']
    )
    
    # Create a FlowJudge instance
    judge = FlowJudge(model=model, metric=custom_metric)
    
    # Evaluate using FlowJudge
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}")
    
    # Extract feedback and score from the result
    feedback = result.feedback
    score = result.score
    
    return feedback, score

def reset_fields():
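    """Clear every field in the UI, including the feedback and score."""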
    return (
        [["", ""]],  # task_inputs
        [["", ""]],  # task_output
        "",          # evaluation_criteria
        [["", ""]],  # rubric
        "",          # feedback
        ""           # score
    )

def reset_task():
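    """Clear the task inputs and output dataframes."""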
    return (
        [["", ""]],  # task_inputs
        [["", ""]]   # task_output
    )

def reset_evaluation_criteria():
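    """Clear the evaluation criteria and rubric."""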
    return (
        "",          # evaluation_criteria
        [["", ""]]   # rubric
    )

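# Gradio UI layout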
with gr.Blocks() as demo:
    with gr.Row():
        example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]

    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            task_inputs = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            add_input_btn = gr.Button("Add Input")

            gr.Markdown("**Output**")
            task_output = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            
            reset_task_btn = gr.Button("Clear Inputs and Output")

        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
            rubric = gr.Dataframe(
                headers=["Score", "Description"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["10%", "90%"]
            )
            add_score_btn = gr.Button("Add Score")
            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")

    with gr.Row():
        with gr.Column(scale=1, variant="compact"):
            gr.Markdown("**Evaluation**")
            feedback = gr.Textbox(label="Feedback")
            score = gr.Textbox(label="Score")
            evaluate_btn = gr.Button("Evaluate")
        
    with gr.Row():
        # Single button that clears every field at once
        reset_all_btn = gr.Button("Clear All")

    # Event handlers
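    # The "Add" buttons return an updated dataframe with an extra empty row appended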
    add_input_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Name", "Value"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["30%", "70%"]),
        inputs=task_inputs,
        outputs=task_inputs
    )

    add_score_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Score", "Description"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["10%", "90%"]),
        inputs=rubric,
        outputs=rubric
    )

    for i, button in enumerate(example_buttons):
        button.click(
            populate_fields,
            inputs=[gr.State(i)],  # Pass the example index as a state
            outputs=[task_inputs, task_output, evaluation_criteria, rubric]
        )

    evaluate_btn.click(
        evaluate,
        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
        outputs=[feedback, score]
    )

    reset_task_btn.click(
        reset_task,
        inputs=[],
        outputs=[task_inputs, task_output]
    )

    reset_criteria_btn.click(
        reset_evaluation_criteria,
        inputs=[],
        outputs=[evaluation_criteria, rubric]
    )

    reset_all_btn.click(
        reset_fields,
        inputs=[],
        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score]
    )

if __name__ == "__main__":
    demo.launch(debug=True)