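"""Gradio demo for evaluating LLM outputs with Flow Judge.

Users provide task inputs, an output, evaluation criteria, and a scoring
rubric; the judge model returns written feedback and a score.
"""
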
import gradio as gr
import pandas as pd
from flow_judge import Vllm, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem
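
# Initialize the judge model once at startup; quantized=False requests the
# full (non-quantized) weights, so a GPU with enough memory is assumed.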
try:
    model = Vllm(quantized=False)
except Exception as e:
    raise RuntimeError(f"Failed to initialize Vllm: {e}")
EXAMPLES = [
    {
        "example_description": "Faithfulness of an answer",
        "emoji": "🏈",
        "task_inputs": [
            {"name": "Question", "value": "What is the capital of France?"},
            {"name": "Context", "value": "Paris is the capital of Spain."},
        ],
        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": (
            "Based on the provided context, does the response contain only "
            "information that is supported by or directly inferable from the "
            "context?"
        ),
        "rubric": [
            (
                "The response contains statements or claims that cannot be "
                "directly found in or logically inferred from the provided "
                "context. There is hallucinated or fabricated information "
                "present in the response that does not have support in the "
                "given context."
            ),
            (
                "The response contains only statements and claims that are "
                "directly stated in or logically inferable from the provided "
                "context. There is no hallucinated or fabricated information "
                "present in the response that cannot be traced back to or "
                "deduced from the context."
            ),
        ],
    }
]
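

# Fill the UI fields from the selected example; the return order matches the
# outputs wired to each example button: task_inputs, task_output,
# evaluation_criteria, rubric.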
def populate_fields(example_index: int):
    example = EXAMPLES[example_index]
    return (
        [[inp["name"], inp["value"]] for inp in example["task_inputs"]],
        [[example["task_output"]["name"], example["task_output"]["value"]]],
        example["evaluation_criteria"],
        [[str(i), description] for i, description in enumerate(example["rubric"])],
    )
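

# Build an EvalInput and a CustomMetric from the dataframe contents, then run
# the judge model to get feedback and a score.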
def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
    # Convert inputs to the expected format
    eval_input = EvalInput(
        inputs=[{row["Name"]: row["Value"]} for _, row in task_inputs.iterrows()],
        output={row["Name"]: row["Value"] for _, row in task_output.iterrows()},
    )
    # Parse the rubric into RubricItems
    rubric_items = [
        RubricItem(score=int(row["Score"]), description=row["Description"])
        for _, row in rubric.iterrows()
    ]
    # Create the CustomMetric
    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=rubric_items,
        required_inputs=[input_row["Name"] for _, input_row in task_inputs.iterrows()],
        required_output=task_output.iloc[0]["Name"],
    )
    # Create a FlowJudge instance
    judge = FlowJudge(model=model, metric=custom_metric)
    # Evaluate using FlowJudge
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}")
    # Extract feedback and score from the result
    feedback = result.feedback
    score = result.score
    return feedback, score
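

# Reset helpers: each returns empty values for the components its button
# clears, in the same order as the outputs wired to that button.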
def reset_fields():
    return (
        [["", ""]],  # task_inputs
        [["", ""]],  # task_output
        "",          # evaluation_criteria
        [["", ""]],  # rubric
        "",          # feedback
        "",          # score
    )


def reset_task():
    return (
        [["", ""]],  # task_inputs
        [["", ""]],  # task_output
    )


def reset_evaluation_criteria():
    return (
        "",          # evaluation_criteria
        [["", ""]],  # rubric
    )
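

# Assemble the UI: example buttons on top, task inputs/output on the left,
# criteria and rubric on the right, evaluation results at the bottom.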
with gr.Blocks() as demo:
    with gr.Row():
        example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]
    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            task_inputs = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"],
            )
            add_input_btn = gr.Button("Add Input")
            gr.Markdown("**Output**")
            task_output = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"],
            )
            reset_task_btn = gr.Button("Clear Inputs and Output")
        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
            rubric = gr.Dataframe(
                headers=["Score", "Description"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["10%", "90%"],
            )
            add_score_btn = gr.Button("Add Score")
            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")
    with gr.Row():
        with gr.Column(scale=1, variant="compact"):
            gr.Markdown("**Evaluation**")
            feedback = gr.Textbox(label="Feedback")
            score = gr.Textbox(label="Score")
            evaluate_btn = gr.Button("Evaluate")
    with gr.Row():
        # Add the reset button
        reset_all_btn = gr.Button("Clear All")
    # Event handlers: the "Add" buttons append an empty row to their dataframe
    add_input_btn.click(
        lambda df: gr.Dataframe(
            value=df.values.tolist() + [["", ""]],
            headers=["Name", "Value"],
            col_count=(2, "fixed"),
            datatype=["str", "str"],
            row_count=1,
            column_widths=["30%", "70%"],
        ),
        inputs=task_inputs,
        outputs=task_inputs,
    )
    add_score_btn.click(
        lambda df: gr.Dataframe(
            value=df.values.tolist() + [["", ""]],
            headers=["Score", "Description"],
            col_count=(2, "fixed"),
            datatype=["str", "str"],
            row_count=1,
            column_widths=["10%", "90%"],
        ),
        inputs=rubric,
        outputs=rubric,
    )
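    # Clicking an example button repopulates all four editable fields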
    for i, button in enumerate(example_buttons):
        button.click(
            populate_fields,
            inputs=[gr.State(i)],  # Pass the example index as a state
            outputs=[task_inputs, task_output, evaluation_criteria, rubric],
        )
    evaluate_btn.click(
        evaluate,
        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
        outputs=[feedback, score],
    )
    reset_task_btn.click(
        reset_task,
        inputs=[],
        outputs=[task_inputs, task_output],
    )
    reset_criteria_btn.click(
        reset_evaluation_criteria,
        inputs=[],
        outputs=[evaluation_criteria, rubric],
    )
    reset_all_btn.click(
        reset_fields,
        inputs=[],
        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score],
    )
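

# debug=True blocks the launching process and prints errors to the console.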
if __name__ == "__main__":
    demo.launch(debug=True)