Flow-Judge-v0.1 / app.py
bergr7f's picture
Add WIP application file and dependencies
31fda98
raw
history blame
7.14 kB
import gradio as gr
import pandas as pd
from typing import List, Dict
from flow_judge import Vllm, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem
# Initialize the vLLM-backed judge model once at import time so every
# evaluation reuses the same loaded weights.
try:
    model = Vllm(quantized=False)
except Exception as e:
    # Chain the original exception (`from e`) so the underlying vLLM
    # traceback is preserved instead of being flattened into a message.
    raise RuntimeError(f"Failed to initialize Vllm: {e}") from e
# Canned demo examples used to pre-fill the UI via the example buttons.
# Each entry mirrors the shape consumed by `populate_fields`:
#   task_inputs: list of {"name", "value"} dicts shown in the inputs table
#   task_output: single {"name", "value"} dict shown in the output table
#   rubric: ordered descriptions; the list index becomes the score
# NOTE: the long strings use implicit adjacent-literal concatenation rather
# than backslash continuations, which previously embedded the source
# indentation (runs of spaces) into the runtime strings.
EXAMPLES = [
    {
        "example_description": "Faithfulness of an answer",
        "emoji": "🏈",
        "task_inputs": [
            {"name": "Question", "value": "What is the capital of France?"},
            {"name": "Context", "value": "Paris is the capital of Spain."},
        ],
        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": (
            "Based on the provided context, does the response contain only "
            "information that is supported by or directly inferable from the "
            "context?"
        ),
        "rubric": [
            "The response contains statements or claims that cannot be directly "
            "found in or logically inferred from the provided context. There is "
            "hallucinated or fabricated information present in the response "
            "that does not have support in the given context.",
            "The response contains only statements and claims that are directly "
            "stated in or logically inferable from the provided context. There "
            "is no hallucinated or fabricated information present in the "
            "response that cannot be traced back to or deduced from the context.",
        ],
    }
]
def populate_fields(example_index: int):
    """Build the component values for the example at *example_index*.

    Returns a 4-tuple matching the click handler's outputs:
    (task_inputs rows, task_output rows, evaluation criteria text,
    rubric rows where the row index doubles as the score).
    """
    ex = EXAMPLES[example_index]
    input_rows = [[item["name"], item["value"]] for item in ex["task_inputs"]]
    output_rows = [[ex["task_output"]["name"], ex["task_output"]["value"]]]
    rubric_rows = [[str(score), text] for score, text in enumerate(ex["rubric"])]
    return input_rows, output_rows, ex["evaluation_criteria"], rubric_rows
def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
    """Run a FlowJudge evaluation over the values entered in the UI tables.

    Args:
        task_inputs: two-column ("Name", "Value") table of task inputs.
        task_output: two-column ("Name", "Value") table; only the first
            row's name is used as the metric's required output.
        evaluation_criteria: free-text criteria for the custom metric.
        rubric: two-column ("Score", "Description") table; scores must be
            parseable as integers.

    Returns:
        (feedback, score) extracted from the FlowJudge result.

    Raises:
        RuntimeError: if the underlying judge evaluation fails.
    """
    # Convert the dataframe rows into the input/output mapping FlowJudge expects.
    eval_input = EvalInput(
        inputs=[{row['Name']: row['Value']} for _, row in task_inputs.iterrows()],
        output={row['Name']: row['Value'] for _, row in task_output.iterrows()}
    )
    # Parse the rubric table into RubricItems (score column is text in the UI).
    rubric_items = [
        RubricItem(score=int(row['Score']), description=row['Description'])
        for _, row in rubric.iterrows()
    ]
    # Create the CustomMetric tied to the names currently present in the tables.
    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=rubric_items,
        required_inputs=[input_row['Name'] for _, input_row in task_inputs.iterrows()],
        required_output=task_output.iloc[0]['Name']
    )
    # Create a FlowJudge instance bound to the module-level model.
    judge = FlowJudge(model=model, metric=custom_metric)
    # Evaluate, chaining the original exception so its traceback survives.
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e
    # Extract feedback and score from the result
    feedback = result.feedback
    score = result.score
    return feedback, score
def reset_fields():
    """Return blank values for every component: both task tables, the
    criteria textbox, the rubric table, and the feedback/score outputs."""
    blank_row = ["", ""]
    return (
        [list(blank_row)],  # task_inputs
        [list(blank_row)],  # task_output
        "",                 # evaluation_criteria
        [list(blank_row)],  # rubric
        "",                 # feedback
        "",                 # score
    )
def reset_task():
    """Return blank rows for the task input and task output tables."""
    empty_table = [["", ""]]
    return empty_table, [["", ""]]
def reset_evaluation_criteria():
    """Return blank values for the criteria textbox and the rubric table."""
    return "", [["", ""]]
# Build the Gradio UI: example shortcuts on top, then the task tables and
# rubric side by side, then the evaluation outputs, then the reset controls.
with gr.Blocks() as demo:
    # One shortcut button per canned example; clicking pre-fills the form.
    with gr.Row():
        example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]
    with gr.Row(equal_height=False):
        # Left column: named task inputs and the model output to be judged.
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            task_inputs = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            add_input_btn = gr.Button("Add Input")
            gr.Markdown("**Output**")
            task_output = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            reset_task_btn = gr.Button("Clear Inputs and Output")
        # Right column: free-text criteria plus the score/description rubric.
        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
            rubric = gr.Dataframe(
                headers=["Score", "Description"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["10%", "90%"]
            )
            add_score_btn = gr.Button("Add Score")
            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")
    # Evaluation results area and the trigger button.
    with gr.Row():
        with gr.Column(scale=1, variant="compact"):
            gr.Markdown("**Evaluation**")
            feedback = gr.Textbox(label="Feedback")
            score = gr.Textbox(label="Score")
            evaluate_btn = gr.Button("Evaluate")
    with gr.Row():
        # Add the reset buttons
        reset_all_btn = gr.Button("Clear All")
    # Event handlers
    # Append a blank row to the inputs table by rebuilding the component
    # with the current values plus one empty ["", ""] row.
    add_input_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Name", "Value"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["30%", "70%"]),
        inputs=task_inputs,
        outputs=task_inputs
    )
    # Same pattern for the rubric table: append one blank score row.
    add_score_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Score", "Description"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["10%", "90%"]),
        inputs=rubric,
        outputs=rubric
    )
    # Wire each example button to populate_fields. gr.State(i) freezes the
    # loop index per button, avoiding Python's late-binding closure pitfall.
    for i, button in enumerate(example_buttons):
        button.click(
            populate_fields,
            inputs=[gr.State(i)],  # Pass the example index as a state
            outputs=[task_inputs, task_output, evaluation_criteria, rubric]
        )
    # Run the judge over the current form values and show feedback + score.
    evaluate_btn.click(
        evaluate,
        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
        outputs=[feedback, score]
    )
    # Reset handlers: clear the task tables, the criteria/rubric, or everything.
    reset_task_btn.click(
        reset_task,
        inputs=[],
        outputs=[task_inputs, task_output]
    )
    reset_criteria_btn.click(
        reset_evaluation_criteria,
        inputs=[],
        outputs=[evaluation_criteria, rubric]
    )
    reset_all_btn.click(
        reset_fields,
        inputs=[],
        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score]
    )
# Launch the Gradio server when run as a script; debug=True surfaces
# handler errors in the console instead of failing silently.
if __name__ == "__main__":
    demo.launch(debug=True)