Flow-Judge-v0.1 / app.py
bergr7f's picture
Add WIP application file and dependencies
31fda98
raw
history blame
7.14 kB
import gradio as gr
import pandas as pd
from typing import List, Dict
from flow_judge import Vllm, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem
# Initialize the vLLM-backed judge model once at import time so every
# evaluation reuses the same loaded weights.
try:
    model = Vllm(quantized=False)
except Exception as e:
    # Chain the original exception (`from e`) so the underlying vLLM
    # traceback is preserved instead of being flattened into a message.
    raise RuntimeError(f"Failed to initialize Vllm: {e}") from e
# Canned demo examples used to pre-fill the UI via the example buttons.
# Each entry mirrors the shape consumed by `populate_fields`:
#   task_inputs: list of {"name", "value"} dicts shown in the inputs table
#   task_output: single {"name", "value"} dict shown in the output table
#   rubric: ordered descriptions; the list index becomes the score
# NOTE: the long strings use implicit adjacent-literal concatenation rather
# than backslash continuations, which previously embedded the source
# indentation (runs of spaces) into the runtime strings.
EXAMPLES = [
    {
        "example_description": "Faithfulness of an answer",
        "emoji": "🏈",
        "task_inputs": [
            {"name": "Question", "value": "What is the capital of France?"},
            {"name": "Context", "value": "Paris is the capital of Spain."},
        ],
        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": (
            "Based on the provided context, does the response contain only "
            "information that is supported by or directly inferable from the "
            "context?"
        ),
        "rubric": [
            "The response contains statements or claims that cannot be directly "
            "found in or logically inferred from the provided context. There is "
            "hallucinated or fabricated information present in the response "
            "that does not have support in the given context.",
            "The response contains only statements and claims that are directly "
            "stated in or logically inferable from the provided context. There "
            "is no hallucinated or fabricated information present in the "
            "response that cannot be traced back to or deduced from the context.",
        ],
    }
]
def populate_fields(example_index: int):
    """Build the component values for the example at *example_index*.

    Returns a 4-tuple matching the click handler's outputs:
    (task_inputs rows, task_output rows, evaluation criteria text,
    rubric rows where the row index doubles as the score).
    """
    ex = EXAMPLES[example_index]
    input_rows = [[item["name"], item["value"]] for item in ex["task_inputs"]]
    output_rows = [[ex["task_output"]["name"], ex["task_output"]["value"]]]
    rubric_rows = [[str(score), text] for score, text in enumerate(ex["rubric"])]
    return input_rows, output_rows, ex["evaluation_criteria"], rubric_rows
def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
    """Run a FlowJudge evaluation over the values entered in the UI tables.

    Args:
        task_inputs: two-column ("Name", "Value") table of task inputs.
        task_output: two-column ("Name", "Value") table; only the first
            row's name is used as the metric's required output.
        evaluation_criteria: free-text criteria for the custom metric.
        rubric: two-column ("Score", "Description") table; scores must be
            parseable as integers.

    Returns:
        (feedback, score) extracted from the FlowJudge result.

    Raises:
        RuntimeError: if the underlying judge evaluation fails.
    """
    # Convert the dataframe rows into the input/output mapping FlowJudge expects.
    eval_input = EvalInput(
        inputs=[{row['Name']: row['Value']} for _, row in task_inputs.iterrows()],
        output={row['Name']: row['Value'] for _, row in task_output.iterrows()}
    )
    # Parse the rubric table into RubricItems (score column is text in the UI).
    rubric_items = [
        RubricItem(score=int(row['Score']), description=row['Description'])
        for _, row in rubric.iterrows()
    ]
    # Create the CustomMetric tied to the names currently present in the tables.
    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=rubric_items,
        required_inputs=[input_row['Name'] for _, input_row in task_inputs.iterrows()],
        required_output=task_output.iloc[0]['Name']
    )
    # Create a FlowJudge instance bound to the module-level model.
    judge = FlowJudge(model=model, metric=custom_metric)
    # Evaluate, chaining the original exception so its traceback survives.
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e
    # Extract feedback and score from the result
    feedback = result.feedback
    score = result.score
    return feedback, score
def reset_fields():
    """Return blank values for every component: both task tables, the
    criteria textbox, the rubric table, and the feedback/score outputs."""
    blank_row = ["", ""]
    return (
        [list(blank_row)],  # task_inputs
        [list(blank_row)],  # task_output
        "",                 # evaluation_criteria
        [list(blank_row)],  # rubric
        "",                 # feedback
        "",                 # score
    )
def reset_task():
    """Return blank rows for the task input and task output tables."""
    empty_table = [["", ""]]
    return empty_table, [["", ""]]
def reset_evaluation_criteria():
    """Return blank values for the criteria textbox and the rubric table."""
    return "", [["", ""]]
# Build the Gradio UI: example shortcuts on top, then the task tables and
# rubric side by side, then the evaluation outputs, then the reset controls.
with gr.Blocks() as demo:
    # One shortcut button per canned example; clicking pre-fills the form.
    with gr.Row():
        example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]
    with gr.Row(equal_height=False):
        # Left column: named task inputs and the model output to be judged.
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            task_inputs = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            add_input_btn = gr.Button("Add Input")
            gr.Markdown("**Output**")
            task_output = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            reset_task_btn = gr.Button("Clear Inputs and Output")
        # Right column: free-text criteria plus the score/description rubric.
        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
            rubric = gr.Dataframe(
                headers=["Score", "Description"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["10%", "90%"]
            )
            add_score_btn = gr.Button("Add Score")
            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")
    # Evaluation results area and the trigger button.
    with gr.Row():
        with gr.Column(scale=1, variant="compact"):
            gr.Markdown("**Evaluation**")
            feedback = gr.Textbox(label="Feedback")
            score = gr.Textbox(label="Score")
            evaluate_btn = gr.Button("Evaluate")
    with gr.Row():
        # Add the reset buttons
        reset_all_btn = gr.Button("Clear All")
    # Event handlers
    # Append a blank row to the inputs table by rebuilding the component
    # with the current values plus one empty ["", ""] row.
    add_input_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Name", "Value"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["30%", "70%"]),
        inputs=task_inputs,
        outputs=task_inputs
    )
    # Same pattern for the rubric table: append one blank score row.
    add_score_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Score", "Description"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["10%", "90%"]),
        inputs=rubric,
        outputs=rubric
    )
    # Wire each example button to populate_fields. gr.State(i) freezes the
    # loop index per button, avoiding Python's late-binding closure pitfall.
    for i, button in enumerate(example_buttons):
        button.click(
            populate_fields,
            inputs=[gr.State(i)],  # Pass the example index as a state
            outputs=[task_inputs, task_output, evaluation_criteria, rubric]
        )
    # Run the judge over the current form values and show feedback + score.
    evaluate_btn.click(
        evaluate,
        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
        outputs=[feedback, score]
    )
    # Reset handlers: clear the task tables, the criteria/rubric, or everything.
    reset_task_btn.click(
        reset_task,
        inputs=[],
        outputs=[task_inputs, task_output]
    )
    reset_criteria_btn.click(
        reset_evaluation_criteria,
        inputs=[],
        outputs=[evaluation_criteria, rubric]
    )
    reset_all_btn.click(
        reset_fields,
        inputs=[],
        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score]
    )
# Launch the Gradio server when run as a script; debug=True surfaces
# handler errors in the console instead of failing silently.
if __name__ == "__main__":
    demo.launch(debug=True)