import gradio as gr
import spaces
import pandas as pd
from typing import List, Dict, Tuple
from flow_judge import Hf, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem
from huggingface_hub import snapshot_download
from flow_judge.models.huggingface import Hf
from examples import get_examples
# Hugging Face Hub repository id of the judge model fetched by download_model().
MODEL_NAME = "flowaicom/Flow-Judge-v0.1"


def download_model() -> bool:
    """Download the model snapshot for ``MODEL_NAME`` via ``snapshot_download``.

    Returns:
        True once the download call has completed without raising.

    Raises:
        RuntimeError: If anything goes wrong during the download. The
            original exception is chained as the direct cause.
    """
    try:
        print(f"Downloading model {MODEL_NAME}...")
        snapshot_download(repo_id=MODEL_NAME)
        print(f"Model {MODEL_NAME} downloaded to default Hugging Face cache")
        return True
    except Exception as e:
        # Chain explicitly so the underlying error is reported as the cause
        # rather than as an unrelated exception raised during handling.
        raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}") from e
@spaces.GPU
def evaluate(
    inputs_task: List[Dict[str, str]],
    output_name: str,
    output_value: str,
    evaluation_criteria: str,
    rubric_items: List[Dict[str, str]]
) -> Tuple[str, int]:
    """Judge one task output against a user-defined criteria and rubric.

    Args:
        inputs_task: Task inputs, one dict per input with "name" and "value"
            keys, e.g. ``[{'name': 'a', 'value': 'a'}]``.
        output_name: Name of the output being judged.
        output_value: Content of the output being judged.
        evaluation_criteria: Criteria text handed to the custom metric.
        rubric_items: Rubric rows as dicts whose "name" holds the score
            (must parse as an integer) and whose "value" holds the
            description of that score.

    Returns:
        ``(feedback, score)`` read from the judge's evaluation result.

    Raises:
        ValueError: If a rubric score cannot be converted to an integer.
        RuntimeError: If the model cannot be initialized or the evaluation
            itself fails; the underlying exception is chained as the cause.
    """
    # Validate the rubric before anything else: it is cheap, and a bad score
    # should be reported clearly instead of surfacing after the model load.
    score_rubric_items = []
    for rubric_item in rubric_items:
        try:
            rubric_score = int(rubric_item['name'])
        except ValueError as e:
            raise ValueError(
                f"Rubric score must be an integer, got {rubric_item['name']!r}"
            ) from e
        score_rubric_items.append(
            RubricItem(score=rubric_score, description=rubric_item['value'])
        )

    try:
        model = Hf(flash_attn=False)
    except Exception as e:
        raise RuntimeError(f"Failed to initialize Hf Model: {e}") from e

    # Each task input becomes its own single-key {name: value} mapping.
    eval_input = EvalInput(
        inputs=[{task_input['name']: task_input['value']} for task_input in inputs_task],
        output={output_name: output_value}
    )

    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=score_rubric_items,
        required_inputs=[task_input['name'] for task_input in inputs_task],
        required_output=output_name
    )

    judge = FlowJudge(model=model, metric=custom_metric)

    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e

    return result.feedback, result.score
def reset_all():
    """Produce the values that return the whole form to its blank state.

    The result is a 17-element tuple: eleven cleared values (empty lists for
    the two list states, empty strings for every text field) followed by six
    updates that make the entry textboxes and the two "Add" buttons visible.
    The order matches the ``outputs`` list wired to the "Clear All" button.
    """
    # Inputs state + its two entry fields, rubric state + its two entry
    # fields, then criteria, output name, output value, feedback and score.
    cleared_values = (
        [], "", "",
        [], "", "",
        "", "", "", "", "",
    )
    # One visibility update each for: new_input_name, new_input_value,
    # new_rubric_name, new_rubric_value, Add Input, Add Rubric Item.
    make_visible = tuple(gr.update(visible=True) for _ in range(6))
    return cleared_values + make_visible
# Define presets
# Each preset is read elsewhere in this file through the keys "description",
# "inputs_task", "output" (with "name" and "value"), "evaluation_criteria" and "rubric".
EXAMPLES = get_examples()
# Path of the banner image displayed in the header row.
IMAGE_PATH = "./img/flow_judge_banner.png"
# Header text handed to gr.HTML in the header row.
# NOTE(review): the string contains no tags or links as written, although the wording
# ("Technical Report | Model Weights | ...") reads like a link bar - confirm the markup was not lost.
HEADER = """
Flow Judge Demo
Technical Report |
Model Weights |
Evaluation Code |
Tutorials
flow-judge
is a lightweight library for evaluating LLM applications with Flow-Judge-v0.1
.
"""
# Build the demo UI inside a single Blocks context bound to `demo`.
with gr.Blocks() as demo:
# Fetch the model weights up front; download_model() raises RuntimeError on failure.
# The returned flag is not read again in the code shown in this file.
model_downloaded = download_model()
# Header row: banner image in one column, HEADER text in the other.
with gr.Row(equal_height=False):
with gr.Column(scale=2):
gr.Image(IMAGE_PATH, show_label=False, interactive=False, show_share_button=False, show_fullscreen_button=False, show_download_button=False)
with gr.Column(scale=3):
gr.HTML(HEADER)
gr.Markdown("# ⚡ **Quickstart Examples**")
# One button per preset, split over three columns by thirds of EXAMPLES.
# The three slices are consecutive and appended in order, so preset_buttons[i]
# corresponds to EXAMPLES[i]; the preset click wiring relies on that index match.
with gr.Row():
with gr.Column(scale=1):
preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES[:len(EXAMPLES)//3]]
with gr.Column(scale=1):
preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[len(EXAMPLES)//3:2*len(EXAMPLES)//3]]
with gr.Column(scale=1):
preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[2*len(EXAMPLES)//3:]]
# Main form row; this first column collects the task inputs.
with gr.Row(equal_height=False):
with gr.Column(scale=1):
gr.Markdown("## **Evaluation task inputs**")
gr.Markdown("*Define the input names and values. Inputs are optional if evaluation depends on the output only.*")
with gr.Group():
# State holding the list of task inputs; add_input appends {"name": ..., "value": ...} dicts to it.
inputs_task = gr.State([])
# Entry fields for the next input to add (name on the left, value on the right).
with gr.Row(equal_height=True):
with gr.Column(min_width=60, scale=2):
new_input_name = gr.Textbox(
label="Name",
show_label=True,
autoscroll=False,
max_lines=1,
visible=True # Initially visible
)
with gr.Column(scale=9):
new_input_value = gr.Textbox(
label="Value",
show_label=True,
autoscroll=False,
max_lines=3,
visible=True # Initially visible
)
def add_input(inputs_task, new_input_name, new_input_value):
    """Append a named task input to the list of inputs.

    Args:
        inputs_task: Current list of ``{"name": ..., "value": ...}`` dicts.
        new_input_name: Text typed into the "Name" field.
        new_input_value: Text typed into the "Value" field.

    Returns:
        A 3-tuple ``(inputs, name_text, value_text)``. On success it holds a
        new list with the extra entry plus two empty strings that clear the
        entry fields. If the name is empty or whitespace-only, nothing is
        added and all three arguments are returned unchanged.
    """
    if not (new_input_name or "").strip():
        # A blank name would end up as an empty key in the evaluation input;
        # refuse it and leave the typed text in place so nothing is lost.
        return inputs_task, new_input_name, new_input_value
    # Concatenate instead of appending so the incoming list is not mutated.
    return inputs_task + [{"name": new_input_name, "value": new_input_value}], "", ""
@gr.render(inputs=inputs_task)  # You have to pass the state here
def render_inputs(inputs_list):  # Use different name than the state variable
    """Render one read-only row per stored task input, each with a delete button.

    Args:
        inputs_list: Current value of the ``inputs_task`` state, a list of
            dicts with "name" and "value" keys.
    """
    for task_input in inputs_list:
        with gr.Group():
            with gr.Row(equal_height=True):
                with gr.Column(min_width=60, scale=2):
                    gr.Textbox(task_input['name'], label="Name", show_label=True, interactive=False, autoscroll=False, max_lines=1)
                with gr.Column(scale=8):
                    gr.Textbox(task_input['value'], label="Value", show_label=True, interactive=False, autoscroll=False, max_lines=3)
                with gr.Column(min_width=15, scale=1):
                    delete_btn = gr.Button("X", size="lg", variant="secondary")

                    # The default argument binds this row's entry now; a plain
                    # closure would see only the last entry of the loop.
                    def delete(task_input=task_input):
                        # Return a fresh list instead of mutating the state value
                        # in place: the list object can be shared with other
                        # references, and a repeated click must not raise.
                        remaining = list(inputs_list)
                        if task_input in remaining:
                            remaining.remove(task_input)
                        return remaining

                    delete_btn.click(delete, None, [inputs_task])  # This is the state variable
# "Add Input" appends the typed name/value to the inputs state; the same two
# textboxes are also outputs so add_input can write back to them.
with gr.Group():
add_input_btn = gr.Button("Add Input") # Assign to variable
add_input_btn.click(
add_input,
[inputs_task, new_input_name, new_input_value],
[inputs_task, new_input_name, new_input_value]
)
# Column for the single output that gets evaluated.
with gr.Column(scale=1):
gr.Markdown("## **Evaluation task output**")
gr.Markdown("*Define the output name and value. Output is always required.*")
with gr.Group():
with gr.Row(equal_height=True):
with gr.Column(min_width=60, scale=2):
output_name = gr.Textbox(label="Name", show_label=True, interactive=True, autoscroll=False, max_lines=1)
with gr.Column(scale=9):
output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)
# Column for the evaluation criteria and the scoring rubric.
with gr.Column(scale=1):
gr.Markdown("## **Evaluation criteria and rubric**")
gr.Markdown("*Define the evaluation criteria and rubric for the evaluation task. Supported scoring scales: Binary (0 and 1), 3-Likert and 5-Likert.*\n\n*❗You can experiment with other scoring scales. However, performance may vary.*")
with gr.Row():
with gr.Column(scale=1):
with gr.Group():
# State holding the rubric rows; add_rubric_item appends {"name": score, "value": description} dicts.
rubric_items = gr.State([])
# Entry fields for the next rubric row (score on the left, description on the right).
with gr.Row(equal_height=True):
with gr.Column(min_width=60, scale=2):
new_rubric_name = gr.Textbox(
label="Score",
show_label=True,
interactive=True,
autoscroll=False,
max_lines=1,
visible=True # Initially visible
)
with gr.Column(scale=9):
new_rubric_value = gr.Textbox(
label="Description",
show_label=True,
interactive=True,
autoscroll=False,
max_lines=3,
visible=True # Initially visible
)
def add_rubric_item(rubric_items, new_rubric_name, new_rubric_value):
    """Append a score/description row to the rubric.

    Args:
        rubric_items: Current list of ``{"name": score, "value": description}``
            dicts.
        new_rubric_name: Text typed into the "Score" field.
        new_rubric_value: Text typed into the "Description" field.

    Returns:
        A 3-tuple ``(rubric, score_text, description_text)``. On success it
        holds a new list with the extra row plus two empty strings that clear
        the entry fields. If the score does not parse as an integer, nothing
        is added and all three arguments are returned unchanged.
    """
    try:
        # evaluate() converts every rubric score with int(); reject anything
        # that conversion would choke on while the user can still fix it.
        int(new_rubric_name)
    except (TypeError, ValueError):
        return rubric_items, new_rubric_name, new_rubric_value
    # Concatenate instead of appending so the incoming list is not mutated.
    return rubric_items + [{"name": new_rubric_name, "value": new_rubric_value}], "", ""
@gr.render(inputs=rubric_items)  # You have to pass the state here
def render_rubrics(rubric_items_list):  # Use different name than the state variable
    """Render one read-only row per rubric item, each with a delete button.

    Args:
        rubric_items_list: Current value of the ``rubric_items`` state, a
            list of dicts with "name" (score) and "value" (description) keys.
    """
    for rubric_item in rubric_items_list:
        with gr.Group():
            with gr.Row(equal_height=True):
                with gr.Column(min_width=60, scale=2):
                    gr.Textbox(
                        rubric_item['name'],
                        label="Score",
                        show_label=True,
                        interactive=False
                    )
                with gr.Column(scale=8):
                    gr.Textbox(
                        rubric_item['value'],
                        label="Description",
                        show_label=True,
                        interactive=False
                    )
                with gr.Column(min_width=15, scale=1):
                    delete_btn = gr.Button("X", size="lg", variant="secondary")

                    # The default argument binds this row's item now; a plain
                    # closure would see only the last item of the loop.
                    def delete(rubric_item=rubric_item):
                        # Return a fresh list instead of mutating the state value
                        # in place: the list object can be shared with other
                        # references, and a repeated click must not raise.
                        remaining = list(rubric_items_list)
                        if rubric_item in remaining:
                            remaining.remove(rubric_item)
                        return remaining

                    delete_btn.click(delete, None, [rubric_items])  # This is the state variable
# "Add Rubric Item" appends the typed score/description to the rubric state; the
# same two textboxes are also outputs so add_rubric_item can write back to them.
with gr.Group():
add_rubric_btn = gr.Button("Add Rubric Item") # Assign to variable
add_rubric_btn.click(
add_rubric_item,
[rubric_items, new_rubric_name, new_rubric_value],
[rubric_items, new_rubric_name, new_rubric_value]
)
with gr.Column(scale=1):
evaluation_criteria = gr.Textbox(label="Evaluation criteria")
# Results panel: read-only score and feedback, plus the action buttons.
with gr.Row():
with gr.Column(scale=1, variant="panel"):
gr.Markdown("# **Evaluation**")
with gr.Group():
with gr.Row(equal_height=True):
with gr.Column(min_width=60, scale=1):
score = gr.Textbox(label="Score", interactive=False, autoscroll=False, max_lines=1)
with gr.Column(scale=9):
feedback = gr.Textbox(label="Feedback", interactive=False, autoscroll=False, max_lines=6)
with gr.Column(min_width=15, scale=1):
evaluate_btn = gr.Button("Evaluate", variant="primary")
reset_all_btn = gr.Button("Clear All", variant="stop") # Add Reset All button
# The outputs below must stay in the same order as the tuple returned by reset_all():
# eleven cleared values first, then six visibility updates. The four entry textboxes
# appear twice on purpose - once to receive a value, once to receive a visibility update.
reset_all_btn.click(
reset_all,
inputs=[],
outputs=[
inputs_task,
new_input_name,
new_input_value,
rubric_items,
new_rubric_name,
new_rubric_value,
evaluation_criteria,
output_name,
output_value,
feedback,
score,
new_input_name, # Visibility for new_input_name
new_input_value, # Visibility for new_input_value
new_rubric_name, # Visibility for new_rubric_name
new_rubric_value, # Visibility for new_rubric_value
add_input_btn, # Visibility for Add Input button
add_rubric_btn, # Visibility for Add Rubric Item button
]
)
# evaluate() receives these five components in its parameter order and returns
# (feedback, score), matching the two outputs.
evaluate_btn.click(
evaluate,
inputs=[inputs_task, output_name, output_value, evaluation_criteria, rubric_items],
outputs=[feedback, score]
)
# Components refreshed when a preset is loaded. The order mirrors the tuple
# returned by populate_fields: five form values, feedback and score, then the
# six components whose visibility is updated.
preset_outputs = [
    inputs_task,
    output_name,
    output_value,
    evaluation_criteria,
    rubric_items,
    feedback,
    score,
    new_input_name,  # Visibility for new_input_name
    new_input_value,  # Visibility for new_input_value
    new_rubric_name,  # Visibility for new_rubric_name
    new_rubric_value,  # Visibility for new_rubric_value
    add_input_btn,  # Visibility for Add Input button
    add_rubric_btn,  # Visibility for Add Rubric Item button
]

# Wire every preset button to its own example index.
for i, button in enumerate(preset_buttons):
    # The default argument freezes the index for this button, and the wrapper
    # looks up populate_fields only when the button is clicked.
    def populate_preset(ex_i=i):
        return populate_fields(ex_i)

    button.click(populate_preset, inputs=[], outputs=preset_outputs)
def populate_fields(example_index: int):
    """Return the component values that load one preset example into the form.

    Args:
        example_index: Index of the preset in ``EXAMPLES``.

    Returns:
        A 13-element tuple: task inputs, output name, output value,
        evaluation criteria and rubric of the preset, two empty strings that
        clear feedback and score, then six updates that hide the entry
        textboxes and the two "Add" buttons.

    Raises:
        IndexError: If ``example_index`` is outside ``EXAMPLES``.
    """
    import copy  # local import keeps this function self-contained

    example = EXAMPLES[example_index]
    return (
        # Hand out copies of the list-valued fields so that later edits to
        # the UI state can never alter the shared EXAMPLES presets.
        copy.deepcopy(example["inputs_task"]),
        example["output"]["name"],
        example["output"]["value"],
        example["evaluation_criteria"],
        copy.deepcopy(example["rubric"]),
        "",  # Reset feedback
        "",  # Reset score
        gr.update(visible=False),  # Hide new_input_name
        gr.update(visible=False),  # Hide new_input_value
        gr.update(visible=False),  # Hide new_rubric_name
        gr.update(visible=False),  # Hide new_rubric_value
        gr.update(visible=False),  # Hide Add Input button
        gr.update(visible=False),  # Hide Add Rubric Item button
    )
# Start serving the Blocks app built above, with Gradio's debug flag switched on.
demo.launch(debug=True)