# Flow Judge Demo

import gradio as gr
import spaces
import pandas as pd
from typing import List, Dict, Tuple
from flow_judge import Hf, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem
from huggingface_hub import snapshot_download
from flow_judge.models.huggingface import Hf
from examples import get_examples

MODEL_NAME = "flowaicom/Flow-Judge-v0.1"

def download_model():
    """Pre-fetch the snapshot of ``MODEL_NAME`` from the Hugging Face Hub.

    Progress is reported with ``print`` before and after the download.

    Returns:
        True once ``snapshot_download`` has completed.

    Raises:
        RuntimeError: If the download fails for any reason. The original
            exception is chained as ``__cause__`` so the root cause stays
            visible in the traceback.
    """
    print(f"Downloading model {MODEL_NAME}...")
    # Keep the try body down to the single call that can actually fail.
    try:
        snapshot_download(repo_id=MODEL_NAME)
    except Exception as e:
        raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}") from e
    print(f"Model {MODEL_NAME} downloaded to default Hugging Face cache")
    return True
    
    
@spaces.GPU
def evaluate(
    inputs_task: List[Dict[str, str]],
    output_name: str,
    output_value: str,
    evaluation_criteria: str,
    rubric_items: List[Dict[str, str]]
) -> Tuple[str, int]:
    """Score one task output with Flow Judge against a user-defined rubric.

    Args:
        inputs_task: Task inputs, each shaped like
            ``{'name': ..., 'value': ...}``. May be empty.
        output_name: Name of the output being judged.
        output_value: Text of the output being judged.
        evaluation_criteria: Criteria text passed to the custom metric.
        rubric_items: Rubric rows shaped like ``{'name': ..., 'value': ...}``
            where ``name`` holds the score (parsed with ``int``) and
            ``value`` holds its description.

    Returns:
        ``(feedback, score)`` read from the judge's result.

    Raises:
        ValueError: If a rubric score cannot be parsed as an integer.
        RuntimeError: If the model cannot be initialised or the evaluation
            itself fails; the original exception is chained as the cause.
    """
    # Parse the rubric before touching the model so a malformed score fails
    # fast, with a readable message, instead of after model initialisation.
    score_rubric_items = []
    for rubric_item in rubric_items:
        try:
            rubric_score = int(rubric_item['name'])
        except ValueError as e:
            raise ValueError(
                f"Rubric score must be an integer, got {rubric_item['name']!r}"
            ) from e
        score_rubric_items.append(
            RubricItem(score=rubric_score, description=rubric_item['value'])
        )

    try:
        model = Hf(flash_attn=False)
    except Exception as e:
        raise RuntimeError(f"Failed to initialize Hf Model: {e}") from e

    # Each input becomes its own single-key mapping: [{'name': 'value'}, ...]
    eval_input = EvalInput(
        inputs=[{entry['name']: entry['value']} for entry in inputs_task],
        output={output_name: output_value}
    )

    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=score_rubric_items,
        required_inputs=[entry['name'] for entry in inputs_task],
        required_output=output_name
    )

    judge = FlowJudge(model=model, metric=custom_metric)

    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e

    return result.feedback, result.score

def reset_all():
    """Produce the values that return the whole form to its blank state.

    The result is a flat 17-tuple, in the order expected by the "Clear All"
    button's outputs: six resets for the input/rubric lists and their entry
    boxes, five empty strings for the remaining text fields, and six
    ``gr.update(visible=True)`` entries that re-show the manual-entry widgets
    and the two "Add" buttons.
    """
    # Input list + its two entry boxes, then rubric list + its two entry boxes.
    blank_entries = ([], "", "", [], "", "")
    # The five additional text fields reset to empty strings.
    blank_fields = ("",) * 5
    # One visibility update per widget that a preset may have hidden.
    show_controls = tuple(gr.update(visible=True) for _ in range(6))
    return blank_entries + blank_fields + show_controls

# Preset examples; each one backs a quickstart button in the UI below.
EXAMPLES = get_examples()

# Relative path of the banner image shown next to the header.
IMAGE_PATH = "./img/flow_judge_banner.png"

# HTML header: centered title, a row of resource links, and a one-line
# description of the library.
HEADER = """<h1 align="center" style="font-family: 'Courier New', Courier, monospace;">Flow Judge Demo</h1>

<p align="center" style="font-family: 'Courier New', Courier, monospace;">
  <strong>
    <a href="https://www.flow-ai.com/judge">Technical Report</a> |
    <a href="https://huggingface.co/collections/flowaicom/flow-judge-v01-66e6af5fc3b3a128bde07dec">Model Weights</a> |
    <a href="https://github.com/flowaicom/lm-evaluation-harness/tree/Flow-Judge-v0.1_evals/lm_eval/tasks/flow_judge_evals">Evaluation Code</a> |
    <a href="https://github.com/flowaicom/flow-judge/tree/main/examples">Tutorials</a>
  </strong>
</p>

<p align="center" style="font-family: 'Courier New', Courier, monospace;">
  <code>flow-judge</code> is a lightweight library for evaluating LLM applications with <code>Flow-Judge-v0.1</code>.
</p>"""


# Build the whole demo UI. Components appear on the page in the order they are
# created here, and event handlers are wired once the components they touch exist.
with gr.Blocks() as demo:
    # Fetch the model snapshot while the UI is being built; this raises
    # RuntimeError on failure. The returned flag is not read again in this block.
    model_downloaded = download_model()
    
    # --- Banner row: image on the left, HTML header on the right ---
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Image(IMAGE_PATH, show_label=False, interactive=False, show_share_button=False, show_fullscreen_button=False, show_download_button=False)
        with gr.Column(scale=3):
            gr.HTML(HEADER)
    # --- Quickstart presets: one button per example, split over three columns ---
    gr.Markdown("# ⚡ **Quickstart Examples**")
    with gr.Row():
        # The three slices partition EXAMPLES in order, so the position of a
        # button in preset_buttons equals the index of its example.
        with gr.Column(scale=1):
            preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES[:len(EXAMPLES)//3]]
        with gr.Column(scale=1):
            preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[len(EXAMPLES)//3:2*len(EXAMPLES)//3]]
        with gr.Column(scale=1):
            preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[2*len(EXAMPLES)//3:]]
    
    # --- Task definition row: inputs on the left, output on the right ---
    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation task inputs**")
            gr.Markdown("*<span style='color: gray;'>Define the input names and values. Inputs are optional if evaluation depends on the output only.</span>*")
            with gr.Group():
                # State holding the list of {"name": ..., "value": ...} input entries.
                inputs_task = gr.State([])
                # Entry boxes for a new input (hidden when a preset is loaded).
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=2):
                        new_input_name = gr.Textbox(
                            label="Name",
                            show_label=True,
                            autoscroll=False,
                            max_lines=1,
                            visible=True  # Initially visible
                        )
                    with gr.Column(scale=9):
                        new_input_value = gr.Textbox(
                            label="Value",
                            show_label=True,
                            autoscroll=False,
                            max_lines=3,
                            visible=True  # Initially visible
                        )
        
            # Append the new entry to a fresh list and clear both entry boxes.
            def add_input(inputs_task, new_input_name, new_input_value):
                return inputs_task + [{"name": new_input_name, "value": new_input_value}], "", ""
            
            # Renders one read-only row (name, value, delete button) per stored input.
            @gr.render(inputs=inputs_task) # You have to pass the state here
            def render_inputs(inputs_list): # Use different name than the state variable
                
                for input in inputs_list:
                    with gr.Group():
                        with gr.Row(equal_height=True):
                            with gr.Column(min_width=60, scale=2):
                                gr.Textbox(input['name'], label="Name", show_label=True, interactive=False, autoscroll=False, max_lines=1)
                            with gr.Column(scale=8):
                                gr.Textbox(input['value'], label="Value", show_label=True, interactive=False, autoscroll=False, max_lines=3)
                            with gr.Column(min_width=15, scale=1):
                                delete_btn = gr.Button("X", size="lg", variant="secondary")
                                # The default argument pins this row's entry so each
                                # button removes its own item.
                                # NOTE(review): removes in place; if this list object is
                                # shared with another holder, that holder sees the removal
                                # too — confirm state values are independent copies.
                                def delete(input=input):
                                    inputs_list.remove(input)
                                    return inputs_list
                            delete_btn.click(delete, None, [inputs_task]) # This is the state variable
                        
            with gr.Group():
                add_input_btn = gr.Button("Add Input")  # Assign to variable
                # Outputs mirror add_input's return: new list, then the two cleared boxes.
                add_input_btn.click(
                    add_input,
                    [inputs_task, new_input_name, new_input_value],
                    [inputs_task, new_input_name, new_input_value]
                )
        
        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation task output**")
            gr.Markdown("*<span style='color: gray;'>Define the output name and value. Output is always required.</span>*")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=2):
                        output_name = gr.Textbox(label="Name", show_label=True, interactive=True, autoscroll=False, max_lines=1)
                    with gr.Column(scale=9):
                        output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)
    
   
    # --- Criteria and rubric: rubric rows on the left, criteria text on the right ---
    with gr.Column(scale=1):
        gr.Markdown("## **Evaluation criteria and rubric**")
        gr.Markdown("*<span style='color: gray;'>Define the evaluation criteria and rubric for the evaluation task. Supported scoring scales: Binary (0 and 1), 3-Likert and 5-Likert.</span>*\n\n*<span style='color: gray;'>❗You can experiment with other scoring scales. However, performance may vary.</span>*")

        with gr.Row():
            with gr.Column(scale=1):
                with gr.Group():
                    # State holding the rubric rows; "name" carries the score and
                    # "value" the description (evaluate() parses "name" with int()).
                    rubric_items = gr.State([])
                    with gr.Row(equal_height=True):
                        with gr.Column(min_width=60, scale=2):
                            new_rubric_name = gr.Textbox(
                                label="Score",
                                show_label=True,
                                interactive=True,
                                autoscroll=False,
                                max_lines=1,
                                visible=True  # Initially visible
                            )
                        with gr.Column(scale=9):
                            new_rubric_value = gr.Textbox(
                                label="Description",
                                show_label=True,
                                interactive=True,
                                autoscroll=False,
                                max_lines=3,
                                visible=True  # Initially visible
                            )
                    
                # Append the new rubric row to a fresh list and clear both entry boxes.
                def add_rubric_item(rubric_items, new_rubric_name, new_rubric_value):
                    return rubric_items + [{"name": new_rubric_name, "value": new_rubric_value}], "", ""
                
                # Renders one read-only row (score, description, delete button) per rubric item.
                @gr.render(inputs=rubric_items) # You have to pass the state here
                def render_rubrics(rubric_items_list): # Use different name than the state variable
                    
                    for rubric_item in rubric_items_list:
                        with gr.Group():
                            with gr.Row(equal_height=True):
                                with gr.Column(min_width=60, scale=2):
                                    gr.Textbox(
                                        rubric_item['name'],
                                        label="Score",
                                        show_label=True,
                                        interactive=False
                                    )
                                with gr.Column(scale=8):
                                    gr.Textbox(
                                        rubric_item['value'],
                                        label="Description",
                                        show_label=True,
                                        interactive=False
                                    )
                                with gr.Column(min_width=15, scale=1):
                                    delete_btn = gr.Button("X", size="lg", variant="secondary")
                                    # Same pattern as the input rows: the default argument
                                    # pins this row's item; removal happens in place.
                                    def delete(rubric_item=rubric_item):
                                        rubric_items_list.remove(rubric_item)
                                        return rubric_items_list
                                    delete_btn.click(delete, None, [rubric_items]) # This is the state variable
                                
                with gr.Group():
                    add_rubric_btn = gr.Button("Add Rubric Item")  # Assign to variable
                    # Outputs mirror add_rubric_item's return: new list, then the cleared boxes.
                    add_rubric_btn.click(
                        add_rubric_item,
                        [rubric_items, new_rubric_name, new_rubric_value],
                        [rubric_items, new_rubric_name, new_rubric_value]
                    )
            with gr.Column(scale=1):
                evaluation_criteria = gr.Textbox(label="Evaluation criteria")
                            
    # --- Results panel: score, feedback, and the Evaluate / Clear All buttons ---
    with gr.Row():
        with gr.Column(scale=1, variant="panel"):
            gr.Markdown("# **Evaluation**")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=1):
                        score = gr.Textbox(label="Score", interactive=False, autoscroll=False, max_lines=1)
                    with gr.Column(scale=9):
                        feedback = gr.Textbox(label="Feedback", interactive=False, autoscroll=False, max_lines=6)
                    with gr.Column(min_width=15, scale=1):
                        evaluate_btn = gr.Button("Evaluate", variant="primary")

                        reset_all_btn = gr.Button("Clear All", variant="stop")  # Add Reset All button
                        # The 17 outputs line up one-to-one with the tuple returned by
                        # reset_all(). The four entry boxes are listed twice: once to
                        # receive a cleared value and once to receive a visibility update.
                        reset_all_btn.click(
                            reset_all,
                            inputs=[], 
                            outputs=[
                                inputs_task, 
                                new_input_name, 
                                new_input_value, 
                                rubric_items, 
                                new_rubric_name, 
                                new_rubric_value,
                                evaluation_criteria,
                                output_name,
                                output_value,
                                feedback,
                                score,
                                new_input_name,       # Visibility for new_input_name
                                new_input_value,      # Visibility for new_input_value
                                new_rubric_name,      # Visibility for new_rubric_name
                                new_rubric_value,     # Visibility for new_rubric_value
                                add_input_btn,        # Visibility for Add Input button
                                add_rubric_btn,       # Visibility for Add Rubric Item button
                            ]
                        )
    
    # evaluate() returns (feedback, score), matching the order of the outputs.
    evaluate_btn.click(
        evaluate,
        inputs=[inputs_task, output_name, output_value, evaluation_criteria, rubric_items],
        outputs=[feedback, score]
    )
    
    # Wire every preset button to load its example into the form.
    for i, button in enumerate(preset_buttons):
        # ex_i=i pins the index at definition time (avoids the late-binding
        # closure pitfall). populate_fields is defined further down the file and
        # is only looked up when the handler actually runs.
        def populate_preset(ex_i=i):
            return populate_fields(ex_i)

        # The 13 outputs line up one-to-one with populate_fields' return tuple.
        button.click(
            populate_preset,
            inputs=[],
            outputs=[
                inputs_task, 
                output_name, 
                output_value, 
                evaluation_criteria, 
                rubric_items,
                feedback,
                score,
                new_input_name,       # Visibility for new_input_name
                new_input_value,      # Visibility for new_input_value
                new_rubric_name,      # Visibility for new_rubric_name
                new_rubric_value,     # Visibility for new_rubric_value
                add_input_btn,        # Visibility for Add Input button
                add_rubric_btn,       # Visibility for Add Rubric Item button
            ]
        )

def populate_fields(example_index: int):
    """Build the component values that load one preset example into the form.

    Args:
        example_index: Position of the preset in ``EXAMPLES``.

    Returns:
        A 13-tuple: the example's inputs list, output name, output value,
        evaluation criteria and rubric list, followed by two empty strings
        (clearing feedback and score) and six ``gr.update(visible=False)``
        entries that hide the manual-entry boxes and the two "Add" buttons.
    """
    # Local import keeps this block self-contained.
    import copy

    example = EXAMPLES[example_index]
    return (
        # Hand out copies, not the lists stored in EXAMPLES: the row delete
        # handlers call .remove() on the state list in place, which would
        # otherwise alter the shared preset for every later load.
        copy.deepcopy(example["inputs_task"]),
        example["output"]["name"],
        example["output"]["value"],
        example["evaluation_criteria"],
        copy.deepcopy(example["rubric"]),
        "",  # Reset feedback
        "",  # Reset score
        gr.update(visible=False),  # Hide new_input_name
        gr.update(visible=False),  # Hide new_input_value
        gr.update(visible=False),  # Hide new_rubric_name
        gr.update(visible=False),  # Hide new_rubric_value
        gr.update(visible=False),  # Hide Add Input button
        gr.update(visible=False),  # Hide Add Rubric Item button
    )

# Start the Gradio app. There is no __main__ guard, so this runs whenever the
# module is executed or imported.
demo.launch(debug=True)