import gradio as gr
import pandas as pd
from flow_judge import Vllm, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem

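# Initialize the judge model once at startup; fail fast if vLLM cannot be loaded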
try:
    model = Vllm(quantized=False)
except Exception as e:
    raise RuntimeError(f"Failed to initialize Vllm: {e}")

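# Pre-built examples that the example buttons load into the UI fields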
EXAMPLES = [
    {
        "example_description": "Faithfulness of an answer",
        "emoji": "🏈",
        "task_inputs": [
            {"name": "Question", "value": "What is the capital of France?"},
            {"name": "Context", "value": "Paris is the capital of Spain."},
        ],
        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": (
            "Based on the provided context, does the response contain only information that is "
            "supported by or directly inferable from the context?"
        ),
        "rubric": [
            "The response contains statements or claims that cannot be directly found in or "
            "logically inferred from the provided context. There is hallucinated or fabricated "
            "information present in the response that does not have support in the given context.",
            "The response contains only statements and claims that are directly stated in or "
            "logically inferable from the provided context. There is no hallucinated or fabricated "
            "information present in the response that cannot be traced back to or deduced from the context.",
        ],
    }
]

def populate_fields(example_index: int):
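    """Return the dataframe and textbox values that pre-fill the UI for the selected example."""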
    example = EXAMPLES[example_index]
    return (
        [[input["name"], input["value"]] for input in example["task_inputs"]],
        [[example["task_output"]["name"], example["task_output"]["value"]]],
        example["evaluation_criteria"],
        [[str(i), description] for i, description in enumerate(example["rubric"])]
    )

def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
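    """Evaluate the task output against the criteria and rubric, returning (feedback, score)."""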
    # Convert inputs to the expected format
    eval_input = EvalInput(
        inputs=[{row['Name']: row['Value']} for _, row in task_inputs.iterrows()],
        output={row['Name']: row['Value'] for _, row in task_output.iterrows()}
    )
    
    # Parse the rubric into RubricItems
    rubric_items = [
        RubricItem(score=int(row['Score']), description=row['Description'])
        for _, row in rubric.iterrows()
    ]
    
    # Create the CustomMetric
    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=rubric_items,
        required_inputs=[input_row['Name'] for _, input_row in task_inputs.iterrows()],
        required_output=task_output.iloc[0]['Name']
    )
    
    # Create a FlowJudge instance
    judge = FlowJudge(model=model, metric=custom_metric)
    
    # Evaluate using FlowJudge
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}")
    
    # Extract feedback and score from the result
    feedback = result.feedback
    score = result.score
    
    return feedback, score

def reset_fields():
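    """Clear every field in the UI, including the feedback and score."""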
    return (
        [["", ""]],  # task_inputs
        [["", ""]],  # task_output
        "",          # evaluation_criteria
        [["", ""]],  # rubric
        "",          # feedback
        ""           # score
    )

def reset_task():
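    """Clear the task inputs and output dataframes."""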
    return (
        [["", ""]],  # task_inputs
        [["", ""]]   # task_output
    )

def reset_evaluation_criteria():
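    """Clear the evaluation criteria and rubric."""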
    return (
        "",          # evaluation_criteria
        [["", ""]]   # rubric
    )

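# Gradio UI layout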
with gr.Blocks() as demo:
    with gr.Row():
        example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]

    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            task_inputs = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            add_input_btn = gr.Button("Add Input")

            gr.Markdown("**Output**")
            task_output = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            
            reset_task_btn = gr.Button("Clear Inputs and Output")

        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
            rubric = gr.Dataframe(
                headers=["Score", "Description"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["10%", "90%"]
            )
            add_score_btn = gr.Button("Add Score")
            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")

    with gr.Row():
        with gr.Column(scale=1, variant="compact"):
            gr.Markdown("**Evaluation**")
            feedback = gr.Textbox(label="Feedback")
            score = gr.Textbox(label="Score")
            evaluate_btn = gr.Button("Evaluate")
        
    with gr.Row():
        # Single button that clears every field at once
        reset_all_btn = gr.Button("Clear All")

    # Event handlers
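    # The "Add" buttons return an updated dataframe with an extra empty row appended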
    add_input_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Name", "Value"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["30%", "70%"]),
        inputs=task_inputs,
        outputs=task_inputs
    )

    add_score_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Score", "Description"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["10%", "90%"]),
        inputs=rubric,
        outputs=rubric
    )

    for i, button in enumerate(example_buttons):
        button.click(
            populate_fields,
            inputs=[gr.State(i)],  # Pass the example index as a state
            outputs=[task_inputs, task_output, evaluation_criteria, rubric]
        )

    evaluate_btn.click(
        evaluate,
        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
        outputs=[feedback, score]
    )

    reset_task_btn.click(
        reset_task,
        inputs=[],
        outputs=[task_inputs, task_output]
    )

    reset_criteria_btn.click(
        reset_evaluation_criteria,
        inputs=[],
        outputs=[evaluation_criteria, rubric]
    )

    reset_all_btn.click(
        reset_fields,
        inputs=[],
        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score]
    )

if __name__ == "__main__":
    demo.launch(debug=True)