"""Gradio demo for evaluating LLM outputs with Flow-Judge-v0.1.

The page lets a user define named task inputs, a named output, evaluation
criteria and a scoring rubric, then runs a ``FlowJudge`` evaluation and shows
the returned feedback and score. Preset examples can pre-fill every field.
"""

from typing import List, Dict, Tuple

import gradio as gr
import spaces
import pandas as pd
from flow_judge import Hf, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem
from huggingface_hub import snapshot_download
from flow_judge.models.huggingface import Hf

from examples import get_examples

MODEL_NAME = "flowaicom/Flow-Judge-v0.1"


def download_model():
    """Download the ``MODEL_NAME`` snapshot from the Hugging Face Hub.

    Returns:
        ``True`` when the download call completes.

    Raises:
        RuntimeError: If the download fails; the original exception is
            attached as ``__cause__``.
    """
    try:
        print(f"Downloading model {MODEL_NAME}...")
        snapshot_download(repo_id=MODEL_NAME)
        print(f"Model {MODEL_NAME} downloaded to default Hugging Face cache")
        return True
    except Exception as e:
        raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}") from e


def _build_rubric(rubric_items: List[Dict[str, str]]) -> List[RubricItem]:
    """Convert ``{"name": score, "value": description}`` dicts to RubricItems."""
    rubric = []
    for rubric_item in rubric_items:
        try:
            score = int(rubric_item['name'])
        except (TypeError, ValueError) as e:
            # The score comes from a free-text box, so name the bad value
            # instead of surfacing a bare int() conversion error.
            raise ValueError(
                f"Rubric score must be an integer, got {rubric_item['name']!r}"
            ) from e
        rubric.append(RubricItem(score=score, description=rubric_item['value']))
    return rubric


@spaces.GPU
def evaluate(
    inputs_task: List[Dict[str, str]],
    output_name: str,
    output_value: str,
    evaluation_criteria: str,
    rubric_items: List[Dict[str, str]],
) -> Tuple[str, int]:
    """Judge one output against a custom metric built from the form fields.

    Args:
        inputs_task: Task inputs, e.g. ``[{'name': 'a', 'value': 'a'}]``.
        output_name: Name of the output being evaluated.
        output_value: Text of the output being evaluated.
        evaluation_criteria: Criteria text passed to the custom metric.
        rubric_items: Rubric rows as ``{'name': score, 'value': description}``;
            each ``name`` must be convertible to ``int``.

    Returns:
        ``(feedback, score)`` taken from the judge's result.

    Raises:
        ValueError: If a rubric score is not an integer.
        RuntimeError: If the model cannot be initialized or evaluation fails.
    """
    # Build and validate the cheap objects first so that bad form input
    # fails before the model is loaded.
    score_rubric_items = _build_rubric(rubric_items)

    eval_input = EvalInput(
        inputs=[{item['name']: item['value']} for item in inputs_task],
        output={output_name: output_value},
    )

    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=score_rubric_items,
        required_inputs=[item['name'] for item in inputs_task],
        required_output=output_name,
    )

    try:
        model = Hf(flash_attn=False)
    except Exception as e:
        raise RuntimeError(f"Failed to initialize Hf Model: {e}") from e

    judge = FlowJudge(model=model, metric=custom_metric)

    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e

    return result.feedback, result.score


def reset_all():
    """Return cleared values for every field plus "visible" updates.

    The tuple is positional: it must stay aligned with the ``outputs`` list of
    the "Clear All" button, where the four "new item" textboxes appear twice
    (once for their value, once for their visibility).
    """
    return (
        # inputs_task, new_input_name, new_input_value,
        # rubric_items, new_rubric_name, new_rubric_value
        [], "", "", [], "", "",
        # evaluation_criteria, output_name, output_value, feedback, score
        "", "", "", "", "",
        gr.update(visible=True),  # Show new_input_name
        gr.update(visible=True),  # Show new_input_value
        gr.update(visible=True),  # Show new_rubric_name
        gr.update(visible=True),  # Show new_rubric_value
        gr.update(visible=True),  # Show Add Input button
        gr.update(visible=True),  # Show Add Rubric Item button
    )


# Define presets
EXAMPLES = get_examples()

IMAGE_PATH = "./img/flow_judge_banner.png"

# NOTE(review): this text is rendered through gr.HTML but contains no markup;
# confirm whether tags/links were lost from the original header.
HEADER = """

Flow Judge Demo

Technical Report | Model Weights | Evaluation Code | Tutorials

flow-judge is a lightweight library for evaluating LLM applications with Flow-Judge-v0.1.

"""


def populate_fields(example_index: int):
    """Return the field values for a preset and hide the "add item" controls.

    The tuple is positional and must stay aligned with the ``outputs`` list of
    the preset buttons.
    """
    example = EXAMPLES[example_index]
    return (
        example["inputs_task"],
        example["output"]["name"],
        example["output"]["value"],
        example["evaluation_criteria"],
        example["rubric"],
        "",  # Reset feedback
        "",  # Reset score
        gr.update(visible=False),  # Hide new_input_name
        gr.update(visible=False),  # Hide new_input_value
        gr.update(visible=False),  # Hide new_rubric_name
        gr.update(visible=False),  # Hide new_rubric_value
        gr.update(visible=False),  # Hide Add Input button
        gr.update(visible=False),  # Hide Add Rubric Item button
    )


# NOTE(review): the source this was restored from had lost its indentation, so
# the container nesting below was inferred from statement order and the scale
# patterns; confirm the layout against the running app.
with gr.Blocks() as demo:
    model_downloaded = download_model()

    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Image(
                IMAGE_PATH,
                show_label=False,
                interactive=False,
                show_share_button=False,
                show_fullscreen_button=False,
                show_download_button=False,
            )
        with gr.Column(scale=3):
            gr.HTML(HEADER)

    gr.Markdown("# ⚡ **Quickstart Examples**")

    # Spread the preset buttons over three columns. The buttons are collected
    # in EXAMPLES order so that a button's position is its example index.
    example_count = len(EXAMPLES)
    column_bounds = [0, example_count // 3, 2 * example_count // 3, example_count]
    preset_buttons = []
    with gr.Row():
        for start, stop in zip(column_bounds, column_bounds[1:]):
            with gr.Column(scale=1):
                preset_buttons += [
                    gr.Button(example["description"])
                    for example in EXAMPLES[start:stop]
                ]

    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation task inputs**")
            gr.Markdown(
                "*Define the input names and values. Inputs are optional if "
                "evaluation depends on the output only.*"
            )
            with gr.Group():
                inputs_task = gr.State([])
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=2):
                        new_input_name = gr.Textbox(
                            label="Name",
                            show_label=True,
                            autoscroll=False,
                            max_lines=1,
                            visible=True,  # Initially visible
                        )
                    with gr.Column(scale=9):
                        new_input_value = gr.Textbox(
                            label="Value",
                            show_label=True,
                            autoscroll=False,
                            max_lines=3,
                            visible=True,  # Initially visible
                        )

            def add_input(inputs_task, new_input_name, new_input_value):
                """Append the new input to the state and clear both textboxes."""
                new_entry = {"name": new_input_name, "value": new_input_value}
                return inputs_task + [new_entry], "", ""

            @gr.render(inputs=inputs_task)  # You have to pass the state here
            def render_inputs(inputs_list):  # Use different name than the state variable
                """Render one read-only row with a delete button per input."""
                for entry in inputs_list:
                    with gr.Group():
                        with gr.Row(equal_height=True):
                            with gr.Column(min_width=60, scale=2):
                                gr.Textbox(
                                    entry['name'],
                                    label="Name",
                                    show_label=True,
                                    interactive=False,
                                    autoscroll=False,
                                    max_lines=1,
                                )
                            with gr.Column(scale=8):
                                gr.Textbox(
                                    entry['value'],
                                    label="Value",
                                    show_label=True,
                                    interactive=False,
                                    autoscroll=False,
                                    max_lines=3,
                                )
                            with gr.Column(min_width=15, scale=1):
                                delete_btn = gr.Button("X", size="lg", variant="secondary")

                                # Bind the row as a default argument so each
                                # button deletes its own entry, not the last
                                # one seen by the loop.
                                def delete(entry=entry):
                                    inputs_list.remove(entry)
                                    return inputs_list

                                # This is the state variable
                                delete_btn.click(delete, None, [inputs_task])

            with gr.Group():
                add_input_btn = gr.Button("Add Input")  # Assign to variable
                add_input_btn.click(
                    add_input,
                    [inputs_task, new_input_name, new_input_value],
                    [inputs_task, new_input_name, new_input_value],
                )

        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation task output**")
            gr.Markdown("*Define the output name and value. Output is always required.*")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=2):
                        output_name = gr.Textbox(
                            label="Name",
                            show_label=True,
                            interactive=True,
                            autoscroll=False,
                            max_lines=1,
                        )
                    with gr.Column(scale=9):
                        output_value = gr.Textbox(
                            label="Value",
                            show_label=True,
                            interactive=True,
                            autoscroll=False,
                            max_lines=3,
                        )

        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation criteria and rubric**")
            gr.Markdown(
                "*Define the evaluation criteria and rubric for the evaluation task. "
                "Supported scoring scales: Binary (0 and 1), 3-Likert and 5-Likert.*"
                "\n\n"
                "*❗You can experiment with other scoring scales. "
                "However, performance may vary.*"
            )
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Group():
                        rubric_items = gr.State([])
                        with gr.Row(equal_height=True):
                            with gr.Column(min_width=60, scale=2):
                                new_rubric_name = gr.Textbox(
                                    label="Score",
                                    show_label=True,
                                    interactive=True,
                                    autoscroll=False,
                                    max_lines=1,
                                    visible=True,  # Initially visible
                                )
                            with gr.Column(scale=9):
                                new_rubric_value = gr.Textbox(
                                    label="Description",
                                    show_label=True,
                                    interactive=True,
                                    autoscroll=False,
                                    max_lines=3,
                                    visible=True,  # Initially visible
                                )

                    def add_rubric_item(rubric_items, new_rubric_name, new_rubric_value):
                        """Append the new rubric row and clear both textboxes."""
                        new_row = {"name": new_rubric_name, "value": new_rubric_value}
                        return rubric_items + [new_row], "", ""

                    @gr.render(inputs=rubric_items)  # You have to pass the state here
                    def render_rubrics(rubric_items_list):  # Use different name than the state variable
                        """Render one read-only row with a delete button per rubric item."""
                        for rubric_item in rubric_items_list:
                            with gr.Group():
                                with gr.Row(equal_height=True):
                                    with gr.Column(min_width=60, scale=2):
                                        gr.Textbox(
                                            rubric_item['name'],
                                            label="Score",
                                            show_label=True,
                                            interactive=False,
                                        )
                                    with gr.Column(scale=8):
                                        gr.Textbox(
                                            rubric_item['value'],
                                            label="Description",
                                            show_label=True,
                                            interactive=False,
                                        )
                                    with gr.Column(min_width=15, scale=1):
                                        delete_btn = gr.Button("X", size="lg", variant="secondary")

                                        # Default argument pins this row to
                                        # this button (see render_inputs).
                                        def delete(rubric_item=rubric_item):
                                            rubric_items_list.remove(rubric_item)
                                            return rubric_items_list

                                        # This is the state variable
                                        delete_btn.click(delete, None, [rubric_items])

                    with gr.Group():
                        add_rubric_btn = gr.Button("Add Rubric Item")  # Assign to variable
                        add_rubric_btn.click(
                            add_rubric_item,
                            [rubric_items, new_rubric_name, new_rubric_value],
                            [rubric_items, new_rubric_name, new_rubric_value],
                        )

                with gr.Column(scale=1):
                    evaluation_criteria = gr.Textbox(label="Evaluation criteria")

    with gr.Row():
        with gr.Column(scale=1, variant="panel"):
            gr.Markdown("# **Evaluation**")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=1):
                        score = gr.Textbox(
                            label="Score",
                            interactive=False,
                            autoscroll=False,
                            max_lines=1,
                        )
                    with gr.Column(scale=9):
                        feedback = gr.Textbox(
                            label="Feedback",
                            interactive=False,
                            autoscroll=False,
                            max_lines=6,
                        )
                    with gr.Column(min_width=15, scale=1):
                        evaluate_btn = gr.Button("Evaluate", variant="primary")
                        reset_all_btn = gr.Button("Clear All", variant="stop")

    # Add Reset All button. The outputs order mirrors reset_all()'s tuple.
    reset_all_btn.click(
        reset_all,
        inputs=[],
        outputs=[
            inputs_task,
            new_input_name,
            new_input_value,
            rubric_items,
            new_rubric_name,
            new_rubric_value,
            evaluation_criteria,
            output_name,
            output_value,
            feedback,
            score,
            new_input_name,    # Visibility for new_input_name
            new_input_value,   # Visibility for new_input_value
            new_rubric_name,   # Visibility for new_rubric_name
            new_rubric_value,  # Visibility for new_rubric_value
            add_input_btn,     # Visibility for Add Input button
            add_rubric_btn,    # Visibility for Add Rubric Item button
        ],
    )

    evaluate_btn.click(
        evaluate,
        inputs=[inputs_task, output_name, output_value, evaluation_criteria, rubric_items],
        outputs=[feedback, score],
    )

    for i, button in enumerate(preset_buttons):
        # Default argument freezes the index for this button; a plain closure
        # would see the loop's final value of ``i``.
        def populate_preset(ex_i=i):
            return populate_fields(ex_i)

        # The outputs order mirrors populate_fields()'s tuple.
        button.click(
            populate_preset,
            inputs=[],
            outputs=[
                inputs_task,
                output_name,
                output_value,
                evaluation_criteria,
                rubric_items,
                feedback,
                score,
                new_input_name,    # Visibility for new_input_name
                new_input_value,   # Visibility for new_input_value
                new_rubric_name,   # Visibility for new_rubric_name
                new_rubric_value,  # Visibility for new_rubric_value
                add_input_btn,     # Visibility for Add Input button
                add_rubric_btn,    # Visibility for Add Rubric Item button
            ],
        )

demo.launch(debug=True)