# Hugging Face Spaces page header captured with this file ("Spaces: Sleeping"); not part of the app code.
import gradio as gr | |
import spaces | |
import pandas as pd | |
from typing import List, Dict, Tuple | |
from flow_judge import Hf, FlowJudge, EvalInput | |
from flow_judge.metrics import CustomMetric, RubricItem | |
from huggingface_hub import snapshot_download | |
from flow_judge.models.huggingface import Hf | |
# Hugging Face Hub repository id of the judge model fetched by download_model().
MODEL_NAME = "flowaicom/Flow-Judge-v0.1"


def download_model():
    """Download the ``MODEL_NAME`` snapshot into the default Hugging Face cache.

    Returns:
        True once ``snapshot_download`` has completed.

    Raises:
        RuntimeError: If the download fails for any reason. The underlying
            exception is attached as ``__cause__`` so the full traceback is
            preserved.
    """
    print(f"Downloading model {MODEL_NAME}...")
    # Keep the try body to the one call that is expected to fail.
    try:
        snapshot_download(repo_id=MODEL_NAME)
    except Exception as e:
        raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}") from e
    print(f"Model {MODEL_NAME} downloaded to default Hugging Face cache")
    return True
def evaluate(
    inputs_task: List[Dict[str, str]],
    output_name: str,
    output_value: str,
    evaluation_criteria: str,
    rubric_items: List[Dict[str, str]]
) -> Tuple[str, int]:
    """Score one output against a user-defined rubric with Flow-Judge.

    Args:
        inputs_task: Input entries shaped like
            ``[{'name': 'a', 'value': 'a'}]``.
        output_name: Name of the output being judged.
        output_value: Text of the output being judged.
        evaluation_criteria: Criteria text passed to the custom metric.
        rubric_items: Rubric entries in the same ``name``/``value`` shape,
            where ``name`` holds the score (must parse as an integer) and
            ``value`` holds its description.

    Returns:
        ``(feedback, score)`` taken from the judge's evaluation result.

    Raises:
        ValueError: If a rubric score is not an integer.
        RuntimeError: If the model cannot be initialised or the evaluation
            itself fails; the underlying exception is chained as the cause.
    """
    # Validate the rubric before anything else so that a typo in a score
    # fails fast with a clear message instead of after loading the model.
    rubric_scores = []
    for rubric_item in rubric_items:
        raw_score = rubric_item['name']
        try:
            rubric_scores.append(int(raw_score))
        except ValueError as e:
            raise ValueError(
                f"Rubric score must be an integer, got {raw_score!r}"
            ) from e

    try:
        model = Hf(flash_attn=False)
    except Exception as e:
        raise RuntimeError(f"Failed to initialize Hf Model: {e}") from e

    # Each input becomes its own single-key {name: value} mapping.
    eval_input = EvalInput(
        inputs=[
            {task_input['name']: task_input['value']}
            for task_input in inputs_task
        ],
        output={output_name: output_value}
    )

    score_rubric_items = [
        RubricItem(score=score, description=rubric_item['value'])
        for score, rubric_item in zip(rubric_scores, rubric_items)
    ]

    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=score_rubric_items,
        required_inputs=[task_input['name'] for task_input in inputs_task],
        required_output=output_name
    )

    judge = FlowJudge(model=model, metric=custom_metric)

    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e

    return result.feedback, result.score
def reset_all():
    """Return blank values for every component cleared by "Clear All".

    The tuple is positional and must stay aligned with the ``outputs`` list
    of ``reset_all_btn.click``: inputs state, new-input name/value, rubric
    state, new-rubric score/description, then evaluation criteria, output
    name, output value, feedback and score.

    Returns:
        An 11-tuple: two fresh empty lists (for the state components) and
        nine empty strings (for the textboxes).
    """
    return (
        [], "", "", [], "", "",  # Existing resets for inputs and rubrics
        "", "", "", "", ""  # New resets for additional fields
    )
# Presets offered as one-click buttons in the UI. Each entry uses the same
# {"name": ..., "value": ...} shape as the interactive inputs and rubric
# items, so it can be loaded straight into the corresponding components.
EXAMPLES = [
    {
        "description": "Example 1: Basic Evaluation",
        "inputs_task": [
            {"name": "Question", "value": "What is the capital of France?"},
        ],
        "output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": "Ensure the answer is accurate and based on the input question.",
        "rubric": [
            {"name": "1", "value": "Incorrect answer."},
            {"name": "2", "value": "Partially correct answer."},
            {"name": "3", "value": "Completely correct answer."},
        ],
    },
    {
        "description": "Example 2: Contextual Understanding",
        "inputs_task": [
            {"name": "Statement", "value": "All swans are white."},
        ],
        "output": {"name": "Conclusion", "value": "There are no black swans."},
        "evaluation_criteria": "Verify the conclusion logically follows from the statement.",
        "rubric": [
            {"name": "1", "value": "Conclusion does not follow."},
            {"name": "2", "value": "Conclusion somewhat follows."},
            {"name": "3", "value": "Conclusion logically follows."},
        ],
    },
]
# Build the evaluation UI: a dynamic list of task inputs and the output to
# judge on the left, criteria plus a dynamic score rubric on the right, and
# the evaluation result with its action buttons underneath.
with gr.Blocks() as demo:
    # Fetch the model weights while the app is being built.
    model_downloaded = download_model()

    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            inputs_task = gr.State([])
            new_input_name = gr.Textbox(label="Name")
            new_input_value = gr.Textbox(label="Value")

            def add_input(inputs_task, new_input_name, new_input_value):
                """Append a new input entry and clear the two entry textboxes."""
                return inputs_task + [{"name": new_input_name, "value": new_input_value}], "", ""

            # You have to pass the state here: gr.render re-runs the function
            # whenever the state changes, which is what makes the list appear.
            @gr.render(inputs=inputs_task)
            def render_inputs(inputs_list):  # Use different name than the state variable
                """Render one read-only row with a delete button per input."""
                for item in inputs_list:
                    with gr.Group():
                        with gr.Row(equal_height=True):
                            with gr.Column(min_width=60, scale=2):
                                gr.Textbox(item['name'], label="Name", show_label=True, interactive=False, autoscroll=False, max_lines=1)
                            with gr.Column(scale=8):
                                gr.Textbox(item['value'], label="Value", show_label=True, interactive=False, autoscroll=False, max_lines=3)
                            with gr.Column(min_width=15, scale=1):
                                delete_btn = gr.Button("X", size="lg", variant="secondary")

                                # Bind the current item as a default argument;
                                # a plain closure would see only the last one.
                                def delete(item=item):
                                    # Return a copy rather than mutating the
                                    # list that is currently held in state.
                                    remaining = list(inputs_list)
                                    remaining.remove(item)
                                    return remaining

                                delete_btn.click(delete, None, [inputs_task])  # This is the state variable

            gr.Button("Add Input").click(
                add_input,
                [inputs_task, new_input_name, new_input_value],
                [inputs_task, new_input_name, new_input_value]
            )

            gr.Markdown("**Output**")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=1):
                        output_name = gr.Textbox(label="Name", show_label=True, interactive=True, autoscroll=False, max_lines=1)
                    with gr.Column(scale=6):
                        output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)

        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")

            gr.Markdown("**Score rubrics**")
            rubric_items = gr.State([])
            new_rubric_name = gr.Textbox(label="Score", show_label=True, interactive=True, autoscroll=False, max_lines=1)
            new_rubric_value = gr.Textbox(label="Description", show_label=True, interactive=True, autoscroll=False, max_lines=3)

            def add_rubric_item(rubric_items, new_rubric_name, new_rubric_value):
                """Append a new rubric entry and clear the two entry textboxes."""
                return rubric_items + [{"name": new_rubric_name, "value": new_rubric_value}], "", ""

            # You have to pass the state here (see render_inputs above).
            @gr.render(inputs=rubric_items)
            def render_rubrics(rubric_items_list):  # Use different name than the state variable
                """Render one read-only row with a delete button per rubric item."""
                for rubric_item in rubric_items_list:
                    with gr.Group():
                        with gr.Row(equal_height=True):
                            with gr.Column(min_width=30, scale=1):
                                gr.Textbox(rubric_item['name'], label="Score", show_label=True, interactive=False)
                            with gr.Column(scale=8):
                                gr.Textbox(rubric_item['value'], label="Description", show_label=True, interactive=False)
                            with gr.Column(min_width=15, scale=1):
                                delete_btn = gr.Button("X", size="lg", variant="secondary")

                                def delete(rubric_item=rubric_item):
                                    # Copy first so the list held in state is
                                    # never mutated in place.
                                    remaining = list(rubric_items_list)
                                    remaining.remove(rubric_item)
                                    return remaining

                                delete_btn.click(delete, None, [rubric_items])  # This is the state variable

            gr.Button("Add Rubric Item").click(
                add_rubric_item,
                [rubric_items, new_rubric_name, new_rubric_value],
                [rubric_items, new_rubric_name, new_rubric_value]
            )

    with gr.Row():
        with gr.Column(scale=1, variant="panel"):
            gr.Markdown("**Evaluation**")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=15, scale=1):
                        score = gr.Textbox(label="Score", interactive=False, autoscroll=False, max_lines=1)
                    with gr.Column(scale=5):
                        feedback = gr.Textbox(label="Feedback", interactive=False, autoscroll=False, max_lines=6)
                    with gr.Column(min_width=15, scale=1):
                        evaluate_btn = gr.Button("Evaluate", variant="primary")
                        reset_all_btn = gr.Button("Clear All", variant="stop")  # Add Reset All button

    # The order of these outputs must match the tuple returned by reset_all().
    reset_all_btn.click(
        reset_all,
        inputs=[],
        outputs=[
            inputs_task,
            new_input_name,
            new_input_value,
            rubric_items,
            new_rubric_name,
            new_rubric_value,
            evaluation_criteria,  # Reset evaluation criteria
            output_name,  # Reset output name
            output_value,  # Reset output value
            feedback,  # Reset feedback
            score  # Reset score
        ]
    )

    evaluate_btn.click(
        evaluate,
        inputs=[inputs_task, output_name, output_value, evaluation_criteria, rubric_items],
        outputs=[feedback, score]
    )

    # One button per preset; clicking it loads that preset into the form.
    preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES]
    for i, button in enumerate(preset_buttons):
        # Bind the index as a default argument so each button keeps its own.
        def populate_preset(ex_i=i):
            return populate_fields(ex_i)

        button.click(
            populate_preset,  # Use the closure to pass the current index
            inputs=[],  # No direct inputs needed
            outputs=[
                inputs_task,
                output_name,
                output_value,
                evaluation_criteria,
                rubric_items
            ]
        )
def populate_fields(example_index: int):
    """Return the form values for the preset at ``example_index``.

    The list-valued fields are returned as copies so that whatever the UI
    later does with them cannot alter the module-level ``EXAMPLES`` presets.

    Args:
        example_index: Index into ``EXAMPLES``.

    Returns:
        ``(inputs_task, output_name, output_value, evaluation_criteria,
        rubric)`` in the order expected by the preset buttons' outputs.

    Raises:
        IndexError: If ``example_index`` is outside ``EXAMPLES``.
    """
    example = EXAMPLES[example_index]
    return (
        [dict(entry) for entry in example["inputs_task"]],
        example["output"]["name"],
        example["output"]["value"],
        example["evaluation_criteria"],
        [dict(entry) for entry in example["rubric"]]
    )
# Start serving the UI assembled above.
demo.launch(debug=True)