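"""Gradio demo: evaluate a model response against custom evaluation criteria and a rubric with Flow Judge."""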
import gradio as gr
import pandas as pd
from typing import List, Dict
from flow_judge import Vllm, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem

try:
    model = Vllm(quantized=False)
except Exception as e:
    raise RuntimeError(f"Failed to initialize Vllm: {e}")
EXAMPLES = [
    {
        "example_description": "Faithfulness of an answer",
        "emoji": "🏈",
        "task_inputs": [
            {"name": "Question", "value": "What is the capital of France?"},
            {"name": "Context", "value": "Paris is the capital of Spain."},
        ],
        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": (
            "Based on the provided context, does the response contain only information "
            "that is supported by or directly inferable from the context?"
        ),
        "rubric": [
            "The response contains statements or claims that cannot be directly found in or "
            "logically inferred from the provided context. There is hallucinated or fabricated "
            "information present in the response that does not have support in the given context.",
            "The response contains only statements and claims that are directly stated in or "
            "logically inferable from the provided context. There is no hallucinated or fabricated "
            "information present in the response that cannot be traced back to or deduced from the context.",
        ],
    }
]
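# Note: the rubric list index doubles as the score (see populate_fields below),
# so item 0 describes a failing (score 0) response and item 1 a passing (score 1) response.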
def populate_fields(example_index: int):
    """Fill the UI fields with one of the predefined examples."""
    example = EXAMPLES[example_index]
    return (
        [[task_input["name"], task_input["value"]] for task_input in example["task_inputs"]],
        [[example["task_output"]["name"], example["task_output"]["value"]]],
        example["evaluation_criteria"],
        [[str(i), description] for i, description in enumerate(example["rubric"])],
    )
def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
    """Run Flow Judge on the current inputs, output, criteria, and rubric."""
    # Convert the dataframes to the format expected by flow-judge
    eval_input = EvalInput(
        inputs=[{row['Name']: row['Value']} for _, row in task_inputs.iterrows()],
        output={row['Name']: row['Value'] for _, row in task_output.iterrows()}
    )
    # Parse the rubric rows into RubricItems
    rubric_items = [
        RubricItem(score=int(row['Score']), description=row['Description'])
        for _, row in rubric.iterrows()
    ]
    # Create the CustomMetric
    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=rubric_items,
        required_inputs=[input_row['Name'] for _, input_row in task_inputs.iterrows()],
        required_output=task_output.iloc[0]['Name']
    )
    # Create a FlowJudge instance
    judge = FlowJudge(model=model, metric=custom_metric)
    # Evaluate using FlowJudge
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}")
    # Extract feedback and score from the result
    feedback = result.feedback
    score = result.score
    return feedback, score
def reset_fields():
    return (
        [["", ""]],  # task_inputs
        [["", ""]],  # task_output
        "",          # evaluation_criteria
        [["", ""]],  # rubric
        "",          # feedback
        ""           # score
    )

def reset_task():
    return (
        [["", ""]],  # task_inputs
        [["", ""]]   # task_output
    )

def reset_evaluation_criteria():
    return (
        "",          # evaluation_criteria
        [["", ""]]   # rubric
    )
with gr.Blocks() as demo:
    with gr.Row():
        example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]

    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            task_inputs = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            add_input_btn = gr.Button("Add Input")
            gr.Markdown("**Output**")
            task_output = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"]
            )
            reset_task_btn = gr.Button("Clear Inputs and Output")

        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
            rubric = gr.Dataframe(
                headers=["Score", "Description"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["10%", "90%"]
            )
            add_score_btn = gr.Button("Add Score")
            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")

    with gr.Row():
        with gr.Column(scale=1, variant="compact"):
            gr.Markdown("**Evaluation**")
            feedback = gr.Textbox(label="Feedback")
            score = gr.Textbox(label="Score")
            evaluate_btn = gr.Button("Evaluate")

    with gr.Row():
        # Add the reset button
        reset_all_btn = gr.Button("Clear All")
    # Event handlers
    add_input_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Name", "Value"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["30%", "70%"]),
        inputs=task_inputs,
        outputs=task_inputs
    )
    add_score_btn.click(
        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
                                headers=["Score", "Description"],
                                col_count=(2, "fixed"),
                                datatype=["str", "str"],
                                row_count=1,
                                column_widths=["10%", "90%"]),
        inputs=rubric,
        outputs=rubric
    )
    for i, button in enumerate(example_buttons):
        button.click(
            populate_fields,
            inputs=[gr.State(i)],  # Pass the example index as a state
            outputs=[task_inputs, task_output, evaluation_criteria, rubric]
        )
    evaluate_btn.click(
        evaluate,
        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
        outputs=[feedback, score]
    )
    reset_task_btn.click(
        reset_task,
        inputs=[],
        outputs=[task_inputs, task_output]
    )
    reset_criteria_btn.click(
        reset_evaluation_criteria,
        inputs=[],
        outputs=[evaluation_criteria, rubric]
    )
    reset_all_btn.click(
        reset_fields,
        inputs=[],
        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score]
    )
if __name__ == "__main__":
    demo.launch(debug=True)