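"""Gradio interface for evaluating AI responses with Atla's Selene and Selene Mini judge models."""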

import json
import re

import gradio as gr

from dotenv import load_dotenv

load_dotenv()

from .gen_api_answer import (
    get_atla_response,
    get_selene_mini_response,
    parse_selene_mini_response,
)
from .prompts import (
    DEFAULT_EVAL_CRITERIA,
    DEFAULT_EVAL_PROMPT,
    DEFAULT_EVAL_PROMPT_EDITABLE,
    ATLA_PROMPT,
    ATLA_PROMPT_WITH_REFERENCE,
)
from .random_sample_generation import (
    get_random_human_ai_pair,
    get_random_human_ai_ground_truth_pair,
    generate_ai_response,
)
from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS


def parse_variables(prompt):
    """Extract unique {{variable}} names from a prompt template, preserving order."""
    variables = re.findall(r"{{(.*?)}}", prompt)
    # Deduplicate while keeping first-occurrence order; seen.add() returns None,
    # so the condition is True only the first time a name is seen.
    seen = set()
    variables = [
        x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
    ]
    return variables


def get_final_prompt(eval_prompt, variable_values):
    """Substitute each {{variable}} placeholder in the prompt with its value."""
    for var, val in variable_values.items():
        eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
    return eval_prompt


def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
    if compatible_mode:
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
    else:
        human_msg, ai_msg = get_random_human_ai_pair()
        ground_truth_msg = ""

    # Order matches the outputs lists wired to random_btn.click and interface.load.
    return [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),
        gr.update(value=""),
        gr.update(value=""),
        gr.update(value=ground_truth_msg, visible=compatible_mode),
    ]


def create_arena_interface():
    with gr.Blocks(theme="default", css=CSS_STYLES) as interface:
        # Hidden textbox holding the default evaluation prompt.
        eval_prompt = gr.Textbox(
            value=DEFAULT_EVAL_PROMPT,
            visible=False
        )
        with gr.Row():
            model_selector = gr.Dropdown(
                choices=["Selene", "Selene Mini"],
                value="Selene",
                label="Choose your Atla Model",
                interactive=True
            )

        with gr.Row():
            with gr.Column(scale=1):
                with gr.Group():
                    human_input = gr.TextArea(
                        label="👩 User Input",
                        lines=5,
                        placeholder="Enter the human message here..."
                    )
                    with gr.Row():
                        generate_btn = gr.Button(
                            "Generate AI Response",
                            size="sm",
                            interactive=False
                        )

                    ai_response = gr.TextArea(
                        label="🤖 AI Response",
                        lines=10,
                        placeholder="Enter the AI response here..."
                    )

                    ground_truth = gr.TextArea(
                        label="🎯 Ground truth response",
                        lines=10,
                        placeholder="Enter the ground truth response here...",
                        visible=False
                    )

                with gr.Row():
                    random_btn = gr.Button("🎲", scale=2)
                    send_btn = gr.Button(
                        value="Run evaluation",
                        variant="primary",
                        size="lg",
                        scale=8
                    )

            with gr.Column(scale=1):
                gr.Markdown("## 👩‍⚖️ Atla Evaluation")
                with gr.Group():
                    with gr.Row():
                        score = gr.Textbox(label="Score", lines=1, interactive=False)
                    critique = gr.TextArea(label="Critique", lines=12, interactive=False)

                gr.Markdown("<br>")

        with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
            gr.Markdown("<br>")
            use_reference_toggle = gr.Checkbox(
                label="Use a reference response",
                value=False
            )

            # Editor for the default (non-compatible) prompt; hidden by default.
            with gr.Column(visible=False) as default_prompt_editor:
                eval_prompt_editable = gr.TextArea(
                    value=DEFAULT_EVAL_PROMPT_EDITABLE,
                    label="Evaluation Criteria",
                    lines=12
                )

                with gr.Row(visible=False) as edit_buttons_row:
                    cancel_prompt_btn = gr.Button("Cancel")
                    save_prompt_btn = gr.Button("Save", variant="primary")

            # Editor for the compatible-mode criteria; shown by default.
            with gr.Column(visible=True) as compatible_prompt_editor:
                eval_criteria_text = gr.TextArea(
                    label="Evaluation Criteria",
                    lines=12,
                    value=DEFAULT_EVAL_CRITERIA,
                    placeholder="Enter the complete evaluation criteria and scoring rubric..."
                )
                with gr.Row(visible=False) as compatible_edit_buttons_row:
                    compatible_cancel_btn = gr.Button("Cancel")
                    compatible_save_btn = gr.Button("Save", variant="primary")

        eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)
        is_editing = gr.State(False)
        compatible_mode_state = gr.State(False)

        # Not currently wired to any event in this interface.
        def update_model_names(model_a, model_b):
            return gr.update(value=f"*Model: {model_a}*"), gr.update(
                value=f"*Model: {model_b}*"
            )

        last_submission = gr.State({})

        def save_criteria(new_criteria, previous_criteria):
            """Commit the edited criteria and hide the Save/Cancel buttons."""
            return [
                gr.update(value=new_criteria),
                new_criteria,
                gr.update(visible=False)
            ]

        def cancel_criteria(previous_criteria):
            """Restore the last saved criteria and hide the Save/Cancel buttons."""
            return [
                gr.update(value=previous_criteria),
                previous_criteria,
                gr.update(visible=False)
            ]

        def show_criteria_edit_buttons(current_value, previous_value):
            # Show Save/Cancel only when the text differs from the saved version.
            return gr.update(visible=current_value != previous_value)

        compatible_save_btn.click(
            fn=save_criteria,
            inputs=[eval_criteria_text, eval_prompt_previous],
            outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
        )

        compatible_cancel_btn.click(
            fn=cancel_criteria,
            inputs=[eval_prompt_previous],
            outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
        )

        eval_criteria_text.change(
            fn=show_criteria_edit_buttons,
            inputs=[eval_criteria_text, eval_prompt_previous],
            outputs=compatible_edit_buttons_row
        )

        def toggle_use_reference(checked):
            """Toggle the ground-truth field; when enabled, load a fresh example with a reference."""
            if checked:
                human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
                return {
                    ground_truth: gr.update(visible=True, value=ground_truth_msg),
                    human_input: gr.update(value=human_msg),
                    ai_response: gr.update(value=ai_msg),
                    score: gr.update(value=""),
                    critique: gr.update(value=""),
                    random_btn: gr.update(value="🎲", variant="secondary"),
                }
            else:
                # Returning a dict lets us update only the ground_truth component.
                return {
                    ground_truth: gr.update(visible=False)
                }

        use_reference_toggle.change(
            fn=toggle_use_reference,
            inputs=[use_reference_toggle],
            outputs=[
                ground_truth,
                human_input,
                ai_response,
                score,
                critique,
                random_btn,
            ]
        )

        first_game_state = gr.State(True)

        def submit_and_store(
            model_choice,
            use_reference,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ):
            """Run the selected Atla judge model and return score, critique, and button updates."""
            if model_choice == "Selene Mini":
                # Selene Mini expects a single fully formatted prompt string.
                prompt_template = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
                prompt = prompt_template.format(
                    human_input=human_input,
                    ai_response=ai_response,
                    eval_criteria=eval_criteria_text,
                    ground_truth=ground_truth if use_reference else ""
                )

                print("\n=== Debug: Prompt being sent to Selene Mini ===")
                print(prompt)
                print("============================================\n")

                raw_response = get_selene_mini_response(
                    model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
                    prompt=prompt,
                    max_tokens=500,
                    temperature=0.01
                )
                response = parse_selene_mini_response(raw_response)
            else:
                # The Selene API takes structured prompt data rather than a single string.
                prompt_data = {
                    'human_input': human_input,
                    'ai_response': ai_response,
                    'ground_truth': ground_truth if use_reference else None,
                    'eval_criteria': eval_criteria_text,
                }

                print("\n=== Debug: Prompt data being sent to Selene API ===")
                print(json.dumps(prompt_data, indent=2))
                print("============================================\n")

                response = get_atla_response(
                    model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
                    prompt=prompt_data,
                    max_tokens=500,
                    temperature=0.01
                )

            # Both paths should yield a dict with 'score' and 'critique'; anything
            # else is surfaced as an error string in the critique box.
            if isinstance(response, dict) and 'score' in response and 'critique' in response:
                score = str(response['score'])
                critique = response['critique']
            else:
                score = "Error"
                critique = str(response)

            return [
                score,
                critique,
                gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
                gr.update(value="🎲"),
            ]

        send_btn.click(
            fn=submit_and_store,
            inputs=[
                model_selector,
                use_reference_toggle,
                eval_criteria_text,
                human_input,
                ai_response,
                ground_truth,
            ],
            outputs=[
                score,
                critique,
                send_btn,
                random_btn,
            ],
        )

        random_btn.click(
            fn=populate_random_example,
            inputs=[use_reference_toggle],
            outputs=[
                human_input,
                ai_response,
                random_btn,
                score,
                critique,
                ground_truth,
            ]
        )

        def handle_input_change():
            """Reset UI state when inputs are changed."""
            return [
                gr.update(value="Run evaluation", variant="primary"),
                gr.update(value="🎲", variant="secondary"),
            ]

        human_input.change(
            fn=handle_input_change,
            inputs=[],
            outputs=[send_btn, random_btn]
        )

        ai_response.change(
            fn=handle_input_change,
            inputs=[],
            outputs=[send_btn, random_btn]
        )

        generate_btn.click(
            fn=lambda msg: (
                generate_ai_response(msg)[0],
                # Disable the button again until the input changes.
                gr.update(
                    value="Generate AI Response",
                    interactive=False
                )
            ),
            inputs=[human_input],
            outputs=[ai_response, generate_btn]
        )

        # Enable the generate button only when there is non-empty user input.
        human_input.change(
            fn=lambda x: gr.update(interactive=bool(x.strip())),
            inputs=[human_input],
            outputs=[generate_btn]
        )

        # Populate an initial example (without a reference) on page load.
        interface.load(
            fn=lambda: populate_random_example(None, False),
            inputs=[],
            outputs=[
                human_input,
                ai_response,
                random_btn,
                score,
                critique,
                ground_truth,
            ]
        )

    return interface


if __name__ == "__main__":
    demo = create_arena_interface()
    demo.launch()