# selene/random_sample/arena_interface.py
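"""Gradio interface for single-example evaluation with Atla's Selene judge models.

The user supplies a human message, an AI response, and optionally a ground-truth
response; the selected Selene model returns a score and a critique.
"""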
import json
import re
import gradio as gr
from dotenv import load_dotenv

# Load environment variables (API keys etc.) before the relative imports below,
# which may read them at import time.
load_dotenv()
from .gen_api_answer import (
get_atla_response,
get_selene_mini_response,
parse_selene_mini_response
)
from .prompts import (
DEFAULT_EVAL_CRITERIA,
DEFAULT_EVAL_PROMPT,
DEFAULT_EVAL_PROMPT_EDITABLE,
ATLA_PROMPT,
ATLA_PROMPT_WITH_REFERENCE
)
from .random_sample_generation import (
get_random_human_ai_pair,
get_random_human_ai_ground_truth_pair,
generate_ai_response
)
from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS


def parse_variables(prompt):
# Extract variables enclosed in double curly braces
variables = re.findall(r"{{(.*?)}}", prompt)
# Remove duplicates while preserving order
seen = set()
variables = [
x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
]
return variables
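
# Example: parse_variables("Score the {{ai_response}} to the {{human_input}}")
# returns ["ai_response", "human_input"].
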
def get_final_prompt(eval_prompt, variable_values):
# Replace variables in the eval prompt with their values
for var, val in variable_values.items():
eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
return eval_prompt
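
# Example: get_final_prompt("Rate {{answer}}.", {"answer": "Paris"}) returns "Rate Paris."
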
def populate_random_example(request: gr.Request, compatible_mode: bool):
"""Generate a random human-AI conversation example and reset judge outputs."""
if compatible_mode:
human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
else:
human_msg, ai_msg = get_random_human_ai_pair()
ground_truth_msg = ""
return [
gr.update(value=human_msg),
gr.update(value=ai_msg),
gr.update(value="🎲", variant="secondary"),
gr.update(value=""), # Clear score
gr.update(value=""), # Clear critique
gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
]
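
# Note: populate_random_example's return values are positional and must stay in
# the same order as the `outputs` lists wired to random_btn.click and
# interface.load inside create_arena_interface below.
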
def create_arena_interface():
with gr.Blocks(theme="default", css=CSS_STYLES) as interface:
        # Hidden eval prompt holding DEFAULT_EVAL_PROMPT (currently unused)
eval_prompt = gr.Textbox(
value=DEFAULT_EVAL_PROMPT,
visible=False
)
with gr.Row():
# Add model selector dropdown at the top
model_selector = gr.Dropdown(
choices=["Selene", "Selene Mini"],
value="Selene",
label="Choose your Atla Model",
interactive=True
)
with gr.Row():
# Left side - Input section
with gr.Column(scale=1):
with gr.Group():
human_input = gr.TextArea(
label="πŸ‘© User Input",
lines=5,
placeholder="Enter the human message here..."
)
with gr.Row():
generate_btn = gr.Button(
"Generate AI Response",
size="sm",
interactive=False
)
ai_response = gr.TextArea(
label="πŸ€– AI Response",
lines=10,
placeholder="Enter the AI response here..."
)
# Ground truth response (initially hidden)
ground_truth = gr.TextArea(
label="🎯 Ground truth response",
lines=10,
placeholder="Enter the ground truth response here...",
visible=False
)
with gr.Row():
random_btn = gr.Button("🎲", scale=2)
send_btn = gr.Button(
value="Run evaluation",
variant="primary",
size="lg",
scale=8
)
# Right side - Model outputs
with gr.Column(scale=1):
gr.Markdown("## πŸ‘©β€βš–οΈ Atla Evaluation")
with gr.Group():
with gr.Row():
score = gr.Textbox(label="Score", lines=1, interactive=False)
critique = gr.TextArea(label="Critique", lines=12, interactive=False)
gr.Markdown("<br>")
        # Judge prompt editor (collapsed by default)
with gr.Accordion("πŸ“ Edit Judge Prompt", open=False) as prompt_accordion:
gr.Markdown("<br>")
use_reference_toggle = gr.Checkbox(
label="Use a reference response",
value=False
)
            # Default prompt editor (hidden; its save/cancel buttons are never wired up)
with gr.Column(visible=False) as default_prompt_editor:
eval_prompt_editable = gr.TextArea(
value=DEFAULT_EVAL_PROMPT_EDITABLE,
label="Evaluation Criteria",
lines=12
)
with gr.Row(visible=False) as edit_buttons_row:
cancel_prompt_btn = gr.Button("Cancel")
save_prompt_btn = gr.Button("Save", variant="primary")
# Show the compatible mode editor
with gr.Column(visible=True) as compatible_prompt_editor:
eval_criteria_text = gr.TextArea(
label="Evaluation Criteria",
lines=12,
value=DEFAULT_EVAL_CRITERIA,
placeholder="Enter the complete evaluation criteria and scoring rubric..."
)
with gr.Row(visible=False) as compatible_edit_buttons_row:
compatible_cancel_btn = gr.Button("Cancel")
compatible_save_btn = gr.Button("Save", variant="primary")
        eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)  # Last saved criteria
        is_editing = gr.State(False)  # Track editing state (currently unused)
        compatible_mode_state = gr.State(False)  # Track compatible mode (currently unused)
        # Update model names after responses are generated (currently unused)
        def update_model_names(model_a, model_b):
            return gr.update(value=f"*Model: {model_a}*"), gr.update(
                value=f"*Model: {model_b}*"
            )

        # Store the last submitted prompt and variables for comparison (currently unused)
        last_submission = gr.State({})
        # Save/cancel handlers for the criteria editor
def save_criteria(new_criteria, previous_criteria):
return [
gr.update(value=new_criteria), # Update the criteria
new_criteria, # Update the previous criteria state
gr.update(visible=False) # Hide the buttons
]
def cancel_criteria(previous_criteria):
return [
gr.update(value=previous_criteria), # Revert to previous criteria
previous_criteria, # Keep the previous criteria state
gr.update(visible=False) # Hide the buttons
]
def show_criteria_edit_buttons(current_value, previous_value):
# Show buttons only if the current value differs from the previous value
return gr.update(visible=current_value != previous_value)
# Add handlers for save/cancel buttons and criteria changes
compatible_save_btn.click(
fn=save_criteria,
inputs=[eval_criteria_text, eval_prompt_previous],
outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
)
compatible_cancel_btn.click(
fn=cancel_criteria,
inputs=[eval_prompt_previous],
outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
)
eval_criteria_text.change(
fn=show_criteria_edit_buttons,
inputs=[eval_criteria_text, eval_prompt_previous],
outputs=compatible_edit_buttons_row
)
        # Toggle ground-truth visibility when the reference-response checkbox changes.
        # Returning a dict keyed by component lets Gradio update only a subset of outputs.
def toggle_use_reference(checked):
if checked:
human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
return {
ground_truth: gr.update(visible=True, value=ground_truth_msg),
human_input: gr.update(value=human_msg),
ai_response: gr.update(value=ai_msg),
score: gr.update(value=""),
critique: gr.update(value=""),
random_btn: gr.update(value="🎲", variant="secondary"),
}
else:
return {
ground_truth: gr.update(visible=False)
}
        # Wire the reference checkbox to every component it affects
use_reference_toggle.change(
fn=toggle_use_reference,
inputs=[use_reference_toggle],
outputs=[
ground_truth,
human_input,
ai_response,
score,
critique,
random_btn,
]
)
        # Track whether this is the user's first game (currently unused)
        first_game_state = gr.State(True)
        # Submit handler covering both the Selene API and Selene Mini
def submit_and_store(
model_choice,
use_reference,
eval_criteria_text,
human_input,
ai_response,
ground_truth,
):
if model_choice == "Selene Mini":
# Prepare prompt based on reference mode
prompt_template = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
prompt = prompt_template.format(
human_input=human_input,
ai_response=ai_response,
eval_criteria=eval_criteria_text,
ground_truth=ground_truth if use_reference else ""
)
print("\n=== Debug: Prompt being sent to Selene Mini ===")
print(prompt)
print("============================================\n")
# Get and parse response
raw_response = get_selene_mini_response(
model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
prompt=prompt,
max_tokens=500,
temperature=0.01
)
response = parse_selene_mini_response(raw_response)
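                # parse_selene_mini_response is expected to return a dict with
                # "score" and "critique" keys (see the shared check below).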
else:
# Selene API logic
prompt_data = {
'human_input': human_input,
'ai_response': ai_response,
'ground_truth': ground_truth if use_reference else None,
'eval_criteria': eval_criteria_text,
}
print("\n=== Debug: Prompt data being sent to Selene API ===")
print(json.dumps(prompt_data, indent=2))
print("============================================\n")
response = get_atla_response(
model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
prompt=prompt_data,
max_tokens=500,
temperature=0.01
)
            # Both paths should yield a dict containing the score and critique
if isinstance(response, dict) and 'score' in response and 'critique' in response:
score = str(response['score'])
critique = response['critique']
else:
score = "Error"
critique = str(response)
return [
score,
critique,
gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
gr.update(value="🎲"),
]
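
        # The four values above map positionally onto the send_btn.click
        # outputs declared below: score, critique, send_btn, random_btn.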
        # Run the evaluation when the send button is clicked
send_btn.click(
fn=submit_and_store,
inputs=[
model_selector,
use_reference_toggle,
eval_criteria_text,
human_input,
ai_response,
ground_truth,
],
outputs=[
score,
critique,
send_btn,
random_btn,
],
)
# Add random button handler
random_btn.click(
fn=populate_random_example,
inputs=[use_reference_toggle],
outputs=[
human_input,
ai_response,
random_btn,
score,
critique,
ground_truth,
]
)
# Add input change handlers
def handle_input_change():
"""Reset UI state when inputs are changed"""
return [
gr.update(value="Run evaluation", variant="primary"), # send_btn
gr.update(value="🎲", variant="secondary"), # random_btn
]
        # Reset the buttons whenever either input changes
human_input.change(
fn=handle_input_change,
inputs=[],
outputs=[send_btn, random_btn]
)
ai_response.change(
fn=handle_input_change,
inputs=[],
outputs=[send_btn, random_btn]
)
generate_btn.click(
fn=lambda msg: (
generate_ai_response(msg)[0], # Only take the response text
gr.update(
value="Generate AI Response", # Keep the label
interactive=False # Disable the button
)
),
inputs=[human_input],
outputs=[ai_response, generate_btn]
)
        # Enable "Generate AI Response" only when the user input is non-empty
        human_input.change(
fn=lambda x: gr.update(interactive=bool(x.strip())),
inputs=[human_input],
outputs=[generate_btn]
)
        # Populate a random example when the interface first loads
interface.load(
fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
inputs=[],
outputs=[
human_input,
ai_response,
random_btn,
score,
critique,
ground_truth,
]
)
return interface


if __name__ == "__main__":
demo = create_arena_interface()
demo.launch()
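    # Assumption: default launch settings are enough for local use; a public
    # link can be exposed by passing share=True to demo.launch().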