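"""Gradio app for judging human-AI conversations with Atla's Selene models.

Users enter (or randomly sample) a conversation, optionally supply a ground
truth reference, and receive a score and critique from Selene or Selene Mini.
"""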
import json
import re

import gradio as gr
from dotenv import load_dotenv

# Load environment variables before the package imports below
load_dotenv()

from .gen_api_answer import (
    get_atla_response,
    get_selene_mini_response,
    parse_selene_mini_response,
)
from .prompts import (
    DEFAULT_EVAL_CRITERIA,
    DEFAULT_EVAL_PROMPT,
    DEFAULT_EVAL_PROMPT_EDITABLE,
    ATLA_PROMPT,
    ATLA_PROMPT_WITH_REFERENCE,
)
from .random_sample_generation import (
    get_random_human_ai_pair,
    get_random_human_ai_ground_truth_pair,
    generate_ai_response,
)
from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS

def parse_variables(prompt):
    # Extract variables enclosed in double curly braces
    variables = re.findall(r"{{(.*?)}}", prompt)
    # Remove duplicates while preserving order
    seen = set()
    variables = [
        x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
    ]
    return variables
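
# Illustrative example (hypothetical input, not part of the app flow):
#   parse_variables("Score {{ai_response}} against {{ground_truth}} and {{ai_response}}")
#   -> ["ai_response", "ground_truth"]   (duplicates removed, order preserved)
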
def get_final_prompt(eval_prompt, variable_values):
    # Replace variables in the eval prompt with their values
    for var, val in variable_values.items():
        eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
    return eval_prompt
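
# Illustrative example (hypothetical input, not part of the app flow):
#   get_final_prompt("Rate this reply: {{ai_response}}", {"ai_response": "Hello!"})
#   -> "Rate this reply: Hello!"
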
def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
    # Gradio injects the gr.Request argument automatically for type-annotated
    # parameters; only compatible_mode is wired up as an input.
    if compatible_mode:
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
    else:
        human_msg, ai_msg = get_random_human_ai_pair()
        ground_truth_msg = ""

    return [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),
        gr.update(value=""),  # Clear score
        gr.update(value=""),  # Clear critique
        gr.update(value=ground_truth_msg, visible=compatible_mode),  # Set ground truth and visibility
    ]

def create_arena_interface():
    with gr.Blocks(theme="default", css=CSS_STYLES) as interface:
        # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
        eval_prompt = gr.Textbox(
            value=DEFAULT_EVAL_PROMPT,
            visible=False,
        )
        with gr.Row():
            # Model selector dropdown at the top
            model_selector = gr.Dropdown(
                choices=["Selene", "Selene Mini"],
                value="Selene",
                label="Choose your Atla Model",
                interactive=True,
            )
        with gr.Row():
            # Left side - Input section
            with gr.Column(scale=1):
                with gr.Group():
                    human_input = gr.TextArea(
                        label="👩 User Input",
                        lines=5,
                        placeholder="Enter the human message here...",
                    )
                    with gr.Row():
                        generate_btn = gr.Button(
                            "Generate AI Response",
                            size="sm",
                            interactive=False,
                        )
                    ai_response = gr.TextArea(
                        label="🤖 AI Response",
                        lines=10,
                        placeholder="Enter the AI response here...",
                    )
                    # Ground truth response (initially hidden)
                    ground_truth = gr.TextArea(
                        label="🎯 Ground truth response",
                        lines=10,
                        placeholder="Enter the ground truth response here...",
                        visible=False,
                    )
                with gr.Row():
                    random_btn = gr.Button("🎲", scale=2)
                    send_btn = gr.Button(
                        value="Run evaluation",
                        variant="primary",
                        size="lg",
                        scale=8,
                    )
            # Right side - Model outputs
            with gr.Column(scale=1):
                gr.Markdown("## 👩‍⚖️ Selene-Mini Evaluation")
                with gr.Group():
                    with gr.Row():
                        score = gr.Textbox(label="Score", lines=1, interactive=False)
                    critique = gr.TextArea(label="Critique", lines=12, interactive=False)
                gr.Markdown("<br>")
        # "Edit Judge Prompt" accordion
        with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
            gr.Markdown("<br>")
            use_reference_toggle = gr.Checkbox(
                label="Use a reference response",
                value=False,
            )
            # Hide the default prompt editor
            with gr.Column(visible=False) as default_prompt_editor:
                eval_prompt_editable = gr.TextArea(
                    value=DEFAULT_EVAL_PROMPT_EDITABLE,
                    label="Evaluation Criteria",
                    lines=12,
                )
                with gr.Row(visible=False) as edit_buttons_row:
                    cancel_prompt_btn = gr.Button("Cancel")
                    save_prompt_btn = gr.Button("Save", variant="primary")
            # Show the compatible mode editor
            with gr.Column(visible=True) as compatible_prompt_editor:
                eval_criteria_text = gr.TextArea(
                    label="Evaluation Criteria",
                    lines=12,
                    value=DEFAULT_EVAL_CRITERIA,
                    placeholder="Enter the complete evaluation criteria and scoring rubric...",
                )
                with gr.Row(visible=False) as compatible_edit_buttons_row:
                    compatible_cancel_btn = gr.Button("Cancel")
                    compatible_save_btn = gr.Button("Save", variant="primary")

        eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)  # Initialize with default value
        is_editing = gr.State(False)  # Track editing state
        compatible_mode_state = gr.State(False)  # Track compatible mode state
        # Update model names after responses are generated
        def update_model_names(model_a, model_b):
            return gr.update(value=f"*Model: {model_a}*"), gr.update(
                value=f"*Model: {model_b}*"
            )

        # Store the last submitted prompt and variables for comparison
        last_submission = gr.State({})
        # Save/cancel handlers for the compatible prompt editor
        def save_criteria(new_criteria, previous_criteria):
            return [
                gr.update(value=new_criteria),  # Update the criteria
                new_criteria,  # Update the previous criteria state
                gr.update(visible=False),  # Hide the buttons
            ]

        def cancel_criteria(previous_criteria):
            return [
                gr.update(value=previous_criteria),  # Revert to previous criteria
                previous_criteria,  # Keep the previous criteria state
                gr.update(visible=False),  # Hide the buttons
            ]

        def show_criteria_edit_buttons(current_value, previous_value):
            # Show buttons only if the current value differs from the previous value
            return gr.update(visible=current_value != previous_value)
        # Wire up the save/cancel buttons and criteria change events
        compatible_save_btn.click(
            fn=save_criteria,
            inputs=[eval_criteria_text, eval_prompt_previous],
            outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row],
        )
        compatible_cancel_btn.click(
            fn=cancel_criteria,
            inputs=[eval_prompt_previous],
            outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row],
        )
        eval_criteria_text.change(
            fn=show_criteria_edit_buttons,
            inputs=[eval_criteria_text, eval_prompt_previous],
            outputs=compatible_edit_buttons_row,
        )
        # Toggle ground truth visibility based on the reference-response checkbox
        def toggle_use_reference(checked):
            if checked:
                human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
                return {
                    ground_truth: gr.update(visible=True, value=ground_truth_msg),
                    human_input: gr.update(value=human_msg),
                    ai_response: gr.update(value=ai_msg),
                    score: gr.update(value=""),
                    critique: gr.update(value=""),
                    random_btn: gr.update(value="🎲", variant="secondary"),
                }
            else:
                # Returning a dict lets us update only a subset of the declared outputs
                return {
                    ground_truth: gr.update(visible=False)
                }
        # The change handler lists every component the toggle may update
        use_reference_toggle.change(
            fn=toggle_use_reference,
            inputs=[use_reference_toggle],
            outputs=[
                ground_truth,
                human_input,
                ai_response,
                score,
                critique,
                random_btn,
            ],
        )
        # State variable tracking whether this is the first game
        first_game_state = gr.State(True)  # Initialize as True
        # Submit handler that routes the evaluation to either model
        def submit_and_store(
            model_choice,
            use_reference,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ):
            if model_choice == "Selene Mini":
                # Prepare prompt based on reference mode
                prompt_template = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
                prompt = prompt_template.format(
                    human_input=human_input,
                    ai_response=ai_response,
                    eval_criteria=eval_criteria_text,
                    ground_truth=ground_truth if use_reference else "",
                )

                print("\n=== Debug: Prompt being sent to Selene Mini ===")
                print(prompt)
                print("============================================\n")

                # Get and parse response
                raw_response = get_selene_mini_response(
                    model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
                    prompt=prompt,
                    max_tokens=500,
                    temperature=0.01,
                )
                response = parse_selene_mini_response(raw_response)
            else:
                # Selene API logic
                prompt_data = {
                    "human_input": human_input,
                    "ai_response": ai_response,
                    "ground_truth": ground_truth if use_reference else None,
                    "eval_criteria": eval_criteria_text,
                }

                print("\n=== Debug: Prompt data being sent to Selene API ===")
                print(json.dumps(prompt_data, indent=2))
                print("============================================\n")

                response = get_atla_response(
                    model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
                    prompt=prompt_data,
                    max_tokens=500,
                    temperature=0.01,
                )

            # Both paths yield a dict with "score" and "critique" on success
            if isinstance(response, dict) and "score" in response and "critique" in response:
                score = str(response["score"])
                critique = response["critique"]
            else:
                score = "Error"
                critique = str(response)

            return [
                score,
                critique,
                gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
                gr.update(value="🎲"),
            ]
        # Wire the send button to the evaluation function
        send_btn.click(
            fn=submit_and_store,
            inputs=[
                model_selector,
                use_reference_toggle,
                eval_criteria_text,
                human_input,
                ai_response,
                ground_truth,
            ],
            outputs=[
                score,
                critique,
                send_btn,
                random_btn,
            ],
        )
        # Random example button handler
        random_btn.click(
            fn=populate_random_example,
            inputs=[use_reference_toggle],
            outputs=[
                human_input,
                ai_response,
                random_btn,
                score,
                critique,
                ground_truth,
            ],
        )
        # Reset button states whenever an input changes
        def handle_input_change():
            """Reset UI state when inputs are changed"""
            return [
                gr.update(value="Run evaluation", variant="primary"),  # send_btn
                gr.update(value="🎲", variant="secondary"),  # random_btn
            ]

        human_input.change(
            fn=handle_input_change,
            inputs=[],
            outputs=[send_btn, random_btn],
        )
        ai_response.change(
            fn=handle_input_change,
            inputs=[],
            outputs=[send_btn, random_btn],
        )
        generate_btn.click(
            fn=lambda msg: (
                generate_ai_response(msg)[0],  # Only take the response text
                gr.update(
                    value="Generate AI Response",  # Keep the label
                    interactive=False,  # Disable the button
                ),
            ),
            inputs=[human_input],
            outputs=[ai_response, generate_btn],
        )

        # Enable the generate button only when there is user input
        human_input.change(
            fn=lambda x: gr.update(interactive=bool(x.strip())),
            inputs=[human_input],
            outputs=[generate_btn],
        )
        # Populate a random example when the interface loads
        interface.load(
            fn=lambda: populate_random_example(None, False),  # Pass False for initial compatible_mode
            inputs=[],
            outputs=[
                human_input,
                ai_response,
                random_btn,
                score,
                critique,
                ground_truth,
            ],
        )
    return interface


if __name__ == "__main__":
    demo = create_arena_interface()
    demo.launch()