Spaces:

AtlaAI
/

selene

Running

File size: 15,604 Bytes

ff974dc
 
 
 
e334572
 
ff974dc

import json
import re
from datetime import datetime
import hashlib
import gradio as gr

from gen_api_answer import (
     atla_parse_model_response,
    get_atla_response
)

from prompts import (
    ATLA_PROMPT,
    ATLA_PROMPT_WITH_REFERENCE
)

from random_sample_generation import (
    get_random_human_ai_pair,
    get_random_human_ai_ground_truth_pair,
    generate_ai_response
)   

from utils import Vote

from prompts import (
    DEFAULT_EVAL_PROMPT,
    DEFAULT_EVAL_PROMPT_EDITABLE,
    FIXED_EVAL_SUFFIX,
    DEFAULT_EVAL_CRITERIA
)

from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS


# Load the model_data from JSONL
def load_model_data():
    """Load model metadata from ``data/models.jsonl``.

    Every non-blank line of the file is parsed as one JSON object that must
    provide the keys ``name``, ``organization``, ``license`` and
    ``api_model``.

    Returns:
        A dict mapping each record's ``name`` to a dict with its
        ``organization``, ``license`` and ``api_model`` values. An empty
        dict is returned (after printing a warning) when the file does not
        exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    model_data = {}
    try:
        # Be explicit about the encoding: the platform default is not UTF-8
        # everywhere.
        with open("data/models.jsonl", "r", encoding="utf-8") as f:
            for line in f:
                # Empty / whitespace-only lines are not records; json.loads
                # would raise on them, so skip instead of crashing at import.
                if not line.strip():
                    continue
                model = json.loads(line)
                model_data[model["name"]] = {
                    "organization": model["organization"],
                    "license": model["license"],
                    "api_model": model["api_model"],
                }
    except FileNotFoundError:
        print("Warning: models.jsonl not found")
        return {}
    return model_data


# Model metadata, loaded once when this module is imported; an empty dict
# when data/models.jsonl is missing.
model_data = load_model_data()

def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
    """Build a ``Vote`` record for one comparison and hand it to ``add_vote``.

    ``prompt`` may be a plain value or an object exposing a ``.value``
    attribute; in the latter case the attribute is what gets recorded. The
    record is stamped with the current local time in ISO-8601 form.

    NOTE(review): ``add_vote`` and ``db`` are neither imported nor defined
    in the visible part of this file - confirm where they come from.
    """
    # Unwrap objects carrying a ``.value``; anything else is used as-is.
    stored_prompt = getattr(prompt, "value", prompt)

    record = Vote(
        timestamp=datetime.now().isoformat(),
        prompt=stored_prompt,
        response_a=response_a,
        response_b=response_b,
        model_a=model_a,
        model_b=model_b,
        winner=winner,
        judge_id=judge_id,
    )
    add_vote(record, db)


def parse_variables(prompt):
    """Return the distinct ``{{variable}}`` names found in ``prompt``.

    Names are stripped of surrounding whitespace and returned in the order
    of their first appearance.
    """
    stripped_names = (raw.strip() for raw in re.findall(r"{{(.*?)}}", prompt))
    # dict keys keep insertion order, which gives an order-preserving de-dup.
    return list(dict.fromkeys(stripped_names))


def get_final_prompt(eval_prompt, variable_values):
    """Substitute ``{{variable}}`` placeholders in ``eval_prompt``.

    Whitespace between the braces and the variable name is tolerated, so
    ``{{ name }}`` is filled in just like ``{{name}}``. This keeps the
    function consistent with ``parse_variables``, which reports placeholder
    names with surrounding whitespace stripped.

    Args:
        eval_prompt: Template text containing ``{{name}}`` placeholders.
        variable_values: Mapping of variable name to replacement string.
            Variables are applied one after another in mapping order.

    Returns:
        The prompt with every placeholder of a known variable replaced.
        Placeholders whose name is not in ``variable_values`` are left
        untouched.
    """
    for var, val in variable_values.items():
        # re.escape: the variable name is literal text, not a pattern.
        placeholder = re.compile(r"\{\{\s*" + re.escape(var) + r"\s*\}\}")
        # A callable replacement inserts the value verbatim; a plain string
        # would have its backslash sequences interpreted by re.sub.
        eval_prompt = placeholder.sub(lambda _match, _val=val: _val, eval_prompt)
    return eval_prompt



def get_ip(request: gr.Request) -> str:
    """Return a short hashed identifier derived from the client's IP address.

    The address is taken from the first source that is present, in order:
    the ``cf-connecting-ip`` header, the first entry of the
    ``x-forwarded-for`` header, then ``request.client.host``. Surrounding
    whitespace is removed so the same address always hashes to the same
    identifier. If no address can be determined, the literal ``"unknown"``
    is hashed instead of raising.

    Returns:
        The first 16 hex characters of the SHA-256 digest of the address.
    """
    headers = request.headers
    if "cf-connecting-ip" in headers:
        ip = headers["cf-connecting-ip"]
    elif "x-forwarded-for" in headers:
        # The header may carry a comma-separated list; only the first entry
        # is used.
        ip = headers["x-forwarded-for"].split(",")[0]
    else:
        # ``request.client`` can be absent, so do not assume ``.host``.
        ip = getattr(request.client, "host", None)

    ip = (ip or "").strip() or "unknown"

    # Hash the IP address for privacy
    return hashlib.sha256(ip.encode()).hexdigest()[:16]


def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
    """Generate the (title, message) pair shown after a vote.

    A ``"Tie"`` is answered immediately. Otherwise the current leaderboard
    positions of the two models are compared: when the chosen side has the
    numerically lower position the title is "The favourite wins!", in every
    other case "The underdog wins!". A model missing from the rankings is
    given position 0.

    NOTE(review): ``get_current_votes``, ``get_leaderboard`` and
    ``get_model_rankings`` are neither imported nor defined in the visible
    part of this file - confirm where they come from.

    Returns:
        ``(title, message)`` tuple of strings.
    """
    message = "Keep voting responsibly 🤗"

    # A tie does not depend on rankings, so answer before doing any
    # leaderboard work.
    if choice == "Tie":
        return "It's a tie!", message

    # Get current rankings
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    pos_a = rankings.get(model_a, 0)
    pos_b = rankings.get(model_b, 0)

    # Check if vote aligns with leaderboard
    if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
        return "The favourite wins!", message
    return "The underdog wins!", message


def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Pick a random example and build the UI updates that display it.

    With ``compatible_mode`` set, the example comes with a ground-truth
    response and the ground-truth box is made visible; otherwise the
    ground truth is empty and the box is hidden. ``request`` is not used.

    Returns:
        A list of six updates, in order: human message, AI response,
        random-button reset, cleared score, cleared critique, ground truth.
    """
    ground_truth_msg = ""
    if not compatible_mode:
        human_msg, ai_msg = get_random_human_ai_pair()
    else:
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()

    human_update = gr.update(value=human_msg)
    ai_update = gr.update(value=ai_msg)
    random_btn_update = gr.update(value="🎲", variant="secondary")
    cleared_score = gr.update(value="")
    cleared_critique = gr.update(value="")
    # The ground-truth box is shown only in compatible mode.
    ground_truth_update = gr.update(value=ground_truth_msg, visible=compatible_mode)

    return [
        human_update,
        ai_update,
        random_btn_update,
        cleared_score,
        cleared_critique,
        ground_truth_update,
    ]


# Build the UI. Components are created in layout order; the event wiring
# further down in this same `with` block refers to them by variable name.
with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
    gr.Markdown(MAIN_TITLE)
    gr.Markdown(HOW_IT_WORKS)
    
    # Hidden textbox initialised with DEFAULT_EVAL_PROMPT.
    # NOTE(review): not referenced by any handler in the visible code.
    eval_prompt = gr.Textbox(
        value=DEFAULT_EVAL_PROMPT,
        visible=False
    )

    with gr.Tabs():
        with gr.TabItem("Playground"):
            with gr.Row():
                # Left side - Input section
                with gr.Column(scale=1):
                    with gr.Group():
                        human_input = gr.TextArea(
                            label="👩 User Input",
                            lines=5,
                            placeholder="Enter the human message here..."
                        )
                        with gr.Row():
                            # Starts disabled; a change handler on
                            # human_input enables it once there is
                            # non-blank text.
                            generate_btn = gr.Button(
                                "Generate AI Response",
                                size="sm",
                                interactive=False
                            )
                        
                        ai_response = gr.TextArea(
                            label="🤖 AI Response", 
                            lines=10,
                            placeholder="Enter the AI response here..."
                        )
                        
                        # Ground truth response (initially hidden); shown
                        # when the "Use a reference response" box is ticked.
                        ground_truth = gr.TextArea(
                            label="🎯 Ground truth response",
                            lines=10,
                            placeholder="Enter the ground truth response here...",
                            visible=False
                        )
                        
                    with gr.Row():
                        # Random-example button (narrow) next to the main
                        # evaluation button (wide).
                        random_btn = gr.Button("🎲", scale=2)
                        send_btn = gr.Button(
                            value="Run evaluation",
                            variant="primary",
                            size="lg",
                            scale=8
                        )

                # Right side - Model outputs (read-only)
                with gr.Column(scale=1):
                    gr.Markdown("### 👩‍⚖️ Selene-Mini Evaluation")
                    with gr.Group():
                        with gr.Row():
                            score = gr.Textbox(label="Score", lines=1, interactive=False)
                        critique = gr.TextArea(label="Critique", lines=12, interactive=False)
                
            gr.Markdown("<br>")
            

            # Collapsible section for editing the judge prompt / criteria.
            with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
                gr.Markdown("<br>")
                use_reference_toggle = gr.Checkbox(
                    label="Use a reference response",
                    value=False
                )
                
                # Default prompt editor, created hidden. Nothing in the
                # visible code makes this column visible.
                with gr.Column(visible=False) as default_prompt_editor:
                    eval_prompt_editable = gr.TextArea(
                        value=DEFAULT_EVAL_PROMPT_EDITABLE,
                        label="Evaluation Criteria",
                        lines=12
                    )

                    # Save/Cancel row, revealed while the editor text
                    # differs from the last saved prompt.
                    with gr.Row(visible=False) as edit_buttons_row:
                        cancel_prompt_btn = gr.Button("Cancel")
                        save_prompt_btn = gr.Button("Save", variant="primary")
                    gr.Markdown("*The sample being evaluated is always appended as:*")
                    # NOTE(review): the code fence opened here is never
                    # closed, and whatever FIXED_EVAL_SUFFIX starts with
                    # sits directly after the backticks - confirm the
                    # rendering is as intended.
                    gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
                
                # Show the compatible mode editor; its text is what the
                # evaluation handler receives as the criteria.
                with gr.Column(visible=True) as compatible_prompt_editor:
                    eval_criteria_text = gr.TextArea(
                        label="Evaluation Criteria",
                        lines=12,
                        value=DEFAULT_EVAL_CRITERIA,
                        placeholder="Enter the complete evaluation criteria and scoring rubric..."
                    )

    # Per-session state holders.
    # NOTE(review): apart from eval_prompt_previous, none of these states is
    # read or written by a handler in the visible code - confirm whether
    # they are still needed.
    model_a_state = gr.State()
    model_b_state = gr.State()
    final_prompt_state = gr.State()
    # Last saved prompt text; used by the Save/Cancel handlers to detect and
    # revert unsaved edits.
    eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)  # Initialize with default value
    is_editing = gr.State(False)  # Track editing state
    compatible_mode_state = gr.State(False)  # Track compatible mode state

    # Update model names after responses are generated
    def update_model_names(model_a, model_b):
        """Return two updates carrying italic "Model: <name>" labels (A, then B)."""
        label_a = gr.update(value=f"*Model: {model_a}*")
        label_b = gr.update(value=f"*Model: {model_b}*")
        return label_a, label_b

    # Store the last submitted prompt and variables for comparison.
    # NOTE(review): this state is not referenced by any handler in the
    # visible code.
    last_submission = gr.State({})

    # Add handlers for save/cancel buttons
    def save_prompt(new_prompt, previous_prompt):
        """Accept the edited prompt as the new saved prompt.

        ``previous_prompt`` is received but not used.

        Returns:
            A three-item list: an update setting the editor text to
            ``new_prompt``, ``new_prompt`` itself (the new saved value), and
            an update hiding the Save/Cancel row.
        """
        editor_update = gr.update(value=new_prompt)
        hide_buttons = gr.update(visible=False)
        return [editor_update, new_prompt, hide_buttons]

    def cancel_prompt(previous_prompt):
        """Discard unsaved edits by restoring the last saved prompt.

        Returns:
            A three-item list: an update resetting the editor text to
            ``previous_prompt``, ``previous_prompt`` itself (the saved value
            stays as it was), and an update hiding the Save/Cancel row.
        """
        editor_update = gr.update(value=previous_prompt)
        hide_buttons = gr.update(visible=False)
        return [editor_update, previous_prompt, hide_buttons]

    def show_edit_buttons(current_value, previous_value):
        """Return a visibility update that is on only while the two values differ."""
        has_unsaved_changes = current_value != previous_value
        return gr.update(visible=has_unsaved_changes)

    # Wire the prompt editor: Save commits the edited text, Cancel restores
    # the last saved text, and every edit re-evaluates whether the
    # Save/Cancel row should be visible.
    save_prompt_btn.click(
        fn=save_prompt,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    cancel_prompt_btn.click(
        fn=cancel_prompt,
        inputs=[eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    eval_prompt_editable.change(
        fn=show_edit_buttons,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=edit_buttons_row
    )

    # Function to toggle visibility based on compatible mode
    def toggle_use_reference(checked):
        """React to the "Use a reference response" checkbox.

        Unticked: only the ground-truth box is hidden. Ticked: a random
        example that includes a ground truth is loaded into the inputs, the
        ground-truth box is shown, the score and critique are cleared and
        the random button is reset.

        Returns:
            A dict mapping each component to update to its update.
        """
        if not checked:
            return {ground_truth: gr.update(visible=False)}

        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
        updates = {}
        updates[ground_truth] = gr.update(visible=True, value=ground_truth_msg)
        updates[human_input] = gr.update(value=human_msg)
        updates[ai_response] = gr.update(value=ai_msg)
        updates[score] = gr.update(value="")
        updates[critique] = gr.update(value="")
        updates[random_btn] = gr.update(value="🎲", variant="secondary")
        return updates

    # Run toggle_use_reference whenever the checkbox changes. The handler
    # returns a dict keyed by component, so every component it may update
    # has to be listed in outputs.
    use_reference_toggle.change(
        fn=toggle_use_reference,
        inputs=[use_reference_toggle],
        outputs=[
            ground_truth,
            human_input,
            ai_response,
            score,
            critique,
            random_btn,
        ]
    )

    # State flag intended to track the first game.
    # NOTE(review): not referenced by any handler in the visible code.
    first_game_state = gr.State(True)  # Initialize as True

    # Update the submit function to parse the evaluation criteria
    def submit_and_store(
        use_reference,
        eval_criteria_text,
        human_input,
        ai_response,
        ground_truth_input,
    ):
        """Evaluate one human/AI exchange with the Selene judge model.

        The prompt template is ATLA_PROMPT_WITH_REFERENCE when
        ``use_reference`` is truthy and ATLA_PROMPT otherwise; in the latter
        case the ground truth is passed to the template as an empty string.
        Despite the name, nothing is stored by this function.

        Returns:
            A four-item list: the parsed score, the parsed critique, an
            update turning the send button into "Regenerate evaluation",
            and an update resetting the random button label.
        """
        if use_reference:
            base_prompt = ATLA_PROMPT_WITH_REFERENCE
            reference_text = ground_truth_input
        else:
            base_prompt = ATLA_PROMPT
            reference_text = ''

        # Fill the template with the sample and the criteria.
        final_prompt = base_prompt.format(
            human_input=human_input,
            ai_response=ai_response,
            ground_truth_input=reference_text,
            eval_criteria=eval_criteria_text,
        )

        # Query the judge model, then split its answer into score/critique.
        raw_response = get_atla_response(
            model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
            prompt=final_prompt,
            max_tokens=500,
            temperature=0.01
        )
        parsed_score, parsed_critique = atla_parse_model_response(raw_response)

        send_btn_update = gr.update(
            value="Regenerate evaluation", variant="secondary", interactive=True
        )
        random_btn_update = gr.update(value="🎲")
        return [parsed_score, parsed_critique, send_btn_update, random_btn_update]

    # Factory for a submit handler that remembers whether a submission has
    # already completed.
    # NOTE(review): this factory is never called in the visible code; the
    # send button is wired directly to submit_and_store.
    def create_submit_handler():
        """Return a wrapper around ``submit_and_store`` with a first-run flag.

        The flag starts as True and is set to False once a call to
        ``submit_and_store`` has returned. Nothing reads the flag.
        """
        first_game = True
        
        def handler(*args):
            nonlocal first_game
            result = submit_and_store(*args)
            first_game = False  # Set to False after first submission
            return result
        
        return handler

    # Run the evaluation when the send button is clicked. The inputs are
    # passed positionally, so their order must match submit_and_store's
    # parameters; the outputs match the order of its returned list.
    send_btn.click(
        fn=submit_and_store,
        inputs=[
            use_reference_toggle,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ],
        outputs=[
            score,
            critique,
            send_btn,
            random_btn,
        ],
    )

    # Load a random example when the random button is clicked. Only the
    # checkbox value is listed as an input.
    # NOTE(review): the handler's first parameter is annotated gr.Request;
    # presumably Gradio supplies it automatically - confirm.
    random_btn.click(
        fn=populate_random_example,
        inputs=[use_reference_toggle],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )

    # Add input change handlers
    def handle_input_change():
        """Return updates restoring both buttons to their initial look.

        Returns:
            A two-item list: the send button back to "Run evaluation"
            (primary), and the random button back to the dice label
            (secondary).
        """
        send_btn_reset = gr.update(value="Run evaluation", variant="primary")
        random_btn_reset = gr.update(value="🎲", variant="secondary")
        return [send_btn_reset, random_btn_reset]

    # Editing either input resets the send/random buttons to their initial
    # look.
    human_input.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    ai_response.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    # Generate an AI response for the current human message, then disable
    # the generate button.
    generate_btn.click(
        fn=lambda msg: (
            generate_ai_response(msg)[0],  # Only take the response text
            gr.update(
                value="Generate AI Response",  # Keep the label
                interactive=False  # Disable the button
            )
        ),
        inputs=[human_input],
        outputs=[ai_response, generate_btn]
    )

    # Enable the generate button only while the human message contains
    # non-whitespace text.
    # NOTE(review): assumes the textbox value is always a string (x.strip()).
    human_input.change(
        fn=lambda x: gr.update(interactive=bool(x.strip())),
        inputs=[human_input],
        outputs=[generate_btn]
    )

    # Populate a random example when the page loads. The request argument
    # is passed as None; populate_random_example does not use it.
    demo.load(
        fn=lambda: populate_random_example(None, False),  # Pass False for initial compatible_mode
        inputs=[],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()