kaikaidai committed on
Commit
ff974dc
·
verified ·
1 Parent(s): 8b46047

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +468 -4
app.py CHANGED
@@ -1,7 +1,471 @@
 
 
 
 
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from datetime import datetime
4
+ import hashlib
5
  import gradio as gr
6
 
7
+ from dotenv import load_dotenv
8
+ load_dotenv()
9
 
10
+ from gen_api_answer import (
11
+ atla_parse_model_response,
12
+ get_atla_response
13
+ )
14
+
15
+ from prompts import (
16
+ ATLA_PROMPT,
17
+ ATLA_PROMPT_WITH_REFERENCE
18
+ )
19
+
20
+ from random_sample_generation import (
21
+ get_random_human_ai_pair,
22
+ get_random_human_ai_ground_truth_pair,
23
+ generate_ai_response
24
+ )
25
+
26
+ from utils import Vote
27
+
28
+ from prompts import (
29
+ DEFAULT_EVAL_PROMPT,
30
+ DEFAULT_EVAL_PROMPT_EDITABLE,
31
+ FIXED_EVAL_SUFFIX,
32
+ DEFAULT_EVAL_CRITERIA
33
+ )
34
+
35
+ from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS
36
+
37
+
38
# Load the model_data from JSONL
def load_model_data():
    """Load model metadata from ``data/models.jsonl``.

    Returns:
        dict: mapping of model name -> {"organization", "license", "api_model"}.
        Returns an empty dict when the file is missing.
    """
    model_data = {}
    try:
        # Explicit encoding so the file parses identically on every platform.
        with open("data/models.jsonl", "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline) —
                    # json.loads("") would otherwise raise JSONDecodeError.
                    continue
                model = json.loads(line)
                model_data[model["name"]] = {
                    "organization": model["organization"],
                    "license": model["license"],
                    "api_model": model["api_model"],
                }
    except FileNotFoundError:
        print("Warning: models.jsonl not found")
        return {}
    return model_data


model_data = load_model_data()
57
+
58
def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
    """Persist a single vote record.

    ``prompt`` may be a plain string or a Gradio component; a component is
    unwrapped via its ``.value`` attribute before storing.
    """
    # Unwrap a component-like object into its plain string value.
    if hasattr(prompt, 'value'):
        prompt_value = prompt.value
    else:
        prompt_value = prompt

    # NOTE(review): `add_vote` and `db` are not defined or imported anywhere
    # in this file — presumably a missing import; verify before deploying.
    add_vote(
        Vote(
            timestamp=datetime.now().isoformat(),
            prompt=prompt_value,
            response_a=response_a,
            response_b=response_b,
            model_a=model_a,
            model_b=model_b,
            winner=winner,
            judge_id=judge_id,
        ),
        db,
    )
72
+
73
+
74
def parse_variables(prompt):
    """Return the unique ``{{variable}}`` names in *prompt*, in first-seen order."""
    # Collect everything enclosed in double curly braces.
    matches = re.findall(r"{{(.*?)}}", prompt)

    # Deduplicate after stripping whitespace, preserving first-seen order.
    ordered = []
    seen = set()
    for raw in matches:
        name = raw.strip()
        if name not in seen:
            seen.add(name)
            ordered.append(name)
    return ordered
83
+
84
+
85
def get_final_prompt(eval_prompt, variable_values):
    """Substitute each ``{{var}}`` placeholder in *eval_prompt* with its value."""
    result = eval_prompt
    for name, value in variable_values.items():
        result = result.replace("{{" + name + "}}", value)
    return result
90
+
91
+
92
+
93
def get_ip(request: gr.Request) -> str:
    """Get and hash the IP address from the request.

    Prefers Cloudflare's ``cf-connecting-ip`` header, then the first (client)
    entry of ``x-forwarded-for``, falling back to the socket peer address.
    The address is SHA-256 hashed and truncated to 16 hex chars for privacy.
    """
    if "cf-connecting-ip" in request.headers:
        ip = request.headers["cf-connecting-ip"]
    elif "x-forwarded-for" in request.headers:
        ip = request.headers["x-forwarded-for"]
        if "," in ip:
            # XFF is "client, proxy1, proxy2" — keep only the client entry.
            ip = ip.split(",")[0]
    else:
        ip = request.client.host

    # Fix: strip surrounding whitespace (XFF entries are comma+space
    # separated), otherwise the same client hashes to different ids
    # depending on whether it arrived via a proxy.
    ip = ip.strip()

    # Hash the IP address for privacy
    return hashlib.sha256(ip.encode()).hexdigest()[:16]
106
+
107
+
108
def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
    """Generate appropriate message based on vote and model rankings.

    Returns a ``(title, message)`` tuple for the post-vote notification.
    """
    # Rankings are fetched up front (even for ties), matching prior behavior.
    # NOTE(review): get_current_votes / get_leaderboard / get_model_rankings
    # are not defined or imported in this file — confirm the missing import.
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    rank_a = rankings.get(model_a, 0)
    rank_b = rankings.get(model_b, 0)

    message = "Keep voting responsibly 🤗"
    if choice == "Tie":
        title = "It's a tie!"
    elif choice == "A" and rank_a < rank_b:
        # Lower rank number means a higher leaderboard position.
        title = "The favourite wins!"
    elif choice == "B" and rank_b < rank_a:
        title = "The favourite wins!"
    else:
        title = "The underdog wins!"
    return title, message
126
+
127
+
128
def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
    if compatible_mode:
        # Compatible mode also supplies a ground-truth reference response.
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
    else:
        human_msg, ai_msg = get_random_human_ai_pair()
        ground_truth_msg = ""

    updates = [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),  # reset the dice button
        gr.update(value=""),  # clear the score box
        gr.update(value=""),  # clear the critique box
        # Ground truth is populated and shown only in compatible mode.
        gr.update(value=ground_truth_msg, visible=compatible_mode),
    ]
    return updates
144
+
145
+
146
# --- UI definition -----------------------------------------------------------
# Single-tab "Playground": the user enters (or randomly samples) a human/AI
# exchange, optionally a reference answer, and has the Selene-Mini judge
# score it against editable evaluation criteria.
# NOTE(review): indentation below is reconstructed from a diff rendering —
# verify nesting (especially the Accordion placement) against the live app.
with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
    gr.Markdown(MAIN_TITLE)
    gr.Markdown(HOW_IT_WORKS)

    # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
    eval_prompt = gr.Textbox(
        value=DEFAULT_EVAL_PROMPT,
        visible=False
    )

    with gr.Tabs():
        with gr.TabItem("Playground"):
            with gr.Row():
                # Left side - Input section
                with gr.Column(scale=1):
                    with gr.Group():
                        human_input = gr.TextArea(
                            label="👩 User Input",
                            lines=5,
                            placeholder="Enter the human message here..."
                        )
                        with gr.Row():
                            # Disabled until human_input is non-empty (see the
                            # human_input.change handler near the bottom).
                            generate_btn = gr.Button(
                                "Generate AI Response",
                                size="sm",
                                interactive=False
                            )

                        ai_response = gr.TextArea(
                            label="🤖 AI Response",
                            lines=10,
                            placeholder="Enter the AI response here..."
                        )

                        # Ground truth response (initially hidden)
                        ground_truth = gr.TextArea(
                            label="🎯 Ground truth response",
                            lines=10,
                            placeholder="Enter the ground truth response here...",
                            visible=False
                        )

                    with gr.Row():
                        random_btn = gr.Button("🎲", scale=2)
                        send_btn = gr.Button(
                            value="Run evaluation",
                            variant="primary",
                            size="lg",
                            scale=8
                        )

                # Right side - Model outputs
                with gr.Column(scale=1):
                    gr.Markdown("### 👩‍⚖️ Selene-Mini Evaluation")
                    with gr.Group():
                        with gr.Row():
                            score = gr.Textbox(label="Score", lines=1, interactive=False)
                        critique = gr.TextArea(label="Critique", lines=12, interactive=False)

                    gr.Markdown("<br>")

                    # Replace the "Edit Judge Prompt" Accordion section with:
                    with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
                        gr.Markdown("<br>")
                        use_reference_toggle = gr.Checkbox(
                            label="Use a reference response",
                            value=False
                        )

                        # Hide the default prompt editor
                        with gr.Column(visible=False) as default_prompt_editor:
                            eval_prompt_editable = gr.TextArea(
                                value=DEFAULT_EVAL_PROMPT_EDITABLE,
                                label="Evaluation Criteria",
                                lines=12
                            )

                            # Save/Cancel only appear while the prompt differs
                            # from its last saved value (see show_edit_buttons).
                            with gr.Row(visible=False) as edit_buttons_row:
                                cancel_prompt_btn = gr.Button("Cancel")
                                save_prompt_btn = gr.Button("Save", variant="primary")
                            gr.Markdown("*The sample being evaluated is always appended as:*")
                            gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")

                        # Show the compatible mode editor
                        with gr.Column(visible=True) as compatible_prompt_editor:
                            eval_criteria_text = gr.TextArea(
                                label="Evaluation Criteria",
                                lines=12,
                                value=DEFAULT_EVAL_CRITERIA,
                                placeholder="Enter the complete evaluation criteria and scoring rubric..."
                            )

    # Define state variables for model tracking
    # NOTE(review): model_a_state / model_b_state / final_prompt_state /
    # is_editing / compatible_mode_state are created but never wired to any
    # handler below — likely leftovers from a previous version.
    model_a_state = gr.State()
    model_b_state = gr.State()
    final_prompt_state = gr.State()
    eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)  # Initialize with default value
    is_editing = gr.State(False)  # Track editing state
    compatible_mode_state = gr.State(False)  # Track compatible mode state

    # Update model names after responses are generated
    # NOTE(review): not referenced by any event handler in this file.
    def update_model_names(model_a, model_b):
        return gr.update(value=f"*Model: {model_a}*"), gr.update(
            value=f"*Model: {model_b}*"
        )

    # Store the last submitted prompt and variables for comparison
    # NOTE(review): also unused below.
    last_submission = gr.State({})

    # Add handlers for save/cancel buttons
    def save_prompt(new_prompt, previous_prompt):
        """Commit the edited prompt and hide the Save/Cancel row."""
        return [
            gr.update(value=new_prompt),  # Update the prompt
            new_prompt,  # Update the previous prompt state
            gr.update(visible=False)  # Hide the buttons
        ]

    def cancel_prompt(previous_prompt):
        """Discard edits, restoring the last saved prompt."""
        return [
            gr.update(value=previous_prompt),  # Revert to previous prompt
            previous_prompt,  # Keep the previous prompt state
            gr.update(visible=False)  # Hide the buttons
        ]

    def show_edit_buttons(current_value, previous_value):
        # Show buttons only if the current value differs from the previous value
        return gr.update(visible=current_value != previous_value)

    # Add handlers for save/cancel buttons and prompt changes
    save_prompt_btn.click(
        fn=save_prompt,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    cancel_prompt_btn.click(
        fn=cancel_prompt,
        inputs=[eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    eval_prompt_editable.change(
        fn=show_edit_buttons,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=edit_buttons_row
    )

    # Function to toggle visibility based on compatible mode
    def toggle_use_reference(checked):
        """Show/hide the ground-truth box; when enabling, load a fresh example."""
        if checked:
            human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
            return {
                ground_truth: gr.update(visible=True, value=ground_truth_msg),
                human_input: gr.update(value=human_msg),
                ai_response: gr.update(value=ai_msg),
                score: gr.update(value=""),
                critique: gr.update(value=""),
                random_btn: gr.update(value="🎲", variant="secondary"),
            }
        else:
            # When disabling, only hide the ground truth; leave other fields as-is.
            return {
                ground_truth: gr.update(visible=False)
            }

    # Update the change handler to include all necessary outputs
    use_reference_toggle.change(
        fn=toggle_use_reference,
        inputs=[use_reference_toggle],
        outputs=[
            ground_truth,
            human_input,
            ai_response,
            score,
            critique,
            random_btn,
        ]
    )

    # Add a new state variable to track first game
    # NOTE(review): never read by any handler below.
    first_game_state = gr.State(True)  # Initialize as True

    # Update the submit function to parse the evaluation criteria
    def submit_and_store(
        use_reference,
        eval_criteria_text,
        human_input,
        ai_response,
        ground_truth_input,
    ):
        """Run one judge evaluation and return UI updates (score, critique, buttons)."""
        # Build prompt data dictionary
        prompt_data = {
            'human_input': human_input,
            'ai_response': ai_response,
            'ground_truth_input': ground_truth_input if use_reference else '',
            'eval_criteria': eval_criteria_text,
        }

        # Get base prompt based on whether reference is used
        base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT

        # Format the prompt
        # NOTE(review): str.format will raise KeyError/ValueError if the user's
        # criteria or inputs contain literal curly braces — consider escaping.
        final_prompt = base_prompt.format(
            human_input=prompt_data['human_input'],
            ai_response=prompt_data['ai_response'],
            ground_truth_input=prompt_data['ground_truth_input'],
            eval_criteria=prompt_data['eval_criteria']
        )

        # Get response from Atla
        response = get_atla_response(
            model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
            prompt=final_prompt,
            max_tokens=500,
            temperature=0.01
        )

        # Parse the response
        score, critique = atla_parse_model_response(response)

        return [
            score,
            critique,
            gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
            gr.update(value="🎲"),
        ]

    # Update the click handler to use False for is_first_game after first submission
    # NOTE(review): defined but never called — send_btn is wired directly to
    # submit_and_store below, so this closure (and its first_game flag) is dead code.
    def create_submit_handler():
        first_game = True

        def handler(*args):
            nonlocal first_game
            result = submit_and_store(*args)
            first_game = False  # Set to False after first submission
            return result

        return handler

    # Update the send_btn click handler
    send_btn.click(
        fn=submit_and_store,
        inputs=[
            use_reference_toggle,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ],
        outputs=[
            score,
            critique,
            send_btn,
            random_btn,
        ],
    )

    # Add random button handler
    random_btn.click(
        fn=populate_random_example,
        inputs=[use_reference_toggle],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )

    # Add input change handlers
    def handle_input_change():
        """Reset UI state when inputs are changed"""
        return [
            gr.update(value="Run evaluation", variant="primary"),  # send_btn
            gr.update(value="🎲", variant="secondary"),  # random_btn
        ]

    # Update the change handlers for inputs
    human_input.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    ai_response.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    generate_btn.click(
        fn=lambda msg: (
            generate_ai_response(msg)[0],  # Only take the response text
            gr.update(
                value="Generate AI Response",  # Keep the label
                interactive=False  # Disable the button
            )
        ),
        inputs=[human_input],
        outputs=[ai_response, generate_btn]
    )

    # Re-enable "Generate AI Response" only while there is non-blank user input.
    human_input.change(
        fn=lambda x: gr.update(interactive=bool(x.strip())),
        inputs=[human_input],
        outputs=[generate_btn]
    )

    # Update the demo.load to include the random example population
    demo.load(
        fn=lambda: populate_random_example(None, False),  # Pass False for initial compatible_mode
        inputs=[],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )
469
+
470
# Script entry point: launch the Gradio server only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    demo.launch()