Kolumbus Lindh committed
Commit ec08b2a · 1 Parent(s): 04d9cf4
updated UI and functionality
app.py
CHANGED
@@ -15,84 +15,69 @@ print("Starting model loading...")
 model = load_model()
 print("Model loaded successfully!")
 
-# Function to
-def
-    #
-    generation_prompt = [
-        {"role": "user", "content": preconfigured_prompt}
-    ]
-    generated_response = model.create_chat_completion(
-        messages=generation_prompt,
-        max_tokens=256,
-        temperature=1.5
-    )
-    generated_content = generated_response['choices'][0]['message']['content']
-
-    # Step 2: Evaluate the generated content
+# Function to evaluate two responses
+def evaluate_responses(prompt, response_a, response_b, evaluation_criteria):
+    # Format the evaluation prompt
     evaluation_prompt = [
-        {"role": "system", "content": "You are
-        {"role": "user", "content": f"""
-        Prompt: {
-
-
-
-
-
-
-
-
-        2. Does the story contain the letter 'a'?
-        3. Does the story contain the word "alabaster"?
-        4. Does the reader understand that the cat's name is Alabaster?
-        5. Is the story 100% in English?
-        6. Does the text rhyme?"""}
+        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
+        {"role": "user", "content": f"""
+        Prompt: {prompt}
+
+        Response A: {response_a}
+        Response B: {response_b}
+
+        Please evaluate both responses based on the following criteria: {evaluation_criteria}
+
+        For each criterion, provide a rating of the responses on a scale from 1 to 10, and explain why each response earned that rating. Then, declare a winner (or 'draw' if both are equal).
+        """}
     ]
+
+    # Generate the evaluation
     evaluation_response = model.create_chat_completion(
         messages=evaluation_prompt,
-        max_tokens=
-        temperature=0.
+        max_tokens=512,
+        temperature=0.5
     )
+
     evaluation_results = evaluation_response['choices'][0]['message']['content']
 
-    return
-
-# Preconfigured prompt
-PRECONFIGURED_PROMPT = """Write a story about the cat Alabaster. It should be exactly 50 words and you are not allowed to use the letter 'a'. The reader must understand that the cat's name is Alabaster. Only replacing the letter 'a' with something like "_" is not enough. The text should rhyme."""
+    return evaluation_results
 
 # Gradio interface
 with gr.Blocks(title="LLM as a Judge") as demo:
     gr.Markdown("## LLM as a Judge 🧐")
-
-    generate_evaluate_button = gr.Button("Judge the LLM!")
 
-    #
-    gr.
-    gr.
+    # Input fields for the prompt, two responses, and selection of criteria
+    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)
+    response_a_input = gr.Textbox(label="Response A", placeholder="Enter Response A here...", lines=5)
+    response_b_input = gr.Textbox(label="Response B", placeholder="Enter Response B here...", lines=5)
 
-
-
-
-
-
+    # Dropdown for selecting evaluation criteria
+    criteria_dropdown = gr.Dropdown(
+        label="Select Evaluation Criteria",
+        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
+        value="Clarity",
+        type="value"
     )
 
+    # Button to start the evaluation
+    evaluate_button = gr.Button("Evaluate Responses")
+
+    # Label for displaying the evaluation results
     evaluation_output = gr.Textbox(
         label="Evaluation Results",
         placeholder="The evaluation results will appear here...",
-        lines=
+        lines=10,
        interactive=False
     )
 
-    # Link
-
-        fn=
-        inputs=[
-        outputs=[
+    # Link evaluation function to the button
+    evaluate_button.click(
+        fn=evaluate_responses,
+        inputs=[prompt_input, response_a_input, response_b_input, criteria_dropdown],
+        outputs=[evaluation_output]
     )
 
 # Launch the app
-
-
-    server_port=7860,
-    share=False
-)
+if __name__ == "__main__":
+    demo.launch()
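For a quick sanity check of the new evaluate_responses helper outside the Gradio UI, a minimal sketch along these lines should work, assuming app.py is importable as a module (the model is loaded at import time) and that load_model() — defined above the hunk shown — returns a llama-cpp-python model exposing create_chat_completion(), as the calls in the diff imply; the prompt, responses, and criterion below are made-up examples:

# Illustrative smoke test only; all inputs are hypothetical examples.
from app import evaluate_responses  # importing app.py also runs load_model()

verdict = evaluate_responses(
    prompt="Explain what a Python context manager does.",
    response_a="A context manager wraps setup and teardown of a resource via __enter__ and __exit__, typically used with the 'with' statement.",
    response_b="It is something you use with 'with'.",
    evaluation_criteria="Clarity",
)
print(verdict)  # the judge model's per-criterion ratings and declared winner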