Kolumbus Lindh committed
Commit ec08b2a · 1 Parent(s): 04d9cf4

updated UI and functionality

Files changed (1)
  1. app.py +42 -57
app.py CHANGED
@@ -15,84 +15,69 @@ print("Starting model loading...")
 model = load_model()
 print("Model loaded successfully!")
 
-# Function to generate and evaluate content
-def generate_and_evaluate(preconfigured_prompt):
-    # Step 1: Generate content
-    generation_prompt = [
-        {"role": "user", "content": preconfigured_prompt}
-    ]
-    generated_response = model.create_chat_completion(
-        messages=generation_prompt,
-        max_tokens=256,
-        temperature=1.5
-    )
-    generated_content = generated_response['choices'][0]['message']['content']
-
-    # Step 2: Evaluate the generated content
+# Function to evaluate two responses
+def evaluate_responses(prompt, response_a, response_b, evaluation_criteria):
+    # Format the evaluation prompt
     evaluation_prompt = [
-        {"role": "system", "content": "You are a strict language evaluator who provides binary assessments of texts."},
-        {"role": "user", "content": f"""Carefully evaluate the generated story:
-        Prompt: {preconfigured_prompt}
-        Generated response: {generated_content}
-        Provide a clear evaluation as follows:
-        For each question, write the full question followed by your "Yes" or "No" answer.
-        Example format:
-        1. Is the story exactly 50 words? - Yes
-        2. Does the story contain the letter 'a'? - No
-        Now answer these questions:
-        1. Is the story exactly 50 words?
-        2. Does the story contain the letter 'a'?
-        3. Does the story contain the word "alabaster"?
-        4. Does the reader understand that the cat's name is Alabaster?
-        5. Is the story 100% in English?
-        6. Does the text rhyme?"""}
+        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
+        {"role": "user", "content": f"""
+        Prompt: {prompt}
+
+        Response A: {response_a}
+        Response B: {response_b}
+
+        Please evaluate both responses based on the following criteria: {evaluation_criteria}
+
+        For each criterion, provide a rating of the responses on a scale from 1 to 10, and explain why each response earned that rating. Then, declare a winner (or 'draw' if both are equal).
+        """}
     ]
+
+    # Generate the evaluation
     evaluation_response = model.create_chat_completion(
         messages=evaluation_prompt,
-        max_tokens=128,
-        temperature=0.2
+        max_tokens=512,
+        temperature=0.5
     )
+
     evaluation_results = evaluation_response['choices'][0]['message']['content']
 
-    return generated_content, evaluation_results
-
-# Preconfigured prompt
-PRECONFIGURED_PROMPT = """Write a story about the cat Alabaster. It should be exactly 50 words and you are not allowed to use the letter 'a'. The reader must understand that the cat's name is Alabaster. Only replacing the letter 'a' with something like "_" is not enough. The text should rhyme."""
+    return evaluation_results
 
 # Gradio interface
 with gr.Blocks(title="LLM as a Judge") as demo:
     gr.Markdown("## LLM as a Judge 🧐")
-
-    generate_evaluate_button = gr.Button("Judge the LLM!")
 
-    # Label for the preconfigured prompt
-    gr.Label("Preconfigured prompt:")
-    gr.Label(PRECONFIGURED_PROMPT)
+    # Input fields for the prompt, two responses, and selection of criteria
+    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)
+    response_a_input = gr.Textbox(label="Response A", placeholder="Enter Response A here...", lines=5)
+    response_b_input = gr.Textbox(label="Response B", placeholder="Enter Response B here...", lines=5)
 
-    generated_output = gr.Textbox(
-        label="Generated Content",
-        placeholder="The generated content will appear here...",
-        lines=5,
-        interactive=False
+    # Dropdown for selecting evaluation criteria
+    criteria_dropdown = gr.Dropdown(
+        label="Select Evaluation Criteria",
+        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
+        value="Clarity",
+        type="value"
     )
 
+    # Button to start the evaluation
+    evaluate_button = gr.Button("Evaluate Responses")
+
+    # Label for displaying the evaluation results
     evaluation_output = gr.Textbox(
         label="Evaluation Results",
         placeholder="The evaluation results will appear here...",
-        lines=8,
+        lines=10,
         interactive=False
     )
 
-    # Link generation and evaluation
-    generate_evaluate_button.click(
-        fn=generate_and_evaluate,
-        inputs=[gr.State(PRECONFIGURED_PROMPT)],
-        outputs=[generated_output, evaluation_output]
+    # Link evaluation function to the button
+    evaluate_button.click(
+        fn=evaluate_responses,
+        inputs=[prompt_input, response_a_input, response_b_input, criteria_dropdown],
+        outputs=[evaluation_output]
     )
 
 # Launch the app
-demo.launch(
-    server_name="0.0.0.0",
-    server_port=7860,
-    share=False
-)
+if __name__ == "__main__":
+    demo.launch()
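
For reference, the rewritten evaluate_responses function can also be exercised without the Gradio UI. Below is a minimal sketch, assuming app.py is importable from the working directory and that load_model() returns a llama-cpp-python-style object exposing create_chat_completion (as in the file above); the prompt, responses, and criterion string are made-up illustrations. Note that importing app.py loads the judge model as a side effect, since only demo.launch() is guarded by __main__.

# Minimal usage sketch (hypothetical): call the new evaluate_responses() directly.
from app import evaluate_responses  # importing app.py also runs load_model() and builds the Blocks UI

prompt = "Explain what binary search does."                                         # illustrative input
response_a = "Binary search repeatedly halves a sorted range to locate a target."   # illustrative input
response_b = "Binary search checks every element one by one."                       # illustrative input

# Any of the dropdown choices from the UI (e.g. "Accuracy") works as the criteria string.
verdict = evaluate_responses(prompt, response_a, response_b, "Accuracy")
print(verdict)  # prints the judge's 1-10 ratings per criterion and a winner or 'draw'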