Kolumbus Lindh committed
Commit ec08b2a · 1 Parent(s): 04d9cf4
updated UI and functionality
app.py
CHANGED
@@ -15,84 +15,69 @@ print("Starting model loading...")
 model = load_model()
 print("Model loaded successfully!")
 
-# Function to
-def
-    #
-    generation_prompt = [
-        {"role": "user", "content": preconfigured_prompt}
-    ]
-    generated_response = model.create_chat_completion(
-        messages=generation_prompt,
-        max_tokens=256,
-        temperature=1.5
-    )
-    generated_content = generated_response['choices'][0]['message']['content']
-
-    # Step 2: Evaluate the generated content
+# Function to evaluate two responses
+def evaluate_responses(prompt, response_a, response_b, evaluation_criteria):
+    # Format the evaluation prompt
     evaluation_prompt = [
-        {"role": "system", "content": "You are
-        {"role": "user", "content": f"""
-        Prompt: {
-
-
-
-
-
-
-
-
-        2. Does the story contain the letter 'a'?
-        3. Does the story contain the word "alabaster"?
-        4. Does the reader understand that the cat's name is Alabaster?
-        5. Is the story 100% in English?
-        6. Does the text rhyme?"""}
+        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
+        {"role": "user", "content": f"""
+        Prompt: {prompt}
+
+        Response A: {response_a}
+        Response B: {response_b}
+
+        Please evaluate both responses based on the following criteria: {evaluation_criteria}
+
+        For each criterion, provide a rating of the responses on a scale from 1 to 10, and explain why each response earned that rating. Then, declare a winner (or 'draw' if both are equal).
+        """}
     ]
+
+    # Generate the evaluation
     evaluation_response = model.create_chat_completion(
         messages=evaluation_prompt,
-        max_tokens=
-        temperature=0.
+        max_tokens=512,
+        temperature=0.5
     )
+
     evaluation_results = evaluation_response['choices'][0]['message']['content']
 
-    return
-
-# Preconfigured prompt
-PRECONFIGURED_PROMPT = """Write a story about the cat Alabaster. It should be exactly 50 words and you are not allowed to use the letter 'a'. The reader must understand that the cat's name is Alabaster. Only replacing the letter 'a' with something like "_" is not enough. The text should rhyme."""
+    return evaluation_results
 
 # Gradio interface
 with gr.Blocks(title="LLM as a Judge") as demo:
     gr.Markdown("## LLM as a Judge 🧐")
-
-    generate_evaluate_button = gr.Button("Judge the LLM!")
 
-    #
-    gr.
-    gr.
+    # Input fields for the prompt, two responses, and selection of criteria
+    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)
+    response_a_input = gr.Textbox(label="Response A", placeholder="Enter Response A here...", lines=5)
+    response_b_input = gr.Textbox(label="Response B", placeholder="Enter Response B here...", lines=5)
 
-
-
-
-
-
+    # Dropdown for selecting evaluation criteria
+    criteria_dropdown = gr.Dropdown(
+        label="Select Evaluation Criteria",
+        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
+        value="Clarity",
+        type="value"
     )
 
+    # Button to start the evaluation
+    evaluate_button = gr.Button("Evaluate Responses")
+
+    # Label for displaying the evaluation results
     evaluation_output = gr.Textbox(
         label="Evaluation Results",
         placeholder="The evaluation results will appear here...",
-        lines=
+        lines=10,
        interactive=False
     )
 
-    # Link
-
-        fn=
-        inputs=[
-        outputs=[
+    # Link evaluation function to the button
+    evaluate_button.click(
+        fn=evaluate_responses,
+        inputs=[prompt_input, response_a_input, response_b_input, criteria_dropdown],
+        outputs=[evaluation_output]
     )
 
 # Launch the app
-
-
-    server_port=7860,
-    share=False
-)
+if __name__ == "__main__":
+    demo.launch()
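For a quick sanity check of the new evaluate_responses helper outside the Gradio UI, a minimal sketch along these lines should work, assuming app.py is importable as a module (the model is loaded at import time) and that load_model() — defined above the hunk shown — returns a llama-cpp-python model exposing create_chat_completion(), as the calls in the diff imply; the prompt, responses, and criterion below are made-up examples:

# Illustrative smoke test only; all inputs are hypothetical examples.
from app import evaluate_responses  # importing app.py also runs load_model()

verdict = evaluate_responses(
    prompt="Explain what a Python context manager does.",
    response_a="A context manager wraps setup and teardown of a resource via __enter__ and __exit__, typically used with the 'with' statement.",
    response_b="It is something you use with 'with'.",
    evaluation_criteria="Clarity",
)
print(verdict)  # the judge model's per-criterion ratings and declared winner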