Spaces:
Sleeping
Filip
committed on
Commit · 96c0b50
1 Parent(s): 6bd03c4
update
Browse files
- .gitignore +1 -0
- README.md +5 -1
- app.py +6 -10
.gitignore ADDED

```diff
@@ -0,0 +1 @@
+venv
```
README.md CHANGED

```diff
@@ -30,13 +30,17 @@ Quantization method: `float16`
 ### Hyperparameters
 
 Both models used the same hyperparameters during training.\
-`
+`lora_alpha=16`
+`lora_dropout=0`
+`per_device_train_batch_size=2`\
 `gradient_accumulation_steps=4`\
 `learning_rate=2e-4`\
 `optim="adamw_8bit"`\
 `weight_decay=0.01`\
 `lr_scheduler_type="linear"`
 
+Both models have a max sequence length of 2048 tokens. This means that they only process the 2048 first tokens in the input.
+
 We chose float16 as the quantization method as it according to [Unsloth wiki](https://github.com/unslothai/unsloth/wiki) has the fastest conversion and retains 100% accuracy. However, it is slow and memory hungry which is a disadvantage.
 
 ## Judge
```
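The README hunk above lists the shared LoRA and training hyperparameters but not the surrounding training code. The sketch below shows one way those settings could be wired into an Unsloth + TRL fine-tuning run; it is not the Space's actual training script. Only `lora_alpha`, `lora_dropout`, the batch/optimizer/scheduler settings, `max_seq_length=2048`, and the float16 GGUF export come from the README; the model name, LoRA rank `r`, `target_modules`, 4-bit loading, the toy dataset, and `output_dir` are illustrative assumptions. Argument names follow the style of the Unsloth notebooks; newer `trl` releases move `dataset_text_field` and `max_seq_length` into `SFTConfig`.

```python
# Hypothetical training sketch. Hyperparameters marked "from README" come from
# the commit above; everything else is a placeholder assumption.
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset

max_seq_length = 2048  # from README: only the first 2048 tokens are processed

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",  # placeholder model
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # assumption: QLoRA-style 4-bit loading, not stated in the README
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,            # placeholder rank, not stated in the README
    lora_alpha=16,   # from README
    lora_dropout=0,  # from README
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # placeholder
)

# Tiny in-memory dataset so the sketch is self-contained; the real data is not shown.
toy_dataset = Dataset.from_dict({"text": ["### Question: ...\n### Answer: ..."]})

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=toy_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=TrainingArguments(          # from README, all of the following:
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        output_dir="outputs",        # placeholder
    ),
)
trainer.train()

# Export with float16 quantization ("f16" in Unsloth's naming): fastest conversion
# and lossless per the Unsloth wiki, at the cost of speed and memory.
model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
```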
app.py CHANGED

```diff
@@ -32,14 +32,13 @@ def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_crit
     print(f"Response B: {response_b}")
 
     # Format the evaluation prompt
-    criteria_list = ", ".join(evaluation_criteria)
     evaluation_prompt = f"""
 Prompt: {prompt}
 
 Response A: {response_a}
 Response B: {response_b}
 
-Evaluation Criteria:
+Evaluation Criteria: Relevance, Coherence and Completeness
 
 Please evaluate the responses based on the selected criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
 """
@@ -53,7 +52,7 @@ Please evaluate the responses based on the selected criterion
 
     # Combine results for display
     final_output = f"""
-
+{evaluation_results}
 """
     return final_output, response_a, response_b
 
@@ -82,10 +81,6 @@ with gr.Blocks(title="LLM as a Judge") as demo:
 
     # Prompt and criteria inputs
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
-    criteria_dropdown = gr.CheckboxGroup(
-        label="Select Up to 3 Evaluation Criteria",
-        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"]
-    )
 
     # Button and outputs
     evaluate_button = gr.Button("Evaluate Models")
@@ -94,7 +89,7 @@ with gr.Blocks(title="LLM as a Judge") as demo:
         with gr.Column():
             response_a = gr.Textbox(
                 label="Response A",
-                placeholder="The response
+                placeholder="The response from Model A will appear here...",
                 lines=20,
                 interactive=False
             )
@@ -102,11 +97,12 @@ with gr.Blocks(title="LLM as a Judge") as demo:
         with gr.Column():
             response_b = gr.Textbox(
                 label="Response B",
-                placeholder="The response
+                placeholder="The response from Model B will appear here...",
                 lines=20,
                 interactive=False
             )
 
+    gr.Markdown("### The LLMs are evaluated based on the criterion of Relevance, Coherence and Completeness.")
     evaluation_output = gr.Textbox(
         label="Evaluation Results",
         placeholder="The evaluation results will appear here...",
@@ -117,7 +113,7 @@ with gr.Blocks(title="LLM as a Judge") as demo:
     # Link evaluation function
     evaluate_button.click(
         fn=evaluate_responses,
-        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input
+        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input],
         outputs=[evaluation_output, response_a, response_b]
     )
 
```
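A note on the app.py change: in Gradio, the components listed in `inputs` are passed to the click callback positionally, and the values the callback returns fill the `outputs` components in order. Because this commit removes `criteria_dropdown` from `inputs` and hard-codes the criteria in the prompt, the callback presumably no longer receives an `evaluation_criteria` argument (the signature shown in the hunk header is from the old file; the updated signature is not visible in this diff). The sketch below is a stripped-down, hypothetical illustration of that wiring, not the Space's actual code; the stub callback body and the extra repo/model textboxes are placeholders.

```python
# Minimal Gradio wiring sketch: one input component per positional parameter of
# the callback, one returned value per output component.
import gradio as gr

def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b):
    # Placeholder body: the real app queries both models and an LLM judge.
    criteria = "Relevance, Coherence and Completeness"  # hard-coded, as in the commit
    verdict = f"Evaluating '{prompt}' on {criteria}: {repo_a}/{model_a} vs {repo_b}/{model_b}"
    return verdict, "response A (stub)", "response B (stub)"

with gr.Blocks(title="LLM as a Judge") as demo:
    prompt_input = gr.Textbox(label="Enter Prompt", lines=3)
    repo_a_input = gr.Textbox(label="Repo A")    # placeholder component
    model_a_input = gr.Textbox(label="Model A")  # placeholder component
    repo_b_input = gr.Textbox(label="Repo B")    # placeholder component
    model_b_input = gr.Textbox(label="Model B")  # placeholder component
    evaluate_button = gr.Button("Evaluate Models")
    evaluation_output = gr.Textbox(label="Evaluation Results")
    response_a = gr.Textbox(label="Response A", lines=20, interactive=False)
    response_b = gr.Textbox(label="Response B", lines=20, interactive=False)

    # inputs map positionally onto evaluate_responses(prompt, repo_a, ...);
    # the three returned values fill the three outputs, in order.
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input],
        outputs=[evaluation_output, response_a, response_b],
    )

if __name__ == "__main__":
    demo.launch()
```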