Filip committed
Commit 96c0b50 · 1 Parent(s): 6bd03c4

Files changed (3):
  1. .gitignore +1 -0
  2. README.md +5 -1
  3. app.py +6 -10
.gitignore ADDED
@@ -0,0 +1 @@
+venv
README.md CHANGED
@@ -30,13 +30,17 @@ Quantization method: `float16`
 ### Hyperparameters
 
 Both models used the same hyperparameters during training.\
-`per_device_train_batch_size = 2`\
+`lora_alpha=16`\
+`lora_dropout=0`\
+`per_device_train_batch_size=2`\
 `gradient_accumulation_steps=4`\
 `learning_rate=2e-4`\
 `optim="adamw_8bit"`\
 `weight_decay=0.01`\
 `lr_scheduler_type="linear"`
 
+Both models use a max sequence length of 2048 tokens, meaning they only process the first 2048 tokens of the input.
+
 We chose float16 as the quantization method because, according to the [Unsloth wiki](https://github.com/unslothai/unsloth/wiki), it has the fastest conversion and retains 100% accuracy. However, it is slow and memory-hungry, which is a disadvantage.
 
 ## Judge
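
For context on the hyperparameters above, the following is a minimal sketch (not part of this commit) of how they might be wired together with Unsloth's `FastLanguageModel` and TRL's `SFTTrainer`. The base model name, LoRA rank, target modules, and the toy dataset are placeholder assumptions; only the values explicitly listed in the README are taken from it.

```python
# Sketch only: how the README's hyperparameters could be passed to Unsloth + TRL.
# MODEL_NAME, r, target_modules, and the toy dataset are assumptions, not taken
# from this commit; the numeric values mirror the README.
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel

MODEL_NAME = "unsloth/mistral-7b-bnb-4bit"  # placeholder base model
train_dataset = Dataset.from_dict({"text": ["example training sample"]})  # toy data

# Max sequence length from the README: only the first 2048 tokens are processed.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=2048,
)

# LoRA settings listed in the README (rank and target modules are assumed).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# Training hyperparameters listed in the README.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    output_dir="outputs",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=training_args,
)
trainer.train()

# float16 GGUF export, the method the README describes as fastest to convert
# and lossless, at the cost of speed and memory during conversion.
model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
```

With these values, the effective batch size is `per_device_train_batch_size × gradient_accumulation_steps = 2 × 4 = 8` sequences per optimizer step on each device.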
app.py CHANGED
@@ -32,14 +32,13 @@ def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_crit
     print(f"Response B: {response_b}")
 
     # Format the evaluation prompt
-    criteria_list = ", ".join(evaluation_criteria)
     evaluation_prompt = f"""
 Prompt: {prompt}
 
 Response A: {response_a}
 Response B: {response_b}
 
-Evaluation Criteria: {criteria_list}
+Evaluation Criteria: Relevance, Coherence and Completeness
 
 Please evaluate the responses based on the selected criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
 """
@@ -53,7 +52,7 @@ Please evaluate the responses based on the selected criterion
 
     # Combine results for display
     final_output = f"""
-    Evaluation Results:\n{evaluation_results}
+    {evaluation_results}
 """
     return final_output, response_a, response_b
 
@@ -82,10 +81,6 @@ with gr.Blocks(title="LLM as a Judge") as demo:
 
     # Prompt and criteria inputs
     prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
-    criteria_dropdown = gr.CheckboxGroup(
-        label="Select Up to 3 Evaluation Criteria",
-        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"]
-    )
 
     # Button and outputs
     evaluate_button = gr.Button("Evaluate Models")
@@ -94,7 +89,7 @@ with gr.Blocks(title="LLM as a Judge") as demo:
     with gr.Column():
         response_a = gr.Textbox(
             label="Response A",
-            placeholder="The response for Model A will appear here...",
+            placeholder="The response from Model A will appear here...",
             lines=20,
             interactive=False
         )
@@ -102,11 +97,12 @@ with gr.Blocks(title="LLM as a Judge") as demo:
     with gr.Column():
         response_b = gr.Textbox(
             label="Response B",
-            placeholder="The response for Model B will appear here...",
+            placeholder="The response from Model B will appear here...",
             lines=20,
             interactive=False
         )
 
+    gr.Markdown("### The LLMs are evaluated based on the criteria of Relevance, Coherence and Completeness.")
     evaluation_output = gr.Textbox(
         label="Evaluation Results",
         placeholder="The evaluation results will appear here...",
@@ -117,7 +113,7 @@ with gr.Blocks(title="LLM as a Judge") as demo:
 
     # Link evaluation function
     evaluate_button.click(
         fn=evaluate_responses,
-        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input, criteria_dropdown],
+        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input],
         outputs=[evaluation_output, response_a, response_b]
     )
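
To make the new wiring concrete, here is a minimal, self-contained sketch (not the actual app.py) of the Gradio layout after this commit: the criteria CheckboxGroup is removed, the criteria are fixed in the prompt, and the click handler passes five inputs. The stub judge function, the repo/model textbox labels, and the output sizes are assumptions for illustration only.

```python
# Sketch of the post-commit UI wiring; the stub below stands in for the real
# model and judge calls in app.py.
import gradio as gr

CRITERIA = "Relevance, Coherence and Completeness"  # now fixed in the prompt

def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b):
    # Placeholder logic: the real app queries both models, then a judge model.
    response_a = f"[stub response from {repo_a}/{model_a}]"
    response_b = f"[stub response from {repo_b}/{model_b}]"
    evaluation_results = (
        f"Evaluation Criteria: {CRITERIA}\n"
        f"Prompt: {prompt}\n"
        "Winner: draw (stub)"
    )
    return evaluation_results, response_a, response_b

with gr.Blocks(title="LLM as a Judge") as demo:
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
    repo_a_input = gr.Textbox(label="Repo A")   # label assumed
    model_a_input = gr.Textbox(label="Model A")  # label assumed
    repo_b_input = gr.Textbox(label="Repo B")   # label assumed
    model_b_input = gr.Textbox(label="Model B")  # label assumed

    evaluate_button = gr.Button("Evaluate Models")
    with gr.Row():
        with gr.Column():
            response_a = gr.Textbox(label="Response A", lines=20, interactive=False)
        with gr.Column():
            response_b = gr.Textbox(label="Response B", lines=20, interactive=False)

    gr.Markdown("### The LLMs are evaluated based on the criteria of Relevance, Coherence and Completeness.")
    evaluation_output = gr.Textbox(label="Evaluation Results", lines=10)

    # Five inputs now, matching the updated inputs list in this commit.
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input],
        outputs=[evaluation_output, response_a, response_b],
    )

if __name__ == "__main__":
    demo.launch()
```

Note that the hunk header suggests the original `evaluate_responses` signature still declares `evaluation_criteria`; the stub above instead matches the five-element `inputs` list that the click handler now passes.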