Kolumbus Lindh committed on
Commit 66cb564 · 1 Parent(s): 1de90bd
Files changed (1)
  1. app.py +34 -42
app.py CHANGED
@@ -2,11 +2,10 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# Load LoRA-4100 model for evaluation
+# Load the base LoRA evaluation model
 def load_lora_model():
     repo_id = "KolumbusLindh/LoRA-4100"
     model_file = "unsloth.F16.gguf"
-
     local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
     print(f"Loading LoRA model from: {local_path}")
     return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
@@ -14,84 +13,77 @@ def load_lora_model():
 lora_model = load_lora_model()
 print("LoRA model loaded successfully!")
 
-# Load user-specified model
+# Function to load a user-specified model
 def load_user_model(model_path):
     print(f"Loading user model from: {model_path}")
     return Llama(model_path=model_path, n_ctx=2048, n_threads=8)
 
-# Generate response using a specified model and prompt
-def generate_response(model_path, prompt):
-    user_model = load_user_model(model_path)
-    response = user_model(prompt, max_tokens=256, temperature=0.7)
+# Generate a response using the specified model and prompt
+def generate_response(model, prompt):
+    response = model(prompt, max_tokens=256, temperature=0.7)
     return response["choices"][0]["text"]
 
-# Evaluate responses using the LoRA model
+# Evaluate responses generated by two models using the LoRA model
 def evaluate_responses(prompt, model_a_path, model_b_path, evaluation_criteria):
+    # Load user-specified models
+    model_a = load_user_model(model_a_path)
+    model_b = load_user_model(model_b_path)
+
     # Generate responses
-    response_a = generate_response(model_a_path, prompt)
-    response_b = generate_response(model_b_path, prompt)
+    response_a = generate_response(model_a, prompt)
+    response_b = generate_response(model_b, prompt)
+
+    print(f"Response A: {response_a}")
+    print(f"Response B: {response_b}")
 
-    # Format the evaluation prompt
-    evaluation_prompt = [
-        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
-        {"role": "user", "content": f"""
+    # Format the evaluation prompt for the LoRA model
+    evaluation_prompt = f"""
 Prompt: {prompt}
 
 Response A: {response_a}
 Response B: {response_b}
 
-Please evaluate both responses based on the following criteria: {evaluation_criteria}
+Evaluation Criteria: {evaluation_criteria}
 
-For each criterion, provide a rating of the responses on a scale from 1 to 10, and explain why each response earned that rating. Then, declare a winner (or 'draw' if both are equal).
-"""}
-    ]
-
-    # Generate the evaluation
-    evaluation_response = lora_model.create_chat_completion(
-        messages=evaluation_prompt,
+Please evaluate the responses based on the criteria above. Rate each response on a scale from 1 to 10 for each criterion and provide a detailed explanation. Finally, declare a winner or state 'draw' if they are equal.
+"""
+    # Use the LoRA model to evaluate the responses
+    evaluation_response = lora_model.create_completion(
+        prompt=evaluation_prompt,
         max_tokens=512,
         temperature=0.5
     )
-
-    evaluation_results = evaluation_response['choices'][0]['message']['content']
-
-    return evaluation_results
+    return evaluation_response["choices"][0]["text"]
 
 # Gradio interface
 with gr.Blocks(title="LLM as a Judge") as demo:
     gr.Markdown("## LLM as a Judge 🧐")
 
-    # Inputs for model paths, prompt, and evaluation criteria
-    model_a_input = gr.Textbox(label="Model A Path or URL", placeholder="Enter the path or URL to Model A...")
-    model_b_input = gr.Textbox(label="Model B Path or URL", placeholder="Enter the path or URL to Model B...")
-    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)
-
-    # Dropdown for evaluation criteria
+    # User inputs for models, prompt, and evaluation criteria
+    model_a_input = gr.Textbox(label="Model A Path or URL", placeholder="Enter the path or URL for Model A...")
+    model_b_input = gr.Textbox(label="Model B Path or URL", placeholder="Enter the path or URL for Model B...")
+    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
     criteria_dropdown = gr.Dropdown(
         label="Select Evaluation Criteria",
         choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
         value="Clarity",
         type="value"
     )
-
-    # Button to evaluate responses
     evaluate_button = gr.Button("Evaluate Models")
-
-    # Output for evaluation results
     evaluation_output = gr.Textbox(
-        label="Evaluation Results",
-        placeholder="The evaluation results will appear here...",
-        lines=10,
+        label="Evaluation Results",
+        placeholder="The evaluation results will appear here...",
+        lines=10,
         interactive=False
     )
-
-    # Link evaluation function to the button
+
+    # Link the evaluation function to the button
     evaluate_button.click(
         fn=evaluate_responses,
         inputs=[prompt_input, model_a_input, model_b_input, criteria_dropdown],
         outputs=[evaluation_output]
     )
 
-# Launch the app
+# Launch the Gradio app
 if __name__ == "__main__":
     demo.launch()
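
For a quick sanity check of the refactored evaluate_responses (now a plain-text prompt passed to lora_model.create_completion instead of a chat-style message list), a minimal sketch is below. The two GGUF paths are placeholders, not files shipped with this repo, and importing app will also download and load the LoRA-4100 judge model from the Hub:

# Minimal local smoke test; assumes app.py is importable and the
# placeholder GGUF paths point to real local models you want to compare.
from app import evaluate_responses

result = evaluate_responses(
    prompt="Explain how to set up a Python virtual environment.",
    model_a_path="models/model-a.F16.gguf",  # placeholder path
    model_b_path="models/model-b.F16.gguf",  # placeholder path
    evaluation_criteria="Clarity"
)
print(result)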