Kolumbus Lindh committed on
Commit 66cb564 · 1 Parent(s): 1de90bd
Files changed (1)
  1. app.py +34 -42
app.py CHANGED
@@ -2,11 +2,10 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# Load LoRA-4100 model for evaluation
+# Load the base LoRA evaluation model
 def load_lora_model():
     repo_id = "KolumbusLindh/LoRA-4100"
     model_file = "unsloth.F16.gguf"
-
     local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
     print(f"Loading LoRA model from: {local_path}")
     return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
@@ -14,84 +13,77 @@ def load_lora_model():
 lora_model = load_lora_model()
 print("LoRA model loaded successfully!")
 
-# Load user-specified model
+# Function to load a user-specified model
 def load_user_model(model_path):
     print(f"Loading user model from: {model_path}")
     return Llama(model_path=model_path, n_ctx=2048, n_threads=8)
 
-# Generate response using a specified model and prompt
-def generate_response(model_path, prompt):
-    user_model = load_user_model(model_path)
-    response = user_model(prompt, max_tokens=256, temperature=0.7)
+# Generate a response using the specified model and prompt
+def generate_response(model, prompt):
+    response = model(prompt, max_tokens=256, temperature=0.7)
     return response["choices"][0]["text"]
 
-# Evaluate responses using the LoRA model
+# Evaluate responses generated by two models using the LoRA model
 def evaluate_responses(prompt, model_a_path, model_b_path, evaluation_criteria):
+    # Load user-specified models
+    model_a = load_user_model(model_a_path)
+    model_b = load_user_model(model_b_path)
+
     # Generate responses
-    response_a = generate_response(model_a_path, prompt)
-    response_b = generate_response(model_b_path, prompt)
+    response_a = generate_response(model_a, prompt)
+    response_b = generate_response(model_b, prompt)
+
+    print(f"Response A: {response_a}")
+    print(f"Response B: {response_b}")
 
-    # Format the evaluation prompt
-    evaluation_prompt = [
-        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
-        {"role": "user", "content": f"""
+    # Format the evaluation prompt for the LoRA model
+    evaluation_prompt = f"""
 Prompt: {prompt}
 
 Response A: {response_a}
 Response B: {response_b}
 
-Please evaluate both responses based on the following criteria: {evaluation_criteria}
+Evaluation Criteria: {evaluation_criteria}
 
-For each criterion, provide a rating of the responses on a scale from 1 to 10, and explain why each response earned that rating. Then, declare a winner (or 'draw' if both are equal).
-"""}
-    ]
-
-    # Generate the evaluation
-    evaluation_response = lora_model.create_chat_completion(
-        messages=evaluation_prompt,
+Please evaluate the responses based on the criteria above. Rate each response on a scale from 1 to 10 for each criterion and provide a detailed explanation. Finally, declare a winner or state 'draw' if they are equal.
+"""
+    # Use the LoRA model to evaluate the responses
+    evaluation_response = lora_model.create_completion(
+        prompt=evaluation_prompt,
         max_tokens=512,
         temperature=0.5
     )
-
-    evaluation_results = evaluation_response['choices'][0]['message']['content']
-
-    return evaluation_results
+    return evaluation_response["choices"][0]["text"]
 
 # Gradio interface
 with gr.Blocks(title="LLM as a Judge") as demo:
     gr.Markdown("## LLM as a Judge 🧐")
 
-    # Inputs for model paths, prompt, and evaluation criteria
-    model_a_input = gr.Textbox(label="Model A Path or URL", placeholder="Enter the path or URL to Model A...")
-    model_b_input = gr.Textbox(label="Model B Path or URL", placeholder="Enter the path or URL to Model B...")
-    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)
-
-    # Dropdown for evaluation criteria
+    # User inputs for models, prompt, and evaluation criteria
+    model_a_input = gr.Textbox(label="Model A Path or URL", placeholder="Enter the path or URL for Model A...")
+    model_b_input = gr.Textbox(label="Model B Path or URL", placeholder="Enter the path or URL for Model B...")
+    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
     criteria_dropdown = gr.Dropdown(
         label="Select Evaluation Criteria",
         choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
         value="Clarity",
         type="value"
     )
-
-    # Button to evaluate responses
     evaluate_button = gr.Button("Evaluate Models")
-
-    # Output for evaluation results
     evaluation_output = gr.Textbox(
-        label="Evaluation Results",
-        placeholder="The evaluation results will appear here...",
-        lines=10,
+        label="Evaluation Results",
+        placeholder="The evaluation results will appear here...",
+        lines=10,
         interactive=False
     )
-
-    # Link evaluation function to the button
+
+    # Link the evaluation function to the button
     evaluate_button.click(
         fn=evaluate_responses,
         inputs=[prompt_input, model_a_input, model_b_input, criteria_dropdown],
         outputs=[evaluation_output]
     )
 
-# Launch the app
+# Launch the Gradio app
 if __name__ == "__main__":
     demo.launch()
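
For a quick sanity check of the refactored evaluate_responses (now a plain-text prompt passed to lora_model.create_completion instead of a chat-style message list), a minimal sketch is below. The two GGUF paths are placeholders, not files shipped with this repo, and importing app will also download and load the LoRA-4100 judge model from the Hub:

# Minimal local smoke test; assumes app.py is importable and the
# placeholder GGUF paths point to real local models you want to compare.
from app import evaluate_responses

result = evaluate_responses(
    prompt="Explain how to set up a Python virtual environment.",
    model_a_path="models/model-a.F16.gguf",  # placeholder path
    model_b_path="models/model-b.F16.gguf",  # placeholder path
    evaluation_criteria="Clarity"
)
print(result)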