LLM-as-a-judge / app.py
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Load the base LoRA evaluation model
def load_lora_model():
repo_id = "KolumbusLindh/LoRA-4100"
model_file = "unsloth.F16.gguf"
local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
print(f"Loading LoRA model from: {local_path}")
return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
lora_model = load_lora_model()
print("LoRA model loaded successfully!")

# Function to load a user-specified model
def load_user_model(model_path):
print(f"Loading user model from: {model_path}")
return Llama(model_path=model_path, n_ctx=2048, n_threads=8)
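
# Optional helper (sketch, not part of the original app): llama_cpp only loads
# models from a local file path, so a Hugging Face repo reference would have to
# be downloaded first, as is done for the LoRA judge above. The repo_id and
# filename arguments here are illustrative placeholders.
def load_user_model_from_hub(repo_id, filename):
    local_path = hf_hub_download(repo_id=repo_id, filename=filename)
    print(f"Downloaded user model to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
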
# Generate a response using the specified model and prompt
def generate_response(model, prompt):
    response = model(prompt, max_tokens=256, temperature=0.7)
    return response["choices"][0]["text"]

# Evaluate responses generated by two models using the LoRA model
def evaluate_responses(prompt, model_a_path, model_b_path, evaluation_criteria):
    # Load user-specified models
    model_a = load_user_model(model_a_path)
    model_b = load_user_model(model_b_path)

    # Generate responses
    response_a = generate_response(model_a, prompt)
    response_b = generate_response(model_b, prompt)
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")

    # Format the evaluation prompt for the LoRA model
    evaluation_prompt = f"""
Prompt: {prompt}
Response A: {response_a}
Response B: {response_b}
Evaluation Criteria: {evaluation_criteria}
Please evaluate the responses based on the criteria above. Rate each response on a scale from 1 to 10 for each criterion and provide a detailed explanation. Finally, declare a winner or state 'draw' if they are equal.
"""

    # Use the LoRA model to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    return evaluation_response["choices"][0]["text"]
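
# Example (sketch): the judge can also be called directly, assuming two local
# GGUF files exist at the hypothetical paths below:
#
#   verdict = evaluate_responses(
#       prompt="Explain recursion in one short paragraph.",
#       model_a_path="/models/model-a.Q4_K_M.gguf",
#       model_b_path="/models/model-b.Q4_K_M.gguf",
#       evaluation_criteria="Clarity",
#   )
#   print(verdict)
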
# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
gr.Markdown("## LLM as a Judge 🧐")
# User inputs for models, prompt, and evaluation criteria
model_a_input = gr.Textbox(label="Model A Path or URL", placeholder="Enter the path or URL for Model A...")
model_b_input = gr.Textbox(label="Model B Path or URL", placeholder="Enter the path or URL for Model B...")
prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
criteria_dropdown = gr.Dropdown(
label="Select Evaluation Criteria",
choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
value="Clarity",
type="value"
)
evaluate_button = gr.Button("Evaluate Models")
evaluation_output = gr.Textbox(
label="Evaluation Results",
placeholder="The evaluation results will appear here...",
lines=10,
interactive=False
)
# Link the evaluation function to the button
evaluate_button.click(
fn=evaluate_responses,
inputs=[prompt_input, model_a_input, model_b_input, criteria_dropdown],
outputs=[evaluation_output]
)
# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
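
# Running `python app.py` serves the interface locally; on a Hugging Face Space
# this file is picked up automatically as the Gradio entry point.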