# LLM-as-a-judge / app.py
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Load a user-specified model
def load_user_model(repo_id, model_file):
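    """Download a GGUF model file from the Hugging Face Hub and load it with llama-cpp-python.

    hf_hub_download caches files locally, so repeated calls for the same file reuse the cached copy.
    """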
print(f"Downloading model {model_file} from repository {repo_id}...")
local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
print(f"Model downloaded to: {local_path}")
return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
# Generate a response using the specified model and prompt
def generate_response(model, prompt):
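    """Run a single completion and return the generated text.

    llama-cpp-python returns an OpenAI-style completion dict; the text is in choices[0]["text"].
    """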
    response = model(prompt, max_tokens=512, temperature=0.5, top_p=0.95)
    return response["choices"][0]["text"]

# Evaluate responses using the LoRA evaluation model
def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_criteria):
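    """Generate responses from both user-specified models and score them with the LoRA judge model.

    Returns the two responses and the judge's evaluation text for display in the Gradio UI.
    """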
    if len(evaluation_criteria) > 3:
        return "Error: Please select up to 3 evaluation criteria only.", "", ""

    # Load models
    model_a_instance = load_user_model(repo_a, model_a)
    model_b_instance = load_user_model(repo_b, model_b)

    # Generate responses
    response_a = generate_response(model_a_instance, prompt)
    response_b = generate_response(model_b_instance, prompt)

    # Display generated responses
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")

    # Format the evaluation prompt
    criteria_list = ", ".join(evaluation_criteria)
    evaluation_prompt = f"""
Prompt: {prompt}
Response A: {response_a}
Response B: {response_b}
Evaluation Criteria: {criteria_list}
Please evaluate the responses based on the selected criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
"""

    # Use the LoRA model to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5,
        top_p=0.95,
    )
    evaluation_results = evaluation_response["choices"][0]["text"]

    return response_a, response_b, evaluation_results

# Load the LoRA evaluation model
def load_lora_model():
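    """Download and load the fixed LoRA judge model (KolumbusLindh/LoRA-6150)."""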
repo_id = "KolumbusLindh/LoRA-6150"
model_file = "unsloth.F16.gguf"
print(f"Downloading LoRA evaluation model from repository {repo_id}...")
local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
print(f"LoRA evaluation model downloaded to: {local_path}")
return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
lora_model = load_lora_model()
print("LoRA evaluation model loaded successfully!")
# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
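    # Inputs: two candidate models (repo + GGUF file name), a prompt, and up to three evaluation criteria.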
    gr.Markdown("## LLM as a Judge 𐄷")

    # Model inputs
    repo_a_input = gr.Textbox(label="Model A Repository", placeholder="KolumbusLindh/LoRA-6150")
    model_a_input = gr.Textbox(label="Model A File Name", placeholder="unsloth.F16.gguf")
    repo_b_input = gr.Textbox(label="Model B Repository", placeholder="forestav/LoRA-2000")
    model_b_input = gr.Textbox(label="Model B File Name", placeholder="unsloth.F16.gguf")

    # Prompt and criteria inputs
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
    criteria_dropdown = gr.CheckboxGroup(
        label="Select Evaluation Criteria (Max 3)",
        choices=["Clarity", "Completeness", "Accuracy"],  # Restricted criteria
    )

    # Button and outputs
    evaluate_button = gr.Button("Evaluate Models")
    response_a_output = gr.Textbox(
        label="Response A",
        placeholder="Response from Model A will appear here...",
        lines=10,
        interactive=False,
    )
    response_b_output = gr.Textbox(
        label="Response B",
        placeholder="Response from Model B will appear here...",
        lines=10,
        interactive=False,
    )
    evaluation_output = gr.Textbox(
        label="Evaluation Results",
        placeholder="The evaluation analysis will appear here...",
        lines=20,
        interactive=False,
    )

    # Link the evaluation function to the button
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input, criteria_dropdown],
        outputs=[response_a_output, response_b_output, evaluation_output],
    )
# Launch app
if __name__ == "__main__":
    demo.launch()