import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Load LoRA-4100 model for evaluation
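# The judge model (KolumbusLindh/LoRA-4100) is fetched from the Hugging Face Hub and cached
# locally by hf_hub_download, then loaded with llama-cpp-python.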
def load_lora_model():
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Loading LoRA model from: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

lora_model = load_lora_model()
print("LoRA model loaded successfully!")

# Load user-specified model
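# NOTE: model_path must point to a local GGUF file; llama-cpp-python cannot load a model
# directly from a URL, so a Hub-hosted model would first need to be downloaded
# (e.g. with hf_hub_download).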
def load_user_model(model_path):
    print(f"Loading user model from: {model_path}")
    return Llama(model_path=model_path, n_ctx=2048, n_threads=8)

# Generate response using a specified model and prompt
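# NOTE: the model is reloaded from disk on every call; if the same model is evaluated
# repeatedly, a simple cache keyed by model_path could avoid the reload cost.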
def generate_response(model_path, prompt):
    user_model = load_user_model(model_path)
    response = user_model(prompt, max_tokens=256, temperature=0.7)
    return response["choices"][0]["text"]

# Evaluate responses using the LoRA model
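# Both candidate responses are packed into a single chat-style prompt so the judge model
# can score them side by side and return a free-text verdict.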
def evaluate_responses(prompt, model_a_path, model_b_path, evaluation_criteria):
    # Generate responses
    response_a = generate_response(model_a_path, prompt)
    response_b = generate_response(model_b_path, prompt)
    
    # Format the evaluation prompt
    evaluation_prompt = [
        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
        {"role": "user", "content": f"""
Prompt: {prompt}

Response A: {response_a}
Response B: {response_b}

Please evaluate both responses based on the following criteria: {evaluation_criteria}

For each criterion, rate each response on a scale from 1 to 10 and explain why it earned that rating. Then declare a winner (or 'draw' if both responses are equal).
"""}
    ]
    
    # Generate the evaluation
    evaluation_response = lora_model.create_chat_completion(
        messages=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    
    evaluation_results = evaluation_response['choices'][0]['message']['content']
    
    return evaluation_results

# Gradio interface
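# The UI collects the two model paths, the prompt, and one evaluation criterion,
# then displays the judge's verdict in a read-only textbox.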
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")

    # Inputs for model paths, prompt, and evaluation criteria
    model_a_input = gr.Textbox(label="Model A Path", placeholder="Enter the local path to Model A's GGUF file...")
    model_b_input = gr.Textbox(label="Model B Path", placeholder="Enter the local path to Model B's GGUF file...")
    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)
    
    # Dropdown for evaluation criteria
    criteria_dropdown = gr.Dropdown(
        label="Select Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
        value="Clarity",
        type="value"
    )
    
    # Button to evaluate responses
    evaluate_button = gr.Button("Evaluate Models")

    # Output for evaluation results
    evaluation_output = gr.Textbox(
        label="Evaluation Results", 
        placeholder="The evaluation results will appear here...", 
        lines=10, 
        interactive=False
    )
    
    # Link evaluation function to the button
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, model_a_input, model_b_input, criteria_dropdown],
        outputs=[evaluation_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()