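"""LLM-as-a-Judge demo.

Loads a LoRA-fine-tuned evaluation model (KolumbusLindh/LoRA-4100, GGUF)
with llama-cpp-python, generates a response to the same prompt from two
user-supplied GGUF models, and asks the evaluation model to compare the
two responses against a selected criterion. Served through a Gradio UI.
"""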
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Load the base LoRA evaluation model from the Hugging Face Hub
def load_lora_model():
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Loading LoRA model from: {local_path}")
    # 2,048-token context window, 8 CPU threads
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

lora_model = load_lora_model()
print("LoRA model loaded successfully!")

# Function to load a user-specified model (llama-cpp-python requires a
# local GGUF file path here)
def load_user_model(model_path):
    print(f"Loading user model from: {model_path}")
    return Llama(model_path=model_path, n_ctx=2048, n_threads=8)
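
import os  # used only by the optional helper sketch below

# A possible extension (hypothetical helper, not wired into the UI below):
# resolve a "repo_id/filename" spec to a local file before handing it to
# Llama(), since Llama() only accepts local GGUF paths, not URLs or repo IDs.
def resolve_model_path(spec):
    if os.path.exists(spec) or "/" not in spec:
        return spec  # already a local path (or nothing to resolve)
    repo_id, filename = spec.rsplit("/", 1)
    return hf_hub_download(repo_id=repo_id, filename=filename)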

# Generate a response using the specified model and prompt
def generate_response(model, prompt):
    response = model(prompt, max_tokens=256, temperature=0.7)
    return response["choices"][0]["text"]

# Evaluate responses generated by two models using the LoRA model
def evaluate_responses(prompt, model_a_path, model_b_path, evaluation_criteria):
    # Load the user-specified models; surface loading errors in the UI
    # instead of crashing the request. Note that both models are reloaded
    # on every click, so large GGUF files make this step slow.
    try:
        model_a = load_user_model(model_a_path)
        model_b = load_user_model(model_b_path)
    except Exception as e:
        return f"Failed to load a model: {e}"
    
    # Generate responses
    response_a = generate_response(model_a, prompt)
    response_b = generate_response(model_b, prompt)
    
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")
    
    # Format the evaluation prompt for the LoRA model
    evaluation_prompt = f"""
Prompt: {prompt}

Response A: {response_a}
Response B: {response_b}

Evaluation Criterion: {evaluation_criteria}

Please evaluate both responses against the criterion above. Rate each response on a scale from 1 to 10 and provide a detailed explanation. Finally, declare a winner, or state 'draw' if they are equal.
"""
    # Use the LoRA model to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    return evaluation_response["choices"][0]["text"]

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")

    # User inputs for models, prompt, and evaluation criteria
    model_a_input = gr.Textbox(label="Model A Path", placeholder="Enter the local path to a GGUF file for Model A...")
    model_b_input = gr.Textbox(label="Model B Path", placeholder="Enter the local path to a GGUF file for Model B...")
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
    criteria_dropdown = gr.Dropdown(
        label="Select Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
        value="Clarity",
        type="value"
    )
    evaluate_button = gr.Button("Evaluate Models")
    evaluation_output = gr.Textbox(
        label="Evaluation Results",
        placeholder="The evaluation results will appear here...",
        lines=10,
        interactive=False
    )

    # Link the evaluation function to the button
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, model_a_input, model_b_input, criteria_dropdown],
        outputs=[evaluation_output]
    )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
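
# To try the app locally (example commands; the model paths are placeholders):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py
# Then enter local GGUF paths (e.g. /models/model-a.Q4_K_M.gguf) in the UI.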