import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Load the model
def load_model():
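    """Download the GGUF judge model from the Hugging Face Hub and load it with llama-cpp-python."""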
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Loading model from: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

print("Starting model loading...")
model = load_model()
print("Model loaded successfully!")

# Function to evaluate two responses
def evaluate_responses(prompt, response_a, response_b, evaluation_criteria):
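    """Ask the judge model to compare Response A and Response B against the chosen criterion."""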
    # Format the evaluation prompt
    evaluation_prompt = [
        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
        {"role": "user", "content": f"""
Prompt: {prompt}

Response A: {response_a}
Response B: {response_b}

Please evaluate both responses against the following criterion: {evaluation_criteria}

Rate each response on a scale from 1 to 10 for this criterion and explain why it earned that rating. Then declare a winner (or 'draw' if the responses are equal).
"""}
    ]
    
    # Generate the evaluation
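    # max_tokens caps the length of the judgement; a moderate temperature (0.5)
    # keeps the verdict reasonably consistent across runs.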
    evaluation_response = model.create_chat_completion(
        messages=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    
    evaluation_results = evaluation_response['choices'][0]['message']['content']
    
    return evaluation_results

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")

    # Input fields for the prompt, two responses, and selection of criteria
    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)
    response_a_input = gr.Textbox(label="Response A", placeholder="Enter Response A here...", lines=5)
    response_b_input = gr.Textbox(label="Response B", placeholder="Enter Response B here...", lines=5)
    
    # Dropdown for selecting evaluation criteria
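    # type="value" passes the selected label string (rather than its index) to the handler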
    criteria_dropdown = gr.Dropdown(
        label="Select Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
        value="Clarity",
        type="value"
    )
    
    # Button to start the evaluation
    evaluate_button = gr.Button("Evaluate Responses")

    # Textbox for displaying the evaluation results
    evaluation_output = gr.Textbox(
        label="Evaluation Results", 
        placeholder="The evaluation results will appear here...", 
        lines=10, 
        interactive=False
    )
    
    # Link evaluation function to the button
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, response_a_input, response_b_input, criteria_dropdown],
        outputs=[evaluation_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
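
# To run locally (assumed setup): install the dependencies with
#   pip install gradio llama-cpp-python huggingface_hub
# and then start the app with `python app.py`.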