import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Load the model
def load_model():
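    """Download the GGUF judge model from the Hugging Face Hub and load it with llama-cpp-python."""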
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Loading model from: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

print("Starting model loading...")
model = load_model()
print("Model loaded successfully!")

# Function to evaluate two responses
def evaluate_responses(prompt, response_a, response_b, evaluation_criteria):
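    """Ask the judge model to compare Response A and Response B against the chosen criterion."""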
    # Format the evaluation prompt
    evaluation_prompt = [
        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
        {"role": "user", "content": f"""
Prompt: {prompt}

Response A: {response_a}
Response B: {response_b}

Please evaluate both responses against the following criterion: {evaluation_criteria}

Rate each response on a scale from 1 to 10 for this criterion and explain why it earned that rating. Then declare a winner (or 'draw' if the responses are equal).
"""}
    ]
    
    # Generate the evaluation
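    # max_tokens caps the length of the judgement; a moderate temperature (0.5)
    # keeps the verdict reasonably consistent across runs.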
    evaluation_response = model.create_chat_completion(
        messages=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    
    evaluation_results = evaluation_response['choices'][0]['message']['content']
    
    return evaluation_results

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")

    # Input fields for the prompt, two responses, and selection of criteria
    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)
    response_a_input = gr.Textbox(label="Response A", placeholder="Enter Response A here...", lines=5)
    response_b_input = gr.Textbox(label="Response B", placeholder="Enter Response B here...", lines=5)
    
    # Dropdown for selecting evaluation criteria
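    # type="value" passes the selected label string (rather than its index) to the handler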
    criteria_dropdown = gr.Dropdown(
        label="Select Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
        value="Clarity",
        type="value"
    )
    
    # Button to start the evaluation
    evaluate_button = gr.Button("Evaluate Responses")

    # Textbox for displaying the evaluation results
    evaluation_output = gr.Textbox(
        label="Evaluation Results", 
        placeholder="The evaluation results will appear here...", 
        lines=10, 
        interactive=False
    )
    
    # Link evaluation function to the button
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, response_a_input, response_b_input, criteria_dropdown],
        outputs=[evaluation_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
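
# To run locally (assumed setup): install the dependencies with
#   pip install gradio llama-cpp-python huggingface_hub
# and then start the app with `python app.py`.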