File size: 4,236 Bytes
04d9cf4
 
 
 
8f23865
 
 
04d9cf4
8f23865
04d9cf4
 
66cb564
 
 
1de90bd
 
8f23865
 
66cb564
8f23865
 
66cb564
1de90bd
8f23865
 
66cb564
 
 
1de90bd
66cb564
 
ec08b2a
 
 
 
 
66cb564
ec08b2a
66cb564
 
 
 
 
ec08b2a
 
04d9cf4
66cb564
04d9cf4
8f23865
 
 
 
 
 
 
 
 
 
 
 
04d9cf4
 
 
 
8f23865
 
 
 
 
 
 
 
 
66cb564
ec08b2a
 
 
 
 
04d9cf4
8f23865
 
1de90bd
8f23865
 
04d9cf4
66cb564
 
 
04d9cf4
 
66cb564
 
ec08b2a
 
8f23865
ec08b2a
04d9cf4
 
66cb564
ec08b2a
8f23865
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Function to load a user-specified model from Hugging Face
def load_user_model(repo_id, model_file, n_ctx=2048, n_threads=8):
    """Download a model file from the Hugging Face Hub and load it with Llama.

    Args:
        repo_id: Hugging Face repository name, e.g. "KolumbusLindh/LoRA-4100".
        model_file: Name of the model file inside that repository,
            e.g. "unsloth.F16.gguf".
        n_ctx: Value forwarded to ``Llama(n_ctx=...)``. Defaults to 2048.
        n_threads: Value forwarded to ``Llama(n_threads=...)``. Defaults to 8.

    Returns:
        The ``Llama`` instance created from the downloaded file.

    Raises:
        ValueError: If ``repo_id`` or ``model_file`` is missing or blank.
    """
    # Both values arrive from free-text UI fields, so tolerate stray
    # whitespace and reject blanks before attempting any download.
    repo_id = (repo_id or "").strip()
    model_file = (model_file or "").strip()
    if not repo_id:
        raise ValueError("A Hugging Face repository name is required.")
    if not model_file:
        raise ValueError("A model file name is required.")

    print(f"Downloading model {model_file} from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=n_ctx, n_threads=n_threads)

# Generate a response using the specified model and prompt
def generate_response(model, prompt, max_tokens=256, temperature=0.7):
    """Run one completion on ``model`` and return the generated text.

    Args:
        model: A callable invoked as ``model(prompt, max_tokens=...,
            temperature=...)`` that returns a mapping with a ``"choices"``
            list whose entries carry a ``"text"`` field.
        prompt: The prompt passed to the model as-is.
        max_tokens: Forwarded to the model call. Defaults to 256.
        temperature: Forwarded to the model call. Defaults to 0.7.

    Returns:
        The ``"text"`` of the first entry in the result's ``"choices"``.
    """
    completion = model(prompt, max_tokens=max_tokens, temperature=temperature)
    # Only the first choice is used.
    return completion["choices"][0]["text"]

# Evaluate responses generated by two models using the LoRA evaluation model
def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_criteria):
    """Generate answers from two models and have the judge model compare them.

    Args:
        prompt: The prompt sent to both candidate models.
        repo_a: Hugging Face repository name for model A.
        model_a: Model file name for model A.
        repo_b: Hugging Face repository name for model B.
        model_b: Model file name for model B.
        evaluation_criteria: Criteria text inserted into the judge prompt.

    Returns:
        The text of the first choice returned by the module-level
        ``lora_model`` for the evaluation prompt.
    """
    # Load user-specified models. Both are loaded before any generation so a
    # bad repository or file name fails before time is spent generating.
    model_a_instance = load_user_model(repo_a, model_a)
    if (repo_b, model_b) == (repo_a, model_a):
        # Same repository and file on both sides: reuse the loaded instance
        # rather than downloading and loading an identical model twice.
        model_b_instance = model_a_instance
    else:
        model_b_instance = load_user_model(repo_b, model_b)

    # Generate responses
    response_a = generate_response(model_a_instance, prompt)
    response_b = generate_response(model_b_instance, prompt)

    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")

    # Format the evaluation prompt for the LoRA model
    evaluation_prompt = f"""
Prompt: {prompt}

Response A: {response_a}
Response B: {response_b}

Evaluation Criteria: {evaluation_criteria}

Please evaluate the responses based on the criteria above. Rate each response on a scale from 1 to 10 for each criterion and provide a detailed explanation. Finally, declare a winner or state 'draw' if they are equal.
"""
    # Use the LoRA model to evaluate the responses. `lora_model` is a
    # module-level global that is looked up when this function is called.
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    return evaluation_response["choices"][0]["text"]

# Load the base LoRA evaluation model
def load_lora_model():
    """Download the fixed LoRA evaluation model and return it as a Llama instance.

    The repository ("KolumbusLindh/LoRA-4100") and file name
    ("unsloth.F16.gguf") are hard-coded. Progress is reported with ``print``.

    Returns:
        The ``Llama`` instance created from the downloaded file, configured
        with ``n_ctx=2048`` and ``n_threads=8``.
    """
    model_file = "unsloth.F16.gguf"
    repo_id = "KolumbusLindh/LoRA-4100"

    print(f"Downloading LoRA evaluation model from repository {repo_id}...")
    gguf_path = hf_hub_download(filename=model_file, repo_id=repo_id)
    print(f"LoRA evaluation model downloaded to: {gguf_path}")

    judge = Llama(n_threads=8, n_ctx=2048, model_path=gguf_path)
    return judge

# The judge model is loaded once, when this module is executed, and kept in
# the module-level name `lora_model`, which evaluate_responses() reads on
# every call.
lora_model = load_lora_model()
print("LoRA evaluation model loaded successfully!")

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")

    # Model A: Hugging Face repository and model file name
    repo_a_input = gr.Textbox(
        placeholder="Enter the Hugging Face repo name for Model A...",
        label="Model A Repository (e.g., KolumbusLindh/LoRA-4100)",
    )
    model_a_input = gr.Textbox(
        placeholder="Enter the model filename for Model A...",
        label="Model A File Name (e.g., unsloth.F16.gguf)",
    )

    # Model B: Hugging Face repository and model file name
    repo_b_input = gr.Textbox(
        placeholder="Enter the Hugging Face repo name for Model B...",
        label="Model B Repository (e.g., KolumbusLindh/LoRA-4100)",
    )
    model_b_input = gr.Textbox(
        placeholder="Enter the model filename for Model B...",
        label="Model B File Name (e.g., unsloth.F16.gguf)",
    )

    # Prompt sent to both models, and the criterion the judge scores against
    prompt_input = gr.Textbox(
        lines=3,
        placeholder="Enter the prompt here...",
        label="Enter Prompt",
    )
    criteria_dropdown = gr.Dropdown(
        type="value",
        value="Clarity",
        choices=[
            "Clarity",
            "Completeness",
            "Accuracy",
            "Relevance",
            "User-Friendliness",
            "Depth",
            "Creativity",
        ],
        label="Select Evaluation Criteria",
    )

    # Trigger for the evaluation
    evaluate_button = gr.Button("Evaluate Models")

    # Read-only box that receives the judge's text
    evaluation_output = gr.Textbox(
        interactive=False,
        lines=10,
        placeholder="The evaluation results will appear here...",
        label="Evaluation Results",
    )

    # Wire the button to evaluate_responses; the input order must match that
    # function's parameter order (prompt, repo A, file A, repo B, file B, criteria).
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[
            prompt_input,
            repo_a_input,
            model_a_input,
            repo_b_input,
            model_b_input,
            criteria_dropdown,
        ],
        outputs=[evaluation_output],
    )

# Launch the Gradio app only when this file is executed as a script;
# importing the module builds `demo` but does not start the server.
if __name__ == "__main__":
    demo.launch()  # Add share=True to create a public link