import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Load a user-specified model
def load_user_model(repo_id, model_file):
    print(f"Downloading model {model_file} from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
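# Note (optional tweak, not part of the original app): llama-cpp-python's Llama also
# accepts an n_gpu_layers argument, e.g. Llama(model_path=local_path, n_ctx=2048,
# n_threads=8, n_gpu_layers=-1), to offload layers to the GPU when a CUDA/Metal build
# is installed; the CPU-only settings above work as written.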

# Generate a response using the specified model and prompt
def generate_response(model, prompt):
    response = model(prompt, max_tokens=512, temperature=0.5)
    return response["choices"][0]["text"]

# Evaluate responses using the LoRA evaluation model
def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b):

    # Load models
    model_a_instance = load_user_model(repo_a, model_a)
    model_b_instance = load_user_model(repo_b, model_b)
    
    # Generate responses
    response_a = generate_response(model_a_instance, prompt)
    response_b = generate_response(model_b_instance, prompt)
    
    # Display generated responses
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")
    
    # Format the evaluation prompt
    evaluation_prompt = f"""
Prompt: {prompt}

Response A: {response_a}
Response B: {response_b}

Evaluation Criteria: Relevance, Coherence and Completeness

Please evaluate the responses based on these criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
"""
    # Use the LoRA model to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    evaluation_results = evaluation_response["choices"][0]["text"]
    
    # Return the judge's verdict alongside both raw responses
    return evaluation_results, response_a, response_b

# Load the LoRA evaluation model
def load_lora_model():
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    print(f"Downloading LoRA evaluation model from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"LoRA evaluation model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

# Load the judge model once at startup; evaluate_responses reuses this global instance
lora_model = load_lora_model()
print("LoRA evaluation model loaded successfully!")

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")
    gr.Markdown("Welcome to the LLM as a Judge demo! This application uses the LoRA model to evaluate responses generated by two different models based on user-specified criteria. You can select up to 3 evaluation criteria and provide a prompt to generate responses from the models. The LoRA model will then evaluate the responses based on the selected criteria and determine the winner.")

    # Model inputs
    repo_a_input = gr.Textbox(label="Model A Repository", placeholder="Enter the Hugging Face repo name for Model A...", value="forestav/LoRA-2000")
    model_a_input = gr.Textbox(label="Model A File Name", placeholder="Enter the model filename for Model A...", value="unsloth.F16.gguf")
    repo_b_input = gr.Textbox(label="Model B Repository", placeholder="Enter the Hugging Face repo name for Model B...", value="KolumbusLindh/LoRA-4100")
    model_b_input = gr.Textbox(label="Model B File Name", placeholder="Enter the model filename for Model B...", value="unsloth.F16.gguf")

    # Prompt input
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)

    # Button and outputs
    evaluate_button = gr.Button("Evaluate Models")

    with gr.Row():
        with gr.Column():
            response_a = gr.Textbox(
                label="Response A",
                placeholder="The response from Model A will appear here...",
                lines=20,
                interactive=False
            )

        with gr.Column():
            response_b = gr.Textbox(
                label="Response B",
                placeholder="The response from Model B will appear here...",
                lines=20,
                interactive=False
            )

    gr.Markdown("### The LLMs are evaluated based on the criterion of Relevance, Coherence and Completeness.")
    evaluation_output = gr.Textbox(
        label="Evaluation Results",
        placeholder="The evaluation results will appear here...",
        lines=20,
        interactive=False
    )

    # Link evaluation function
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input],
        outputs=[evaluation_output, response_a, response_b]
    )

# Launch app
if __name__ == "__main__":
    demo.launch()
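
# Programmatic usage sketch (an assumption, not part of the original interface): the
# evaluation can also be run without the UI, using the same default models shown above:
#
#   verdict, resp_a, resp_b = evaluate_responses(
#       "Explain recursion in one short paragraph.",
#       "forestav/LoRA-2000", "unsloth.F16.gguf",
#       "KolumbusLindh/LoRA-4100", "unsloth.F16.gguf",
#   )
#   print(verdict)
#
# To expose a temporary public link, demo.launch(share=True) could be used instead.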