import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
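
# Overview: the user supplies two GGUF models (repo + filename) from the Hugging Face Hub;
# both models answer the same prompt via llama.cpp, and a LoRA-fine-tuned judge model
# (KolumbusLindh/LoRA-4100) scores the two responses against up to three user-selected
# criteria and declares a winner.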

# Download a user-specified GGUF model from the Hugging Face Hub and load it with llama.cpp
def load_user_model(repo_id, model_file):
    print(f"Downloading model {model_file} from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

# Generate a response using the specified model and prompt
def generate_response(model, prompt):
    response = model(prompt, max_tokens=512, temperature=0.5)
    return response["choices"][0]["text"]

# Evaluate responses using the LoRA evaluation model
def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_criteria):
    if len(evaluation_criteria) > 3:
        # Return three values so all three Gradio output boxes receive a value
        return "Error: Please select up to 3 evaluation criteria only.", "", ""

    # Load models
    model_a_instance = load_user_model(repo_a, model_a)
    model_b_instance = load_user_model(repo_b, model_b)
    
    # Generate responses
    response_a = generate_response(model_a_instance, prompt)
    response_b = generate_response(model_b_instance, prompt)
    
    # Display generated responses
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")
    
    # Format the evaluation prompt
    criteria_list = ", ".join(evaluation_criteria)
    evaluation_prompt = f"""
Prompt: {prompt}

Response A: {response_a}
Response B: {response_b}

Evaluation Criteria: {criteria_list}

Please evaluate the responses based on the selected criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
"""
    # Ask the LoRA judge model (the global lora_model loaded below) to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    evaluation_results = evaluation_response["choices"][0]["text"]
    
    # Combine results for display
    final_output = f"Evaluation Results:\n{evaluation_results}"
    return final_output, response_a, response_b

# Load the LoRA evaluation (judge) model; this runs at import time, so the global
# lora_model is available by the time evaluate_responses is called.
def load_lora_model():
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    print(f"Downloading LoRA evaluation model from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"LoRA evaluation model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

lora_model = load_lora_model()
print("LoRA evaluation model loaded successfully!")

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")
    gr.Markdown("Welcome to the LLM as a Judge demo! This application uses the LoRA model to evaluate responses generated by two different models based on user-specified criteria. You can select up to 3 evaluation criteria and provide a prompt to generate responses from the models. The LoRA model will then evaluate the responses based on the selected criteria and determine the winner.")

    # Model inputs
    repo_a_input = gr.Textbox(label="Model A Repository", placeholder="Enter the Hugging Face repo name for Model A...", value="forestav/gguf_lora_model")
    model_a_input = gr.Textbox(label="Model A File Name", placeholder="Enter the model filename for Model A...", value="unsloth.F16.gguf")
    repo_b_input = gr.Textbox(label="Model B Repository", placeholder="Enter the Hugging Face repo name for Model B...", value="KolumbusLindh/LoRA-4100")
    model_b_input = gr.Textbox(label="Model B File Name", placeholder="Enter the model filename for Model B...", value="unsloth.F16.gguf")

    # Prompt and criteria inputs
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
    criteria_dropdown = gr.CheckboxGroup(
        label="Select Up to 3 Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"]
    )

    # Button and outputs
    evaluate_button = gr.Button("Evaluate Models")

    with gr.Row():
        with gr.Column():
            response_a = gr.Textbox(
                label="Response A",
                placeholder="The response for Model A will appear here...",
                lines=20,
                interactive=False
            )

        with gr.Column():
            response_b = gr.Textbox(
                label="Response B",
                placeholder="The response for Model B will appear here...",
                lines=20,
                interactive=False
            )

    evaluation_output = gr.Textbox(
        label="Evaluation Results",
        placeholder="The evaluation results will appear here...",
        lines=20,
        interactive=False
    )

    # Link evaluation function
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input, criteria_dropdown],
        outputs=[evaluation_output, response_a, response_b]
    )

# Launch app
if __name__ == "__main__":
    demo.launch()
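
# To try this locally (assuming this file is saved as, e.g., app.py):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py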