import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
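
# LLM-as-a-judge demo: download two user-specified GGUF models from the Hugging Face Hub,
# generate a response from each for the same prompt, then ask a LoRA-finetuned judge model
# to compare the responses against the selected evaluation criteria.
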
# Load a user-specified model
def load_user_model(repo_id, model_file):
print(f"Downloading model {model_file} from repository {repo_id}...")
local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
print(f"Model downloaded to: {local_path}")
return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
# Generate a response using the specified model and prompt
def generate_response(model, prompt):
    response = model(prompt, max_tokens=512, temperature=0.5, top_p=0.95)
    return response["choices"][0]["text"]

# Evaluate responses using the LoRA evaluation model
def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_criteria):
    if len(evaluation_criteria) > 3:
        return "Error: Please select up to 3 evaluation criteria only.", "", ""

    # Load models
    model_a_instance = load_user_model(repo_a, model_a)
    model_b_instance = load_user_model(repo_b, model_b)

    # Generate responses
    response_a = generate_response(model_a_instance, prompt)
    response_b = generate_response(model_b_instance, prompt)

    # Display generated responses
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")

    # Format the evaluation prompt
    criteria_list = ", ".join(evaluation_criteria)
    evaluation_prompt = f"""
Prompt: {prompt}
Response A: {response_a}
Response B: {response_b}
Evaluation Criteria: {criteria_list}
Please evaluate the responses based on the selected criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
"""
    # Use the LoRA model to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5,
        top_p=0.95,
    )
    evaluation_results = evaluation_response["choices"][0]["text"]
    return response_a, response_b, evaluation_results

# Load the LoRA evaluation model
def load_lora_model():
repo_id = "KolumbusLindh/LoRA-6150"
model_file = "unsloth.F16.gguf"
print(f"Downloading LoRA evaluation model from repository {repo_id}...")
local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
print(f"LoRA evaluation model downloaded to: {local_path}")
return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
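
# Load the judge model once at startup so it can be reused across evaluation requests.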
lora_model = load_lora_model()
print("LoRA evaluation model loaded successfully!")
# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
gr.Markdown("## LLM as a Judge 𐄷")
# Model inputs
repo_a_input = gr.Textbox(label="Model A Repository", placeholder="KolumbusLindh/LoRA-6150")
model_a_input = gr.Textbox(label="Model A File Name", placeholder="unsloth.F16.gguf")
repo_b_input = gr.Textbox(label="Model B Repository", placeholder="forestav/LoRA-2000")
model_b_input = gr.Textbox(label="Model B File Name", placeholder="unsloth.F16.gguf")
# Prompt and criteria inputs
prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
criteria_dropdown = gr.CheckboxGroup(
label="Select Evaluation Criteria (Max 3)",
choices=["Clarity", "Completeness", "Accuracy"] # Restricted criteria
)
# Button and outputs
evaluate_button = gr.Button("Evaluate Models")
response_a_output = gr.Textbox(
label="Response A",
placeholder="Response from Model A will appear here...",
lines=10,
interactive=False
)
response_b_output = gr.Textbox(
label="Response B",
placeholder="Response from Model B will appear here...",
lines=10,
interactive=False
)
evaluation_output = gr.Textbox(
label="Evaluation Results",
placeholder="The evaluation analysis will appear here...",
lines=20,
interactive=False
)
# Link evaluation function
evaluate_button.click(
fn=evaluate_responses,
inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input, criteria_dropdown],
outputs=[response_a_output, response_b_output, evaluation_output]
)
# Launch app
if __name__ == "__main__":
    demo.launch()