import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
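
# LLM-as-a-judge demo: download two user-specified GGUF models from the Hugging Face Hub,
# generate a response from each for the same prompt, then ask a LoRA-finetuned judge model
# to compare the responses against the selected evaluation criteria.
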
# Load a user-specified model
def load_user_model(repo_id, model_file):
print(f"Downloading model {model_file} from repository {repo_id}...")
local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
print(f"Model downloaded to: {local_path}")
return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
# Generate a response using the specified model and prompt
def generate_response(model, prompt):
    response = model(prompt, max_tokens=512, temperature=0.5, top_p=0.95)
    return response["choices"][0]["text"]

# Evaluate responses using the LoRA evaluation model
def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_criteria):
    if len(evaluation_criteria) > 3:
        return "Error: Please select up to 3 evaluation criteria only.", "", ""

    # Load models
    model_a_instance = load_user_model(repo_a, model_a)
    model_b_instance = load_user_model(repo_b, model_b)

    # Generate responses
    response_a = generate_response(model_a_instance, prompt)
    response_b = generate_response(model_b_instance, prompt)

    # Display generated responses
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")

    # Format the evaluation prompt
    criteria_list = ", ".join(evaluation_criteria)
    evaluation_prompt = f"""
Prompt: {prompt}
Response A: {response_a}
Response B: {response_b}
Evaluation Criteria: {criteria_list}
Please evaluate the responses based on the selected criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
"""
    # Use the LoRA model to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5,
        top_p=0.95,
    )
    evaluation_results = evaluation_response["choices"][0]["text"]
    return response_a, response_b, evaluation_results

# Load the LoRA evaluation model
def load_lora_model():
repo_id = "KolumbusLindh/LoRA-6150"
model_file = "unsloth.F16.gguf"
print(f"Downloading LoRA evaluation model from repository {repo_id}...")
local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
print(f"LoRA evaluation model downloaded to: {local_path}")
return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
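
# Load the judge model once at startup so it can be reused across evaluation requests.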
lora_model = load_lora_model()
print("LoRA evaluation model loaded successfully!")
# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
gr.Markdown("## LLM as a Judge 𐄷")
# Model inputs
repo_a_input = gr.Textbox(label="Model A Repository", placeholder="KolumbusLindh/LoRA-6150")
model_a_input = gr.Textbox(label="Model A File Name", placeholder="unsloth.F16.gguf")
repo_b_input = gr.Textbox(label="Model B Repository", placeholder="forestav/LoRA-2000")
model_b_input = gr.Textbox(label="Model B File Name", placeholder="unsloth.F16.gguf")
# Prompt and criteria inputs
prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
criteria_dropdown = gr.CheckboxGroup(
label="Select Evaluation Criteria (Max 3)",
choices=["Clarity", "Completeness", "Accuracy"] # Restricted criteria
)
# Button and outputs
evaluate_button = gr.Button("Evaluate Models")
response_a_output = gr.Textbox(
label="Response A",
placeholder="Response from Model A will appear here...",
lines=10,
interactive=False
)
response_b_output = gr.Textbox(
label="Response B",
placeholder="Response from Model B will appear here...",
lines=10,
interactive=False
)
evaluation_output = gr.Textbox(
label="Evaluation Results",
placeholder="The evaluation analysis will appear here...",
lines=20,
interactive=False
)
# Link evaluation function
evaluate_button.click(
fn=evaluate_responses,
inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input, criteria_dropdown],
outputs=[response_a_output, response_b_output, evaluation_output]
)
# Launch app
if __name__ == "__main__":
    demo.launch()