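"""LLM as a Judge: a Gradio Space that loads the KolumbusLindh/LoRA-4100 GGUF
model with llama-cpp-python and uses it as a judge to score and compare the
responses of two user-supplied GGUF models against a chosen criterion."""
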
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Load LoRA-4100 model for evaluation
def load_lora_model():
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Loading LoRA model from: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

lora_model = load_lora_model()
print("LoRA model loaded successfully!")

# Load a user-specified model (llama-cpp-python opens a local .gguf file only)
def load_user_model(model_path):
    print(f"Loading user model from: {model_path}")
    return Llama(model_path=model_path, n_ctx=2048, n_threads=8)
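
# Hedged sketch: the textboxes below say "Path or URL", but Llama() only opens
# local files. One way to also accept a Hub spec (an assumption, not part of
# the original app) would be a hypothetical helper along these lines:
#
#   def resolve_model_path(spec):
#       # Treat "user/repo/file.gguf" as a Hub spec; otherwise a local path
#       parts = spec.split("/")
#       if spec.endswith(".gguf") and len(parts) >= 3:
#           return hf_hub_download(repo_id="/".join(parts[:2]),
#                                  filename="/".join(parts[2:]))
#       return spec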

# Generate a response using the specified model and prompt
def generate_response(model_path, prompt):
    user_model = load_user_model(model_path)  # note: reloaded on every call
    response = user_model(prompt, max_tokens=256, temperature=0.7)
    return response["choices"][0]["text"]

# Evaluate responses using the LoRA judge model
def evaluate_responses(prompt, model_a_path, model_b_path, evaluation_criteria):
    # Generate a response from each candidate model
    response_a = generate_response(model_a_path, prompt)
    response_b = generate_response(model_b_path, prompt)

    # Format the evaluation prompt for the judge
    evaluation_prompt = [
        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
        {"role": "user", "content": f"""
Prompt: {prompt}

Response A: {response_a}

Response B: {response_b}

Please evaluate both responses based on the following criteria: {evaluation_criteria}

For each criterion, rate each response on a scale from 1 to 10 and explain why it earned that rating. Then declare a winner (or 'draw' if both are equal).
"""}
    ]

    # Generate the evaluation
    evaluation_response = lora_model.create_chat_completion(
        messages=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    return evaluation_response["choices"][0]["message"]["content"]
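
# Hedged example: calling the evaluator directly, outside Gradio. The .gguf
# paths below are hypothetical placeholders, not files shipped with this app.
#
#   result = evaluate_responses(
#       "Explain recursion in one paragraph.",
#       "/models/model-a.Q4_K_M.gguf",
#       "/models/model-b.Q4_K_M.gguf",
#       "Clarity",
#   )
#   print(result)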

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")

    # Inputs for model paths, prompt, and evaluation criteria
    model_a_input = gr.Textbox(label="Model A Path or URL", placeholder="Enter the path or URL to Model A...")
    model_b_input = gr.Textbox(label="Model B Path or URL", placeholder="Enter the path or URL to Model B...")
    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)

    # Dropdown for the evaluation criterion
    criteria_dropdown = gr.Dropdown(
        label="Select Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
        value="Clarity",
        type="value"
    )

    # Button to evaluate responses
    evaluate_button = gr.Button("Evaluate Models")

    # Output for evaluation results
    evaluation_output = gr.Textbox(
        label="Evaluation Results",
        placeholder="The evaluation results will appear here...",
        lines=10,
        interactive=False
    )

    # Link the evaluation function to the button
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, model_a_input, model_b_input, criteria_dropdown],
        outputs=[evaluation_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
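
# Hedged usage note: on Hugging Face Spaces the platform runs this file
# directly; locally, `python app.py` starts the server. Gradio's optional
# share=True argument to demo.launch() would expose a temporary public URL.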