import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Load the model
def load_model():
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Loading model from: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)
print("Starting model loading...")
model = load_model()
print("Model loaded successfully!")
# Function to evaluate two responses
def evaluate_responses(prompt, response_a, response_b, evaluation_criteria):
    # Format the evaluation prompt for the judge model
    evaluation_prompt = [
        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
        {"role": "user", "content": f"""
Prompt: {prompt}
Response A: {response_a}
Response B: {response_b}
Please evaluate both responses based on the following criteria: {evaluation_criteria}
For each criterion, rate each response on a scale from 1 to 10 and explain why it earned that rating. Then declare a winner (or 'draw' if both responses are equal).
"""}
    ]
    # Generate the evaluation
    evaluation_response = model.create_chat_completion(
        messages=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    evaluation_results = evaluation_response['choices'][0]['message']['content']
    return evaluation_results
# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")

    # Input fields for the prompt and the two responses to compare
    prompt_input = gr.Textbox(label="Enter the Prompt", placeholder="Enter the prompt here...", lines=3)
    response_a_input = gr.Textbox(label="Response A", placeholder="Enter Response A here...", lines=5)
    response_b_input = gr.Textbox(label="Response B", placeholder="Enter Response B here...", lines=5)

    # Dropdown for selecting the evaluation criterion
    criteria_dropdown = gr.Dropdown(
        label="Select Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
        value="Clarity",
        type="value"
    )

    # Button to start the evaluation
    evaluate_button = gr.Button("Evaluate Responses")

    # Text box for displaying the evaluation results
    evaluation_output = gr.Textbox(
        label="Evaluation Results",
        placeholder="The evaluation results will appear here...",
        lines=10,
        interactive=False
    )

    # Link the evaluation function to the button
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, response_a_input, response_b_input, criteria_dropdown],
        outputs=[evaluation_output]
    )
# Launch the app
if __name__ == "__main__":
    demo.launch()
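
# Quick local sanity check (a sketch, not part of the app): the prompt and
# responses below are made-up placeholders, and it assumes the model above
# has finished loading. Uncomment to run the judge without the Gradio UI.
# result = evaluate_responses(
#     prompt="Explain what a GGUF file is.",
#     response_a="GGUF is a binary file format used by llama.cpp to store model weights.",
#     response_b="GGUF is an image format.",
#     evaluation_criteria="Accuracy",
# )
# print(result)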