Spaces:

KolumbusLindh
/

LLM-as-a-judge

Sleeping

LLM-as-a-judge / app.py

Kolumbus Lindh

updated UI and functionality

ec08b2a 7 months ago

2.91 kB

	import gradio as gr
	from llama_cpp import Llama
	from huggingface_hub import hf_hub_download

	# Load the model
	def load_model(
	    repo_id="KolumbusLindh/LoRA-4100",
	    model_file="unsloth.F16.gguf",
	    n_ctx=2048,
	    n_threads=8,
	):
	    """Fetch a GGUF model file from the Hugging Face Hub and load it.

	    All parameters default to the values this app has always used, so
	    calling ``load_model()`` with no arguments behaves as before.

	    Args:
	        repo_id: Hub repository that hosts the model file.
	        model_file: Name of the GGUF file inside the repository.
	        n_ctx: Context size passed to ``Llama``.
	        n_threads: Thread count passed to ``Llama``.

	    Returns:
	        The ``Llama`` instance created from the fetched file.
	    """
	    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
	    print(f"Loading model from: {local_path}")
	    return Llama(model_path=local_path, n_ctx=n_ctx, n_threads=n_threads)

	# The model is loaded once, at module import, and kept in the module-level
	# name `model` that evaluate_responses() uses for every request. The two
	# prints bracket the load on stdout.
	print("Starting model loading...")
	model = load_model()
	print("Model loaded successfully!")

	# Function to evaluate two responses
	def evaluate_responses(prompt, response_a, response_b, evaluation_criteria):
	    """Ask the loaded model to compare two responses to the same prompt.

	    Args:
	        prompt: The instruction both responses were written for.
	        response_a: First candidate response.
	        response_b: Second candidate response.
	        evaluation_criteria: Criteria text interpolated into the judge prompt.

	    Returns:
	        The text content of the model's first chat completion choice, or a
	        short instruction message when the prompt or either response is
	        missing or blank.
	    """
	    # Without a prompt and two responses there is nothing to compare, so
	    # return early instead of spending a full generation on blank input.
	    if not all((text or "").strip() for text in (prompt, response_a, response_b)):
	        return "Please provide the prompt and both responses before requesting an evaluation."

	    # Chat messages: a fixed system role plus the user request carrying the
	    # prompt, both responses and the selected criteria.
	    evaluation_prompt = [
	        {"role": "system", "content": "You are an objective and thorough evaluator of instruction-based responses."},
	        {"role": "user", "content": f"""
	Prompt: {prompt}

	Response A: {response_a}
	Response B: {response_b}

	Please evaluate both responses based on the following criteria: {evaluation_criteria}

	For each criterion, provide a rating of the responses on a scale from 1 to 10, and explain why each response earned that rating. Then, declare a winner (or 'draw' if both are equal).
	"""}
	    ]

	    # Generate the evaluation with the module-level model.
	    evaluation_response = model.create_chat_completion(
	        messages=evaluation_prompt,
	        max_tokens=512,
	        temperature=0.5,
	    )

	    return evaluation_response['choices'][0]['message']['content']

	# Gradio interface
	with gr.Blocks(title="LLM as a Judge") as demo:
	    gr.Markdown("## LLM as a Judge 🧐")

	    # Free-text inputs: the prompt and the two responses to compare.
	    prompt_input = gr.Textbox(
	        label="Enter the Prompt",
	        placeholder="Enter the prompt here...",
	        lines=3,
	    )
	    response_a_input = gr.Textbox(
	        label="Response A",
	        placeholder="Enter Response A here...",
	        lines=5,
	    )
	    response_b_input = gr.Textbox(
	        label="Response B",
	        placeholder="Enter Response B here...",
	        lines=5,
	    )

	    # Criterion picker; "Clarity" is preselected.
	    criteria_dropdown = gr.Dropdown(
	        label="Select Evaluation Criteria",
	        choices=[
	            "Clarity",
	            "Completeness",
	            "Accuracy",
	            "Relevance",
	            "User-Friendliness",
	            "Depth",
	            "Creativity",
	        ],
	        value="Clarity",
	        type="value",
	    )

	    # Trigger for running the evaluation.
	    evaluate_button = gr.Button("Evaluate Responses")

	    # Read-only box that receives the judge's text.
	    evaluation_output = gr.Textbox(
	        label="Evaluation Results",
	        placeholder="The evaluation results will appear here...",
	        lines=10,
	        interactive=False,
	    )

	    # Clicking the button passes the four inputs to evaluate_responses and
	    # writes its return value into the results box.
	    evaluate_button.click(
	        fn=evaluate_responses,
	        inputs=[prompt_input, response_a_input, response_b_input, criteria_dropdown],
	        outputs=[evaluation_output],
	    )

	# Launch the app
	if __name__ == "__main__":
	    # Start the Gradio app only when this file is run as a script.
	    demo.launch()