import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Load the base LoRA evaluation (judge) model
def load_lora_model():
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Loading LoRA model from: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

lora_model = load_lora_model()
print("LoRA model loaded successfully!")

# Load a user-specified model (llama_cpp expects a local GGUF file path)
def load_user_model(model_path):
    print(f"Loading user model from: {model_path}")
    return Llama(model_path=model_path, n_ctx=2048, n_threads=8)

# Generate a response from the given model for the given prompt
def generate_response(model, prompt):
    response = model(prompt, max_tokens=256, temperature=0.7)
    return response["choices"][0]["text"]

# Evaluate the responses of two models using the LoRA judge model
def evaluate_responses(prompt, model_a_path, model_b_path, evaluation_criteria):
    # Load the user-specified models
    model_a = load_user_model(model_a_path)
    model_b = load_user_model(model_b_path)

    # Generate a response from each model
    response_a = generate_response(model_a, prompt)
    response_b = generate_response(model_b, prompt)
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")

    # Format the evaluation prompt for the LoRA judge model
    evaluation_prompt = f"""
Prompt: {prompt}

Response A: {response_a}

Response B: {response_b}

Evaluation Criteria: {evaluation_criteria}

Please evaluate the responses based on the criteria above. Rate each response on a scale from 1 to 10 for each criterion and provide a detailed explanation. Finally, declare a winner or state 'draw' if they are equal.
"""

    # Use the LoRA model to judge the two responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
    return evaluation_response["choices"][0]["text"]

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")

    # User inputs for the two models, the prompt, and the evaluation criteria
    model_a_input = gr.Textbox(label="Model A Path or URL", placeholder="Enter the path or URL for Model A...")
    model_b_input = gr.Textbox(label="Model B Path or URL", placeholder="Enter the path or URL for Model B...")
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
    criteria_dropdown = gr.Dropdown(
        label="Select Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"],
        value="Clarity",
        type="value"
    )
    evaluate_button = gr.Button("Evaluate Models")
    evaluation_output = gr.Textbox(
        label="Evaluation Results",
        placeholder="The evaluation results will appear here...",
        lines=10,
        interactive=False
    )

    # Link the evaluation function to the button
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, model_a_input, model_b_input, criteria_dropdown],
        outputs=[evaluation_output]
    )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
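
# --- Optional helper (illustrative sketch, not wired into the app above) ---
# The "Model A/B Path or URL" fields are passed straight to llama_cpp's Llama(),
# which only accepts a local .gguf file path. Below is a minimal sketch of how a
# Hugging Face-hosted GGUF file could also be accepted, assuming a hypothetical
# "repo_id::filename" convention for the textbox value; the helper name
# resolve_model_spec and that convention are assumptions, not part of the
# original app. To use it, load_user_model would need to call it on its input,
# e.g. resolve_model_spec("some-user/some-repo::model.Q4_K_M.gguf").
def resolve_model_spec(spec: str) -> str:
    """Return a local GGUF path for either a local path or a 'repo_id::filename' spec."""
    if "::" in spec:
        repo_id, filename = spec.split("::", 1)
        # hf_hub_download caches the file locally and returns its local path
        return hf_hub_download(repo_id=repo_id, filename=filename)
    return spec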