import os
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download
# Hugging Face repository IDs
base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
adapter_repo = "Mat17892/llama_lora_gguf"
# Download the base model GGUF file
print("Downloading base model...")
base_model_path = hf_hub_download(repo_id=base_model_repo, filename="Llama-3.2-3B-Instruct-Q8_0.gguf")
# Download the LoRA adapter GGUF file
print("Downloading LoRA adapter...")
lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename="llama_lora_adapter.gguf")
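# Note: hf_hub_download returns a local filesystem path to the cached file
# (by default under ~/.cache/huggingface/hub), so both paths can be passed straight to llama-cli.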
# Define the llama-cli path explicitly
llama_cli_path = "./llama.cpp/build/bin/llama-cli"
if not os.access(llama_cli_path, os.X_OK):  # Check if the file is executable
    os.chmod(llama_cli_path, 0o755)  # Set executable permissions
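# The llama-cli binary is assumed to have been built beforehand from the bundled llama.cpp
# checkout, e.g. with `cmake -B build` followed by `cmake --build build --config Release`,
# which places the executable under llama.cpp/build/bin/.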
# Function to run `llama-cli` with base model and adapter
def run_llama_cli(prompt):
    print("Running inference with llama-cli...")
    cmd = [
        llama_cli_path,               # Path to the llama-cli executable
        "-c", "2048",                 # Context length
        "-cnv",                       # Enable conversational mode
        "-m", base_model_path,
        "--lora", lora_adapter_path,
        "--prompt", prompt,
    ]
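    # The list above corresponds roughly to a shell invocation such as
    #   ./llama.cpp/build/bin/llama-cli -c 2048 -cnv -m <base>.gguf --lora <adapter>.gguf --prompt "..."
    # where <base> and <adapter> are whatever local paths hf_hub_download resolved to.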
    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        if process.returncode != 0:
            print("Error during inference:")
            print(stderr.decode())
            return "Error: Could not generate response."
        return stdout.decode().strip()
    except Exception as e:
        print(f"Exception occurred: {e}")
        return "Error: Could not generate response."
# Gradio interface
def chatbot_fn(user_input, chat_history):
    # Build the full chat history as the prompt
    prompt = ""
    for user, ai in chat_history:
        prompt += f"User: {user}\nAI: {ai}\n"
    prompt += f"User: {user_input}\nAI:"  # Add latest user input
    # Generate response using llama-cli
    response = run_llama_cli(prompt)
    # Update chat history
    chat_history.append((user_input, response))
    return chat_history, chat_history
# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
    chatbot = gr.Chatbot(label="Chat with the Model")
    with gr.Row():
        with gr.Column(scale=4):
            user_input = gr.Textbox(label="Your Message", placeholder="Type a message...")
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send")
    chat_history = gr.State([])

    # Link components
    submit_btn.click(
        chatbot_fn,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )
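    # chatbot_fn returns (chat_history, chat_history): the first value refreshes the
    # Chatbot display, the second writes the updated list back into the gr.State component.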
# Launch the Gradio app
demo.launch()