import gradio as gr
import os
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import spaces
# Check if we're running in a Hugging Face Space with GPU constraints
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None

# Get Hugging Face token from environment variables
HF_TOKEN = os.environ.get("HF_TOKEN")

# Determine device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"

print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")
# Model configuration
load_in_4bit = device == "cuda"  # 4-bit quantization saves memory but requires a CUDA GPU (bitsandbytes)

# Load model and tokenizer with device mapping
# Replace with the name of your trained model
model_name = "nafisneehal/chandler_bot"
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=load_in_4bit,
    device_map="auto" if device == "cuda" else None,  # Automatic GPU mapping
    token=HF_TOKEN,  # Only needed if the model repository is private
)

# A 4-bit model placed by device_map="auto" must not be moved with .to();
# only move the model manually when it was loaded on CPU.
if device == "cpu":
    model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
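
# Minimal safeguard, not in the original script (an assumption): Llama-family
# tokenizers often ship without a pad token, so fall back to the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token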
# Define prompt structure (update if necessary for your model)
alpaca_prompt = "{instruction} {input} {output}"
instruction = "Chat with me like Chandler"
# Use the GPU provided by Hugging Face Spaces when available (on ZeroGPU hardware
# the spaces.GPU decorator allocates a GPU for each call)
@spaces.GPU
def generate_response(user_input, chat_history):
    # The fixed instruction defined above is combined with the user's message
    input_text = user_input  # Any additional input if needed; leave blank otherwise

    # Prepare inputs for model inference on the correct device
    inputs = tokenizer(
        [alpaca_prompt.format(instruction=instruction, input=input_text, output="")],
        return_tensors="pt"
    ).to(device)  # Ensure tensors are on the correct device

    # Generate response on GPU or CPU as appropriate
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the newly generated tokens so the prompt is not echoed back
    bot_reply = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    # Update chat history with the (user message, bot reply) pair
    chat_history.append((user_input, bot_reply))

    return chat_history, ""  # Return the updated chat history and clear the input box
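
# Optional tweak (an assumption, not part of the original): sampling parameters
# such as temperature and top_p usually make chat replies less repetitive, e.g.
#   outputs = model.generate(**inputs, max_new_tokens=100,
#                            do_sample=True, temperature=0.7, top_p=0.9)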
# Set up Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Llama-Based Chatbot on GPU")
    chat_history = gr.Chatbot(label="Chat History")
    user_input = gr.Textbox(
        placeholder="Type your message here...", label="Your Message")

    # Connect submit actions to the generate_response function
    user_input.submit(generate_response, [user_input, chat_history],
                      [chat_history, user_input])
    submit_btn = gr.Button("Send")
    submit_btn.click(generate_response, [user_input, chat_history],
                     [chat_history, user_input])

demo.launch()
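
# Deployment notes (assumptions, not from the original file): the script depends on
# gradio, torch, transformers, peft, accelerate, bitsandbytes (for 4-bit loading)
# and spaces; a Space would list these in a hypothetical requirements.txt, and
# HF_TOKEN should be set as a Space secret if the model repository is private.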