# chanbot / app.py
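# Gradio chat app that serves "nafisneehal/chandler_bot", a Chandler-style chatbot,
# loading the model with Unsloth and replying to user messages in a simple chat UI.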
import gradio as gr
import os
import torch
from unsloth import FastLanguageModel
import spaces  # Provides the @spaces.GPU decorator on Hugging Face Spaces (ZeroGPU)
# Get Hugging Face token from environment variables
HF_TOKEN = os.environ.get('HF_TOKEN')
# Check if we're running in a Hugging Face Space with GPU constraints
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None
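# These flags are available to tailor behavior when running inside a Hugging Face Space
# (for example, on ZeroGPU hardware with stricter memory limits).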
# Determine device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")
# Model configuration
max_seq_length = 2048 # Max sequence length for RoPE scaling
dtype = torch.float16 if device == "cuda" else torch.float32
load_in_4bit = True # Enable 4-bit quantization if memory is limited
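# 4-bit loading uses bitsandbytes quantization, which substantially reduces the VRAM
# needed to hold the model weights.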
# Load model and tokenizer with device mapping
model_name = "nafisneehal/chandler_bot"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=HF_TOKEN,  # Pass the token in case the model repo is gated/private
    device_map="auto" if device == "cuda" else None  # Automatic GPU mapping
)
FastLanguageModel.for_inference(model) # Optimize model for faster inference
# Define prompt structure (update if necessary for your model)
alpaca_prompt = "{} {} {}"  # Filled positionally with (instruction, input, response)
instruction_text = ("Learn how to talk like Chandler - a popular character from the TV show FRIENDS. "
                    "The input is something another person says; the output is what Chandler says in response.")
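# Example of a fully formatted prompt (the response slot is left empty for the model to fill in):
#   alpaca_prompt.format(instruction_text, "How are you doing?", "")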
@spaces.GPU # Use GPU provided by Hugging Face Spaces if available
def generate_response(user_input, chat_history):
    chat_history = chat_history or []  # Chatbot value may be None on the first call
    instruction = instruction_text     # Fixed task description for the model
    input_text = user_input            # The user's message fills the input slot

    # Prepare inputs for model inference on the correct device
    inputs = tokenizer(
        [alpaca_prompt.format(instruction, input_text, "")],
        return_tensors="pt"
    ).to(device)  # Ensure tensors are on the correct device

    # Generate response on GPU or CPU as appropriate
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the newly generated tokens, skipping the echoed prompt
    prompt_length = inputs["input_ids"].shape[1]
    bot_reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Update chat history with the (user message, bot reply) pair
    chat_history.append((user_input, bot_reply))
    return chat_history, ""  # Return updated chat history and clear the input box
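# Optional: sampling can be enabled by passing e.g. do_sample=True, temperature=0.7,
# and top_p=0.9 to model.generate() if the replies feel too repetitive.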
# Set up Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Llama-Based Chatbot on GPU")
    chat_history = gr.Chatbot(label="Chat History")
    user_input = gr.Textbox(
        placeholder="Type your message here...", label="Your Message")
    submit_btn = gr.Button("Send")

    # Connect the Enter key and the Send button to the response function
    user_input.submit(generate_response, [user_input, chat_history],
                      [chat_history, user_input])
    submit_btn.click(generate_response, [user_input, chat_history],
                     [chat_history, user_input])

demo.launch()
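# Note: on Spaces it is common to call demo.queue() before launch() so that
# concurrent requests are queued rather than run simultaneously.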