import gradio as gr
import os
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import spaces
# Check if we're running in a Hugging Face Space with GPU constraints
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None
# Get Hugging Face token from environment variables
HF_TOKEN = os.environ.get('HF_TOKEN')
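# If the model repo is private or gated, pass token=HF_TOKEN to the from_pretrained calls below.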
# Determine device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")
# Model configuration
load_in_4bit = True # Use 4-bit quantization if memory is constrained
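# Note: 4-bit loading relies on bitsandbytes and requires a CUDA GPU; set this to False on CPU-only hosts.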
# Load the PEFT adapter model and tokenizer
model_name = "nafisneehal/chandler_bot"
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=load_in_4bit,
)
if not load_in_4bit:
    # bitsandbytes keeps 4-bit weights on the GPU; .to() is only supported for unquantized models
    model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Define prompt structure (update as needed for your model)
alpaca_prompt = "{instruction} {input_text} {output}"
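# e.g. alpaca_prompt.format(instruction="Chat with me like Chandler talks.",
#      input_text="How are you?", output="") -> "Chat with me like Chandler talks. How are you? "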
@spaces.GPU  # Use the GPU provided by Hugging Face Spaces when available
def generate_response(user_input, chat_history):
    instruction = "Chat with me like Chandler talks."
    input_text = user_input  # Treat the user's message as the prompt input

    # Format the input using the prompt template
    formatted_input = alpaca_prompt.format(
        instruction=instruction, input_text=input_text, output="")

    # Prepare inputs for model inference on the correct device
    inputs = tokenizer([formatted_input], return_tensors="pt").to(device)

    # Generate a response on GPU or CPU as appropriate
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the newly generated tokens so the prompt is not echoed back
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    bot_reply = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Update chat history; gr.Chatbot expects (user_message, bot_message) pairs
    chat_history.append((user_input, bot_reply))

    return chat_history, ""  # Return updated history and clear the input box
# Custom CSS to left-align both user and bot messages in the chat box
custom_css = """
#chatbox .bot, #chatbox .user {
    text-align: left;
}
"""

# Set up the Gradio interface
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Chandler-Like Chatbot on GPU")

    chat_history = gr.Chatbot(label="Chat History", elem_id="chatbox")
    user_input = gr.Textbox(
        placeholder="Type your message here...", label="Your Message")

    # Submit on Enter or via the Send button; both call generate_response
    user_input.submit(generate_response, [user_input, chat_history],
                      [chat_history, user_input])
    submit_btn = gr.Button("Send")
    submit_btn.click(generate_response, [user_input, chat_history],
                     [chat_history, user_input])

demo.launch()  # Start the app; Hugging Face Spaces serves it publicly