import os

import gradio as gr
import spaces
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Check if we're running in a Hugging Face Space with GPU constraints
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None

# Get the Hugging Face token from the environment (needed for gated/private models)
HF_TOKEN = os.environ.get("HF_TOKEN")

# Determine device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")

# Model configuration
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage

# Load model and tokenizer
model_name = "nafisneehal/chandler_bot"
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=load_in_4bit,
    token=HF_TOKEN,
)
# Note: 4-bit (bitsandbytes) models cannot be moved with .to(); they are
# placed on the GPU at load time. Only move the model when not quantized.
if not load_in_4bit:
    model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)

# Define the prompt structure (update as needed for your model)
alpaca_prompt = "{instruction} {input_text} {output}"


@spaces.GPU  # Use the GPU provided by Hugging Face Spaces if available
def generate_response(user_input, chat_history):
    instruction = "Chat with me like Chandler talks."
    input_text = user_input  # Treat the user's message as the prompt input

    # Format the input using the prompt template
    formatted_input = alpaca_prompt.format(
        instruction=instruction, input_text=input_text, output=""
    )

    # Prepare inputs for model inference on the device the model lives on
    inputs = tokenizer([formatted_input], return_tensors="pt").to(model.device)

    # Generate a response on GPU or CPU as appropriate
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the newly generated tokens, skipping the echoed prompt
    prompt_length = inputs["input_ids"].shape[1]
    bot_reply = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Update chat history as a (user message, bot message) pair,
    # which is the format gr.Chatbot expects
    chat_history.append((user_input, bot_reply))

    return chat_history, ""  # Return updated history and clear the input box


# Custom CSS to left-align chat messages
custom_css = """
#chatbox .bot, #chatbox .user {
    text-align: left;
}
"""

# Set up the Gradio interface
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Chandler-Like Chatbot on GPU")

    chat_history = gr.Chatbot(label="Chat History", elem_id="chatbox")
    user_input = gr.Textbox(
        placeholder="Type your message here...", label="Your Message"
    )

    # Trigger generation on Enter or on the Send button
    user_input.submit(
        generate_response, [user_input, chat_history], [chat_history, user_input]
    )
    submit_btn = gr.Button("Send")
    submit_btn.click(
        generate_response, [user_input, chat_history], [chat_history, user_input]
    )

demo.launch()  # Launch the app (pass share=True for a public link)