import gradio as gr
import os
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import spaces

# Check if we're running in a Hugging Face Space with GPU constraints
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None

# Get Hugging Face token from environment variables
HF_TOKEN = os.environ.get('HF_TOKEN')

# Determine device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"

print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")

# Model configuration
load_in_4bit = True  # Use 4-bit quantization to keep GPU memory usage low

# Load model and tokenizer with device mapping
model_name = "nafisneehal/chandler_bot"
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=load_in_4bit
)
# 4-bit (bitsandbytes) weights are placed on the GPU at load time and cannot be
# moved with .to(); only relocate the model when it is loaded in full precision.
if not load_in_4bit:
    model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
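
# A minimal sketch of the explicit quantization config (an assumption: a recent
# transformers + bitsandbytes install); newer versions prefer this over passing
# the bare load_in_4bit flag to from_pretrained:
# from transformers import BitsAndBytesConfig
# quant_config = BitsAndBytesConfig(load_in_4bit=True,
#                                   bnb_4bit_compute_dtype=torch.float16)
# model = AutoPeftModelForCausalLM.from_pretrained(
#     model_name, quantization_config=quant_config, device_map="auto")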

# Define prompt structure (update as needed for your model)
alpaca_prompt = "{instruction} {input_text} {output}"
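# The one-line template above simply concatenates the fields. If the adapter was
# fine-tuned on the standard Alpaca format (an assumption; match whatever format
# was used during training), the template would look more like:
# alpaca_prompt = (
#     "Below is an instruction that describes a task, paired with an input "
#     "that provides further context. Write a response that appropriately "
#     "completes the request.\n\n"
#     "### Instruction:\n{instruction}\n\n"
#     "### Input:\n{input_text}\n\n"
#     "### Response:\n{output}"
# )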

@spaces.GPU  # Use GPU provided by Hugging Face Spaces if available
def generate_response(user_input, chat_history):
    instruction = "Chat with me like Chandler talks."
    input_text = user_input  # Treat the user's message as the prompt input

    # Format the input using the prompt template
    formatted_input = alpaca_prompt.format(instruction=instruction, input_text=input_text, output="")

    # Prepare inputs for model inference on the correct device
    inputs = tokenizer([formatted_input], return_tensors="pt").to(device)

    # Generate response on GPU or CPU as appropriate
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the newly generated tokens so the prompt is not echoed back
    prompt_length = inputs["input_ids"].shape[1]
    bot_reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Update chat history; gr.Chatbot expects (user message, bot reply) pairs
    chat_history.append((user_input, bot_reply))

    return chat_history, ""  # Return the updated history and clear the input box


# Custom CSS to left-align messages inside the chatbox
custom_css = """
#chatbox .bot, #chatbox .user {
    text-align: left;
}
"""

# Set up Gradio interface
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Chandler-Like Chatbot on GPU")

    chat_history = gr.Chatbot(label="Chat History", elem_id="chatbox")
    user_input = gr.Textbox(
        placeholder="Type your message here...", label="Your Message")

    submit_btn = gr.Button("Send")

    # Pressing Enter in the textbox or clicking Send both generate a response
    user_input.submit(generate_response, [user_input, chat_history],
                      [chat_history, user_input])
    submit_btn.click(generate_response, [user_input, chat_history],
                     [chat_history, user_input])


demo.launch()  # Launch the app; pass share=True for a public link