import gradio as gr
import os
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import spaces
# Check if we're running in a Hugging Face Space with GPU constraints
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None

# Get Hugging Face token from environment variables
HF_TOKEN = os.environ.get("HF_TOKEN")

# Determine device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"

print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")
# Model configuration
load_in_4bit = device == "cuda"  # 4-bit quantization saves memory but requires a CUDA GPU (bitsandbytes)

# Load model and tokenizer with device mapping
# Replace with the name of your trained model
model_name = "nafisneehal/chandler_bot"
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=load_in_4bit,
    device_map="auto" if device == "cuda" else None,  # Automatic GPU mapping
    token=HF_TOKEN,  # Only needed if the model repository is private
)

# A 4-bit model placed by device_map="auto" must not be moved with .to();
# only move the model manually when it was loaded on CPU.
if device == "cpu":
    model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
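
# Minimal safeguard, not in the original script (an assumption): Llama-family
# tokenizers often ship without a pad token, so fall back to the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token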
# Define prompt structure (update if necessary for your model)
alpaca_prompt = "{instruction} {input} {output}"
instruction = "Chat with me like Chandler"
# Use the GPU provided by Hugging Face Spaces when available (on ZeroGPU hardware
# the spaces.GPU decorator allocates a GPU for each call)
@spaces.GPU
def generate_response(user_input, chat_history):
    # The fixed instruction defined above is combined with the user's message
    input_text = user_input  # Any additional input if needed; leave blank otherwise

    # Prepare inputs for model inference on the correct device
    inputs = tokenizer(
        [alpaca_prompt.format(instruction=instruction, input=input_text, output="")],
        return_tensors="pt"
    ).to(device)  # Ensure tensors are on the correct device

    # Generate response on GPU or CPU as appropriate
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the newly generated tokens so the prompt is not echoed back
    bot_reply = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    # Update chat history with the (user message, bot reply) pair
    chat_history.append((user_input, bot_reply))

    return chat_history, ""  # Return the updated chat history and clear the input box
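
# Optional tweak (an assumption, not part of the original): sampling parameters
# such as temperature and top_p usually make chat replies less repetitive, e.g.
#   outputs = model.generate(**inputs, max_new_tokens=100,
#                            do_sample=True, temperature=0.7, top_p=0.9)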
# Set up Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Llama-Based Chatbot on GPU")
    chat_history = gr.Chatbot(label="Chat History")
    user_input = gr.Textbox(
        placeholder="Type your message here...", label="Your Message")

    # Connect submit actions to the generate_response function
    user_input.submit(generate_response, [user_input, chat_history],
                      [chat_history, user_input])
    submit_btn = gr.Button("Send")
    submit_btn.click(generate_response, [user_input, chat_history],
                     [chat_history, user_input])

demo.launch()
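
# Deployment notes (assumptions, not from the original file): the script depends on
# gradio, torch, transformers, peft, accelerate, bitsandbytes (for 4-bit loading)
# and spaces; a Space would list these in a hypothetical requirements.txt, and
# HF_TOKEN should be set as a Space secret if the model repository is private.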