nanoLLaVA

Runtime error

App Files Files Community

nanoLLaVA / app.py

qnguyen3

Update app.py

50bd3d6 verified about 2 months ago

raw

history blame

8.25 kB

	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, StoppingCriteria
	from modeling_llava_qwen2 import LlavaQwen2ForCausalLM
	from threading import Thread
	import re
	import time
	from PIL import Image
	import spaces
	import subprocess
	# Install flash-attn at startup. FLASH_ATTENTION_SKIP_CUDA_BUILD is set for
	# the pip process only. The variable is *added* to the current environment:
	# passing a bare dict as ``env`` would replace the whole environment (PATH,
	# HOME, proxy settings, ...) for the child process.
	import os
	import sys

	subprocess.run(
	    [sys.executable, '-m', 'pip', 'install', 'flash-attn', '--no-build-isolation'],
	    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
	)

	# Initialize tokenizer (doesn't require CUDA)
	tokenizer = AutoTokenizer.from_pretrained(
	    'qnguyen3/nanoLLaVA-1.5',
	    trust_remote_code=True,
	)

	# Don't initialize model here - it is created lazily inside the
	# GPU-decorated function and cached in this module-level variable.
	model = None

	class KeywordsStoppingCriteria(StoppingCriteria):
	    """Stopping criterion that fires once a keyword appears in the output.

	    A keyword is detected either by an exact match of its token ids against
	    the tail of the sequence, or by finding the keyword text in the decoded
	    tail of the sequence.

	    Args:
	        keywords: Strings whose appearance should stop generation.
	        tokenizer: Tokenizer used to encode the keywords and decode outputs.
	        input_ids: Prompt ids; only ``input_ids.shape[1]`` is read, and it is
	            stored as ``start_len``.
	    """

	    def __init__(self, keywords, tokenizer, input_ids):
	        self.keywords = keywords
	        self.keyword_ids = []
	        self.max_keyword_len = 0
	        for keyword in keywords:
	            cur_keyword_ids = tokenizer(keyword).input_ids
	            # Drop a leading BOS token so only the keyword's own ids are matched.
	            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
	                cur_keyword_ids = cur_keyword_ids[1:]
	            self.max_keyword_len = max(self.max_keyword_len, len(cur_keyword_ids))
	            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
	        self.tokenizer = tokenizer
	        self.start_len = input_ids.shape[1]

	    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
	        """Return True if the single sequence in ``output_ids`` ends with / contains a keyword.

	        ``output_ids`` is indexed as ``[0, ...]`` and ``[:, ...]``, i.e. it is
	        expected to be a 2-D tensor holding one sequence.
	        """
	        # Keep the keyword tensors on the same device as the generated ids.
	        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]

	        # 1) Exact token-id match against the tail of the sequence.
	        for keyword_id in self.keyword_ids:
	            tail = output_ids[0, -keyword_id.shape[0]:]
	            if torch.equal(tail, keyword_id):
	                return True

	        # 2) Text match on the most recent tokens beyond ``start_len``.
	        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
	        if offset <= 0:
	            # ``-0`` would slice the whole sequence and a negative offset
	            # would slice from an arbitrary position, so there is no
	            # well-defined "newly generated tail" to decode yet.
	            return False
	        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
	        return any(keyword in outputs for keyword in self.keywords)

	    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
	        """Return True only when every sequence in the batch has hit a keyword."""
	        return all(
	            self.call_for_batch(output_ids[i].unsqueeze(0), scores)
	            for i in range(output_ids.shape[0])
	        )


	# Placeholder id spliced into ``input_ids`` at the position of the image.
	_IMAGE_TOKEN_INDEX = -200


	def _build_messages(message, history):
	    """Turn the chat history plus the current message into chat-template messages."""
	    messages = []
	    for human, assistant in history:
	        # Skip None responses (which can happen during streaming)
	        if assistant is not None:
	            messages.append({"role": "user", "content": human})
	            messages.append({"role": "assistant", "content": assistant})

	    # The <image> tag is only prepended when this is the first message of
	    # the conversation that reaches the model.
	    content = message['text'] if messages else f"<image>\n{message['text']}"
	    messages.append({"role": "user", "content": content})
	    return messages


	def _encode_prompt(text):
	    """Tokenize ``text`` and splice in the image placeholder; returns a (1, seq) LongTensor."""
	    if '<image>' in text:
	        chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
	        ids = chunks[0] + [_IMAGE_TOKEN_INDEX] + chunks[1]
	    else:
	        # No <image> tag in the rendered prompt: insert the placeholder right
	        # after the first "<|im_start|>user" tag. The tag spans two tokens, so
	        # a two-token window is decoded; a single decoded token can never
	        # equal the whole tag.
	        ids = tokenizer(text).input_ids
	        insert_at = 0
	        for i in range(len(ids) - 1):
	            if tokenizer.decode(ids[i:i + 2]) == '<|im_start|>user':
	                insert_at = i + 2  # +2 to get past the tag
	                break
	        ids = ids[:insert_at] + [_IMAGE_TOKEN_INDEX] + ids[insert_at:]
	    return torch.tensor([ids], dtype=torch.long)


	@spaces.GPU
	def bot_streaming(message, history):
	    """Stream the model's answer to ``message`` about the attached image.

	    Args:
	        message: Dict with a ``"text"`` entry and an optional ``"files"``
	            list whose items are dicts with a ``"path"`` entry; the last
	            file is used as the image.
	        history: Iterable of ``(human, assistant)`` pairs from earlier turns.

	    Yields:
	        The accumulated response text after each streamed chunk, or a single
	        notice when no image was supplied.
	    """
	    global model

	    # Get image path
	    image_path = None
	    if "files" in message and message["files"]:
	        image_path = message["files"][-1]["path"]

	    # This function is a generator, so the notice must be yielded: a plain
	    # ``return "<text>"`` would never reach the caller. Checking before the
	    # model is loaded also avoids the expensive load when it is not needed.
	    if image_path is None:
	        yield "Please upload an image for LLaVA to work."
	        return

	    # Initialize the model inside the GPU-decorated function
	    if model is None:
	        model = LlavaQwen2ForCausalLM.from_pretrained(
	            'qnguyen3/nanoLLaVA-1.5',
	            torch_dtype=torch.float16,
	            attn_implementation="flash_attention_2",
	            trust_remote_code=True,
	            device_map="auto",  # Use "auto" instead of 'cpu' then manual to('cuda')
	        )

	    # Process image
	    image = Image.open(image_path).convert("RGB")

	    # Prepare input for generation
	    text = tokenizer.apply_chat_template(
	        _build_messages(message, history),
	        tokenize=False,
	        add_generation_prompt=True,
	    )
	    # Inputs are moved to the model's device so generation does not mix
	    # CPU inputs with weights placed elsewhere by device_map="auto".
	    input_ids = _encode_prompt(text).to(model.device)

	    # Prepare stopping criteria
	    stop_str = '<|im_end|>'
	    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
	    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

	    # Process image and generate text
	    image_tensor = model.process_images([image], model.config).to(
	        device=model.device, dtype=model.dtype
	    )
	    generation_kwargs = dict(
	        input_ids=input_ids,
	        images=image_tensor,
	        streamer=streamer,
	        max_new_tokens=512,
	        stopping_criteria=[stopping_criteria],
	        temperature=0.01,
	    )

	    # Generation runs in a worker thread while this generator drains the streamer.
	    thread = Thread(target=model.generate, kwargs=generation_kwargs)
	    thread.start()

	    # Stream response
	    buffer = ""
	    for new_text in streamer:
	        buffer += new_text
	        time.sleep(0.04)
	        yield buffer

	    thread.join()


	# Create a gradio Blocks interface instead of ChatInterface
	# This avoids the schema validation issues
	with gr.Blocks(title="🚀nanoLLaVA-1.5") as demo:
	    gr.Markdown("## 🚀nanoLLaVA-1.5")
	    gr.Markdown("Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.")

	    chatbot = gr.Chatbot(height=500)
	    with gr.Row():
	        # Integer scales in the same 4:1 ratio as the former 0.8 / 0.2.
	        with gr.Column(scale=4):
	            msg = gr.Textbox(
	                show_label=False,
	                placeholder="Enter text and upload an image",
	                container=False,
	            )
	        with gr.Column(scale=1):
	            btn = gr.Button("Submit")
	            stop_btn = gr.Button("Stop Generation")

	    upload_btn = gr.UploadButton("Upload Image", file_types=["image"])
	    # Holds the most recently uploaded image between events.
	    current_img = gr.State(None)

	    # Example images
	    examples = gr.Examples(
	        examples=[
	            ["Who is this guy?", "./demo_1.jpg"],
	            ["What does the text say?", "./demo_2.jpeg"],
	        ],
	        inputs=[msg, upload_btn],
	    )

	    def upload_image(image):
	        """Pass the uploaded file through so it can be stored in ``current_img``."""
	        return image

	    def add_text(history, text, image):
	        """Append the user's text to the chat, with a notice when no image is set."""
	        if image is None and (not history or not isinstance(history[0][0], tuple)):
	            return history + [[text, "Please upload an image first."]]
	        # ``None`` marks the reply slot that ``bot_response`` fills in.
	        return history + [[text, None]]

	    def bot_response(history, image):
	        """Stream the model's reply into the last chat entry."""
	        if not image:
	            # Nothing to generate without an image; skip the GPU call and
	            # make sure the pending slot carries a notice.
	            if history[-1][1] is None:
	                history[-1][1] = "Please upload an image first."
	                yield history
	            return

	        message = {"text": history[-1][0], "files": [{"path": image}]}
	        history_format = history[:-1]  # All except the last message

	        for chunk in bot_streaming(message, history_format):
	            history[-1][1] = chunk
	            yield history

	    upload_btn.upload(upload_image, upload_btn, current_img)

	    submit_event = msg.submit(add_text, [chatbot, msg, current_img], chatbot).then(
	        bot_response, [chatbot, current_img], chatbot
	    )

	    click_event = btn.click(add_text, [chatbot, msg, current_img], chatbot).then(
	        bot_response, [chatbot, current_img], chatbot
	    )

	    # ``cancels`` takes the event objects returned by the listeners above,
	    # not the handler function itself.
	    stop_btn.click(None, None, None, cancels=[submit_event, click_event])

	# Launch the app with queuing
	demo.queue().launch()