import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, StoppingCriteria
from modeling_llava_qwen2 import LlavaQwen2ForCausalLM
from threading import Thread
import re
import time
from PIL import Image
import spaces
import subprocess
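# Note: installing flash-attn at runtime like this is a common workaround on GPU Spaces;
# the FLASH_ATTENTION_SKIP_CUDA_BUILD env var is meant to keep pip from compiling the
# CUDA extension from source during install.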
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
# Initialize tokenizer (doesn't require CUDA)
tokenizer = AutoTokenizer.from_pretrained(
    'qnguyen3/nanoLLaVA-1.5',
    trust_remote_code=True)
# Don't initialize model here - move it to the GPU-decorated function
model = None
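# Stopping criterion that ends generation once any of the keyword strings (here '<|im_end|>')
# appears in the newly generated tokens; this follows the pattern used in LLaVA-style demos.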
class KeywordsStoppingCriteria(StoppingCriteria):
    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            if len(cur_keyword_ids) > self.max_keyword_len:
                self.max_keyword_len = len(cur_keyword_ids)
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
            if torch.equal(truncated_output_ids, keyword_id):
                return True
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        outputs = []
        for i in range(output_ids.shape[0]):
            outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
        return all(outputs)
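# On ZeroGPU Spaces, @spaces.GPU allocates a GPU only for the duration of the call,
# so the model is loaded lazily inside this function rather than at import time.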
@spaces.GPU
def bot_streaming(message, history):
    global model
    # Initialize the model inside the GPU-decorated function
    if model is None:
        model = LlavaQwen2ForCausalLM.from_pretrained(
            'qnguyen3/nanoLLaVA-1.5',
            torch_dtype=torch.float16,
            attn_implementation="flash_attention_2",
            trust_remote_code=True,
            device_map="auto")  # Use "auto" instead of 'cpu' then manual to('cuda')
    # Get image path
    image = None
    if "files" in message and message["files"]:
        image = message["files"][-1]["path"]
    # Check if image is available
    if image is None:
        # This function is a generator, so yield the error message instead of returning it
        yield "Please upload an image for LLaVA to work."
        return
    # Prepare conversation messages
    messages = []
    if len(history) > 0:
        for human, assistant in history:
            # Skip None responses (which can happen during streaming)
            if assistant is not None:
                messages.append({"role": "user", "content": human})
                messages.append({"role": "assistant", "content": assistant})
        # Add the current message
        messages.append({"role": "user", "content": f"<image>\n{message['text']}" if len(messages) == 0 else message['text']})
    else:
        messages.append({"role": "user", "content": f"<image>\n{message['text']}"})

    # Process image
    image = Image.open(image).convert("RGB")

    # Prepare input for generation
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True)

    # Handle image embedding in text
    if '<image>' in text:
        text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
        # -200 is the placeholder index for the image token expected by the model's multimodal forward
        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
    else:
        # If no <image> tag was added (possible in some chat templates), add it manually
        input_ids = tokenizer(text).input_ids
        # Find the position to insert the image token
        # For simplicity, insert after the user message start
        user_start_pos = 0
        for i, token in enumerate(input_ids):
            if tokenizer.decode([token]) == '<|im_start|>user':
                user_start_pos = i + 2  # +2 to get past the tag
                break
        # Insert image token
        input_ids = input_ids[:user_start_pos] + [-200] + input_ids[user_start_pos:]
        input_ids = torch.tensor([input_ids], dtype=torch.long)

    # Prepare stopping criteria
    stop_str = '<|im_end|>'
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Process image and generate text
    image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=model.device)
    generation_kwargs = dict(
        input_ids=input_ids.to(model.device),  # move prompt tokens onto the model's device
        images=image_tensor,
        streamer=streamer,
        max_new_tokens=512,
        stopping_criteria=[stopping_criteria],
        temperature=0.01
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream response
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        generated_text_without_prompt = buffer[:]
        time.sleep(0.04)
        yield generated_text_without_prompt
# Create a gradio Blocks interface instead of ChatInterface
# This avoids the schema validation issues
with gr.Blocks(title="🚀nanoLLaVA-1.5") as demo:
    gr.Markdown("## 🚀nanoLLaVA-1.5")
    gr.Markdown("Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.")

    chatbot = gr.Chatbot(height=500)

    with gr.Row():
        with gr.Column(scale=0.8):
            msg = gr.Textbox(
                show_label=False,
                placeholder="Enter text and upload an image",
                container=False
            )
        with gr.Column(scale=0.2):
            btn = gr.Button("Submit")
            stop_btn = gr.Button("Stop Generation")

    upload_btn = gr.UploadButton("Upload Image", file_types=["image"])
    current_img = gr.State(None)

    # Example images
    examples = gr.Examples(
        examples=[
            ["Who is this guy?", "./demo_1.jpg"],
            ["What does the text say?", "./demo_2.jpeg"]
        ],
        inputs=[msg, upload_btn]
    )
    def upload_image(image):
        return image

    def add_text(history, text, image):
        if image is None and (not history or type(history[0][0]) != tuple):
            return history + [[text, "Please upload an image first."]]
        return history + [[text, None]]

    def bot_response(history, image):
        message = {"text": history[-1][0], "files": [{"path": image}] if image else []}
        history_format = history[:-1]  # All except the last message
        response = ""
        for chunk in bot_streaming(message, history_format):
            response = chunk
            history[-1][1] = response
            yield history
    upload_btn.upload(upload_image, upload_btn, current_img)

    submit_event = msg.submit(add_text, [chatbot, msg, current_img], chatbot).then(
        bot_response, [chatbot, current_img], chatbot
    )
    click_event = btn.click(add_text, [chatbot, msg, current_img], chatbot).then(
        bot_response, [chatbot, current_img], chatbot
    )
    # `cancels` expects the event listeners returned above, not the callback function itself
    stop_btn.click(None, None, None, cancels=[submit_event, click_event])
# Launch the app with queuing
demo.queue().launch()