import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, StoppingCriteria
from modeling_llava_qwen2 import LlavaQwen2ForCausalLM
from threading import Thread
import re
import time
from PIL import Image
import spaces
import subprocess
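# Note: installing flash-attn at runtime like this is a common workaround on GPU Spaces;
# the FLASH_ATTENTION_SKIP_CUDA_BUILD env var is meant to keep pip from compiling the
# CUDA extension from source during install.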
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
# Initialize tokenizer (doesn't require CUDA)
tokenizer = AutoTokenizer.from_pretrained(
    'qnguyen3/nanoLLaVA-1.5',
    trust_remote_code=True)
# Don't initialize model here - move it to the GPU-decorated function
model = None
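# Stopping criterion that ends generation once any of the keyword strings (here '<|im_end|>')
# appears in the newly generated tokens; this follows the pattern used in LLaVA-style demos.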
class KeywordsStoppingCriteria(StoppingCriteria):
    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            if len(cur_keyword_ids) > self.max_keyword_len:
                self.max_keyword_len = len(cur_keyword_ids)
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
            if torch.equal(truncated_output_ids, keyword_id):
                return True
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        outputs = []
        for i in range(output_ids.shape[0]):
            outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
        return all(outputs)
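# On ZeroGPU Spaces, @spaces.GPU allocates a GPU only for the duration of the call,
# so the model is loaded lazily inside this function rather than at import time.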
@spaces.GPU
def bot_streaming(message, history):
    global model
    # Initialize the model inside the GPU-decorated function
    if model is None:
        model = LlavaQwen2ForCausalLM.from_pretrained(
            'qnguyen3/nanoLLaVA-1.5',
            torch_dtype=torch.float16,
            attn_implementation="flash_attention_2",
            trust_remote_code=True,
            device_map="auto")  # Use "auto" instead of 'cpu' then manual to('cuda')
    # Get image path
    image = None
    if "files" in message and message["files"]:
        image = message["files"][-1]["path"]
    # Check if image is available
    if image is None:
        # This function is a generator, so yield the error message instead of returning it
        yield "Please upload an image for LLaVA to work."
        return
    # Prepare conversation messages
    messages = []
    if len(history) > 0:
        for human, assistant in history:
            # Skip None responses (which can happen during streaming)
            if assistant is not None:
                messages.append({"role": "user", "content": human})
                messages.append({"role": "assistant", "content": assistant})
        # Add the current message
        messages.append({"role": "user", "content": f"<image>\n{message['text']}" if len(messages) == 0 else message['text']})
    else:
        messages.append({"role": "user", "content": f"<image>\n{message['text']}"})

    # Process image
    image = Image.open(image).convert("RGB")

    # Prepare input for generation
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True)

    # Handle image embedding in text
    if '<image>' in text:
        text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
        # -200 is the placeholder index for the image token expected by the model's multimodal forward
        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
    else:
        # If no <image> tag was added (possible in some chat templates), add it manually
        input_ids = tokenizer(text).input_ids
        # Find the position to insert the image token
        # For simplicity, insert after the user message start
        user_start_pos = 0
        for i, token in enumerate(input_ids):
            if tokenizer.decode([token]) == '<|im_start|>user':
                user_start_pos = i + 2  # +2 to get past the tag
                break
        # Insert image token
        input_ids = input_ids[:user_start_pos] + [-200] + input_ids[user_start_pos:]
        input_ids = torch.tensor([input_ids], dtype=torch.long)

    # Prepare stopping criteria
    stop_str = '<|im_end|>'
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Process image and generate text
    image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=model.device)
    generation_kwargs = dict(
        input_ids=input_ids.to(model.device),  # move prompt tokens onto the model's device
        images=image_tensor,
        streamer=streamer,
        max_new_tokens=512,
        stopping_criteria=[stopping_criteria],
        temperature=0.01
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream response
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        generated_text_without_prompt = buffer[:]
        time.sleep(0.04)
        yield generated_text_without_prompt
# Create a gradio Blocks interface instead of ChatInterface
# This avoids the schema validation issues
with gr.Blocks(title="🚀nanoLLaVA-1.5") as demo:
    gr.Markdown("## 🚀nanoLLaVA-1.5")
    gr.Markdown("Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.")

    chatbot = gr.Chatbot(height=500)

    with gr.Row():
        with gr.Column(scale=0.8):
            msg = gr.Textbox(
                show_label=False,
                placeholder="Enter text and upload an image",
                container=False
            )
        with gr.Column(scale=0.2):
            btn = gr.Button("Submit")
            stop_btn = gr.Button("Stop Generation")

    upload_btn = gr.UploadButton("Upload Image", file_types=["image"])
    current_img = gr.State(None)

    # Example images
    examples = gr.Examples(
        examples=[
            ["Who is this guy?", "./demo_1.jpg"],
            ["What does the text say?", "./demo_2.jpeg"]
        ],
        inputs=[msg, upload_btn]
    )
    def upload_image(image):
        return image

    def add_text(history, text, image):
        if image is None and (not history or type(history[0][0]) != tuple):
            return history + [[text, "Please upload an image first."]]
        return history + [[text, None]]

    def bot_response(history, image):
        message = {"text": history[-1][0], "files": [{"path": image}] if image else []}
        history_format = history[:-1]  # All except the last message
        response = ""
        for chunk in bot_streaming(message, history_format):
            response = chunk
            history[-1][1] = response
            yield history
    upload_btn.upload(upload_image, upload_btn, current_img)

    submit_event = msg.submit(add_text, [chatbot, msg, current_img], chatbot).then(
        bot_response, [chatbot, current_img], chatbot
    )
    click_event = btn.click(add_text, [chatbot, msg, current_img], chatbot).then(
        bot_response, [chatbot, current_img], chatbot
    )
    # `cancels` expects the event listeners returned above, not the callback function itself
    stop_btn.click(None, None, None, cancels=[submit_event, click_event])
# Launch the app with queuing
demo.queue().launch()