qnguyen3 committed on
Commit b6c6d0c · verified · 1 Parent(s): 13b8f1b

Update app.py

Files changed (1)
  1. app.py +61 -35
app.py CHANGED
@@ -6,23 +6,17 @@ from threading import Thread
 import re
 import time
 from PIL import Image
-import torch
 import spaces
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-torch.set_default_device('cuda')
-
+# Initialize tokenizer (doesn't require CUDA)
 tokenizer = AutoTokenizer.from_pretrained(
     'qnguyen3/nanoLLaVA-1.5',
     trust_remote_code=True)
 
-model = LlavaQwen2ForCausalLM.from_pretrained(
-    'qnguyen3/nanoLLaVA-1.5',
-    torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2",
-    trust_remote_code=True,
-    device_map='cpu')
+# Don't initialize model here - move it to the GPU-decorated function
+model = None
 
 class KeywordsStoppingCriteria(StoppingCriteria):
     def __init__(self, keywords, tokenizer, input_ids):
@@ -61,15 +55,34 @@ class KeywordsStoppingCriteria(StoppingCriteria):
 
 @spaces.GPU
 def bot_streaming(message, history):
-    messages = []
+    global model
+
+    # Initialize the model inside the GPU-decorated function
+    if model is None:
+        model = LlavaQwen2ForCausalLM.from_pretrained(
+            'qnguyen3/nanoLLaVA-1.5',
+            torch_dtype=torch.float16,
+            attn_implementation="flash_attention_2",
+            trust_remote_code=True,
+            device_map="auto")  # Use "auto" instead of 'cpu' then manual to('cuda')
+
+    # Get image path
+    image = None
     if message["files"]:
-        image = message["files"][-1]["path"]
+        image = message["files"][-1]["path"]
     else:
-        for i, hist in enumerate(history):
-            if type(hist[0])==tuple:
-                image = hist[0][0]
-                image_turn = i
-
+        for i, hist in enumerate(history):
+            if type(hist[0])==tuple:
+                image = hist[0][0]
+                image_turn = i
+                break
+
+    # Check if image is available
+    if image is None:
+        return "Please upload an image for LLaVA to work."
+
+    # Prepare conversation messages
+    messages = []
     if len(history) > 0 and image is not None:
         messages.append({"role": "user", "content": f'<image>\n{history[1][0]}'})
         messages.append({"role": "assistant", "content": history[1][1] })
@@ -86,44 +99,57 @@ def bot_streaming(message, history):
         messages.append({"role": "user", "content": f"<image>\n{message['text']}"})
     elif len(history) == 0 and image is None:
         messages.append({"role": "user", "content": message['text'] })
-    model = model.to('cuda')
 
-    # if image is None:
-    #     gr.Error("You need to upload an image for LLaVA to work.")
+    # Process image
     image = Image.open(image).convert("RGB")
+
+    # Prepare input for generation
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True)
     text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
     input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+
+    # Prepare stopping criteria
    stop_str = '<|im_end|>'
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
+    # Process image and generate text
     image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
-    generation_kwargs = dict(input_ids=input_ids.to('cuda'),
-                             images=image_tensor.to('cuda'),
-                             streamer=streamer, max_new_tokens=512,
-                             stopping_criteria=[stopping_criteria], temperature=0.01)
-    generated_text = ""
+    generation_kwargs = dict(
+        input_ids=input_ids,
+        images=image_tensor,
+        streamer=streamer,
+        max_new_tokens=512,
+        stopping_criteria=[stopping_criteria],
+        temperature=0.01
+    )
+
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-    text_prompt =f"<|im_start|>user\n{message['text']}<|im_end|>"
 
+    # Stream response
     buffer = ""
     for new_text in streamer:
-
-        buffer += new_text
-
-        generated_text_without_prompt = buffer[:]
-        time.sleep(0.04)
-        yield generated_text_without_prompt
+        buffer += new_text
+        generated_text_without_prompt = buffer[:]
+        time.sleep(0.04)
+        yield generated_text_without_prompt
+
 
 
-demo = gr.ChatInterface(fn=bot_streaming, title="🚀nanoLLaVA-1.5", examples=[{"text": "Who is this guy?", "files":["./demo_1.jpg"]},
-                        {"text": "What does the text say?", "files":["./demo_2.jpeg"]}],
-                        description="Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
-                        stop_btn="Stop Generation", multimodal=True)
+demo = gr.ChatInterface(
+    fn=bot_streaming,
+    title="🚀nanoLLaVA-1.5",
+    examples=[
+        {"text": "Who is this guy?", "files":["./demo_1.jpg"]},
+        {"text": "What does the text say?", "files":["./demo_2.jpeg"]}
+    ],
+    description="Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
+    stop_btn="Stop Generation",
+    multimodal=True
+)
 
 demo.queue().launch()
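
The core of the diff is a lazy-initialization pattern: on ZeroGPU Spaces a CUDA device is only attached while a @spaces.GPU-decorated function is running, so the model is now constructed inside bot_streaming on the first request instead of at import time. Below is a minimal sketch of that pattern, assuming the Hugging Face ZeroGPU `spaces` package; `load_model()` and `handler()` are hypothetical stand-ins, not names from app.py.

# Minimal sketch of the lazy-loading pattern adopted in this commit (not the full app).
# Assumes the Hugging Face ZeroGPU `spaces` package; load_model() is a hypothetical
# stand-in for the LlavaQwen2ForCausalLM.from_pretrained(...) call shown in the diff.
import spaces

model = None  # nothing touches CUDA at import time


def load_model():
    # placeholder for the real from_pretrained(..., device_map="auto") call
    raise NotImplementedError


@spaces.GPU
def handler(message, history):
    # ZeroGPU attaches a GPU only while this decorated call runs, so the
    # model is built here on the first request and reused afterwards.
    global model
    if model is None:
        model = load_model()
    # ... tokenize, generate, and stream as in app.py ...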