qnguyen3 committed (verified)
Commit: 50bd3d6
Parent: b6c6d0c

Update app.py

Files changed (1)
  1. app.py +87 -34
app.py CHANGED
@@ -68,14 +68,8 @@ def bot_streaming(message, history):
 
     # Get image path
     image = None
-    if message["files"]:
+    if "files" in message and message["files"]:
         image = message["files"][-1]["path"]
-    else:
-        for i, hist in enumerate(history):
-            if type(hist[0])==tuple:
-                image = hist[0][0]
-                image_turn = i
-                break
 
     # Check if image is available
     if image is None:
@@ -83,22 +77,16 @@ def bot_streaming(message, history):
 
     # Prepare conversation messages
     messages = []
-    if len(history) > 0 and image is not None:
-        messages.append({"role": "user", "content": f'<image>\n{history[1][0]}'})
-        messages.append({"role": "assistant", "content": history[1][1] })
-        for human, assistant in history[2:]:
-            messages.append({"role": "user", "content": human })
-            messages.append({"role": "assistant", "content": assistant })
-        messages.append({"role": "user", "content": message['text']})
-    elif len(history) > 0 and image is None:
+    if len(history) > 0:
         for human, assistant in history:
-            messages.append({"role": "user", "content": human })
-            messages.append({"role": "assistant", "content": assistant })
-        messages.append({"role": "user", "content": message['text']})
-    elif len(history) == 0 and image is not None:
+            # Skip None responses (which can happen during streaming)
+            if assistant is not None:
+                messages.append({"role": "user", "content": human})
+                messages.append({"role": "assistant", "content": assistant})
+        # Add the current message
+        messages.append({"role": "user", "content": f"<image>\n{message['text']}" if len(messages) == 0 else message['text']})
+    else:
         messages.append({"role": "user", "content": f"<image>\n{message['text']}"})
-    elif len(history) == 0 and image is None:
-        messages.append({"role": "user", "content": message['text'] })
 
     # Process image
     image = Image.open(image).convert("RGB")
@@ -108,8 +96,24 @@ def bot_streaming(message, history):
         messages,
         tokenize=False,
         add_generation_prompt=True)
-    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
-    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+
+    # Handle image embedding in text
+    if '<image>' in text:
+        text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+    else:
+        # If no <image> tag was added (possible in some chat templates), add it manually
+        input_ids = tokenizer(text).input_ids
+        # Find the position to insert the image token
+        # For simplicity, insert after the user message start
+        user_start_pos = 0
+        for i, token in enumerate(input_ids):
+            if tokenizer.decode([token]) == '<|im_start|>user':
+                user_start_pos = i + 2  # +2 to get past the tag
+                break
+        # Insert image token
+        input_ids = input_ids[:user_start_pos] + [-200] + input_ids[user_start_pos:]
+        input_ids = torch.tensor([input_ids], dtype=torch.long)
 
     # Prepare stopping criteria
     stop_str = '<|im_end|>'
@@ -140,16 +144,65 @@ def bot_streaming(message, history):
         yield generated_text_without_prompt
 
 
-demo = gr.ChatInterface(
-    fn=bot_streaming,
-    title="🚀nanoLLaVA-1.5",
-    examples=[
-        {"text": "Who is this guy?", "files":["./demo_1.jpg"]},
-        {"text": "What does the text say?", "files":["./demo_2.jpeg"]}
-    ],
-    description="Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
-    stop_btn="Stop Generation",
-    multimodal=True
-)
+# Create a gradio Blocks interface instead of ChatInterface
+# This avoids the schema validation issues
+with gr.Blocks(title="🚀nanoLLaVA-1.5") as demo:
+    gr.Markdown("## 🚀nanoLLaVA-1.5")
+    gr.Markdown("Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.")
+
+    chatbot = gr.Chatbot(height=500)
+    with gr.Row():
+        with gr.Column(scale=0.8):
+            msg = gr.Textbox(
+                show_label=False,
+                placeholder="Enter text and upload an image",
+                container=False
+            )
+        with gr.Column(scale=0.2):
+            btn = gr.Button("Submit")
+            stop_btn = gr.Button("Stop Generation")
+
+    upload_btn = gr.UploadButton("Upload Image", file_types=["image"])
+    current_img = gr.State(None)
+
+    # Example images
+    examples = gr.Examples(
+        examples=[
+            ["Who is this guy?", "./demo_1.jpg"],
+            ["What does the text say?", "./demo_2.jpeg"]
+        ],
+        inputs=[msg, upload_btn]
+    )
+
+    def upload_image(image):
+        return image
+
+    def add_text(history, text, image):
+        if image is None and (not history or type(history[0][0]) != tuple):
+            return history + [[text, "Please upload an image first."]]
+        return history + [[text, None]]
+
+    def bot_response(history, image):
+        message = {"text": history[-1][0], "files": [{"path": image}] if image else []}
+        history_format = history[:-1]  # All except the last message
+
+        response = ""
+        for chunk in bot_streaming(message, history_format):
+            response = chunk
+            history[-1][1] = response
+            yield history
+
+    upload_btn.upload(upload_image, upload_btn, current_img)
+
+    msg.submit(add_text, [chatbot, msg, current_img], chatbot).then(
+        bot_response, [chatbot, current_img], chatbot
+    )
+
+    btn.click(add_text, [chatbot, msg, current_img], chatbot).then(
+        bot_response, [chatbot, current_img], chatbot
+    )
+
+    stop_btn.click(None, None, None, cancels=[bot_response])
 
+# Launch the app with queuing
 demo.queue().launch()
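
Note on the image-token handling in the third hunk: the rendered prompt is split on the literal <image> marker and a sentinel id of -200 is spliced between the two tokenized halves; at generation time the model replaces that slot with the projected SigLIP image features (standard LLaVA-style wiring). The sketch below is a minimal, self-contained illustration of that splicing step only. The toy tokenizer and the sample prompt are stand-ins invented for illustration; the app itself tokenizes the chat-template output with the nanoLLaVA tokenizer via tokenizer(chunk).input_ids.

import torch

IMAGE_TOKEN_INDEX = -200  # sentinel id the app uses for the image slot

def toy_tokenize(chunk):
    # Stand-in tokenizer for illustration only; the real app calls
    # tokenizer(chunk).input_ids on the Qwen-based nanoLLaVA tokenizer.
    return [abs(hash(tok)) % 1000 for tok in chunk.split()]

def build_input_ids(prompt):
    # Mirrors the committed logic: split the rendered prompt on '<image>',
    # tokenize each half, and splice the sentinel id where the image
    # features will later be inserted by the model.
    chunks = [toy_tokenize(c) for c in prompt.split('<image>')]
    ids = chunks[0] + [IMAGE_TOKEN_INDEX] + chunks[1]
    return torch.tensor(ids, dtype=torch.long).unsqueeze(0)  # add batch dim

prompt = "<|im_start|>user\n<image>\nWhat does the text say?<|im_end|>\n<|im_start|>assistant\n"
input_ids = build_input_ids(prompt)
print(input_ids.shape)                              # torch.Size([1, N])
print((input_ids == IMAGE_TOKEN_INDEX).nonzero())   # position of the image slot

The else-branch added in the commit covers the case where the chat template drops the <image> tag, re-inserting the sentinel just after the user turn starts; the sketch above only covers the common path where the tag survives.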