BenkHel committed
Commit c2b8ea8 · verified · 1 Parent(s): 2d8021a

Update app.py

Files changed (1):
  1. app.py +66 -175
app.py CHANGED
@@ -1,42 +1,35 @@
 import subprocess
 import sys
 import os
-
 from transformers import TextIteratorStreamer
 import argparse
 import time
 import subprocess
 import spaces
 import cumo.serve.gradio_web_server as gws
-
 from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor
-
 import datetime
 import json
-
 import gradio as gr
 import requests
 from PIL import Image
-
 from cumo.conversation import (default_conversation, conv_templates, SeparatorStyle)
 from cumo.constants import LOGDIR
 from cumo.model.language_model.llava_mistral import LlavaMistralForCausalLM
 from cumo.utils import (build_logger, server_error_msg, violates_moderation, moderation_msg)
 import hashlib
-
 import torch
 import io
 from cumo.constants import WORKER_HEART_BEAT_INTERVAL
-from cumo.utils import (build_logger, server_error_msg,
-                        pretty_print_semaphore)
 from cumo.model.builder import load_pretrained_model
 from cumo.mm_utils import process_images, load_image_from_base64, tokenizer_image_token
 from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
-from transformers import TextIteratorStreamer
 from threading import Thread

 headers = {"User-Agent": "CuMo"}
-
 no_change_btn = gr.Button()
 enable_btn = gr.Button(interactive=True)
 disable_btn = gr.Button(interactive=False)
@@ -54,9 +47,10 @@ tokenizer, model, image_processor, context_len = load_pretrained_model(
 )
 model.config.training = False

-# FIXED PROMPT
 FIXED_PROMPT = "<image>\nWhat material is this item and how to dispose of it?"

 def clear_history():
     state = default_conversation.copy()
     return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
@@ -64,27 +58,12 @@ def clear_history():
 def add_text(state, imagebox, textbox, image_process_mode):
     if state is None:
         state = conv_templates[conv_mode].copy()
-
     if imagebox is not None:
-        try:
-            image = Image.open(imagebox).convert('RGB')
-        except Exception as e:
-            print(f"Failed to load image: {e}")
-            yield (state, state.to_gradio_chatbot(), "⚠️ Could not load example image.", None) + (enable_btn,) * 5
-            return
-
-        textbox = DEFAULT_IMAGE_TOKEN + "\nWhat material is this item and how to dispose of it?"
-        textbox = (textbox, image, image_process_mode)
-
-    else:
-        yield (state, state.to_gradio_chatbot(), "⚠️ Please upload or select an image first.", None) + (enable_btn,) * 5
-        return
-
-    state.append_message(state.roles[0], textbox)
-    state.append_message(state.roles[1], None)
-
-    yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
-

 def delete_text(state, image_process_mode):
     state.messages[-1][-1] = None
@@ -93,63 +72,42 @@ def delete_text(state, image_process_mode):
         prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
     yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)

-def regenerate(state, image_process_mode):
-    state.messages[-1][-1] = None
-    prev_human_msg = state.messages[-2]
-    if type(prev_human_msg[1]) in (tuple, list):
-        prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
-    state.skip_next = False
-    return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
-
 @spaces.GPU
 def generate(state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens):
-    prompt = FIXED_PROMPT  # <-- fixed here!
     images = state.get_images(return_pil=True)
-
     ori_prompt = prompt
     num_image_tokens = 0

-    if images is not None and len(images) > 0:
-        if len(images) > 0:
-            if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
-                raise ValueError("Number of images does not match number of <image> tokens in prompt")
-            image_sizes = [image.size for image in images]
-            images = process_images(images, image_processor, model.config)
-
-            if type(images) is list:
-                images = [image.to(model.device, dtype=torch.float16) for image in images]
-            else:
-                images = images.to(model.device, dtype=torch.float16)
-
-            replace_token = DEFAULT_IMAGE_TOKEN
-            if getattr(model.config, 'mm_use_im_start_end', False):
-                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
-            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
-            num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
         else:
-            images = None
-            image_sizes = None
         image_args = {"images": images, "image_sizes": image_sizes}
     else:
-        images = None
         image_args = {}

     max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
-    max_new_tokens = 512
-    do_sample = True if temperature > 0.001 else False
-    stop_str = state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2
-
     input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)
-
-    max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)
     if max_new_tokens < 1:
-        yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0"
         return

     thread = Thread(target=model.generate, kwargs=dict(
         inputs=input_ids,
-        do_sample=do_sample,
         temperature=temperature,
         top_p=top_p,
         max_new_tokens=max_new_tokens,
@@ -160,6 +118,8 @@ def generate(state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens):
     ))
     thread.start()
     generated_text = ''
     for new_text in streamer:
         generated_text += new_text
         if generated_text.endswith(stop_str):
@@ -169,51 +129,23 @@ def generate(state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens):
     yield (state, state.to_gradio_chatbot(), "", None) + (enable_btn,) * 5
     torch.cuda.empty_cache()

-title_markdown = ("""
-# CuMo: Trained for waste management
-""")
-
-tos_markdown = ("""
-### Please "🗑️ Clear" the output before offering a new picture!
-### Source and Terms of use
-This demo is based on the original CuMo project by SHI-Labs ([GitHub](https://github.com/SHI-Labs/CuMo)).
-If you use this service or build upon this work, please cite the original publication:
-Li, Jiachen and Wang, Xinyao and Zhu, Sijie and Kuo, Chia-wen and Xu, Lu and Chen, Fan and Jain, Jitesh and Shi, Humphrey and Wen, Longyin.
-CuMo: Scaling Multimodal LLM with Co-Upcycled Mixture-of-Experts. arXiv preprint, 2024.
-[[arXiv](https://arxiv.org/abs/2405.05949)]
-
-By using this service, users are required to agree to the following terms:
-The service is a research preview intended for non-commercial use only. It provides only limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes.
-
-For an optimal experience, please use a desktop computer for this demo, as mobile devices may compromise its quality.
-""")
-
-learn_more_markdown = ("""
-### License
-The service is a research preview intended for non-commercial use only. Please contact us if you find any potential violation.
-""")
-
-block_css = """
-#buttons button {
-    min-width: min(120px,100%);
-}
-"""

 textbox = gr.Textbox(
     show_label=False,
-    placeholder="Prompt is fixed: What material is this item and how to dispose of it?",
     container=False,
     interactive=False
 )

-with gr.Blocks(title="CuMo", theme=gr.themes.Default(), css=block_css) as demo:
     state = gr.State()

-    gr.Markdown(title_markdown)

     with gr.Row():
         with gr.Column(scale=3):
@@ -223,36 +155,27 @@ with gr.Blocks(title="CuMo", theme=gr.themes.Default(), css=block_css) as demo:
                 value="Default",
                 label="Preprocess for non-square image", visible=False)

-
-            #cur_dir = os.path.dirname(os.path.abspath(__file__))
             cur_dir = './cumo/serve'
-            default_prompt = "<image>\nWhat material is this item and how to dispose of it?"
             gr.Examples(examples=[
-                [f"{cur_dir}/examples/0165 CB.jpg", default_prompt],
-                [f"{cur_dir}/examples/0225 PA.jpg", default_prompt],
-                [f"{cur_dir}/examples/0787 GM.jpg", default_prompt],
-                [f"{cur_dir}/examples/1396 A.jpg", default_prompt],
-                [f"{cur_dir}/examples/2001 P.jpg", default_prompt],
-                [f"{cur_dir}/examples/2658 PE.jpg", default_prompt],
-                [f"{cur_dir}/examples/3113 R.jpg", default_prompt],
-                [f"{cur_dir}/examples/3750 RPC.jpg", default_prompt],
-                [f"{cur_dir}/examples/5033 CC.jpg", default_prompt],
-                [f"{cur_dir}/examples/5307 B.jpg", default_prompt],
-            ], inputs=[imagebox, textbox], cache_examples=False)
-
-
-            with gr.Accordion("Parameters", open=False) as parameter_row:
-                temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
-                top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
-                max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)

         with gr.Column(scale=8):
-            chatbot = gr.Chatbot(
-                elem_id="chatbot",
-                label="CuMo Chatbot",
-                height=650,
-                layout="panel",
-            )
             with gr.Row():
                 with gr.Column(scale=8):
                     textbox.render()
@@ -263,50 +186,18 @@ with gr.Blocks(title="CuMo", theme=gr.themes.Default(), css=block_css) as demo:
             regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
             clear_btn = gr.Button(value="🗑️ Clear", interactive=False)

-
     gr.Markdown(tos_markdown)
     gr.Markdown(learn_more_markdown)
     url_params = gr.JSON(visible=False)

-    # Register listeners
     btn_list = [regenerate_btn, clear_btn]
-    clear_btn.click(
-        clear_history,
-        None,
-        [state, chatbot, textbox, imagebox] + btn_list,
-        queue=False
-    )
-
-    regenerate_btn.click(
-        delete_text,
-        [state, image_process_mode],
-        [state, chatbot, textbox, imagebox] + btn_list,
-    ).then(
-        generate,
-        [state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens],
-        [state, chatbot, textbox, imagebox] + btn_list,
-    )
-    textbox.submit(
-        add_text,
-        [state, imagebox, textbox, image_process_mode],
-        [state, chatbot, textbox, imagebox] + btn_list,
-    ).then(
-        generate,
-        [state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens],
-        [state, chatbot, textbox, imagebox] + btn_list,
-    )
-
-    submit_btn.click(
-        add_text,
-        [state, imagebox, textbox, image_process_mode],
-        [state, chatbot, textbox, imagebox] + btn_list,
-    ).then(
-        generate,
-        [state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens],
-        [state, chatbot, textbox, imagebox] + btn_list,
-    )
-
-demo.queue(
-    status_update_rate=10,
-    api_open=False
-).launch()
 
+# --- Imports remain unchanged ---
 import subprocess
 import sys
 import os
 from transformers import TextIteratorStreamer
 import argparse
 import time
 import subprocess
 import spaces
 import cumo.serve.gradio_web_server as gws
 from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor
 import datetime
 import json
 import gradio as gr
 import requests
 from PIL import Image
 from cumo.conversation import (default_conversation, conv_templates, SeparatorStyle)
 from cumo.constants import LOGDIR
 from cumo.model.language_model.llava_mistral import LlavaMistralForCausalLM
 from cumo.utils import (build_logger, server_error_msg, violates_moderation, moderation_msg)
 import hashlib
 import torch
 import io
 from cumo.constants import WORKER_HEART_BEAT_INTERVAL
+from cumo.utils import (build_logger, server_error_msg, pretty_print_semaphore)
 from cumo.model.builder import load_pretrained_model
 from cumo.mm_utils import process_images, load_image_from_base64, tokenizer_image_token
 from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
 from threading import Thread

+# --- Model Setup ---
 headers = {"User-Agent": "CuMo"}
 no_change_btn = gr.Button()
 enable_btn = gr.Button(interactive=True)
 disable_btn = gr.Button(interactive=False)
 
 )
 model.config.training = False

+# --- Prompt ---
 FIXED_PROMPT = "<image>\nWhat material is this item and how to dispose of it?"

+# --- Functions ---
 def clear_history():
     state = default_conversation.copy()
     return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5

 def add_text(state, imagebox, textbox, image_process_mode):
     if state is None:
         state = conv_templates[conv_mode].copy()
     if imagebox is not None:
+        image = Image.open(imagebox).convert('RGB')
+        textbox = (FIXED_PROMPT, image, image_process_mode)
+        state.append_message(state.roles[0], textbox)
+        state.append_message(state.roles[1], None)
+        yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)

 def delete_text(state, image_process_mode):
     state.messages[-1][-1] = None

         prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
     yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)

 @spaces.GPU
 def generate(state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens):
+    prompt = FIXED_PROMPT
     images = state.get_images(return_pil=True)
     ori_prompt = prompt
     num_image_tokens = 0

+    if images and len(images) > 0:
+        if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
+            raise ValueError("Number of images does not match number of <image> tokens in prompt")
+        image_sizes = [image.size for image in images]
+        images = process_images(images, image_processor, model.config)
+        if isinstance(images, list):
+            images = [image.to(model.device, dtype=torch.float16) for image in images]
         else:
+            images = images.to(model.device, dtype=torch.float16)
+        replace_token = DEFAULT_IMAGE_TOKEN
+        if getattr(model.config, 'mm_use_im_start_end', False):
+            replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
+        prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
+        num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
         image_args = {"images": images, "image_sizes": image_sizes}
     else:
         image_args = {}

     max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
     input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
+    max_new_tokens = min(512, max_context_length - input_ids.shape[-1] - num_image_tokens)
     if max_new_tokens < 1:
+        yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation.", "error_code": 0}).encode() + b"\0"
         return

+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)
     thread = Thread(target=model.generate, kwargs=dict(
         inputs=input_ids,
+        do_sample=(temperature > 0.001),
         temperature=temperature,
         top_p=top_p,
         max_new_tokens=max_new_tokens,

     ))
     thread.start()
     generated_text = ''
+    stop_str = state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2
+
     for new_text in streamer:
         generated_text += new_text
         if generated_text.endswith(stop_str):

     yield (state, state.to_gradio_chatbot(), "", None) + (enable_btn,) * 5
     torch.cuda.empty_cache()

+# --- UI Setup ---
 textbox = gr.Textbox(
     show_label=False,
+    placeholder="Prompt is fixed: What material is this item and how to dispose of it?",
     container=False,
     interactive=False
 )

+with gr.Blocks(title="CuMo", theme=gr.themes.Default(), css="""
+#buttons button {
+    min-width: min(120px,100%);
+}
+""") as demo:
     state = gr.State()

+    gr.Markdown("# CuMo: Trained for waste management")
+    gr.Markdown(f"**Prompt:** `{FIXED_PROMPT}`")

     with gr.Row():
         with gr.Column(scale=3):

                 value="Default",
                 label="Preprocess for non-square image", visible=False)

             cur_dir = './cumo/serve'
             gr.Examples(examples=[
+                [f"{cur_dir}/examples/0165 CB.jpg"],
+                [f"{cur_dir}/examples/0225 PA.jpg"],
+                [f"{cur_dir}/examples/0787 GM.jpg"],
+                [f"{cur_dir}/examples/1396 A.jpg"],
+                [f"{cur_dir}/examples/2001 P.jpg"],
+                [f"{cur_dir}/examples/2658 PE.jpg"],
+                [f"{cur_dir}/examples/3113 R.jpg"],
+                [f"{cur_dir}/examples/3750 RPC.jpg"],
+                [f"{cur_dir}/examples/5033 CC.jpg"],
+                [f"{cur_dir}/examples/5307 B.jpg"],
+            ], inputs=[imagebox], cache_examples=False)
+
+            with gr.Accordion("Parameters", open=False):
+                temperature = gr.Slider(0.0, 1.0, value=0.2, step=0.1, interactive=True, label="Temperature")
+                top_p = gr.Slider(0.0, 1.0, value=0.7, step=0.1, interactive=True, label="Top P")
+                max_output_tokens = gr.Slider(0, 1024, value=512, step=64, interactive=True, label="Max output tokens")

         with gr.Column(scale=8):
+            chatbot = gr.Chatbot(elem_id="chatbot", label="CuMo Chatbot", height=650, layout="panel")
             with gr.Row():
                 with gr.Column(scale=8):
                     textbox.render()

             regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
             clear_btn = gr.Button(value="🗑️ Clear", interactive=False)

     gr.Markdown(tos_markdown)
     gr.Markdown(learn_more_markdown)
     url_params = gr.JSON(visible=False)

+    # --- Event Bindings ---
     btn_list = [regenerate_btn, clear_btn]
+    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list, queue=False)
+    regenerate_btn.click(delete_text, [state, image_process_mode], [state, chatbot, textbox, imagebox] + btn_list
+    ).then(generate, [state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens], [state, chatbot, textbox, imagebox] + btn_list)
+    textbox.submit(add_text, [state, imagebox, textbox, image_process_mode], [state, chatbot, textbox, imagebox] + btn_list
+    ).then(generate, [state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens], [state, chatbot, textbox, imagebox] + btn_list)
+    submit_btn.click(add_text, [state, imagebox, textbox, image_process_mode], [state, chatbot, textbox, imagebox] + btn_list
+    ).then(generate, [state, imagebox, textbox, image_process_mode, temperature, top_p, max_output_tokens], [state, chatbot, textbox, imagebox] + btn_list)
+
+demo.queue(status_update_rate=10, api_open=False).launch()
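
Note on the new token budget in generate(): the reply length is now clamped to whatever still fits in the context window after the tokenized prompt and the expanded image patch tokens. A minimal standalone sketch of that arithmetic, with illustrative numbers only (the real values come from model.config and the vision tower; the 576-patch count below is an assumption, not read from the model):

    # Sketch of generate()'s budget clamp:
    # max_new_tokens = min(512, context window - prompt tokens - image patch tokens)
    def reply_budget(max_context_length: int, prompt_len: int,
                     num_image_tokens: int, cap: int = 512) -> int:
        # Mirrors: min(512, max_context_length - input_ids.shape[-1] - num_image_tokens)
        return min(cap, max_context_length - prompt_len - num_image_tokens)

    print(reply_budget(2048, 40, 576))    # 512  -> the cap wins
    print(reply_budget(2048, 1500, 576))  # -28  -> below 1: error message, early return

A budget below 1 triggers the "Exceeds max token length" message and returns before the streamer thread is ever started.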