玙珲 committed
Commit e0ca852 · 1 Parent(s): 169061d

add thinking budget

Files changed (1):
  1. app.py +78 -30

app.py CHANGED
@@ -27,6 +27,25 @@ streamer = None
# This should point to the directory containing your SVG file.
CUR_DIR = os.path.dirname(os.path.abspath(__file__))

+
+ class MyTextIteratorStreamer(TextIteratorStreamer):
+     def manual_end(self):
+         """Flushes any remaining cache and prints a newline to stdout."""
+         # Flush the cache, if it exists
+         if len(self.token_cache) > 0:
+             text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+             printable_text = text[self.print_len :]
+             self.token_cache = []
+             self.print_len = 0
+         else:
+             printable_text = ""
+
+         self.next_tokens_are_prompt = True
+         self.on_finalized_text(printable_text, stream_end=True)
+
+     def end(self):
+         pass
+
def submit_chat(chatbot, text_input):
    response = ''
    chatbot.append([text_input, response])
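The end() override above is the key to the streaming behavior: generate() calls the streamer's end() when a generation pass finishes, and with a thinking budget the model may run two passes (thinking, then answering) over the same streamer. Making end() a no-op keeps the first pass from closing the iterator early; the caller closes the stream exactly once via manual_end(). A minimal sketch of the intended driving pattern, with an illustrative worker function that is not part of this commit:

    # Illustrative sketch, not part of the commit.
    from threading import Thread

    def _generation_worker():
        with torch.inference_mode():
            model.generate(**gen_kwargs)  # argument plumbing elided
        streamer.manual_end()  # safe to close: all generation passes are done

    thread = Thread(target=_generation_worker)
    thread.start()
    response = ""
    for new_text in streamer:  # iteration ends once manual_end() runs
        response += new_text
    thread.join()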
@@ -114,6 +133,8 @@ def run_inference(
    do_sample: bool,
    max_new_tokens: int,
    enable_thinking: bool,
+     enable_thinking_budget: bool,  # NEWLY ADDED
+     thinking_budget: int,  # NEWLY ADDED
):
    """
    Runs a single turn of inference and yields the output stream for a gr.Chatbot.
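Gradio passes the inputs list to a callback positionally, so the two new parameters must sit in the signature in the same order as in the run_inputs list updated later in this diff:

    # Position-for-position pairing (illustrative, matching this commit's lists):
    run_inputs = [chatbot, image_input, video_input,
                  do_sample, max_new_tokens, enable_thinking,
                  enable_thinking_budget, thinking_budget]
    # run_inference(chatbot, image_input, video_input, do_sample,
    #               max_new_tokens, enable_thinking,
    #               enable_thinking_budget, thinking_budget)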
@@ -122,14 +143,11 @@ def run_inference(
    prompt = chatbot[-1][0]
    if (not image_input and not video_input and not prompt) or not prompt:
        gr.Warning("A text prompt is required for generation.")
+         chatbot.pop(-1)
        # MODIFICATION: Yield the current state and return to avoid errors
        yield chatbot
        return

-     # MODIFICATION: Append the new prompt to the existing history
-     # chatbot.append([prompt, ""])
-     # yield chatbot, "" # Yield the updated chat to show the user's prompt immediately
-
    content = []
    if image_input:
        content.append({"type": "image", "image": image_input})
@@ -139,7 +157,7 @@ def run_inference(
            content.append({"type": "video", "video": frames})
        else:
            gr.Warning("Failed to process the video file.")
-             chatbot[-1][1] = "Error: Could not process the video file."
+             chatbot.pop(-1)
            yield chatbot
            return

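In both guards above, the failure path now pops the [prompt, ""] turn that submit_chat appended before run_inference started, so an invalid request no longer leaves an empty exchange, or an error string posing as a model answer, in the history; the error surfaces as a transient gr.Warning instead.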
@@ -154,7 +172,8 @@ def run_inference(
        else:
            input_ids, pixel_values, grid_thws = model.preprocess_inputs(messages=messages, add_generation_prompt=True, enable_thinking=enable_thinking)
    except Exception as e:
-         chatbot[-1][1] = f"Error during input preprocessing: {e}"
+         gr.Warning(f"Error during input preprocessing: {e}")
+         chatbot.pop(-1)
        yield chatbot
        return

@@ -170,7 +189,10 @@ def run_inference(
        "eos_token_id": model.text_tokenizer.eos_token_id,
        "pad_token_id": model.text_tokenizer.pad_token_id,
        "streamer": streamer,
-         "use_cache": True
+         "use_cache": True,
+         "enable_thinking": enable_thinking,
+         "enable_thinking_budget": enable_thinking_budget,
+         "thinking_budget": thinking_budget
    }

    with torch.inference_mode():
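The three new keys are forwarded to model.generate(), which implements the budget internally. The usual mechanism caps the hidden reasoning phase: generate at most thinking_budget reasoning tokens, force-close the thinking block if the model has not closed it itself, then spend the rest of max_new_tokens on the visible answer. A rough sketch of that idea, with hypothetical helpers (the real logic lives inside the model's generate):

    # Illustrative two-phase budget loop; the helper functions are hypothetical.
    def generate_with_budget(model, input_ids, thinking_budget, max_new_tokens, **kw):
        # Phase 1: reasoning, capped at thinking_budget tokens.
        seq = model.generate(input_ids, max_new_tokens=thinking_budget, **kw)
        if not thinking_closed(seq):  # hypothetical: did the model emit </think>?
            seq = append_text(seq, "\n</think>\n\n")  # hypothetical: force the block shut
        # Phase 2: the visible answer gets whatever budget remains.
        remaining = max_new_tokens - new_token_count(seq)  # hypothetical helper
        return model.generate(seq, max_new_tokens=remaining, **kw)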
@@ -197,16 +219,11 @@ def run_inference(
    chatbot[-1][1] = formatted_response
    yield chatbot  # Yield the final, formatted response

-     logger.info("[OVIS_CONV_START]")
-     [print(f'Q{i}:\n {request}\nA{i}:\n {answer}') for i, (request, answer) in enumerate(chatbot, 1)]
-     # print('New_Q:\n', text_input)
-     # print('New_A:\n', response)
+     logger.info("\n[OVIS_CONV_START]")
+     [print(f'Q{i}:\n {request}\nA{i}:\n {answer}\n') for i, (request, answer) in enumerate(chatbot, 1)]
    logger.info("[OVIS_CONV_END]")


- def clear_chat():
-     return [], None, ""
-
# --- UI Helper Functions ---
def toggle_media_input(choice: str) -> Tuple:
    """Switches visibility between Image/Video inputs and their corresponding examples."""
@@ -217,7 +234,6 @@ def toggle_media_input(choice: str) -> Tuple:


# --- Build Gradio Application ---
- # @spaces.GPU
def build_demo(model_path: str):
    """Builds the Gradio user interface for the model."""
    global model, streamer
@@ -231,7 +247,7 @@ def build_demo(model_path: str):
    ).to(device).eval()

    text_tokenizer = model.text_tokenizer
-     streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
+     streamer = MyTextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)

    print("Model loaded successfully.")

@@ -257,10 +273,22 @@ def build_demo(model_path: str):
    <center><font size=3><b>Ovis</b> has been open-sourced on <a href='https://huggingface.co/{model_path}'>😊 Huggingface</a> and <a href='https://github.com/AIDC-AI/Ovis'>🌟 GitHub</a>. If you find Ovis useful, a like❤️ or a star🌟 would be appreciated.</font></center>
    """

+     # --- START: Slider synchronization logic functions ---
+     def adjust_max_tokens(thinking_budget_val: int, max_new_tokens_val: int) -> gr.Slider:
+         """Adjusts max_new_tokens to be at least thinking_budget + 128."""
+         new_max_tokens = max(max_new_tokens_val, thinking_budget_val + 128)
+         return gr.update(value=new_max_tokens)
+
+     def adjust_thinking_budget(max_new_tokens_val: int, thinking_budget_val: int) -> gr.Slider:
+         """Adjusts thinking_budget to be at most max_new_tokens - 128."""
+         new_thinking_budget = min(thinking_budget_val, max_new_tokens_val - 128)
+         return gr.update(value=new_thinking_budget)
+     # --- END: Slider synchronization logic functions ---
+
    prompt_input = gr.Textbox(label="Prompt", placeholder="Enter your text here and press ENTER", lines=1, container=False)
    with gr.Blocks(theme=gr.themes.Ocean()) as demo:
        gr.HTML(html_header)
-         gr.Markdown("Note: you might have to increase \"Max New Tokens\" and wait longer to obtain answer when Deep Thinking is enabled.")
+         gr.Markdown("Note: The Thinking Budget mechanism is enabled only when `Deep Thinking` and `Thinking Budget` are both checked. You can lower `Thinking Budget` for faster generation in `Deep Thinking` mode.")

        with gr.Row():
            with gr.Column(scale=4):
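Together the two callbacks maintain the invariant thinking_budget + 128 <= max_new_tokens, so at least 128 tokens are always reserved for the visible answer; this matches the slider ranges added below (budget maximum 3968 = 4096 - 128). A quick trace, assuming gr.update() returns its usual update dict:

    # Illustrative trace, not part of the commit.
    adjust_max_tokens(2048, 1024)       # -> update(value=2176): max_new_tokens follows the budget up
    adjust_thinking_budget(1024, 2048)  # -> update(value=896): the budget is clamped back down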
@@ -270,10 +298,10 @@ def build_demo(model_path: str):

                with gr.Accordion("Generation Settings", open=True):
                    do_sample = gr.Checkbox(label="Enable Sampling (Do Sample)", value=True)
-                     max_new_tokens = gr.Slider(minimum=32, maximum=4096, value=2048, step=32, label="Max New Tokens")
-                     enable_thinking = gr.Checkbox(label="Enable Deep Thinking", value=False)
-
-
+                     enable_thinking = gr.Checkbox(label="Enable Deep Thinking", value=True)
+                     enable_thinking_budget = gr.Checkbox(label="Enable Thinking Budget", value=True)
+                     max_new_tokens = gr.Slider(minimum=256, maximum=4096, value=2048, step=32, label="Max New Tokens")
+                     thinking_budget = gr.Slider(minimum=128, maximum=3968, value=1024, step=32, label="Thinking Budget")

            with gr.Column(visible=True) as image_examples_col:
                gr.Examples(
@@ -297,30 +325,50 @@ def build_demo(model_path: str):
            generate_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear", variant="secondary")

+         # --- START: Event Handlers for UI Elements ---
+
        input_type_radio.change(
            fn=toggle_media_input,
            inputs=input_type_radio,
            outputs=[image_input, video_input, image_examples_col, video_examples_col]
        )

-         # MODIFICATION: Update event handlers to use the new function and manage state
-         run_inputs = [chatbot, image_input, video_input, do_sample, max_new_tokens, enable_thinking]
-         # run_outputs = [image_input, prompt_input]
+         # Event handlers for coupled sliders
+         thinking_budget.release(
+             fn=adjust_max_tokens,
+             inputs=[thinking_budget, max_new_tokens],
+             outputs=[max_new_tokens]
+         )
+         max_new_tokens.release(
+             fn=adjust_thinking_budget,
+             inputs=[max_new_tokens, thinking_budget],
+             outputs=[thinking_budget]
+         )
+
+         # MODIFICATION: Update run_inputs to include new controls
+         run_inputs = [chatbot, image_input, video_input, do_sample, max_new_tokens, enable_thinking, enable_thinking_budget, thinking_budget]

        generat_click_event = generate_btn.click(submit_chat, [chatbot, prompt_input], [chatbot, prompt_input]).then(run_inference, run_inputs, chatbot)
        submit_event = prompt_input.submit(submit_chat, [chatbot, prompt_input], [chatbot, prompt_input]).then(run_inference, run_inputs, chatbot)

+         # MODIFICATION: Update clear button to reset new controls
+         # clear_btn.click(
+         #     fn=lambda: ([], None, None, "", "Image", True, 2048, True, True, 1024),
+         #     outputs=[chatbot, image_input, video_input, prompt_input, input_type_radio, do_sample, max_new_tokens, enable_thinking, enable_thinking_budget, thinking_budget]
+         # ).then(
+         #     fn=toggle_media_input,
+         #     inputs=input_type_radio,
+         #     outputs=[image_input, video_input, image_examples_col, video_examples_col]
+         # )
        clear_btn.click(
-             fn=lambda: ([], None, None, "", "Image", True, 1024, False),
-             outputs=[chatbot, image_input, video_input, prompt_input, input_type_radio, do_sample, max_new_tokens, enable_thinking]
-         ).then(
-             fn=toggle_media_input,
-             inputs=input_type_radio,
-             outputs=[image_input, video_input, image_examples_col, video_examples_col]
+             fn=lambda: (list(), None, None, ""),
+             outputs=[chatbot, image_input, video_input, prompt_input]
        )
+         # --- END: Event Handlers for UI Elements ---

        return demo

+
# --- Main Execution Block ---
# def parse_args():
#     parser = argparse.ArgumentParser(description="Gradio interface for a single Multimodal Large Language Model.")
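The main execution block stays commented out in this commit; a minimal launch sketch under the usual Gradio conventions (the model path is a placeholder, not taken from this commit):

    # Illustrative entry point; the model path is a placeholder.
    if __name__ == "__main__":
        demo = build_demo(model_path="<your-ovis-model-path>")
        demo.queue().launch()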