Spaces:

prithivMLmods
/

Qwen2.5-VL

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 23

Commit

c947ff2

verified ·

1 Parent(s): 5a70700

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -95

app.py CHANGED Viewed

@@ -1,11 +1,5 @@
 import gradio as gr
-from transformers import (
-    AutoProcessor,
-    Qwen2_5_VLForConditionalGeneration,
-    TextIteratorStreamer,
-    AutoModelForCausalLM,
-    AutoTokenizer,
-)
 from transformers.image_utils import load_image
 from threading import Thread
 import time
@@ -15,12 +9,6 @@ import cv2
 import numpy as np
 from PIL import Image
-# A constant for token length limit
-MAX_INPUT_TOKEN_LENGTH = 4096
-# -----------------------
-# Progress Bar Helper
-# -----------------------
 def progress_bar_html(label: str) -> str:
     """
     Returns an HTML snippet for a thin progress bar with a label.
@@ -41,9 +29,6 @@ def progress_bar_html(label: str) -> str:
 </style>
     '''
-# -----------------------
-# Video Downsampling Helper
-# -----------------------
 def downsample_video(video_path):
     """
     Downsamples the video to 10 evenly spaced frames.
@@ -69,40 +54,19 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
-# -----------------------
-# Qwen2.5-VL Multimodal Setup
-# -----------------------
-MODEL_ID_QWEN = "Qwen/Qwen2.5-VL-7B-Instruct"  # Alternatively: "Qwen/Qwen2.5-VL-3B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
-qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_QWEN,
     trust_remote_code=True,
-    torch_dtype=torch.float16  # Use float16 for more stability
 ).to("cuda").eval()
-# -----------------------
-# DeepHermes Text Generation Setup
-# -----------------------
-text_model_id = "prithivMLmods/DeepHermes-3-Llama-3-3B-Preview-abliterated"
-text_tokenizer = AutoTokenizer.from_pretrained(text_model_id)
-text_model = AutoModelForCausalLM.from_pretrained(
-    text_model_id,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-text_model.eval()
-# -----------------------
-# Main Inference Function
-# -----------------------
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
-    files = input_dict.get("files", [])
-    # -----------------------
-    # Video Inference Branch
-    # -----------------------
     if text.strip().lower().startswith("@video-infer"):
         # Remove the tag from the query.
         text = text[len("@video-infer"):].strip()
@@ -136,12 +100,10 @@ def model_inference(input_dict, history):
             return_tensors="pt",
             padding=True,
         ).to("cuda")
-        # Clear CUDA cache to reduce potential memory fragmentation.
-        torch.cuda.empty_cache()
         # Set up streaming generation.
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-        thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
         yield progress_bar_html("Processing video with Qwen2.5VL Model")
@@ -151,46 +113,6 @@ def model_inference(input_dict, history):
             yield buffer
         return
-    # -----------------------
-    # Text-Only Inference Branch (using DeepHermes text generation)
-    # -----------------------
-    if not files:
-        # Prepare a simple conversation for text-only input.
-        conversation = [{"role": "user", "content": text}]
-        # Use the text tokenizer’s chat template method.
-        input_ids = text_tokenizer.apply_chat_template(
-            conversation, add_generation_prompt=True, return_tensors="pt"
-        )
-        # Trim if necessary.
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(text_model.device)
-        streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "streamer": streamer,
-            "max_new_tokens": 1024,
-            "do_sample": True,
-            "top_p": 0.9,
-            "top_k": 50,
-            "temperature": 0.6,
-            "num_beams": 1,
-            "repetition_penalty": 1.2,
-        }
-        thread = Thread(target=text_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing with DeepHermes Text Generation Model")
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer
-        return
-    # -----------------------
-    # Multimodal (Image) Inference Branch with Qwen2.5-VL
-    # -----------------------
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
@@ -198,6 +120,9 @@ def model_inference(input_dict, history):
     else:
         images = []
     if text == "" and images:
         gr.Error("Please input a text query along with the image(s).")
         return
@@ -218,11 +143,9 @@ def model_inference(input_dict, history):
         return_tensors="pt",
         padding=True,
     ).to("cuda")
-    # Clear CUDA cache before generation.
-    torch.cuda.empty_cache()
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-    thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     yield progress_bar_html("Processing with Qwen2.5VL Model")
@@ -231,14 +154,11 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
-# -----------------------
-# Gradio Chat Interface
-# -----------------------
 examples = [
     [{"text": "Describe the Image?", "files": ["example_images/document.jpg"]}],
-    [{"text": "Tell me a story about a brave knight in a faraway kingdom."}],
     [{"text": "@video-infer Explain the content of the Advertisement", "files": ["example_images/videoplayback.mp4"]}],
     [{"text": "@video-infer Explain the content of the video in detail", "files": ["example_images/breakfast.mp4"]}],
 ]
 demo = gr.ChatInterface(
@@ -252,5 +172,4 @@ demo = gr.ChatInterface(
     cache_examples=False,
 )
-if __name__ == "__main__":
-    demo.launch(share=True, debug=True)

 import gradio as gr
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
 from transformers.image_utils import load_image
 from threading import Thread
 import time
 import numpy as np
 from PIL import Image
 def progress_bar_html(label: str) -> str:
     """
     Returns an HTML snippet for a thin progress bar with a label.
 </style>
     '''
 def downsample_video(video_path):
     """
     Downsamples the video to 10 evenly spaced frames.
     vidcap.release()
     return frames
+MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"  # Alternatively: "Qwen/Qwen2.5-VL-3B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
     trust_remote_code=True,
+    torch_dtype=torch.bfloat16
 ).to("cuda").eval()
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
+    files = input_dict["files"]
     if text.strip().lower().startswith("@video-infer"):
         # Remove the tag from the query.
         text = text[len("@video-infer"):].strip()
             return_tensors="pt",
             padding=True,
         ).to("cuda")
         # Set up streaming generation.
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
         yield progress_bar_html("Processing video with Qwen2.5VL Model")
             yield buffer
         return
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
     else:
         images = []
+    if text == "" and not images:
+        gr.Error("Please input a query and optionally image(s).")
+        return
     if text == "" and images:
         gr.Error("Please input a text query along with the image(s).")
         return
         return_tensors="pt",
         padding=True,
     ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     yield progress_bar_html("Processing with Qwen2.5VL Model")
         time.sleep(0.01)
         yield buffer
 examples = [
     [{"text": "Describe the Image?", "files": ["example_images/document.jpg"]}],
     [{"text": "@video-infer Explain the content of the Advertisement", "files": ["example_images/videoplayback.mp4"]}],
     [{"text": "@video-infer Explain the content of the video in detail", "files": ["example_images/breakfast.mp4"]}],
+    [{"text": "@video-infer Explain the content of the video.", "files": ["example_images/sky.mp4"]}],
 ]
 demo = gr.ChatInterface(
     cache_examples=False,
 )
+demo.launch(debug=True)