Spaces:

prithivMLmods
/

Multimodal-OCR

Running on Zero

App Files Community

prithivMLmods committed on Feb 4

Commit

5633a75

verified ·

1 Parent(s): a4cab0f

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -11

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ from threading import Thread
 import time
 import torch
 import spaces
 # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
@@ -22,18 +23,25 @@ def model_inference(input_dict, history):
     # Load images if provided
     if len(files) > 1:
-        images = [load_image(image) for image in files]
     elif len(files) == 1:
-        images = [load_image(files[0])]
     else:
         images = []
     # Validate input
-    if text == "" and not images:
-        gr.Error("Please input a query and optionally image(s).")
         return
-    if text == "" and images:
-        gr.Error("Please input a text query along with the image(s).")
         return
     # Prepare messages for the model
@@ -42,18 +50,24 @@ def model_inference(input_dict, history):
             "role": "user",
             "content": [
                 *[{"type": "image", "image": image} for image in images],
                 {"type": "text", "text": text},
             ],
         }
     ]
     # Apply chat template and process inputs
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt],
-        images=images if images else None,
-        return_tensors="pt",
         padding=True,
     ).to("cuda")
     # Set up streamer for real-time output
@@ -76,7 +90,6 @@ def model_inference(input_dict, history):
 # Example inputs
 examples = [
     [{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
     [{"text": "summarize the letter", "files": ["examples/1.png"]}],
     [{"text": "Describe the photo", "files": ["examples/3.png"]}],
@@ -87,14 +100,14 @@ examples = [
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
     description="# **Multimodal OCR**",
     examples=examples,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
     stop_btn="Stop Generation",
     multimodal=True,
     cache_examples=False,

 import time
 import torch
 import spaces
+from qwen_vl_utils import process_vision_info
 # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
     # Load images if provided
     if len(files) > 1:
+        images = [load_image(image) for image in files if image.endswith(('png', 'jpg', 'jpeg'))]
+        videos = [video for video in files if video.endswith(('mp4', 'avi', 'mov'))]
     elif len(files) == 1:
+        if files[0].endswith(('png', 'jpg', 'jpeg')):
+            images = [load_image(files[0])]
+            videos = []
+        else:
+            images = []
+            videos = [files[0]]
     else:
         images = []
+        videos = []
     # Validate input
+    if text == "" and not images and not videos:
+        gr.Error("Please input a query and optionally image(s) or video(s).")
         return
+    if text == "" and (images or videos):
+        gr.Error("Please input a text query along with the image(s) or video(s).")
         return
     # Prepare messages for the model
             "role": "user",
             "content": [
                 *[{"type": "image", "image": image} for image in images],
+                *[{"type": "video", "video": video} for video in videos],
                 {"type": "text", "text": text},
             ],
         }
     ]
+    # Process vision info (images and videos)
+    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
     # Apply chat template and process inputs
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt],
+        images=image_inputs,
+        videos=video_inputs,
         padding=True,
+        return_tensors="pt",
+        **video_kwargs,
     ).to("cuda")
     # Set up streamer for real-time output
 # Example inputs
 examples = [
     [{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
     [{"text": "summarize the letter", "files": ["examples/1.png"]}],
     [{"text": "Describe the photo", "files": ["examples/3.png"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
+    [{"text": "Describe the video.", "files": ["example_videos/sample.mp4"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
     description="# **Multimodal OCR**",
     examples=examples,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
     stop_btn="Stop Generation",
     multimodal=True,
     cache_examples=False,