prithivMLmods committed · verified
Commit a520e3c · Parent: 36ebfe1

Update app.py

Files changed (1): app.py (+98 -47)
app.py CHANGED
@@ -4,6 +4,7 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForImageTextToText,
+    Gemma3ForConditionalGeneration # new Gemma3 model import
 )
 from transformers.image_utils import load_image
 from threading import Thread
@@ -31,7 +32,10 @@ def progress_bar_html(label: str) -> str:
 </style>
 '''
 
-QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct" # or use #prithivMLmods/Qwen2-VL-OCR2-2B-Instruct
+### Load Models & Processors ###
+
+# Qwen2VL OCR model (default)
+QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct" # or alternate version
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     QV_MODEL_ID,
@@ -39,62 +43,105 @@ qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
+# Aya-Vision model (trigger with @aya-vision)
 AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
 aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
 aya_model = AutoModelForImageTextToText.from_pretrained(
     AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
 )
 
+# Gemma3-4b model (trigger with @gemma3-4b)
+GEMMA3_MODEL_ID = "google/gemma-3-4b-it"
+gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
+    GEMMA3_MODEL_ID, device_map="auto"
+).eval()
+gemma3_processor = AutoProcessor.from_pretrained(GEMMA3_MODEL_ID)
+
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
 
+    # Branch: Aya-Vision (trigger with @aya-vision)
     if text.lower().startswith("@aya-vision"):
-        # Remove the command prefix and trim the prompt.
         text_prompt = text[len("@aya-vision"):].strip()
         if not files:
             yield "Error: Please provide an image for the @aya-vision feature."
             return
-        else:
-            # For simplicity, use the first provided image.
-            image = load_image(files[0])
-            yield progress_bar_html("Processing with Aya-Vision-8b")
-            messages = [{
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": text_prompt},
-                ],
-            }]
-            inputs = aya_processor.apply_chat_template(
-                messages,
-                padding=True,
-                add_generation_prompt=True,
-                tokenize=True,
-                return_dict=True,
-                return_tensors="pt"
-            ).to(aya_model.device)
-            # Set up a streamer for Aya-Vision output
-            streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
-            generation_kwargs = dict(
-                inputs,
-                streamer=streamer,
-                max_new_tokens=1024,
-                do_sample=True,
-                temperature=0.3
-            )
-            thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
-            thread.start()
-            buffer = ""
-            for new_text in streamer:
-                buffer += new_text
-                buffer = buffer.replace("<|im_end|>", "")
-                time.sleep(0.01)
-                yield buffer
-            return
+        image = load_image(files[0])
+        yield progress_bar_html("Processing with Aya-Vision-8b")
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": text_prompt},
+            ],
+        }]
+        inputs = aya_processor.apply_chat_template(
+            messages,
+            padding=True,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to(aya_model.device)
+        streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(
+            inputs,
+            streamer=streamer,
+            max_new_tokens=1024,
+            do_sample=True,
+            temperature=0.3
+        )
+        thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+        return
+
+    # Branch: Gemma3-4b (trigger with @gemma3-4b)
+    if text.lower().startswith("@gemma3-4b"):
+        text_prompt = text[len("@gemma3-4b"):].strip()
+        if not files:
+            yield "Error: Please provide an image for the @gemma3-4b feature."
+            return
+        image = load_image(files[0])
+        yield progress_bar_html("Processing with Gemma3-4b")
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant."}]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": text_prompt}
+                ]
+            }
+        ]
+        inputs = gemma3_processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True,
+            return_dict=True, return_tensors="pt"
+        ).to(gemma3_model.device, dtype=torch.bfloat16)
+        input_len = inputs["input_ids"].shape[-1]
+        streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512, do_sample=False)
+        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+        return
 
-    # Load images if provided.
+    # Default Branch: Qwen2-VL OCR (for text query with optional images)
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
@@ -102,7 +149,6 @@ def model_inference(input_dict, history):
     else:
         images = []
 
-    # Validate input: require both text and (optionally) image(s).
     if text == "" and not images:
         yield "Error: Please input a query and optionally image(s)."
         return
@@ -110,7 +156,6 @@ def model_inference(input_dict, history):
         yield "Error: Please input a text query along with the image(s)."
         return
 
-    # Prepare messages for the Qwen2-VL model.
     messages = [{
         "role": "user",
         "content": [
@@ -129,11 +174,9 @@ def model_inference(input_dict, history):
         padding=True,
     ).to("cuda")
 
-    # Set up a streamer for real-time output.
     streamer = TextIteratorStreamer(qwen_processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
 
-    # Start generation in a separate thread.
     thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
     thread.start()
 
@@ -145,28 +188,36 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
 
+# Examples for quick testing.
 examples = [
-    [{"text": "@aya-vision Summarize the letter", "files": ["examples/1.png"]}],
-    [{"text": "@aya-vision Extract JSON from the image", "files": ["example_images/document.jpg"]}],
-    [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
-    [{"text": "@aya-vision Describe the photo", "files": ["examples/3.png"]}],
+    [{"text": "@gemma3-4b Summarize the letter", "files": ["examples/1.png"]}],
+    [{"text": "@gemma3-4b Extract JSON from the image", "files": ["example_images/document.jpg"]}],
+    [{"text": "@gemma3-4b Describe the photo", "files": ["examples/3.png"]}],
     [{"text": "@aya-vision Summarize the full image in detail", "files": ["examples/2.jpg"]}],
     [{"text": "@aya-vision Describe this image.", "files": ["example_images/campeones.jpg"]}],
     [{"text": "@aya-vision What is this UI about?", "files": ["example_images/s2w_example.png"]}],
+    [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
    [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "@aya-vision Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
 ]
 
+# Gradio ChatInterface with a multimodal textbox.
 demo = gr.ChatInterface(
     fn=model_inference,
-    description="# **Multimodal OCR `@aya-vision 'prompt..'`**",
+    description=(
+        "# **Multimodal OCR & Vision Features**\n\n"
+        "Use the following commands to select a model:\n"
+        "- `@aya-vision` for Aya-Vision-8b\n"
+        "- `@gemma3-4b` for Gemma3-4b\n\n"
+        "Default processing is done with Qwen2VL OCR."
+    ),
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
         file_types=["image"],
         file_count="multiple",
-        placeholder="By default, it runs Qwen2VL OCR, Tag @aya-vision for Aya Vision 8B"
+        placeholder="Enter your text query and attach images if needed. Use @aya-vision or @gemma3-4b to choose a feature."
     ),
     stop_btn="Stop Generation",
     multimodal=True,
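
The new @gemma3-4b branch added by this commit can also be exercised outside the Gradio app. The sketch below mirrors the committed code path (same model ID, chat-template call, and threaded streaming setup); the image path and prompt are copied from the first new entry in the examples list, and it assumes a transformers release that ships Gemma3ForConditionalGeneration plus a CUDA-capable GPU, neither of which is pinned by the commit.

# Minimal sketch of the @gemma3-4b flow from this commit, run standalone.
# Assumptions (not stated in the commit): transformers with Gemma3 support,
# a CUDA GPU, and a local test image at "examples/1.png".
import torch
from threading import Thread
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
from transformers.image_utils import load_image

GEMMA3_MODEL_ID = "google/gemma-3-4b-it"
gemma3_processor = AutoProcessor.from_pretrained(GEMMA3_MODEL_ID)
gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
    GEMMA3_MODEL_ID, device_map="auto"
).eval()

image = load_image("examples/1.png")
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "Summarize the letter"},
    ]},
]
inputs = gemma3_processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt",
).to(gemma3_model.device, dtype=torch.bfloat16)

# Same streaming pattern as app.py: generate() runs on a worker thread while
# TextIteratorStreamer yields decoded text chunks on the main thread.
streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512, do_sample=False)
Thread(target=gemma3_model.generate, kwargs=generation_kwargs).start()
for chunk in streamer:
    print(chunk, end="", flush=True)

Inside the Space itself, the same branch is reached through model_inference({"text": "@gemma3-4b Summarize the letter", "files": ["examples/1.png"]}, history=[]), which is exactly what the first new example entry exercises.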