Spaces:

prithivMLmods
/

Multimodal-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on May 5

Commit

8c1f8ea

verified ·

1 Parent(s): a8067dc

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -14

app.py CHANGED Viewed

@@ -15,11 +15,10 @@ from transformers import (
 from transformers import Qwen2_5_VLForConditionalGeneration
 # Helper Functions
 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
-    Colors can be customized; default colors are used for Qwen2VL/Aya‑Vision.
     """
     return f'''
 <div style="display: flex; align-items: center;">
@@ -36,7 +35,6 @@ def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_colo
 </style>
     '''
 def downsample_video(video_path):
     """
     Downsamples a video file by extracting 25 evenly spaced frames.
@@ -81,7 +79,7 @@ rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 # Main Inference Function
 @spaces.GPU
 def model_inference(input_dict, history, use_rolmocr=False):
-    text = input_dict.get("text", "").strip()
     files = input_dict.get("files", [])
     if not text and not files:
@@ -121,6 +119,7 @@ def model_inference(input_dict, history, use_rolmocr=False):
     model = rolmocr_model if use_rolmocr else qwen_model
     model_name = "RolmOCR" if use_rolmocr else "Qwen2VL OCR"
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     all_images = [item["image"] for item in content if item["type"] == "image"]
     inputs = processor(
@@ -130,31 +129,33 @@ def model_inference(input_dict, history, use_rolmocr=False):
         padding=True,
     ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
-    # Send initial progress bar
     yield progress_bar_html(f"Processing with {model_name}")
-    # Stream generation
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
         yield buffer
-    # Ensure generation is complete
-    thread.join()
-    # Save the full response to response.txt
     try:
         with open("response.txt", "w", encoding="utf-8") as f:
-            f.write(buffer)
     except Exception as e:
-        yield f"Error saving response: {e}"
 # Gradio Interface
 examples = [
@@ -163,7 +164,6 @@ examples = [
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
     description="# **Multimodal OCR `@RolmOCR and Default Qwen2VL OCR`**",
@@ -180,4 +180,4 @@ demo = gr.ChatInterface(
     additional_inputs=[gr.Checkbox(label="Use RolmOCR", value=False, info="Check to use RolmOCR, uncheck to use Qwen2VL OCR")],
 )
-demo.launch(debug=True)

 from transformers import Qwen2_5_VLForConditionalGeneration
 # Helper Functions
 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
+    Colors can be customized; default colors are used for Qwen2VL/Aya-Vision.
     """
     return f'''
 <div style="display: flex; align-items: center;">
 </style>
     '''
 def downsample_video(video_path):
     """
     Downsamples a video file by extracting 25 evenly spaced frames.
 # Main Inference Function
 @spaces.GPU
 def model_inference(input_dict, history, use_rolmocr=False):
+    text = input_dict["text"].strip()
     files = input_dict.get("files", [])
     if not text and not files:
     model = rolmocr_model if use_rolmocr else qwen_model
     model_name = "RolmOCR" if use_rolmocr else "Qwen2VL OCR"
+    # Prepare prompt and inputs
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     all_images = [item["image"] for item in content if item["type"] == "image"]
     inputs = processor(
         padding=True,
     ).to("cuda")
+    # Set up streaming
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     yield progress_bar_html(f"Processing with {model_name}")
+    # Stream tokens
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
         yield buffer
+    # Once streaming is done, save to response.txt and yield final result
+    results = buffer.strip()
     try:
         with open("response.txt", "w", encoding="utf-8") as f:
+            f.write(results)
     except Exception as e:
+        yield f"Error writing to response.txt: {e}"
+        return
+    yield results
+    return
 # Gradio Interface
 examples = [
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
     description="# **Multimodal OCR `@RolmOCR and Default Qwen2VL OCR`**",
     additional_inputs=[gr.Checkbox(label="Use RolmOCR", value=False, info="Check to use RolmOCR, uncheck to use Qwen2VL OCR")],
 )
+demo.launch(debug=True)