Spaces:

prithivMLmods
/

Multimodal-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on May 5

Commit

f22b5b6

verified ·

1 Parent(s): 564e537

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -4

app.py CHANGED Viewed

@@ -18,7 +18,6 @@ from transformers import Qwen2_5_VLForConditionalGeneration
 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
-    Colors can be customized; default colors are used for Qwen2VL/Aya‑Vision.
     """
     return f'''
 <div style="display: flex; align-items: center;">
@@ -37,7 +36,7 @@ def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_colo
 def downsample_video(video_path):
     """
-    Downsamples a video file by extracting 10 evenly spaced frames.
     Returns a list of tuples (PIL.Image, timestamp).
     """
     vidcap = cv2.VideoCapture(video_path)
@@ -132,14 +131,28 @@ def model_inference(input_dict, history, use_rolmocr=False):
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     yield progress_bar_html(f"Processing with {model_name}")
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
         yield buffer
 # Gradio Interface
 examples = [
     [{"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
@@ -149,7 +162,7 @@ examples = [
 demo = gr.ChatInterface(
     fn=model_inference,
-    description="# **Multimodal OCR `@RolmOCR and Default Qwen2VL OCR`**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
@@ -163,4 +176,5 @@ demo = gr.ChatInterface(
     additional_inputs=[gr.Checkbox(label="Use RolmOCR", value=False, info="Check to use RolmOCR, uncheck to use Qwen2VL OCR")],
 )
-demo.launch(debug=True)

 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
     """
     return f'''
 <div style="display: flex; align-items: center;">
 def downsample_video(video_path):
     """
+    Downsamples a video file by extracting 25 evenly spaced frames.
     Returns a list of tuples (PIL.Image, timestamp).
     """
     vidcap = cv2.VideoCapture(video_path)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     yield progress_bar_html(f"Processing with {model_name}")
+    # Stream tokens
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
         yield buffer
+    # Ensure generation finished
+    thread.join()
+    # Write final response to file
+    try:
+        with open("response.txt", "w", encoding="utf-8") as f:
+            f.write(buffer.strip())
+    except Exception as e:
+        # If writing fails, you can log or yield an error message
+        yield f"Warning: could not write response to file: {e}"
 # Gradio Interface
 examples = [
     [{"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
 demo = gr.ChatInterface(
     fn=model_inference,
+    description="# **Multimodal OCR `RolmOCR and Default Qwen2VL OCR`**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
     additional_inputs=[gr.Checkbox(label="Use RolmOCR", value=False, info="Check to use RolmOCR, uncheck to use Qwen2VL OCR")],
 )
+if __name__ == "__main__":
+    demo.launch(debug=True)