Spaces:

prithivMLmods
/

Doc-VLMs-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 25

Commit

66f98ad

verified ·

1 Parent(s): a71e844

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -6

app.py CHANGED Viewed

@@ -38,6 +38,7 @@ for name, model_id in MODEL_OPTIONS.items():
     ).to("cuda").eval()
     processors[name] = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 image_extensions = Image.registered_extensions()
 def identify_and_save_blob(blob_path):
@@ -69,15 +70,27 @@ def qwen_inference(model_name, media_input, text_input=None):
     model = models[model_name]
     processor = processors[model_name]
     if isinstance(media_input, str):
         media_path = media_input
-        if media_path.endswith(tuple([i for i in image_extensions.keys()])):
             media_type = "image"
         else:
             try:
                 media_path, media_type = identify_and_save_blob(media_input)
             except Exception as e:
                 raise ValueError("Unsupported media type. Please upload a valid image.")
     messages = [
         {
@@ -154,7 +167,7 @@ def generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, a
         "Justified": 4
     }[alignment]
-    # Register font
     font_path = f"font/{font_choice}"
     pdfmetrics.registerFont(TTFont(font_choice, font_path))
@@ -171,8 +184,8 @@ def generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, a
     story.append(Spacer(1, 12))
     # Add plain text output
-    text = Paragraph(plain_text, styles["Normal"])
-    story.append(text)
     doc.build(story)
     return filename
@@ -243,8 +256,9 @@ with gr.Blocks(css=css) as demo:
                     choices=list(MODEL_OPTIONS.keys()),
                     value="Latex OCR"
                 )
-                input_media = gr.File(
-                    label="Upload Image", type="filepath"
                 )
                 text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image...")
                 submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")

     ).to("cuda").eval()
     processors[name] = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+# Get valid image extensions from PIL
 image_extensions = Image.registered_extensions()
 def identify_and_save_blob(blob_path):
     model = models[model_name]
     processor = processors[model_name]
+    # Determine media type and obtain a file path if needed
     if isinstance(media_input, str):
+        # If the input is a file path, check extension
         media_path = media_input
+        if media_path.endswith(tuple(image_extensions.keys())):
             media_type = "image"
         else:
             try:
                 media_path, media_type = identify_and_save_blob(media_input)
             except Exception as e:
                 raise ValueError("Unsupported media type. Please upload a valid image.")
+    else:
+        # media_input is a PIL image (or numpy array) coming from gr.Image
+        if not isinstance(media_input, Image.Image):
+            # In case gr.Image returns a numpy array, convert it.
+            media_input = Image.fromarray(media_input)
+        # Save the image temporarily to disk
+        temp_filename = f"temp_{uuid.uuid4()}.png"
+        media_input.save(temp_filename)
+        media_path = temp_filename
+        media_type = "image"
     messages = [
         {
         "Justified": 4
     }[alignment]
+    # Register font (assumes font files are available in a folder named "font")
     font_path = f"font/{font_choice}"
     pdfmetrics.registerFont(TTFont(font_choice, font_path))
     story.append(Spacer(1, 12))
     # Add plain text output
+    text_para = Paragraph(plain_text, styles["Normal"])
+    story.append(text_para)
     doc.build(story)
     return filename
                     choices=list(MODEL_OPTIONS.keys()),
                     value="Latex OCR"
                 )
+                # Using gr.Image instead of gr.File for image upload
+                input_media = gr.Image(
+                    label="Upload Image", type="pil"
                 )
                 text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image...")
                 submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")