sooooner committed on
Commit 68c034b · 1 Parent(s): dcbff67
Files changed (2)
  1. app.py +4 -9
  2. utils.py +18 -24
app.py CHANGED
@@ -6,14 +6,11 @@ import spaces
 
 from utils import Image2Text
 
-@spaces.GPU(duration=15)
+@spaces.GPU(duration=30)
 def greet(input_img):
     global image_to_text
-    print('-----------')
-    print(input_img[0])
-    print('-----------')
-    contents = image_to_text.get_text(input_img[0], num_beams=4)
-    return '\n'.join(contents)
+    contents = image_to_text.get_text(input_img, num_beams=4)
+    return contents
 
 examples_path = os.path.dirname(__file__)
 
@@ -26,15 +23,13 @@ if __name__ == "__main__":
 
     demo = gr.Interface(
         fn=greet,
-        # inputs="image",
         inputs=gr.File(
             label="Drag (Select) 1 or more photos of your face",
             file_types=["image"],
             file_count="multiple"
         ),
-        outputs="text",
+        outputs=gr.JSON(label="Extracted Texts"),
         title=f"🍩 for Hwp math problems",
-        # examples=[os.path.join(examples_path, "samples", img_name) for img_name in sorted(os.listdir("samples"))],
         cache_examples=True
     )
 
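For reference, app.py after this commit presumably assembles to roughly the sketch below; everything outside the two hunks (the os/spaces/gradio imports, the Image2Text setup, and the final launch call) is not shown in the diff and is assumed here.

import os
import spaces
import gradio as gr

from utils import Image2Text

@spaces.GPU(duration=30)
def greet(input_img):
    global image_to_text
    # gr.File(file_count="multiple") hands greet the list of uploaded files,
    # so the whole list is forwarded to get_text and a list comes back for gr.JSON
    contents = image_to_text.get_text(input_img, num_beams=4)
    return contents

examples_path = os.path.dirname(__file__)

if __name__ == "__main__":
    image_to_text = Image2Text()  # assumed setup; the real constructor arguments are not in the diff
    demo = gr.Interface(
        fn=greet,
        inputs=gr.File(
            label="Drag (Select) 1 or more photos of your face",
            file_types=["image"],
            file_count="multiple"
        ),
        outputs=gr.JSON(label="Extracted Texts"),
        title=f"🍩 for Hwp math problems",
        cache_examples=True
    )
    demo.launch()  # assumed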
 
utils.py CHANGED
@@ -1,10 +1,13 @@
 import os
-from typing import Union
-
+import re
+import torch
+import requests
+import numpy as np
 import PIL.Image
 import PIL.ImageOps
-import requests
-
+from PIL import Image
+from typing import Union
+from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig
 
 def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
     """
@@ -36,15 +39,6 @@ def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
     image = image.convert("RGB")
     return image
 
-
-
-
-import re
-import torch
-import numpy as np
-from PIL import Image
-from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig
-
 def aspect_ratio_preserving_resize_and_crop(image, target_width, target_height):
     width, height = image.size
     width_ratio = width / target_width
@@ -93,17 +87,17 @@ class Image2Text:
         return model, processor
 
     def load_img(self, inputs, width=480, height=480):
-        # image = Image.fromarray(inputs)
-        image = load_image(inputs)
-        image = aspect_ratio_preserving_resize_and_crop(image, target_width=width, target_height=height).convert("RGB")
+        images = [load_image(input_) for input_ in inputs]
+        images = [aspect_ratio_preserving_resize_and_crop(image, target_width=width, target_height=height) for image in images]
         img = self.processor(image , return_tensors="pt", size=(width, height)).pixel_values
-        pixel_values = img.to(self.device)
+        imgs = self.processor([image.convert("RGB") for image in images], return_tensors="pt", size=(width, height)).pixel_values
+        pixel_values = imgs.to(self.device)
         return pixel_values
 
     def generate(self, pixel_values, num_beams):
         outputs = self.model.generate(
             pixel_values,
-            decoder_input_ids=self.decoder_input_ids,
+            decoder_input_ids=self.decoder_input_ids.repeat(pixel_values.shape[0], 1),
             max_length=2048,
             early_stopping=True,
             pad_token_id=self.processor.tokenizer.pad_token_id,
@@ -116,12 +110,12 @@ class Image2Text:
         return outputs
 
     def postprocessing(self, outputs):
-        seq = self.processor.batch_decode(outputs.sequences)[0]
-        seq = seq.replace(self.processor.tokenizer.eos_token, "").replace(self.processor.tokenizer.pad_token, "")
-        seq = re.sub(r"<.*?>", "", seq, count=1).strip()
-        seq = self.processor.token2json(seq)
-        contents = seq['content'].split('[newline]')
-        return contents
+        seqs = self.processor.batch_decode(outputs.sequences)
+        seqs = [seq.replace(self.processor.tokenizer.eos_token, "").replace(self.processor.tokenizer.pad_token, "") for seq in seqs]
+        seqs = [re.sub(r"<.*?>", "", seq, count=1).strip() for seq in seqs]
+        seqs = [self.processor.token2json(seq) for seq in seqs]
+        contents = [seq['content'].split('[newline]') for seq in seqs]
+        return ['\n'.join(content) for content in contents]
 
     def get_text(self, img_path, num_beams=4):
         pixel_values = self.load_img(img_path)
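The batched generate call above tiles the single decoder prompt across the batch with .repeat, so every image gets the same task prefix. A minimal sketch of that tiling with hypothetical token ids (the real prompt ids come from the Donut processor and are not part of this diff):

import torch

decoder_input_ids = torch.tensor([[100, 101, 102]])   # hypothetical (1, prompt_len) task prompt
batch_size = 4                                        # e.g. pixel_values.shape[0] for four uploads

# repeat(batch_size, 1) copies the prompt row-wise, one identical prefix per image
batched_prompt = decoder_input_ids.repeat(batch_size, 1)
print(batched_prompt.shape)  # torch.Size([4, 3])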
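Taken together, get_text now takes a list of images and returns one newline-joined string per image, which the gr.JSON output renders as a list. A hypothetical usage sketch (the Image2Text constructor and checkpoint are assumed; it also assumes the leftover single-image line "img = self.processor(image , ...)" in load_img is removed, since image is no longer defined at that point):

from utils import Image2Text

image_to_text = Image2Text()  # assumed constructor; real arguments are not shown in this commit
texts = image_to_text.get_text(
    ["problem_1.png", "problem_2.png"],  # load_img now expects a list of paths or PIL images
    num_beams=4,
)
# one string per input image, e.g.
# ["x + 1 = 2\nx = 1", "2x = 4\nx = 2"]
print(texts)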