MinerU

Paused

App Files Files Community

Arsenii11 committed on Feb 24

Commit

aa23348

1 Parent(s): 5783dae

Changed prompt

Browse files

Files changed (1) hide show

mineru_single.py +96 -9

mineru_single.py CHANGED Viewed

@@ -52,6 +52,7 @@ class Processor:
         except Exception as e:
             logger.error("Failed to initialize Processor: %s", str(e))
             raise
     def cleanup_gpu(self):
         """
         Releases GPU memory, use garbage collection to clear PyTorch's CUDA cache.
@@ -63,6 +64,7 @@ class Processor:
             logger.info("GPU memory cleaned up.")
         except Exception as e:
             logger.error("Error during GPU cleanup: %s", e)
     def process(self, file_url: str, key: str) -> str:
         """
         Process a single PDF, returning final Markdown with irrelevant images removed.
@@ -211,9 +213,9 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
                                     Additionally, if the image contains a truncated part, you must describe it and mark as a
                                     part of some another image that goes before or after current image.
-                                    If an image is a multiple-choice question's options, make sure to modify your answer to add
-                                    "MCQ: A option B option C option D option", where MCQ is a descriptor and "option" would be
-                                    replaced with actual option from image.
                         """},
                         {
                             "inline_data": {
@@ -235,15 +237,100 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
         return ("error", "Error describing image", None)
-if __name__ == "__main__":
-    processor = Processor()
-    single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
-    markdown_result = processor.process(single_url, key="1234323")
-    print("Single file Markdown:\n", markdown_result)
 # if __name__ == "__main__":
 #     with open("./test_image.jpg", "rb") as file:
 #         test_image = file.read()
 #     print(call_gemini_for_image_description(test_image))

         except Exception as e:
             logger.error("Failed to initialize Processor: %s", str(e))
             raise
     def cleanup_gpu(self):
         """
         Releases GPU memory, use garbage collection to clear PyTorch's CUDA cache.
             logger.info("GPU memory cleaned up.")
         except Exception as e:
             logger.error("Error during GPU cleanup: %s", e)
     def process(self, file_url: str, key: str) -> str:
         """
         Process a single PDF, returning final Markdown with irrelevant images removed.
                                     Additionally, if the image contains a truncated part, you must describe it and mark as a
                                     part of some another image that goes before or after current image.
+                                    If the image is of a multiple-choice question’s options, then modify your answer by appending
+                                    'MCQ: A [option] B [option] C [option] D [option]' (replacing [option] with the actual options).
+                                    Otherwise, follow the above instructions strictly.
                         """},
                         {
                             "inline_data": {
         return ("error", "Error describing image", None)
+# if __name__ == "__main__":
+#     processor = Processor()
+#     single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
+#     markdown_result = processor.process(single_url, key="1234323")
+#     print("Single file Markdown:\n", markdown_result)
 # if __name__ == "__main__":
 #     with open("./test_image.jpg", "rb") as file:
 #         test_image = file.read()
 #     print(call_gemini_for_image_description(test_image))
+# Script entry point: everything below runs only when this file is executed
+# directly, not when it is imported.
+if __name__ == "__main__":
+    # Local-testing variant of Processor. It is defined inside the __main__
+    # guard, so it only exists when the file is run as a script, where it
+    # rebinds the name `Processor` for the rest of this block.
+    # NOTE(review): the file also appears to define a module-level
+    # `Processor` whose process() takes a `file_url`; this variant reads a
+    # local path instead -- confirm the shadowing is intentional.
+    class Processor:
+        def __init__(self):
+            # Build the S3 writer, the classification model and the layout /
+            # formula settings. Any failure is logged and then re-raised.
+            try:
+                # S3 credentials, bucket and endpoint come from environment
+                # variables; os.getenv returns None for any that are unset.
+                self.s3_writer = s3Writer(
+                    ak=os.getenv("S3_ACCESS_KEY"),
+                    sk=os.getenv("S3_SECRET_KEY"),
+                    bucket=os.getenv("S3_BUCKET_NAME"),
+                    endpoint_url=os.getenv("S3_ENDPOINT"),
+                )
+                self.svm_model = SVMModel()
+                logger.info("Classification model initialized successfully")
+                # Layout model and formula switch are read from a JSON config
+                # at a hard-coded absolute path.
+                with open("/home/user/magic-pdf.json", "r") as f:
+                    config = json.load(f)
+                self.layout_mode = config["layout-config"]["model"]
+                self.formula_enable = config["formula-config"]["enable"]
+                # Fixed settings: table handling off, language "en", and the
+                # key prefix used when building per-document paths below.
+                self.table_enable = False
+                self.language = "en"
+                self.prefix = "document-extracts/"
+                logger.info("Processor initialized successfully")
+            except Exception as e:
+                logger.error("Failed to initialize Processor: %s", str(e))
+                raise
+        def cleanup_gpu(self):
+            """
+            Release GPU memory: run Python garbage collection, then empty PyTorch's CUDA cache.
+            This helps prevent VRAM accumulation.
+
+            Any exception raised during cleanup is logged and not re-raised.
+            """
+            try:
+                gc.collect()               # Run Python garbage collection first
+                torch.cuda.empty_cache()   # Then empty PyTorch's CUDA memory cache
+                logger.info("GPU memory cleaned up.")
+            except Exception as e:
+                logger.error("Error during GPU cleanup: %s", e)
+        def process(self, file_path: str, key: str) -> str:
+            """
+            Process a single PDF file from a local path, returning final Markdown with irrelevant images removed.
+
+            Args:
+                file_path: Path of the PDF on the local filesystem; opened in binary mode.
+                key: Identifier appended to ``self.prefix`` to build the ``<prefix><key>/`` base
+                    path passed to the image writer and the Markdown generation calls.
+
+            Returns:
+                The Markdown produced by ``image_writer.post_process``.
+
+            Raises:
+                Nothing is caught here: any exception from reading the file or from the
+                analysis pipeline propagates to the caller. ``cleanup_gpu`` runs either way.
+            """
+            logger.info("Processing file: %s", file_path)
+            try:
+                # Read the whole PDF into memory from the given file path
+                with open(file_path, "rb") as f:
+                    pdf_bytes = f.read()
+                logger.info("Loaded %d bytes from file_path='%s'", len(pdf_bytes), file_path)
+                # Analyze the PDF with OCR enabled, using the settings loaded in __init__
+                dataset = PymuDocDataset(pdf_bytes)
+                inference = doc_analyze(
+                    dataset,
+                    ocr=True,
+                    lang=self.language,
+                    layout_model=self.layout_mode,
+                    formula_enable=self.formula_enable,
+                    table_enable=self.table_enable
+                )
+                logger.info("doc_analyze complete for key='%s'. Started extracting images...", key)
+                # Classify images and remove irrelevant ones (presumably done by ImageWriter using the SVM model -- verify against ImageWriter)
+                image_writer = ImageWriter(self.s3_writer, f"{self.prefix}{key}/", self.svm_model)  # Pass the "<prefix><key>/" base path to ImageWriter
+                pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
+                logger.info("OCR pipeline completed for key='%s'.", key)
+                # Generate Markdown against the same base path, then let the image writer post-process it
+                md_content = pipe_result.get_markdown(f"{self.prefix}{key}/")
+                final_markdown = image_writer.post_process(f"{self.prefix}{key}/", md_content)
+                logger.info("Completed PDF process for key='%s'. Final MD length=%d", key, len(final_markdown))
+                return final_markdown
+            finally:
+                # GPU memory is cleaned up after every run, whether it succeeded or raised.
+                self.cleanup_gpu()
+    # Manual smoke run: process one hard-coded local PDF and print the resulting Markdown.
+    processor = Processor()
+    file_path = "./engineering.PDF"
+    markdown_result = processor.process(file_path, key="1234323")
+    print("Single file Markdown:\n", markdown_result)