Arsenii11 committed on
Commit
aa23348
·
1 Parent(s): 5783dae

Changed prompt

Browse files
Files changed (1) hide show
  1. mineru_single.py +96 -9
mineru_single.py CHANGED
@@ -52,6 +52,7 @@ class Processor:
52
  except Exception as e:
53
  logger.error("Failed to initialize Processor: %s", str(e))
54
  raise
 
55
  def cleanup_gpu(self):
56
  """
57
  Releases GPU memory, uses garbage collection to clear PyTorch's CUDA cache.
@@ -63,6 +64,7 @@ class Processor:
63
  logger.info("GPU memory cleaned up.")
64
  except Exception as e:
65
  logger.error("Error during GPU cleanup: %s", e)
 
66
  def process(self, file_url: str, key: str) -> str:
67
  """
68
  Process a single PDF, returning final Markdown with irrelevant images removed.
@@ -211,9 +213,9 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
211
  Additionally, if the image contains a truncated part, you must describe it and mark as a
212
  part of some another image that goes before or after current image.
213
 
214
- If an image is a multiple-choice question's options, make sure to modify your answer to add
215
- "MCQ: A option B option C option D option", where MCQ is a descriptor and "option" would be
216
- replaced with actual option from image.
217
  """},
218
  {
219
  "inline_data": {
@@ -235,15 +237,100 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
235
  return ("error", "Error describing image", None)
236
 
237
 
238
- if __name__ == "__main__":
239
- processor = Processor()
240
- single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
241
- markdown_result = processor.process(single_url, key="1234323")
242
- print("Single file Markdown:\n", markdown_result)
243
 
244
  # if __name__ == "__main__":
245
  # with open("./test_image.jpg", "rb") as file:
246
  # test_image = file.read()
247
 
248
  # print(call_gemini_for_image_description(test_image))
249
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  except Exception as e:
53
  logger.error("Failed to initialize Processor: %s", str(e))
54
  raise
55
+
56
  def cleanup_gpu(self):
57
  """
58
  Releases GPU memory, uses garbage collection to clear PyTorch's CUDA cache.
 
64
  logger.info("GPU memory cleaned up.")
65
  except Exception as e:
66
  logger.error("Error during GPU cleanup: %s", e)
67
+
68
  def process(self, file_url: str, key: str) -> str:
69
  """
70
  Process a single PDF, returning final Markdown with irrelevant images removed.
 
213
  Additionally, if the image contains a truncated part, you must describe it and mark as a
214
  part of some another image that goes before or after current image.
215
 
216
+ If the image is of a multiple-choice questions options, then modify your answer by appending
217
+ 'MCQ: A [option] B [option] C [option] D [option]' (replacing [option] with the actual options).
218
+ Otherwise, follow the above instructions strictly.
219
  """},
220
  {
221
  "inline_data": {
 
237
  return ("error", "Error describing image", None)
238
 
239
 
240
+ # if __name__ == "__main__":
241
+ # processor = Processor()
242
+ # single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
243
+ # markdown_result = processor.process(single_url, key="1234323")
244
+ # print("Single file Markdown:\n", markdown_result)
245
 
246
  # if __name__ == "__main__":
247
  # with open("./test_image.jpg", "rb") as file:
248
  # test_image = file.read()
249
 
250
  # print(call_gemini_for_image_description(test_image))
251
+
252
+
253
+ if __name__ == "__main__":
254
+ class Processor:
255
+ def __init__(self):
256
+ try:
257
+ self.s3_writer = s3Writer(
258
+ ak=os.getenv("S3_ACCESS_KEY"),
259
+ sk=os.getenv("S3_SECRET_KEY"),
260
+ bucket=os.getenv("S3_BUCKET_NAME"),
261
+ endpoint_url=os.getenv("S3_ENDPOINT"),
262
+ )
263
+ self.svm_model = SVMModel()
264
+ logger.info("Classification model initialized successfully")
265
+
266
+ with open("/home/user/magic-pdf.json", "r") as f:
267
+ config = json.load(f)
268
+
269
+ self.layout_mode = config["layout-config"]["model"]
270
+ self.formula_enable = config["formula-config"]["enable"]
271
+ self.table_enable = False
272
+ self.language = "en"
273
+
274
+ self.prefix = "document-extracts/"
275
+ logger.info("Processor initialized successfully")
276
+ except Exception as e:
277
+ logger.error("Failed to initialize Processor: %s", str(e))
278
+ raise
279
+
280
+ def cleanup_gpu(self):
281
+ """
282
+ Releases GPU memory, uses garbage collection to clear PyTorch's CUDA cache.
283
+ This helps prevent VRAM accumulation.
284
+ """
285
+ try:
286
+ gc.collect() # Garbage collection
287
+ torch.cuda.empty_cache() # Clear memory cache on GPU
288
+ logger.info("GPU memory cleaned up.")
289
+ except Exception as e:
290
+ logger.error("Error during GPU cleanup: %s", e)
291
+
292
+ def process(self, file_path: str, key: str) -> str:
293
+ """
294
+ Process a single PDF file from a local path, returning final Markdown with irrelevant images removed.
295
+ """
296
+ logger.info("Processing file: %s", file_path)
297
+ try:
298
+ # Read PDF file from the given file path
299
+ with open(file_path, "rb") as f:
300
+ pdf_bytes = f.read()
301
+
302
+ logger.info("Loaded %d bytes from file_path='%s'", len(pdf_bytes), file_path)
303
+
304
+ # Analyze PDF with OCR
305
+ dataset = PymuDocDataset(pdf_bytes)
306
+ inference = doc_analyze(
307
+ dataset,
308
+ ocr=True,
309
+ lang=self.language,
310
+ layout_model=self.layout_mode,
311
+ formula_enable=self.formula_enable,
312
+ table_enable=self.table_enable
313
+ )
314
+
315
+ logger.info("doc_analyze complete for key='%s'. Started extracting images...", key)
316
+
317
+ # Classify images and remove irrelevant ones
318
+ image_writer = ImageWriter(self.s3_writer, f"{self.prefix}{key}/", self.svm_model) # Pass base path to ImageWriter
319
+ pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
320
+
321
+ logger.info("OCR pipeline completed for key='%s'.", key)
322
+
323
+ md_content = pipe_result.get_markdown(f"{self.prefix}{key}/")
324
+ final_markdown = image_writer.post_process(f"{self.prefix}{key}/", md_content)
325
+
326
+ logger.info("Completed PDF process for key='%s'. Final MD length=%d", key, len(final_markdown))
327
+ return final_markdown
328
+ finally:
329
+ # GPU memory is cleaned up after each processing.
330
+ self.cleanup_gpu()
331
+
332
+
333
+ processor = Processor()
334
+ file_path = "./engineering.PDF"
335
+ markdown_result = processor.process(file_path, key="1234323")
336
+ print("Single file Markdown:\n", markdown_result)