Changed prompt
Browse files- mineru_single.py +96 -9
mineru_single.py
CHANGED
@@ -52,6 +52,7 @@ class Processor:
|
|
52 |
except Exception as e:
|
53 |
logger.error("Failed to initialize Processor: %s", str(e))
|
54 |
raise
|
|
|
55 |
def cleanup_gpu(self):
|
56 |
"""
|
57 |
Releases GPU memory, use garbage collection to clear PyTorch's CUDA cache.
|
@@ -63,6 +64,7 @@ class Processor:
|
|
63 |
logger.info("GPU memory cleaned up.")
|
64 |
except Exception as e:
|
65 |
logger.error("Error during GPU cleanup: %s", e)
|
|
|
66 |
def process(self, file_url: str, key: str) -> str:
|
67 |
"""
|
68 |
Process a single PDF, returning final Markdown with irrelevant images removed.
|
@@ -211,9 +213,9 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
|
|
211 |
Additionally, if the image contains a truncated part, you must describe it and mark as a
|
212 |
part of some another image that goes before or after current image.
|
213 |
|
214 |
-
If
|
215 |
-
|
216 |
-
|
217 |
"""},
|
218 |
{
|
219 |
"inline_data": {
|
@@ -235,15 +237,100 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
|
|
235 |
return ("error", "Error describing image", None)
|
236 |
|
237 |
|
238 |
-
if __name__ == "__main__":
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
|
244 |
# if __name__ == "__main__":
|
245 |
# with open("./test_image.jpg", "rb") as file:
|
246 |
# test_image = file.read()
|
247 |
|
248 |
# print(call_gemini_for_image_description(test_image))
|
249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
except Exception as e:
|
53 |
logger.error("Failed to initialize Processor: %s", str(e))
|
54 |
raise
|
55 |
+
|
56 |
def cleanup_gpu(self):
|
57 |
"""
|
58 |
Releases GPU memory, use garbage collection to clear PyTorch's CUDA cache.
|
|
|
64 |
logger.info("GPU memory cleaned up.")
|
65 |
except Exception as e:
|
66 |
logger.error("Error during GPU cleanup: %s", e)
|
67 |
+
|
68 |
def process(self, file_url: str, key: str) -> str:
|
69 |
"""
|
70 |
Process a single PDF, returning final Markdown with irrelevant images removed.
|
|
|
213 |
Additionally, if the image contains a truncated part, you must describe it and mark as a
|
214 |
part of some another image that goes before or after current image.
|
215 |
|
216 |
+
If the image is of a multiple-choice question’s options, then modify your answer by appending
|
217 |
+
'MCQ: A [option] B [option] C [option] D [option]' (replacing [option] with the actual options).
|
218 |
+
Otherwise, follow the above instructions strictly.
|
219 |
"""},
|
220 |
{
|
221 |
"inline_data": {
|
|
|
237 |
return ("error", "Error describing image", None)
|
238 |
|
239 |
|
240 |
+
# if __name__ == "__main__":
|
241 |
+
# processor = Processor()
|
242 |
+
# single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
|
243 |
+
# markdown_result = processor.process(single_url, key="1234323")
|
244 |
+
# print("Single file Markdown:\n", markdown_result)
|
245 |
|
246 |
# if __name__ == "__main__":
|
247 |
# with open("./test_image.jpg", "rb") as file:
|
248 |
# test_image = file.read()
|
249 |
|
250 |
# print(call_gemini_for_image_description(test_image))
|
251 |
+
|
252 |
+
|
253 |
+
if __name__ == "__main__":
|
254 |
+
class Processor:
|
255 |
+
def __init__(self):
|
256 |
+
try:
|
257 |
+
self.s3_writer = s3Writer(
|
258 |
+
ak=os.getenv("S3_ACCESS_KEY"),
|
259 |
+
sk=os.getenv("S3_SECRET_KEY"),
|
260 |
+
bucket=os.getenv("S3_BUCKET_NAME"),
|
261 |
+
endpoint_url=os.getenv("S3_ENDPOINT"),
|
262 |
+
)
|
263 |
+
self.svm_model = SVMModel()
|
264 |
+
logger.info("Classification model initialized successfully")
|
265 |
+
|
266 |
+
with open("/home/user/magic-pdf.json", "r") as f:
|
267 |
+
config = json.load(f)
|
268 |
+
|
269 |
+
self.layout_mode = config["layout-config"]["model"]
|
270 |
+
self.formula_enable = config["formula-config"]["enable"]
|
271 |
+
self.table_enable = False
|
272 |
+
self.language = "en"
|
273 |
+
|
274 |
+
self.prefix = "document-extracts/"
|
275 |
+
logger.info("Processor initialized successfully")
|
276 |
+
except Exception as e:
|
277 |
+
logger.error("Failed to initialize Processor: %s", str(e))
|
278 |
+
raise
|
279 |
+
|
280 |
+
def cleanup_gpu(self):
|
281 |
+
"""
|
282 |
+
Releases GPU memory, uses garbage collection to clear PyTorch's CUDA cache.
|
283 |
+
This helps prevent VRAM accumulation.
|
284 |
+
"""
|
285 |
+
try:
|
286 |
+
gc.collect() # Garbage collection
|
287 |
+
torch.cuda.empty_cache() # Clear memory cache on GPU
|
288 |
+
logger.info("GPU memory cleaned up.")
|
289 |
+
except Exception as e:
|
290 |
+
logger.error("Error during GPU cleanup: %s", e)
|
291 |
+
|
292 |
+
def process(self, file_path: str, key: str) -> str:
|
293 |
+
"""
|
294 |
+
Process a single PDF file from a local path, returning final Markdown with irrelevant images removed.
|
295 |
+
"""
|
296 |
+
logger.info("Processing file: %s", file_path)
|
297 |
+
try:
|
298 |
+
# Read PDF file from the given file path
|
299 |
+
with open(file_path, "rb") as f:
|
300 |
+
pdf_bytes = f.read()
|
301 |
+
|
302 |
+
logger.info("Loaded %d bytes from file_path='%s'", len(pdf_bytes), file_path)
|
303 |
+
|
304 |
+
# Analyze PDF with OCR
|
305 |
+
dataset = PymuDocDataset(pdf_bytes)
|
306 |
+
inference = doc_analyze(
|
307 |
+
dataset,
|
308 |
+
ocr=True,
|
309 |
+
lang=self.language,
|
310 |
+
layout_model=self.layout_mode,
|
311 |
+
formula_enable=self.formula_enable,
|
312 |
+
table_enable=self.table_enable
|
313 |
+
)
|
314 |
+
|
315 |
+
logger.info("doc_analyze complete for key='%s'. Started extracting images...", key)
|
316 |
+
|
317 |
+
# Classify images and remove irrelevant ones
|
318 |
+
image_writer = ImageWriter(self.s3_writer, f"{self.prefix}{key}/", self.svm_model) # Pass base path to ImageWriter
|
319 |
+
pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
|
320 |
+
|
321 |
+
logger.info("OCR pipeline completed for key='%s'.", key)
|
322 |
+
|
323 |
+
md_content = pipe_result.get_markdown(f"{self.prefix}{key}/")
|
324 |
+
final_markdown = image_writer.post_process(f"{self.prefix}{key}/", md_content)
|
325 |
+
|
326 |
+
logger.info("Completed PDF process for key='%s'. Final MD length=%d", key, len(final_markdown))
|
327 |
+
return final_markdown
|
328 |
+
finally:
|
329 |
+
# GPU memory is cleaned up after each processing.
|
330 |
+
self.cleanup_gpu()
|
331 |
+
|
332 |
+
|
333 |
+
processor = Processor()
|
334 |
+
file_path = "./engineering.PDF"
|
335 |
+
markdown_result = processor.process(file_path, key="1234323")
|
336 |
+
print("Single file Markdown:\n", markdown_result)
|