MinerU

Paused

App Files Community

SkyNait committed on Mar 7

Commit

059f61a

1 Parent(s): da94345

fix pattern

Browse files

Files changed (6) hide show

__pycache__/inference_svm_model.cpython-310.pyc +0 -0
__pycache__/mineru_single.cpython-310.pyc +0 -0
__pycache__/topic_extr.cpython-310.pyc +0 -0
__pycache__/worker.cpython-310.pyc +0 -0
topic_extr.py +1 -0
worker.py +11 -6

__pycache__/inference_svm_model.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ

__pycache__/mineru_single.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ

__pycache__/topic_extr.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/topic_extr.cpython-310.pyc and b/__pycache__/topic_extr.cpython-310.pyc differ

__pycache__/worker.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ

topic_extr.py CHANGED Viewed

@@ -129,6 +129,7 @@ class TopicExtractionProcessor:
             logger.info("Processing %s with pages %s", key, pages)
             subset_pdf = self.create_subset_pdf(pdf_bytes, pages)
             dataset = PymuDocDataset(subset_pdf)

             logger.info("Processing %s with pages %s", key, pages)
             subset_pdf = self.create_subset_pdf(pdf_bytes, pages)
+            logger.info(f"Created subset PDF with size: {len(subset_pdf)} bytes")
             dataset = PymuDocDataset(subset_pdf)

worker.py CHANGED Viewed

@@ -7,9 +7,7 @@ import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
 import pika
 from typing import Tuple, Dict, Any
 from mineru_single import Processor
 from topic_extr import TopicExtractionProcessor
 import logging
@@ -129,14 +127,18 @@ class RabbitMQWorker:
             elif pattern == "topic_extraction":
                 data = body_dict.get("data")
                 input_files = data.get("input_files")
-                # contexts = []
                 logger.info("[Worker %s] Found %d file(s) for topic extraction.", thread_id, len(input_files))
                 for file in input_files:
                     try:
                         context = {
-                            "key": file["key"],
-                            "body": self.topic_processor.process(file)
                         }
                         contexts.append(context)
                     except Exception as e:
@@ -144,10 +146,13 @@ class RabbitMQWorker:
                         logger.error(err_str)
                         contexts.append({"key": file.get("key", ""), "body": err_str})
                 data["md_context"] = contexts
                 body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
                 body_dict["data"] = data
                 if self.publish_message(body_dict, headers):
                     logger.info("[Worker %s] Published topic extraction results to ml_server.", thread_id)
                     ch.basic_ack(delivery_tag=method.delivery_tag)

 from concurrent.futures import ThreadPoolExecutor
 import pika
 from typing import Tuple, Dict, Any
 from mineru_single import Processor
 from topic_extr import TopicExtractionProcessor
 import logging
             elif pattern == "topic_extraction":
                 data = body_dict.get("data")
                 input_files = data.get("input_files")
                 logger.info("[Worker %s] Found %d file(s) for topic extraction.", thread_id, len(input_files))
                 for file in input_files:
                     try:
+                        # Process the file and get markdown content
+                        markdown_content = self.topic_processor.process(file)
+                        # Create context with the markdown content
                         context = {
+                            "key": file["key"] + ".md",
+                            # "body": self.topic_processor.process(file)
+                            "body": markdown_content
                         }
                         contexts.append(context)
                     except Exception as e:
                         logger.error(err_str)
                         contexts.append({"key": file.get("key", ""), "body": err_str})
+                # Add the markdown contexts to the data
                 data["md_context"] = contexts
                 body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
                 body_dict["data"] = data
+                # Publish the results back to the ML server
                 if self.publish_message(body_dict, headers):
                     logger.info("[Worker %s] Published topic extraction results to ml_server.", thread_id)
                     ch.basic_ack(delivery_tag=method.delivery_tag)