fix pattern
Browse files
__pycache__/inference_svm_model.cpython-310.pyc
CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
|
|
__pycache__/mineru_single.cpython-310.pyc
CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
|
|
__pycache__/topic_extr.cpython-310.pyc
CHANGED
Binary files a/__pycache__/topic_extr.cpython-310.pyc and b/__pycache__/topic_extr.cpython-310.pyc differ
|
|
__pycache__/worker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
|
|
topic_extr.py
CHANGED
@@ -129,6 +129,7 @@ class TopicExtractionProcessor:
|
|
129 |
logger.info("Processing %s with pages %s", key, pages)
|
130 |
|
131 |
subset_pdf = self.create_subset_pdf(pdf_bytes, pages)
|
|
|
132 |
|
133 |
|
134 |
dataset = PymuDocDataset(subset_pdf)
|
|
|
129 |
logger.info("Processing %s with pages %s", key, pages)
|
130 |
|
131 |
subset_pdf = self.create_subset_pdf(pdf_bytes, pages)
|
132 |
+
logger.info(f"Created subset PDF with size: {len(subset_pdf)} bytes")
|
133 |
|
134 |
|
135 |
dataset = PymuDocDataset(subset_pdf)
|
worker.py
CHANGED
@@ -7,9 +7,7 @@ import multiprocessing
|
|
7 |
from concurrent.futures import ThreadPoolExecutor
|
8 |
import pika
|
9 |
from typing import Tuple, Dict, Any
|
10 |
-
|
11 |
from mineru_single import Processor
|
12 |
-
|
13 |
from topic_extr import TopicExtractionProcessor
|
14 |
|
15 |
import logging
|
@@ -129,14 +127,18 @@ class RabbitMQWorker:
|
|
129 |
elif pattern == "topic_extraction":
|
130 |
data = body_dict.get("data")
|
131 |
input_files = data.get("input_files")
|
132 |
-
# contexts = []
|
133 |
logger.info("[Worker %s] Found %d file(s) for topic extraction.", thread_id, len(input_files))
|
134 |
|
135 |
for file in input_files:
|
136 |
try:
|
|
|
|
|
|
|
|
|
137 |
context = {
|
138 |
-
"key": file["key"],
|
139 |
-
"body": self.topic_processor.process(file)
|
|
|
140 |
}
|
141 |
contexts.append(context)
|
142 |
except Exception as e:
|
@@ -144,10 +146,13 @@ class RabbitMQWorker:
|
|
144 |
logger.error(err_str)
|
145 |
contexts.append({"key": file.get("key", ""), "body": err_str})
|
146 |
|
|
|
147 |
data["md_context"] = contexts
|
|
|
148 |
body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
|
149 |
body_dict["data"] = data
|
150 |
-
|
|
|
151 |
if self.publish_message(body_dict, headers):
|
152 |
logger.info("[Worker %s] Published topic extraction results to ml_server.", thread_id)
|
153 |
ch.basic_ack(delivery_tag=method.delivery_tag)
|
|
|
7 |
from concurrent.futures import ThreadPoolExecutor
|
8 |
import pika
|
9 |
from typing import Tuple, Dict, Any
|
|
|
10 |
from mineru_single import Processor
|
|
|
11 |
from topic_extr import TopicExtractionProcessor
|
12 |
|
13 |
import logging
|
|
|
127 |
elif pattern == "topic_extraction":
|
128 |
data = body_dict.get("data")
|
129 |
input_files = data.get("input_files")
|
|
|
130 |
logger.info("[Worker %s] Found %d file(s) for topic extraction.", thread_id, len(input_files))
|
131 |
|
132 |
for file in input_files:
|
133 |
try:
|
134 |
+
# Process the file and get markdown content
|
135 |
+
markdown_content = self.topic_processor.process(file)
|
136 |
+
|
137 |
+
# Create context with the markdown content
|
138 |
context = {
|
139 |
+
"key": file["key"] + ".md",
|
140 |
+
# "body": self.topic_processor.process(file)
|
141 |
+
"body": markdown_content
|
142 |
}
|
143 |
contexts.append(context)
|
144 |
except Exception as e:
|
|
|
146 |
logger.error(err_str)
|
147 |
contexts.append({"key": file.get("key", ""), "body": err_str})
|
148 |
|
149 |
+
# Add the markdown contexts to the data
|
150 |
data["md_context"] = contexts
|
151 |
+
|
152 |
body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
|
153 |
body_dict["data"] = data
|
154 |
+
|
155 |
+
# Publish the results back to the ML server
|
156 |
if self.publish_message(body_dict, headers):
|
157 |
logger.info("[Worker %s] Published topic extraction results to ml_server.", thread_id)
|
158 |
ch.basic_ack(delivery_tag=method.delivery_tag)
|