SkyNait committed on
Commit 059f61a · 1 Parent(s): da94345

fix pattern

__pycache__/inference_svm_model.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
 
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
__pycache__/topic_extr.cpython-310.pyc CHANGED
Binary files a/__pycache__/topic_extr.cpython-310.pyc and b/__pycache__/topic_extr.cpython-310.pyc differ
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
topic_extr.py CHANGED
@@ -129,6 +129,7 @@ class TopicExtractionProcessor:
         logger.info("Processing %s with pages %s", key, pages)
 
         subset_pdf = self.create_subset_pdf(pdf_bytes, pages)
+        logger.info(f"Created subset PDF with size: {len(subset_pdf)} bytes")
 
 
         dataset = PymuDocDataset(subset_pdf)
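For context: create_subset_pdf itself is not part of this diff, so the following is only a minimal sketch of what such a helper could look like with PyMuPDF (the fitz module), assuming pages is a list of 0-based page indices and the return value is raw PDF bytes, as PymuDocDataset(subset_pdf) suggests.

import fitz  # PyMuPDF


def create_subset_pdf(pdf_bytes: bytes, pages: list[int]) -> bytes:
    """Build a new PDF containing only the requested pages.

    Hypothetical reconstruction: the real helper lives on
    TopicExtractionProcessor and is not shown in this commit.
    """
    src = fitz.open(stream=pdf_bytes, filetype="pdf")
    subset = fitz.open()  # empty output document
    for page_number in pages:
        # insert_pdf copies the inclusive page range [from_page, to_page]
        subset.insert_pdf(src, from_page=page_number, to_page=page_number)
    out = subset.tobytes()
    subset.close()
    src.close()
    return out

The new log line then gives a cheap sanity check that page selection produced a non-trivial document before it is handed to PymuDocDataset.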
worker.py CHANGED
@@ -7,9 +7,7 @@ import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
 import pika
 from typing import Tuple, Dict, Any
-
 from mineru_single import Processor
-
 from topic_extr import TopicExtractionProcessor
 
 import logging
@@ -129,14 +127,18 @@ class RabbitMQWorker:
         elif pattern == "topic_extraction":
             data = body_dict.get("data")
             input_files = data.get("input_files")
-            # contexts = []
             logger.info("[Worker %s] Found %d file(s) for topic extraction.", thread_id, len(input_files))
 
             for file in input_files:
                 try:
+                    # Process the file and get markdown content
+                    markdown_content = self.topic_processor.process(file)
+
+                    # Create context with the markdown content
                     context = {
-                        "key": file["key"],
-                        "body": self.topic_processor.process(file)
+                        "key": file["key"] + ".md",
+                        # "body": self.topic_processor.process(file)
+                        "body": markdown_content
                     }
                     contexts.append(context)
                 except Exception as e:
@@ -144,10 +146,13 @@ class RabbitMQWorker:
                     logger.error(err_str)
                     contexts.append({"key": file.get("key", ""), "body": err_str})
 
+            # Add the markdown contexts to the data
             data["md_context"] = contexts
+
             body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
             body_dict["data"] = data
-
+
+            # Publish the results back to the ML server
             if self.publish_message(body_dict, headers):
                 logger.info("[Worker %s] Published topic extraction results to ml_server.", thread_id)
                 ch.basic_ack(delivery_tag=method.delivery_tag)
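Taken together, the hunks imply the following request/response shapes on the queue. The field names (pattern, data, input_files, md_context, key, body) and the two pattern strings come straight from the diff; the concrete values and the url field are illustrative assumptions only.

# Inbound message consumed by the worker (illustrative values)
request = {
    "pattern": "topic_extraction",
    "data": {
        "input_files": [
            # Each entry must at least carry a "key"; any other fields
            # are whatever topic_processor.process() expects (assumed).
            {"key": "papers/2024-syllabus", "url": "https://example.com/doc.pdf"},
        ],
    },
}

# Outbound message after this commit: the same dict, re-labelled and
# enriched with the extracted markdown under "md_context".
response = {
    "pattern": "topic_extraction_update_from_gpu_server",
    "data": {
        "input_files": request["data"]["input_files"],  # carried along unchanged
        "md_context": [
            # The ".md" suffix on the key is new in this commit.
            {"key": "papers/2024-syllabus.md", "body": "# Extracted topics\n..."},
        ],
    },
}

The .md suffix on the context key presumably lets the ml_server side tell the rendered markdown artifact apart from the original file key.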