princhman committed on
Commit
9d379e2
·
1 Parent(s): 73d131e

worker update

Browse files
__pycache__/inference_svm_model.cpython-310.pyc ADDED
Binary file (960 Bytes). View file
 
__pycache__/mineru_single.cpython-310.pyc ADDED
Binary file (4.24 kB). View file
 
__pycache__/worker.cpython-310.pyc ADDED
Binary file (4.78 kB). View file
 
worker.py CHANGED
@@ -10,71 +10,12 @@ from typing import Tuple, Dict, Any
10
 
11
  from mineru_single import Processor
12
 
13
- class MessageProcessor:
14
- def __init__(self):
15
- self.processor = Processor()
16
-
17
- def process_message(self, body_bytes: bytes) -> Tuple[str, Dict[str, Any]]:
18
- """Process incoming message and return processed results"""
19
- body_str = body_bytes.decode("utf-8")
20
- data = json.loads(body_str)
21
-
22
- headers = data.get("headers", {})
23
- request_type = headers.get("request_type", "")
24
- request_id = headers.get("request_id", "")
25
- body = data.get("body", {})
26
-
27
- if request_type != "process_files":
28
- return "No processing done", data
29
-
30
- input_files = body.get("input_files", [])
31
- topics = body.get("topics", [])
32
-
33
- urls, file_key_map = self._extract_urls_and_keys(input_files)
34
- batch_results = self.processor.process_batch(urls)
35
- md_context = self._create_markdown_context(batch_results, file_key_map)
36
-
37
- final_json = self._create_response_json(request_id, input_files, topics, md_context)
38
- return json.dumps(final_json, ensure_ascii=False), final_json
39
-
40
- def _extract_urls_and_keys(self, input_files: list) -> Tuple[list, dict]:
41
- """Extract URLs and create file key mapping"""
42
- urls = []
43
- file_key_map = {}
44
- for f in input_files:
45
- key = f.get("key", "")
46
- url = f.get("url", "")
47
- urls.append(url)
48
- file_key_map[url] = key
49
- return urls, file_key_map
50
-
51
- def _create_markdown_context(self, batch_results: dict, file_key_map: dict) -> list:
52
- """Create markdown context from batch results"""
53
- md_context = []
54
- for url, md_content in batch_results.items():
55
- key = file_key_map.get(url, "")
56
- md_context.append({"key": key, "body": md_content})
57
- return md_context
58
-
59
- def _create_response_json(self, request_id: str, input_files: list,
60
- topics: list, md_context: list) -> dict:
61
- """Create the final response JSON"""
62
- return {
63
- "headers": {
64
- "request_type": "question_extraction_update_from_gpu_server",
65
- "request_id": request_id
66
- },
67
- "body": {
68
- "input_files": input_files,
69
- "topics": topics,
70
- "md_context": md_context
71
- }
72
- }
73
 
74
  class RabbitMQWorker:
75
  def __init__(self, num_workers: int = 1):
76
  self.num_workers = num_workers
77
- self.message_processor = MessageProcessor()
 
78
 
79
  def callback(self, ch, method, properties, body):
80
  """Handle incoming RabbitMQ messages"""
@@ -83,13 +24,25 @@ class RabbitMQWorker:
83
 
84
  print(f"[Worker {thread_id}] Received message: {body}, headers: {headers}")
85
 
86
- if headers.get("process") == "topic_extraction":
87
- raw_text_outputs, parsed_json_outputs = self.message_processor.process_message(body)
88
- print(f"[Worker {thread_id}] Pipeline result:\n{raw_text_outputs}")
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  else:
90
- print(f"[Worker {thread_id}] Unknown process, sleeping 10s.")
91
- time.sleep(10)
92
- print("[Worker] Done")
93
 
94
  def worker(self, channel):
95
  """Worker process to consume messages"""
 
10
 
11
  from mineru_single import Processor
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  class RabbitMQWorker:
15
  def __init__(self, num_workers: int = 1):
16
  self.num_workers = num_workers
17
+ self.public_connection = pika.BlockingConnection(pika.URLParameters(os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")))
18
+ self.public_channel = self.public_connection.channel()
19
 
20
  def callback(self, ch, method, properties, body):
21
  """Handle incoming RabbitMQ messages"""
 
24
 
25
  print(f"[Worker {thread_id}] Received message: {body}, headers: {headers}")
26
 
27
+ if headers.get("request_type") == "process_files":
28
+ contexts = []
29
+ body_dict = json.loads(body)
30
+ for file in body_dict.get("input_files", []):
31
+ contexts.append({"key": file["key"], "body": self.message_processor.process_message(file["url"])})
32
+ body_dict["md_context"] = contexts
33
+ json_body = json.dumps(body_dict)
34
+ self.public_channel.queue_declare(queue="ml_server", durable=True)
35
+ self.public_channel.basic_publish(
36
+ exchange="",
37
+ routing_key="ml_server",
38
+ body=json_body,
39
+ properties=pika.BasicProperties(headers=headers)
40
+ )
41
+ print(f"[Worker {thread_id}] Contexts: {contexts}")
42
+
43
  else:
44
+ print(f"[Worker {thread_id}] Unknown process")
45
+ return
 
46
 
47
  def worker(self, channel):
48
  """Worker process to consume messages"""