fix RabbitMQ

- .env +6 -0
- __pycache__/inference_svm_model.cpython-310.pyc +0 -0
- __pycache__/mineru_single.cpython-310.pyc +0 -0
- __pycache__/table_row_extraction.cpython-310.pyc +0 -0
- __pycache__/topic_extraction.cpython-310.pyc +0 -0
- __pycache__/worker.cpython-310.pyc +0 -0
- test_listener.py +33 -0
- topic_extraction.log +0 -0
- worker.py +17 -4
.env
ADDED
@@ -0,0 +1,6 @@
+GEMINI_API_KEY=AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU
+RABBITMQ_URL=amqp://pP4gN4GdD3PiUkQQ:[email protected]:57635
+AWS_REGION=eu-west-2
+AWS_RESOURCES_NAME=quextro-resources
+AWS_ACCESS_KEY=AKIAXNGUVKHXIIUQZ3OE
+AWS_SECRET_KEY=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
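For context, a minimal sketch of how these settings are expected to be read at runtime, based on the dotenv/os.getenv usage in test_listener.py and worker.py below; nothing beyond the variable names comes from this commit:

# Sketch only: load the .env added above and read the values the services use.
import os
import dotenv

dotenv.load_dotenv()  # picks up .env from the working directory

rabbitmq_url = os.getenv("RABBITMQ_URL")      # used by test_listener.py and the worker
gemini_api_key = os.getenv("GEMINI_API_KEY")  # passed to MineruNoTextProcessor in worker.py
aws_region = os.getenv("AWS_REGION")          # AWS values are not referenced in the files shown here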
__pycache__/inference_svm_model.cpython-310.pyc
CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ

__pycache__/mineru_single.cpython-310.pyc
CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ

__pycache__/table_row_extraction.cpython-310.pyc
CHANGED
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ

__pycache__/topic_extraction.cpython-310.pyc
ADDED
Binary file (23.3 kB)

__pycache__/worker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
test_listener.py
ADDED
@@ -0,0 +1,33 @@
+import pika
+import os
+import dotenv
+import time
+dotenv.load_dotenv()
+
+params = pika.URLParameters(os.getenv("RABBITMQ_URL"))
+params.heartbeat = 5
+params.blocked_connection_timeout = 2
+params.connection_attempts = 3
+params.retry_delay = 5
+
+connection = pika.BlockingConnection(params)
+
+channel = connection.channel()
+
+channel.queue_declare(queue="web_server", durable=True)
+
+def callback(ch, method, properties, body):
+    try:
+        print(f"Received message: {body}")
+        print(f"Properties: {properties}")
+        print(f"Method: {method}")
+        print(f"Channel: {ch}")
+        time.sleep(10)
+    except Exception as e:
+        print(f"Error: {e}")
+
+
+channel.basic_consume(queue="web_server", on_message_callback=callback, auto_ack=True)
+
+print("Waiting for messages...")
+channel.start_consuming()
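One way to exercise this listener is to publish a throwaway message to the same web_server queue. The following sketch is not part of the commit; the queue name and RABBITMQ_URL come from the files above, and the message body is invented:

# Sketch only: push one test message so test_listener.py has something to consume.
import os
import dotenv
import pika

dotenv.load_dotenv()
params = pika.URLParameters(os.getenv("RABBITMQ_URL"))

with pika.BlockingConnection(params) as connection:
    channel = connection.channel()
    channel.queue_declare(queue="web_server", durable=True)  # same queue the listener declares
    channel.basic_publish(
        exchange="",
        routing_key="web_server",
        body=b'{"pattern": "topic_extraction", "data": {}}',  # invented payload
    )
    print("Test message published")

Worth noting: the listener uses auto_ack=True, so messages are acknowledged before the 10-second sleep finishes, and a long time.sleep() inside the callback blocks pika's I/O loop, so with heartbeat = 5 the broker may drop the connection during the pause. Both are fine for a quick smoke test but not for the real worker.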
topic_extraction.log
ADDED
File without changes
worker.py
CHANGED
@@ -10,6 +10,8 @@ from typing import Tuple, Dict, Any
 
 from mineru_single import Processor
 
+from topic_extraction import MineruNoTextProcessor
+
 import logging
 
 logging.basicConfig(
@@ -25,6 +27,11 @@ class RabbitMQWorker:
         logger.info("Initializing RabbitMQWorker")
         self.processor = Processor()
 
+        self.topic_processor = MineruNoTextProcessor(
+            output_folder="/tmp/topic_extraction_outputs",
+            gemini_api_key=os.getenv("GEMINI_API_KEY")
+        )
+
         self.publisher_connection = None
         self.publisher_channel = None
 
@@ -124,10 +131,16 @@ class RabbitMQWorker:
 
             elif pattern == "topic_extraction":
                 data = body_dict.get("data")
-
-
+                input_files = data.get("input_files")
+                if not input_files or not isinstance(input_files, list):
+                    logger.error("[Worker %s] No input files provided for topic extraction.", thread_id)
+                    ch.basic_ack(delivery_tag=method.delivery_tag)
+                    return
+                # Use the first file's URL for topic extraction
+                pdf_url = input_files[0].get("url")
+                logger.info("[Worker %s] Processing topic extraction for URL: %s", thread_id, pdf_url)
                 try:
-                    topics_markdown = topic_processor.process(
+                    topics_markdown = self.topic_processor.process(pdf_url)
                     data["topics_markdown"] = topics_markdown
                     body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
                     body_dict["data"] = data
@@ -136,7 +149,7 @@ class RabbitMQWorker:
                     else:
                         ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
                 except Exception as e:
-                    logger.error(
+                    logger.error("Error processing topic extraction: %s", e)
                     ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
 
             else:
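For reference, a sketch of the message shape the new topic_extraction branch appears to expect. The field names (pattern, data, input_files, url, topics_markdown) are taken from the diff above; the concrete values are invented:

# Sketch only: an illustrative "topic_extraction" message for the worker.
import json

message = {
    "pattern": "topic_extraction",
    "data": {
        "input_files": [
            {"url": "https://example.com/sample.pdf"},  # only the first entry's url is used
        ],
    },
}
body = json.dumps(message).encode()

# On success the worker republishes the same dict with
# pattern = "topic_extraction_update_from_gpu_server" and data["topics_markdown"]
# set to the processor output; a missing input_files list is acked and dropped,
# and any processing error is nacked with requeue=True.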