princhman committed
Commit b273357 · 1 Parent(s): 20dcfd9

RabbitMQ and SVM model

.gitattributes CHANGED
@@ -1,71 +1,71 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.8.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.8.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.8.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a4-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a5-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a6-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a7-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a8-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a9-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/arabic/arabic_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/devanagari/devanagari_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/japan/japan_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/ka/ka_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/ta/ta_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/te/te_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,26 +1,26 @@
FROM python:3.9

WORKDIR /code

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . /code/

# Make sure the inbox and output directories exist
RUN mkdir -p /code/inbox /code/output

# Expose the port
EXPOSE 7860

# Command to run the application
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,18 +1,18 @@
---
title: MinerU
emoji: 📚
colorFrom: purple
colorTo: blue
sdk: gradio
sdk_version: 5.8.0
app_file: app.py
pinned: false
license: agpl-3.0
models:
  - opendatalab/PDF-Extract-Kit-1.0
  - hantian/layoutreader
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

Paper: https://huggingface.co/papers/2409.18839
app.py CHANGED
@@ -1,54 +1,58 @@
  #!/usr/bin/env python3
  import os
- import shutil
- import logging
+ import json
+ import uuid
  import uvicorn
- from fastapi import FastAPI, File, UploadFile, Header, HTTPException
+ import pika
+ from fastapi import FastAPI, Body, Header, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
- from mineru_single import Processor

- processor = Processor()
  app = FastAPI()
- logging.basicConfig(level=logging.INFO)

- # Add CORS middleware to allow requests from any origin
+ API_KEY = os.getenv("SECRET_KEY")
+
  app.add_middleware(
      CORSMiddleware,
-     allow_origins=["*"],  # Allows all origins
+     allow_origins=["*"],
      allow_credentials=True,
-     allow_methods=["*"],  # Allows all methods
-     allow_headers=["*"],  # Allows all headers
+     allow_methods=["*"],
+     allow_headers=["*"],
  )

-
  @app.get("/")
  async def root():
-     """Health check endpoint"""
      return {"status": "ok", "message": "API is running"}

  @app.post("/process")
  async def process_pdf(
-     file_url: str,
+     input_json: dict = Body(...),
      x_api_key: str = Header(None, alias="X-API-Key")
  ):
-     # Get the secret key from environment variable
-     api_key = os.getenv("SECRET_KEY")
-
      if not x_api_key:
          raise HTTPException(status_code=401, detail="API key is missing")
-
-     if x_api_key != api_key:
+     if x_api_key != API_KEY:
          raise HTTPException(status_code=401, detail="Invalid API key")

-     # Process the file and wait for completion
-     markdown_text = processor.process(file_url)
+     # Connect to RabbitMQ
+     rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
+     connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
+     channel = connection.channel()
+     channel.queue_declare(queue="ml_server", durable=True)
+
+     channel.basic_publish(
+         exchange="",
+         routing_key="ml_server",
+         body=json.dumps(input_json),
+         properties=pika.BasicProperties(
+             headers={"process": "topic_extraction"}
+         )
+     )
+     connection.close()

      return {
-         "message": "Processing completed",
-         "code": 200,
-         "content": markdown_text
+         "message": "Job queued",
+         "request_id": input_json.get("headers", {}).get("request_id", str(uuid.uuid4()))
      }

- # If you want to run locally or for debug:
  if __name__ == "__main__":
-     uvicorn.run(app, host="0.0.0.0", port=7860)
+     uvicorn.run(app, host="0.0.0.0", port=8000)
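
For reference, a client call against the updated endpoint might look like the sketch below. This is an illustration, not part of the commit: the payload shape mirrors the sample_message used in worker.py, the port matches the uvicorn.run() call above, and the placeholder key stands in for the SECRET_KEY environment variable.

import requests

payload = {
    "headers": {"request_type": "process_files", "request_id": "abc123"},
    "body": {
        "input_files": [{"key": "file1", "url": "https://example.com/file1.pdf", "type": "question"}],
        "topics": [{"title": "Algebra", "id": 123}],
    },
}
resp = requests.post(
    "http://localhost:8000/process",
    json=payload,
    headers={"X-API-Key": "<SECRET_KEY value>"},  # hypothetical placeholder
)
print(resp.json())  # expected: {"message": "Job queued", "request_id": "abc123"}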
download_models_hf.py CHANGED
@@ -1,66 +1,66 @@
import json
import os

import requests
from huggingface_hub import snapshot_download


def download_json(url):
    # Download the JSON file
    response = requests.get(url)
    response.raise_for_status()  # Check that the request succeeded
    return response.json()


def download_and_modify_json(url, local_filename, modifications):
    if os.path.exists(local_filename):
        data = json.load(open(local_filename))
        config_version = data.get('config_version', '0.0.0')
        if config_version < '1.1.1':
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the modifications
    for key, value in modifications.items():
        data[key] = value

    # Save the modified content
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':

    mineru_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small_2501/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)

    model_dir = model_dir + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
header.html CHANGED
@@ -1,132 +1,132 @@
<html>
<head>
    <!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
    <style>
        .link-block {
            border: 1px solid transparent;
            border-radius: 24px;
            background-color: rgba(54, 54, 54, 1);
            cursor: pointer !important;
        }
        .link-block:hover {
            background-color: rgba(54, 54, 54, 0.75) !important;
            cursor: pointer !important;
        }
        .external-link {
            display: inline-flex;
            align-items: center;
            height: 36px;
            line-height: 36px;
            padding: 0 16px;
            cursor: pointer !important;
        }
        .external-link,
        .external-link:hover {
            cursor: pointer !important;
        }
        a {
            text-decoration: none;
        }
    </style>
</head>

<body>
    <div style="
        display: flex;
        flex-direction: column;
        justify-content: center;
        align-items: center;
        text-align: center;
        background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
        padding: 24px;
        gap: 24px;
        border-radius: 8px;
    ">
        <div style="
            display: flex;
            flex-direction: column;
            align-items: center;
            gap: 16px;
        ">
            <div style="display: flex; flex-direction: column; gap: 8px">
                <h1 style="
                    font-size: 48px;
                    color: #fafafa;
                    margin: 0;
                    font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
                        'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
                ">
                    MinerU: PDF Extraction Demo
                </h1>
            </div>
        </div>

        <p style="
            margin: 0;
            line-height: 1.6rem;
            font-size: 16px;
            color: #fafafa;
            opacity: 0.8;
        ">
            A one-stop, open-source, high-quality data extraction tool, supports
            PDF/webpage/e-book extraction.<br>
        </p>
        <style>
            .link-block {
                display: inline-block;
            }
            .link-block + .link-block {
                margin-left: 20px;
            }
        </style>

        <div class="column has-text-centered">
            <div class="publication-links">
                <!-- Code Link. -->
                <span class="link-block">
                    <a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
                        <span class="icon" style="margin-right: 4px">
                            <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
                        </span>
                        <span style="color: white">Code</span>
                    </a>
                </span>

                <!-- arXiv Link. -->
                <span class="link-block">
                    <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
                        <span class="icon" style="margin-right: 8px">
                            <i class="fas fa-file" style="color: white"></i>
                        </span>
                        <span style="color: white">Paper</span>
                    </a>
                </span>

                <!-- Homepage Link. -->
                <span class="link-block">
                    <a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
                        <span class="icon" style="margin-right: 8px">
                            <i class="fas fa-home" style="color: white"></i>
                        </span>
                        <span style="color: white">Homepage</span>
                    </a>
                </span>

                <!-- Client Link. -->
                <span class="link-block">
                    <a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
                        <span class="icon" style="margin-right: 8px">
                            <i class="fas fa-download" style="color: white"></i>
                        </span>
                        <span style="color: white">Download</span>
                    </a>
                </span>
            </div>
        </div>

        <!-- New Demo Links -->
    </div>

</body></html>
inference_svm_model.py ADDED
@@ -0,0 +1,29 @@
+ #!/usr/bin/env python3
+ import cv2
+ import numpy as np
+ from joblib import load
+
+ def load_svm_model(model_path: str):
+     return load(model_path)
+
+ def classify_image(
+     image_path: str,
+     loaded_model,
+     label_map: dict,
+     image_size=(128, 128)
+ ) -> str:
+     img = cv2.imread(image_path)
+     if img is None:
+         # If image fails to load, default to "irrelevant" or handle differently
+         return label_map[0]
+
+     img = cv2.resize(img, image_size)
+     x = img.flatten().reshape(1, -1)
+     pred = loaded_model.predict(x)[0]
+     return label_map[pred]
+
+ if __name__ == "__main__":
+     model = load_svm_model("./model_classification/svm_model.joblib")
+     label_map = {0: "irrelevant", 1: "relevant"}
+     result = classify_image("test.jpg", model, label_map)
+     print("Classification result:", result)
mineru_single.py CHANGED
@@ -1,37 +1,16 @@
  #!/usr/bin/env python3
  import os
- import time
- import base64
- import json
- import re
  import uuid
- from pathlib import Path
- from loguru import logger
+ import json
  import requests
- from magic_pdf.data.data_reader_writer import FileBasedDataReader
- from magic_pdf.tools.common import do_parse, prepare_env
- import pymupdf
- from magic_pdf.data.data_reader_writer.base import DataWriter
+ from loguru import logger
+
  from magic_pdf.data.dataset import PymuDocDataset
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.data.io.s3 import S3Writer
+ from magic_pdf.data.data_reader_writer.base import DataWriter

-
- # def to_pdf(file_path):
- #     """
- #     If input is not PDF, convert it to PDF using PyMuPDF
- #     """
- #     with pymupdf.open(file_path) as doc:
- #         if doc.is_pdf:
- #             return file_path
- #         else:
- #             pdf_bytes = doc.convert_to_pdf()
- #             unique_filename = f"{uuid.uuid4()}.pdf"
- #             tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
- #             with open(tmp_file_path, "wb") as tmp_pdf_file:
- #                 tmp_pdf_file.write(pdf_bytes)
- #             return tmp_file_path
-
+ from inference_svm_model import load_svm_model, classify_image

  class Processor:
      def __init__(self):
@@ -41,61 +20,104 @@ class Processor:
              bucket=os.getenv("S3_BUCKET_NAME"),
              endpoint_url=os.getenv("S3_ENDPOINT"),
          )
-         self.image_writer = ImageWriter(self.s3_writer)
+
+         model_path = os.getenv("SVM_MODEL_PATH", "./svm_model/svm_model.joblib")
+         self.svm_model = load_svm_model(model_path)
+         self.label_map = {0: "irrelevant", 1: "relevant"}

          with open("/home/user/magic-pdf.json", "r") as f:
              config = json.load(f)
+
          self.layout_mode = config["layout-config"]["model"]
          self.formula_enable = config["formula-config"]["enable"]
          self.table_enable = config["table-config"]["enable"]
          self.language = "en"
-         self.prefix = os.getenv("S3_ENDPOINT") + os.getenv("S3_BUCKET_NAME") + "/" + "document-extracts/"
-         self._init_model()
-
-     def _init_model(self):
-         os.system('pip uninstall -y magic-pdf')
-         os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
-         # os.system('pip install git+https://github.com/myhloli/Magic-PDF.git@dev')
-
-         os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
-         os.system('python download_models_hf.py')
-
-     def process(self, file_link: str, file_name: str = str(uuid.uuid4())):
-         print("Processing file")
-         response = requests.get(file_link)
+
+         endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
+         bucket = os.getenv("S3_BUCKET_NAME", "")
+         self.prefix = f"{endpoint}/{bucket}/document-extracts/"
+
+     def process(self, file_url: str) -> str:
+         logger.info("Processing file: {}", file_url)
+         response = requests.get(file_url)
          if response.status_code != 200:
-             raise Exception(f"Failed to download file from {file_link}")
+             raise Exception(f"Failed to download PDF: {file_url}")
          pdf_bytes = response.content

          dataset = PymuDocDataset(pdf_bytes)
-         inference = doc_analyze(dataset, ocr=True, lang=self.language, layout_model=self.layout_mode, formula_enable=self.formula_enable, table_enable=self.table_enable)
-         pipe_result = inference.pipe_ocr_mode(self.image_writer, lang=self.language)
-         md_content = pipe_result.get_markdown(self.prefix + file_name + "/")
-         return self.image_writer.remove_redundant_images(md_content)
+         inference = doc_analyze(
+             dataset,
+             ocr=True,
+             lang=self.language,
+             layout_model=self.layout_mode,
+             formula_enable=self.formula_enable,
+             table_enable=self.table_enable
+         )
+
+         image_writer = ImageWriter(self.s3_writer, self.svm_model, self.label_map)
+
+         pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)

+         folder_name = str(uuid.uuid4())
+         md_content = pipe_result.get_markdown(self.prefix + folder_name + "/")
+
+         # Remove references to images classified as "irrelevant"
+         final_markdown = image_writer.remove_redundant_images(md_content)
+         return final_markdown
+
+     def process_batch(self, file_urls: list[str]) -> dict:
+         results = {}
+         for url in file_urls:
+             try:
+                 md = self.process(url)
+                 results[url] = md
+             except Exception as e:
+                 results[url] = f"Error: {str(e)}"
+         return results

  class ImageWriter(DataWriter):
-     def __init__(self, s3_client: S3Writer):
-         self.s3_client = s3_client
+     """
+     Receives each extracted image. Classifies it, uploads if relevant, or flags
+     it for removal if irrelevant.
+     """
+     def __init__(self, s3_writer: S3Writer, svm_model, label_map):
+         self.s3_writer = s3_writer
+         self.svm_model = svm_model
+         self.label_map = label_map
          self._redundant_images_paths = []

-     def _process_image(self, data: bytes) -> str:
-         # TODO: actually process image
-         return True
-
      def write(self, path: str, data: bytes) -> None:
-         # process image, if it is a viable image, upload it to s3, otherwise save the path to that image as redundant
-         if self._process_image(data):
-             self.s3_client.write(path, data)
+         import tempfile
+         import os
+         import uuid
+
+         tmp_name = f"{uuid.uuid4()}.jpg"
+         tmp_path = os.path.join(tempfile.gettempdir(), tmp_name)
+         with open(tmp_path, "wb") as f:
+             f.write(data)
+
+         label_str = classify_image(tmp_path, self.svm_model, self.label_map)
+
+         os.remove(tmp_path)
+
+         if label_str == "relevant":
+             # Upload to S3
+             self.s3_writer.write(path, data)
          else:
              self._redundant_images_paths.append(path)

-     def remove_redundant_images(self, md_content: str):
+     def remove_redundant_images(self, md_content: str) -> str:
          for path in self._redundant_images_paths:
              md_content = md_content.replace(f"![]({path})", "")
          return md_content

  if __name__ == "__main__":
      processor = Processor()
-     URL = ""
-     print(processor.process(URL))
+
+     single_url = "https://example.com/somefile.pdf"
+     markdown_result = processor.process(single_url)
+     print("Single file Markdown:\n", markdown_result)
+
+     multiple_urls = ["https://example.com/file1.pdf", "https://example.com/file2.pdf"]
+     batch_results = processor.process_batch(multiple_urls)
+     print("Batch results:", batch_results)
model_classification/svm_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfd07af67fb52073a477bcded6ed710f402f492d4fbc11945fbab1a68f7ceb62
+ size 219034075
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2ef815afbb8970610618561946ce86faf60745ada64cd316ed34bfe34bdbf46f
- size 8934498
+ oid sha256:d1f185677978a6e3e908a7123d2f37ff64dd5ae87e594dd1281331aedb26ad27
+ size 6946816
requirements.txt CHANGED
@@ -1,29 +1,32 @@
  boto3>=1.28.43
  Brotli>=1.1.0
  click>=8.1.7
  PyMuPDF>=1.24.9,<1.24.14
  loguru>=0.6.0
  numpy>=1.21.6,<2.0.0
  fast-langdetect>=0.2.3
  scikit-learn>=1.0.2
  transformers
  pdfminer.six==20231228
  unimernet==0.2.3
  doclayout_yolo==0.0.2b1
  matplotlib
  ultralytics>=8.3.48
  paddleocr==2.7.3
  paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
  struct-eqtable==0.3.2
  detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
  magic-pdf>=1.0.1
  torch>=2.2.2,<=2.3.1
  torchvision>=0.17.2,<=0.18.1
  rapid-table>=1.0.3,<2.0.0
  rapidocr-paddle
  rapidocr-onnxruntime
  gradio-pdf>=0.0.21
  openai
  fastapi
  uvicorn
  python-multipart
+ pika==1.3.2
+ joblib==1.4.2
+ opencv-python-headless==4.11.0.86
svm_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfd07af67fb52073a477bcded6ed710f402f492d4fbc11945fbab1a68f7ceb62
+ size 219034075
worker.py ADDED
@@ -0,0 +1,179 @@
+ #!/usr/bin/env python3
+ import os
+ import json
+ import time
+ import threading
+ import multiprocessing
+ from concurrent.futures import ThreadPoolExecutor
+ import pika
+
+ from mineru_single import Processor
+
+ processor = Processor()
+
+ def run_pipeline(body_bytes: bytes):
+     """
+     1) Decode the body bytes to a string.
+     2) Parse the JSON. We expect something like:
+        {
+          "headers": {"request_type": "process_files", "request_id": "..."},
+          "body": {
+            "input_files": [...],
+            "topics": [...]
+          }
+        }
+     3) If request_type == "process_files", call processor.process_batch(...) on the URLs.
+     4) Return raw_text_outputs (str) and parsed_json_outputs (dict).
+     """
+
+     body_str = body_bytes.decode("utf-8")
+     data = json.loads(body_str)
+
+     headers = data.get("headers", {})
+     request_type = headers.get("request_type", "")
+     request_id = headers.get("request_id", "")
+     body = data.get("body", {})
+
+     # If it's not "process_files", we do nothing special
+     if request_type != "process_files":
+         return "No processing done", data
+
+     # Gather file URLs
+     input_files = body.get("input_files", [])
+     topics = body.get("topics", [])
+
+     urls = []
+     file_key_map = {}
+     for f in input_files:
+         key = f.get("key", "")
+         url = f.get("url", "")
+         urls.append(url)
+         file_key_map[url] = key
+
+     batch_results = processor.process_batch(urls)  # {url: markdown_string}
+
+     md_context = []
+     for url, md_content in batch_results.items():
+         key = file_key_map.get(url, "")
+         md_context.append({"key": key, "body": md_content})
+
+     out_headers = {
+         "request_type": "question_extraction_update_from_gpu_server",
+         "request_id": request_id
+     }
+     out_body = {
+         "input_files": input_files,
+         "topics": topics,
+         "md_context": md_context
+     }
+     final_json = {
+         "headers": out_headers,
+         "body": out_body
+     }
+
+     return json.dumps(final_json, ensure_ascii=False), final_json
+
+ def callback(ch, method, properties, body):
+     """
+     This function is invoked for each incoming RabbitMQ message.
+     """
+     thread_id = threading.current_thread().name
+     headers = properties.headers or {}
+
+     print(f"[Worker {thread_id}] Received message: {body}, headers: {headers}")
+
+     # If the header "process" is "topic_extraction", we run our pipeline
+     if headers.get("process") == "topic_extraction":
+         raw_text_outputs, parsed_json_outputs = run_pipeline(body)
+         # Do something with the result, e.g. print or store in DB
+         print(f"[Worker {thread_id}] Pipeline result:\n{raw_text_outputs}")
+     else:
+         # Fallback if "process" is something else
+         print(f"[Worker {thread_id}] Unknown process, sleeping 10s.")
+         time.sleep(10)
+     print("[Worker] Done")
+
+ def worker(channel):
+     try:
+         channel.start_consuming()
+     except Exception as e:
+         print(f"[Worker] Error: {e}")
+
+ def connect_to_rabbitmq():
+     rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
+     connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
+     channel = connection.channel()
+
+     # Declare the queue
+     channel.queue_declare(queue="ml_server", durable=True)
+
+     # Limit messages per worker
+     channel.basic_qos(prefetch_count=1)
+
+     # auto_ack=True for simplicity, else you must ack manually
+     channel.basic_consume(
+         queue="ml_server",
+         on_message_callback=callback,
+         auto_ack=True
+     )
+     return connection, channel
+
+ def main():
+     """
+     Main entry: starts multiple worker threads to consume from the queue.
+     """
+     num_workers = multiprocessing.cpu_count()
+     print(f"Starting {num_workers} workers")
+
+     with ThreadPoolExecutor(max_workers=num_workers) as executor:
+         for _ in range(num_workers):
+             connection, channel = connect_to_rabbitmq()
+             executor.submit(worker, channel)
+
+ if __name__ == "__main__":
+     """
+     If run directly, we also publish a test message, then start the workers.
+     """
+     rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
+     connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
+     channel = connection.channel()
+     channel.queue_declare(queue="ml_server", durable=True)
+
+     sample_message = {
+         "headers": {
+             "request_type": "process_files",
+             "request_id": "abc123"
+         },
+         "body": {
+             "input_files": [
+                 {
+                     "key": "file1",
+                     "url": "https://example.com/file1.pdf",
+                     "type": "mark_scheme"
+                 },
+                 {
+                     "key": "file2",
+                     "url": "https://example.com/file2.pdf",
+                     "type": "question"
+                 }
+             ],
+             "topics": [
+                 {
+                     "title": "Algebra",
+                     "id": 123
+                 }
+             ]
+         }
+     }
+
+     channel.basic_publish(
+         exchange="",
+         routing_key="ml_server",
+         body=json.dumps(sample_message),
+         properties=pika.BasicProperties(
+             headers={"process": "topic_extraction"}
+         )
+     )
+     connection.close()
+
+     main()
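
Note that the consumer above uses auto_ack=True, so RabbitMQ considers each message handled the moment it is delivered; a worker that dies mid-pipeline loses the job. A manual-acknowledgement variant (a sketch, not part of this commit) would ack only after run_pipeline() returns:

def callback_with_ack(ch, method, properties, body):
    # Hypothetical variant of callback(): ack on success, nack on failure so
    # the broker can requeue or dead-letter the message.
    try:
        if (properties.headers or {}).get("process") == "topic_extraction":
            run_pipeline(body)
        ch.basic_ack(delivery_tag=method.delivery_tag)
    except Exception:
        ch.basic_nack(delivery_tag=method.delivery_tag, requeue=False)

# channel.basic_consume(queue="ml_server", on_message_callback=callback_with_ack)
# (auto_ack defaults to False)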