princhman committed
Commit b273357 · 1 Parent(s): 20dcfd9

RabbitMQ and SVM model

.gitattributes CHANGED
@@ -1,71 +1,71 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.8.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.8.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.8.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a4-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a5-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a6-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a7-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a8-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a9-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/arabic/arabic_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/devanagari/devanagari_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/japan/japan_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/ka/ka_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/ta/ta_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/te/te_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,26 +1,26 @@
FROM python:3.9

WORKDIR /code

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . /code/

# Make sure the inbox and output directories exist
RUN mkdir -p /code/inbox /code/output

# Expose the port
EXPOSE 7860

# Command to run the application
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,18 +1,18 @@
---
title: MinerU
emoji: 📚
colorFrom: purple
colorTo: blue
sdk: gradio
sdk_version: 5.8.0
app_file: app.py
pinned: false
license: agpl-3.0
models:
  - opendatalab/PDF-Extract-Kit-1.0
  - hantian/layoutreader
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

Paper: https://huggingface.co/papers/2409.18839
app.py CHANGED
@@ -1,54 +1,58 @@
  #!/usr/bin/env python3
  import os
- import shutil
- import logging
+ import json
+ import uuid
  import uvicorn
- from fastapi import FastAPI, File, UploadFile, Header, HTTPException
+ import pika
+ from fastapi import FastAPI, Body, Header, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
- from mineru_single import Processor

- processor = Processor()
  app = FastAPI()
- logging.basicConfig(level=logging.INFO)

- # Add CORS middleware to allow requests from any origin
+ API_KEY = os.getenv("SECRET_KEY")
+
  app.add_middleware(
      CORSMiddleware,
-     allow_origins=["*"],  # Allows all origins
+     allow_origins=["*"],
      allow_credentials=True,
-     allow_methods=["*"],  # Allows all methods
-     allow_headers=["*"],  # Allows all headers
+     allow_methods=["*"],
+     allow_headers=["*"],
  )

-
  @app.get("/")
  async def root():
-     """Health check endpoint"""
      return {"status": "ok", "message": "API is running"}

  @app.post("/process")
  async def process_pdf(
-     file_url: str,
+     input_json: dict = Body(...),
      x_api_key: str = Header(None, alias="X-API-Key")
  ):
-     # Get the secret key from environment variable
-     api_key = os.getenv("SECRET_KEY")
-
      if not x_api_key:
          raise HTTPException(status_code=401, detail="API key is missing")
-
-     if x_api_key != api_key:
+     if x_api_key != API_KEY:
          raise HTTPException(status_code=401, detail="Invalid API key")

-     # Process the file and wait for completion
-     markdown_text = processor.process(file_url)
+     # Connect to RabbitMQ
+     rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
+     connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
+     channel = connection.channel()
+     channel.queue_declare(queue="ml_server", durable=True)
+
+     channel.basic_publish(
+         exchange="",
+         routing_key="ml_server",
+         body=json.dumps(input_json),
+         properties=pika.BasicProperties(
+             headers={"process": "topic_extraction"}
+         )
+     )
+     connection.close()

      return {
-         "message": "Processing completed",
-         "code": 200,
-         "content": markdown_text
+         "message": "Job queued",
+         "request_id": input_json.get("headers", {}).get("request_id", str(uuid.uuid4()))
      }

- # If you want to run locally or for debug:
  if __name__ == "__main__":
-     uvicorn.run(app, host="0.0.0.0", port=7860)
+     uvicorn.run(app, host="0.0.0.0", port=8000)
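
For reference, a client call against the updated endpoint might look like the sketch below. This is an illustration, not part of the commit: the payload shape mirrors the sample_message used in worker.py, the port matches the uvicorn.run() call above, and the placeholder key stands in for the SECRET_KEY environment variable.

import requests

payload = {
    "headers": {"request_type": "process_files", "request_id": "abc123"},
    "body": {
        "input_files": [{"key": "file1", "url": "https://example.com/file1.pdf", "type": "question"}],
        "topics": [{"title": "Algebra", "id": 123}],
    },
}
resp = requests.post(
    "http://localhost:8000/process",
    json=payload,
    headers={"X-API-Key": "<SECRET_KEY value>"},  # hypothetical placeholder
)
print(resp.json())  # expected: {"message": "Job queued", "request_id": "abc123"}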
download_models_hf.py CHANGED
@@ -1,66 +1,66 @@
import json
import os

import requests
from huggingface_hub import snapshot_download


def download_json(url):
    # Download the JSON file
    response = requests.get(url)
    response.raise_for_status()  # Check that the request succeeded
    return response.json()


def download_and_modify_json(url, local_filename, modifications):
    if os.path.exists(local_filename):
        data = json.load(open(local_filename))
        config_version = data.get('config_version', '0.0.0')
        if config_version < '1.1.1':
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the modifications
    for key, value in modifications.items():
        data[key] = value

    # Save the modified content
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':

    mineru_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small_2501/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)

    model_dir = model_dir + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
header.html CHANGED
@@ -1,132 +1,132 @@
<html>
<head>
    <!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
    <style>
        .link-block {
            border: 1px solid transparent;
            border-radius: 24px;
            background-color: rgba(54, 54, 54, 1);
            cursor: pointer !important;
        }
        .link-block:hover {
            background-color: rgba(54, 54, 54, 0.75) !important;
            cursor: pointer !important;
        }
        .external-link {
            display: inline-flex;
            align-items: center;
            height: 36px;
            line-height: 36px;
            padding: 0 16px;
            cursor: pointer !important;
        }
        .external-link,
        .external-link:hover {
            cursor: pointer !important;
        }
        a {
            text-decoration: none;
        }
    </style>
</head>

<body>
    <div style="
        display: flex;
        flex-direction: column;
        justify-content: center;
        align-items: center;
        text-align: center;
        background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
        padding: 24px;
        gap: 24px;
        border-radius: 8px;
    ">
        <div style="
            display: flex;
            flex-direction: column;
            align-items: center;
            gap: 16px;
        ">
            <div style="display: flex; flex-direction: column; gap: 8px">
                <h1 style="
                    font-size: 48px;
                    color: #fafafa;
                    margin: 0;
                    font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
                        'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
                ">
                    MinerU: PDF Extraction Demo
                </h1>
            </div>
        </div>

        <p style="
            margin: 0;
            line-height: 1.6rem;
            font-size: 16px;
            color: #fafafa;
            opacity: 0.8;
        ">
            A one-stop, open-source, high-quality data extraction tool, supports
            PDF/webpage/e-book extraction.<br>
        </p>
        <style>
            .link-block {
                display: inline-block;
            }
            .link-block + .link-block {
                margin-left: 20px;
            }
        </style>

        <div class="column has-text-centered">
            <div class="publication-links">
                <!-- Code Link. -->
                <span class="link-block">
                    <a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
                        <span class="icon" style="margin-right: 4px">
                            <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
                        </span>
                        <span style="color: white">Code</span>
                    </a>
                </span>

                <!-- arXiv Link. -->
                <span class="link-block">
                    <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
                        <span class="icon" style="margin-right: 8px">
                            <i class="fas fa-file" style="color: white"></i>
                        </span>
                        <span style="color: white">Paper</span>
                    </a>
                </span>

                <!-- Homepage Link. -->
                <span class="link-block">
                    <a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
                        <span class="icon" style="margin-right: 8px">
                            <i class="fas fa-home" style="color: white"></i>
                        </span>
                        <span style="color: white">Homepage</span>
                    </a>
                </span>

                <!-- Client Link. -->
                <span class="link-block">
                    <a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
                        <span class="icon" style="margin-right: 8px">
                            <i class="fas fa-download" style="color: white"></i>
                        </span>
                        <span style="color: white">Download</span>
                    </a>
                </span>
            </div>
        </div>

        <!-- New Demo Links -->
    </div>

</body></html>
inference_svm_model.py ADDED
@@ -0,0 +1,29 @@
+ #!/usr/bin/env python3
+ import cv2
+ import numpy as np
+ from joblib import load
+
+ def load_svm_model(model_path: str):
+     return load(model_path)
+
+ def classify_image(
+     image_path: str,
+     loaded_model,
+     label_map: dict,
+     image_size=(128, 128)
+ ) -> str:
+     img = cv2.imread(image_path)
+     if img is None:
+         # If image fails to load, default to "irrelevant" or handle differently
+         return label_map[0]
+
+     img = cv2.resize(img, image_size)
+     x = img.flatten().reshape(1, -1)
+     pred = loaded_model.predict(x)[0]
+     return label_map[pred]
+
+ if __name__ == "__main__":
+     model = load_svm_model("./model_classification/svm_model.joblib")
+     label_map = {0: "irrelevant", 1: "relevant"}
+     result = classify_image("test.jpg", model, label_map)
+     print("Classification result:", result)
mineru_single.py CHANGED
@@ -1,37 +1,16 @@
  #!/usr/bin/env python3
  import os
- import time
- import base64
- import json
- import re
  import uuid
- from pathlib import Path
- from loguru import logger
+ import json
  import requests
- from magic_pdf.data.data_reader_writer import FileBasedDataReader
- from magic_pdf.tools.common import do_parse, prepare_env
- import pymupdf
- from magic_pdf.data.data_reader_writer.base import DataWriter
+ from loguru import logger
+
  from magic_pdf.data.dataset import PymuDocDataset
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.data.io.s3 import S3Writer
+ from magic_pdf.data.data_reader_writer.base import DataWriter

-
- # def to_pdf(file_path):
- #     """
- #     If input is not PDF, convert it to PDF using PyMuPDF
- #     """
- #     with pymupdf.open(file_path) as doc:
- #         if doc.is_pdf:
- #             return file_path
- #         else:
- #             pdf_bytes = doc.convert_to_pdf()
- #             unique_filename = f"{uuid.uuid4()}.pdf"
- #             tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
- #             with open(tmp_file_path, "wb") as tmp_pdf_file:
- #                 tmp_pdf_file.write(pdf_bytes)
- #             return tmp_file_path
-
+ from inference_svm_model import load_svm_model, classify_image

  class Processor:
      def __init__(self):
@@ -41,61 +20,104 @@ class Processor:
              bucket=os.getenv("S3_BUCKET_NAME"),
              endpoint_url=os.getenv("S3_ENDPOINT"),
          )
-         self.image_writer = ImageWriter(self.s3_writer)
+
+         model_path = os.getenv("SVM_MODEL_PATH", "./svm_model/svm_model.joblib")
+         self.svm_model = load_svm_model(model_path)
+         self.label_map = {0: "irrelevant", 1: "relevant"}

          with open("/home/user/magic-pdf.json", "r") as f:
              config = json.load(f)
+
          self.layout_mode = config["layout-config"]["model"]
          self.formula_enable = config["formula-config"]["enable"]
          self.table_enable = config["table-config"]["enable"]
          self.language = "en"
-         self.prefix = os.getenv("S3_ENDPOINT") + os.getenv("S3_BUCKET_NAME") + "/" + "document-extracts/"
-         self._init_model()
-
-     def _init_model(self):
-         os.system('pip uninstall -y magic-pdf')
-         os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
-         # os.system('pip install git+https://github.com/myhloli/Magic-PDF.git@dev')
-
-         os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
-         os.system('python download_models_hf.py')
-
-     def process(self, file_link: str, file_name: str = str(uuid.uuid4())):
-         print("Processing file")
-         response = requests.get(file_link)
+
+         endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
+         bucket = os.getenv("S3_BUCKET_NAME", "")
+         self.prefix = f"{endpoint}/{bucket}/document-extracts/"
+
+     def process(self, file_url: str) -> str:
+         logger.info("Processing file: {}", file_url)
+         response = requests.get(file_url)
          if response.status_code != 200:
-             raise Exception(f"Failed to download file from {file_link}")
+             raise Exception(f"Failed to download PDF: {file_url}")
          pdf_bytes = response.content

          dataset = PymuDocDataset(pdf_bytes)
-         inference = doc_analyze(dataset, ocr=True, lang=self.language, layout_model=self.layout_mode, formula_enable=self.formula_enable, table_enable=self.table_enable)
-         pipe_result = inference.pipe_ocr_mode(self.image_writer, lang=self.language)
-         md_content = pipe_result.get_markdown(self.prefix + file_name + "/")
-         return self.image_writer.remove_redundant_images(md_content)
+         inference = doc_analyze(
+             dataset,
+             ocr=True,
+             lang=self.language,
+             layout_model=self.layout_mode,
+             formula_enable=self.formula_enable,
+             table_enable=self.table_enable
+         )
+
+         image_writer = ImageWriter(self.s3_writer, self.svm_model, self.label_map)
+
+         pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)

+         folder_name = str(uuid.uuid4())
+         md_content = pipe_result.get_markdown(self.prefix + folder_name + "/")
+
+         # Remove references to images classified as "irrelevant"
+         final_markdown = image_writer.remove_redundant_images(md_content)
+         return final_markdown
+
+     def process_batch(self, file_urls: list[str]) -> dict:
+         results = {}
+         for url in file_urls:
+             try:
+                 md = self.process(url)
+                 results[url] = md
+             except Exception as e:
+                 results[url] = f"Error: {str(e)}"
+         return results

  class ImageWriter(DataWriter):
-     def __init__(self, s3_client: S3Writer):
-         self.s3_client = s3_client
+     """
+     Receives each extracted image. Classifies it, uploads if relevant, or flags
+     it for removal if irrelevant.
+     """
+     def __init__(self, s3_writer: S3Writer, svm_model, label_map):
+         self.s3_writer = s3_writer
+         self.svm_model = svm_model
+         self.label_map = label_map
          self._redundant_images_paths = []

-     def _process_image(self, data: bytes) -> str:
-         # TODO: actually process image
-         return True
-
      def write(self, path: str, data: bytes) -> None:
-         # process image, if it is a viable image, upload it to s3, otherwise save the path to that image as redundant
-         if self._process_image(data):
-             self.s3_client.write(path, data)
+         import tempfile
+         import os
+         import uuid
+
+         tmp_name = f"{uuid.uuid4()}.jpg"
+         tmp_path = os.path.join(tempfile.gettempdir(), tmp_name)
+         with open(tmp_path, "wb") as f:
+             f.write(data)
+
+         label_str = classify_image(tmp_path, self.svm_model, self.label_map)
+
+         os.remove(tmp_path)
+
+         if label_str == "relevant":
+             # Upload to S3
+             self.s3_writer.write(path, data)
          else:
              self._redundant_images_paths.append(path)

-     def remove_redundant_images(self, md_content: str):
+     def remove_redundant_images(self, md_content: str) -> str:
          for path in self._redundant_images_paths:
              md_content = md_content.replace(f"![]({path})", "")
          return md_content

  if __name__ == "__main__":
      processor = Processor()
-     URL = ""
-     print(processor.process(URL))
+
+     single_url = "https://example.com/somefile.pdf"
+     markdown_result = processor.process(single_url)
+     print("Single file Markdown:\n", markdown_result)
+
+     multiple_urls = ["https://example.com/file1.pdf", "https://example.com/file2.pdf"]
+     batch_results = processor.process_batch(multiple_urls)
+     print("Batch results:", batch_results)
model_classification/svm_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfd07af67fb52073a477bcded6ed710f402f492d4fbc11945fbab1a68f7ceb62
+ size 219034075
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2ef815afbb8970610618561946ce86faf60745ada64cd316ed34bfe34bdbf46f
- size 8934498
+ oid sha256:d1f185677978a6e3e908a7123d2f37ff64dd5ae87e594dd1281331aedb26ad27
+ size 6946816
requirements.txt CHANGED
@@ -1,29 +1,32 @@
  boto3>=1.28.43
  Brotli>=1.1.0
  click>=8.1.7
  PyMuPDF>=1.24.9,<1.24.14
  loguru>=0.6.0
  numpy>=1.21.6,<2.0.0
  fast-langdetect>=0.2.3
  scikit-learn>=1.0.2
  transformers
  pdfminer.six==20231228
  unimernet==0.2.3
  doclayout_yolo==0.0.2b1
  matplotlib
  ultralytics>=8.3.48
  paddleocr==2.7.3
  paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
  struct-eqtable==0.3.2
  detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
  magic-pdf>=1.0.1
  torch>=2.2.2,<=2.3.1
  torchvision>=0.17.2,<=0.18.1
  rapid-table>=1.0.3,<2.0.0
  rapidocr-paddle
  rapidocr-onnxruntime
  gradio-pdf>=0.0.21
  openai
  fastapi
  uvicorn
  python-multipart
+ pika==1.3.2
+ joblib==1.4.2
+ opencv-python-headless==4.11.0.86
svm_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfd07af67fb52073a477bcded6ed710f402f492d4fbc11945fbab1a68f7ceb62
+ size 219034075
worker.py ADDED
@@ -0,0 +1,179 @@
+ #!/usr/bin/env python3
+ import os
+ import json
+ import time
+ import threading
+ import multiprocessing
+ from concurrent.futures import ThreadPoolExecutor
+ import pika
+
+ from mineru_single import Processor
+
+ processor = Processor()
+
+ def run_pipeline(body_bytes: bytes):
+     """
+     1) Decode the body bytes to a string.
+     2) Parse the JSON. We expect something like:
+        {
+          "headers": {"request_type": "process_files", "request_id": "..."},
+          "body": {
+            "input_files": [...],
+            "topics": [...]
+          }
+        }
+     3) If request_type == "process_files", call processor.process_batch(...) on the URLs.
+     4) Return raw_text_outputs (str) and parsed_json_outputs (dict).
+     """
+
+     body_str = body_bytes.decode("utf-8")
+     data = json.loads(body_str)
+
+     headers = data.get("headers", {})
+     request_type = headers.get("request_type", "")
+     request_id = headers.get("request_id", "")
+     body = data.get("body", {})
+
+     # If it's not "process_files", we do nothing special
+     if request_type != "process_files":
+         return "No processing done", data
+
+     # Gather file URLs
+     input_files = body.get("input_files", [])
+     topics = body.get("topics", [])
+
+     urls = []
+     file_key_map = {}
+     for f in input_files:
+         key = f.get("key", "")
+         url = f.get("url", "")
+         urls.append(url)
+         file_key_map[url] = key
+
+     batch_results = processor.process_batch(urls)  # {url: markdown_string}
+
+     md_context = []
+     for url, md_content in batch_results.items():
+         key = file_key_map.get(url, "")
+         md_context.append({"key": key, "body": md_content})
+
+     out_headers = {
+         "request_type": "question_extraction_update_from_gpu_server",
+         "request_id": request_id
+     }
+     out_body = {
+         "input_files": input_files,
+         "topics": topics,
+         "md_context": md_context
+     }
+     final_json = {
+         "headers": out_headers,
+         "body": out_body
+     }
+
+     return json.dumps(final_json, ensure_ascii=False), final_json
+
+ def callback(ch, method, properties, body):
+     """
+     This function is invoked for each incoming RabbitMQ message.
+     """
+     thread_id = threading.current_thread().name
+     headers = properties.headers or {}
+
+     print(f"[Worker {thread_id}] Received message: {body}, headers: {headers}")
+
+     # If the header "process" is "topic_extraction", we run our pipeline
+     if headers.get("process") == "topic_extraction":
+         raw_text_outputs, parsed_json_outputs = run_pipeline(body)
+         # Do something with the result, e.g. print or store in DB
+         print(f"[Worker {thread_id}] Pipeline result:\n{raw_text_outputs}")
+     else:
+         # Fallback if "process" is something else
+         print(f"[Worker {thread_id}] Unknown process, sleeping 10s.")
+         time.sleep(10)
+     print("[Worker] Done")
+
+ def worker(channel):
+     try:
+         channel.start_consuming()
+     except Exception as e:
+         print(f"[Worker] Error: {e}")
+
+ def connect_to_rabbitmq():
+     rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
+     connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
+     channel = connection.channel()
+
+     # Declare the queue
+     channel.queue_declare(queue="ml_server", durable=True)
+
+     # Limit messages per worker
+     channel.basic_qos(prefetch_count=1)
+
+     # auto_ack=True for simplicity, else you must ack manually
+     channel.basic_consume(
+         queue="ml_server",
+         on_message_callback=callback,
+         auto_ack=True
+     )
+     return connection, channel
+
+ def main():
+     """
+     Main entry: starts multiple worker threads to consume from the queue.
+     """
+     num_workers = multiprocessing.cpu_count()
+     print(f"Starting {num_workers} workers")
+
+     with ThreadPoolExecutor(max_workers=num_workers) as executor:
+         for _ in range(num_workers):
+             connection, channel = connect_to_rabbitmq()
+             executor.submit(worker, channel)
+
+ if __name__ == "__main__":
+     """
+     If run directly, we also publish a test message, then start the workers.
+     """
+     rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
+     connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
+     channel = connection.channel()
+     channel.queue_declare(queue="ml_server", durable=True)
+
+     sample_message = {
+         "headers": {
+             "request_type": "process_files",
+             "request_id": "abc123"
+         },
+         "body": {
+             "input_files": [
+                 {
+                     "key": "file1",
+                     "url": "https://example.com/file1.pdf",
+                     "type": "mark_scheme"
+                 },
+                 {
+                     "key": "file2",
+                     "url": "https://example.com/file2.pdf",
+                     "type": "question"
+                 }
+             ],
+             "topics": [
+                 {
+                     "title": "Algebra",
+                     "id": 123
+                 }
+             ]
+         }
+     }
+
+     channel.basic_publish(
+         exchange="",
+         routing_key="ml_server",
+         body=json.dumps(sample_message),
+         properties=pika.BasicProperties(
+             headers={"process": "topic_extraction"}
+         )
+     )
+     connection.close()
+
+     main()
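
Note that the consumer above uses auto_ack=True, so RabbitMQ considers each message handled the moment it is delivered; a worker that dies mid-pipeline loses the job. A manual-acknowledgement variant (a sketch, not part of this commit) would ack only after run_pipeline() returns:

def callback_with_ack(ch, method, properties, body):
    # Hypothetical variant of callback(): ack on success, nack on failure so
    # the broker can requeue or dead-letter the message.
    try:
        if (properties.headers or {}).get("process") == "topic_extraction":
            run_pipeline(body)
        ch.basic_ack(delivery_tag=method.delivery_tag)
    except Exception:
        ch.basic_nack(delivery_tag=method.delivery_tag, requeue=False)

# channel.basic_consume(queue="ml_server", on_message_callback=callback_with_ack)
# (auto_ack defaults to False)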