RabbitMQ and SVM model
Browse files
- .gitattributes +71 -71
- Dockerfile +25 -25
- README.md +17 -17
- app.py +29 -25
- download_models_hf.py +66 -66
- header.html +131 -131
- inference_svm_model.py +29 -0
- mineru_single.py +80 -58
- model_classification/svm_model.joblib +3 -0
- paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams +2 -2
- requirements.txt +32 -29
- svm_model.joblib +3 -0
- worker.py +179 -0
.gitattributes
CHANGED
@@ -1,71 +1,71 @@ (every line removed and re-added with identical content; shown once below)
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.8.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.8.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.8.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a4-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a5-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a6-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a7-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a8-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
magic_pdf-0.9.0a9-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/arabic/arabic_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/devanagari/devanagari_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/japan/japan_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/ka/ka_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/ta/ta_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
paddleocr/whl/rec/te/te_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
Dockerfile
CHANGED
@@ -1,26 +1,26 @@ (every line removed and re-added with identical content; shown once below)
FROM python:3.9

WORKDIR /code

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . /code/

# Make sure the inbox and output directories exist
RUN mkdir -p /code/inbox /code/output

# Expose the port
EXPOSE 7860

# Command to run the application
CMD ["python", "app.py"]
README.md
CHANGED
@@ -1,18 +1,18 @@ (every line removed and re-added with identical content; shown once below)
---
title: MinerU
emoji: 📚
colorFrom: purple
colorTo: blue
sdk: gradio
sdk_version: 5.8.0
app_file: app.py
pinned: false
license: agpl-3.0
models:
- opendatalab/PDF-Extract-Kit-1.0
- hantian/layoutreader
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

Paper: https://huggingface.co/papers/2409.18839
app.py
CHANGED
@@ -1,54 +1,58 @@ (some removed lines are truncated in the page capture and left as-is)
 #!/usr/bin/env python3
 import os
-import
-import
+import json
+import uuid
 import uvicorn
+import pika
+from fastapi import FastAPI, Body, Header, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from mineru_single import Processor
 
-processor = Processor()
 app = FastAPI()
-logging.basicConfig(level=logging.INFO)
 
+API_KEY = os.getenv("SECRET_KEY")
+
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 
 @app.get("/")
 async def root():
-    """Health check endpoint"""
     return {"status": "ok", "message": "API is running"}
 
 @app.post("/process")
 async def process_pdf(
+    input_json: dict = Body(...),
     x_api_key: str = Header(None, alias="X-API-Key")
 ):
-    # Get the secret key from environment variable
-    api_key = os.getenv("SECRET_KEY")
-
     if not x_api_key:
         raise HTTPException(status_code=401, detail="API key is missing")
-    if x_api_key != api_key:
+    if x_api_key != API_KEY:
         raise HTTPException(status_code=401, detail="Invalid API key")
 
+    # Connect to RabbitMQ
+    rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
+    connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
+    channel = connection.channel()
+    channel.queue_declare(queue="ml_server", durable=True)
+
+    channel.basic_publish(
+        exchange="",
+        routing_key="ml_server",
+        body=json.dumps(input_json),
+        properties=pika.BasicProperties(
+            headers={"process": "topic_extraction"}
+        )
+    )
+    connection.close()
 
     return {
-        "message": "
-        "content": markdown_text
+        "message": "Job queued",
+        "request_id": input_json.get("headers", {}).get("request_id", str(uuid.uuid4()))
     }
 
-# If you want to run locally or for debug:
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=
+    uvicorn.run(app, host="0.0.0.0", port=8000)
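For illustration, a client could enqueue a job through this endpoint as in the sketch below. This is not part of the commit: the host and port assume the local uvicorn default above, the payload shape mirrors the sample message in worker.py further down, and the X-API-Key value must equal the server's SECRET_KEY.

# Hypothetical client call -- assumes the API is running locally on port 8000
import requests

payload = {
    "headers": {"request_type": "process_files", "request_id": "abc123"},
    "body": {
        "input_files": [{"key": "file1", "url": "https://example.com/file1.pdf", "type": "question"}],
        "topics": [{"title": "Algebra", "id": 123}],
    },
}

resp = requests.post(
    "http://localhost:8000/process",
    json=payload,
    headers={"X-API-Key": "<value of SECRET_KEY>"},
)
print(resp.json())  # e.g. {"message": "Job queued", "request_id": "abc123"}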
download_models_hf.py
CHANGED
@@ -1,66 +1,66 @@ (every line removed and re-added with identical content; shown once below, with the Chinese comments translated)
import json
import os

import requests
from huggingface_hub import snapshot_download


def download_json(url):
    # Download the JSON file
    response = requests.get(url)
    response.raise_for_status()  # Check that the request succeeded
    return response.json()


def download_and_modify_json(url, local_filename, modifications):
    if os.path.exists(local_filename):
        data = json.load(open(local_filename))
        config_version = data.get('config_version', '0.0.0')
        if config_version < '1.1.1':
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the modifications
    for key, value in modifications.items():
        data[key] = value

    # Save the modified content
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':

    mineru_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small_2501/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)

    model_dir = model_dir + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
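Once this script has run, the Processor in mineru_single.py (below) reads the generated config. A quick, illustrative sanity check of the written file, assuming it was produced under the same home directory, might be:

# Minimal sanity check of the generated config -- illustrative only
import json
import os

config_file = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
with open(config_file, 'r', encoding='utf-8') as f:
    config = json.load(f)

# These are the two keys the script injects via json_mods
print(config['models-dir'])
print(config['layoutreader-model-dir'])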
header.html
CHANGED
@@ -1,132 +1,132 @@ (every line removed and re-added with identical content; shown once below)
<html>
<head>
  <!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
  <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
  <style>
    .link-block {
      border: 1px solid transparent;
      border-radius: 24px;
      background-color: rgba(54, 54, 54, 1);
      cursor: pointer !important;
    }
    .link-block:hover {
      background-color: rgba(54, 54, 54, 0.75) !important;
      cursor: pointer !important;
    }
    .external-link {
      display: inline-flex;
      align-items: center;
      height: 36px;
      line-height: 36px;
      padding: 0 16px;
      cursor: pointer !important;
    }
    .external-link,
    .external-link:hover {
      cursor: pointer !important;
    }
    a {
      text-decoration: none;
    }
  </style>
</head>

<body>
  <div style="
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    text-align: center;
    background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
    padding: 24px;
    gap: 24px;
    border-radius: 8px;
  ">
    <div style="
      display: flex;
      flex-direction: column;
      align-items: center;
      gap: 16px;
    ">
      <div style="display: flex; flex-direction: column; gap: 8px">
        <h1 style="
          font-size: 48px;
          color: #fafafa;
          margin: 0;
          font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
            'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
        ">
          MinerU: PDF Extraction Demo
        </h1>
      </div>
    </div>

    <p style="
      margin: 0;
      line-height: 1.6rem;
      font-size: 16px;
      color: #fafafa;
      opacity: 0.8;
    ">
      A one-stop, open-source, high-quality data extraction tool, supports
      PDF/webpage/e-book extraction.<br>
    </p>
    <style>
      .link-block {
        display: inline-block;
      }
      .link-block + .link-block {
        margin-left: 20px;
      }
    </style>

    <div class="column has-text-centered">
      <div class="publication-links">
        <!-- Code Link. -->
        <span class="link-block">
          <a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
            <span class="icon" style="margin-right: 4px">
              <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
            </span>
            <span style="color: white">Code</span>
          </a>
        </span>

        <!-- arXiv Link. -->
        <span class="link-block">
          <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
            <span class="icon" style="margin-right: 8px">
              <i class="fas fa-file" style="color: white"></i>
            </span>
            <span style="color: white">Paper</span>
          </a>
        </span>

        <!-- Homepage Link. -->
        <span class="link-block">
          <a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
            <span class="icon" style="margin-right: 8px">
              <i class="fas fa-home" style="color: white"></i>
            </span>
            <span style="color: white">Homepage</span>
          </a>
        </span>

        <!-- Client Link. -->
        <span class="link-block">
          <a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
            <span class="icon" style="margin-right: 8px">
              <i class="fas fa-download" style="color: white"></i>
            </span>
            <span style="color: white">Download</span>
          </a>
        </span>
      </div>
    </div>

    <!-- New Demo Links -->
  </div>

</body></html>
inference_svm_model.py
ADDED
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
import cv2
import numpy as np
from joblib import load

def load_svm_model(model_path: str):
    return load(model_path)

def classify_image(
    image_path: str,
    loaded_model,
    label_map: dict,
    image_size=(128, 128)
) -> str:
    img = cv2.imread(image_path)
    if img is None:
        # If image fails to load, default to "irrelevant" or handle differently
        return label_map[0]

    img = cv2.resize(img, image_size)
    x = img.flatten().reshape(1, -1)
    pred = loaded_model.predict(x)[0]
    return label_map[pred]

if __name__ == "__main__":
    model = load_svm_model("./model_classification/svm_model.joblib")
    label_map = {0: "irrelevant", 1: "relevant"}
    result = classify_image("test.jpg", model, label_map)
    print("Classification result:", result)
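The commit ships the trained model only as an LFS blob and includes no training code. Purely as an illustration of how a compatible model could be produced, the sketch below trains an SVC on the same features classify_image uses (BGR pixels resized to 128x128 and flattened). The data/irrelevant and data/relevant folders and the default SVC settings are assumptions; the commit does not record the actual training setup.

#!/usr/bin/env python3
# Hypothetical training sketch -- not part of this commit.
# Assumes ./data/irrelevant and ./data/relevant hold labeled images.
import glob
import os

import cv2
import numpy as np
from joblib import dump
from sklearn.svm import SVC

def load_split(folder: str, label: int, image_size=(128, 128)):
    xs, ys = [], []
    for path in glob.glob(os.path.join(folder, "*")):
        img = cv2.imread(path)
        if img is None:
            continue
        # Same features as classify_image: resized BGR pixels, flattened
        xs.append(cv2.resize(img, image_size).flatten())
        ys.append(label)
    return xs, ys

x0, y0 = load_split("./data/irrelevant", 0)
x1, y1 = load_split("./data/relevant", 1)
X = np.array(x0 + x1)
y = np.array(y0 + y1)

clf = SVC()  # kernel and C unknown; assumed defaults
clf.fit(X, y)
dump(clf, "svm_model.joblib")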
mineru_single.py
CHANGED
@@ -1,37 +1,16 @@ (some removed lines are truncated in the page capture and left as-is)
 #!/usr/bin/env python3
 import os
-import time
-import base64
-import json
-import re
 import uuid
+import json
 import requests
-from loguru import logger
-from
-import pymupdf
-from magic_pdf.data.data_reader_writer.base import DataWriter
+from loguru import logger
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.data.io.s3 import S3Writer
+from magic_pdf.data.data_reader_writer.base import DataWriter
 
+from inference_svm_model import load_svm_model, classify_image
+
-# def to_pdf(file_path):
-#     """
-#     If input is not PDF, convert it to PDF using PyMuPDF
-#     """
-#     with pymupdf.open(file_path) as doc:
-#         if doc.is_pdf:
-#             return file_path
-#         else:
-#             pdf_bytes = doc.convert_to_pdf()
-#             unique_filename = f"{uuid.uuid4()}.pdf"
-#             tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
-#             with open(tmp_file_path, "wb") as tmp_pdf_file:
-#                 tmp_pdf_file.write(pdf_bytes)
-#             return tmp_file_path
 
 class Processor:
     def __init__(self):
@@ -41,61 +20,104 @@ class Processor:
             bucket=os.getenv("S3_BUCKET_NAME"),
             endpoint_url=os.getenv("S3_ENDPOINT"),
         )
+
+        model_path = os.getenv("SVM_MODEL_PATH", "./svm_model/svm_model.joblib")
+        self.svm_model = load_svm_model(model_path)
+        self.label_map = {0: "irrelevant", 1: "relevant"}
 
         with open("/home/user/magic-pdf.json", "r") as f:
            config = json.load(f)
+
        self.layout_mode = config["layout-config"]["model"]
        self.formula_enable = config["formula-config"]["enable"]
        self.table_enable = config["table-config"]["enable"]
        self.language = "en"
-
-        os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
-        os.system('python download_models_hf.py')
-
-    def process(self, file_link: str, file_name: str = str(uuid.uuid4())):
-        print("Processing file")
-        response = requests.get(file_link)
+
+        endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
+        bucket = os.getenv("S3_BUCKET_NAME", "")
+        self.prefix = f"{endpoint}/{bucket}/document-extracts/"
+
+    def process(self, file_url: str) -> str:
+        logger.info("Processing file: {}", file_url)
+        response = requests.get(file_url)
         if response.status_code != 200:
-            raise Exception(f"Failed to download
+            raise Exception(f"Failed to download PDF: {file_url}")
         pdf_bytes = response.content
 
         dataset = PymuDocDataset(pdf_bytes)
-        inference = doc_analyze(
+        inference = doc_analyze(
+            dataset,
+            ocr=True,
+            lang=self.language,
+            layout_model=self.layout_mode,
+            formula_enable=self.formula_enable,
+            table_enable=self.table_enable
+        )
+
+        image_writer = ImageWriter(self.s3_writer, self.svm_model, self.label_map)
+
+        pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
 
+        folder_name = str(uuid.uuid4())
+        md_content = pipe_result.get_markdown(self.prefix + folder_name + "/")
+
+        # Remove references to images classified as "irrelevant"
+        final_markdown = image_writer.remove_redundant_images(md_content)
+        return final_markdown
+
+    def process_batch(self, file_urls: list[str]) -> dict:
+        results = {}
+        for url in file_urls:
+            try:
+                md = self.process(url)
+                results[url] = md
+            except Exception as e:
+                results[url] = f"Error: {str(e)}"
+        return results
 
 class ImageWriter(DataWriter):
+    """
+    Receives each extracted image. Classifies it, uploads it if relevant, or flags
+    it for removal if irrelevant.
+    """
+    def __init__(self, s3_writer: S3Writer, svm_model, label_map):
+        self.s3_writer = s3_writer
+        self.svm_model = svm_model
+        self.label_map = label_map
         self._redundant_images_paths = []
 
-    def _process_image(self, data: bytes) -> str:
-        # TODO: actually process image
-        return True
-
     def write(self, path: str, data: bytes) -> None:
+        import tempfile
+        import os
+        import uuid
+
+        tmp_name = f"{uuid.uuid4()}.jpg"
+        tmp_path = os.path.join(tempfile.gettempdir(), tmp_name)
+        with open(tmp_path, "wb") as f:
+            f.write(data)
+
+        label_str = classify_image(tmp_path, self.svm_model, self.label_map)
+
+        os.remove(tmp_path)
+
+        if label_str == "relevant":
+            # Upload to S3
+            self.s3_writer.write(path, data)
         else:
             self._redundant_images_paths.append(path)
 
-    def remove_redundant_images(self, md_content: str):
+    def remove_redundant_images(self, md_content: str) -> str:
        for path in self._redundant_images_paths:
            md_content = md_content.replace(f"![]({path})", "")
        return md_content
 
 if __name__ == "__main__":
     processor = Processor()
+
+    single_url = "https://example.com/somefile.pdf"
+    markdown_result = processor.process(single_url)
+    print("Single file Markdown:\n", markdown_result)
+
+    multiple_urls = ["https://example.com/file1.pdf", "https://example.com/file2.pdf"]
+    batch_results = processor.process_batch(multiple_urls)
+    print("Batch results:", batch_results)
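To make the markdown cleanup concrete, here is a tiny standalone illustration of the replacement remove_redundant_images performs. The ![](path) image syntax is an assumption based on MinerU's markdown output (the page capture elided the exact f-string), and the paths are made up:

# Illustrative only: stripping irrelevant image references from markdown
md = "Intro text\n![](document-extracts/abc/logo.jpg)\n![](document-extracts/abc/figure1.jpg)\nMore text"
redundant = ["document-extracts/abc/logo.jpg"]  # e.g. classified "irrelevant"
for path in redundant:
    md = md.replace(f"![]({path})", "")
print(md)  # only the relevant figure reference remains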
model_classification/svm_model.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfd07af67fb52073a477bcded6ed710f402f492d4fbc11945fbab1a68f7ceb62
size 219034075
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams
CHANGED
@@ -1,3 +1,3 @@ (the removed oid and size are truncated in the page capture)
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d1f185677978a6e3e908a7123d2f37ff64dd5ae87e594dd1281331aedb26ad27
+size 6946816
requirements.txt
CHANGED
@@ -1,29 +1,32 @@ (existing lines rewritten unchanged; three dependencies added at the end)
 boto3>=1.28.43
 Brotli>=1.1.0
 click>=8.1.7
 PyMuPDF>=1.24.9,<1.24.14
 loguru>=0.6.0
 numpy>=1.21.6,<2.0.0
 fast-langdetect>=0.2.3
 scikit-learn>=1.0.2
 transformers
 pdfminer.six==20231228
 unimernet==0.2.3
 doclayout_yolo==0.0.2b1
 matplotlib
 ultralytics>=8.3.48
 paddleocr==2.7.3
 paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
 struct-eqtable==0.3.2
 detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
 magic-pdf>=1.0.1
 torch>=2.2.2,<=2.3.1
 torchvision>=0.17.2,<=0.18.1
 rapid-table>=1.0.3,<2.0.0
 rapidocr-paddle
 rapidocr-onnxruntime
 gradio-pdf>=0.0.21
 openai
 fastapi
 uvicorn
 python-multipart
+pika==1.3.2
+joblib==1.4.2
+opencv-python-headless==4.11.0.86
svm_model.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfd07af67fb52073a477bcded6ed710f402f492d4fbc11945fbab1a68f7ceb62
size 219034075
worker.py
ADDED
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
import os
import json
import time
import threading
import multiprocessing
from concurrent.futures import ThreadPoolExecutor
import pika

from mineru_single import Processor

processor = Processor()

def run_pipeline(body_bytes: bytes):
    """
    1) Decode the body bytes to a string.
    2) Parse the JSON. We expect something like:
       {
         "headers": {"request_type": "process_files", "request_id": "..."},
         "body": {
           "input_files": [...],
           "topics": [...]
         }
       }
    3) If request_type == "process_files", call processor.process_batch(...) on the URLs.
    4) Return raw_text_outputs (str) and parsed_json_outputs (dict).
    """

    body_str = body_bytes.decode("utf-8")
    data = json.loads(body_str)

    headers = data.get("headers", {})
    request_type = headers.get("request_type", "")
    request_id = headers.get("request_id", "")
    body = data.get("body", {})

    # If it's not "process_files", we do nothing special
    if request_type != "process_files":
        return "No processing done", data

    # Gather file URLs
    input_files = body.get("input_files", [])
    topics = body.get("topics", [])

    urls = []
    file_key_map = {}
    for f in input_files:
        key = f.get("key", "")
        url = f.get("url", "")
        urls.append(url)
        file_key_map[url] = key

    batch_results = processor.process_batch(urls)  # {url: markdown_string}

    md_context = []
    for url, md_content in batch_results.items():
        key = file_key_map.get(url, "")
        md_context.append({"key": key, "body": md_content})

    out_headers = {
        "request_type": "question_extraction_update_from_gpu_server",
        "request_id": request_id
    }
    out_body = {
        "input_files": input_files,
        "topics": topics,
        "md_context": md_context
    }
    final_json = {
        "headers": out_headers,
        "body": out_body
    }

    return json.dumps(final_json, ensure_ascii=False), final_json

def callback(ch, method, properties, body):
    """
    This function is invoked for each incoming RabbitMQ message.
    """
    thread_id = threading.current_thread().name
    headers = properties.headers or {}

    print(f"[Worker {thread_id}] Received message: {body}, headers: {headers}")

    # If the header "process" is "topic_extraction", we run our pipeline
    if headers.get("process") == "topic_extraction":
        raw_text_outputs, parsed_json_outputs = run_pipeline(body)
        # Do something with the result, e.g. print or store in DB
        print(f"[Worker {thread_id}] Pipeline result:\n{raw_text_outputs}")
    else:
        # Fallback if "process" is something else
        print(f"[Worker {thread_id}] Unknown process, sleeping 10s.")
        time.sleep(10)
        print("[Worker] Done")

def worker(channel):
    try:
        channel.start_consuming()
    except Exception as e:
        print(f"[Worker] Error: {e}")

def connect_to_rabbitmq():
    rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
    connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
    channel = connection.channel()

    # Declare the queue
    channel.queue_declare(queue="ml_server", durable=True)

    # Limit messages per worker
    channel.basic_qos(prefetch_count=1)

    # auto_ack=True for simplicity, else you must ack manually
    channel.basic_consume(
        queue="ml_server",
        on_message_callback=callback,
        auto_ack=True
    )
    return connection, channel

def main():
    """
    Main entry: starts multiple worker threads to consume from the queue.
    """
    num_workers = multiprocessing.cpu_count()
    print(f"Starting {num_workers} workers")

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        for _ in range(num_workers):
            connection, channel = connect_to_rabbitmq()
            executor.submit(worker, channel)

if __name__ == "__main__":
    """
    If run directly, we also publish a test message, then start the workers.
    """
    rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
    connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
    channel = connection.channel()
    channel.queue_declare(queue="ml_server", durable=True)

    sample_message = {
        "headers": {
            "request_type": "process_files",
            "request_id": "abc123"
        },
        "body": {
            "input_files": [
                {
                    "key": "file1",
                    "url": "https://example.com/file1.pdf",
                    "type": "mark_scheme"
                },
                {
                    "key": "file2",
                    "url": "https://example.com/file2.pdf",
                    "type": "question"
                }
            ],
            "topics": [
                {
                    "title": "Algebra",
                    "id": 123
                }
            ]
        }
    }

    channel.basic_publish(
        exchange="",
        routing_key="ml_server",
        body=json.dumps(sample_message),
        properties=pika.BasicProperties(
            headers={"process": "topic_extraction"}
        )
    )
    connection.close()

    main()
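Note that the consumer above uses auto_ack=True, so a message is acknowledged on delivery and is lost if a worker crashes mid-job. A manual-acknowledgement variant, sketched here with standard pika calls rather than taken from the commit, would requeue failed jobs:

# Sketch: manual acks so unprocessed messages are redelivered on failure.
# Assumes the run_pipeline and channel definitions from worker.py above.
def callback_with_ack(ch, method, properties, body):
    try:
        headers = properties.headers or {}
        if headers.get("process") == "topic_extraction":
            run_pipeline(body)
        ch.basic_ack(delivery_tag=method.delivery_tag)
    except Exception:
        # Requeue the message so another worker can retry it
        ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)

channel.basic_consume(queue="ml_server", on_message_callback=callback_with_ack, auto_ack=False)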