File size: 6,441 Bytes
f30c298
44df236
f30c298
78bc6bc
 
 
f30c298
 
 
44df236
53a34c2
 
 
 
 
 
f30c298
 
 
44df236
f30c298
53a34c2
f30c298
 
 
53a34c2
 
f30c298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53a34c2
 
 
 
 
f30c298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53a34c2
f30c298
 
 
 
 
53a34c2
f30c298
 
 
 
44df236
f30c298
 
44df236
78bc6bc
 
 
 
53a34c2
 
 
 
 
 
 
 
 
 
 
 
 
 
78bc6bc
53a34c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78bc6bc
53a34c2
 
 
 
 
 
 
 
f30c298
53a34c2
 
 
 
78bc6bc
f30c298
44df236
f30c298
 
 
53a34c2
f30c298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53a34c2
 
f30c298
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import tempfile
import os
import json
import traceback
from datetime import datetime
from typing import Dict, List, Any, Optional

# Import necessary components from magic_pdf based on convert_pdf.py
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# Markdown overview handed to FastAPI as the API description
app_description = """
# MinerU PDF Processor API

This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and generates markdown from PDF documents.

## Features:
- PDF text extraction
- Markdown conversion
- Layout analysis (via output files)
"""

# The application object; the middleware and routes in this module attach to it
app = FastAPI(
    title="MinerU PDF API",
    version="1.0.0",
    description=app_description,
    contact=dict(name="PDF Converter Service"),
)

# Add CORS middleware so the API can be called from any origin.
# Credentials stay disabled: browsers reject a wildcard
# Access-Control-Allow-Origin on credentialed requests, and FastAPI's CORS
# documentation states that wildcards must not be combined with
# allow_credentials=True. To support cookies/authorization headers, replace
# the wildcards with explicit origins, methods and headers first.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,  # Must remain False while origins is a wildcard
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Output locations, resolved against the process's current working directory.
# Markdown and JSON/PDF artifacts go in `local_md_dir`; extracted images go in
# the `images` folder beneath it.
local_md_dir = "output"
local_image_dir = "output/images"

# Make sure both directories exist before any request is served
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)

# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Report that the service is up.

    Returns:
        A dict with a fixed "status" and "service" name, plus "timestamp":
        the current time from `datetime.now()` as an ISO 8601 string.
    """
    now_iso = datetime.now().isoformat()
    payload: Dict[str, Any] = dict(
        status="healthy",
        timestamp=now_iso,
        service="mineru-pdf-processor",
    )
    return payload

def _clear_output_files(directory: str) -> None:
    """Delete the regular files directly inside *directory*; subdirectories are left alone."""
    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        # os.remove() raises on directories, so only regular files are removed
        if os.path.isfile(item_path):
            os.remove(item_path)


@app.post("/extract", tags=["PDF Processing"])
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Process a PDF file using PymuDocDataset and return the extracted markdown content.

    Side effects: files left in the output directories by earlier requests are
    deleted, and the artifacts for this request (model/layout/span PDFs,
    content-list and middle JSON, markdown, images) are written there.

    Parameters:
        file: The PDF file to process

    Returns:
        On success, a JSON object with "filename", "status" ("success") and
        "markdown_content". If processing fails, a 500 JSONResponse with
        "error", "detail" and "filename" is returned instead.

    Raises:
        HTTPException: 400 if the upload has no ".pdf" filename or is empty.
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # An empty body can never be a valid PDF; report it as a client error
        # instead of letting the parser fail with a 500.
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    temp_pdf_path = None

    try:
        # Save the uploaded PDF to a temporary file. Its path is only used to
        # derive a unique base name for the output files; the pipeline itself
        # reads the in-memory bytes.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf.write(content)
            temp_pdf_path = temp_pdf.name

        # Clear previous output files so the output directory only holds the
        # artifacts of the most recent request.
        # NOTE(review): the output directories are shared by all requests and
        # the pipeline below runs synchronously inside this coroutine; confirm
        # this is acceptable before running requests concurrently.
        _clear_output_files(local_image_dir)
        _clear_output_files(local_md_dir)

        # Get filename and prepare output paths for magic-pdf
        pdf_file_name = os.path.basename(temp_pdf_path)
        name_without_suff = os.path.splitext(pdf_file_name)[0]
        # Relative path used for image links inside the generated markdown
        image_dir_rel_path = os.path.basename(local_image_dir)

        # Setup writers
        image_writer = FileBasedDataWriter(local_image_dir)
        md_writer = FileBasedDataWriter(local_md_dir)

        # Use PymuDocDataset for processing; the PDF bytes are passed directly
        ds = PymuDocDataset(content)

        # Inference and pipeline based on PDF type
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        # Optional: Generate intermediate output files (comment out if not needed for API)
        infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
        pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
        pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
        pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir_rel_path)
        pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')

        # Get markdown content
        md_content = pipe_result.get_markdown(image_dir_rel_path)

        # Dump markdown to file (optional for API, but useful for debugging/access)
        md_file_path = f"{name_without_suff}.md"
        pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
        print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")

        # Return the markdown content in the response
        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content
        }

    except Exception as e:
        # Top-level boundary: any processing failure becomes a structured 500
        error_detail = str(e)
        error_trace = traceback.format_exc()

        # Log the error
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)

        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename
            }
        )

    finally:
        # Clean up the temporary file; failure here must not mask the response
        if temp_pdf_path:
            try:
                os.unlink(temp_pdf_path)
            except FileNotFoundError:
                pass  # Already gone; nothing left to clean up
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_pdf_path}: {cleanup_error}")

if __name__ == "__main__":
    # Imported here so uvicorn is only required when this file is run directly
    import uvicorn

    # Serve the app on all interfaces, port 7860, without auto-reload
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        reload=False,
    )