File size: 3,816 Bytes
f30c298
44df236
f30c298
78bc6bc
 
 
 
f30c298
 
 
 
44df236
f30c298
 
 
44df236
f30c298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44df236
f30c298
 
44df236
78bc6bc
 
 
 
 
 
 
 
 
 
f30c298
78bc6bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f30c298
44df236
f30c298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import json
import logging
import os
import tempfile
import traceback
from datetime import datetime
from typing import Dict, List, Any, Optional

import magic_pdf
import uvicorn
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

# Application metadata
# Markdown text passed to FastAPI below as the API description.
app_description = """
# MinerU PDF Processor API

This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and tables from PDF documents.

## Features:
- PDF text extraction
- Table detection and extraction
- JSON response for easy integration
"""

# The application object; middleware and routes defined later in this
# module are registered on it.
app = FastAPI(
    title="MinerU PDF API",
    version="1.0.0",
    description=app_description,
    contact={"name": "PDF Converter Service"},
)

# Add CORS middleware to allow cross-origin requests from any origin.
#
# Credentials stay disabled on purpose: wildcard origins/methods/headers must
# not be combined with allow_credentials=True (see the FastAPI CORS docs).
# The endpoints defined in this module read no cookies or auth headers, so
# anonymous cross-origin access is all they need. If credentialed requests
# are ever required, replace the wildcards with explicit values first.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,  # Must be False while the wildcards above/below are used
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Health check endpoint to verify the service is running.

    Returns:
        A dictionary with a fixed "status" of "healthy", the current server
        time under "timestamp" (ISO 8601, including the server's UTC offset
        so the value is unambiguous for remote callers), and the fixed
        "service" name.
    """
    # astimezone() with no argument attaches the local time zone to the
    # naive value from now(): same wall-clock time, but the serialized
    # string now carries its offset.
    current_time = datetime.now().astimezone()
    return {
        "status": "healthy",
        "timestamp": current_time.isoformat(),
        "service": "mineru-pdf-processor",
    }

logger = logging.getLogger(__name__)


def _page_to_dict(page: Any) -> Dict[str, Any]:
    """Flatten one parsed page into a JSON-serializable dictionary.

    Reads ``page.page_num``, the ``text`` of each entry in
    ``page.text_blocks`` (joined with newlines) and the ``to_markdown()``
    output of each entry in ``page.tables``.
    """
    return {
        "page_num": page.page_num,
        "text": "\n".join(block.text for block in page.text_blocks),
        "tables": [table.to_markdown() for table in page.tables],
    }


def _remove_temp_file(path: str) -> None:
    """Delete *path* on a best-effort basis; failures are logged, never raised."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        # Already gone, nothing left to clean up.
        pass
    except OSError:
        logger.warning("Could not remove temporary file %s", path, exc_info=True)


@app.post("/extract", tags=["PDF Processing"])
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Extract text and tables from a PDF file.

    Parameters:
        file: The PDF file to process

    Returns:
        On success, ``{"result": {"filename": ..., "pages": [...]}}`` where
        each page entry holds "page_num", "text" and "tables" (one markdown
        string per table). If processing fails, a JSON response with status
        500 containing "error", "detail" and "filename".

    Raises:
        HTTPException: status 400 when the upload has no name ending in
            ".pdf" or when its content is empty.
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # An empty body can never be a valid PDF; report it as a client
        # error instead of letting the parser fail with a 500.
        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

    temp_pdf_path = None

    try:
        # Save the uploaded PDF to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            # Record the path before writing so the cleanup below still
            # removes the file if the write itself fails.
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(content)

        # NOTE(review): confirm that the installed magic_pdf release exposes
        # PDF(path).parse() and a result with .pages as used here.
        # NOTE(review): this call is synchronous inside an async handler, so
        # other requests wait while it runs; consider offloading it once the
        # library's thread-safety has been verified.
        result = magic_pdf.PDF(temp_pdf_path).parse()

        return {
            "result": {
                "filename": file.filename,
                "pages": [_page_to_dict(page) for page in result.pages],
            }
        }

    except Exception as e:
        # Top-level boundary for this request: record the full traceback and
        # turn the failure into a structured 500 response.
        logger.exception("Error processing PDF %r", file.filename)

        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": str(e),
                "filename": file.filename,
            },
        )

    finally:
        # Clean up the temporary file
        if temp_pdf_path is not None:
            _remove_temp_file(temp_pdf_path)

if __name__ == "__main__":
    # Script entry point: start uvicorn with the "app:app" import string,
    # listening on all interfaces at port 7860 with auto-reload disabled.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        reload=False,
    )