docker_mineru / app.py
marcosremar2's picture
Enhance FastAPI implementation with better documentation, error handling and examples
f30c298
raw
history blame
3.82 kB
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import magic_pdf
import tempfile
import os
import json
import traceback
import uvicorn
from datetime import datetime
from typing import Dict, List, Any, Optional
# Application metadata: passed to FastAPI as the API description, which is
# included in the generated OpenAPI schema / interactive docs.
app_description = """
# MinerU PDF Processor API
This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and tables from PDF documents.
## Features:
- PDF text extraction
- Table detection and extraction
- JSON response for easy integration
"""

app = FastAPI(
    title="MinerU PDF API",
    description=app_description,
    version="1.0.0",
    contact={
        "name": "PDF Converter Service",
    },
)

# Add CORS middleware to allow cross-origin requests from any site.
# Credentials are deliberately disabled: a wildcard origin must not be
# combined with allow_credentials=True (the middleware would otherwise echo
# back the caller's origin for credentialed requests, letting any site make
# them). None of the endpoints in this file rely on cookies or HTTP auth.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)
# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """Report that the service is running.

    Returns:
        A dict with a fixed ``status`` of ``"healthy"``, the service name,
        and ``timestamp``: the server's current local time as an ISO 8601
        string that includes its UTC offset.
    """
    # astimezone() with no argument attaches the system's local timezone, so
    # the serialized value carries an explicit offset instead of being an
    # ambiguous naive timestamp that a remote client cannot interpret.
    current_time = datetime.now().astimezone()
    return {
        "status": "healthy",
        "timestamp": current_time.isoformat(),
        "service": "mineru-pdf-processor",
    }
def _parse_pdf_bytes(content: bytes, filename: str) -> Dict[str, Any]:
    """Parse raw PDF bytes with magic_pdf and return a plain-dict summary.

    The bytes are written to a temporary ``.pdf`` file because the parser is
    handed a filesystem path; that file is removed again before returning,
    whether or not parsing succeeded. This function blocks (disk I/O plus
    parsing), so async callers should run it in a worker thread.

    Parameters:
        content: Raw bytes of the uploaded PDF.
        filename: Original upload name, echoed back in the result.

    Returns:
        ``{"filename": ..., "pages": [...]}`` where each page entry holds
        ``page_num``, ``text`` (the page's text blocks joined by newlines)
        and ``tables`` (one Markdown string per table).

    Raises:
        Exception: whatever the temp-file write or the parser raises.
    """
    temp_pdf_path: Optional[str] = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            # Record the path before writing so a failed write still gets
            # cleaned up (delete=False means nobody else removes the file).
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(content)

        # NOTE(review): assumes the installed magic_pdf exposes
        # PDF(path).parse() returning an object with .pages, each having
        # .page_num, .text_blocks and .tables — confirm against the library.
        result = magic_pdf.PDF(temp_pdf_path).parse()

        pages = [
            {
                "page_num": page.page_num,
                "text": "\n".join(block.text for block in page.text_blocks),
                "tables": [table.to_markdown() for table in page.tables],
            }
            for page in result.pages
        ]
        return {"filename": filename, "pages": pages}
    finally:
        # Clean up the temporary file. Best-effort: a failed removal must not
        # mask the parse result or the original error, but it is reported.
        if temp_pdf_path is not None:
            try:
                os.unlink(temp_pdf_path)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_pdf_path}: {cleanup_error}")


@app.post("/extract", tags=["PDF Processing"])
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """Extract text and tables from an uploaded PDF file.

    Parameters:
        file: The PDF file to process. Its name must end in ``.pdf``
            (case-insensitive) and its content must not be empty.

    Returns:
        On success, ``{"result": {"filename": ..., "pages": [...]}}`` with
        per-page text and Markdown tables. If processing fails, a
        ``JSONResponse`` with status 500 carrying ``error``, ``detail`` and
        ``filename``.

    Raises:
        HTTPException: 400 when the upload is not named ``*.pdf`` or is empty.
    """
    # Imported locally so this block needs no change to the file's import
    # section; fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # An empty body can never be a valid PDF; report it as a client
        # error instead of letting the parser fail with a 500.
        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

    try:
        # Temp-file I/O and parsing are blocking; running them in a worker
        # thread keeps the event loop free to serve other requests.
        output = await run_in_threadpool(_parse_pdf_bytes, content, file.filename)
    except Exception as e:
        error_detail = str(e)
        error_trace = traceback.format_exc()
        # Log the error (would be better with a proper logger)
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename,
            },
        )
    return {"result": output}
if __name__ == "__main__":
    # Start the ASGI server only when this file is executed as a script;
    # the application is referenced by its import string "module:attribute".
    server_options = {
        "host": "0.0.0.0",
        "port": 7860,
        "reload": False,
    }
    uvicorn.run("app:app", **server_options)