docker_mineru / app.py
marcosremar2's picture
Fix: Use PymuDocDataset in API endpoint
53a34c2
raw
history blame
6.44 kB
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import tempfile
import os
import json
import traceback
from datetime import datetime
from typing import Dict, List, Any, Optional
# Import necessary components from magic_pdf based on convert_pdf.py
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# Application metadata: the text below is handed to FastAPI as the API
# description when the application object is created.
app_description = """
# MinerU PDF Processor API
This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and generates markdown from PDF documents.
## Features:
- PDF text extraction
- Markdown conversion
- Layout analysis (via output files)
"""

# The application object that the endpoints in this module register on.
app = FastAPI(
    version="1.0.0",
    title="MinerU PDF API",
    contact=dict(name="PDF Converter Service"),
    description=app_description,
)
# Add CORS middleware to allow cross-origin requests from any origin.
#
# Credentials are deliberately disabled: a wildcard origin must not be combined
# with credentialed requests (browsers reject a literal "*" in that case, and
# FastAPI's CORS documentation requires explicit origins, methods and headers
# whenever allow_credentials=True). None of the endpoints visible in this
# module rely on cookies or authorization headers, so nothing is lost.
# If credentialed access is ever needed, list the trusted origins explicitly
# instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,  # Must stay False while origins is a wildcard
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)
# Output locations, given relative to the process working directory (the app's
# working directory inside the container). Both are created up front so the
# request handlers can list and write into them without checking first.
local_md_dir = "output"
local_image_dir = "output/images"
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)
# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Report that the service is up.

    Returns:
        A dict with a constant "status" of "healthy", a "timestamp" holding
        the current local time as an ISO-8601 string (no timezone offset),
        and the "service" name.
    """
    now_iso = datetime.now().isoformat()
    # dict() keeps keyword order, so the response fields appear as
    # status, timestamp, service.
    return dict(
        status="healthy",
        timestamp=now_iso,
        service="mineru-pdf-processor",
    )
def _clear_output_files(directory: str) -> None:
    """Delete the regular files directly inside *directory*; subdirectories are left alone."""
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        # Only plain files are removed: os.remove() raises on a directory, which
        # would otherwise turn a stray subfolder into a 500 for every request.
        if os.path.isfile(entry_path):
            os.remove(entry_path)


def _convert_pdf_to_markdown(pdf_bytes: bytes, output_stem: str) -> str:
    """
    Run the magic-pdf pipeline on *pdf_bytes* and return the markdown text.

    Besides returning the markdown, this writes the intermediate artifacts
    (model/layout/span PDFs, content list, middle JSON) and the markdown file
    into ``local_md_dir``, each named after *output_stem*. Images are written
    through a writer rooted at ``local_image_dir``.
    """
    # Relative path used for image links inside the generated markdown.
    image_dir_rel_path = os.path.basename(local_image_dir)

    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)

    # The dataset is built straight from the PDF bytes.
    ds = PymuDocDataset(pdf_bytes)

    # Pick the OCR or text pipeline according to the dataset's own classification.
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    # Intermediate output files (useful for debugging / layout inspection).
    infer_result.draw_model(os.path.join(local_md_dir, f"{output_stem}_model.pdf"))
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{output_stem}_layout.pdf"))
    pipe_result.draw_span(os.path.join(local_md_dir, f"{output_stem}_spans.pdf"))
    pipe_result.dump_content_list(md_writer, f"{output_stem}_content_list.json", image_dir_rel_path)
    pipe_result.dump_middle_json(md_writer, f'{output_stem}_middle.json')

    md_content = pipe_result.get_markdown(image_dir_rel_path)

    # Also persist the markdown next to the other artifacts.
    md_file_path = f"{output_stem}.md"
    pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
    print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")
    return md_content


@app.post("/extract", tags=["PDF Processing"])
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Process a PDF file using PymuDocDataset and return the extracted markdown content.

    Parameters:
        file: The PDF file to process. Its filename must end in ".pdf"
            (case-insensitive) and its body must not be empty.

    Returns:
        On success, a JSON object with "filename", "status" ("success") and
        "markdown_content". If processing fails, a 500 JSON response with
        "error", "detail" and "filename".

    Raises:
        HTTPException: 400 when the filename is missing / not a PDF name, or
            when the uploaded file is empty.
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # An empty body is a client error; reject it here instead of letting
        # the parser fail and surface it as a 500.
        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

    # NOTE: nothing below awaits, so a request holds the event loop until the
    # conversion finishes. Within one event loop this also keeps requests from
    # interleaving while the shared output directories are cleared and refilled.
    # NOTE(review): several worker processes sharing these directories would
    # still interfere with each other - confirm the deployment uses one worker.
    temp_pdf_path = None
    try:
        # Save the uploaded PDF to a temporary file; its randomly generated
        # name is reused as the stem for this request's output files.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            # Record the path before writing so the finally block can remove
            # the file even if the write itself fails.
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(content)

        # Clear the previous request's output files (same policy as convert_pdf.py).
        _clear_output_files(local_image_dir)
        _clear_output_files(local_md_dir)

        name_without_suff = os.path.splitext(os.path.basename(temp_pdf_path))[0]
        md_content = _convert_pdf_to_markdown(content, name_without_suff)

        # Return the markdown content in the response
        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content
        }
    except Exception as e:
        # Top-level boundary for the request: log the failure and report it
        # as a 500 with the same payload shape clients already receive.
        error_detail = str(e)
        error_trace = traceback.format_exc()
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename
            }
        )
    finally:
        # Clean up the temporary file (best effort: never mask the response).
        if temp_pdf_path:
            try:
                os.unlink(temp_pdf_path)
            except FileNotFoundError:
                pass  # Already gone; nothing to clean up.
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_pdf_path}: {cleanup_error}")
if __name__ == "__main__":
    # uvicorn is imported inside the guard, so it is only loaded when this
    # file is run directly (local running).
    from uvicorn import run as run_server

    run_server("app:app", host="0.0.0.0", port=7860, reload=False)