"""FastAPI service that converts uploaded PDFs to markdown with magic-pdf."""

import json
import os
import tempfile
import threading
import traceback
import uuid
from datetime import datetime
from typing import Dict, List, Any, Optional

from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware

# Import necessary components from magic_pdf based on convert_pdf.py
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# Application metadata (rendered as markdown in the generated API docs)
app_description = """
# MinerU PDF Processor API

This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and generates markdown from PDF documents.

## Features:
- PDF text extraction
- Markdown conversion
- Layout analysis (via output files)
"""

app = FastAPI(
    title="MinerU PDF API",
    description=app_description,
    version="1.0.0",
    contact={
        "name": "PDF Converter Service",
    },
)

# Add CORS middleware to allow cross-origin requests.
# NOTE(review): a wildcard origin combined with allow_credentials=True lets any
# site issue credentialed requests; confirm this is intended before exposing
# the service beyond a trusted network.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Output directories (relative to the app's working directory in the container)
local_image_dir, local_md_dir = "output/images", "output"
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)

# Every conversion wipes and rewrites the shared output directories, so only
# one conversion may run at a time even though the work happens off the
# event loop.
_processing_lock = threading.Lock()


def _remove_regular_files(directory: str) -> None:
    """Delete the regular files directly inside *directory*.

    Sub-directories are left in place (``output/images`` lives inside
    ``output``), and the directory is re-created if it has gone missing.
    """
    os.makedirs(directory, exist_ok=True)
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file():
                os.remove(entry.path)


def _convert_pdf_to_markdown(pdf_bytes: bytes) -> str:
    """Run the magic-pdf pipeline on *pdf_bytes* and return the markdown.

    Side effects: clears the previous contents of the output directories and
    writes the model/layout/span PDFs, the content-list and middle JSON, and
    the markdown file for this document into ``local_md_dir`` (images go to
    ``local_image_dir``).

    This function blocks for the whole conversion and is meant to be called
    from a worker thread.
    """
    with _processing_lock:
        # Clear previous output files, like convert_pdf.py does.
        _remove_regular_files(local_image_dir)
        _remove_regular_files(local_md_dir)

        # Unique stem for this request's output files.
        name_without_suff = f"tmp{uuid.uuid4().hex}"
        # Relative path used for image links inside the markdown.
        image_dir_rel_path = os.path.basename(local_image_dir)

        image_writer = FileBasedDataWriter(local_image_dir)
        md_writer = FileBasedDataWriter(local_md_dir)

        # The dataset is built straight from the uploaded bytes.
        ds = PymuDocDataset(pdf_bytes)

        # Inference and pipeline based on PDF type
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        # Intermediate output files (useful for debugging / layout inspection)
        infer_result.draw_model(
            os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")
        )
        pipe_result.draw_layout(
            os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")
        )
        pipe_result.draw_span(
            os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")
        )
        pipe_result.dump_content_list(
            md_writer, f"{name_without_suff}_content_list.json", image_dir_rel_path
        )
        pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json")

        md_content = pipe_result.get_markdown(image_dir_rel_path)

        # Also persist the markdown next to the other artifacts.
        md_file_path = f"{name_without_suff}.md"
        pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
        print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")

        return md_content


# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Health check endpoint to verify the service is running.

    Returns the service status and current time.
    """
    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "service": "mineru-pdf-processor",
    }


@app.post("/extract", tags=["PDF Processing"])
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Process a PDF file using PymuDocDataset and return the extracted markdown content.

    Parameters:
        file: The PDF file to process

    Returns:
        A JSON object containing the extracted markdown and status. If
        processing fails, a 500 JSON response with ``error``, ``detail`` and
        ``filename`` keys is returned instead.

    Raises:
        HTTPException: 400 when the upload has no ``.pdf`` filename or is empty.
    """
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(
            status_code=400, detail="Invalid file. Please upload a PDF file."
        )

    content = await file.read()
    if not content:
        # Reject empty uploads up front instead of surfacing a parser error as a 500.
        raise HTTPException(
            status_code=400, detail="Invalid file. The uploaded PDF is empty."
        )

    try:
        # The pipeline is CPU-bound and blocking; run it in the threadpool so
        # the event loop (and /health) stays responsive meanwhile.
        md_content = await run_in_threadpool(_convert_pdf_to_markdown, content)
    except Exception as e:
        # Top-level boundary: log the failure and report it as a 500.
        error_detail = str(e)
        error_trace = traceback.format_exc()

        # Log the error
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)

        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename,
            },
        )

    # Return the markdown content in the response
    return {
        "filename": file.filename,
        "status": "success",
        "markdown_content": md_content,
    }


if __name__ == "__main__":
    # Keep uvicorn import here for local running
    import uvicorn

    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)