# Spaces: marcosremar2/docker_mineru (status: Sleeping; file size: 6,441 bytes)

from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import tempfile
import os
import json
import traceback
from datetime import datetime
from typing import Dict, List, Any, Optional

# Import necessary components from magic_pdf based on convert_pdf.py
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# Application metadata: Markdown overview handed to FastAPI as the API description
app_description = """
# MinerU PDF Processor API

This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and generates markdown from PDF documents.

## Features:
- PDF text extraction
- Markdown conversion
- Layout analysis (via output files)
"""

# The ASGI application object; routes and middleware below are registered on it
app = FastAPI(
    version="1.0.0",
    title="MinerU PDF API",
    contact=dict(name="PDF Converter Service"),
    description=app_description,
)

# Add CORS middleware to allow cross-origin requests from any origin.
# Credentials (cookies / Authorization headers) are deliberately NOT allowed:
# wildcard origins, methods and headers must not be combined with
# allow_credentials=True, and no endpoint in this file uses cookies or
# authentication.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,  # Must stay False while the wildcards above/below are used
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Output locations, relative to the process working directory (the app's
# working directory inside the container). Images live in a sub-directory of
# the markdown output directory.
local_image_dir = "output/images"
local_md_dir = "output"

# Make sure both directories exist before any request is served
# (parent first, then the nested image directory).
os.makedirs(local_md_dir, exist_ok=True)
os.makedirs(local_image_dir, exist_ok=True)

# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Report that the service is up.

    Returns:
        A JSON object with a fixed "healthy" status, the current time as an
        ISO 8601 string (naive, no timezone offset) and the service name.
    """
    current_time = datetime.now().isoformat()
    return dict(
        status="healthy",
        timestamp=current_time,
        service="mineru-pdf-processor",
    )

def _clear_output_files() -> None:
    """Delete the regular files a previous request left in the output directories.

    Sub-directories are skipped in both locations: ``os.remove`` cannot delete
    them, and the image directory itself is nested inside the markdown
    directory.
    """
    for directory in (local_image_dir, local_md_dir):
        for entry in os.listdir(directory):
            entry_path = os.path.join(directory, entry)
            if os.path.isfile(entry_path):
                os.remove(entry_path)


def _convert_pdf_to_markdown(pdf_bytes: bytes, base_name: str) -> str:
    """Run the magic-pdf pipeline on *pdf_bytes* and return the markdown.

    Besides returning the markdown, this writes the intermediate artefacts
    (model/layout/span PDFs, content list, middle JSON) and the markdown file
    itself into the output directories, all named after *base_name*.
    """
    # Image links in the markdown are emitted relative to the markdown directory
    image_dir_rel_path = os.path.basename(local_image_dir)

    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)

    # The dataset is built straight from the uploaded bytes
    ds = PymuDocDataset(pdf_bytes)

    # PDFs classified as needing OCR go through the OCR pipeline, all others
    # through the text pipeline
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    # Optional: intermediate output files (not needed for the API response itself)
    infer_result.draw_model(os.path.join(local_md_dir, f"{base_name}_model.pdf"))
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{base_name}_layout.pdf"))
    pipe_result.draw_span(os.path.join(local_md_dir, f"{base_name}_spans.pdf"))
    pipe_result.dump_content_list(md_writer, f"{base_name}_content_list.json", image_dir_rel_path)
    pipe_result.dump_middle_json(md_writer, f'{base_name}_middle.json')

    md_content = pipe_result.get_markdown(image_dir_rel_path)

    # Also persist the markdown next to the other artefacts (useful for debugging)
    md_file_path = f"{base_name}.md"
    pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
    print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")

    return md_content


@app.post("/extract", tags=["PDF Processing"])
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Process a PDF file using PymuDocDataset and return the extracted markdown content.

    Parameters:
        file: The PDF file to process

    Returns:
        A JSON object containing the original filename, a "success" status and
        the extracted markdown. If processing fails, a 500 JSON response with
        the keys "error", "detail" and "filename" is returned instead.

    Raises:
        HTTPException: 400 if the upload has no ".pdf" filename or is empty.
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # Reject empty uploads up front instead of letting the parser fail with a 500
        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

    temp_pdf_path = None

    try:
        # The pipeline consumes the uploaded bytes directly; the (empty) temporary
        # file only reserves a unique base name for this request's output files.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf_path = temp_pdf.name

        # NOTE(review): the output directories are shared by all requests and are
        # cleared here, so overlapping requests would clobber each other's files.
        _clear_output_files()

        name_without_suff = os.path.splitext(os.path.basename(temp_pdf_path))[0]
        md_content = _convert_pdf_to_markdown(content, name_without_suff)

        # Return the markdown content in the response
        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content
        }

    except Exception as e:
        error_detail = str(e)
        error_trace = traceback.format_exc()

        # Log the error
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)

        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename
            }
        )

    finally:
        # Best-effort removal of the temporary file; a cleanup failure must not
        # replace the response, but it is logged rather than silently ignored.
        if temp_pdf_path:
            try:
                os.unlink(temp_pdf_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_pdf_path}: {cleanup_error}")

if __name__ == "__main__":
    # uvicorn is imported here so it is only required when this file is run
    # directly as a script.
    from uvicorn import run as run_server

    # "app:app" refers to the `app` object in a module named `app`
    # (presumably this file is deployed as app.py; verify against the deployment).
    run_server("app:app", host="0.0.0.0", port=7860, reload=False)