Spaces:

marcosremar2
/

docker_mineru

Sleeping

File size: 3,816 Bytes

import json
import logging
import os
import tempfile
import traceback
from datetime import datetime
from typing import Dict, List, Any, Optional

import magic_pdf
import uvicorn
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

# Markdown-formatted text handed to FastAPI as the API description.
app_description = """
# MinerU PDF Processor API

This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and tables from PDF documents.

## Features:
- PDF text extraction
- Table detection and extraction
- JSON response for easy integration
"""

# The application object that every route and middleware below attaches to.
app = FastAPI(
    version="1.0.0",
    title="MinerU PDF API",
    contact={"name": "PDF Converter Service"},
    description=app_description,
)

# Add CORS middleware to allow cross-origin requests from any origin.
#
# Credentials are deliberately disabled: FastAPI's CORS documentation states
# that wildcard origins/methods/headers cannot be combined with
# allow_credentials=True. None of the endpoints visible in this module use
# cookies or auth headers; if credentialed requests are ever required, list
# the trusted origins explicitly instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,  # Must stay off while origins is a wildcard
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Report that the service is up.

    Returns:
        A dictionary with a fixed ``status`` of ``"healthy"``, the current
        time from ``datetime.now()`` as an ISO-8601 string under
        ``timestamp``, and the ``service`` identifier.
    """
    now_iso = datetime.now().isoformat()
    payload: Dict[str, Any] = dict(
        status="healthy",
        timestamp=now_iso,
        service="mineru-pdf-processor",
    )
    return payload

_logger = logging.getLogger(__name__)


def _page_to_dict(page: Any) -> Dict[str, Any]:
    """Convert one parsed page into the JSON-serializable shape returned by /extract."""
    return {
        "page_num": page.page_num,
        "text": "\n".join(block.text for block in page.text_blocks),
        "tables": [table.to_markdown() for table in page.tables],
    }


@app.post("/extract", tags=["PDF Processing"])
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Extract text and tables from a PDF file.

    Parameters:
        file: The PDF file to process. Its filename must end in ``.pdf``
            (case-insensitive) and its content must not be empty.

    Returns:
        On success, ``{"result": {"filename": ..., "pages": [...]}}`` where
        each page holds ``page_num``, the joined ``text`` of its text blocks,
        and its ``tables`` rendered as Markdown.
        If processing fails, a ``JSONResponse`` with status 500 carrying
        ``error``, ``detail`` and ``filename``.

    Raises:
        HTTPException: 400 when the filename is missing or not ``.pdf``, or
            when the uploaded file is empty.
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # An empty upload can never be parsed; report it as a client error
        # instead of letting the parser fail with a generic 500.
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    temp_pdf_path: Optional[str] = None

    try:
        # Save the uploaded PDF to a temporary file. delete=False keeps the
        # file on disk after the handle is closed so the parser can open it
        # by path; removal happens in the finally clause below.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            # Record the path before writing so the file is still cleaned up
            # if the write itself fails (e.g. disk full).
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(content)

        # NOTE(review): assumes the installed magic_pdf exposes
        # PDF(path).parse() returning an object with a .pages iterable;
        # confirm against the pinned magic_pdf version.
        # NOTE: parse() is called synchronously inside this coroutine, so the
        # event loop is occupied until it returns.
        result = magic_pdf.PDF(temp_pdf_path).parse()

        output = {
            "filename": file.filename,
            "pages": [_page_to_dict(page) for page in result.pages],
        }
        return {"result": output}

    except Exception as exc:
        # Top-level boundary for the request: log with traceback and turn any
        # processing failure into a structured 500 response.
        _logger.exception("Error processing PDF: %s", exc)

        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": str(exc),
                "filename": file.filename,
            },
        )

    finally:
        # Clean up the temporary file. This is best-effort: a failure here
        # must not mask the response, but it is logged rather than swallowed.
        if temp_pdf_path is not None:
            try:
                os.unlink(temp_pdf_path)
            except FileNotFoundError:
                pass  # Already gone; nothing to clean up.
            except OSError as cleanup_error:
                _logger.warning(
                    "Could not remove temporary file %s: %s",
                    temp_pdf_path,
                    cleanup_error,
                )

if __name__ == "__main__":
    # Pass the application object instead of the "app:app" import string.
    # The string form only resolves when this file is named app.py and makes
    # uvicorn import the module a second time; the object form is valid here
    # because reload is disabled.
    uvicorn.run(app, host="0.0.0.0", port=7860, reload=False)