# Listing captured from a hosted "Spaces" file viewer; status at capture time: Sleeping.
# File size: 3,816 bytes.
# Commits shown in the viewer's blame gutter: f30c298, 44df236, 78bc6bc.
# The viewer's line-number gutter (1-129) was page residue, not program text.
import json
import logging
import os
import tempfile
import traceback
from datetime import datetime
from typing import Dict, List, Any, Optional

import magic_pdf
import uvicorn
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
# Application metadata
# Markdown text handed to FastAPI as the API description.
app_description = """
# MinerU PDF Processor API
This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and tables from PDF documents.
## Features:
- PDF text extraction
- Table detection and extraction
- JSON response for easy integration
"""

# The ASGI application object; routes and middleware in this module attach to it.
app = FastAPI(
    version="1.0.0",
    title="MinerU PDF API",
    contact={"name": "PDF Converter Service"},
    description=app_description,
)
# Add CORS middleware to allow cross-origin requests from any origin.
#
# Credentials (cookies / Authorization headers) are deliberately NOT allowed:
# the CORS rules do not permit a wildcard origin together with credentials,
# and FastAPI's documentation states that allow_origins / allow_methods /
# allow_headers must not be ["*"] when allow_credentials is True. No endpoint
# visible in this file reads cookies or auth headers, so nothing here needs
# credentialed cross-origin requests.
# NOTE(review): if a browser client does need credentials, list its exact
# origin(s) in allow_origins instead of re-enabling credentials with "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)
# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """Report that the service is up.

    Returns:
        A dict with three keys: ``"status"`` (always ``"healthy"``),
        ``"timestamp"`` (the result of ``datetime.now().isoformat()``, i.e.
        a naive timestamp with no UTC offset) and ``"service"`` (the fixed
        service identifier).
    """
    current_time = datetime.now().isoformat()
    payload: Dict[str, Any] = dict(
        status="healthy",
        timestamp=current_time,
        service="mineru-pdf-processor",
    )
    return payload
# Logger for the /extract handler and its helpers.
_logger = logging.getLogger(__name__)


def _serialize_page(page: Any) -> Dict[str, Any]:
    """Turn one parsed page into a plain dict of page number, text and tables.

    The page object is read through ``page_num``, ``text_blocks`` (each with
    a ``text`` attribute) and ``tables`` (each with ``to_markdown()``).
    """
    return {
        "page_num": page.page_num,
        "text": "\n".join(block.text for block in page.text_blocks),
        "tables": [table.to_markdown() for table in page.tables],
    }


def _remove_temp_file(path: Optional[str]) -> None:
    """Delete the temporary PDF on a best-effort basis; never raises."""
    if path is None:
        return
    try:
        os.unlink(path)
    except FileNotFoundError:
        # Already gone: nothing left to clean up.
        pass
    except OSError:
        # Cleanup must not mask the handler's real result, so only log it.
        _logger.warning("Could not remove temporary file %s", path, exc_info=True)


@app.post("/extract", tags=["PDF Processing"])
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """Extract text and tables from an uploaded PDF file.

    Parameters:
        file: The uploaded PDF. Its filename must end in ``.pdf``
            (case-insensitive) and its body must not be empty.

    Returns:
        ``{"result": {"filename": ..., "pages": [...]}}`` where every page
        entry holds ``page_num``, ``text`` (the page's text blocks joined
        with newlines) and ``tables`` (one Markdown string per table).
        If processing fails, a ``JSONResponse`` with status 500 and the keys
        ``error``, ``detail`` and ``filename`` is returned instead.

    Raises:
        HTTPException: 400 when the filename is missing or not ``.pdf``, or
            when the uploaded body is empty.
    """
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # An empty body is a client error; reject it before invoking the parser.
        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

    temp_pdf_path: Optional[str] = None
    try:
        # Save the upload to disk because the parser is given a file path.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            # Record the path before writing so a failed write is still cleaned up.
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(content)
        # The file is closed (and therefore flushed) before it is parsed.

        # NOTE(review): confirm that the installed magic_pdf version exposes
        # PDF(path).parse() and the page attributes used by _serialize_page.
        result = magic_pdf.PDF(temp_pdf_path).parse()

        output = {
            "filename": file.filename,
            "pages": [_serialize_page(page) for page in result.pages],
        }
        return {"result": output}
    except Exception as exc:
        # Top-level boundary for the request: log with traceback and answer
        # with a structured 500 rather than letting the exception propagate.
        error_detail = str(exc)
        _logger.exception("Error processing PDF: %s", error_detail)
        # NOTE(review): "detail" exposes the raw exception message to the
        # client; kept so the existing response shape does not change.
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename,
            },
        )
    finally:
        # Clean up the temporary file whether parsing succeeded or failed.
        _remove_temp_file(temp_pdf_path)
if __name__ == "__main__":
    # Start the server when the file is executed directly: listen on all
    # interfaces, port 7860, with auto-reload off. "app:app" is uvicorn's
    # "module:attribute" import string, so this presumably expects the file
    # to be saved as app.py.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)