Spaces:

marcosremar2
/

docker_mineru

Sleeping

App Files Community

docker_mineru / app.py

marcosremar2

Enhance FastAPI implementation with better documentation, error handling and examples

f30c298 about 2 months ago

raw

history blame

3.82 kB

	import json
	import logging
	import os
	import tempfile
	import traceback
	from datetime import datetime
	from typing import Dict, List, Any, Optional

	import magic_pdf
	import uvicorn
	from fastapi import FastAPI, UploadFile, File, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import JSONResponse

	# Long-form description handed to FastAPI below; it is shown in the
	# generated API documentation.
	app_description = """
	# MinerU PDF Processor API

	This API provides PDF processing capabilities using MinerU's magic-pdf library.
	It extracts text content and tables from PDF documents.

	## Features:
	- PDF text extraction
	- Table detection and extraction
	- JSON response for easy integration
	"""

	# The single application object; middleware and routes are registered on it.
	app = FastAPI(
	    title="MinerU PDF API",
	    description=app_description,
	    version="1.0.0",
	    contact=dict(name="PDF Converter Service"),
	)

	# Let browser clients on any origin call the API.
	# Credentials are deliberately disabled: a wildcard origin must not be
	# combined with credentialed requests (the middleware would otherwise echo
	# back whatever origin asks), and no endpoint visible in this module relies
	# on cookies or authentication headers.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],  # any origin
	    allow_credentials=False,  # must stay False while origins is a wildcard
	    allow_methods=["*"],  # any HTTP method
	    allow_headers=["*"],  # any request header
	)

	# Liveness probe
	@app.get("/health", tags=["Health"])
	async def health_check() -> Dict[str, Any]:
	    """
	    Report that the service is up.

	    Returns a dictionary with a fixed "healthy" status, the current server
	    time in ISO 8601 form, and the service name.
	    """
	    current_time = datetime.now().isoformat()
	    return dict(
	        status="healthy",
	        timestamp=current_time,
	        service="mineru-pdf-processor",
	    )

	def _page_to_dict(page: Any) -> Dict[str, Any]:
	    """Flatten one parsed page into a JSON-serialisable dictionary."""
	    return {
	        "page_num": page.page_num,
	        "text": "\n".join(block.text for block in page.text_blocks),
	        "tables": [table.to_markdown() for table in page.tables],
	    }


	@app.post("/extract", tags=["PDF Processing"])
	async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
	    """
	    Extract text and tables from an uploaded PDF file.

	    Parameters:
	        file: The PDF file to process. Its filename must end in ".pdf"
	            (case-insensitive) and its content must not be empty.

	    Returns:
	        A JSON object of the form {"result": {"filename": ..., "pages": [...]}}.
	        Each page carries "page_num", the newline-joined "text" of its text
	        blocks, and "tables" rendered as Markdown strings. If processing
	        fails, a 500 response with "error", "detail" and "filename" keys is
	        returned instead.

	    Raises:
	        HTTPException: 400 if the filename is missing or not a ".pdf", or if
	            the uploaded file is empty.
	    """
	    if not file.filename or not file.filename.lower().endswith(".pdf"):
	        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

	    content = await file.read()
	    if not content:
	        # An empty body can never parse; answer with a client error up front
	        # instead of letting the parser fail and surface as a 500.
	        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

	    logger = logging.getLogger(__name__)
	    temp_pdf_path = None

	    try:
	        # The parser is handed a filesystem path, so persist the upload first.
	        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
	            # Record the path BEFORE writing: with delete=False the file is
	            # ours to remove, and a failed write must still be cleaned up.
	            temp_pdf_path = temp_pdf.name
	            temp_pdf.write(content)

	        # NOTE(review): parse() runs synchronously inside this coroutine, so
	        # the event loop is blocked while it works. Consider a worker thread
	        # once magic_pdf is confirmed to be thread-safe.
	        # NOTE(review): confirm the installed magic_pdf exposes PDF(...).parse().
	        result = magic_pdf.PDF(temp_pdf_path).parse()

	        output = {
	            "filename": file.filename,
	            "pages": [_page_to_dict(page) for page in result.pages],
	        }
	        return {"result": output}

	    except Exception as e:
	        # Request boundary: log the full traceback server-side and turn any
	        # processing failure into a structured 500 response.
	        logger.exception("Error processing PDF: %s", e)
	        return JSONResponse(
	            status_code=500,
	            content={
	                "error": "Error processing PDF",
	                "detail": str(e),
	                "filename": file.filename,
	            },
	        )

	    finally:
	        # Best-effort removal of the temporary file; never mask the response.
	        if temp_pdf_path is not None:
	            try:
	                os.unlink(temp_pdf_path)
	            except FileNotFoundError:
	                pass  # already gone, nothing to clean up
	            except OSError as cleanup_error:
	                logger.warning(
	                    "Could not remove temporary file %s: %s",
	                    temp_pdf_path,
	                    cleanup_error,
	                )

	if __name__ == "__main__":
	    # Direct execution: serve the app on all network interfaces, port 7860,
	    # with auto-reload switched off.
	    server_options = {"host": "0.0.0.0", "port": 7860, "reload": False}
	    uvicorn.run("app:app", **server_options)