# Spaces: Sleeping  (page-header residue captured along with this file; not part of the program)
from fastapi import FastAPI, UploadFile, File, HTTPException | |
from fastapi.responses import JSONResponse | |
from fastapi.middleware.cors import CORSMiddleware | |
import tempfile | |
import os | |
import json | |
import traceback | |
from datetime import datetime | |
from typing import Dict, List, Any, Optional | |
# Import necessary components from magic_pdf based on convert_pdf.py | |
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader | |
from magic_pdf.data.dataset import PymuDocDataset | |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze | |
from magic_pdf.config.enums import SupportedPdfParseMethod | |
# Markdown-formatted description text handed to the FastAPI constructor below.
app_description = """
# MinerU PDF Processor API
This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and generates markdown from PDF documents.
## Features:
- PDF text extraction
- Markdown conversion
- Layout analysis (via output files)
"""

# The ASGI application object; route handlers and middleware attach to it.
app = FastAPI(
    title="MinerU PDF API",
    version="1.0.0",
    description=app_description,
    contact={"name": "PDF Converter Service"},
)

# Fully permissive CORS policy: any origin, any method, any header.
# NOTE(review): FastAPI's CORS documentation states that wildcard values
# should not be combined with allow_credentials=True -- confirm whether
# credentialed cross-origin requests are actually needed by any client.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)

# Output locations, relative to the process working directory. They are
# created at import time so request handlers can assume they exist.
local_image_dir = "output/images"
local_md_dir = "output"
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)
# Health check endpoint
@app.get("/health")
async def health_check() -> Dict[str, Any]:
    """
    Health check endpoint to verify the service is running.

    Returns:
        A dict with three keys: "status" (always "healthy"), "timestamp"
        (the current naive local time as an ISO 8601 string) and "service"
        (the fixed service identifier).

    NOTE(review): the handler had no route decorator, so it was never
    registered with the application. The "/health" path is assumed from
    the function name -- confirm it against existing clients/monitors.
    """
    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "service": "mineru-pdf-processor",
    }
def _clear_output_files(directory: str) -> None:
    """Delete the regular files directly inside *directory*; subdirectories are left alone."""
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)


@app.post("/extract")
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Process a PDF file using PymuDocDataset and return the extracted markdown content.

    Parameters:
        file: The uploaded PDF file to process. Its filename must end in ".pdf"
            (case-insensitive).

    Returns:
        On success, a dict with "filename", "status" ("success") and
        "markdown_content". If processing fails, a JSONResponse with status
        500 carrying "error", "detail" and "filename".

    Raises:
        HTTPException: status 400 when the upload has no ".pdf" filename or
            when the uploaded file is empty.

    NOTE(review): the handler had no route decorator, so it was never
    registered with the application. The "/extract" path is assumed from
    the function name -- confirm it against existing clients.
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # Reject empty uploads up front instead of letting the parser fail with a 500.
        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

    temp_pdf_path = None
    try:
        # Save the uploaded PDF to a temporary file. The dataset below is built
        # from the in-memory bytes; the temporary file's unique name is what
        # the generated output files are named after.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf.write(content)
            temp_pdf_path = temp_pdf.name

        # The output directories are shared by all requests: clearing them here
        # discards the files produced by the previous request. Only regular
        # files are removed, so a stray subdirectory cannot make the request fail.
        _clear_output_files(local_image_dir)
        _clear_output_files(local_md_dir)

        # Get filename and prepare output paths for magic-pdf
        pdf_file_name = os.path.basename(temp_pdf_path)
        name_without_suff = os.path.splitext(pdf_file_name)[0]
        # Relative path used for image links inside the generated markdown
        image_dir_rel_path = str(os.path.basename(local_image_dir))

        # Setup writers
        image_writer = FileBasedDataWriter(local_image_dir)
        md_writer = FileBasedDataWriter(local_md_dir)

        # Use PymuDocDataset for processing (pdf bytes are passed directly)
        ds = PymuDocDataset(content)

        # Inference and pipeline based on PDF type
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        # Intermediate output files (not needed for the response itself)
        infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
        pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
        pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
        pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir_rel_path)
        pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')

        # Get markdown content
        md_content = pipe_result.get_markdown(image_dir_rel_path)

        # Also dump the markdown to a file (useful for debugging/access)
        md_file_path = f"{name_without_suff}.md"
        pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
        print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")

        # Return the markdown content in the response
        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content,
        }
    except Exception as e:
        # Top-level boundary: log the failure and report it as a 500 response.
        error_detail = str(e)
        error_trace = traceback.format_exc()
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename,
            },
        )
    finally:
        # Best-effort removal of the temporary file; a failure here must not
        # mask the response, so it is only logged.
        if temp_pdf_path is not None:
            try:
                os.unlink(temp_pdf_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Warning: could not remove temporary file {temp_pdf_path}: {cleanup_error}")
if __name__ == "__main__":
    # uvicorn is imported here rather than at module top: it is only needed
    # when this file is launched directly for local running.
    import uvicorn

    # "app:app" is an import string: the `app` object inside a module named `app`.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        reload=False,
    )