# NOTE: page-capture residue from the hosting UI ("Spaces: Sleeping") — not part of the program.
import logging
import os
import shutil
import sys
import tempfile
import traceback
from datetime import datetime
from typing import Any, Dict

import torch
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles

# Add the parent directory to sys.path to import convert_pdf_to_md
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from pdf_converter import convert_pdf_to_md  # noqa: E402  (needs the sys.path tweak above)
# Directories that receive generated files. They are created at import time
# (no-op when they already exist).
output_dir = "/app/output"
images_dir = "/app/output/images"
for _required_dir in (output_dir, images_dir):
    os.makedirs(_required_dir, exist_ok=True)
# Human-readable description passed to FastAPI as the application description.
app_description = """
# PDF to Markdown Converter API
This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.
## Features:
- PDF to Markdown conversion using marker
- Simple API interface
"""

# The ASGI application object; middleware and mounts are attached to it below.
app = FastAPI(
    version="1.0.0",
    title="PDF to Markdown API",
    description=app_description,
)
# Add CORS middleware to allow cross-origin requests from any origin.
# Credentials are disabled on purpose: wildcard origins/methods/headers are
# not permitted together with allow_credentials=True (see the FastAPI CORS
# documentation). If credentialed requests are ever required, replace the
# wildcards with explicit values before re-enabling credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)
# Expose everything under the output directory at the /output URL prefix.
app.mount(
    "/output",
    StaticFiles(directory=output_dir),
    name="output",
)
# Health check endpoint
# NOTE(review): no route decorator (e.g. ``@app.get(...)``) is visible on this
# handler here — confirm it is actually registered with ``app``.
async def health_check() -> Dict[str, Any]:
    """Report that the service is running.

    Returns:
        A dict with four keys:

        - ``status``: always ``"healthy"``.
        - ``timestamp``: the current local time as an ISO 8601 string that
          includes the UTC offset.
        - ``service``: the fixed service identifier.
        - ``gpu``: ``"CUDA enabled"`` when ``torch.cuda.is_available()`` is
          true, otherwise ``"CPU only"``.
    """
    gpu_status = "CUDA enabled" if torch.cuda.is_available() else "CPU only"
    return {
        "status": "healthy",
        # astimezone() attaches the local zone so the value is unambiguous;
        # the leading date/time portion is the same as the naive form.
        "timestamp": datetime.now().astimezone().isoformat(),
        "service": "pdf-to-markdown-converter",
        "gpu": gpu_status,
    }
_logger = logging.getLogger(__name__)


# NOTE(review): no route decorator (e.g. ``@app.post(...)``) is visible on this
# handler here — confirm it is actually registered with ``app``.
async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
    """Convert an uploaded PDF file to Markdown using marker.

    The upload is written to a temporary ``.pdf`` file, handed to
    ``convert_pdf_to_md.convert_pdf`` together with the target Markdown path
    inside the output directory, and the temporary file is removed afterwards.

    Parameters:
        file: The PDF file to process. Its filename must end in ``.pdf``
            (case-insensitive).

    Returns:
        On success, a dict with ``filename``, ``status`` (``"success"``),
        ``markdown_content`` (the value returned by ``convert_pdf``) and
        ``output_file`` (the ``/output/...`` path of the Markdown file).
        On any processing error, a ``JSONResponse`` with status 500 carrying
        ``error``, ``detail`` and ``filename``.

    Raises:
        HTTPException: 400 when the filename is missing or is not a ``.pdf``.
    """
    # NOTE: the return annotation is deliberately left as ``Dict[str, Any]``
    # even though the error path returns a JSONResponse; the signature is
    # kept unchanged so any route registration keeps working.
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    temp_pdf_path = None
    try:
        # Save the uploaded PDF to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            # Record the path before writing so the ``finally`` clause can
            # still remove the file if the write itself fails.
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(content)

        # basename() drops any directory components sent by the client.
        filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
        output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")

        # Process the PDF using marker.
        # NOTE(review): this call is synchronous inside a coroutine, so it
        # presumably blocks the event loop while it runs. Offloading it to a
        # worker thread would allow concurrent conversions — confirm that is
        # safe for the converter before changing it.
        md_content = convert_pdf_to_md.convert_pdf(temp_pdf_path, output_md_file)

        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content,
            "output_file": f"/output/{filename_without_ext}.md",
        }
    except Exception as exc:
        error_detail = str(exc)
        # logger.exception records the message together with the traceback.
        _logger.exception("Error processing PDF: %s", error_detail)
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                # filename was validated as non-empty above
                "filename": file.filename,
            },
        )
    finally:
        # Clean up the temporary file (best effort: never mask the response).
        if temp_pdf_path is not None:
            try:
                os.unlink(temp_pdf_path)
            except FileNotFoundError:
                pass  # already gone, nothing to clean up
            except OSError as cleanup_error:
                _logger.warning(
                    "Could not remove temporary file %s: %s",
                    temp_pdf_path,
                    cleanup_error,
                )
# Script entry point: serve the application with uvicorn when run directly.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=8000,
        reload=False,
    )