# File size: 4,112 Bytes
# 3d9ca9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
import tempfile
import os
import sys
import traceback
from datetime import datetime
from typing import Dict, Any
import shutil
import torch

# Put the directory one level above this file's folder on sys.path so that
# the pdf_converter import below can be resolved
_this_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(_this_dir))
from pdf_converter import convert_pdf_to_md

# Make sure the output locations exist at import time; directories that are
# already present are left untouched
output_dir = "/app/output"
images_dir = "/app/output/images"
for _required_dir in (output_dir, images_dir):
    os.makedirs(_required_dir, exist_ok=True)

# Markdown description handed to FastAPI as the application's description
app_description = """
# PDF to Markdown Converter API

This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.

## Features:
- PDF to Markdown conversion using marker
- Simple API interface
"""

# The application object that all routes and middleware below attach to
app = FastAPI(title="PDF to Markdown API", description=app_description, version="1.0.0")

# Add CORS middleware to allow cross-origin requests from any origin.
# Credentials stay disabled: the FastAPI CORS documentation states that
# allow_origins, allow_methods and allow_headers cannot be ["*"] when
# allow_credentials is True, and nothing in this module relies on cookies
# or other browser credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,  # Must be False while the wildcards are used
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Serve the contents of /app/output as static files under the /output path
_output_static_files = StaticFiles(directory="/app/output")
app.mount("/output", _output_static_files, name="output")

# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Report that the service is up.

    Returns:
        A JSON object with a fixed "healthy" status, the current time in
        ISO format, the service name, and whether torch reports CUDA as
        available.
    """
    current_time = datetime.now().isoformat()

    if torch.cuda.is_available():
        gpu_status = "CUDA enabled"
    else:
        gpu_status = "CPU only"

    report: Dict[str, Any] = {
        "status": "healthy",
        "timestamp": current_time,
        "service": "pdf-to-markdown-converter",
        "gpu": gpu_status,
    }
    return report

@app.post("/convert", tags=["PDF Processing"])
async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Convert an uploaded PDF file to markdown using marker.

    The upload is written to a temporary ``.pdf`` file, which is passed to
    ``convert_pdf_to_md.convert_pdf`` together with the target markdown path
    under ``/app/output``. The temporary file is removed afterwards whether
    or not the conversion succeeded.

    Parameters:
        file: The PDF file to process

    Returns:
        On success, a JSON object with the original filename, a "success"
        status, the markdown content and the ``/output/...`` path of the
        generated markdown file. If the conversion raises, a JSON response
        with status code 500 carrying the error message and the filename.

    Raises:
        HTTPException: with status 400 when the upload has no filename, its
            name does not end in ``.pdf`` (case-insensitive), or it is empty.
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # An empty upload can never be a valid PDF: report it as a client
        # error instead of letting the converter fail with a 500.
        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

    temp_pdf_path = None

    try:
        # Save the uploaded PDF to a temporary file. delete=False keeps the
        # file on disk after the with-block so the converter can open it by
        # path; the finally clause below is responsible for removing it.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            # Record the path before writing so the file is still cleaned up
            # if the write itself fails.
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(content)

        # basename() drops any directory components sent by the client, so
        # the output always lands directly inside /app/output.
        filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
        output_md_file = f"/app/output/{filename_without_ext}.md"

        # Process the PDF using marker.
        # NOTE(review): this call is not awaited, so it appears to run on the
        # event loop thread and would hold up other requests while it works.
        # Offloading it to a worker thread looks worthwhile, but confirm that
        # the converter is safe to run concurrently first.
        md_content = convert_pdf_to_md.convert_pdf(temp_pdf_path, output_md_file)

        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content,
            "output_file": f"/output/{filename_without_ext}.md"
        }

    except Exception as e:
        # Request-level boundary: any conversion failure is logged and
        # reported to the client as a 500 response.
        error_detail = str(e)
        error_trace = traceback.format_exc()

        # Log the error
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)

        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename
            }
        )

    finally:
        # Clean up the temporary file. This stays best-effort: a failure
        # here must not replace the response produced above.
        if temp_pdf_path is not None:
            try:
                os.unlink(temp_pdf_path)
            except FileNotFoundError:
                # Already gone, nothing left to clean up.
                pass
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_pdf_path}: {cleanup_error}")

if __name__ == "__main__":
    # uvicorn is only needed when this module is executed as a script
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000, "reload": False}
    uvicorn.run("main:app", **server_options)