File size: 3,183 Bytes
a467a2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from fastapi import FastAPI, HTTPException
from dotenv import load_dotenv
import boto3
import os
import uvicorn
import logging
from uuid import uuid4
from pydantic import BaseModel
from helper import PdfToSectionConverter

# Load environment variables via python-dotenv before anything reads them.
load_dotenv()

# Module-wide logger; the root logger is configured at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# AWS settings are read once, at import time, from the process environment.
s3_access_key_id = os.environ.get("S3_ACCESS_KEY_ID")
s3_secret_key = os.environ.get("S3_SECRET_KEY")
aws_region = os.environ.get("AWS_REGION")

# Fail fast: refuse to start if any of the three settings is unset or empty.
if not (s3_access_key_id and s3_secret_key and aws_region):
    logger.error("Missing AWS S3 credentials in environment variables.")
    raise ValueError("AWS credentials not set properly.")

# The FastAPI application object that the route decorators below attach to.
app = FastAPI()

# Shared S3 client built from the validated settings above.
_s3_client_options = {
    "aws_access_key_id": s3_access_key_id,
    "aws_secret_access_key": s3_secret_key,
    "region_name": aws_region,
}
s3_client = boto3.client("s3", **_s3_client_options)

# Request body accepted by the POST /convert_pdf endpoint.
class PdfRequest(BaseModel):
    # Location of the source PDF; the handler rejects values that do not
    # start with "s3://".
    s3_file_path: str
    # Passed through to the converter as `file_title`.
    file_title: str
    # Passed through to the converter as `doc_id`.
    doc_id : str
    # Passed through to the converter as `start_page_no` / `end_page_no`.
    # Both default to 0; how the converter interprets 0 is not visible here.
    start_page: int = 0
    end_page: int = 0

@app.get("/")
async def start():
    # Readiness endpoint: always answers with the same fixed message.
    ready_payload = {"message": "Parser API is Ready"}
    return ready_payload

def _split_s3_path(s3_file_path):
    """Split an ``s3://bucket/key`` URI into a ``(bucket, key)`` pair.

    Raises:
        HTTPException: 400 if the ``s3://`` prefix is missing, or if either
            the bucket or the object key part is empty.
    """
    scheme = "s3://"
    if not s3_file_path.startswith(scheme):
        raise HTTPException(status_code=400, detail="Invalid S3 file path. Must start with 's3://'")

    # Strip only the leading scheme. str.replace would also rewrite any later
    # "s3://" that happens to appear inside the object key.
    bucket_name, _, object_key = s3_file_path[len(scheme):].partition("/")
    if not bucket_name or not object_key:
        raise HTTPException(status_code=400, detail="Invalid S3 file path format.")
    return bucket_name, object_key


@app.post("/convert_pdf")
async def convert_pdf(request: PdfRequest):
    """Download a PDF from S3, run the section converter on it, and return the result.

    Args:
        request: Payload carrying the S3 URI of the PDF, the title and
            document id, and the page bounds forwarded to the converter.

    Returns:
        ``{"status": "success", "data": <converter output>}``.

    Raises:
        HTTPException: 400 for a malformed S3 path; 500 if the download
            fails or any other unexpected error occurs.
    """
    bucket_name, object_key = _split_s3_path(request.s3_file_path)

    output_dir = "/tmp"
    # One file per request: a shared fixed name would let concurrent requests
    # overwrite or delete each other's download.
    output_path = os.path.join(output_dir, f"temp_file_{uuid4().hex}.pdf")

    try:
        # exist_ok makes a separate existence check unnecessary.
        os.makedirs(output_dir, exist_ok=True)

        logger.info(f"Downloading {request.s3_file_path} from S3 bucket {bucket_name}...")

        # NOTE(review): download and conversion are blocking calls inside an
        # async handler; consider offloading them to a worker thread.
        try:
            s3_client.download_file(bucket_name, object_key, output_path)
        except Exception as e:
            logger.error(f"Failed to download file from S3: {str(e)}")
            raise HTTPException(status_code=500, detail="Error downloading file from S3.") from e

        # Initialize and run the converter
        converter = PdfToSectionConverter()
        output = converter.convert(
            downloaded_pdf_path=output_path,
            file_title=request.file_title,
            doc_id=request.doc_id,
            start_page_no=request.start_page,
            end_page_no=request.end_page,
        )

        return {"status": "success", "data": output}

    except HTTPException:
        # Already carries the intended status code; pass it through untouched.
        raise
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal Server Error.") from e
    finally:
        # Always remove the temporary file, including when the download or
        # the conversion failed, so failed requests do not leak files.
        try:
            os.remove(output_path)
        except FileNotFoundError:
            pass  # Nothing was downloaded, so there is nothing to clean up.
        except OSError as e:
            # Best-effort cleanup: do not mask the real outcome of the request.
            logger.warning(f"Could not remove temporary file {output_path}: {str(e)}")

def start_server():
    """Log a startup message, then run uvicorn against ``app:app``."""
    logger.info("Starting Server...")
    # Listen on all interfaces, port 8000, with auto-reload switched on.
    server_options = {"host": "0.0.0.0", "port": 8000, "reload": True}
    uvicorn.run("app:app", **server_options)

# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    start_server()