Spaces:
Sleeping
Sleeping
from fastapi import FastAPI, HTTPException | |
from dotenv import load_dotenv | |
import boto3 | |
import os | |
import uvicorn | |
import logging | |
from uuid import uuid4 | |
from pydantic import BaseModel | |
from helper import PdfToSectionConverter | |
# Run python-dotenv first so the os.getenv() lookups below can see its values.
load_dotenv()

# Root logging at INFO; the rest of this module logs through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# AWS settings, each read from its own environment variable.
s3_access_key_id = os.getenv("S3_ACCESS_KEY_ID")
s3_secret_key = os.getenv("S3_SECRET_KEY")
aws_region = os.getenv("AWS_REGION")

# Fail at import time when any of the three is unset or empty, so the
# service never starts with a half-configured S3 client.
if not (s3_access_key_id and s3_secret_key and aws_region):
    logger.error("Missing AWS S3 credentials in environment variables.")
    raise ValueError("AWS credentials not set properly.")

# ASGI application object (referenced by the "app:app" import string).
app = FastAPI()

# One S3 client for the whole module, built from the settings above.
s3_client = boto3.client(
    "s3",
    region_name=aws_region,
    aws_access_key_id=s3_access_key_id,
    aws_secret_access_key=s3_secret_key,
)
class PdfRequest(BaseModel):
    # Request body accepted by convert_pdf.

    # Location of the PDF; convert_pdf rejects values that do not start
    # with "s3://".
    s3_file_path: str

    # Title and document id, forwarded unchanged to the converter.
    file_title: str
    doc_id: str

    # Page bounds, forwarded to the converter as start_page_no / end_page_no.
    # NOTE(review): what the default 0 means (and whether indices are 0- or
    # 1-based) is decided by PdfToSectionConverter -- confirm there.
    start_page: int = 0
    end_page: int = 0
# NOTE(review): no route decorator is visible on this handler in the reviewed
# text -- confirm it is registered on `app`.
async def start():
    """Return a fixed readiness message for the parser API."""
    readiness = {"message": "Parser API is Ready"}
    return readiness
# NOTE(review): no route decorator is visible on this handler in the reviewed
# text -- confirm it is registered on `app`.
async def convert_pdf(request: PdfRequest):
    """Download a PDF from S3, run the section converter on it, return the result.

    Args:
        request: The ``s3://bucket/key`` location of the PDF plus the title,
            document id and page bounds that are forwarded to the converter.

    Returns:
        ``{"status": "success", "data": <converter output>}``.

    Raises:
        HTTPException: 400 when ``s3_file_path`` is not a well-formed
            ``s3://bucket/key`` path; 500 when the download fails or any
            other error occurs.
    """
    s3_prefix = "s3://"
    output_dir = "/tmp"
    # One file per request: with a single fixed name, concurrent requests
    # would overwrite (or delete) each other's download.
    output_path = os.path.join(output_dir, f"{uuid4().hex}.pdf")

    try:
        os.makedirs(output_dir, exist_ok=True)

        if not request.s3_file_path.startswith(s3_prefix):
            raise HTTPException(status_code=400, detail="Invalid S3 file path. Must start with 's3://'")

        # Drop only the leading scheme; str.replace would also strip any
        # later "s3://" that happens to appear inside the object key.
        location = request.s3_file_path[len(s3_prefix):]
        bucket_name, _, object_key = location.partition("/")
        if not bucket_name or not object_key:
            raise HTTPException(status_code=400, detail="Invalid S3 file path format.")

        logger.info(f"Downloading {request.s3_file_path} from S3 bucket {bucket_name}...")

        # NOTE(review): neither the download nor the conversion is awaited,
        # so both run on the event loop thread for their full duration.
        # Consider moving them to a worker thread if requests must overlap.
        try:
            s3_client.download_file(bucket_name, object_key, output_path)
        except Exception as exc:
            logger.exception("Failed to download file from S3: %s", exc)
            raise HTTPException(status_code=500, detail="Error downloading file from S3.") from exc

        converter = PdfToSectionConverter()
        output = converter.convert(
            downloaded_pdf_path=output_path,
            file_title=request.file_title,
            doc_id=request.doc_id,
            start_page_no=request.start_page,
            end_page_no=request.end_page,
        )
        return {"status": "success", "data": output}
    except HTTPException:
        # Already carries the intended status code; pass it through untouched.
        raise
    except Exception as exc:
        logger.exception("Unexpected error: %s", exc)
        raise HTTPException(status_code=500, detail="Internal Server Error.") from exc
    finally:
        # Remove the download on every exit path, not only on success.
        try:
            os.remove(output_path)
        except FileNotFoundError:
            # Validation or the download failed before a file was created.
            pass
        except OSError:
            # Best-effort cleanup: do not mask the real outcome of the request.
            logger.warning("Could not remove temporary file %s", output_path, exc_info=True)
def start_server(host: str = "0.0.0.0", port: int = 8000, reload: bool = True) -> None:
    """Serve the API with uvicorn.

    Args:
        host: Address to bind. The default listens on all interfaces.
        port: TCP port to listen on.
        reload: Forwarded to uvicorn's ``reload`` option. Auto-reload is a
            development aid; pass ``False`` for a deployed instance.
    """
    logger.info("Starting Server...")
    # The application is given as an import string ("module:attribute").
    # NOTE(review): this assumes the module is importable as `app` -- confirm
    # against the actual filename.
    uvicorn.run("app:app", host=host, port=port, reload=reload)


# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    start_server()