Bot_Development / service /aws_loader.py
dsmultimedika's picture
Update Repository
0743bb0
raw
history blame
4.16 kB
import os
import boto3
import tempfile
import fitz
from io import BytesIO
from fastapi import HTTPException
class Loader:
    """Upload PDFs to, and read PDFs from, an S3 bucket.

    Uploaded PDFs are split with PyMuPDF (``fitz``) and stored one page per
    S3 object. Credentials are read from the ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` environment variables.
    """

    def __init__(self, bucket_name="multimedika", region_name="us-west-2"):
        """Create the S3 client.

        Args:
            bucket_name: Bucket every upload/download goes through.
            region_name: AWS region passed to the S3 client.
        """
        self.bucket_name = bucket_name
        # Only an S3 client is created here; credentials come from the
        # process environment and may be None if the variables are unset.
        self.s3_client = boto3.client(
            "s3",
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name=region_name,
        )

    @staticmethod
    def _extract_page(pdf_document, page_num):
        """Return page ``page_num`` of ``pdf_document`` as a one-page PDF stream."""
        page_stream = BytesIO()
        single_page_pdf = fitz.open()  # New, empty PDF
        try:
            single_page_pdf.insert_pdf(
                pdf_document, from_page=page_num, to_page=page_num
            )
            single_page_pdf.save(page_stream)
        finally:
            # Release the temporary document even if insert/save fails.
            single_page_pdf.close()
        # Rewind so the uploader reads from the first byte.
        page_stream.seek(0)
        return page_stream

    def upload_to_s3(self, file, object_name, folder_name="summarizer"):
        """Split an uploaded PDF into pages and upload each page to S3.

        Page ``n`` (1-based) is stored under the key
        ``<folder_name>/<object_name>/<n>.pdf``; when ``folder_name`` is
        falsy the folder prefix is omitted.

        Args:
            file: Object exposing a binary ``.file.read()``
                (presumably a FastAPI ``UploadFile`` -- confirm with caller).
            object_name: Key prefix under which the pages are stored.
            folder_name: Optional folder prepended to ``object_name``.

        Raises:
            HTTPException: Status 400 if reading, splitting or uploading fails.
        """
        try:
            # If folder_name is provided, prepend it to the object_name
            if folder_name:
                object_name = f"{folder_name}/{object_name}"

            # Open the PDF from memory with PyMuPDF (fitz)
            pdf_document = fitz.open(stream=file.file.read(), filetype="pdf")
            try:
                for page_num in range(pdf_document.page_count):
                    page_stream = self._extract_page(pdf_document, page_num)

                    # Object key for this page, e.g. 'summarizer/<object_name>/1.pdf'
                    page_object_name = f"{object_name}/{page_num + 1}.pdf"

                    self.s3_client.upload_fileobj(
                        page_stream, self.bucket_name, page_object_name
                    )
                    print(f"Page {page_num + 1} of '{object_name}' successfully uploaded as '{page_object_name}' to bucket '{self.bucket_name}'.")
            finally:
                # Always release the source document, including on failure.
                pdf_document.close()
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Error uploading to AWS: {e}") from e

    def get_file_aws(self, object_name, local_file_name=None):
        """Download a PDF from S3 and print its page count and page text.

        The object is downloaded into a temporary directory that is removed
        again before this method returns, whether or not it succeeds.

        Args:
            object_name: S3 key of the PDF to download.
            local_file_name: File name used inside the temporary directory;
                defaults to ``"downloaded_pdf_file.pdf"``.

        Raises:
            HTTPException: Status 400 if the download or PDF read fails.
        """
        if local_file_name is None:
            local_file_name = "downloaded_pdf_file.pdf"  # Default file name

        try:
            # TemporaryDirectory removes the directory and the downloaded
            # file on exit, so nothing is left behind on any code path.
            with tempfile.TemporaryDirectory() as temp_dir:
                file_path = os.path.join(temp_dir, local_file_name)

                # Download the file from S3
                with open(file_path, "wb") as temp_file:
                    self.s3_client.download_fileobj(
                        self.bucket_name, object_name, temp_file
                    )

                # Open and read the PDF using PyMuPDF
                doc = fitz.open(file_path)
                try:
                    print(f"Number of pages: {doc.page_count}")
                    for page in doc:
                        print(page.get_text())
                finally:
                    # Close before the directory is cleaned up so the file
                    # handle is released first.
                    doc.close()
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Error get file file in aws: {e}") from e