Spaces:

Multimedika
/

Bot_Development

Sleeping

App Files Files Community

Bot_Development / service /aws_loader.py

dsmultimedika

Update Repository

0743bb0 6 months ago

raw

history blame

4.16 kB

	import os
	import boto3
	import tempfile
	import fitz
	from io import BytesIO

	from fastapi import HTTPException


	class Loader:
	    """S3 helper that stores PDFs page-by-page and reads them back.

	    Uploads split a PDF into single-page PDF objects that share a key
	    prefix; downloads fetch one object into a temporary location and print
	    its page count and text.
	    """

	    def __init__(self, bucket_name="multimedika", region_name="us-west-2"):
	        """Create the S3 client used by every other method.

	        Credentials are taken from the ``AWS_ACCESS_KEY_ID`` and
	        ``AWS_SECRET_ACCESS_KEY`` environment variables.

	        Args:
	            bucket_name: Bucket that uploads go to and downloads come from.
	            region_name: AWS region handed to the S3 client.
	        """
	        self.bucket_name = bucket_name
	        self.s3_client = boto3.client(
	            "s3",
	            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
	            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
	            region_name=region_name,
	        )

	    def upload_to_s3(self, file, object_name, folder_name="summarizer"):
	        """Split a PDF into single-page PDFs and upload each page to S3.

	        Page ``N`` (1-based) is stored under the key
	        ``<folder_name>/<object_name>/<N>.pdf``, or
	        ``<object_name>/<N>.pdf`` when ``folder_name`` is falsy.

	        Args:
	            file: Object whose ``file`` attribute is a readable binary stream
	                holding the PDF (presumably a FastAPI ``UploadFile`` --
	                confirm against callers).
	            object_name: Key prefix shared by all pages of this PDF.
	            folder_name: Optional leading folder prepended to ``object_name``.

	        Raises:
	            HTTPException: With status 400 if reading, splitting or uploading
	                fails for any reason.
	        """
	        try:
	            if folder_name:
	                object_name = f"{folder_name}/{object_name}"

	            # The context managers guarantee both documents are closed even
	            # when a page fails to convert or upload.
	            with fitz.open(stream=file.file.read(), filetype="pdf") as pdf_document:
	                for page_num in range(pdf_document.page_count):
	                    page_object_name = f"{object_name}/{page_num + 1}.pdf"
	                    page_stream = BytesIO()

	                    # Copy exactly one page into a fresh, empty PDF and
	                    # serialize it in memory.
	                    with fitz.open() as single_page_pdf:
	                        single_page_pdf.insert_pdf(
	                            pdf_document, from_page=page_num, to_page=page_num
	                        )
	                        single_page_pdf.save(page_stream)

	                    # upload_fileobj reads from the current position.
	                    page_stream.seek(0)
	                    self.s3_client.upload_fileobj(
	                        page_stream, self.bucket_name, page_object_name
	                    )

	                    print(f"Page {page_num + 1} of '{object_name}' successfully uploaded as '{page_object_name}' to bucket '{self.bucket_name}'.")

	        except Exception as e:
	            raise HTTPException(
	                status_code=400, detail=f"Error uploading to AWS: {e}"
	            ) from e

	    def get_file_aws(self, object_name, local_file_name=None):
	        """Download a PDF from S3 and print its page count and page text.

	        The object is written into a temporary directory that is removed,
	        together with the downloaded file, before this method returns or
	        raises.

	        Args:
	            object_name: Key of the object to download from the bucket.
	            local_file_name: File name used for the temporary copy; defaults
	                to ``"downloaded_pdf_file.pdf"``. Only the final path
	                component is used.

	        Raises:
	            HTTPException: With status 400 if the download or the PDF read
	                fails for any reason.
	        """
	        if local_file_name is None:
	            local_file_name = "downloaded_pdf_file.pdf"  # Default file name

	        try:
	            # TemporaryDirectory deletes the directory and its contents on
	            # exit, including on error paths.
	            with tempfile.TemporaryDirectory() as temp_dir:
	                # Keep the file inside temp_dir so cleanup always covers it,
	                # even if the caller passed a name with directory parts.
	                file_path = os.path.join(
	                    temp_dir, os.path.basename(local_file_name)
	                )

	                with open(file_path, "wb") as temp_file:
	                    self.s3_client.download_fileobj(
	                        self.bucket_name, object_name, temp_file
	                    )

	                # Close the document before the directory is removed.
	                with fitz.open(file_path) as doc:
	                    print(f"Number of pages: {doc.page_count}")
	                    for page in doc:
	                        print(page.get_text())

	        except Exception as e:
	            raise HTTPException(
	                status_code=400, detail=f"Error get file file in aws: {e}"
	            ) from e