File size: 4,158 Bytes
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0743bb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9002555
 
 
 
 
 
0743bb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9002555
0743bb0
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import boto3
import tempfile
import fitz
from io import BytesIO

from fastapi import HTTPException


class Loader:
    """Small wrapper around a boto3 S3 client for storing and reading PDFs.

    The bucket name and region are fixed in ``__init__``; AWS credentials are
    read from the ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY``
    environment variables.
    """

    def __init__(self):
        # Create the S3 client with credentials taken from the environment.
        self.bucket_name = "multimedika"
        self.s3_client = boto3.client(
            "s3",
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name="us-west-2",
        )

    def upload_to_s3(self, file, object_name, folder_name="summarizer"):
        """Split a PDF into single-page PDFs and upload each page to S3.

        Page ``n`` (1-based) is stored under the key
        ``{folder_name}/{object_name}/{n}.pdf``, or ``{object_name}/{n}.pdf``
        when ``folder_name`` is empty.

        Args:
            file: Object exposing the PDF as a binary stream at ``file.file``
                (presumably a FastAPI ``UploadFile`` -- confirm with callers).
                The stream is read fully into memory.
            object_name: Key prefix under which the pages are stored.
            folder_name: Optional folder prepended to ``object_name``.

        Raises:
            HTTPException: Status 400 if parsing the PDF or any upload fails.
        """
        try:
            # If folder_name is provided, prepend it to the object_name
            if folder_name:
                object_name = f"{folder_name}/{object_name}"

            # Context managers guarantee the PyMuPDF documents are closed even
            # when a page fails to convert or upload.
            with fitz.open(stream=file.file.read(), filetype="pdf") as pdf_document:
                for page_num in range(pdf_document.page_count):
                    # Serialize the current page as a standalone PDF in memory
                    page_stream = BytesIO()
                    with fitz.open() as single_page_pdf:
                        single_page_pdf.insert_pdf(
                            pdf_document, from_page=page_num, to_page=page_num
                        )
                        single_page_pdf.save(page_stream)

                    # Rewind so the upload starts from the first byte
                    page_stream.seek(0)

                    # Pages are numbered from 1 in the object key
                    page_object_name = f"{object_name}/{page_num + 1}.pdf"

                    self.s3_client.upload_fileobj(
                        page_stream, self.bucket_name, page_object_name
                    )

                    print(f"Page {page_num + 1} of '{object_name}' successfully uploaded as '{page_object_name}' to bucket '{self.bucket_name}'.")

        except Exception as e:
            # Chain the cause so the original traceback is preserved in logs.
            raise HTTPException(status_code=400, detail=f"Error uploading to AWS: {e}") from e

    def get_file_aws(self, object_name, local_file_name=None):
        """Download a PDF from S3 and print its page count and page text.

        The object is downloaded into a temporary directory that is removed
        again before returning, whether or not reading succeeds.

        Args:
            object_name: Key of the object inside the configured bucket.
            local_file_name: File name used for the temporary local copy;
                defaults to ``"downloaded_pdf_file.pdf"``.

        Raises:
            HTTPException: Status 400 if the download or reading the PDF fails.
        """
        if local_file_name is None:
            local_file_name = "downloaded_pdf_file.pdf"  # Default file name

        try:
            # TemporaryDirectory removes both the file and the directory on
            # exit, so nothing is left behind on success or failure.
            with tempfile.TemporaryDirectory() as temp_dir:
                file_path = os.path.join(temp_dir, local_file_name)

                # Download the file from S3
                with open(file_path, "wb") as temp_file:
                    self.s3_client.download_fileobj(
                        self.bucket_name, object_name, temp_file
                    )

                # The document is closed before the directory is cleaned up
                with fitz.open(file_path) as doc:
                    print(f"Number of pages: {doc.page_count}")
                    for page in doc:
                        print(page.get_text())

        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Error get file file in aws: {e}") from e