File size: 1,597 Bytes
105b369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from typing import List

from phi.document.base import Document
from phi.document.reader.base import Reader
from phi.aws.resource.s3.object import S3Object
from phi.utils.log import logger


class S3PDFReader(Reader):
    """Reader that turns a PDF file stored on S3 into per-page documents."""

    def read(self, s3_object: S3Object) -> List[Document]:
        """Download a PDF from S3 and return its pages as documents.

        Args:
            s3_object: The S3 object to read. Its ``uri`` is logged, its
                ``name`` is used to derive the document name, and
                ``get_resource().get()["Body"]`` supplies the PDF bytes.

        Returns:
            One ``Document`` per PDF page, with pages numbered from 1. When
            ``self.chunk`` is truthy, every page document is passed through
            ``self.chunk_document`` and the flattened results are returned
            instead.

        Raises:
            ValueError: If ``s3_object`` is falsy.
            ImportError: If ``pypdf`` is not installed.
        """
        from io import BytesIO

        if not s3_object:
            raise ValueError("No s3_object provided")

        # pypdf is an optional dependency, so it is imported lazily here.
        try:
            from pypdf import PdfReader as DocumentReader
        except ImportError as exc:
            raise ImportError("`pypdf` not installed") from exc

        logger.info(f"Reading: {s3_object.uri}")

        object_resource = s3_object.get_resource()
        object_body = object_resource.get()["Body"]
        # Presumably a botocore StreamingBody (file-like) -- confirm against
        # S3Object.get_resource(). Close it even when the download fails so
        # the underlying stream is not left open.
        try:
            pdf_bytes = object_body.read()
        finally:
            object_body.close()

        # Last path segment of the key, cut at the FIRST dot, spaces replaced
        # by underscores.
        # NOTE(review): "report.2024.pdf" and "report.2023.pdf" both become
        # "report", so their page ids collide. Left unchanged to keep existing
        # ids stable.
        file_name = s3_object.name.split("/")[-1]
        doc_name = file_name.split(".")[0].replace(" ", "_")

        doc_reader = DocumentReader(BytesIO(pdf_bytes))
        documents = [
            Document(
                name=doc_name,
                id=f"{doc_name}_{page_number}",
                meta_data={"page": page_number},
                content=page.extract_text(),
            )
            for page_number, page in enumerate(doc_reader.pages, start=1)
        ]

        if not self.chunk:
            return documents

        chunked_documents = []
        for document in documents:
            chunked_documents.extend(self.chunk_document(document))
        return chunked_documents