# AmmarFahmy
# adding all files
# 105b369
from typing import List
from phi.document.base import Document
from phi.document.reader.base import Reader
from phi.aws.resource.s3.object import S3Object
from phi.utils.log import logger
class S3PDFReader(Reader):
    """Reader for PDF files on S3.

    Downloads the body of an ``S3Object``, parses it with ``pypdf`` and
    produces one ``Document`` per page.
    """

    def read(self, s3_object: S3Object) -> List[Document]:
        """Read a PDF stored on S3 and return its pages as documents.

        Args:
            s3_object: The S3 object whose body contains the PDF bytes. Its
                ``name`` is used to derive the document name and its ``uri``
                is logged.

        Returns:
            One ``Document`` per page, with ``meta_data["page"]`` numbered
            from 1 and ``id`` of the form ``"<doc_name>_<page_number>"``.
            When ``self.chunk`` is truthy, each page document is instead
            passed through ``self.chunk_document`` and the resulting pieces
            are returned as one flat list, in page order.

        Raises:
            ValueError: If ``s3_object`` is falsy.
            ImportError: If ``pypdf`` cannot be imported.
        """
        from io import BytesIO

        if not s3_object:
            raise ValueError("No s3_object provided")

        # Imported lazily so that a missing `pypdf` only surfaces, with a
        # clear message, when a PDF is actually read.
        try:
            from pypdf import PdfReader as DocumentReader  # noqa: F401
        except ImportError as err:
            raise ImportError("`pypdf` not installed") from err

        logger.info(f"Reading: {s3_object.uri}")
        object_resource = s3_object.get_resource()
        object_body = object_resource.get()["Body"]

        # Last path segment of the key, cut at the FIRST ".", with spaces
        # turned into underscores. NOTE(review): a name such as "a.b.pdf"
        # therefore becomes "a"; kept as-is because existing document ids
        # may depend on it.
        file_name = s3_object.name.split("/")[-1]
        doc_name = file_name.split(".")[0].replace(" ", "_")

        # The whole body is read into memory so pypdf gets a seekable stream.
        doc_reader = DocumentReader(BytesIO(object_body.read()))
        documents = [
            Document(
                name=doc_name,
                id=f"{doc_name}_{page_number}",
                meta_data={"page": page_number},
                content=page.extract_text(),
            )
            for page_number, page in enumerate(doc_reader.pages, start=1)
        ]

        if not self.chunk:
            return documents

        # Flatten the per-page chunks into a single list, preserving order.
        return [
            chunk
            for document in documents
            for chunk in self.chunk_document(document)
        ]