Spaces:
Runtime error
Runtime error
from typing import List | |
from phi.document.base import Document | |
from phi.document.reader.base import Reader | |
from phi.aws.resource.s3.object import S3Object | |
from phi.utils.log import logger | |
class S3PDFReader(Reader):
    """Reader for PDF files on S3.

    Downloads an S3 object, parses it with ``pypdf`` and produces one
    ``Document`` per PDF page.
    """

    def read(self, s3_object: S3Object) -> List[Document]:
        """Read a PDF stored on S3 and return its pages as documents.

        The whole object body is read into memory before parsing. Each page
        becomes a ``Document`` whose ``id`` is ``"<doc_name>_<page_number>"``
        and whose ``meta_data`` holds the 1-based page number. When
        ``self.chunk`` is truthy, every page document is passed through
        ``self.chunk_document`` and the resulting chunks are returned
        instead, flattened into a single list.

        Args:
            s3_object: The S3 object to read. Its ``uri``, ``name`` and
                ``get_resource()`` members are used.

        Returns:
            The page documents, or their chunks when chunking is enabled.

        Raises:
            ValueError: If ``s3_object`` is falsy.
            ImportError: If ``pypdf`` is not installed.
        """
        from io import BytesIO

        if not s3_object:
            raise ValueError("No s3_object provided")

        # pypdf is an optional dependency, so it is imported lazily here.
        try:
            from pypdf import PdfReader as DocumentReader
        except ImportError as import_error:
            raise ImportError("`pypdf` not installed") from import_error

        logger.info(f"Reading: {s3_object.uri}")
        object_resource = s3_object.get_resource()
        object_body = object_resource.get()["Body"]

        # Document name: last path segment of the key, cut at the FIRST "."
        # (so "a.b.pdf" yields "a"), with spaces replaced by underscores.
        file_name = s3_object.name.split("/")[-1]
        doc_name = file_name.split(".")[0].replace(" ", "_")

        doc_reader = DocumentReader(BytesIO(object_body.read()))
        documents = [
            Document(
                name=doc_name,
                id=f"{doc_name}_{page_number}",
                meta_data={"page": page_number},
                content=page.extract_text(),
            )
            for page_number, page in enumerate(doc_reader.pages, start=1)
        ]

        if self.chunk:
            return [
                chunk
                for document in documents
                for chunk in self.chunk_document(document)
            ]
        return documents