AmmarFahmy
adding all files
105b369
from pathlib import Path
from typing import List
from phi.document.base import Document
from phi.document.reader.base import Reader
from phi.aws.resource.s3.object import S3Object
from phi.utils.log import logger
class S3TextReader(Reader):
    """Reader for text files on S3"""

    def read(self, s3_object: S3Object) -> List[Document]:
        """Download an S3 object to local disk, extract its text and wrap it in Documents.

        The object is downloaded to ``storage/<object file name>``, parsed with
        ``textract`` and decoded as UTF-8. The temporary file is removed on every
        exit path (plain return, chunked return, or failure).

        Args:
            s3_object: The S3 object to read. Its ``name``, ``uri`` and
                ``download`` members are used.

        Returns:
            A single-element list holding the parsed document, or the chunks of
            that document when ``self.chunk`` is truthy. An empty list is
            returned if downloading or parsing fails; the error is logged.

        Raises:
            ValueError: If ``s3_object`` is falsy.
            ImportError: If the ``textract`` package is not installed.
        """
        if not s3_object:
            raise ValueError("No s3_object provided")

        try:
            import textract  # noqa: F401
        except ImportError:
            raise ImportError("`textract` not installed")

        # Set once the download target is known, so the `finally` clause can
        # tell whether there is anything to clean up.
        temporary_file = None
        try:
            logger.info(f"Reading: {s3_object.uri}")

            # Only the last path component of the key is used for the local file.
            obj_name = s3_object.name.split("/")[-1]
            temporary_file = Path("storage").joinpath(obj_name)
            s3_object.download(temporary_file)

            logger.info(f"Parsing: {temporary_file}")
            # Document name: file name up to the first dot, spaces replaced by
            # underscores. `obj_name` can no longer contain "/" at this point.
            doc_name = obj_name.split(".")[0].replace(" ", "_")
            doc_content = textract.process(temporary_file)
            documents = [
                Document(
                    name=doc_name,
                    id=doc_name,
                    content=doc_content.decode("utf-8"),
                )
            ]

            if self.chunk:
                return [
                    chunk
                    for document in documents
                    for chunk in self.chunk_document(document)
                ]
            return documents
        except Exception as e:
            logger.error(f"Error reading: {s3_object.uri}: {e}")
            return []
        finally:
            # Cleanup lives in `finally` so the chunked return and the error
            # path no longer leave the downloaded file behind.
            if temporary_file is not None and temporary_file.exists():
                logger.debug(f"Deleting: {temporary_file}")
                try:
                    temporary_file.unlink()
                except OSError as e:
                    # A failed cleanup must not discard successfully parsed documents.
                    logger.warning(f"Could not delete {temporary_file}: {e}")