Spaces:
Runtime error
Runtime error
from pathlib import Path | |
from typing import List | |
from phi.document.base import Document | |
from phi.document.reader.base import Reader | |
from phi.aws.resource.s3.object import S3Object | |
from phi.utils.log import logger | |
class S3TextReader(Reader):
    """Reader for text files on S3"""

    def read(self, s3_object: S3Object) -> List[Document]:
        """Download an S3 object, extract its text and return it as Documents.

        The object is downloaded to a temporary file under the local
        ``storage`` directory and parsed with ``textract``. The temporary
        file is removed afterwards, whether or not parsing succeeded and
        whether or not the result is chunked.

        Args:
            s3_object: The S3 object to read. Its ``name`` supplies the file
                name of the temporary download and the document name/id.

        Returns:
            A one-element list holding the extracted text, or the documents
            produced by ``self.chunk_document`` when ``self.chunk`` is truthy.
            An empty list is returned (and the error logged) if downloading
            or parsing fails.

        Raises:
            ValueError: If ``s3_object`` is falsy.
            ImportError: If ``textract`` is not installed.
        """
        if not s3_object:
            raise ValueError("No s3_object provided")

        try:
            import textract  # noqa: F401
        except ImportError as exc:
            raise ImportError("`textract` not installed") from exc

        # Stays None until the download target is known, so the cleanup in
        # `finally` only runs for a path this call was about to write.
        temporary_file = None
        try:
            logger.info(f"Reading: {s3_object.uri}")

            # Last path segment of the key, e.g. "docs/my file.txt" -> "my file.txt".
            obj_name = s3_object.name.split("/")[-1]
            if not obj_name:
                # A key ending in "/" has no file name; joining an empty name
                # would make the temp path the storage directory itself.
                raise ValueError(f"S3 object name has no file component: {s3_object.name!r}")

            download_path = Path("storage").joinpath(obj_name)
            # Nothing visible here guarantees the directory exists; creating
            # it is a no-op when it already does.
            download_path.parent.mkdir(parents=True, exist_ok=True)
            temporary_file = download_path
            s3_object.download(temporary_file)

            logger.info(f"Parsing: {temporary_file}")
            # Document name is the text before the first "." with spaces
            # replaced, e.g. "my file.v2.txt" -> "my_file".
            doc_name = obj_name.split(".")[0].replace(" ", "_")
            doc_content = textract.process(temporary_file)
            documents = [
                Document(
                    name=doc_name,
                    id=doc_name,
                    content=doc_content.decode("utf-8"),
                )
            ]

            if self.chunk:
                chunked_documents = []
                for document in documents:
                    chunked_documents.extend(self.chunk_document(document))
                return chunked_documents
            return documents
        except Exception as e:
            # Best-effort reader: report the failure and return no documents.
            logger.error(f"Error reading: {s3_object.uri}: {e}")
            return []
        finally:
            # Runs on every exit path (plain return, chunked return, error)
            # so the downloaded file is never left behind.
            if temporary_file is not None:
                logger.debug(f"Deleting: {temporary_file}")
                try:
                    temporary_file.unlink()
                except FileNotFoundError:
                    # The download failed before creating the file.
                    pass
                except OSError as cleanup_error:
                    # A cleanup problem must not discard a successful read.
                    logger.warning(f"Could not delete {temporary_file}: {cleanup_error}")