"""PDF readers: load a local PDF file or a PDF fetched from a URL into Documents."""
from pathlib import Path
from typing import IO, Any, List, Union

from phi.document.base import Document
from phi.document.reader.base import Reader
from phi.utils.log import logger
class PDFReader(Reader):
    """Reader for local PDF files (string path, Path, or open file object)."""

    def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
        """Read a PDF into one Document per page.

        Args:
            pdf: Path to a PDF file (str or Path), or a binary file-like object.

        Returns:
            A list of Documents, one per page; chunked further when self.chunk is set.

        Raises:
            ValueError: If no pdf is provided.
            ImportError: If `pypdf` is not installed.
        """
        if not pdf:
            raise ValueError("No pdf provided")

        try:
            from pypdf import PdfReader as DocumentReader  # noqa: F401
        except ImportError as e:
            # Chain the cause so the original import failure is visible.
            raise ImportError("`pypdf` not installed") from e

        # Derive a safe document name: file-name stem with spaces replaced.
        # Path handles both "/" and "\\" separators; the original str branch
        # only split on "/" and only the str branch replaced spaces.
        try:
            if isinstance(pdf, (str, Path)):
                doc_name = Path(pdf).stem.replace(" ", "_")
            else:
                # File-like object: fall back to its .name attribute if present.
                doc_name = Path(getattr(pdf, "name", "pdf")).stem.replace(" ", "_")
        except Exception:
            doc_name = "pdf"

        logger.info(f"Reading: {doc_name}")
        doc_reader = DocumentReader(pdf)
        documents = [
            Document(
                name=doc_name,
                id=f"{doc_name}_{page_number}",
                meta_data={"page": page_number},
                content=page.extract_text(),
            )
            for page_number, page in enumerate(doc_reader.pages, start=1)
        ]
        if self.chunk:
            chunked_documents = []
            for document in documents:
                chunked_documents.extend(self.chunk_document(document))
            return chunked_documents
        return documents
class PDFUrlReader(Reader):
    """Reader for PDF files fetched over HTTP(S) from a URL."""

    def read(self, url: str) -> List[Document]:
        """Download a PDF from `url` and read it into one Document per page.

        Args:
            url: HTTP(S) URL of the PDF file.

        Returns:
            A list of Documents, one per page; chunked further when self.chunk is set.

        Raises:
            ValueError: If no url is provided.
            ImportError: If `httpx` or `pypdf` is not installed.
            httpx.HTTPStatusError: If the server responds with an error status.
        """
        if not url:
            raise ValueError("No url provided")

        from io import BytesIO

        try:
            import httpx
        except ImportError as e:
            raise ImportError("`httpx` not installed") from e

        try:
            from pypdf import PdfReader as DocumentReader  # noqa: F401
        except ImportError as e:
            raise ImportError("`pypdf` not installed") from e

        logger.info(f"Reading: {url}")
        response = httpx.get(url)
        # Fail fast on 4xx/5xx instead of trying to parse an error page as a PDF.
        response.raise_for_status()

        # Document name: last URL path segment without its extension, spaces
        # replaced. (The original's .replace("/", "_") after split("/")[-1] was
        # dead code — the segment cannot contain "/".) Fall back for URLs that
        # end in "/".
        doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_") or "pdf"

        doc_reader = DocumentReader(BytesIO(response.content))
        documents = [
            Document(
                name=doc_name,
                id=f"{doc_name}_{page_number}",
                meta_data={"page": page_number},
                content=page.extract_text(),
            )
            for page_number, page in enumerate(doc_reader.pages, start=1)
        ]
        if self.chunk:
            chunked_documents = []
            for document in documents:
                chunked_documents.extend(self.chunk_document(document))
            return chunked_documents
        return documents