from pathlib import Path
from typing import List, Union, IO, Any

from phi.document.base import Document
from phi.document.reader.base import Reader
from phi.utils.log import logger


def _build_documents(doc_reader: Any, doc_name: str) -> List[Document]:
    """Create one Document per page of an already-opened PDF reader.

    Args:
        doc_reader: Object exposing a ``pages`` iterable whose items provide
            ``extract_text()`` (a ``pypdf.PdfReader`` in this module).
        doc_name: Name given to every Document and used as the id prefix.

    Returns:
        Documents in page order; ids are ``"<doc_name>_<page>"`` and
        ``meta_data["page"]`` holds the 1-based page number.
    """
    return [
        Document(
            name=doc_name,
            id=f"{doc_name}_{page_number}",
            meta_data={"page": page_number},
            content=page.extract_text(),
        )
        for page_number, page in enumerate(doc_reader.pages, start=1)
    ]


class PDFReader(Reader):
    """Reader for PDF files"""

    def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
        """Read a PDF and return its pages as Documents.

        Args:
            pdf: Path to the PDF as a string, a ``Path``, or a file-like object.

        Returns:
            One Document per page, or the chunks produced by
            ``self.chunk_document`` for each page when ``self.chunk`` is truthy.

        Raises:
            ValueError: If ``pdf`` is falsy.
            ImportError: If ``pypdf`` is not installed.
        """
        if not pdf:
            raise ValueError("No pdf provided")

        try:
            from pypdf import PdfReader as DocumentReader  # noqa: F401
        except ImportError as exc:
            raise ImportError("`pypdf` not installed") from exc

        doc_name = ""
        try:
            if isinstance(pdf, str):
                doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
            else:
                # Path objects and named file objects both expose `.name`.
                doc_name = pdf.name.split(".")[0]
        except Exception:
            # Best effort: e.g. in-memory streams have no usable `.name`.
            doc_name = "pdf"

        logger.info(f"Reading: {doc_name}")
        doc_reader = DocumentReader(pdf)
        documents = _build_documents(doc_reader, doc_name)

        if self.chunk:
            return [chunk for document in documents for chunk in self.chunk_document(document)]
        return documents


class PDFUrlReader(Reader):
    """Reader for PDF files from URL"""

    def read(self, url: str) -> List[Document]:
        """Download a PDF from ``url`` and return its pages as Documents.

        Args:
            url: Location of the PDF to download.

        Returns:
            One Document per page, or the chunks produced by
            ``self.chunk_document`` for each page when ``self.chunk`` is truthy.

        Raises:
            ValueError: If ``url`` is falsy.
            ImportError: If ``httpx`` or ``pypdf`` is not installed.
            httpx.HTTPStatusError: If the server answers with a non-success
                status code.
        """
        if not url:
            raise ValueError("No url provided")

        from io import BytesIO

        try:
            import httpx
        except ImportError as exc:
            raise ImportError("`httpx` not installed") from exc

        try:
            from pypdf import PdfReader as DocumentReader  # noqa: F401
        except ImportError as exc:
            raise ImportError("`pypdf` not installed") from exc

        logger.info(f"Reading: {url}")
        response = httpx.get(url)
        # Fail early on error responses instead of handing an error page to
        # the PDF parser, which would surface as a misleading parse error.
        response.raise_for_status()

        # The last path segment cannot contain "/", so only spaces need replacing.
        doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
        doc_reader = DocumentReader(BytesIO(response.content))
        documents = _build_documents(doc_reader, doc_name)

        if self.chunk:
            return [chunk for document in documents for chunk in self.chunk_document(document)]
        return documents