from pathlib import Path
from typing import List, Union, IO, Any

from phi.document.base import Document
from phi.document.reader.base import Reader
from phi.utils.log import logger


class PDFReader(Reader):
    """Reader for PDF files"""

    def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
        if not pdf:
            raise ValueError("No pdf provided")

        try:
            from pypdf import PdfReader as DocumentReader  # noqa: F401
        except ImportError:
            raise ImportError("`pypdf` not installed")

        # Derive a document name from the file path or file-like object,
        # falling back to "pdf" if neither yields a usable name.
        doc_name = ""
        try:
            if isinstance(pdf, str):
                doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
            else:
                doc_name = pdf.name.split(".")[0]
        except Exception:
            doc_name = "pdf"

        logger.info(f"Reading: {doc_name}")
        doc_reader = DocumentReader(pdf)

        # Create one Document per page, with 1-based page numbers in the metadata.
        documents = [
            Document(
                name=doc_name,
                id=f"{doc_name}_{page_number}",
                meta_data={"page": page_number},
                content=page.extract_text(),
            )
            for page_number, page in enumerate(doc_reader.pages, start=1)
        ]
        # Optionally split each page-level Document into smaller chunks.
        if self.chunk:
            chunked_documents = []
            for document in documents:
                chunked_documents.extend(self.chunk_document(document))
            return chunked_documents
        return documents
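
# Example usage (a minimal sketch; "sample.pdf" is a hypothetical local file,
# and `chunk` / `chunk_document` come from the Reader base class):
#
#   reader = PDFReader()
#   docs = reader.read("sample.pdf")
#   for doc in docs:
#       print(doc.id, doc.meta_data["page"], len(doc.content))
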
class PDFUrlReader(Reader):
    """Reader for PDF files from URL"""

    def read(self, url: str) -> List[Document]:
        if not url:
            raise ValueError("No url provided")

        from io import BytesIO

        try:
            import httpx
        except ImportError:
            raise ImportError("`httpx` not installed")

        try:
            from pypdf import PdfReader as DocumentReader  # noqa: F401
        except ImportError:
            raise ImportError("`pypdf` not installed")

        logger.info(f"Reading: {url}")
        # Download the PDF and parse it from an in-memory buffer.
        response = httpx.get(url)
        doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
        doc_reader = DocumentReader(BytesIO(response.content))

        # Create one Document per page, with 1-based page numbers in the metadata.
        documents = [
            Document(
                name=doc_name,
                id=f"{doc_name}_{page_number}",
                meta_data={"page": page_number},
                content=page.extract_text(),
            )
            for page_number, page in enumerate(doc_reader.pages, start=1)
        ]
        # Optionally split each page-level Document into smaller chunks.
        if self.chunk:
            chunked_documents = []
            for document in documents:
                chunked_documents.extend(self.chunk_document(document))
            return chunked_documents
        return documents
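
# Example usage (a minimal sketch; the URL is hypothetical, and note the
# request above is not checked for HTTP errors before parsing):
#
#   url_reader = PDFUrlReader()
#   docs = url_reader.read("https://example.com/sample.pdf")
#   print(len(docs), "pages read")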