dsmultimedika's picture
Build Application
9002555
raw
history blame
1.35 kB
import io
from typing import List
from typing import Optional

from fastapi import UploadFile
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_parse import LlamaParse
from PyPDF2 import PdfReader
class Reader(BaseReader):
    """Reader that converts an uploaded PDF into one document per page."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of every non-empty page of an uploaded PDF.

        Args:
            file: The uploaded PDF file.

        Returns:
            One ``Document`` per page that contains text, in page order. Each
            document stores its 1-based page number under the ``"page"``
            metadata key. Pages whose extracted text is empty or whitespace
            only are skipped.

        Raises:
            RuntimeError: If the upload cannot be read or parsed. The
                underlying exception is chained as ``__cause__``.
        """
        try:
            # Rewind first so the full payload is read even if the caller
            # has already consumed part of the upload stream.
            await file.seek(0)
            file_content = await file.read()

            # Parse the bytes that were actually read, from an in-memory
            # buffer, instead of re-reading the underlying file object.
            reader = PdfReader(io.BytesIO(file_content))

            documents = []
            for page_num, page in enumerate(reader.pages, start=1):
                # extract_text() may return nothing; treat that as empty.
                text = (page.extract_text() or "").strip()
                if text:
                    documents.append(
                        Document(text=text, metadata={"page": page_num})
                    )
            return documents
        except Exception as e:
            # Boundary handler: report the failure and surface a single
            # exception type to callers while keeping the original cause.
            print(f"Error reading PDF file: {e}")
            raise RuntimeError(f"Failed to process the uploaded file: {e}") from e