import io
import logging
from typing import List, Optional, Union

from fastapi import UploadFile
from fastapi.responses import JSONResponse
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from PyPDF2 import PdfReader

logger = logging.getLogger(__name__)


class Reader(BaseReader):
    """Reader that turns an uploaded PDF into one ``Document`` per page."""

    async def read_from_uploadfile(
        self, file: UploadFile
    ) -> Union[List[Document], JSONResponse]:
        """Extract the text of every page of an uploaded PDF.

        Args:
            file: The uploaded PDF. It is rewound and read in full, so the
                result does not depend on where the upload's cursor was.

        Returns:
            On success, a list with exactly one ``Document`` per page, in
            page order. Each document's text is the stripped page text
            (``""`` for pages without extractable text) and its metadata
            holds the 1-based page number under the ``"page"`` key.

            On any failure, a ``JSONResponse`` with status code 500 whose
            content describes the error. No exception propagates to the
            caller, so callers must check the type of the result.
        """
        try:
            # Rewind first so a caller that already consumed part of the
            # upload still gets the whole document parsed.
            await file.seek(0)
            file_content = await file.read()

            # Parse from the bytes we just read instead of handing the
            # (now exhausted) underlying file object to the parser.
            reader = PdfReader(io.BytesIO(file_content))
            pages = reader.pages
            logger.info("Total pages: %d", len(pages))

            # Pages with no extractable text still produce an (empty)
            # document so list positions keep matching page numbers.
            documents = [
                Document(
                    text=(page.extract_text() or "").strip(),
                    metadata={"page": page_num},
                )
                for page_num, page in enumerate(pages, start=1)
            ]
            logger.info("Number of documents: %d", len(documents))
            return documents
        except Exception as e:
            # Boundary handler: existing callers expect a response object
            # rather than an exception, so keep that contract but record
            # the traceback instead of losing it.
            logger.exception("Failed to process the uploaded file")
            return JSONResponse(
                status_code=500,
                content=f"Failed to process the uploaded file: {e}",
            )