dsmultimedika's picture
Improve the code bot development
d57efd6
raw
history blame
1.51 kB
from io import BytesIO
from typing import Optional, List

from fastapi import UploadFile
from fastapi.responses import JSONResponse
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from PyPDF2 import PdfReader
class Reader(BaseReader):
    """Reader that turns an uploaded PDF into one ``Document`` per page."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of every page of an uploaded PDF.

        Each page yields exactly one ``Document`` whose ``metadata["page"]``
        holds the 1-based page number. Pages without extractable text are
        kept as documents with empty text so page numbering stays contiguous.

        Args:
            file: The uploaded PDF file.

        Returns:
            The per-page documents, in page order. If anything goes wrong
            while reading or parsing, a ``JSONResponse`` with status 500
            describing the failure is returned instead of raising.
        """
        try:
            # Rewind first so the whole upload is read even if a caller has
            # already consumed part of the stream.
            await file.seek(0)
            file_content = await file.read()

            # Parse from the in-memory bytes rather than from the underlying
            # file handle, whose cursor now sits at EOF after the read above.
            reader = PdfReader(BytesIO(file_content))
            pages = reader.pages
            print("Total pages: ", len(pages))

            # One document per page; blank pages become empty-text documents.
            documents = [
                Document(
                    text=(page.extract_text() or "").strip(),
                    metadata={"page": page_num},
                )
                for page_num, page in enumerate(pages, start=1)
            ]
            print("Number of documents: ", len(documents))
            return documents
        except Exception as e:
            # Boundary handler: report the failure as an HTTP 500 payload.
            return JSONResponse(
                status_code=500, content=f"Failed to process the uploaded file: {e}"
            )