# Spaces: Sleeping
# (Non-code residue captured together with this file; kept as a comment so the module parses.)
import io
from typing import Optional, List

from fastapi import UploadFile
from fastapi.responses import JSONResponse
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from PyPDF2 import PdfReader
class Reader(BaseReader):
    """Reader that converts an uploaded PDF into one ``Document`` per page."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of every page of an uploaded PDF.

        The upload is read fully into memory and parsed from that in-memory
        buffer, so parsing does not depend on the cursor position or the
        lifetime of the upload's underlying file object.

        Args:
            file: The uploaded PDF file.

        Returns:
            One ``Document`` per page, in page order. Each document holds the
            page's whitespace-stripped text and ``{"page": n}`` metadata,
            where ``n`` is the 1-based page number. Pages with no extractable
            text still produce a document (with empty text) so the list stays
            aligned with the page numbering.

            NOTE: on any failure this method does not raise. It returns a
            ``JSONResponse`` with status 500 describing the error, despite
            the ``List[Document]`` annotation, so callers must check the type
            of the result before iterating over it.
        """
        try:
            file_content = await file.read()

            # Parse from the bytes that were just read rather than from
            # ``file.file``: the read above leaves that stream at EOF, and a
            # private buffer keeps the reader usable independently of the
            # upload object.
            reader = PdfReader(io.BytesIO(file_content))
            pages = reader.pages
            print("Total pages: ", len(pages))

            # ``extract_text()`` may yield nothing for a page; treat that as
            # an empty string so every page is still represented.
            documents = [
                Document(
                    text=(page.extract_text() or "").strip(),
                    metadata={"page": page_num},
                )
                for page_num, page in enumerate(pages, start=1)
            ]
            print("Number of documents: ", len(documents))
            return documents
        except Exception as e:
            # Deliberate catch-all boundary: any read/parse failure is
            # reported to the caller as an HTTP 500 payload.
            return JSONResponse(
                status_code=500, content=f"Failed to process the uploaded file: {e}"
            )