Spaces:
Sleeping
Sleeping
import io
from typing import List, Optional

from fastapi import UploadFile
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_parse import LlamaParse
from PyPDF2 import PdfReader
class Reader(BaseReader):
    """Reader that converts an uploaded PDF into per-page ``Document`` objects."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of an uploaded PDF, one ``Document`` per page.

        Args:
            file: The uploaded PDF file.

        Returns:
            Documents in page order. Each document holds the stripped text
            of one page and stores that page's 1-based number under the
            ``"page"`` metadata key. Pages whose extracted text is empty
            (or whitespace only) are skipped.

        Raises:
            RuntimeError: If the upload cannot be read or parsed. The
                original exception is chained as ``__cause__``.
        """
        try:
            file_content = await file.read()
            # Parse from the bytes that were just read. The upload's own
            # file handle has had its cursor advanced to EOF by the read
            # above, so a fresh in-memory stream avoids depending on the
            # parser rewinding it.
            reader = PdfReader(io.BytesIO(file_content))

            documents = []
            for page_num, page in enumerate(reader.pages, start=1):
                # extract_text() may yield nothing for image-only pages.
                text = (page.extract_text() or "").strip()
                if text:
                    documents.append(
                        Document(text=text, metadata={"page": page_num})
                    )
            return documents
        except Exception as e:
            # Boundary handler: report, then surface one error type to
            # callers while keeping the original cause attached.
            print(f"Error reading PDF file: {e}")
            raise RuntimeError(f"Failed to process the uploaded file: {e}") from e