Spaces:
Sleeping
Sleeping
File size: 1,346 Bytes
9002555 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import io
from typing import List
from typing import Optional

from fastapi import UploadFile
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_parse import LlamaParse
from PyPDF2 import PdfReader
class Reader(BaseReader):
    """Reader that turns an uploaded PDF into one ``Document`` per page."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of every page of an uploaded PDF.

        Args:
            file: The uploaded file; its content is parsed as a PDF.

        Returns:
            One ``Document`` per page whose extracted text is non-blank, in
            page order. Each document's metadata stores the 1-based page
            number under the ``"page"`` key.

        Raises:
            RuntimeError: If reading or parsing the upload fails. The
                underlying exception is chained as ``__cause__``.
        """
        try:
            file_content = await file.read()
            # Parse from the bytes that were just read rather than from
            # ``file.file``: this uses the data already in memory and does
            # not depend on where the upload's stream pointer was left.
            reader = PdfReader(io.BytesIO(file_content))

            documents = []
            for page_number, page in enumerate(reader.pages, start=1):
                # Treat a missing extraction result as empty text.
                text = (page.extract_text() or "").strip()
                if not text:
                    # Blank pages produce no document.
                    continue
                documents.append(
                    Document(text=text, metadata={"page": page_number})
                )
            return documents
        except Exception as e:
            # Any failure is reported and surfaced as a single error type.
            print(f"Error reading PDF file: {e}")
            raise RuntimeError(f"Failed to process the uploaded file: {e}") from e