File size: 1,346 Bytes
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from typing import Optional
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from fastapi import UploadFile
from typing import List
from PyPDF2 import PdfReader

from llama_parse import LlamaParse

class Reader(BaseReader):
    """Reader that converts an uploaded PDF into per-page ``Document`` objects."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of every non-empty page of an uploaded PDF.

        Args:
            file: The uploaded PDF; its underlying ``file.file`` object is
                handed to ``PdfReader``.

        Returns:
            One ``Document`` per page whose extracted text is not blank, in
            page order. Each document's text is stripped of surrounding
            whitespace and its metadata holds the 1-based page number under
            the ``"page"`` key.

        Raises:
            RuntimeError: If rewinding, parsing or text extraction fails. The
                original exception is attached as ``__cause__``.
        """
        try:
            # Rewind rather than read(): the bytes returned by read() were
            # never used, and reading left the stream positioned at EOF right
            # before it was passed to the parser.
            await file.seek(0)

            # Initialize PdfReader with the underlying file-like object
            reader = PdfReader(file.file)

            documents = []
            for page_num, page in enumerate(reader.pages, start=1):
                # extract_text() may yield nothing for a page; treat as empty
                text = (page.extract_text() or "").strip()
                if text:  # Only keep pages that contain text
                    documents.append(
                        Document(text=text, metadata={"page": page_num})
                    )

            return documents

        except Exception as e:
            # Boundary handler: report, then re-raise as a single error type
            # while keeping the original exception chained for debugging.
            print(f"Error reading PDF file: {e}")
            raise RuntimeError(f"Failed to process the uploaded file: {e}") from e