File size: 1,507 Bytes
d57efd6
 
 
9002555
 
 
 
 
 
 
 
 
d57efd6
9002555
 
d57efd6
 
9002555
d57efd6
 
 
9002555
d57efd6
 
 
 
 
 
 
9002555
d57efd6
9002555
 
 
 
d57efd6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from typing import Optional, List
from fastapi import UploadFile
from fastapi.responses import JSONResponse
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from PyPDF2 import PdfReader


class Reader(BaseReader):
    """Reader that converts an uploaded PDF into one ``Document`` per page."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of every page of an uploaded PDF.

        Args:
            file: The uploaded file; its underlying file object (``file.file``)
                is handed to ``PdfReader``.

        Returns:
            One ``Document`` per page, in page order. Each document carries the
            page's stripped text (an empty string when no text could be
            extracted) and ``{"page": <1-based page number>}`` as metadata.

            NOTE(review): on any exception this method does not raise; it
            returns a ``JSONResponse`` with status 500 instead, which does not
            match the declared return type. Callers should check for it.
        """
        try:
            # Rewind instead of reading the whole upload into an unused buffer:
            # the parser works directly on the underlying stream, so it should
            # start from the beginning regardless of earlier reads.
            await file.seek(0)

            # Initialize PdfReader with the file-like object.
            pdf = PdfReader(file.file)
            pages = pdf.pages
            print("Total pages: ", len(pages))

            # Every page yields a document, even when it has no extractable
            # text, so page numbers in the metadata stay aligned with the PDF.
            documents = [
                Document(
                    text=(page.extract_text() or "").strip(),
                    metadata={"page": page_num},
                )
                for page_num, page in enumerate(pages, start=1)
            ]

            print("Number of documents: ", len(documents))

            return documents

        except Exception as e:
            # Kept as a returned response (not a raise) for backward
            # compatibility with existing callers.
            return JSONResponse(
                status_code=500, content=f"Failed to process the uploaded file: {e}"
            )