File size: 928 Bytes
3daab2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import PyPDF2
import tiktoken
# Ensure NLTK resources are available


def extract_text_from_pdf(file):
    """
    Extract text from a PDF file and track the text of each page.

    :param file: Uploaded PDF file object; passed directly to
                 ``PyPDF2.PdfReader``.
    :return: Tuple (text, page_texts), where:
             - text is the combined text of the entire PDF (page texts
               concatenated in page order, with no separator inserted).
             - page_texts is a list of tuples [(page_number, page_text), ...]
               with 1-indexed page numbers.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    page_texts = [
        # Defensive: treat a missing (None) extraction result as empty text so
        # that building the combined string below can never fail on it.
        (page_number, page.extract_text() or "")
        for page_number, page in enumerate(pdf_reader.pages, start=1)
    ]
    # Join once instead of growing a string with += inside the loop.
    text = "".join(page_text for _, page_text in page_texts)
    return text, page_texts

def count_tokens(string: str, encoding_name: str = "o200k_base") -> int:
    """
    Return the number of tokens in a text string.

    :param string: Text to tokenize.
    :param encoding_name: Name of the tiktoken encoding to look up via
                          ``tiktoken.get_encoding``. Defaults to "o200k_base".
    :return: Length of the token sequence produced by encoding ``string``.
    """
    # NOTE(review): tiktoken's default ``encode`` settings may reject text that
    # contains special-token strings (e.g. "<|endoftext|>") -- confirm whether
    # arbitrary PDF text needs ``disallowed_special=()`` here.
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))