from typing import List, Tuple

import PyPDF2
import tiktoken


def extract_text_from_pdf(file) -> Tuple[str, List[Tuple[int, str]]]:
    """
    Extract text from a PDF and keep track of which page it came from.

    :param file: PDF source handed directly to ``PyPDF2.PdfReader``
        (e.g. an uploaded PDF file object).
    :return: Tuple ``(text, page_texts)``, where:
        - ``text`` is the text of every page concatenated in page order,
          with no separator inserted between pages.
        - ``page_texts`` is a list of tuples
          ``[(page_number, page_text), ...]`` with 1-indexed page numbers.
    """
    pdf_reader = PyPDF2.PdfReader(file)

    # ``or ""`` guards against an extractor result of None for a page, which
    # would otherwise break the concatenation below with a TypeError.
    page_texts = [
        (page_number, page.extract_text() or "")
        for page_number, page in enumerate(pdf_reader.pages, start=1)
    ]

    # Join once instead of growing a string with ``+=`` for every page.
    text = "".join(page_content for _, page_content in page_texts)
    return text, page_texts


def count_tokens(string: str, encoding_name: str = "o200k_base") -> int:
    """
    Return the number of tokens in a text string.

    :param string: Text to tokenize.
    :param encoding_name: Name of the tiktoken encoding to look up via
        ``tiktoken.get_encoding``; defaults to ``"o200k_base"``.
    :return: Length of the token sequence produced by encoding ``string``.
    """
    # NOTE(review): tiktoken's ``encode`` appears to reject text containing
    # special-token markers by default -- confirm whether input extracted from
    # PDFs can contain such markers and needs different handling.
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))