Spaces:
Sleeping
Sleeping
import PyPDF2 | |
import tiktoken | |
# Ensure NLTK resources are available | |
def extract_text_from_pdf(file):
    """
    Extract the text of a PDF, both combined and tracked by page.

    :param file: Uploaded PDF file object; passed directly to ``PyPDF2.PdfReader``.
    :return: Tuple (text, page_texts), where:
        - text is the text of every page concatenated in page order
          (no separator is inserted between pages).
        - page_texts is a list of tuples [(page_number, page_text), ...]
          with 1-indexed page numbers.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Normalise a falsy extraction result to "" so every entry is a string
    # and a page that yields no text cannot break the concatenation below.
    page_texts = [
        (page_number, page.extract_text() or "")
        for page_number, page in enumerate(pdf_reader.pages, start=1)
    ]
    # Join once rather than growing the string with += for every page.
    text = "".join(page_text for _, page_text in page_texts)
    return text, page_texts
def count_tokens(string: str) -> int:
    """
    Count the tokens that a text string encodes to.

    :param string: Text to tokenize.
    :return: Number of tokens produced by tiktoken's ``o200k_base`` encoding.
    """
    tokenizer = tiktoken.get_encoding("o200k_base")
    token_ids = tokenizer.encode(string)
    return len(token_ids)