IntelAnalyser / helper_functions.py
ashischakraborty's picture
UC#3 first upload
3daab2e verified
raw
history blame contribute delete
928 Bytes
import PyPDF2
import tiktoken
# Helper functions for PDF text extraction and token counting.
def extract_text_from_pdf(file):
    """
    Extracts text from a PDF file and tracks text by page.

    :param file: Uploaded PDF file object; passed directly to
        ``PyPDF2.PdfReader``.
    :return: Tuple (text, page_texts), where:
        - text is the text of all pages concatenated in page order
          (no separator is inserted between pages).
        - page_texts is a list of tuples [(page_number, page_text), ...],
          with page_number starting at 1.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Defensive: fall back to "" if a page yields no text value, so the
    # concatenation below can never fail on a non-string result.
    page_texts = [
        (page_number, page.extract_text() or "")
        for page_number, page in enumerate(pdf_reader.pages, start=1)
    ]
    # Join once instead of growing a string with += on every page.
    text = "".join(page_text for _, page_text in page_texts)
    return text, page_texts
def count_tokens(string: str) -> int:
    """
    Returns the number of tokens in a text string.

    :param string: Text to tokenize.
    :return: Number of tokens produced by the "o200k_base" tiktoken encoding.
    """
    encoding = tiktoken.get_encoding("o200k_base")
    # encode() by default raises ValueError when the input contains the
    # literal spelling of a special token (e.g. "<|endoftext|>"). Passing
    # disallowed_special=() turns that check off so such text is encoded as
    # ordinary text and arbitrary document content can always be counted.
    return len(encoding.encode(string, disallowed_special=()))