import pdfplumber
import re
from transformers import AutoTokenizer
from typing import List, Dict
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


# Extract section/subsection headings together with body text, so each chunk
# can later be prefixed with its hierarchy context.
def extract_text_with_hierarchy(pdf_path: str) -> List[Dict]:
    """Extract text with section/subsection hierarchy."""
    content = []
    current_section = ""
    current_subsection = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only pages; fall back to "".
            text = page.extract_text() or ""
            for line in text.split('\n'):
                line = line.strip()
                if not line:
                    continue

                # Detect raw LaTeX-style headers such as \section*{Title}.
                section_match = re.match(r'\\section\*\{(.+?)\}', line)
                subsection_match = re.match(r'\\subsection\*\{(.+?)\}', line)

                if section_match:
                    current_section = section_match.group(1)
                    current_subsection = ""
                    content.append({
                        'type': 'section',
                        'title': current_section,
                        'text': ""
                    })
                elif subsection_match:
                    current_subsection = subsection_match.group(1)
                    content.append({
                        'type': 'subsection',
                        'title': current_subsection,
                        'text': ""
                    })
                else:
                    # Body text attaches to the most recent heading, if any.
                    if content:
                        content[-1]['text'] += line + " "
                    else:
                        content.append({
                            'type': 'text',
                            'title': "",
                            'text': line
                        })
    return content


def create_bert_chunks(file_name: str, content: List[Dict],
                       max_tokens=450, overlap=50) -> List[Dict]:
    """Create chunks optimized for DistilBERT with hierarchy context."""
    chunks = []
    current_chunk = []
    current_tokens = 0
    current_section = ""
    current_subsection = ""

    for item in content:
        # Build a context header for the first sentence under a new heading.
        header = ""
        if item['type'] == 'section':
            current_section = item['title']
            current_subsection = ""
            header = f"[SECTION] {current_section}\n"
        elif item['type'] == 'subsection':
            current_subsection = item['title']
            header = f"[SUBSECTION] {current_subsection}\n"

        # Split text into sentences on terminal punctuation.
        sentences = re.split(r'(?<=[.!?])\s+', item['text'])
        for sentence in sentences:
            if not sentence.strip() and not header:
                continue  # skip empty fragments produced by the split
            full_text = header + sentence if header else sentence
            # Count raw tokens only; adding [CLS]/[SEP] per sentence would
            # inflate the running total.
            tokens = tokenizer.encode(full_text, add_special_tokens=False)

            if current_tokens + len(tokens) > max_tokens and current_chunk:
                chunk_text = "\n".join(current_chunk)
                chunks.append({
                    'text': chunk_text,
                    'section': current_section,
                    'subsection': current_subsection,
                    'tokens': current_tokens,
                    'file_name': file_name
                })
                # Carry the last `overlap` tokens into the next chunk.
                overlap_tokens = tokenizer.encode(
                    chunk_text, add_special_tokens=False)[-overlap:]
                current_chunk = [tokenizer.decode(overlap_tokens)]
                current_tokens = len(overlap_tokens)

            current_chunk.append(full_text)
            current_tokens += len(tokens)
            header = ""  # the header is only prepended once per heading

    # Flush whatever remains as a final chunk.
    if current_chunk:
        chunk_text = "\n".join(current_chunk)
        chunks.append({
            'text': chunk_text,
            'section': current_section,
            'subsection': current_subsection,
            'tokens': current_tokens,
            'file_name': file_name
        })
    return chunks


def process_pdf(pdf_path: str) -> List[Dict]:
    """Process a PDF into BERT-optimized chunks."""
    structured_content = extract_text_with_hierarchy(pdf_path)
    return create_bert_chunks(pdf_path, structured_content)
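

# A minimal usage sketch (assumptions: "paper.pdf" and "paper_chunks.csv" are
# hypothetical paths; the pandas import above is used here to inspect and
# export the chunks as a DataFrame).
if __name__ == "__main__":
    chunks = process_pdf("paper.pdf")  # hypothetical input PDF
    df = pd.DataFrame(chunks)
    print(df[['section', 'subsection', 'tokens']].head())
    df.to_csv("paper_chunks.csv", index=False)  # hypothetical output path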