from collections import deque from pathlib import Path import logging from typing import Iterator, Self import re from llama_index.core.schema import BaseNode logger = logging.getLogger(__name__) DOCUMENT_NODE_NUMBER: int = -1 class TreeNode: def __init__(self, name: str, number: int | None = None): self.name = name self.number = number self.children: list[Self] = [] self.parent: Self | None = None def add_child(self, child: Self) -> None: self.children.append(child) def set_parent(self, parent: Self) -> None: if self.parent is not None: raise ValueError("parent has already been set") else: self.parent = parent def remove_parent(self) -> None: self.parent = None def __str__(self, level: int = 0) -> str: ret = " " * level + self.name if self.number is not None: ret += f" [{self.number}]" ret += "\n" for child in self.children: ret += child.__str__(level + 1) return ret def bfs(self) -> Iterator[Self]: """Perform Breadth-First traversal of the tree.""" queue = deque([self]) while queue: node = queue.popleft() yield node queue.extend(node.children) def remove_child(self, child: Self) -> bool: if child in self.children: child.remove_parent() self.children.remove(child) return True return False def __iter__(self): return self.bfs() def parse_landscape_structure(document: BaseNode) -> TreeNode: page_pattern = re.compile(r"^-\s*Page\s+(\d+)\s*:\s*(.+)$") header_pattern = re.compile(r"^(#+)\s+(.+)$") format = document.metadata.get("format", "") if format != "landscape": raise ValueError(f"Unsupported format {format}") number_pages = document.metadata.get("nb_pages", None) structure = document.metadata.get("structure", "") filename = document.metadata.get("filename", "") assert number_pages and structure and filename lines = structure.splitlines() filestem = Path(filename).stem root = TreeNode(name=filestem, number=DOCUMENT_NODE_NUMBER) stack = [(root, 0)] # (node, level) pairs abstract_node_number = DOCUMENT_NODE_NUMBER - 1 processed_page_numbers = set() for line in lines: line = line.strip() if not line: continue # Check if it's a header header_match = header_pattern.match(line) if header_match: level = len(header_match.group(1)) title = header_match.group(2).strip() new_node = TreeNode(name=title, number=abstract_node_number) abstract_node_number -= 1 # Adjust stack for header level while stack and stack[-1][1] >= level: stack.pop() if stack: stack[-1][0].add_child(new_node) new_node.set_parent(stack[-1][0]) stack.append((new_node, level)) continue # Check if it's a page entry page_match = page_pattern.match(line) if page_match: page_num = int(page_match.group(1)) title = page_match.group(2).strip() if page_num in processed_page_numbers: logger.warning(f"Filename {filename} Page {page_num} already processed. Skipping {title}.") elif page_num > number_pages: logger.warning( f"Filename {filename} Page number {page_num} is greater than the number of pages in the document. Skipping {title}." ) else: processed_page_numbers.add(page_num) new_node = TreeNode(name=title, number=page_num) # Add to last header in stack if stack: stack[-1][0].add_child(new_node) new_node.set_parent(stack[-1][0]) leftout_page_numbers = set(range(1, number_pages + 1)) - processed_page_numbers if leftout_page_numbers: logger.warning(f"Filename {filename} Page numbers {leftout_page_numbers} are not processed.") uncategorized_node = TreeNode(name="Uncategorized", number=abstract_node_number) abstract_node_number -= 1 root.add_child(uncategorized_node) uncategorized_node.set_parent(root) for page_num in leftout_page_numbers: new_node = TreeNode(name=f"Page number {page_num}", number=page_num) uncategorized_node.add_child(new_node) new_node.set_parent(uncategorized_node) return root def parse_portrait_structure(document: BaseNode) -> TreeNode: header_pattern = re.compile(r"(#+)\s+(.*?)\s+\[line\s+(\d+)\]") format = document.metadata.get("format", "") if format != "portrait": raise ValueError(f"Unsupported format {format}") structure = document.metadata.get("structure", "") filename = document.metadata.get("filename", "") created_toc = document.metadata.get("created_toc", "") assert structure and filename and created_toc lines = structure.splitlines() filestem = Path(filename).stem root = TreeNode(name=filestem, number=DOCUMENT_NODE_NUMBER) stack = [(root, 0)] # (node, level) pairs processed_line_numbers = list() for line in lines: line = line.strip() if not line: continue # Check if it's a header header_match = header_pattern.match(line) if header_match: level = len(header_match.group(1)) title = header_match.group(2).strip() line_number = int(header_match.group(3)) processed_line_numbers.append(line_number) new_node = TreeNode(name=title, number=line_number) # Adjust stack for header level while stack and stack[-1][1] >= level: stack.pop() if stack: stack[-1][0].add_child(new_node) new_node.set_parent(stack[-1][0]) stack.append((new_node, level)) continue assert processed_line_numbers[0] == 0 and all( processed_line_numbers[i] <= processed_line_numbers[i + 1] for i in range(len(processed_line_numbers) - 1) ) return root def parse_structure(document: BaseNode) -> TreeNode: format = document.metadata.get("format", "") match format: case "landscape": return parse_landscape_structure(document) case "portrait": return parse_portrait_structure(document) case _: raise ValueError(f"Unsupported format {format}")