# Copyright (C) 2021-2024, Mindee. # This program is licensed under the Apache License 2.0. # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details. from typing import Any, Dict, List, Optional, Tuple import numpy as np from scipy.cluster.hierarchy import fclusterdata from doctr.io.elements import Block, Document, KIEDocument, KIEPage, Line, Page, Prediction, Word from doctr.utils.geometry import estimate_page_angle, resolve_enclosing_bbox, resolve_enclosing_rbbox, rotate_boxes from doctr.utils.repr import NestedObject __all__ = ["DocumentBuilder"] class DocumentBuilder(NestedObject): """Implements a document builder Args: ---- resolve_lines: whether words should be automatically grouped into lines resolve_blocks: whether lines should be automatically grouped into blocks paragraph_break: relative length of the minimum space separating paragraphs export_as_straight_boxes: if True, force straight boxes in the export (fit a rectangle box to all rotated boxes). Else, keep the boxes format unchanged, no matter what it is. """ def __init__( self, resolve_lines: bool = True, resolve_blocks: bool = True, paragraph_break: float = 0.035, export_as_straight_boxes: bool = False, ) -> None: self.resolve_lines = resolve_lines self.resolve_blocks = resolve_blocks self.paragraph_break = paragraph_break self.export_as_straight_boxes = export_as_straight_boxes @staticmethod def _sort_boxes(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Sort bounding boxes from top to bottom, left to right Args: ---- boxes: bounding boxes of shape (N, 4) or (N, 4, 2) (in case of rotated bbox) Returns: ------- tuple: indices of ordered boxes of shape (N,), boxes If straight boxes are passed tpo the function, boxes are unchanged else: boxes returned are straight boxes fitted to the straightened rotated boxes so that we fit the lines afterwards to the straigthened page """ if boxes.ndim == 3: boxes = rotate_boxes( loc_preds=boxes, angle=-estimate_page_angle(boxes), orig_shape=(1024, 1024), min_angle=5.0, ) boxes = np.concatenate((boxes.min(1), boxes.max(1)), -1) return (boxes[:, 0] + 2 * boxes[:, 3] / np.median(boxes[:, 3] - boxes[:, 1])).argsort(), boxes def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: List[int]) -> List[List[int]]: """Split a line in sub_lines Args: ---- boxes: bounding boxes of shape (N, 4) word_idcs: list of indexes for the words of the line Returns: ------- A list of (sub-)lines computed from the original line (words) """ lines = [] # Sort words horizontally word_idcs = [word_idcs[idx] for idx in boxes[word_idcs, 0].argsort().tolist()] # Eventually split line horizontally if len(word_idcs) < 2: lines.append(word_idcs) else: sub_line = [word_idcs[0]] for i in word_idcs[1:]: horiz_break = True prev_box = boxes[sub_line[-1]] # Compute distance between boxes dist = boxes[i, 0] - prev_box[2] # If distance between boxes is lower than paragraph break, same sub-line if dist < self.paragraph_break: horiz_break = False if horiz_break: lines.append(sub_line) sub_line = [] sub_line.append(i) lines.append(sub_line) return lines def _resolve_lines(self, boxes: np.ndarray) -> List[List[int]]: """Order boxes to group them in lines Args: ---- boxes: bounding boxes of shape (N, 4) or (N, 4, 2) in case of rotated bbox Returns: ------- nested list of box indices """ # Sort boxes, and straighten the boxes if they are rotated idxs, boxes = self._sort_boxes(boxes) # Compute median for boxes heights y_med = np.median(boxes[:, 3] - boxes[:, 1]) lines = [] words = [idxs[0]] # Assign the top-left word to the first line # Define a mean y-center for the line y_center_sum = boxes[idxs[0]][[1, 3]].mean() for idx in idxs[1:]: vert_break = True # Compute y_dist y_dist = abs(boxes[idx][[1, 3]].mean() - y_center_sum / len(words)) # If y-center of the box is close enough to mean y-center of the line, same line if y_dist < y_med / 2: vert_break = False if vert_break: # Compute sub-lines (horizontal split) lines.extend(self._resolve_sub_lines(boxes, words)) words = [] y_center_sum = 0 words.append(idx) y_center_sum += boxes[idx][[1, 3]].mean() # Use the remaining words to form the last(s) line(s) if len(words) > 0: # Compute sub-lines (horizontal split) lines.extend(self._resolve_sub_lines(boxes, words)) return lines @staticmethod def _resolve_blocks(boxes: np.ndarray, lines: List[List[int]]) -> List[List[List[int]]]: """Order lines to group them in blocks Args: ---- boxes: bounding boxes of shape (N, 4) or (N, 4, 2) lines: list of lines, each line is a list of idx Returns: ------- nested list of box indices """ # Resolve enclosing boxes of lines if boxes.ndim == 3: box_lines: np.ndarray = np.asarray([ resolve_enclosing_rbbox([tuple(boxes[idx, :, :]) for idx in line]) # type: ignore[misc] for line in lines ]) else: _box_lines = [ resolve_enclosing_bbox([(tuple(boxes[idx, :2]), tuple(boxes[idx, 2:])) for idx in line]) for line in lines ] box_lines = np.asarray([(x1, y1, x2, y2) for ((x1, y1), (x2, y2)) in _box_lines]) # Compute geometrical features of lines to clusterize # Clusterizing only with box centers yield to poor results for complex documents if boxes.ndim == 3: box_features: np.ndarray = np.stack( ( (box_lines[:, 0, 0] + box_lines[:, 0, 1]) / 2, (box_lines[:, 0, 0] + box_lines[:, 2, 0]) / 2, (box_lines[:, 0, 0] + box_lines[:, 2, 1]) / 2, (box_lines[:, 0, 1] + box_lines[:, 2, 1]) / 2, (box_lines[:, 0, 1] + box_lines[:, 2, 0]) / 2, (box_lines[:, 2, 0] + box_lines[:, 2, 1]) / 2, ), axis=-1, ) else: box_features = np.stack( ( (box_lines[:, 0] + box_lines[:, 3]) / 2, (box_lines[:, 1] + box_lines[:, 2]) / 2, (box_lines[:, 0] + box_lines[:, 2]) / 2, (box_lines[:, 1] + box_lines[:, 3]) / 2, box_lines[:, 0], box_lines[:, 1], ), axis=-1, ) # Compute clusters clusters = fclusterdata(box_features, t=0.1, depth=4, criterion="distance", metric="euclidean") _blocks: Dict[int, List[int]] = {} # Form clusters for line_idx, cluster_idx in enumerate(clusters): if cluster_idx in _blocks.keys(): _blocks[cluster_idx].append(line_idx) else: _blocks[cluster_idx] = [line_idx] # Retrieve word-box level to return a fully nested structure blocks = [[lines[idx] for idx in block] for block in _blocks.values()] return blocks def _build_blocks( self, boxes: np.ndarray, word_preds: List[Tuple[str, float]], crop_orientations: List[Dict[str, Any]], ) -> List[Block]: """Gather independent words in structured blocks Args: ---- boxes: bounding boxes of all detected words of the page, of shape (N, 5) or (N, 4, 2) word_preds: list of all detected words of the page, of shape N crop_orientations: list of dictoinaries containing the general orientation (orientations + confidences) of the crops Returns: ------- list of block elements """ if boxes.shape[0] != len(word_preds): raise ValueError(f"Incompatible argument lengths: {boxes.shape[0]}, {len(word_preds)}") if boxes.shape[0] == 0: return [] # Decide whether we try to form lines _boxes = boxes if self.resolve_lines: lines = self._resolve_lines(_boxes if _boxes.ndim == 3 else _boxes[:, :4]) # Decide whether we try to form blocks if self.resolve_blocks and len(lines) > 1: _blocks = self._resolve_blocks(_boxes if _boxes.ndim == 3 else _boxes[:, :4], lines) else: _blocks = [lines] else: # Sort bounding boxes, one line for all boxes, one block for the line lines = [self._sort_boxes(_boxes if _boxes.ndim == 3 else _boxes[:, :4])[0]] # type: ignore[list-item] _blocks = [lines] blocks = [ Block([ Line([ Word( *word_preds[idx], tuple([tuple(pt) for pt in boxes[idx].tolist()]), # type: ignore[arg-type] crop_orientations[idx], ) if boxes.ndim == 3 else Word( *word_preds[idx], ((boxes[idx, 0], boxes[idx, 1]), (boxes[idx, 2], boxes[idx, 3])), crop_orientations[idx], ) for idx in line ]) for line in lines ]) for lines in _blocks ] return blocks def extra_repr(self) -> str: return ( f"resolve_lines={self.resolve_lines}, resolve_blocks={self.resolve_blocks}, " f"paragraph_break={self.paragraph_break}, " f"export_as_straight_boxes={self.export_as_straight_boxes}" ) def __call__( self, pages: List[np.ndarray], boxes: List[np.ndarray], text_preds: List[List[Tuple[str, float]]], page_shapes: List[Tuple[int, int]], crop_orientations: List[Dict[str, Any]], orientations: Optional[List[Dict[str, Any]]] = None, languages: Optional[List[Dict[str, Any]]] = None, ) -> Document: """Re-arrange detected words into structured blocks Args: ---- pages: list of N elements, where each element represents the page image boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5) or (*, 6) for all words for a given page text_preds: list of N elements, where each element is the list of all word prediction (text + confidence) page_shapes: shape of each page, of size N crop_orientations: list of N elements, where each element is a dictionary containing the general orientation (orientations + confidences) of the crops orientations: optional, list of N elements, where each element is a dictionary containing the orientation (orientation + confidence) languages: optional, list of N elements, where each element is a dictionary containing the language (language + confidence) Returns: ------- document object """ if len(boxes) != len(text_preds) != len(crop_orientations) or len(boxes) != len(page_shapes) != len( crop_orientations ): raise ValueError("All arguments are expected to be lists of the same size") _orientations = ( orientations if isinstance(orientations, list) else [None] * len(boxes) # type: ignore[list-item] ) _languages = languages if isinstance(languages, list) else [None] * len(boxes) # type: ignore[list-item] if self.export_as_straight_boxes and len(boxes) > 0: # If boxes are already straight OK, else fit a bounding rect if boxes[0].ndim == 3: # Iterate over pages and boxes boxes = [np.concatenate((p_boxes.min(1), p_boxes.max(1)), 1) for p_boxes in boxes] _pages = [ Page( page, self._build_blocks( page_boxes, word_preds, word_crop_orientations, ), _idx, shape, orientation, language, ) for page, _idx, shape, page_boxes, word_preds, word_crop_orientations, orientation, language in zip( pages, range(len(boxes)), page_shapes, boxes, text_preds, crop_orientations, _orientations, _languages ) ] return Document(_pages) class KIEDocumentBuilder(DocumentBuilder): """Implements a KIE document builder Args: ---- resolve_lines: whether words should be automatically grouped into lines resolve_blocks: whether lines should be automatically grouped into blocks paragraph_break: relative length of the minimum space separating paragraphs export_as_straight_boxes: if True, force straight boxes in the export (fit a rectangle box to all rotated boxes). Else, keep the boxes format unchanged, no matter what it is. """ def __call__( # type: ignore[override] self, pages: List[np.ndarray], boxes: List[Dict[str, np.ndarray]], text_preds: List[Dict[str, List[Tuple[str, float]]]], page_shapes: List[Tuple[int, int]], crop_orientations: List[Dict[str, List[Dict[str, Any]]]], orientations: Optional[List[Dict[str, Any]]] = None, languages: Optional[List[Dict[str, Any]]] = None, ) -> KIEDocument: """Re-arrange detected words into structured predictions Args: ---- pages: list of N elements, where each element represents the page image boxes: list of N dictionaries, where each element represents the localization predictions for a class, of shape (*, 5) or (*, 6) for all predictions text_preds: list of N dictionaries, where each element is the list of all word prediction page_shapes: shape of each page, of size N crop_orientations: list of N dictonaries, where each element is a list containing the general crop orientations (orientations + confidences) of the crops orientations: optional, list of N elements, where each element is a dictionary containing the orientation (orientation + confidence) languages: optional, list of N elements, where each element is a dictionary containing the language (language + confidence) Returns: ------- document object """ if len(boxes) != len(text_preds) != len(crop_orientations) or len(boxes) != len(page_shapes) != len( crop_orientations ): raise ValueError("All arguments are expected to be lists of the same size") _orientations = ( orientations if isinstance(orientations, list) else [None] * len(boxes) # type: ignore[list-item] ) _languages = languages if isinstance(languages, list) else [None] * len(boxes) # type: ignore[list-item] if self.export_as_straight_boxes and len(boxes) > 0: # If boxes are already straight OK, else fit a bounding rect if next(iter(boxes[0].values())).ndim == 3: straight_boxes: List[Dict[str, np.ndarray]] = [] # Iterate over pages for p_boxes in boxes: # Iterate over boxes of the pages straight_boxes_dict = {} for k, box in p_boxes.items(): straight_boxes_dict[k] = np.concatenate((box.min(1), box.max(1)), 1) straight_boxes.append(straight_boxes_dict) boxes = straight_boxes _pages = [ KIEPage( page, { k: self._build_blocks( page_boxes[k], word_preds[k], word_crop_orientations[k], ) for k in page_boxes.keys() }, _idx, shape, orientation, language, ) for page, _idx, shape, page_boxes, word_preds, word_crop_orientations, orientation, language in zip( pages, range(len(boxes)), page_shapes, boxes, text_preds, crop_orientations, _orientations, _languages ) ] return KIEDocument(_pages) def _build_blocks( # type: ignore[override] self, boxes: np.ndarray, word_preds: List[Tuple[str, float]], crop_orientations: List[Dict[str, Any]], ) -> List[Prediction]: """Gather independent words in structured blocks Args: ---- boxes: bounding boxes of all detected words of the page, of shape (N, 5) or (N, 4, 2) word_preds: list of all detected words of the page, of shape N crop_orientations: list of orientations for each word crop Returns: ------- list of block elements """ if boxes.shape[0] != len(word_preds): raise ValueError(f"Incompatible argument lengths: {boxes.shape[0]}, {len(word_preds)}") if boxes.shape[0] == 0: return [] # Decide whether we try to form lines _boxes = boxes idxs, _ = self._sort_boxes(_boxes if _boxes.ndim == 3 else _boxes[:, :4]) predictions = [ Prediction( value=word_preds[idx][0], confidence=word_preds[idx][1], geometry=tuple([tuple(pt) for pt in boxes[idx].tolist()]), # type: ignore[arg-type] crop_orientation=crop_orientations[idx], ) if boxes.ndim == 3 else Prediction( value=word_preds[idx][0], confidence=word_preds[idx][1], geometry=((boxes[idx, 0], boxes[idx, 1]), (boxes[idx, 2], boxes[idx, 3])), crop_orientation=crop_orientations[idx], ) for idx in idxs ] return predictions