""" This is an advanced PyMuPDF utility for detecting multi-column pages. It can be used in a shell script, or its main function can be imported and invoked as descript below. Features --------- - Identify text belonging to (a variable number of) columns on the page. - Text with different background color is handled separately, allowing for easier treatment of side remarks, comment boxes, etc. - Uses text block detection capability to identify text blocks and uses the block bboxes as primary structuring principle. - Supports ignoring footers via a footer margin parameter. - Returns re-created text boundary boxes (integer coordinates), sorted ascending by the top, then by the left coordinates. Restrictions ------------- - Only supporting horizontal, left-to-right text - Returns a list of text boundary boxes - not the text itself. The caller is expected to extract text from within the returned boxes. - Text written above images is ignored altogether (option). - This utility works as expected in most cases. The following situation cannot be handled correctly: * overlapping (non-disjoint) text blocks * image captions are not recognized and are handled like normal text Usage ------ - As a CLI shell command use python multi_column.py input.pdf footer_margin Where footer margin is the height of the bottom stripe to ignore on each page. This code is intended to be modified according to your need. - Use in a Python script as follows: ---------------------------------------------------------------------------------- from multi_column import column_boxes # for each page execute bboxes = column_boxes(page, footer_margin=50, no_image_text=True) # bboxes is a list of fitz.IRect objects, that are sort ascending by their y0, # then x0 coordinates. Their text content can be extracted by all PyMuPDF # get_text() variants, like for instance the following: for rect in bboxes: print(page.get_text(clip=rect, sort=True)) ---------------------------------------------------------------------------------- """ import sys from magic_pdf.libs.commons import fitz def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True): """Determine bboxes which wrap a column.""" paths = page.get_drawings() bboxes = [] # path rectangles path_rects = [] # image bboxes img_bboxes = [] # bboxes of non-horizontal text # avoid when expanding horizontal text boxes vert_bboxes = [] # compute relevant page area clip = +page.rect clip.y1 -= footer_margin # Remove footer area clip.y0 += header_margin # Remove header area def can_extend(temp, bb, bboxlist): """Determines whether rectangle 'temp' can be extended by 'bb' without intersecting any of the rectangles contained in 'bboxlist'. Items of bboxlist may be None if they have been removed. Returns: True if 'temp' has no intersections with items of 'bboxlist'. """ for b in bboxlist: if not intersects_bboxes(temp, vert_bboxes) and ( b == None or b == bb or (temp & b).is_empty ): continue return False return True def in_bbox(bb, bboxes): """Return 1-based number if a bbox contains bb, else return 0.""" for i, bbox in enumerate(bboxes): if bb in bbox: return i + 1 return 0 def intersects_bboxes(bb, bboxes): """Return True if a bbox intersects bb, else return False.""" for bbox in bboxes: if not (bb & bbox).is_empty: return True return False def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes): """Extend a bbox to the right page border. Whenever there is no text to the right of a bbox, enlarge it up to the right page border. Args: bboxes: (list[IRect]) bboxes to check width: (int) page width path_bboxes: (list[IRect]) bboxes with a background color vert_bboxes: (list[IRect]) bboxes with vertical text img_bboxes: (list[IRect]) bboxes of images Returns: Potentially modified bboxes. """ for i, bb in enumerate(bboxes): # do not extend text with background color if in_bbox(bb, path_bboxes): continue # do not extend text in images if in_bbox(bb, img_bboxes): continue # temp extends bb to the right page border temp = +bb temp.x1 = width # do not cut through colored background or images if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes): continue # also, do not intersect other text bboxes check = can_extend(temp, bb, bboxes) if check: bboxes[i] = temp # replace with enlarged bbox return [b for b in bboxes if b != None] def clean_nblocks(nblocks): """Do some elementary cleaning.""" # 1. remove any duplicate blocks. blen = len(nblocks) if blen < 2: return nblocks start = blen - 1 for i in range(start, -1, -1): bb1 = nblocks[i] bb0 = nblocks[i - 1] if bb0 == bb1: del nblocks[i] # 2. repair sequence in special cases: # consecutive bboxes with almost same bottom value are sorted ascending # by x-coordinate. y1 = nblocks[0].y1 # first bottom coordinate i0 = 0 # its index i1 = -1 # index of last bbox with same bottom # Iterate over bboxes, identifying segments with approx. same bottom value. # Replace every segment by its sorted version. for i in range(1, len(nblocks)): b1 = nblocks[i] if abs(b1.y1 - y1) > 10: # different bottom if i1 > i0: # segment length > 1? Sort it! nblocks[i0 : i1 + 1] = sorted( nblocks[i0 : i1 + 1], key=lambda b: b.x0 ) y1 = b1.y1 # store new bottom value i0 = i # store its start index i1 = i # store current index if i1 > i0: # segment waiting to be sorted nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0) return nblocks # extract vector graphics for p in paths: path_rects.append(p["rect"].irect) path_bboxes = path_rects # sort path bboxes by ascending top, then left coordinates path_bboxes.sort(key=lambda b: (b.y0, b.x0)) # bboxes of images on page, no need to sort them for item in page.get_images(): img_bboxes.extend(page.get_image_rects(item[0])) # blocks of text on page blocks = page.get_text( "dict", flags=fitz.TEXTFLAGS_TEXT, clip=clip, )["blocks"] # Make block rectangles, ignoring non-horizontal text for b in blocks: bbox = fitz.IRect(b["bbox"]) # bbox of the block # ignore text written upon images if no_image_text and in_bbox(bbox, img_bboxes): continue # confirm first line to be horizontal line0 = b["lines"][0] # get first line if line0["dir"] != (1, 0): # only accept horizontal text vert_bboxes.append(bbox) continue srect = fitz.EMPTY_IRECT() for line in b["lines"]: lbbox = fitz.IRect(line["bbox"]) text = "".join([s["text"].strip() for s in line["spans"]]) if len(text) > 1: srect |= lbbox bbox = +srect if not bbox.is_empty: bboxes.append(bbox) # Sort text bboxes by ascending background, top, then left coordinates bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0)) # Extend bboxes to the right where possible bboxes = extend_right( bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes ) # immediately return of no text found if bboxes == []: return [] # -------------------------------------------------------------------- # Join bboxes to establish some column structure # -------------------------------------------------------------------- # the final block bboxes on page nblocks = [bboxes[0]] # pre-fill with first bbox bboxes = bboxes[1:] # remaining old bboxes for i, bb in enumerate(bboxes): # iterate old bboxes check = False # indicates unwanted joins # check if bb can extend one of the new blocks for j in range(len(nblocks)): nbb = nblocks[j] # a new block # never join across columns if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0: continue # never join across different background colors if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes): continue temp = bb | nbb # temporary extension of new block check = can_extend(temp, nbb, nblocks) if check == True: break if not check: # bb cannot be used to extend any of the new bboxes nblocks.append(bb) # so add it to the list j = len(nblocks) - 1 # index of it temp = nblocks[j] # new bbox added # check if some remaining bbox is contained in temp check = can_extend(temp, bb, bboxes) if check == False: nblocks.append(bb) else: nblocks[j] = temp bboxes[i] = None # do some elementary cleaning nblocks = clean_nblocks(nblocks) # return identified text bboxes return nblocks if __name__ == "__main__": """Only for debugging purposes, currently. Draw red borders around the returned text bboxes and insert the bbox number. Then save the file under the name "input-blocks.pdf". """ # get the file name filename = sys.argv[1] # check if footer margin is given if len(sys.argv) > 2: footer_margin = int(sys.argv[2]) else: # use default vaue footer_margin = 50 # check if header margin is given if len(sys.argv) > 3: header_margin = int(sys.argv[3]) else: # use default vaue header_margin = 50 # open document doc = fitz.open(filename) # iterate over the pages for page in doc: # remove any geometry issues page.wrap_contents() # get the text bboxes bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin) # prepare a canvas to draw rectangles and text shape = page.new_shape() # iterate over the bboxes for i, rect in enumerate(bboxes): shape.draw_rect(rect) # draw a border # write sequence number shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"]) # finish drawing / text with color red shape.finish(color=fitz.pdfcolor["red"]) shape.commit() # store to the page # save document with text bboxes doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))