|
""" |
|
This is an advanced PyMuPDF utility for detecting multi-column pages. |
|
It can be used in a shell script, or its main function can be imported and |
|
invoked as described below. |
|
|
|
Features |
|
--------- |
|
- Identify text belonging to (a variable number of) columns on the page. |
|
- Text with different background color is handled separately, allowing for |
|
easier treatment of side remarks, comment boxes, etc. |
|
- Uses text block detection capability to identify text blocks and |
|
uses the block bboxes as primary structuring principle. |
|
- Supports ignoring headers and footers via header and footer margin parameters. |
|
- Returns re-created text boundary boxes (integer coordinates), sorted ascending |
|
by the top, then by the left coordinates. |
|
|
|
Restrictions |
|
------------- |
|
- Only supporting horizontal, left-to-right text |
|
- Returns a list of text boundary boxes - not the text itself. The caller is |
|
expected to extract text from within the returned boxes. |
|
- Text written above images is ignored altogether (option). |
|
- This utility works as expected in most cases. The following situations cannot |
|
be handled correctly: |
|
* overlapping (non-disjoint) text blocks |
|
* image captions are not recognized and are handled like normal text |
|
|
|
Usage |
|
------ |
|
- As a CLI shell command use |
|
|
|
python multi_column.py input.pdf footer_margin header_margin |
|
|
|
Where footer margin and header margin are the heights of the bottom and top |
|
stripes to ignore on each page. Both are optional and default to 50. |
|
This code is intended to be modified according to your need. |
|
|
|
- Use in a Python script as follows: |
|
|
|
---------------------------------------------------------------------------------- |
|
from multi_column import column_boxes |
|
|
|
# for each page execute |
|
bboxes = column_boxes(page, footer_margin=50, no_image_text=True) |
|
|
|
# bboxes is a list of fitz.IRect objects, that are sorted ascending by their y0, |
|
# then x0 coordinates. Their text content can be extracted by all PyMuPDF |
|
# get_text() variants, like for instance the following: |
|
for rect in bboxes: |
|
print(page.get_text(clip=rect, sort=True)) |
|
---------------------------------------------------------------------------------- |
|
""" |
|
import sys |
|
from magic_pdf.libs.commons import fitz |
|
|
|
|
|
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column.

    Text blocks of the page are reduced to integer rectangles, widened to the
    right page border where nothing is in the way, and then joined with
    horizontally overlapping neighbours to form column-like rectangles.

    Args:
        page: the PyMuPDF page to analyze.
        footer_margin: height of the stripe at the page bottom to ignore.
        header_margin: height of the stripe at the page top to ignore.
        no_image_text: if true, text blocks lying completely inside an image
            rectangle are skipped.

    Returns:
        A list of rectangles wrapping the detected text columns. The list is
        empty if no usable horizontal text was found.
    """
    # Bboxes of text blocks whose first line is not horizontal left-to-right.
    # Horizontal text boxes must never grow into them; can_extend() reads
    # this list through its closure.
    vert_bboxes = []

    # Relevant page area: the page rectangle without header / footer stripes.
    # '+rect' is used to work on a copy instead of page.rect itself.
    clip = +page.rect
    clip.y1 -= footer_margin
    clip.y0 += header_margin

    def intersects_bboxes(bb, bboxes):
        """Return True if any item of 'bboxes' intersects 'bb', else False."""
        return any(not (bb & bbox).is_empty for bbox in bboxes)

    def in_bbox(bb, bboxes):
        """Return the 1-based index of the first bbox containing 'bb', else 0."""
        for number, bbox in enumerate(bboxes, start=1):
            if bb in bbox:
                return number
        return 0

    def can_extend(temp, bb, bboxlist):
        """Decide whether the enlarged rectangle 'temp' is acceptable.

        Args:
            temp: the candidate (enlarged) rectangle.
            bb: the rectangle that 'temp' was derived from; items of
                'bboxlist' equal to it are not counted as collisions.
            bboxlist: rectangles that 'temp' must not intersect. Items may be
                None if they have been removed.

        Returns:
            True if 'bboxlist' is empty, or if 'temp' intersects neither a
            vertical text bbox nor any item of 'bboxlist' other than 'bb'.
        """
        if not bboxlist:
            return True
        # This check does not depend on the list items, so do it only once.
        if intersects_bboxes(temp, vert_bboxes):
            return False
        return all(
            b is None or b == bb or (temp & b).is_empty for b in bboxlist
        )

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check; modified in place.
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes of vector drawings
                (e.g. background colors)
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list) bboxes of images
        Returns:
            Potentially modified bboxes, without any None items.
        """
        # Everything an enlarged bbox must not cut through (loop-invariant).
        obstacles = path_bboxes + vert_bboxes + img_bboxes

        for i, bb in enumerate(bboxes):
            # Do not extend text lying inside a drawing or inside an image.
            if in_bbox(bb, path_bboxes) or in_bbox(bb, img_bboxes):
                continue

            # 'temp' is a copy of bb reaching to the right page border.
            temp = +bb
            temp.x1 = width

            # Do not cut through drawings, vertical text or images ...
            if intersects_bboxes(temp, obstacles):
                continue

            # ... and do not run into any other text bbox.
            if can_extend(temp, bb, bboxes):
                bboxes[i] = temp  # replace with the enlarged bbox

        return [b for b in bboxes if b is not None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning.

        Removes blocks equal to their predecessor and sorts runs of
        consecutive blocks with almost the same bottom coordinate by their
        left coordinate. The list is modified in place and returned.
        """
        if len(nblocks) < 2:
            return nblocks

        # 1. Remove blocks that equal their predecessor. The loop stops at
        # index 1: index 0 has no predecessor, and nblocks[0 - 1] would wrap
        # around to the last item. Comparing a single remaining block with
        # itself would delete it and leave the list empty.
        for i in range(len(nblocks) - 1, 0, -1):
            if nblocks[i - 1] == nblocks[i]:
                del nblocks[i]

        # 2. Repair the sequence: consecutive bboxes whose bottom differs by
        # at most 10 from the bottom of the segment start form a segment and
        # are sorted ascending by x0.
        y1 = nblocks[0].y1  # bottom coordinate of the current segment start
        i0 = 0  # index of the current segment start
        i1 = -1  # index of the last bbox seen

        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # a different bottom: segment ends
                if i1 > i0:  # segment longer than one item: sort it
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # a new segment starts here
                i0 = i
            i1 = i
        if i1 > i0:  # the last segment is still waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # Bboxes of the vector drawings, sorted by ascending top, then left.
    path_bboxes = sorted(
        (p["rect"].irect for p in page.get_drawings()),
        key=lambda b: (b.y0, b.x0),
    )

    # Bboxes of the images on the page; their order does not matter.
    img_bboxes = []
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # Text blocks inside the relevant page area.
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, setting non-horizontal text aside.
    bboxes = []
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])

        # Optionally ignore text lying inside an image.
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # The first line decides: only direction (1, 0) is accepted here.
        if b["lines"][0]["dir"] != (1, 0):
            vert_bboxes.append(bbox)
            continue

        # Rebuild the block bbox from those lines whose stripped text is
        # longer than one character.
        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            text = "".join(s["text"].strip() for s in line["spans"])
            if len(text) > 1:
                srect |= fitz.IRect(line["bbox"])

        if not srect.is_empty:
            bboxes.append(srect)

    # Sort by containing drawing (0 = none), then by top, then by left.
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible.
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # Nothing left to structure.
    if not bboxes:
        return []

    # ------------------------------------------------------------------
    # Join bboxes to establish some column structure.
    # ------------------------------------------------------------------
    nblocks = [bboxes[0]]  # the final blocks, pre-filled with the first bbox
    bboxes = bboxes[1:]  # the remaining old bboxes

    for i, bb in enumerate(bboxes):
        check = False  # becomes True once bb can be joined to a new block
        bb_background = in_bbox(bb, path_bboxes)

        # Look for a new block that bb can extend.
        for j, nbb in enumerate(nblocks):
            # Never join rectangles without horizontal overlap.
            if bb is None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # Never join across different containing drawings.
            if in_bbox(nbb, path_bboxes) != bb_background:
                continue

            temp = bb | nbb  # tentative extension of the new block
            check = can_extend(temp, nbb, nblocks)
            if check:
                break

        if not check:
            # bb extends none of the new blocks, so it becomes its own block.
            nblocks.append(bb)
            j = len(nblocks) - 1
            temp = nblocks[j]

        # Accept 'temp' only if it collides with none of the old bboxes still
        # waiting to be processed. Otherwise append bb (again); duplicates
        # created this way are removed by clean_nblocks().
        if can_extend(temp, bb, bboxes):
            nblocks[j] = temp
        else:
            nblocks.append(bb)
        bboxes[i] = None  # mark this old bbox as processed

    # Do some elementary cleaning and return the identified text bboxes.
    return clean_nblocks(nblocks)
|
|
|
|
|
if __name__ == "__main__":
    # Only for debugging purposes, currently.
    #
    # Draw red borders around the returned text bboxes and insert
    # the bbox number.
    # Then save the file under the name "input-blocks.pdf".
    import os

    if len(sys.argv) < 2:
        sys.exit(
            "Usage: python multi_column.py input.pdf [footer_margin [header_margin]]"
        )

    # The file name, followed by the optional footer and header margins.
    filename = sys.argv[1]
    footer_margin = int(sys.argv[2]) if len(sys.argv) > 2 else 50
    header_margin = int(sys.argv[3]) if len(sys.argv) > 3 else 50

    doc = fitz.open(filename)

    for page in doc:
        page.wrap_contents()

        # Get the text bboxes of this page.
        bboxes = column_boxes(
            page, footer_margin=footer_margin, header_margin=header_margin
        )

        # Canvas for drawing the rectangles and their numbers.
        shape = page.new_shape()

        for i, rect in enumerate(bboxes):
            shape.draw_rect(rect)  # the border
            # The sequence number, slightly inside the top-left corner.
            shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"])

        shape.finish(color=fitz.pdfcolor["red"])
        shape.commit()

    # Insert "-blocks" in front of the file extension only. A plain
    # str.replace(".pdf", ...) would also rewrite ".pdf" inside directory
    # names, and would leave names like "x.PDF" unchanged, so that the output
    # would target the input file itself.
    stem, ext = os.path.splitext(filename)
    doc.ez_save(stem + "-blocks" + (ext or ".pdf"))