Spaces:

derful
/

MinerU

Runtime error

App Files Files Community

MinerU / magic_pdf /layout /mcol_sort.py

derful

Upload folder using huggingface_hub

240e0a0 verified 12 months ago

raw

history blame contribute delete

11.3 kB

	"""
	This is an advanced PyMuPDF utility for detecting multi-column pages.
	It can be used in a shell script, or its main function can be imported and
	invoked as described below.

	Features
	---------
	- Identify text belonging to (a variable number of) columns on the page.
	- Text with different background color is handled separately, allowing for
	easier treatment of side remarks, comment boxes, etc.
	- Uses text block detection capability to identify text blocks and
	uses the block bboxes as primary structuring principle.
	- Supports ignoring footers via a footer margin parameter.
	- Returns re-created text boundary boxes (integer coordinates), sorted ascending
	by the top, then by the left coordinates.

	Restrictions
	-------------
	- Only supporting horizontal, left-to-right text
	- Returns a list of text boundary boxes - not the text itself. The caller is
	expected to extract text from within the returned boxes.
	- Text written above images is ignored altogether (option).
	- This utility works as expected in most cases. The following situation cannot
	be handled correctly:
	* overlapping (non-disjoint) text blocks
	* image captions are not recognized and are handled like normal text

	Usage
	------
	- As a CLI shell command use

	python multi_column.py input.pdf footer_margin

	Where footer margin is the height of the bottom stripe to ignore on each page.
	This code is intended to be modified according to your need.

	- Use in a Python script as follows:

	----------------------------------------------------------------------------------
	from multi_column import column_boxes

	# for each page execute
	bboxes = column_boxes(page, footer_margin=50, no_image_text=True)

	# bboxes is a list of fitz.IRect objects that are sorted ascending by their y0,
	# then x0 coordinates. Their text content can be extracted by all PyMuPDF
	# get_text() variants, like for instance the following:
	for rect in bboxes:
	print(page.get_text(clip=rect, sort=True))
	----------------------------------------------------------------------------------
	"""
	import sys
	from magic_pdf.libs.commons import fitz


	def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
	    """Determine the bboxes which wrap the text columns of a page.

	    Args:
	        page: the PyMuPDF page to analyse.
	        footer_margin: height of the bottom stripe of the page to ignore.
	        header_margin: height of the top stripe of the page to ignore.
	        no_image_text: if True, skip text blocks that lie inside the bbox
	            of an image.

	    Returns:
	        A list of IRect objects wrapping the detected text columns, or an
	        empty list if no usable text block was found.
	    """
	    # final list of horizontal text bboxes collected from the page
	    bboxes = []

	    # image bboxes
	    img_bboxes = []

	    # bboxes of non-horizontal text
	    # avoid when expanding horizontal text boxes
	    vert_bboxes = []

	    # compute relevant page area
	    clip = +page.rect  # unary plus makes a copy, page.rect stays untouched
	    clip.y1 -= footer_margin  # Remove footer area
	    clip.y0 += header_margin  # Remove header area

	    def in_bbox(bb, bboxes):
	        """Return 1-based number if a bbox contains bb, else return 0."""
	        for number, bbox in enumerate(bboxes, start=1):
	            if bb in bbox:
	                return number
	        return 0

	    def intersects_bboxes(bb, bboxes):
	        """Return True if a bbox intersects bb, else return False."""
	        return any(not (bb & bbox).is_empty for bbox in bboxes)

	    def can_extend(temp, bb, bboxlist):
	        """Determines whether rectangle 'temp' can be extended by 'bb'
	        without intersecting any of the rectangles contained in 'bboxlist'.

	        Items of bboxlist may be None if they have been removed.
	        Items equal to 'bb' are not counted as intersections.

	        Returns:
	            True if 'temp' intersects neither vertical text nor any other
	            item of 'bboxlist' (always True for an empty 'bboxlist').
	        """
	        if not bboxlist:
	            return True

	        # the vertical-text test does not depend on the list item,
	        # so evaluate it once instead of once per item
	        if intersects_bboxes(temp, vert_bboxes):
	            return False

	        return all(
	            b is None or b == bb or (temp & b).is_empty for b in bboxlist
	        )

	    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
	        """Extend a bbox to the right page border.

	        Whenever there is no text to the right of a bbox, enlarge it up
	        to the right page border.

	        Args:
	            bboxes: (list[IRect]) bboxes to check
	            width: (int) page width
	            path_bboxes: (list[IRect]) bboxes with a background color
	            vert_bboxes: (list[IRect]) bboxes with vertical text
	            img_bboxes: (list[IRect]) bboxes of images
	        Returns:
	            Potentially modified bboxes.
	        """
	        # none of the three lists changes inside the loop: join them once
	        obstacles = path_bboxes + vert_bboxes + img_bboxes

	        for i, bb in enumerate(bboxes):
	            # do not extend text with background color
	            if in_bbox(bb, path_bboxes):
	                continue

	            # do not extend text in images
	            if in_bbox(bb, img_bboxes):
	                continue

	            # temp extends bb to the right page border
	            temp = +bb
	            temp.x1 = width

	            # do not cut through colored background or images
	            if intersects_bboxes(temp, obstacles):
	                continue

	            # also, do not intersect other text bboxes
	            if can_extend(temp, bb, bboxes):
	                bboxes[i] = temp  # replace with enlarged bbox

	        return [b for b in bboxes if b is not None]

	    def clean_nblocks(nblocks):
	        """Do some elementary cleaning.

	        Removes consecutive duplicate bboxes and re-sorts runs of bboxes
	        with almost the same bottom coordinate by their left coordinate.
	        The list is modified in place and also returned.
	        """
	        # 1. remove any duplicate blocks.
	        if len(nblocks) < 2:
	            return nblocks

	        # Walk backwards so deletions do not shift indices still to visit.
	        # Stop at index 1: index 0 has no predecessor, and comparing it with
	        # nblocks[-1] would wrap around to the last item (or to itself).
	        for i in range(len(nblocks) - 1, 0, -1):
	            if nblocks[i - 1] == nblocks[i]:
	                del nblocks[i]

	        # 2. repair sequence in special cases:
	        # consecutive bboxes with almost same bottom value are sorted ascending
	        # by x-coordinate.
	        y1 = nblocks[0].y1  # first bottom coordinate
	        i0 = 0  # its index
	        i1 = -1  # index of last bbox with same bottom

	        # Iterate over bboxes, identifying segments with approx. same bottom value.
	        # Replace every segment by its sorted version.
	        for i in range(1, len(nblocks)):
	            b1 = nblocks[i]
	            if abs(b1.y1 - y1) > 10:  # different bottom
	                if i1 > i0:  # segment length > 1? Sort it!
	                    nblocks[i0 : i1 + 1] = sorted(
	                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
	                    )
	                y1 = b1.y1  # store new bottom value
	                i0 = i  # store its start index
	            i1 = i  # store current index
	        if i1 > i0:  # segment waiting to be sorted
	            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
	        return nblocks

	    # extract vector graphics: their rectangles are treated as backgrounds
	    path_bboxes = [p["rect"].irect for p in page.get_drawings()]

	    # sort path bboxes by ascending top, then left coordinates
	    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

	    # bboxes of images on page, no need to sort them
	    for item in page.get_images():
	        img_bboxes.extend(page.get_image_rects(item[0]))

	    # blocks of text on page
	    blocks = page.get_text(
	        "dict",
	        flags=fitz.TEXTFLAGS_TEXT,
	        clip=clip,
	    )["blocks"]

	    # Make block rectangles, ignoring non-horizontal text
	    for b in blocks:
	        bbox = fitz.IRect(b["bbox"])  # bbox of the block

	        # ignore text written upon images
	        if no_image_text and in_bbox(bbox, img_bboxes):
	            continue

	        # confirm first line to be horizontal
	        line0 = b["lines"][0]  # get first line
	        if line0["dir"] != (1, 0):  # only accept horizontal text
	            vert_bboxes.append(bbox)
	            continue

	        # rebuild the block bbox as the union of its relevant line bboxes
	        srect = fitz.EMPTY_IRECT()
	        for line in b["lines"]:
	            lbbox = fitz.IRect(line["bbox"])
	            text = "".join([s["text"].strip() for s in line["spans"]])
	            # NOTE(review): lines whose stripped text has fewer than two
	            # characters are skipped - confirm this threshold is intended
	            if len(text) > 1:
	                srect |= lbbox
	        bbox = +srect

	        if not bbox.is_empty:
	            bboxes.append(bbox)

	    # Sort text bboxes by ascending background, top, then left coordinates
	    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

	    # Extend bboxes to the right where possible
	    bboxes = extend_right(
	        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
	    )

	    # immediately return if no text found
	    if not bboxes:
	        return []

	    # --------------------------------------------------------------------
	    # Join bboxes to establish some column structure
	    # --------------------------------------------------------------------
	    # the final block bboxes on page
	    nblocks = [bboxes[0]]  # pre-fill with first bbox
	    bboxes = bboxes[1:]  # remaining old bboxes

	    for i, bb in enumerate(bboxes):  # iterate old bboxes
	        check = False  # indicates unwanted joins

	        # check if bb can extend one of the new blocks
	        for j, nbb in enumerate(nblocks):
	            # never join across columns
	            if bb is None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
	                continue

	            # never join across different background colors
	            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
	                continue

	            temp = bb | nbb  # temporary extension of new block
	            check = can_extend(temp, nbb, nblocks)
	            if check:
	                break

	        if not check:  # bb cannot be used to extend any of the new bboxes
	            nblocks.append(bb)  # so add it to the list
	            j = len(nblocks) - 1  # index of it
	            temp = nblocks[j]  # new bbox added

	        # check if some remaining bbox is contained in temp
	        # NOTE(review): when bb was just appended above and this check
	        # fails, bb is appended a second time; clean_nblocks() removes
	        # consecutive duplicates afterwards - confirm this is intended
	        check = can_extend(temp, bb, bboxes)
	        if not check:
	            nblocks.append(bb)
	        else:
	            nblocks[j] = temp
	        bboxes[i] = None  # mark this old bbox as consumed

	    # do some elementary cleaning
	    nblocks = clean_nblocks(nblocks)

	    # return identified text bboxes
	    return nblocks


	if __name__ == "__main__":
	    # Only for debugging purposes, currently.
	    #
	    # Draw red borders around the returned text bboxes and insert
	    # the bbox number.
	    # Then save the file next to the input, with "-blocks" inserted before
	    # the file extension (e.g. "input.pdf" -> "input-blocks.pdf").
	    import os

	    # the input file name is mandatory
	    if len(sys.argv) < 2:
	        sys.exit(
	            "usage: python "
	            + sys.argv[0]
	            + " input.pdf [footer_margin [header_margin]]"
	        )

	    # get the file name
	    filename = sys.argv[1]

	    # footer / header margins are optional, defaulting to 50
	    footer_margin = int(sys.argv[2]) if len(sys.argv) > 2 else 50
	    header_margin = int(sys.argv[3]) if len(sys.argv) > 3 else 50

	    # Build the output name from the extension only, so that ".pdf" inside
	    # a directory name is left alone and any extension spelling (".PDF")
	    # still yields a name different from the input file.
	    root, ext = os.path.splitext(filename)
	    outname = root + "-blocks" + ext

	    # open document; the context manager closes it when done
	    with fitz.open(filename) as doc:
	        # iterate over the pages
	        for page in doc:
	            # remove any geometry issues
	            page.wrap_contents()

	            # get the text bboxes
	            bboxes = column_boxes(
	                page, footer_margin=footer_margin, header_margin=header_margin
	            )

	            # prepare a canvas to draw rectangles and text
	            shape = page.new_shape()

	            # iterate over the bboxes
	            for i, rect in enumerate(bboxes):
	                shape.draw_rect(rect)  # draw a border

	                # write sequence number
	                shape.insert_text(
	                    rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"]
	                )

	            # finish drawing / text with color red
	            shape.finish(color=fitz.pdfcolor["red"])
	            shape.commit()  # store to the page

	        # save document with text bboxes
	        doc.ez_save(outname)