|
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap |
|
import collections |
|
|
|
|
|
|
|
def is_below(bbox1, bbox2):
    """Report whether ``bbox1`` starts strictly after ``bbox2`` ends vertically.

    Compares index 1 of ``bbox1`` with index 3 of ``bbox2``; boxes that merely
    touch (equal values) do not count.

    NOTE(review): assumes boxes are ``(x0, y0, x1, y1)`` with y growing
    downward, so "greater y" means "lower on the page" - confirm with callers.

    :param bbox1: indexable box whose element 1 is its top edge
    :param bbox2: indexable box whose element 3 is its bottom edge
    :return: True when ``bbox1[1] > bbox2[3]``
    """
    top_of_first = bbox1[1]
    bottom_of_second = bbox2[3]
    return top_of_first > bottom_of_second
|
|
|
|
|
def merge_bboxes(bboxes):
    """Compute the smallest box enclosing every box in ``bboxes``.

    The result takes the minimum of elements 0 and 1 and the maximum of
    elements 2 and 3 across all input boxes.

    :param bboxes: collection of indexable boxes; it is traversed once per
        coordinate, and an empty collection makes ``min`` raise ``ValueError``
    :return: a new list ``[x0, y0, x1, y1]``
    """
    merged = []
    # Coordinate i is reduced with the i-th function: min for the top-left
    # corner, max for the bottom-right corner.
    for index, reducer in enumerate((min, min, max, max)):
        merged.append(reducer(box[index] for box in bboxes))
    return merged
|
|
|
|
|
def _collect_text_block_stats(text_blocks):
    """Summarise font sizes and dominant fonts of the given text blocks.

    :param text_blocks: blocks with a ``'lines'`` list; each line has a
        ``'spans'`` list of span dicts
    :return: ``(line_sizes, block_stats)`` where ``line_sizes`` holds the mean
        span size of every line that has at least one span with a ``'size'``,
        and ``block_stats`` holds ``(block, mean_line_size, dominant_font)``
        for every block that contributed at least one such line.
        ``dominant_font`` is ``None`` when the block has no span with both a
        ``'font'`` and non-empty ``'text'``.
    """
    line_sizes = []
    block_stats = []
    for block in text_blocks:
        block_line_sizes = []
        font_weights = collections.Counter()
        for line in block['lines']:
            spans = line['spans']
            span_sizes = [span['size'] for span in spans if 'size' in span]
            if span_sizes:
                line_size = sum(span_sizes) / len(span_sizes)
                line_sizes.append(line_size)
                block_line_sizes.append(line_size)
            # Weight every font by the number of characters set in it.
            for span in spans:
                if 'font' in span and len(span['text']) > 0:
                    font_weights[span['font']] += len(span['text'])
        if not block_line_sizes:
            continue
        block_size = sum(block_line_sizes) / len(block_line_sizes)
        # A block can have sized spans without any usable font information;
        # indexing most_common(1) blindly would raise IndexError there.
        dominant_font = font_weights.most_common(1)[0][0] if font_weights else None
        block_stats.append((block, block_size, dominant_font))
    return line_sizes, block_stats


def _looks_like_main_text(block, block_size, block_font, main_text_size, main_text_font):
    """Return True when at least two of three body-text criteria hold.

    Criteria: the block's mean line size is at least ``main_text_size``, the
    block has five or more lines, and its dominant font is ``main_text_font``.
    """
    votes = (
        block_size >= main_text_size,
        len(block['lines']) >= 5,
        # An unknown font (None) never counts as a match.
        block_font is not None and block_font == main_text_font,
    )
    return sum(votes) >= 2


def merge_footnote_blocks(page_info, main_text_font):
    """Merge candidate footnote boxes with the text below them, per layout.

    For every entry of ``page_info['layout_bboxes']`` the candidate boxes from
    ``page_info['footnote_bboxes_tmp']`` and the text blocks from
    ``page_info['preproc_blocks']`` that lie inside the layout are collected.
    A candidate is rejected when a block that looks like main text (see
    ``_looks_like_main_text``) lies below it. The top-most surviving candidate
    is merged with every measured text block below it, and the resulting box
    is appended to ``page_info['merged_bboxes']``.

    A layout is skipped when it has no candidates, no text blocks, or no span
    carrying a ``'size'`` (the latter used to raise ``IndexError``).

    :param page_info: page dict; ``'merged_bboxes'`` is (re)initialised to an
        empty list and then filled in place
    :param main_text_font: font name compared against each block's dominant font
    :return: the same ``page_info`` object
    """
    page_info['merged_bboxes'] = []
    for layout in page_info['layout_bboxes']:
        footnote_bboxes = [bbox for bbox in page_info['footnote_bboxes_tmp']
                           if _is_in(bbox, layout['layout_bbox'])]
        if not footnote_bboxes:
            continue

        text_blocks = [block for block in page_info['preproc_blocks']
                       if _is_in(block['bbox'], layout['layout_bbox'])]
        if not text_blocks:
            continue

        line_sizes, block_stats = _collect_text_block_stats(text_blocks)
        if not line_sizes:
            # No measurable text: the main text size is undefined, so treat the
            # layout like one without text blocks instead of crashing.
            continue
        # The most frequent line size is taken as the main text size.
        main_text_size = collections.Counter(line_sizes).most_common(1)[0][0]

        # A candidate with main-text-like content below it is not a footnote.
        confirmed_bboxes = [
            footnote_bbox for footnote_bbox in footnote_bboxes
            if not any(
                is_below(block['bbox'], footnote_bbox)
                and _looks_like_main_text(block, size, font, main_text_size, main_text_font)
                for block, size, font in block_stats
            )
        ]
        if not confirmed_bboxes:
            continue

        top_footnote_bbox = min(confirmed_bboxes, key=lambda bbox: bbox[1])
        bboxes_below = [block['bbox'] for block, _size, _font in block_stats
                        if is_below(block['bbox'], top_footnote_bbox)]
        page_info['merged_bboxes'].append(merge_bboxes([top_footnote_bbox] + bboxes_below))
    return page_info
|
|
|
|
|
def remove_footnote_blocks(page_info):
    """Drop text and image blocks covered by the page's merged footnote boxes.

    When ``page_info['merged_bboxes']`` is present and non-empty, the matching
    entries are removed from ``page_info['preproc_blocks']`` and
    ``page_info['images']`` and appended to ``page_info['droped_text_block']``
    and ``page_info['droped_image_block']`` respectively.

    The temporary keys ``'merged_bboxes'`` and ``'footnote_bboxes_tmp'`` are
    always removed afterwards. Missing keys are ignored, matching the
    ``.get`` guard above (a bare ``del`` would raise ``KeyError``).

    :param page_info: page dict, modified in place
    :return: the same ``page_info`` object
    """
    merged_bboxes = page_info.get('merged_bboxes')
    if merged_bboxes:
        remain_text_blocks, removed_text_blocks = remove_footnote_text(
            page_info['preproc_blocks'], merged_bboxes)
        image_blocks, removed_image_blocks = remove_footnote_image(
            page_info['images'], merged_bboxes)

        page_info['preproc_blocks'] = remain_text_blocks
        page_info['images'] = image_blocks
        page_info['droped_text_block'].extend(removed_text_blocks)
        page_info['droped_image_block'].extend(removed_image_blocks)

    # Temporary bookkeeping must not leak into later pipeline stages.
    page_info.pop('merged_bboxes', None)
    page_info.pop('footnote_bboxes_tmp', None)
    return page_info
|
|
|
|
|
def remove_footnote_text(raw_text_block, footnote_bboxes):
    """Split the page's text blocks into kept blocks and footnote blocks.

    A block counts as footnote text when ``_is_in_or_part_overlap`` is true for
    its ``'bbox'`` against any box in ``footnote_bboxes``. Such blocks get
    ``block['tag'] = 'footnote'`` and are removed from ``raw_text_block``.

    :param raw_text_block: list of text block dicts of the current page, each
        with a ``'bbox'`` entry; modified in place
    :param footnote_bboxes: list of footnote bounding boxes of the current page
    :return: ``(raw_text_block, footnote_text_blocks)`` - the same input list
        without the footnote blocks, and the removed blocks in page order
    """
    kept_blocks = []
    footnote_text_blocks = []
    for block in raw_text_block:
        text_bbox = block['bbox']
        if any(_is_in_or_part_overlap(text_bbox, footnote_bbox)
               for footnote_bbox in footnote_bboxes):
            block['tag'] = 'footnote'
            footnote_text_blocks.append(block)
        else:
            kept_blocks.append(block)

    if footnote_text_blocks:
        # One in-place rewrite instead of a linear list.remove() per block;
        # callers holding a reference to the list still see the update.
        raw_text_block[:] = kept_blocks

    return raw_text_block, footnote_text_blocks
|
|
|
|
|
def remove_footnote_image(image_blocks, footnote_bboxes):
    """Split the page's image blocks into kept images and footnote images.

    An image counts as a footnote image when ``_is_in`` is true for its
    ``'bbox'`` against any box in ``footnote_bboxes``; such images are removed
    from ``image_blocks``.

    :param image_blocks: list of image block dicts of the current page, each
        with a ``'bbox'`` entry; modified in place
    :param footnote_bboxes: list of footnote bounding boxes of the current page
    :return: ``(image_blocks, footnote_imgs_blocks)`` - the same input list
        without the footnote images, and the removed images in page order
    """
    kept_images = []
    footnote_imgs_blocks = []
    for image_block in image_blocks:
        if any(_is_in(image_block['bbox'], footnote_bbox)
               for footnote_bbox in footnote_bboxes):
            footnote_imgs_blocks.append(image_block)
        else:
            kept_images.append(image_block)

    if footnote_imgs_blocks:
        # One in-place rewrite instead of a linear list.remove() per image;
        # callers holding a reference to the list still see the update.
        image_blocks[:] = kept_images

    return image_blocks, footnote_imgs_blocks