MinerU / magic_pdf /layout /layout_spiler_recog.py
derful's picture
Upload folder using huggingface_hub
240e0a0 verified
"""
找到能分割布局的水平的横线、色块
"""
import os
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def __rect_filter_by_width(rect, page_w, page_h):
mid_x = page_w/2
if rect[0]< mid_x < rect[2]:
return True
return False
def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
"""
不能出现在table和image的位置
"""
for box in image_bboxes:
if _is_in_or_part_overlap(rect, box):
return False
for box in table_bboxes:
if _is_in_or_part_overlap(rect, box):
return False
return True
def __debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
save_path = "./tmp/debug.pdf"
if os.path.exists(save_path):
# 删除已经存在的文件
os.remove(save_path)
# 创建一个新的空白 PDF 文件
doc = fitz.open('')
width = page.rect.width
height = page.rect.height
new_page = doc.new_page(width=width, height=height)
shape = new_page.new_shape()
for bbox in bboxes1:
# 原始box画上去
rect = fitz.Rect(*bbox[0:4])
shape = new_page.new_shape()
shape.draw_rect(rect)
shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
shape.finish()
shape.commit()
for bbox in bboxes2:
# 原始box画上去
rect = fitz.Rect(*bbox[0:4])
shape = new_page.new_shape()
shape.draw_rect(rect)
shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
shape.finish()
shape.commit()
for bbox in bboxes3:
# 原始box画上去
rect = fitz.Rect(*bbox[0:4])
shape = new_page.new_shape()
shape.draw_rect(rect)
shape.finish(color=fitz.pdfcolor['red'], fill=None)
shape.finish()
shape.commit()
parent_dir = os.path.dirname(save_path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
doc.save(save_path)
doc.close()
def get_spilter_of_page(page, image_bboxes, table_bboxes):
"""
获取到色块和横线
"""
cdrawings = page.get_cdrawings()
spilter_bbox = []
for block in cdrawings:
if 'fill' in block:
fill = block['fill']
if 'fill' in block and block['fill'] and block['fill']!=(1.0,1.0,1.0):
rect = block['rect']
if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
spilter_bbox.append(list(rect))
"""过滤、修正一下这些box。因为有时候会有一些矩形,高度为0或者为负数,造成layout计算无限循环。如果是负高度或者0高度,统一修正为高度为1"""
for box in spilter_bbox:
if box[3]-box[1] <= 0:
box[3] = box[1] + 1
#__debug_show_page(page, spilter_bbox, [], [])
return spilter_bbox