|
""" |
|
找到能分割布局的水平的横线、色块 |
|
""" |
|
|
|
import os |
|
from magic_pdf.libs.commons import fitz |
|
from magic_pdf.libs.boxbase import _is_in_or_part_overlap |
|
|
|
|
|
def __rect_filter_by_width(rect, page_w, page_h):
    """Tell whether ``rect`` straddles the vertical centre line of the page.

    Args:
        rect: Box indexable as ``(x0, y0, x1, y1)``; only ``rect[0]`` and
            ``rect[2]`` are read.
        page_w: Page width; the centre line sits at ``page_w / 2``.
        page_h: Page height. Accepted for signature compatibility but not
            used by this check.

    Returns:
        ``True`` when the centre line lies strictly between the left and
        right edges of ``rect``, ``False`` otherwise.
    """
    center_x = page_w / 2
    starts_left_of_center = rect[0] < center_x
    return starts_left_of_center and center_x < rect[2]
|
|
|
|
|
def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
    """Reject rectangles located where an image or a table is.

    Args:
        rect: Candidate box, passed unchanged to ``_is_in_or_part_overlap``.
        image_bboxes: Iterable of image boxes to test against.
        table_bboxes: Iterable of table boxes to test against.

    Returns:
        ``False`` as soon as ``_is_in_or_part_overlap(rect, box)`` is truthy
        for any image box or, failing that, any table box; ``True`` when no
        box matches (including when both iterables are empty).
    """
    # Images are checked first, then tables, stopping at the first hit.
    if any(_is_in_or_part_overlap(rect, img_box) for img_box in image_bboxes):
        return False
    touches_table = any(
        _is_in_or_part_overlap(rect, tbl_box) for tbl_box in table_bboxes
    )
    return not touches_table
|
|
|
|
|
def __debug_show_page(page, bboxes1: list, bboxes2: list, bboxes3: list):
    """Render three groups of boxes onto a blank page and save it for debugging.

    A new single-page PDF with the same width and height as ``page`` is
    written to ``./tmp/debug.pdf`` (any previous file at that path is removed
    first). Only the first four items of every box are used as the
    rectangle coordinates.

    Args:
        page: Source page; only ``page.rect.width`` and ``page.rect.height``
            are read.
        bboxes1: Boxes drawn with a red outline and a blue fill at 20% opacity.
        bboxes2: Boxes drawn with no outline and a yellow fill at 20% opacity.
        bboxes3: Boxes drawn with a red outline and no fill.
    """
    save_path = "./tmp/debug.pdf"
    if os.path.exists(save_path):
        os.remove(save_path)

    # Each group of boxes is paired with the keyword arguments handed to
    # Shape.finish(), replacing three copy-pasted drawing loops.
    layers = (
        (bboxes1, {"color": fitz.pdfcolor['red'],
                   "fill": fitz.pdfcolor['blue'],
                   "fill_opacity": 0.2}),
        (bboxes2, {"color": None,
                   "fill": fitz.pdfcolor['yellow'],
                   "fill_opacity": 0.2}),
        (bboxes3, {"color": fitz.pdfcolor['red'],
                   "fill": None}),
    )

    doc = fitz.open('')
    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        for bboxes, finish_kwargs in layers:
            for bbox in bboxes:
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                # A single finish() is enough: calling it again on a shape
                # with no pending drawing commands does nothing.
                shape.finish(**finish_kwargs)
                shape.commit()

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
|
|
def get_spilter_of_page(page, image_bboxes, table_bboxes):
    """Collect the colored blocks and horizontal rules that can split a page.

    A drawing returned by ``page.get_cdrawings()`` is kept when all of the
    following hold:

    * it has a truthy ``'fill'`` that is not pure white ``(1.0, 1.0, 1.0)``;
    * ``__rect_filter_by_width`` accepts its ``'rect'`` for the page size;
    * ``__rect_filter_by_pos`` accepts its ``'rect'`` against the given image
      and table boxes.

    Args:
        page: Page object providing ``get_cdrawings()`` and ``rect.width`` /
            ``rect.height``.
        image_bboxes: Image boxes forwarded to ``__rect_filter_by_pos``.
        table_bboxes: Table boxes forwarded to ``__rect_filter_by_pos``.

    Returns:
        A list of boxes, each a mutable list copy of the drawing's ``'rect'``,
        with every non-positive height corrected to 1.
    """
    white = (1.0, 1.0, 1.0)
    page_w = page.rect.width
    page_h = page.rect.height

    spilter_bbox = []
    for block in page.get_cdrawings():
        fill = block.get('fill')
        if not fill or fill == white:
            continue

        rect = block['rect']
        if not __rect_filter_by_width(rect, page_w, page_h):
            continue
        if not __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
            continue

        box = list(rect)
        # Some rectangles have zero or negative height, which makes the
        # layout computation loop forever; force such boxes to height 1.
        if box[3] - box[1] <= 0:
            box[3] = box[1] + 1
        spilter_bbox.append(box)

    return spilter_bbox
|
|