File size: 3,067 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Find the horizontal rules and filled color blocks that can be used to split the page layout.
"""

import os
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.boxbase import _is_in_or_part_overlap


def __rect_filter_by_width(rect, page_w, page_h):
    """
    Tell whether a rectangle crosses the vertical center line of the page.

    Args:
        rect: sequence whose items 0 and 2 are the left and right x coordinates.
        page_w: page width; half of it is the center line position.
        page_h: page height; accepted for signature compatibility, not used.

    Returns:
        True when the left edge lies strictly left of the center line and the
        right edge lies strictly right of it, False otherwise.
    """
    left, right = rect[0], rect[2]
    return left < page_w / 2 < right


def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
    """
    Reject rectangles that sit where an image or a table is located.

    Args:
        rect: candidate rectangle.
        image_bboxes: iterable of image bounding boxes.
        table_bboxes: iterable of table bounding boxes.

    Returns:
        False as soon as `_is_in_or_part_overlap(rect, box)` is truthy for any
        image box or, after that, any table box; True when none matches.
    """
    # Image boxes are checked first; table boxes are only consulted when no
    # image box matched.
    if any(_is_in_or_part_overlap(rect, img_box) for img_box in image_bboxes):
        return False
    return not any(_is_in_or_part_overlap(rect, tbl_box) for tbl_box in table_bboxes)


def __debug_show_page(page, bboxes1: list, bboxes2: list, bboxes3: list,) -> None:
    """
    Draw three groups of boxes on a blank page and save it as a debug PDF.

    A new single-page document with the same width and height as `page` is
    created, every box is drawn as a rectangle, and the result is written to
    "./tmp/debug.pdf" (an existing file at that path is removed first).

    Args:
        page: page object; only `page.rect.width` and `page.rect.height` are read.
        bboxes1: boxes drawn with a red outline and a 20%-opaque blue fill.
        bboxes2: boxes drawn without outline and with a 20%-opaque yellow fill.
        bboxes3: boxes drawn with a red outline and no fill.
            For all groups only the first four items of each box are used.
    """
    save_path = "./tmp/debug.pdf"
    if os.path.exists(save_path):
        # Remove the file left over from a previous run.
        os.remove(save_path)

    # Create a new, empty PDF document.
    doc = fitz.open('')
    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        # One entry per box group: the boxes and the keyword arguments handed
        # to Shape.finish() for that group.
        layers = (
            (bboxes1, {"color": fitz.pdfcolor['red'], "fill": fitz.pdfcolor['blue'], "fill_opacity": 0.2}),
            (bboxes2, {"color": None, "fill": fitz.pdfcolor['yellow'], "fill_opacity": 0.2}),
            (bboxes3, {"color": fitz.pdfcolor['red'], "fill": None}),
        )
        for bboxes, style in layers:
            for bbox in bboxes:
                # Each box gets its own shape so it is styled and committed
                # independently of the others.
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
                shape.commit()

        # exist_ok avoids the check-then-create race on the output directory.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
    
def get_spilter_of_page(page, image_bboxes, table_bboxes):
    """
    Collect the color blocks and horizontal rules of a page.

    Every entry of `page.get_cdrawings()` is kept when it has a truthy 'fill'
    that is not (1.0, 1.0, 1.0), its 'rect' passes `__rect_filter_by_width`
    and it passes `__rect_filter_by_pos` against the image and table boxes.

    Args:
        page: page object providing `get_cdrawings()` and `rect.width` /
            `rect.height`.
        image_bboxes: image bounding boxes forwarded to `__rect_filter_by_pos`.
        table_bboxes: table bounding boxes forwarded to `__rect_filter_by_pos`.

    Returns:
        A list with one `list(rect)` per kept drawing, in drawing order. Boxes
        whose height (item 3 minus item 1) is zero or negative have item 3 set
        to item 1 + 1.
    """
    white = (1.0, 1.0, 1.0)
    separators = []
    for drawing in page.get_cdrawings():
        fill_color = drawing.get('fill')
        if not fill_color or fill_color == white:
            # Unfilled or white-filled drawings are skipped.
            continue

        bounds = drawing['rect']
        if not __rect_filter_by_width(bounds, page.rect.width, page.rect.height):
            continue
        if not __rect_filter_by_pos(bounds, image_bboxes, table_bboxes):
            continue

        box = list(bounds)
        # Some rectangles come with a zero or negative height, which makes the
        # layout computation loop forever; force those to a height of 1.
        if box[3] - box[1] <= 0:
            box[3] = box[1] + 1
        separators.append(box)

    # Debug aid: __debug_show_page(page, separators, [], [])

    return separators