File size: 5,414 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import sys

from magic_pdf.libs.commons import fitz
from termcolor import cprint


# Switch stdout to UTF-8 so the non-ASCII messages printed by this module
# (see open_pdf) can be encoded on consoles with a legacy default encoding.
# ``reconfigure`` only exists on ``io.TextIOWrapper`` since Python 3.7, and
# stdout may have been replaced by a stream that lacks it (``io.StringIO``,
# a notebook stream, or ``None``), so check for the capability itself
# instead of relying on the interpreter's major version.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore


def open_pdf(pdf_path):
    """
    This function opens a PDF document through ``fitz.open``

    Parameters
    ----------
    pdf_path : str (presumably; anything ``fitz.open`` accepts)
        location of the PDF file

    Returns
    -------
    pdf_document
        the object returned by ``fitz.open(pdf_path)``

    Raises
    ------
    Exception
        any exception raised by ``fitz.open``; a message is printed to stdout
        first and the same exception object is then re-raised
    """
    try:
        opened = fitz.open(pdf_path)  # type: ignore
    except Exception as err:
        # Report the failure to the user, then let the caller deal with it.
        print(f"无法打开PDF文件:{pdf_path}。原因是:{err}")
        raise err
    else:
        return opened


def print_green_on_red(text):
    """
    This function prints text in bold green on a red background,
    followed by one blank line
    """
    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")


def print_green(text):
    """
    This function prints text in bold green, with an empty line before it
    and one blank line after it
    """
    print()  # leading empty line
    cprint(text, color="green", attrs=["bold"], end="\n\n")


def print_red(text):
    """
    This function prints text in bold red, with an empty line before it
    and one blank line after it
    """
    print()  # leading empty line
    cprint(text, color="red", attrs=["bold"], end="\n\n")


def print_yellow(text):
    """
    This function prints text in bold yellow, with an empty line before it
    and one blank line after it
    """
    print()  # leading empty line
    cprint(text, color="yellow", attrs=["bold"], end="\n\n")


def safe_get(dict_obj, key, default):
    """
    This function looks up key in dict_obj and falls back to default
    when the key is missing or its value is None

    Parameters
    ----------
    dict_obj : dict
        mapping to read from (only its ``get`` method is used)
    key : hashable
        key to look up
    default : any
        value returned when the lookup yields None

    Returns
    -------
    any
        the stored value, or default if that value is None or absent.
        Other falsy values (0, "", []) are returned unchanged.
    """
    found = dict_obj.get(key)
    return default if found is None else found


def is_bbox_overlap(bbox1, bbox2):
    """
    This function tells whether two bboxes overlap

    Boxes that merely share an edge or a corner count as overlapping,
    because only a strict gap between them rules an overlap out.

    Parameters
    ----------
    bbox1 : list
        first bbox as [x0, y0, x1, y1]
    bbox2 : list
        second bbox as [x0, y0, x1, y1]

    Returns
    -------
    bool
        True if bbox1 and bbox2 overlap, else False
    """
    left_a, top_a, right_a, bottom_a = bbox1
    left_b, top_b, right_b, bottom_b = bbox2

    # The boxes are disjoint as soon as one lies strictly beyond the other
    # on either axis; overlap is the negation of that.
    return not (
        left_a > right_b
        or left_b > right_a
        or top_a > bottom_b
        or top_b > bottom_a
    )


def is_in_bbox(bbox1, bbox2):
    """
    This function tells whether bbox1 lies inside bbox2

    Containment is inclusive: a box whose edges coincide with those of
    bbox2 is still considered to be inside it.

    Parameters
    ----------
    bbox1 : list
        candidate inner bbox as [x0, y0, x1, y1]
    bbox2 : list
        candidate outer bbox as [x0, y0, x1, y1]

    Returns
    -------
    bool
        True if bbox1 is in bbox2, else False
    """
    inner_x0, inner_y0, inner_x1, inner_y1 = bbox1
    outer_x0, outer_y0, outer_x1, outer_y1 = bbox2

    return (
        inner_x0 >= outer_x0
        and inner_y0 >= outer_y0
        and inner_x1 <= outer_x1
        and inner_y1 <= outer_y1
    )


def calculate_para_bbox(lines):
    """
    This function computes the smallest bbox enclosing all lines of a paragraph

    Parameters
    ----------
    lines : list
        line dicts, each with a "bbox" entry indexable as [x0, y0, x1, y1]

    Returns
    -------
    para_bbox : list
        [x0, y0, x1, y1] where x0/y0 are the minima and x1/y1 the maxima
        over all line bboxes
    """
    lefts = [entry["bbox"][0] for entry in lines]
    tops = [entry["bbox"][1] for entry in lines]
    rights = [entry["bbox"][2] for entry in lines]
    bottoms = [entry["bbox"][3] for entry in lines]

    para_bbox = [min(lefts), min(tops), max(rights), max(bottoms)]
    return para_bbox


def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """
    This function tells whether the right edge of a line matches the right
    edge of its neighbouring lines

    Two right edges match when they differ by less than half of
    avg_char_width. A missing (falsy) neighbour bbox is treated as
    (0, 0, 0, 0), i.e. its right edge is taken to be 0.

    Parameters
    ----------
    curr_line_bbox : list
        bbox of the current line
    prev_line_bbox : list
        bbox of the previous line
    next_line_bbox : list
        bbox of the next line
    avg_char_width : float
        average of char widths
    direction : int
        0 for prev, 1 for next, 2 for both

    Returns
    -------
    bool
        True if the line is right aligned from its neighbors, False otherwise
        (including for any direction other than 0, 1 or 2).
    """
    tolerance = 0.5 * avg_char_width

    _, _, curr_right, _ = curr_line_bbox
    _, _, prev_right, _ = prev_line_bbox or (0, 0, 0, 0)
    _, _, next_right, _ = next_line_bbox or (0, 0, 0, 0)

    if direction == 0:
        return abs(curr_right - prev_right) < tolerance
    if direction == 1:
        return abs(curr_right - next_right) < tolerance
    if direction == 2:
        return abs(curr_right - prev_right) < tolerance and abs(curr_right - next_right) < tolerance
    return False


def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """
    This function tells whether the left edge of a line matches the left
    edge of its neighbouring lines

    Two left edges match when they differ by less than half of
    avg_char_width. A missing (falsy) neighbour bbox is treated as
    (0, 0, 0, 0), i.e. its left edge is taken to be 0.

    Parameters
    ----------
    curr_line_bbox : list
        bbox of the current line
    prev_line_bbox : list
        bbox of the previous line
    next_line_bbox : list
        bbox of the next line
    avg_char_width : float
        average of char widths
    direction : int
        0 for prev, 1 for next, 2 for both

    Returns
    -------
    bool
        True if the line is left aligned from its neighbors, False otherwise
        (including for any direction other than 0, 1 or 2).
    """
    tolerance = 0.5 * avg_char_width

    curr_left, _, _, _ = curr_line_bbox
    prev_left, _, _, _ = prev_line_bbox or (0, 0, 0, 0)
    next_left, _, _, _ = next_line_bbox or (0, 0, 0, 0)

    if direction == 0:
        return abs(curr_left - prev_left) < tolerance
    if direction == 1:
        return abs(curr_left - next_left) < tolerance
    if direction == 2:
        return abs(curr_left - prev_left) < tolerance and abs(curr_left - next_left) < tolerance
    return False


def end_with_punctuation(line_text):
    """
    This function checks if the line ends with a sentence-ending punctuation mark

    Trailing whitespace is ignored. Both the ASCII marks (". ? !") and their
    CJK counterparts (ideographic full stop, full-width question mark and
    full-width exclamation mark) are recognised.

    Parameters
    ----------
    line_text : str
        text of the line

    Returns
    -------
    bool
        True if the last non-space character is an end punctuation mark,
        False otherwise (including for empty or whitespace-only text).
    """
    english_end_puncs = (".", "?", "!")
    # The full-width forms are written as escapes so they cannot be confused
    # with (or silently normalised to) the ASCII "?" and "!" above.
    chinese_end_puncs = ("。", "\uff1f", "\uff01")
    end_puncs = english_end_puncs + chinese_end_puncs

    # rstrip() drops the same characters str.isspace() reports, and
    # endswith() on an empty string is simply False.
    return line_text.rstrip().endswith(end_puncs)


def is_nested_list(lst):
    """
    This function checks if lst is a list that directly contains another list

    Parameters
    ----------
    lst : any
        object to inspect

    Returns
    -------
    bool
        True if lst is a list and at least one of its elements is a list,
        False otherwise (non-list inputs, empty lists, flat lists).
    """
    if not isinstance(lst, list):
        return False

    for element in lst:
        if isinstance(element, list):
            return True
    return False