File size: 10,443 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
import math

from collections import defaultdict
from magic_pdf.para.commons import *

# Module import side effect: on Python 3+, switch stdout to UTF-8 so that
# printed text is encoded as UTF-8 regardless of the platform default.
# NOTE(review): `sys` is not imported explicitly in this file; presumably it
# is re-exported by the star import from magic_pdf.para.commons -- confirm.
if sys.version_info[0] >= 3:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore


class HeaderFooterProcessor:
    """Flag blocks whose bounding box repeats near the top or bottom of pages."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _iter_blocks(result_dict):
        """Yield each block dict stored under ``page_*`` -> ``block_*`` keys."""
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_key, block in blocks.items():
                if block_key.startswith("block_"):
                    yield block

    def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
        """
        Return the bboxes that repeat most often in the top or bottom page band.

        Parameters
        ----------
        bboxes : list
            Bounding boxes, each indexable as ``(x0, y0, x1, y1)``.
        page_height : float
            Height of the page, used to size the band.
        position : str, optional
            "top" keeps bboxes whose ``y0`` lies in the upper band; any other
            value keeps bboxes whose ``y1`` lies in the lower band.
            By default "top".
        threshold : float, optional
            Fraction of the page height that forms the band, by default 0.25.
        num_bboxes : int, optional
            Maximum number of bboxes to return, by default 3.
        min_frequency : int, optional
            Minimum number of occurrences (inclusive) for a bbox to qualify,
            by default 2.

        Returns
        -------
        common_bboxes : list
            Up to ``num_bboxes`` bboxes as tuples, most frequent first. Ties
            keep the order in which the bboxes were first seen.
        """
        # Keep only the bboxes that fall inside the requested band.
        if position == "top":
            band_limit = page_height * threshold
            filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < band_limit]
        else:
            band_limit = page_height * (1 - threshold)
            filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > band_limit]

        # Count exact-coordinate repetitions; tuples make the bboxes hashable.
        bbox_count = defaultdict(int)
        for bbox in filtered_bboxes:
            bbox_count[tuple(bbox)] += 1

        # sorted() is stable, so equally frequent bboxes stay in first-seen order.
        ranked = sorted(bbox_count.items(), key=lambda item: item[1], reverse=True)
        return [bbox for bbox, count in ranked if count >= min_frequency][:num_bboxes]

    def detect_footer_header(self, result_dict, similarity_threshold=0.5):
        """
        Mark every block with ``is_header`` / ``is_footer`` flags (0 or 1).

        Detection is skipped, and ``result_dict`` returned without any flags
        being added, when the document has no blocks or when more than half of
        its blocks look like single lines.

        Parameters
        ----------
        result_dict : dict
            Result dictionary with ``page_*`` keys mapping to dicts of
            ``block_*`` entries. Each block must provide "bbox", "X0", "X1",
            "avg_char_height" and "avg_char_width".
        similarity_threshold : float, optional
            Currently unused; kept so existing callers keep working.

        Returns
        -------
        result_dict : dict
            The same dictionary object, with its blocks updated in place.
        """

        def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
            # A match requires every coordinate to differ by less than `tolerance`.
            return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)

        def is_single_line_block(block):
            # Determine based on the width and height of the block
            block_width = block["X1"] - block["X0"]
            block_height = block["bbox"][3] - block["bbox"][1]

            # A block at most three average characters tall and more than three
            # average characters wide is treated as a single line.
            return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3

        blocks = list(self._iter_blocks(result_dict))

        # If there are no blocks, skip the header and footer detection
        if not blocks:
            print("No blocks found. Skipping header/footer detection.")
            return result_dict

        # If most of the blocks are single-line, skip the header and footer detection
        single_line_blocks = sum(1 for block in blocks if is_single_line_block(block))
        if single_line_blocks / len(blocks) > 0.5:  # more than 50% of the blocks are single-line
            return result_dict

        all_bboxes = [block["bbox"] for block in blocks]

        # The page height is approximated by the lowest bottom edge of any block.
        # `blocks` is non-empty here, so max() always has input.
        page_height = max(bbox[3] for bbox in all_bboxes)

        # Get the most common bbox lists for headers and footers
        common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top")
        common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom")

        # Detect and mark headers and footers. Only the bbox is consulted, so
        # blocks without a "text" entry are handled as well.
        for block in blocks:
            bbox = block["bbox"]
            block["is_header"] = int(compare_bbox_with_list(bbox, common_header_bboxes))
            block["is_footer"] = int(compare_bbox_with_list(bbox, common_footer_bboxes))

        return result_dict


class NonHorizontalTextProcessor:
    """Flag repeated slanted text (watermarks) and vertical text (margin notes)."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _block_signature(block_data):
        """Return a hashable ``(bbox, text)`` key identifying a block."""
        # A list-valued bbox is unhashable, so it is normalised to a tuple
        # before being used as a dict key.
        return (tuple(block_data["bbox"]), block_data["text"])

    def detect_non_horizontal_texts(self, result_dict):
        """
        This function detects watermarks and vertical margin notes in the document.

        Blocks are grouped by identical coordinates and identical text. For each
        block that carries a "dir" vector, the absolute text angle in degrees is
        computed from that vector:

        * strictly between 5 and 85 degrees, the block counts as a watermark
          candidate;
        * strictly between 85 and 105 degrees, it counts as a vertical margin
          note candidate.

        A candidate is confirmed when its (bbox, text) pair occurs more than
        ``len(result_dict) // 2`` times. Every ``block_*`` entry then receives
        ``is_watermark`` and ``is_vertical_margin_note`` flags (0 or 1).

        Parameters
        ----------
        result_dict : dict
            The result dictionary, with ``page_*`` keys mapping to dicts of
            ``block_*`` entries that provide "bbox" and "text" and optionally
            "dir".

        Returns
        -------
        result_dict : dict
            The same dictionary object, with its blocks updated in place.
        """
        # Occurrence counts per (bbox, text) signature
        potential_watermarks = {}
        potential_margin_notes = {}

        for page_id, page_content in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_id, block_data in page_content.items():
                if not block_id.startswith("block_") or "dir" not in block_data:
                    continue

                signature = self._block_signature(block_data)

                # abs() folds clockwise and counter-clockwise rotations together.
                angle = abs(math.degrees(math.atan2(block_data["dir"][1], block_data["dir"][0])))

                if 5 < angle < 85:  # Slanted text: watermark candidate
                    potential_watermarks[signature] = potential_watermarks.get(signature, 0) + 1

                if 85 < angle < 105:  # Roughly vertical text: margin note candidate
                    potential_margin_notes[signature] = potential_margin_notes.get(signature, 0) + 1

        # Confirm candidates that occur more often than the threshold.
        # NOTE(review): the threshold counts every top-level key of result_dict,
        # not only the "page_*" ones -- confirm that this is intended.
        watermark_threshold = len(result_dict) // 2
        watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}

        margin_note_threshold = len(result_dict) // 2
        margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}

        # Write the flags back onto every block. Non-block page entries are
        # skipped, matching the counting pass above.
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_id, block_data in blocks.items():
                if not block_id.startswith("block_"):
                    continue
                signature = self._block_signature(block_data)
                block_data["is_watermark"] = 1 if signature in watermarks else 0
                block_data["is_vertical_margin_note"] = 1 if signature in margin_notes else 0

        return result_dict


class NoiseRemover:
    """Drop blocks that earlier stages flagged as noise."""

    def __init__(self) -> None:
        pass

    def skip_data_noises(self, result_dict):
        """
        Build a new dictionary containing only the blocks not flagged as noise.

        A block is dropped when any of its overlap, header, footer, watermark,
        vertical margin note or block title flags is truthy. Entries whose key
        does not start with "page_" (top level) or "block_" (page level) are
        left out, and so are pages with no remaining blocks.
        """
        noise_flags = (
            "is_overlap",
            "is_header",
            "is_footer",
            "is_watermark",
            "is_vertical_margin_note",
            "is_block_title",
        )

        cleaned = {}
        for page_key, page_blocks in result_dict.items():
            if not page_key.startswith("page_"):
                continue

            kept = {
                key: candidate
                for key, candidate in page_blocks.items()
                if key.startswith("block_") and not any(candidate.get(flag, 0) for flag in noise_flags)
            }

            # Pages left without any block are omitted from the output.
            if kept:
                cleaned[page_key] = kept

        return cleaned