File size: 11,796 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
import json
import os
import sys

from magic_pdf.para.commons import *

from magic_pdf.para.raw_processor import RawBlockProcessor
from magic_pdf.para.layout_match_processor import LayoutFilterProcessor
from magic_pdf.para.stats import BlockStatisticsCalculator
from magic_pdf.para.stats import DocStatisticsCalculator
from magic_pdf.para.title_processor import TitleProcessor
from magic_pdf.para.block_termination_processor import BlockTerminationProcessor
from magic_pdf.para.block_continuation_processor import BlockContinuationProcessor
from magic_pdf.para.draw import DrawAnnos
from magic_pdf.para.exceptions import (
    DenseSingleLineBlockException,
    TitleDetectionException,
    TitleLevelException,
    ParaSplitException,
    ParaMergeException,
    DiscardByException,
)


# Switch stdout to UTF-8 (presumably so non-ASCII text printed while processing
# does not fail on consoles with a narrower default encoding).
# `reconfigure` only exists on io.TextIOWrapper (Python 3.7+), and stdout may have
# been replaced by an object without it (or be None), so probe for the method
# instead of checking the interpreter's major version.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore


class ParaProcessPipeline:
    """Run the paragraph-processing stages over a parsed PDF dictionary.

    The pipeline keeps no per-document state; the only tunable is
    ``STAGE_DUMP_DIR``.
    """

    # Directory that receives the per-stage JSON snapshots in "full" debug mode.
    # NOTE(review): the default is a Windows-style path. On POSIX the backslashes
    # are not separators, so it names one directory literally called
    # "\tmp\pdf_parse" relative to the working directory. The value is kept for
    # backward compatibility; override it on the class or an instance to relocate.
    STAGE_DUMP_DIR = "\\tmp\\pdf_parse"

    def __init__(self) -> None:
        """Create a pipeline; there is nothing to initialise."""
        pass

    @staticmethod
    def _debug_output_paths(output_pdf_path):
        """Return ``(output_dir, output_json_file)`` derived from *output_pdf_path*.

        Both are empty strings when *output_pdf_path* is None. A bare file name
        maps to the current directory (".") so the JSON file is not resolved
        against the filesystem root and the directory can be listed later.
        """
        if output_pdf_path is None:
            return "", ""
        output_dir = os.path.dirname(output_pdf_path) or "."
        return output_dir, os.path.join(output_dir, "pdf_dic.json")

    @staticmethod
    def _stage_file_name(output_pdf_path, stage):
        """Return the snapshot file name ``<pdf stem>_stage_<stage>.json``.

        Only the final extension is stripped, whatever its case, so names that
        contain ".pdf" elsewhere or use a different suffix still yield a
        distinct ``.json`` file name.
        """
        stem, _ext = os.path.splitext(os.path.basename(output_pdf_path))
        return f"{stem}_stage_{stage}.json"

    @staticmethod
    def _first_error(*infos):
        """Return the first argument that is not None, or None if all are None."""
        return next((info for info in infos if info is not None), None)

    def _save_pdf_dic(self, pdf_dic, output_pdf_path, stage="0"):
        """Write *pdf_dic* as a JSON snapshot for *stage* into ``STAGE_DUMP_DIR``.

        Returns the snapshot path, or None if the file is missing after writing.
        """
        stage_dir = self.STAGE_DUMP_DIR
        pdf_dic_json_fpath = os.path.join(stage_dir, self._stage_file_name(output_pdf_path, stage))

        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(stage_dir, exist_ok=True)

        with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
            json.dump(pdf_dic, f, indent=2, ensure_ascii=False)

        # Confirm the snapshot really landed on disk before reporting success.
        if not os.path.exists(pdf_dic_json_fpath):
            print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
            return None

        print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")
        return pdf_dic_json_fpath

    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
        """
        Process the paragraphs of a parsed pdf, running these stages in order:
        1. Combine spans into natural lines
        2. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
        3. Compute statistics for each block and for the whole document
        4. Detect titles in the document
        5. Divide the level of the titles
        6. Detect and split paragraphs inside each block
        7. Tag and merge paragraphs that continue across blocks
        8. Run the discard checks (single-line blocks, title detection, title level, paragraph split, paragraph merge)
        9. In debug mode: dump the result to json, draw annotations on the pdf, remove stage files

        Parameters
        ----------
        pdf_info_dict : dict
            parsed pdf dictionary to process.
            Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note
            are expected to have been removed already.
        para_debug_mode : str or None
            None disables all debug output. Any other value dumps the final dictionary to
            "pdf_dic.json" next to `output_pdf_path` and draws the annotations; "full" additionally
            writes a json snapshot after each stage into `STAGE_DUMP_DIR`.
        input_pdf_path : str or None
            path to the input pdf file; only passed on to the annotation drawer.
        output_pdf_path : str or None
            path to the output (annotated) pdf file; required whenever `para_debug_mode` is not None.

        Returns
        -------
        (pdf_dic, error_info) : tuple
            pdf_dic is the processed dictionary; error_info is the first non-None result of the
            discard checks, or None when no check fired.
        """
        full_debug = para_debug_mode == "full"
        debug_enabled = para_debug_mode is not None

        if input_pdf_path is not None:
            input_pdf_path = os.path.abspath(input_pdf_path)

        output_dir, output_json_file = self._debug_output_paths(output_pdf_path)

        # --- Preprocess the lines of each block ---
        # NOTE: equation detection/replacement (EquationsProcessor) is currently disabled.

        # Combine spans into a natural line
        pdf_dic = RawBlockProcessor().batch_process_blocks(pdf_info_dict)

        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
        pdf_dic = LayoutFilterProcessor().batch_process_blocks(pdf_dic)

        # Compute statistics for each block
        pdf_dic = BlockStatisticsCalculator().batch_process_blocks(pdf_dic)

        # Compute statistics for all blocks (namely this pdf document)
        pdf_dic = DocStatisticsCalculator().calc_stats_of_doc(pdf_dic)

        if full_debug:
            self._save_pdf_dic(pdf_dic, output_pdf_path, stage="0")

        # --- Detect titles in the document ---
        doc_statistics = pdf_dic["statistics"]
        pdf_dic = TitleProcessor(doc_statistics).batch_process_blocks_detect_titles(pdf_dic)

        if full_debug:
            self._save_pdf_dic(pdf_dic, output_pdf_path, stage="1")

        # --- Detect and divide the level of the titles ---
        # A fresh processor without document statistics is used here, as in the original flow.
        pdf_dic = TitleProcessor().batch_process_blocks_recog_title_level(pdf_dic)

        if full_debug:
            self._save_pdf_dic(pdf_dic, output_pdf_path, stage="2")

        # --- Detect and split paragraphs inside each block ---
        pdf_dic = BlockTerminationProcessor().batch_process_blocks(pdf_dic)

        if full_debug:
            self._save_pdf_dic(pdf_dic, output_pdf_path, stage="3")

        # --- Detect and combine paragraphs from different blocks into one paragraph ---
        block_continuation_processor = BlockContinuationProcessor()
        pdf_dic = block_continuation_processor.batch_tag_paras(pdf_dic)
        pdf_dic = block_continuation_processor.batch_merge_paras(pdf_dic)

        if full_debug:
            self._save_pdf_dic(pdf_dic, output_pdf_path, stage="4")

        # --- Run every discard check; each returns None or an error description ---
        discard_by_exception = DiscardByException()

        is_discard_by_single_line_block = discard_by_exception.discard_by_single_line_block(
            pdf_dic, exception=DenseSingleLineBlockException()
        )
        is_discard_by_title_detection = discard_by_exception.discard_by_title_detection(
            pdf_dic, exception=TitleDetectionException()
        )
        is_discard_by_title_level = discard_by_exception.discard_by_title_level(
            pdf_dic, exception=TitleLevelException()
        )
        is_discard_by_split_para = discard_by_exception.discard_by_split_para(
            pdf_dic, exception=ParaSplitException()
        )
        is_discard_by_merge_para = discard_by_exception.discard_by_merge_para(
            pdf_dic, exception=ParaMergeException()
        )

        # --- Dump the final pdf_dic to a json file ---
        # Written before the error return so the result can be inspected even
        # when the document is discarded.
        if debug_enabled:
            with open(output_json_file, "w", encoding="utf-8") as f:
                json.dump(pdf_dic, f, ensure_ascii=False, indent=4)

        # The first check that fired (in the order below) is reported to the caller.
        error_info = self._first_error(
            is_discard_by_single_line_block,
            is_discard_by_title_detection,
            is_discard_by_title_level,
            is_discard_by_split_para,
            is_discard_by_merge_para,
        )
        if error_info is not None:
            return pdf_dic, error_info

        # --- Draw the annotations ---
        if debug_enabled:
            DrawAnnos().draw_annos(input_pdf_path, pdf_dic, output_pdf_path)

        # --- Remove intermediate stage files found next to the output pdf ---
        # NOTE(review): stage snapshots are written to STAGE_DUMP_DIR, not output_dir, so this only
        # removes matching files that already live in output_dir. The original comment says this
        # cleanup was meant for the "simple" debug mode, but it runs for every non-None mode.
        # Confirm the intended directory and condition.
        if debug_enabled:
            for fname in os.listdir(output_dir):
                if fname.endswith(".json") and "stage" in fname:
                    os.remove(os.path.join(output_dir, fname))

        return pdf_dic, error_info