File size: 6,317 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
class RawBlockProcessor:
    """
    Convert raw pre-processed PDF text blocks into "para blocks".

    For every page entry of the input dictionary, each raw block is rebuilt
    into a compact dictionary (id, bbox, text, lines). Raw lines that sit on
    the same visual row (top and bottom edges within ``y_tolerance``) are
    merged into a single line, and every span receives a human-readable
    ``decomposed_flags`` dictionary.
    """

    def __init__(self, y_tolerance=2) -> None:
        """
        Parameters
        ----------
        y_tolerance : int or float, optional
            Maximum absolute difference allowed between the top edges and
            between the bottom edges of two raw lines for them to be merged
            into one line. Defaults to 2.
        """
        self.y_tolerance = y_tolerance
        self.pdf_dic = {}

    def __span_flags_decomposer(self, span_flags):
        """
        Make font flags human readable.

        Parameters
        ----------
        span_flags : int
            Bit field describing the font of a span. The bit layout looks
            like the PyMuPDF span ``flags`` field -- TODO confirm against
            the producer of the raw blocks.

        Returns
        -------
        dict
            One boolean per font property. ``is_sans_serifed`` is always the
            negation of ``is_serifed`` and ``is_proportional`` is always the
            negation of ``is_monospaced``.
        """
        is_serifed = bool(span_flags & 2**2)
        is_monospaced = bool(span_flags & 2**3)

        return {
            "is_superscript": bool(span_flags & 2**0),  # bit 0: superscript
            "is_italic": bool(span_flags & 2**1),  # bit 1: italic
            "is_serifed": is_serifed,  # bit 2 set: serif font
            "is_sans_serifed": not is_serifed,  # bit 2 clear: sans-serif font
            "is_monospaced": is_monospaced,  # bit 3 set: monospaced font
            "is_proportional": not is_monospaced,  # bit 3 clear: proportional font
            "is_bold": bool(span_flags & 2**4),  # bit 4: bold
        }

    def __make_new_lines(self, raw_lines):
        """
        Merge raw lines that lie on the same row into new lines.

        Consecutive raw lines are merged while both their top and bottom
        edges stay within ``self.y_tolerance`` of the line currently being
        built. Every span is annotated in place with a ``decomposed_flags``
        entry (the span dictionaries of the input are modified).

        Parameters
        ----------
        raw_lines : list
            Raw line dictionaries. Each needs ``"bbox"`` (indexable as
            left, top, right, bottom) and ``"spans"`` (list of dictionaries
            with ``"text"`` and ``"flags"``); ``"dir"`` is optional.

        Returns
        -------
        new_lines : list
            Line dictionaries with keys ``"bbox"``, ``"text"``, ``"dir"``
            and ``"spans"``. For merged lines ``"text"`` is the fragments
            joined with a single space and ``"dir"`` is the component-wise
            sum of the fragments' directions.
        """
        new_lines = []
        new_line = None

        for raw_line in raw_lines:
            raw_line_bbox = raw_line["bbox"]
            raw_line_spans = raw_line["spans"]
            raw_line_text = "".join(span["text"] for span in raw_line_spans)
            # A missing/empty direction counts as (0, 0) everywhere, including
            # the merge branch (which used to fail on a None direction).
            raw_line_dir = raw_line.get("dir") or (0, 0)

            for span in raw_line_spans:
                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])

            on_same_row = (
                new_line is not None
                and abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
                and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
            )

            if on_same_row:
                # Top is kept from the first fragment and bottom from the
                # latest one; the merge condition keeps both within
                # y_tolerance of the values they replace.
                new_line["bbox"] = (
                    min(new_line["bbox"][0], raw_line_bbox[0]),  # left
                    new_line["bbox"][1],  # top
                    max(new_line["bbox"][2], raw_line_bbox[2]),  # right
                    raw_line_bbox[3],  # bottom
                )
                new_line["text"] += " " + raw_line_text
                new_line["spans"].extend(raw_line_spans)
                new_line["dir"] = (
                    new_line["dir"][0] + raw_line_dir[0],
                    new_line["dir"][1] + raw_line_dir[1],
                )
                continue

            if new_line is not None:
                new_lines.append(new_line)

            new_line = {
                "bbox": raw_line_bbox,
                "text": raw_line_text,
                "dir": raw_line_dir,
                # Own list: later merges extend it, and that must not grow
                # the span list of the raw input line.
                "spans": list(raw_line_spans),
            }

        if new_line is not None:
            new_lines.append(new_line)

        return new_lines

    def __make_new_block(self, raw_block):
        """
        Build a new block from a raw block.

        Parameters
        ----------
        raw_block : dict
            Raw block with the keys ``"number"``, ``"bbox"`` and ``"lines"``.

        Returns
        -------
        new_block : dict

        Schema of new_block (values are illustrative):
        {
            "block_id": 1,                      # copied from raw_block["number"]
            "bbox": [0, 0, 100, 100],           # copied from raw_block["bbox"]
            "text": "This is a block.",         # all span texts joined by " "
            "lines": [
                {
                    "bbox": [0, 0, 100, 100],
                    "text": "This is a line.",
                    "dir": (0, 0),
                    "spans": [
                        {
                            "text": "This is a span.",
                            "flags": 4,
                            "decomposed_flags": {...},
                            ...                 # other raw span keys are kept
                        }
                    ],
                }
            ],
        }
        """
        raw_lines = raw_block["lines"]
        block_text = " ".join(
            span["text"] for line in raw_lines for span in line["spans"]
        )

        return {
            "block_id": raw_block["number"],
            "bbox": raw_block["bbox"],
            "text": block_text,
            "lines": self.__make_new_lines(raw_lines),
        }

    def batch_process_blocks(self, pdf_dic):
        """
        Process the raw blocks of every page in batch.

        Parameters
        ----------
        pdf_dic : dict
            Dictionary keyed by page id. Only entries whose key starts with
            ``"page_"`` are processed. The raw blocks of a page are read from
            its ``"preproc_blocks"`` key; for the schema see the demo file
            app/pdf_toolbox/tests/preproc_2_parasplit_example.json.

        Returns
        -------
        pdf_dic : dict
            The same dictionary object, where every processed page has a
            ``"para_blocks"`` list (empty when the page has no
            ``"preproc_blocks"`` key).
        """
        for page_id, blocks in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue

            blocks["para_blocks"] = [
                self.__make_new_block(raw_block)
                for raw_block in blocks.get("preproc_blocks", ())
            ]

        return pdf_dic