File size: 17,459 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
import math
from loguru import logger

from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType

# Local aliases for the equation content types declared on ContentType.
TYPE_INLINE_EQUATION = ContentType.InlineEquation
# Compared against a paragraph's 'para_font_type' to detect display equations.
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
# Node "type" values that carry a "text" field in the unified content list:
# plain text plus the heading levels h1-h6.
UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']


def mk_nlp_markdown_1(para_dict: dict):
    """
    Concatenate the paragraphs of every page into text-only markdown.

    Deprecated: a ``DeprecationWarning`` is emitted on every call. (The
    previous ``@DeprecationWarning`` decorator replaced the function with an
    exception instance, which made it impossible to call at all.)

    Args:
        para_dict: mapping whose values are page-info dicts (keys are
            ignored). A page's "para_blocks" is a list of blocks; each
            block's "paras" maps to paragraph dicts providing "para_text",
            "is_para_title" and, for titles, "para_title_level".

    Returns:
        The paragraphs joined by blank lines. Title paragraphs are prefixed
        with ``para_title_level`` '#' characters and a space.
    """
    import warnings

    warnings.warn(
        "mk_nlp_markdown_1 is deprecated",
        DeprecationWarning,
        stacklevel=2,
    )

    content_lst = []
    for page_info in para_dict.values():
        # Pages without paragraph blocks contribute nothing.
        for block in page_info.get("para_blocks") or []:
            for para in block["paras"].values():
                para_text = para["para_text"]
                if para["is_para_title"]:
                    # The level is only read for titles, so ordinary
                    # paragraphs may carry a missing or None level.
                    md_title_prefix = "#" * para["para_title_level"]
                    content_lst.append(f"{md_title_prefix} {para_text}")
                else:
                    content_lst.append(para_text)

    return "\n\n".join(content_lst)



# Locate the target string inside a paragraph
def __find_index(paragraph, target):
    """Return the offset of the first occurrence of target, or None if absent."""
    position = paragraph.find(target)
    return None if position == -1 else position


def __insert_string(paragraph, target, postion):
    """Return paragraph with target spliced in at offset postion."""
    head = paragraph[:postion]
    tail = paragraph[postion:]
    return head + target + tail


def __insert_after(content, image_content, target):
    """
    Place image_content, surrounded by blank lines, directly after the first
    occurrence of target in content.

    When target does not occur, an error is logged and content is returned
    unchanged.
    """
    start = content.find(target)
    if start == -1:
        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
        return content

    cut = start + len(target)
    return "\n\n".join((content[:cut], image_content, content[cut:]))

def __insert_before(content, image_content, target):
    """
    Place image_content, surrounded by blank lines, directly before the first
    occurrence of target in content.

    When target does not occur, an error is logged and content is returned
    unchanged.
    """
    start = content.find(target)
    if start == -1:
        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
        return content

    return "\n\n".join((content[:start], image_content, content[start:]))


@DeprecationWarning
def mk_mm_markdown_1(para_dict: dict):
    """Assemble multimodal markdown (paragraph text plus image links), page by page.

    For each page-info value of para_dict the paragraphs are rendered as
    markdown (titles get a '#' prefix), joined with blank lines, and then an
    image link is spliced into that page text for every entry of the page's
    "images", "image_backup", "tables" and "table_backup" lists. Pages that
    lack "para_blocks" or "preproc_blocks" are rendered as image links only.
    The per-page texts are finally joined with blank lines and returned.

    NOTE(review): DeprecationWarning is applied here as a decorator, which
    rebinds this name to a DeprecationWarning instance wrapping the function;
    the result is not callable. Confirm intent - emitting the warning from
    inside the body via warnings.warn is the usual form.

    NOTE(review): '"#"*title_level' is evaluated for every paragraph, so
    'para_title_level' must be an int even on non-title paragraphs.
    """
    content_lst = []
    for _, page_info in para_dict.items():
        page_lst = [] # paragraphs of a single page
        para_blocks = page_info.get("para_blocks")
        pymu_raw_blocks = page_info.get("preproc_blocks")

        # Tables are handled exactly like images in this function.
        all_page_images = []
        all_page_images.extend(page_info.get("images",[]))
        all_page_images.extend(page_info.get("image_backup", []) )
        all_page_images.extend(page_info.get("tables",[]))
        all_page_images.extend(page_info.get("table_backup",[]) )

        if not para_blocks or not pymu_raw_blocks: # image-only page: just list the images
            for img in all_page_images:
                page_lst.append(f"![]({img['image_path']})") # TODO image order
            page_md = "\n\n".join(page_lst)

        else:
            for block in para_blocks:
                item = block["paras"]
                for _, p in item.items():
                    para_text = p["para_text"]
                    is_title = p["is_para_title"]
                    title_level = p['para_title_level']
                    md_title_prefix = "#"*title_level
                    if is_title:
                        page_lst.append(f"{md_title_prefix} {para_text}")
                    else:
                        page_lst.append(para_text)

            # Join the paragraphs into the text of one page.
            """拼装成一个页面的文本"""
            page_md = "\n\n".join(page_lst)
            # Splice the image links into the page text.
            """插入图片"""
            for img in all_page_images:
                imgbox = img['bbox']
                img_content = f"![]({img['image_path']})"
                # First look for the raw block that contains the image's top-left corner
                for block in pymu_raw_blocks:
                    bbox = block['bbox']
                    if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# corner lies inside this block (1-unit tolerance)
                        for l in block['lines']:
                            line_box = l['bbox']
                            if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # corner inside this line: insert before the line
                                line_txt = "".join([s['text'] for s in l['spans']])
                                page_md = __insert_before(page_md, img_content, line_txt)
                                break
                            # NOTE(review): this break is unconditional, so only the
                            # first line is ever tested and the for-else below runs
                            # only when block['lines'] is empty - confirm intent.
                            break
                        else:# between two lines
                            # Pick the line whose top-left corner is closest to the image's top-left corner
                            min_distance = 100000
                            min_line = None
                            for l in block['lines']:
                                line_box = l['bbox']
                                distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
                                if distance < min_distance:
                                    min_distance = distance
                                    min_line = l
                            if min_line:
                                line_txt = "".join([s['text'] for s in min_line['spans']])
                                img_h = imgbox[3] - imgbox[1]
                                if min_distance<img_h: # the text comes before the image
                                    page_md = __insert_after(page_md, img_content, line_txt)
                                else:
                                    page_md = __insert_before(page_md, img_content, line_txt)
                            else:
                                logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #1")
                else:# the image should sit between two blocks
                    # NOTE(review): nothing above breaks out of the block loop, so
                    # this else branch runs for every image, including ones that
                    # were already inserted by the in-block branch.
                    # Use the nearest block above; if there is none, use the nearest block below
                    top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
                    if top_txt_block:
                        line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
                        page_md = __insert_after(page_md, img_content, line_txt)
                    else:
                        bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
                        if bottom_txt_block:
                            line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
                            page_md = __insert_before(page_md, img_content, line_txt)
                        else:
                            logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #2")

        content_lst.append(page_md)

    # Join the text of all pages.
    """拼装成全部页面的文本"""
    content_text = "\n\n".join(content_lst)

    return content_text


def __insert_after_para(text, type, element, content_list):
    """
    Insert a new image/table node right after the first text-like node of
    content_list whose text contains `text`.

    Only nodes whose "type" is listed in UNI_FORMAT_TEXT_TYPE are searched.
    The new node is built from element according to `type` ("image" or
    "table"). When no node matches, an error is logged and content_list is
    left untouched.
    """
    anchor = next(
        (
            pos
            for pos, node in enumerate(content_list)
            if node.get("type") in UNI_FORMAT_TEXT_TYPE and text in node.get("text", '')
        ),
        None,
    )
    if anchor is None:
        logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
        return

    # Any other `type` leaves content_node unbound, so the insert below raises.
    if type == "image":
        content_node = dict(
            type="image",
            img_path=element.get("image_path"),
            img_alt="",
            img_title="",
            img_caption="",
        )
    elif type == "table":
        content_node = dict(
            type="table",
            img_path=element.get("image_path"),
            table_latex=element.get("text"),
            table_title="",
            table_caption="",
            table_quality=element.get("quality"),
        )
    content_list.insert(anchor + 1, content_node)
    


def __insert_before_para(text, type, element, content_list):
    """
    Insert a new image/table node right before the first text-like node of
    content_list whose text contains `text`.

    Only nodes whose "type" is listed in UNI_FORMAT_TEXT_TYPE are searched.
    The new node is built from element according to `type` ("image" or
    "table"). When no node matches, an error is logged and content_list is
    left untouched.
    """
    anchor = next(
        (
            pos
            for pos, node in enumerate(content_list)
            if node.get("type") in UNI_FORMAT_TEXT_TYPE and text in node.get("text", '')
        ),
        None,
    )
    if anchor is None:
        logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
        return

    # Any other `type` leaves content_node unbound, so the insert below raises.
    if type == "image":
        content_node = dict(
            type="image",
            img_path=element.get("image_path"),
            img_alt="",
            img_title="",
            img_caption="",
        )
    elif type == "table":
        content_node = dict(
            type="table",
            img_path=element.get("image_path"),
            table_latex=element.get("text"),
            table_title="",
            table_caption="",
            table_quality=element.get("quality"),
        )
    content_list.insert(anchor, content_node)
         

def mk_universal_format(pdf_info_list: list, img_buket_path):
    """
    Build the unified content list for a document.
    Format reference: https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY

    Each page contributes nodes of these shapes:
      * {"type": "text" | "h<level>", "text": ...} for paragraphs,
      * {"type": "equation", "latex": ...} for interline equations,
      * image / table nodes for the page's "images", "image_backup" and
        "tables" entries.

    Pages with both "para_blocks" and "preproc_blocks" get their text nodes
    appended first; images and tables are then positioned relative to that
    text by insert_img_or_table. Pages missing either field are treated as
    image-only: their image and table nodes are appended directly (paths
    joined with img_buket_path) and no positioning pass is run for them.
    Running that pass on such pages used to raise TypeError when
    "preproc_blocks" was absent and could otherwise add the same element a
    second time.

    Args:
        pdf_info_list: list of page-info dicts.
        img_buket_path: prefix joined onto image paths of image-only pages.

    Returns:
        The flat list of content nodes for all pages, in page order.
    """
    content_lst = []
    for page_info in pdf_info_list:
        para_blocks = page_info.get("para_blocks")
        pymu_raw_blocks = page_info.get("preproc_blocks")

        all_page_images = [
            *(page_info.get("images") or ()),
            *(page_info.get("image_backup") or ()),
        ]
        # Tables are kept apart from images because they produce a different node shape.
        all_page_tables = list(page_info.get("tables") or ())

        if not para_blocks or not pymu_raw_blocks:
            # Image-only page: there is no text to anchor against, so emit
            # the nodes as-is. TODO image order
            for img in all_page_images:
                content_lst.append({
                    "type": "image",
                    "img_path": join_path(img_buket_path, img['image_path']),
                    "img_alt": "",
                    "img_title": "",
                    "img_caption": "",
                })
            for table in all_page_tables:
                content_lst.append({
                    "type": "table",
                    "img_path": join_path(img_buket_path, table['image_path']),
                    "table_latex": table.get("text"),
                    "table_title": "",
                    "table_caption": "",
                    "table_quality": table.get("quality"),
                })
            # The nodes above are final; positioning them again would
            # duplicate them or fail on the missing raw blocks.
            continue

        for block in para_blocks:
            for p in block["paras"].values():
                # A paragraph is either an interline equation or ordinary text.
                if p['para_font_type'] == TYPE_INTERLINE_EQUATION:
                    content_lst.append({
                        "type": "equation",
                        "latex": p["para_text"],
                    })
                elif p["is_para_title"]:
                    content_lst.append({
                        "type": f"h{p['para_title_level']}",
                        "text": p["para_text"],
                    })
                else:
                    content_lst.append({
                        "type": "text",
                        "text": p["para_text"],
                    })

        # Position the page's images, then its tables, relative to the text.
        for img in all_page_images:
            insert_img_or_table("image", img, pymu_raw_blocks, content_lst)
        for table in all_page_tables:
            insert_img_or_table("table", table, pymu_raw_blocks, content_lst)

    return content_lst


def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
    """
    Insert one image or table node into content_lst next to the text it belongs with.

    Placement strategies are tried in this order, and the function returns as
    soon as one of them actually adds a node:

      1. The element's top-left corner lies inside a raw block and inside one
         of that block's lines: insert before that line.
      2. The corner lies inside a raw block but in none of its lines: use the
         line whose top-left corner is nearest; insert after it when that
         distance is smaller than the element's height, otherwise before it.
      3. Otherwise: insert after the last line of the nearest text block
         above, or, if there is none, before the first line of the nearest
         text block below.

    Previously an unconditional ``break`` meant only the first line of a
    block was ever examined, and the fallback in step 3 ran even after a
    successful insertion, so an element could be added twice.

    Args:
        type: "image" or "table"; forwarded to the paragraph insert helpers.
        element: dict with a 'bbox' (x0, y0, x1, y1) and an 'image_path'.
        pymu_raw_blocks: raw text blocks, each with 'bbox' and 'lines'; every
            line has 'bbox' and 'spans' whose 'text' values form the line
            text. None is treated as an empty list.
        content_lst: content node list, modified in place.

    Returns:
        None. Failures are reported through logger.error.
    """
    element_bbox = element['bbox']
    origin_x, origin_y = element_bbox[0], element_bbox[1]
    blocks = pymu_raw_blocks or []

    def holds_origin(box):
        # Containment test for the element's top-left corner, with a 1-unit tolerance.
        return box[0] - 1 <= origin_x < box[2] + 1 and box[1] - 1 <= origin_y < box[3] + 1

    def origin_distance(line):
        # Distance between the line's and the element's top-left corners.
        return math.hypot(line['bbox'][0] - origin_x, line['bbox'][1] - origin_y)

    def try_insert(insert_fn, line):
        # The insert helpers return nothing, so success is detected by growth of the list.
        line_txt = "".join([s['text'] for s in line['spans']])
        size_before = len(content_lst)
        insert_fn(line_txt, type, element, content_lst)
        return len(content_lst) > size_before

    # First look for a block that contains the element's corner.
    for block in blocks:
        if not holds_origin(block['bbox']):
            continue
        lines = block['lines']

        # Corner inside a line: insert before that line.
        containing = next((ln for ln in lines if holds_origin(ln['bbox'])), None)
        if containing is not None:
            if try_insert(__insert_before_para, containing):
                return
            continue

        # Corner between lines: anchor on the nearest line.
        nearest = min(lines, key=origin_distance, default=None)
        if nearest is None:
            logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #1")
            continue
        element_height = element_bbox[3] - element_bbox[1]
        if origin_distance(nearest) < element_height:
            # The text comes before the element.
            insert_fn = __insert_after_para
        else:
            insert_fn = __insert_before_para
        if try_insert(insert_fn, nearest):
            return

    # Nothing was inserted via a containing block: treat the element as
    # sitting between blocks. Prefer the nearest block above, else the
    # nearest block below.
    top_txt_block = find_top_nearest_text_bbox(blocks, element_bbox)
    if top_txt_block:
        line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
        __insert_after_para(line_txt, type, element, content_lst)
        return

    bottom_txt_block = find_bottom_nearest_text_bbox(blocks, element_bbox)
    if bottom_txt_block:
        line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
        __insert_before_para(line_txt, type, element, content_lst)
    else:
        # TODO: an image may occupy a whole column, with no text above or below it.
        logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #2")


def mk_mm_markdown(content_list):
    """
    Render a unified-format content list as markdown, images included.

    Text nodes are emitted verbatim, equations are wrapped in ``$$`` unless
    they already are, heading nodes get one '#' per level, and image nodes
    become ``![](path)`` links. Nodes of any other type are skipped. The
    pieces are joined with blank lines.
    """
    def render(node):
        # Yields zero or one markdown fragment for a node.
        kind = node.get("type")
        if kind == "text":
            yield node.get("text")
        elif kind == "equation":
            latex = node.get("latex")
            already_wrapped = latex.startswith("$$") and latex.endswith("$$")
            yield latex if already_wrapped else f"\n$$\n{latex}\n$$\n"
        elif kind in UNI_FORMAT_TEXT_TYPE:
            # kind is "h<digit>" here; the digit is the heading depth.
            yield f"{'#'*int(kind[1])} {node.get('text')}"
        elif kind == "image":
            yield f"![]({node.get('img_path')})"

    fragments = [piece for node in content_list for piece in render(node)]
    return "\n\n".join(fragments)

def mk_nlp_markdown(content_list):
    """
    Render a unified-format content list as markdown without images.

    Text nodes are emitted verbatim, equations are wrapped in ``$$`` lines,
    tables have their LaTeX wrapped in ``$$$`` lines, and heading nodes get
    one '#' per level. Nodes of any other type are skipped. The pieces are
    joined with blank lines.
    """
    fragments = []
    for node in content_list:
        kind = node.get("type")
        if kind == "text":
            piece = node.get("text")
        elif kind == "equation":
            piece = f"$$\n{node.get('latex')}\n$$"
        elif kind == "table":
            piece = f"$$$\n{node.get('table_latex')}\n$$$"
        elif kind in UNI_FORMAT_TEXT_TYPE:
            # kind is "h<digit>" here; the digit is the heading depth.
            piece = f"{'#'*int(kind[1])} {node.get('text')}"
        else:
            continue
        fragments.append(piece)
    return "\n\n".join(fragments)