File size: 853 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the json
    """Fix overly large gaps between text pieces that belong to the same row.

    Within each block, a line whose bbox has the same integer-truncated top
    and bottom y-coordinates as the line immediately before it is treated as
    a continuation of that row, and a single space is prepended to the text
    of its first span.

    Args:
        pdf_info_dict: Mapping read as ``pdf_info_dict['page_<i>']`` for
            ``i`` in ``range(len(pdf_info_dict))``. Each page must provide
            ``'preproc_blocks'``, a sequence of blocks with a ``'lines'``
            sequence; each line has a four-element ``'bbox'`` and a
            ``'spans'`` list of dicts carrying a ``'text'`` string.
            NOTE(review): this presumes every key is a consecutive
            ``page_<i>`` entry -- confirm against the caller.

    Returns:
        The same ``pdf_info_dict`` object; it is modified in place.

    Raises:
        KeyError: If an expected page or field key is missing.
    """
    for i in range(len(pdf_info_dict)):
        text_blocks = pdf_info_dict[f'page_{i}']['preproc_blocks']

        for block in text_blocks:
            # None means "no previous line in this block yet". A (0, 0)
            # placeholder would wrongly match a first line whose y-values
            # truncate to zero.
            prev_row = None

            for line in block['lines']:
                # bbox = [x1, y1, x2, y2]; only the vertical extent matters.
                _, y_top, _, y_bottom = line['bbox']
                cur_row = (int(y_top), int(y_bottom))

                if cur_row == prev_row:
                    spans = line['spans']
                    # A line without spans has no text to pad; skip it
                    # rather than fail on spans[0].
                    if spans:
                        spans[0]['text'] = ' ' + spans[0]['text']

                prev_row = cur_row

    return pdf_info_dict