File size: 4,096 Bytes
9ee83a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# add module
import os
import shutil
import sys
from subprocess import call

from grobid_client.grobid_client import GrobidClient

module_path = os.path.abspath(os.path.join('/project'))
if module_path not in sys.path:
    sys.path.append(module_path)

from core.tei import single_entry

# temp_dir = '/project/temp'
# pdffigures2_home = '/opt/pdffigures2'
# grobid_home = '/opt/grobid'
# grobid_python_config_pth = '/opt/grobid_client_python/config.json'
temp_dir = '/home/quanta/Projects/doc2slide-summarizer/temp'
pdffigures2_home = '/home/quanta/Library/pdffigures2'
grobid_home = '/home/quanta/Library/grobid/grobid-0.6.2'
grobid_python_config_pth = '/home/quanta/Library/grobid_client_python/config.json'


def remove_temp_directory():
    """Delete the temporary working directory tree, if it exists."""
    if not os.path.exists(temp_dir):
        return
    shutil.rmtree(temp_dir)


def grobid_clident():
    """Construct a GrobidClient from the configured Grobid-python config file.

    Note: the name contains a typo ("clident") but is kept for compatibility
    with existing callers.
    """
    client = GrobidClient(config_path=grobid_python_config_pth)
    return client


def process_pdf(pdf_pth: str, file_name: str = None):
    """Preprocess a PDF end-to-end and stage the results under ``temp_dir``.

    Pipeline: copy the PDF into a temp layout, run Grobid to produce TEI XML,
    run pdffigures2 to extract figures, merge everything via
    ``core.tei.single_entry`` into one JSON file, and finally dump the
    per-section text for downstream summarization.

    Args:
        pdf_pth: Path to the source PDF file.
        file_name: Base name of the PDF (e.g. ``'paper.pdf'``). When omitted,
            it is derived from ``pdf_pth`` with ``os.path.basename``.

    Side effects:
        Recreates ``temp_dir`` from scratch; creates directories under
        ``pdffigures2_home``; spawns an ``sbt`` subprocess.
    """
    import json

    if file_name is None:
        file_name = os.path.basename(pdf_pth)

    client = grobid_clident()
    remove_temp_directory()

    # Strip the extension robustly instead of assuming a 3-character suffix.
    name = os.path.splitext(file_name)[0]

    temp_pdf_dir = os.path.join(temp_dir, name, 'pdf')
    temp_xml_dir = os.path.join(temp_dir, name, 'xml')
    os.makedirs(temp_pdf_dir, exist_ok=True)
    os.makedirs(temp_xml_dir, exist_ok=True)

    # copy pdf to temp dir
    shutil.copy(pdf_pth, temp_pdf_dir)

    # process to xml with Grobid
    client.process(
        'processFulltextDocument',
        temp_pdf_dir,
        tei_coordinates=True,
        force=True,
        verbose=True,
        output=temp_xml_dir,
    )

    xml_pth = os.path.join(temp_xml_dir, name + '.tei.xml')

    # Now scan figures. pdffigures2 is an sbt project and must be invoked
    # from its own home directory with paths relative to it.
    fig_dir_prefix = 'figure'
    img_dir_prefix = 'figure/image'
    json_dir_prefix = 'figure/json'

    tmp_fig_dir = os.path.join(pdffigures2_home, fig_dir_prefix)
    os.makedirs(tmp_fig_dir, exist_ok=True)
    os.makedirs(os.path.join(pdffigures2_home, img_dir_prefix), exist_ok=True)
    os.makedirs(os.path.join(pdffigures2_home, json_dir_prefix), exist_ok=True)

    args = [
        'sbt',
        '-J-Xmx4G',
        'runMain org.allenai.pdffigures2.FigureExtractorBatchCli -e -q '
        + os.path.abspath(temp_pdf_dir) + '/'
        + ' -m ./' + img_dir_prefix + '/'
        + ' -d ./' + json_dir_prefix + '/'
        + ' -s ./' + fig_dir_prefix + '/stat.json',
    ]
    call(args, cwd=pdffigures2_home)

    # Relocate the extracted figures next to the pdf/xml temp dirs.
    shutil.move(tmp_fig_dir, os.path.join(temp_dir, name))

    figure_json_pth = os.path.join(temp_dir, name, 'figure/json', name + '.json')

    # merge to single json
    _, title, abstract, text, headers, figures = single_entry('', xml_pth=xml_pth, fig_json_pth=figure_json_pth)

    temp_json_dir = os.path.join(temp_dir, name, 'json')
    os.makedirs(temp_json_dir, exist_ok=True)

    json_data = {
        'title': title,
        'abstract': abstract,
        'text': text,
        'headers': headers,
        'figures': figures,
    }

    json_pth = os.path.join(temp_json_dir, name + '.json')
    with open(json_pth, 'w') as f:
        json.dump(json_data, f, indent=4)

    # Re-read the file so every entry is normalized to plain JSON types
    # (the objects returned by single_entry may not be plain dicts).
    with open(json_pth, 'r') as f:
        data = json.load(f)

    # Build one section per header: its text is the concatenation of all
    # paragraph strings in [start, end], clamped to the paper length.
    paper_length = len(data['text'])
    sections = [{
        'idx': i,
        'title': head['section'],
        'n': head['n'],
        'text': ' '.join(data['text'][idx]['string']
                         for idx in range(head['start'], min(head['end'] + 1, paper_length))),
        'matched_slides': [],
    } for i, head in enumerate(data['headers'])]

    with open(os.path.join(temp_dir, name, name + '.preprocessed_text.json'), 'w') as f:
        json.dump([sec['text'] for sec in sections], f, indent=4)


if __name__ == '__main__':
    # Bug fix: process_pdf requires the file name as well as the path —
    # the original one-argument call raised TypeError at runtime.
    process_pdf('/project/example/example.pdf', 'example.pdf')