Spaces:
Sleeping
Sleeping
File size: 4,096 Bytes
9ee83a7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# add module
import os
import shutil
import sys
from subprocess import call
from grobid_client.grobid_client import GrobidClient
# Put the project root on sys.path before importing from the `core` package
# (presumably `core` lives under /project - confirm for your deployment).
module_path = os.path.abspath('/project')
if module_path not in sys.path:
    sys.path.append(module_path)
from core.tei import single_entry

# Working directory and external tool locations.
# Each value can be overridden with the environment variable named below; when
# the variable is unset the original hard-coded path is used, so existing
# setups keep working unchanged.
#
# Alternative layout kept from the original file for reference:
# temp_dir = '/project/temp'
# pdffigures2_home = '/opt/pdffigures2'
# grobid_home = '/opt/grobid'
# grobid_python_config_pth = '/opt/grobid_client_python/config.json'
temp_dir = os.environ.get(
    'DOC2SLIDE_TEMP_DIR',
    '/home/quanta/Projects/doc2slide-summarizer/temp',
)
pdffigures2_home = os.environ.get(
    'DOC2SLIDE_PDFFIGURES2_HOME',
    '/home/quanta/Library/pdffigures2',
)
grobid_home = os.environ.get(
    'DOC2SLIDE_GROBID_HOME',
    '/home/quanta/Library/grobid/grobid-0.6.2',
)
grobid_python_config_pth = os.environ.get(
    'DOC2SLIDE_GROBID_CONFIG',
    '/home/quanta/Library/grobid_client_python/config.json',
)
def remove_temp_directory():
    """Delete the ``temp_dir`` tree, doing nothing when it does not exist."""
    if not os.path.exists(temp_dir):
        # Nothing to clean up.
        return
    shutil.rmtree(temp_dir)
def grobid_clident():
    """Return a ``GrobidClient`` built from the file at ``grobid_python_config_pth``.

    The misspelled name is kept as-is because other code calls it by this name.
    """
    config_file = grobid_python_config_pth
    return GrobidClient(config_path=config_file)
def process_pdf(pdf_pth: str, file_name: str = None):
    """Run one PDF through GROBID and pdffigures2 and gather the results under ``temp_dir``.

    ``temp_dir`` is wiped at the start of every call, so it only ever holds the
    output of the most recent document. With ``<name>`` being ``file_name``
    without its extension, the function writes or expects:

    * ``<temp_dir>/<name>/pdf/<name>.pdf`` - copy of the input PDF
    * ``<temp_dir>/<name>/xml/<name>.tei.xml`` - expected GROBID output
    * ``<temp_dir>/<name>/figure/`` - pdffigures2 output directory, moved here
      from ``pdffigures2_home`` (per-document JSON expected at
      ``figure/json/<name>.json``)
    * ``<temp_dir>/<name>/json/<name>.json`` - merged title, abstract, text,
      headers and figures as returned by ``single_entry``
    * ``<temp_dir>/<name>/<name>.preprocessed_text.json`` - list with one
      joined text string per header

    Args:
        pdf_pth: Path of the PDF file to process.
        file_name: File name used to label the document (for example
            ``'paper.pdf'``). Defaults to the basename of ``pdf_pth``.

    Returns:
        None. All results are written to disk.
    """
    import json  # kept function-local, as in the original implementation

    if file_name is None:
        file_name = os.path.basename(pdf_pth)
    # splitext copes with any extension length (or none); slicing off the last
    # four characters silently mangled names that did not end in ".pdf".
    name = os.path.splitext(file_name)[0]

    client = grobid_clident()
    remove_temp_directory()

    doc_dir = os.path.join(temp_dir, name)
    temp_pdf_dir = os.path.join(doc_dir, 'pdf')
    temp_xml_dir = os.path.join(doc_dir, 'xml')
    os.makedirs(temp_pdf_dir, exist_ok=True)
    os.makedirs(temp_xml_dir, exist_ok=True)

    # Store the copy as "<name>.pdf": every path built below is derived from
    # `name`, so the copy must carry that name even when the source file's
    # basename differs from `file_name`.
    shutil.copy(pdf_pth, os.path.join(temp_pdf_dir, name + '.pdf'))

    # Convert the PDF directory to TEI XML with GROBID.
    client.process(
        'processFulltextDocument',
        temp_pdf_dir,
        tei_coordinates=True,
        force=True,
        verbose=True,
        output=temp_xml_dir,
    )
    xml_pth = os.path.join(temp_xml_dir, name + '.tei.xml')

    # Extract figures with pdffigures2. It is run from its own home directory
    # and writes to paths relative to that directory.
    fig_dir_profix = 'figure'
    img_dir_profix = 'figure/image'
    json_dir_profix = 'figure/json'
    tmp_fig_dir = os.path.join(pdffigures2_home, fig_dir_profix)
    for sub_dir in (fig_dir_profix, img_dir_profix, json_dir_profix):
        os.makedirs(os.path.join(pdffigures2_home, sub_dir), exist_ok=True)

    # sbt receives the whole "runMain ..." command as a single argument.
    # NOTE(review): paths containing spaces are presumably split by sbt's own
    # argument parsing - confirm before using such paths.
    sbt_command = ' '.join([
        'runMain org.allenai.pdffigures2.FigureExtractorBatchCli -e -q',
        os.path.abspath(temp_pdf_dir) + '/',
        '-m', './' + img_dir_profix + '/',
        '-d', './' + json_dir_profix + '/',
        '-s', './' + fig_dir_profix + '/stat.json',
    ])
    exit_code = call(['sbt', '-J-Xmx4G', sbt_command], cwd=pdffigures2_home)
    if exit_code != 0:
        # Report the failure instead of ignoring it, but keep going as before
        # so behaviour for callers is unchanged.
        print(
            'warning: pdffigures2 (sbt) exited with status {}'.format(exit_code),
            file=sys.stderr,
        )

    # Relocate the figure output next to the other per-document artefacts.
    shutil.move(tmp_fig_dir, doc_dir)
    figure_json_pth = os.path.join(doc_dir, json_dir_profix, name + '.json')

    # Merge the TEI XML and the figure metadata into a single JSON document.
    _, title, abstract, text, headers, figures = single_entry(
        '', xml_pth=xml_pth, fig_json_pth=figure_json_pth
    )
    temp_json_dir = os.path.join(doc_dir, 'json')
    os.makedirs(temp_json_dir, exist_ok=True)
    json_data = {
        'title': title,
        'abstract': abstract,
        'text': text,
        'headers': headers,
        'figures': figures,
    }
    json_pth = os.path.join(temp_json_dir, name + '.json')
    with open(json_pth, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=4)

    # Build one text string per header by joining the 'string' field of the
    # text entries from head['start'] through head['end'] (inclusive), clamped
    # to the number of available entries. The in-memory values are used
    # directly rather than re-reading the file that was just written.
    paper_length = len(text)
    section_texts = [
        ' '.join(
            text[idx]['string']
            for idx in range(head['start'], min(head['end'] + 1, paper_length))
        )
        for head in headers
    ]
    preprocessed_pth = os.path.join(doc_dir, name + '.preprocessed_text.json')
    with open(preprocessed_pth, 'w', encoding='utf-8') as f:
        json.dump(section_texts, f, indent=4)
if __name__ == '__main__':
    # Pass the document's file name along with its path; calling process_pdf
    # with the path alone raises TypeError when file_name is a required argument.
    example_pdf = '/project/example/example.pdf'
    process_pdf(example_pdf, os.path.basename(example_pdf))
|