|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
import sys
|
|
import subprocess
|
|
import tempfile
|
|
import itertools
|
|
|
|
|
|
# Name of the Stanford CoreNLP jar; it must be located in the same
# directory as this file (the subprocess below runs with cwd set there).
STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'
|
|
|
|
|
|
# Tokens stripped from the tokenizer output: quote marks, the bracket
# placeholders PTBTokenizer emits (-LRB- = "(", -RRB- = ")", -LCB-/-RCB-
# = curly braces), and ordinary sentence punctuation.
PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \
".", "?", "!", ",", ":", "-", "--", "...", ";"]
|
|
|
|
class PTBTokenizer:
    """Python wrapper of Stanford PTBTokenizer."""

    def tokenize(self, captions_for_image):
        """Tokenize captions via the Stanford PTBTokenizer jar.

        Args:
            captions_for_image: dict mapping an image id to a list of
                dicts, each carrying the raw text under the 'caption' key.

        Returns:
            dict mapping each image id to a list of tokenized caption
            strings: lower-cased, space-joined tokens with PTB
            punctuation/bracket tokens removed, in the original order.

        Raises:
            OSError: if ``java`` cannot be launched or the temporary
                file cannot be created next to the jar.
        """
        cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR,
               'edu.stanford.nlp.process.PTBTokenizer',
               '-preserveLines', '-lowerCase']

        # ======================================================
        # prepare data for PTB Tokenizer
        # ======================================================
        final_tokenized_captions_for_image = {}
        # One image id per caption, in the same order the captions are
        # written out below, so output lines can be zipped back to ids.
        image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
        # One caption per line; embedded newlines would break the
        # line <-> image-id correspondence, so replace them with spaces.
        sentences = '\n'.join(c['caption'].replace('\n', ' ')
                              for caps in captions_for_image.values() for c in caps)

        # ======================================================
        # save sentences to temporary file
        # ======================================================
        # The file is created in the jar's directory because the command
        # is run with that cwd and receives only the file's basename.
        path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__))
        tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname,
                                               mode='w', encoding='utf-8')
        try:
            tmp_file.write(sentences)
            tmp_file.close()

            # ======================================================
            # tokenize sentence
            # ======================================================
            cmd.append(os.path.basename(tmp_file.name))
            # The tokenizer reads the temp file named on the command
            # line; stdin is not a pipe, so nothing is sent to it.
            # (The original passed input= to communicate(), which was
            # silently ignored for that reason.)
            p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname,
                                           stdout=subprocess.PIPE)
            token_lines = p_tokenizer.communicate()[0]
            lines = token_lines.decode().split('\n')
        finally:
            # Remove the temp file even if the subprocess fails,
            # instead of leaking it on the error path.
            tmp_file.close()
            os.remove(tmp_file.name)

        # ======================================================
        # create dictionary for tokenized captions
        # ======================================================
        for k, line in zip(image_id, lines):
            if k not in final_tokenized_captions_for_image:
                final_tokenized_captions_for_image[k] = []
            tokenized_caption = ' '.join(w for w in line.rstrip().split(' ')
                                         if w not in PUNCTUATIONS)
            final_tokenized_captions_for_image[k].append(tokenized_caption)

        return final_tokenized_captions_for_image
|
|
|