|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import sys |
|
import subprocess |
|
import tempfile |
|
import itertools |
|
|
|
|
|
# Filename of the Stanford CoreNLP jar; expected to live in this module's
# directory (the tokenizer subprocess runs with cwd set there).
STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'


# Tokens emitted by the PTB tokenizer that are stripped from the output.
# Covers PTB bracket escapes (-LRB-, -RRB-, ...), quote marks, and
# ordinary sentence punctuation.
PUNCTUATIONS = [
    "''", "'", "``", "`",
    "-LRB-", "-RRB-", "-LCB-", "-RCB-",
    ".", "?", "!", ",", ":", "-", "--", "...", ";",
]
|
|
|
class PTBTokenizer:
    """Python wrapper around the Stanford PTBTokenizer (CoreNLP 3.4.1).

    Shells out to the ``edu.stanford.nlp.process.PTBTokenizer`` Java
    class, feeding it one caption per line via a temporary file, then
    drops punctuation tokens (see ``PUNCTUATIONS``) from the tokenized,
    lowercased output.
    """

    def tokenize(self, captions_for_image):
        """Tokenize and lowercase every caption.

        Args:
            captions_for_image: dict mapping an image id to a list of
                caption strings.

        Returns:
            dict mapping each image id to a list of tokenized captions
            (space-joined, lowercased, punctuation tokens removed), in
            the same order as the input captions.
        """
        cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR,
               'edu.stanford.nlp.process.PTBTokenizer',
               '-preserveLines', '-lowerCase']

        # Flatten the captions to one sentence per line, remembering
        # which image id each line belongs to so the tokenizer's output
        # (also one line per caption) can be regrouped afterwards.
        image_id = [k for k, v in captions_for_image.items()
                    for _ in range(len(v))]
        sentences = '\n'.join(c.replace('\n', ' ')
                              for v in captions_for_image.values()
                              for c in v)

        # The tokenizer reads its input from a file placed next to the
        # jar; the subprocess runs with cwd set to the jar's directory.
        path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__))
        tmp_file = tempfile.NamedTemporaryFile(delete=False,
                                               dir=path_to_jar_dirname)
        try:
            tmp_file.write(sentences.encode())
            tmp_file.close()

            cmd.append(os.path.basename(tmp_file.name))
            # Input is supplied through the temp file, not stdin, so no
            # ``input=`` argument to communicate() is needed (the old
            # one was silently ignored because stdin was not a pipe).
            p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname,
                                           stdout=subprocess.PIPE)
            token_lines = p_tokenizer.communicate()[0].decode()
        finally:
            # Clean up the temp file even if the subprocess fails.
            os.remove(tmp_file.name)

        # Regroup tokenized lines by image id, preserving caption order,
        # and strip punctuation tokens from each line.
        final_tokenized_captions_for_image = {}
        for k, line in zip(image_id, token_lines.split('\n')):
            if k not in final_tokenized_captions_for_image:
                final_tokenized_captions_for_image[k] = []
            tokenized_caption = ' '.join(w for w in line.rstrip().split(' ')
                                         if w not in PUNCTUATIONS)
            final_tokenized_captions_for_image[k].append(tokenized_caption)

        return final_tokenized_captions_for_image
|
|