Spaces:

yuwd
/

Polos-Demo

Sleeping

App Files Files Community

Polos-Demo / pacscore /evaluation /tokenizer.py

yuwd

init

03f6091 about 1 year ago

raw

history blame

2.46 kB

	#!/usr/bin/env python
	#
	# File Name : ptbtokenizer.py
	#
	# Description : Do the PTB Tokenization and remove punctuations.
	#
	# Creation Date : 29-12-2014
	# Last Modified : Thu Mar 19 09:53:35 2015
	# Authors : Hao Fang <[email protected]> and Tsung-Yi Lin <[email protected]>

	import os
	import subprocess
	import tempfile

	class PTBTokenizer(object):
	"""Python wrapper of Stanford PTBTokenizer"""

	corenlp_jar = 'stanford-corenlp-3.4.1.jar'
	punctuations = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \
	".", "?", "!", ",", ":", "-", "--", "...", ";"]

	@classmethod
	def tokenize(cls, corpus):
	cmd = ['java', '-cp', cls.corenlp_jar, \
	'edu.stanford.nlp.process.PTBTokenizer', \
	'-preserveLines', '-lowerCase']

	if isinstance(corpus, list) or isinstance(corpus, tuple):
	if isinstance(corpus[0], list) or isinstance(corpus[0], tuple):
	corpus = {i:c for i, c in enumerate(corpus)}
	else:
	corpus = {i: [c, ] for i, c in enumerate(corpus)}

	# prepare data for PTB Tokenizer
	tokenized_corpus = {}
	image_id = [k for k, v in list(corpus.items()) for _ in range(len(v))]
	sentences = '\n'.join([c.replace('\n', ' ') for k, v in corpus.items() for c in v])

	# save sentences to temporary file
	path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__))
	tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
	tmp_file.write(sentences.encode())
	tmp_file.close()

	# tokenize sentence
	cmd.append(os.path.basename(tmp_file.name))
	p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \
	stdout=subprocess.PIPE, stderr=open(os.devnull, 'w'))
	token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0]
	token_lines = token_lines.decode()
	lines = token_lines.split('\n')
	# remove temp file
	os.remove(tmp_file.name)

	# create dictionary for tokenized captions
	for k, line in zip(image_id, lines):
	if not k in tokenized_corpus:
	tokenized_corpus[k] = []
	tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \
	if w not in cls.punctuations])
	tokenized_corpus[k].append(tokenized_caption)

	return tokenized_corpus