Spaces:

zhenyundeng
/

AVeriTeC-API

Build error

AVeriTeC-API / drqa /tokenizers /corenlp_tokenizer.py

zhenyundeng

add files

e62781a 12 months ago

4.49 kB

	#!/usr/bin/env python3
	# Copyright 2017-present, Facebook, Inc.
	# All rights reserved.
	#
	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.
	"""Simple wrapper around the Stanford CoreNLP pipeline.

	Serves commands to a java subprocess running the jar. Requires java 8.
	"""

	import copy
	import json
	import pexpect

	from .tokenizer import Tokens, Tokenizer
	from . import DEFAULTS


	class CoreNLPTokenizer(Tokenizer):
	    """Tokenizer that drives a Stanford CoreNLP java process via pexpect.

	    A bash shell is spawned, the CoreNLP pipeline is started inside it, and
	    every call to ``tokenize`` writes one line of text to that process and
	    parses the JSON document found in the output that precedes the next
	    ``NLP>`` prompt.
	    """

	    # Bracket placeholder tokens that appear in CoreNLP output, mapped back
	    # to the literal bracket characters.
	    _BRACKETS = {
	        '-LRB-': '(',
	        '-RRB-': ')',
	        '-LSB-': '[',
	        '-RSB-': ']',
	        '-LCB-': '{',
	        '-RCB-': '}',
	    }

	    def __init__(self, **kwargs):
	        """Store the configuration and launch the CoreNLP subprocess.

	        Args:
	            annotators: set that can include pos, lemma, and ner.
	                Defaults to an empty set when missing or None.
	            classpath: Path to the corenlp directory of jars. Falls back to
	                DEFAULTS['corenlp_classpath'] when missing or falsy.
	            mem: Java heap memory. Defaults to '2g' when missing or falsy.
	        """
	        self.classpath = (kwargs.get('classpath') or
	                          DEFAULTS['corenlp_classpath'])
	        # Treat an explicit None like a missing value (as is already done
	        # for classpath) so that _launch does not fail on `'ner' in None`.
	        annotators = kwargs.get('annotators')
	        if annotators is None:
	            self.annotators = set()
	        else:
	            # Copy so later changes to the caller's object do not leak in.
	            self.annotators = copy.deepcopy(annotators)
	        self.mem = kwargs.get('mem') or '2g'
	        self._launch()

	    def _launch(self):
	        """Start the CoreNLP jar with pexpect.

	        'tokenize' and 'ssplit' are always requested. Asking for 'ner' also
	        enables 'pos' and 'lemma'; asking for 'lemma' also enables 'pos'.
	        Returns once the process has printed its first ``NLP>`` prompt.
	        """
	        annotators = ['tokenize', 'ssplit']
	        if 'ner' in self.annotators:
	            annotators.extend(['pos', 'lemma', 'ner'])
	        elif 'lemma' in self.annotators:
	            annotators.extend(['pos', 'lemma'])
	        elif 'pos' in self.annotators:
	            annotators.extend(['pos'])
	        annotators = ','.join(annotators)
	        options = ','.join(['untokenizable=noneDelete',
	                            'invertible=true'])
	        # The classpath is wrapped in double quotes because the command is
	        # joined into a single line and typed into a bash shell below.
	        cmd = ['java', '-mx' + self.mem, '-cp', '"%s"' % self.classpath,
	               'edu.stanford.nlp.pipeline.StanfordCoreNLP', '-annotators',
	               annotators, '-tokenize.options', options,
	               '-outputFormat', 'json', '-prettyPrint', 'false']

	        # We use pexpect to keep the subprocess alive and feed it commands.
	        # Because we don't want to get hit by the max terminal buffer size,
	        # we turn off canonical input processing to have unlimited bytes.
	        self.corenlp = pexpect.spawn('/bin/bash', maxread=100000, timeout=60)
	        self.corenlp.setecho(False)
	        self.corenlp.sendline('stty -icanon')
	        self.corenlp.sendline(' '.join(cmd))
	        # The delays are zeroed only after the two start-up lines were sent.
	        self.corenlp.delaybeforesend = 0
	        self.corenlp.delayafterread = 0
	        self.corenlp.expect_exact('NLP>', searchwindowsize=100)

	    @staticmethod
	    def _convert(token):
	        """Return the bracket character for a placeholder token.

	        Any token that is not one of the six placeholders is returned
	        unchanged.
	        """
	        return CoreNLPTokenizer._BRACKETS.get(token, token)

	    def tokenize(self, text):
	        """Tokenize `text` with the running CoreNLP process.

	        Args:
	            text: string to tokenize. Newlines are replaced by spaces before
	                the text is sent, so it reaches the process as one line.

	        Returns:
	            Tokens built from one tuple per token: (word, the slice of
	            `text` from this token's start up to the next token's start,
	            (start, end) character offsets, pos, lemma, ner). Annotations
	            missing from the output are None.

	        Raises:
	            RuntimeError: if `text` contains the prompt string 'NLP>'.
	            ValueError: if no JSON document is found in the process output.
	        """
	        # Since we're feeding text to the commandline, we're waiting on seeing
	        # the NLP> prompt. Hacky!
	        if 'NLP>' in text:
	            raise RuntimeError('Bad token (NLP>) in text!')

	        # Sending q will cause the process to quit -- manually override
	        if text.lower().strip() == 'q':
	            token = text.strip()
	            index = text.index(token)
	            data = [(token, text[index:], (index, index + 1), 'NN', 'q', 'O')]
	            return Tokens(data, self.annotators)

	        # Minor cleanup before tokenizing.
	        clean_text = text.replace('\n', ' ')

	        # Nothing to tokenize: skip the round trip to the subprocess. A bare
	        # newline may not yield a JSON document at all (TODO confirm against
	        # the CoreNLP shell), which would make the parsing below fail.
	        if not clean_text:
	            return Tokens([], self.annotators)

	        self.corenlp.sendline(clean_text.encode('utf-8'))
	        self.corenlp.expect_exact('NLP>', searchwindowsize=100)

	        # Skip to start of output (may have been stderr logging messages)
	        output = self.corenlp.before
	        start = output.find(b'{"sentences":')
	        if start < 0:
	            # Without this check find() == -1 would hand the last byte of
	            # the output to the JSON parser and hide the real problem.
	            raise ValueError(
	                'No CoreNLP JSON output found; process output ended with: %r'
	                % output[-200:]
	            )
	        output = json.loads(output[start:].decode('utf-8'))

	        tokens = [t for s in output['sentences'] for t in s['tokens']]
	        data = []
	        for i, tok in enumerate(tokens):
	            begin = tok['characterOffsetBegin']
	            end = tok['characterOffsetEnd']
	            # Get whitespace: the text span runs up to the start of the
	            # next token, or to this token's own end for the last one.
	            if i + 1 < len(tokens):
	                end_ws = tokens[i + 1]['characterOffsetBegin']
	            else:
	                end_ws = end

	            data.append((
	                self._convert(tok['word']),
	                text[begin:end_ws],
	                (begin, end),
	                tok.get('pos', None),
	                tok.get('lemma', None),
	                tok.get('ner', None),
	            ))
	        return Tokens(data, self.annotators)