#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Simple wrapper around the Stanford CoreNLP pipeline.

Serves commands to a java subprocess running the jar. Requires java 8.
"""

import copy
import json
import pexpect

from .tokenizer import Tokens, Tokenizer
from . import DEFAULTS


class CoreNLPTokenizer(Tokenizer):
    """Tokenizer that feeds text to an interactive CoreNLP shell via pexpect."""

    # Bracket escape tokens that `_convert` maps back to literal characters.
    _BRACKETS = {
        '-LRB-': '(',
        '-RRB-': ')',
        '-LSB-': '[',
        '-RSB-': ']',
        '-LCB-': '{',
        '-RCB-': '}',
    }

    def __init__(self, **kwargs):
        """
        Args:
            annotators: set that can include pos, lemma, and ner.
            classpath: Path to the corenlp directory of jars
            mem: Java heap memory
        """
        self.classpath = (kwargs.get('classpath') or
                          DEFAULTS['corenlp_classpath'])
        # An explicit annotators=None is treated like "no annotators" so the
        # membership tests in _launch do not fail with a TypeError.
        annotators = kwargs.get('annotators')
        if annotators is None:
            annotators = set()
        # Deep copy so later mutation by the caller cannot affect this object.
        self.annotators = copy.deepcopy(annotators)
        self.mem = kwargs.get('mem', '2g')
        self._launch()

    def _launch(self):
        """Start the CoreNLP jar with pexpect.

        Spawns a bash shell, starts the java pipeline inside it, and blocks
        until the first 'NLP>' prompt is seen (subject to the spawn timeout).
        """
        # Each annotator needs the ones before it, so the most demanding
        # requested annotator decides the whole chain.
        annotators = ['tokenize', 'ssplit']
        if 'ner' in self.annotators:
            annotators.extend(['pos', 'lemma', 'ner'])
        elif 'lemma' in self.annotators:
            annotators.extend(['pos', 'lemma'])
        elif 'pos' in self.annotators:
            annotators.extend(['pos'])
        annotators = ','.join(annotators)
        options = ','.join(['untokenizable=noneDelete', 'invertible=true'])
        cmd = ['java', '-mx' + self.mem, '-cp', '"%s"' % self.classpath,
               'edu.stanford.nlp.pipeline.StanfordCoreNLP', '-annotators',
               annotators, '-tokenize.options', options,
               '-outputFormat', 'json', '-prettyPrint', 'false']

        # We use pexpect to keep the subprocess alive and feed it commands.
        # Because we don't want to get hit by the max terminal buffer size,
        # we turn off canonical input processing to have unlimited bytes.
        self.corenlp = pexpect.spawn('/bin/bash', maxread=100000, timeout=60)
        self.corenlp.setecho(False)
        self.corenlp.sendline('stty -icanon')
        self.corenlp.sendline(' '.join(cmd))
        self.corenlp.delaybeforesend = 0
        self.corenlp.delayafterread = 0
        self.corenlp.expect_exact('NLP>', searchwindowsize=100)

    @staticmethod
    def _convert(token):
        """Map a bracket escape token (e.g. '-LRB-') to its literal character.

        Any token that is not one of the six escapes is returned unchanged.
        """
        return CoreNLPTokenizer._BRACKETS.get(token, token)

    def tokenize(self, text):
        """Tokenize `text` with the running CoreNLP subprocess.

        Args:
            text: string to tokenize; newlines are replaced by spaces before
                the text is sent to the subprocess.

        Returns:
            A Tokens object built from one tuple per token:
            (word, text including trailing whitespace, (begin, end) character
            offsets, pos, lemma, ner). Fields that CoreNLP did not produce
            are None.

        Raises:
            RuntimeError: if `text` contains the prompt string 'NLP>'.
            json.JSONDecodeError: if the subprocess output has no parsable
                JSON payload.
        """
        # Since we're feeding text to the commandline, we're waiting on seeing
        # the NLP> prompt. Hacky!
        if 'NLP>' in text:
            raise RuntimeError('Bad token (NLP>) in text!')

        # Sending q will cause the process to quit -- manually override
        if text.lower().strip() == 'q':
            token = text.strip()
            index = text.index(token)
            data = [(token, text[index:], (index, index + 1), 'NN', 'q', 'O')]
            return Tokens(data, self.annotators)

        # Minor cleanup before tokenizing.
        clean_text = text.replace('\n', ' ')

        self.corenlp.sendline(clean_text.encode('utf-8'))
        self.corenlp.expect_exact('NLP>', searchwindowsize=100)

        # Skip to start of output (may have been stderr logging messages)
        output = self.corenlp.before
        start = output.find(b'{"sentences":')
        if start < 0:
            # find() returned -1: slicing from it would silently parse only
            # the final byte. Fail with the same exception type json.loads
            # raises, but with a message that names the actual problem.
            raise json.JSONDecodeError(
                'No CoreNLP JSON payload found in subprocess output',
                output.decode('utf-8', errors='replace'), 0)
        output = json.loads(output[start:].decode('utf-8'))

        data = []
        tokens = [t for s in output['sentences'] for t in s['tokens']]
        for i, token in enumerate(tokens):
            begin = token['characterOffsetBegin']
            end = token['characterOffsetEnd']
            # Whitespace after a token runs up to the start of the next
            # token; the last token has no trailing whitespace.
            if i + 1 < len(tokens):
                end_ws = tokens[i + 1]['characterOffsetBegin']
            else:
                end_ws = end

            data.append((
                self._convert(token['word']),
                text[begin: end_ws],
                (begin, end),
                token.get('pos', None),
                token.get('lemma', None),
                token.get('ner', None)
            ))
        return Tokens(data, self.annotators)