#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Simple wrapper around the Stanford CoreNLP pipeline.
Serves commands to a java subprocess running the jar. Requires java 8.
"""
import copy
import json
import pexpect

from .tokenizer import Tokens, Tokenizer
from . import DEFAULTS


class CoreNLPTokenizer(Tokenizer):

    def __init__(self, **kwargs):
        """
        Args:
            annotators: set that can include pos, lemma, and ner.
            classpath: Path to the corenlp directory of jars
            mem: Java heap memory
        """
        self.classpath = (kwargs.get('classpath') or
                          DEFAULTS['corenlp_classpath'])
        self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
        self.mem = kwargs.get('mem', '2g')
        self._launch()

    def _launch(self):
        """Start the CoreNLP jar with pexpect."""
        annotators = ['tokenize', 'ssplit']
        if 'ner' in self.annotators:
            annotators.extend(['pos', 'lemma', 'ner'])
        elif 'lemma' in self.annotators:
            annotators.extend(['pos', 'lemma'])
        elif 'pos' in self.annotators:
            annotators.extend(['pos'])
        annotators = ','.join(annotators)
        options = ','.join(['untokenizable=noneDelete',
                            'invertible=true'])
        cmd = ['java', '-mx' + self.mem, '-cp', '"%s"' % self.classpath,
               'edu.stanford.nlp.pipeline.StanfordCoreNLP', '-annotators',
               annotators, '-tokenize.options', options,
               '-outputFormat', 'json', '-prettyPrint', 'false']

        # We use pexpect to keep the subprocess alive and feed it commands.
        # Because we don't want to get hit by the max terminal buffer size,
        # we turn off canonical input processing to have unlimited bytes.
        self.corenlp = pexpect.spawn('/bin/bash', maxread=100000, timeout=60)
        self.corenlp.setecho(False)
        self.corenlp.sendline('stty -icanon')
        self.corenlp.sendline(' '.join(cmd))
        self.corenlp.delaybeforesend = 0
        self.corenlp.delayafterread = 0
        self.corenlp.expect_exact('NLP>', searchwindowsize=100)

    @staticmethod
    def _convert(token):
        """Map PTB-style escaped brackets emitted by CoreNLP back to literals."""
        if token == '-LRB-':
            return '('
        if token == '-RRB-':
            return ')'
        if token == '-LSB-':
            return '['
        if token == '-RSB-':
            return ']'
        if token == '-LCB-':
            return '{'
        if token == '-RCB-':
            return '}'
        return token

    def tokenize(self, text):
        # Since we're feeding text to the commandline, we're waiting on seeing
        # the NLP> prompt. Hacky!
        if 'NLP>' in text:
            raise RuntimeError('Bad token (NLP>) in text!')

        # Sending q will cause the process to quit -- manually override
        if text.lower().strip() == 'q':
            token = text.strip()
            index = text.index(token)
            data = [(token, text[index:], (index, index + 1), 'NN', 'q', 'O')]
            return Tokens(data, self.annotators)

        # Minor cleanup before tokenizing.
        clean_text = text.replace('\n', ' ')

        self.corenlp.sendline(clean_text.encode('utf-8'))
        self.corenlp.expect_exact('NLP>', searchwindowsize=100)

        # Skip to start of output (may have been stderr logging messages)
        output = self.corenlp.before
        start = output.find(b'{"sentences":')
        output = json.loads(output[start:].decode('utf-8'))

        data = []
        tokens = [t for s in output['sentences'] for t in s['tokens']]
        for i in range(len(tokens)):
            # Whitespace span: from this token's start to the next token's
            # start (or to this token's end if it is the last token).
            start_ws = tokens[i]['characterOffsetBegin']
            if i + 1 < len(tokens):
                end_ws = tokens[i + 1]['characterOffsetBegin']
            else:
                end_ws = tokens[i]['characterOffsetEnd']

            data.append((
                self._convert(tokens[i]['word']),
                text[start_ws: end_ws],
                (tokens[i]['characterOffsetBegin'],
                 tokens[i]['characterOffsetEnd']),
                tokens[i].get('pos', None),
                tokens[i].get('lemma', None),
                tokens[i].get('ner', None)
            ))
        return Tokens(data, self.annotators)
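

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library). It assumes the
# CoreNLP jars have already been downloaded, that DEFAULTS['corenlp_classpath']
# (or an explicit `classpath` kwarg) points at them, and that a Java 8 runtime
# is on the PATH. The words()/pos() accessors are assumed to come from the
# Tokens class in the sibling tokenizer module.
if __name__ == '__main__':
    tokenizer = CoreNLPTokenizer(annotators={'pos', 'lemma', 'ner'})
    tokens = tokenizer.tokenize('Stanford (CA) is in the Bay Area.')
    # _convert() maps -LRB-/-RRB- back to literal brackets in the word list.
    print(tokens.words())
    # POS tags are present because 'pos' was requested in the annotators.
    print(tokens.pos())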