#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

"""(Hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format.
Assumes tokenized and sentence-split text.

To get Moses XML format, first projectivize the trees,
then use conll2mosesxml.py.
"""

from __future__ import print_function, unicode_literals

import os
import sys
import codecs
import argparse
from subprocess import Popen, PIPE

# hack for python2/3 compatibility
from io import open
argparse.open = open


def create_parser():
    parser = argparse.ArgumentParser(
        description=(
            "Wrapper around Stanford CoreNLP to produce CoNLL dependency "
            "format. Assumes that text is tokenized and has one sentence "
            "per line."))

    parser.add_argument(
        '--stanford', type=str, metavar='PATH', required=True,
        help='path to Stanford CoreNLP')
    parser.add_argument(
        '--java', type=str, default='java', metavar='PATH',
        help='path to java executable')
    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input text (default: standard input).")
    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output text (default: standard output).")

    return parser


def process_stanford(infile, javacmd, stanfordpath):
    """Run CoreNLP on infile and return its stdout stream."""
    corenlp_jar = os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar')
    corenlp_models_jar = os.path.join(
        stanfordpath, 'stanford-corenlp-3.5.0-models.jar')
    stanford = Popen(
        [
            javacmd,
            '-cp', "%s:%s" % (corenlp_jar, corenlp_models_jar),
            'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
            '-ssplit.eolonly', 'true',
            '-tokenize.whitespace', 'true',
            '-numThreads', '8',
            '-textFile', '-',
            '-outFile', '-',
        ],
        stdin=infile, stdout=PIPE)
    return stanford.stdout


def get_sentences(instream):
    """Parse CoreNLP's plain-text report into one dict per token.

    A simple state machine over the report: expect 0 = waiting for a
    "Sentence #" header, 1 = the token line, 2 = the annotation line
    ("[Text=... PartOfSpeech=... Lemma=...]" blocks), 3 = dependency
    lines of the form "rel(head-i, dep-j)".
    """
    sentence = []
    expect = 0
    for line in instream:
        if expect == 0 and line.startswith('Sentence #'):
            if sentence:
                yield sentence
            sentence = []
            expect = 1
        elif line == '\n':
            expect = 0
        elif expect == 3:
            # dependency line, e.g. "nsubj(likes-2, John-1)"
            rel, remainder = line.split('(')
            head, dep = remainder.split()
            head_int = int(head.split('-')[-1][:-1])  # strip trailing ','
            dep_int = int(dep.split('-')[-1][:-1])    # strip trailing ')'
            sentence[dep_int - 1]['head'] = head_int
            sentence[dep_int - 1]['label'] = rel
        elif expect == 2:
            linesplit = line.split('[', 1)[1].rsplit(']', 1)[0].split('] [')
            if len(linesplit) != len(sentence):
                sys.stderr.write(
                    "Warning: mismatch in number of words in sentence\n")
                sys.stderr.write(
                    ' '.join(w['word'] for w in sentence) + '\n')
                # fall back to dummy annotations for this sentence
                for i in range(len(sentence)):
                    sentence[i]['pos'] = '-'
                    sentence[i]['lemma'] = '-'
                    sentence[i]['head'] = 0
                    sentence[i]['label'] = '-'
                expect = 0
                continue
            for i, w in enumerate(linesplit):
                sentence[i]['pos'] = w.split(' PartOfSpeech=')[-1].split()[0]
                sentence[i]['lemma'] = w.split(' Lemma=')[-1]
            expect = 3
        elif expect == 1:
            for w in line.split():
                sentence.append({'word': w})
            expect = 2

    if sentence:
        yield sentence


def write(sentence, outstream):
    """Write one sentence as tab-separated CoNLL rows."""
    for i, w in enumerate(sentence):
        outstream.write(
            '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(
                i + 1, w['word'], w['lemma'], w['pos'], w['pos'], '-',
                w['head'], w['label']))


if __name__ == '__main__':
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    parser = create_parser()
    options = parser.parse_args()
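    # Pipe the input through CoreNLP, then convert each parsed sentence
    # from CoreNLP's plain-text report into CoNLL rows, with a blank line
    # between sentences.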
    stanford = process_stanford(options.input, options.java, options.stanford)

    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
        write(sentence, options.output)
        options.output.write('\n')
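
# Example invocation (a sketch; the script, directory, and file names below
# are placeholders, not part of this repository). process_stanford() expects
# the CoreNLP 3.5.0 jars to sit directly inside the --stanford directory:
#
#   python parse_stanford.py --stanford /opt/corenlp -i tokenized.en -o out.conll
#
# Each output line is one token:
#   ID  FORM  LEMMA  POS  POS  -  HEAD  DEPREL
# with a blank line after every sentence.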