File size: 4,678 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
"""
(Hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format.
Assumes tokenized and sentence-split text.
To get Moses XML format, first projectivize the trees, then use
conll2mosesxml.py.
"""
from __future__ import print_function, unicode_literals
import os
import sys
import codecs
import argparse
from subprocess import Popen, PIPE
# hack for python2/3 compatibility
from io import open
argparse.open = open
def create_parser():
    """Build and return the command-line argument parser for this script."""
    parser = argparse.ArgumentParser(
        description=(
            """Wrapper around Stanford CoreNLP to produce CoNLL dependency format.
Assumes that text is tokenized and has one sentence per line."""))

    parser.add_argument(
        '--stanford', type=str, metavar='PATH', required=True,
        help='path to Stanford CoreNLP')
    parser.add_argument(
        '--java', type=str, metavar='PATH', default='java',
        help='path to java executable')
    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), metavar='PATH',
        default=sys.stdin,
        help="Input text (default: standard input).")
    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), metavar='PATH',
        default=sys.stdout,
        help="Output text (default: standard output).")

    return parser
def process_stanford(infile, javacmd, stanfordpath):
    """Run Stanford CoreNLP as a subprocess and return its stdout stream.

    :param infile: open file object with tokenized, one-sentence-per-line
        text; connected to CoreNLP's stdin.
    :param javacmd: path to the java executable.
    :param stanfordpath: directory containing the CoreNLP 3.5.0 jars.
    :return: file object reading CoreNLP's (byte) stdout.
    """
    corenlp_jar = os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar')
    corenlp_models_jar = os.path.join(
        stanfordpath, 'stanford-corenlp-3.5.0-models.jar')
    stanford = Popen(
        [
            javacmd,
            # os.pathsep keeps the classpath portable (':' on POSIX,
            # ';' on Windows).
            '-cp', os.pathsep.join([corenlp_jar, corenlp_models_jar]),
            'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
            '-ssplit.eolonly', 'true',
            '-tokenize.whitespace', 'true',
            '-numThreads', '8',
            '-textFile', '-',
            # bug fix: this flag was previously passed as bare 'outFile'
            # (missing the leading dash, unlike '-textFile' above), so
            # CoreNLP saw a stray argument instead of the option that
            # directs its output to stdout.
            '-outFile', '-',
        ],
        stdin=infile, stdout=PIPE)
    return stanford.stdout
def get_sentences(instream):
    """Parse Stanford CoreNLP plain-text output into sentences.

    Yields one list per sentence; each element is a dict with the keys
    'word', 'lemma', 'pos', 'head' and 'label'.
    """
    # State machine over CoreNLP's output:
    #   0 - waiting for a "Sentence #" header
    #   1 - expecting the line with the sentence's tokens
    #   2 - expecting the "[Text=...] [Text=...]" attribute line
    #   3 - expecting dependency lines like "det(cat-2, the-1)"
    state = 0
    tokens = []
    for raw in instream:
        if state == 0 and raw.startswith('Sentence #'):
            if tokens:
                yield tokens
                tokens = []
            state = 1
        elif raw == '\n':
            state = 0
        elif state == 3:
            # "label(governor-i, dependent-j)" -> record head/label on token j.
            label, args = raw.split('(')
            governor, dependent = args.split()
            head_idx = int(governor.split('-')[-1][:-1])
            dep_idx = int(dependent.split('-')[-1][:-1])
            tokens[dep_idx - 1]['head'] = head_idx
            tokens[dep_idx - 1]['label'] = label
        elif state == 2:
            inner = raw.split('[', 1)[1].rsplit(']', 1)[0]
            chunks = inner.split('] [')
            if len(chunks) != len(tokens):
                # Token count disagrees with the header line; fall back to
                # placeholder annotations and skip this sentence's parse.
                sys.stderr.write(
                    "Warning: mismatch in number of words in sentence\n")
                sys.stderr.write(' '.join(t['word'] for t in tokens))
                for tok in tokens:
                    tok['pos'] = '-'
                    tok['lemma'] = '-'
                    tok['head'] = 0
                    tok['label'] = '-'
                state = 0
                continue
            for tok, chunk in zip(tokens, chunks):
                tok['pos'] = chunk.split(' PartOfSpeech=')[-1].split()[0]
                tok['lemma'] = chunk.split(' Lemma=')[-1]
            state = 3
        elif state == 1:
            tokens.extend({'word': w} for w in raw.split())
            state = 2
    # Flush the final sentence (no trailing header follows it).
    if tokens:
        yield tokens
def write(sentence, outstream):
    """Write one sentence to *outstream* in tab-separated CoNLL format.

    Columns: 1-based index, word, lemma, pos, pos, '-', head, label.
    """
    for index, token in enumerate(sentence, start=1):
        fields = (index, token['word'], token['lemma'], token['pos'],
                  token['pos'], '-', token['head'], token['label'])
        outstream.write('\t'.join(str(field) for field in fields) + '\n')
if __name__ == '__main__':
    # On Python 2 the standard streams are byte streams; wrap them so all
    # reading/writing goes through UTF-8 (the file's python2/3 compat hack).
    # Must happen before parse_args(), which may bind sys.stdin/stdout as
    # the --input/--output defaults.
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    parser = create_parser()
    options = parser.parse_args()
    # Feed the input text through CoreNLP; its stdout is a byte stream, so
    # decode it as UTF-8 before parsing.
    stanford = process_stanford(options.input, options.java, options.stanford)
    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
        write(sentence, options.output)
        # Blank line terminates each sentence in CoNLL format.
        options.output.write('\n')
|