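"""Extracts dependency paths for labeled (modifier, head) pairs from a corpus.

The labeled-pairs file is tab-separated with one pair per line:
modifier<TAB>head<TAB>relation. The corpus is read one sentence per line
(from --corpus, or stdin when the flag is empty); each sentence is parsed
with spaCy, and for every labeled pair whose modifier and head co-occur in
the sentence a line of the form
modifier<TAB>head<TAB>relation<TAB>path<TAB>sentence is written to --output
(or stdout when the flag is empty).
"""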
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools
import sys

import spacy
import tensorflow as tf

tf.flags.DEFINE_string('corpus', '', 'Filename of corpus')
tf.flags.DEFINE_string('labeled_pairs', '', 'Filename of labeled pairs')
tf.flags.DEFINE_string('output', '', 'Filename of output file')
FLAGS = tf.flags.FLAGS


def get_path(mod_token, head_token):
  """Returns the dependency path between a modifier token and a head token.

  The path climbs from the modifier (replaced by the placeholder <X>) to the
  lowest common ancestor of the two tokens and then descends to the head
  (replaced by <Y>). Each step is encoded as lemma/POS/dependency-label plus
  a direction marker ('>' up, '^' common ancestor, '<' down), and steps are
  joined with '::'. Returns None if either token has no ancestors or the two
  ancestor chains do not share a root.
  """
  # Ancestors ordered from the sentence root down to the token's parent.
  mod_ancestors = list(reversed(list(mod_token.ancestors)))
  head_ancestors = list(reversed(list(head_token.ancestors)))

  # Both tokens must belong to the same parse tree (share the same root).
  if (not mod_ancestors or not head_ancestors
      or mod_ancestors[0] != head_ancestors[0]):
    return None

  # Advance to the first index where the two ancestor chains diverge; the
  # token at ix - 1 is then the lowest common ancestor.
  ix = 1
  while (ix < len(mod_ancestors) and ix < len(head_ancestors)
         and mod_ancestors[ix] == head_ancestors[ix]):
    ix += 1

  # Upward steps from the modifier, the common ancestor, then downward steps
  # to the head.
  path = ['/'.join(('<X>', mod_token.pos_, mod_token.dep_, '>'))]
  path += ['/'.join((tok.lemma_, tok.pos_, tok.dep_, '>'))
           for tok in reversed(mod_ancestors[ix:])]
  root_token = mod_ancestors[ix - 1]
  path += ['/'.join((root_token.lemma_, root_token.pos_, root_token.dep_, '^'))]
  path += ['/'.join((tok.lemma_, tok.pos_, tok.dep_, '<'))
           for tok in head_ancestors[ix:]]
  path += ['/'.join(('<Y>', head_token.pos_, head_token.dep_, '<'))]
  return '::'.join(path)


def main(_):
  nlp = spacy.load('en_core_web_sm')

  # Read the labeled (modifier, head) -> relation mapping.
  with tf.gfile.GFile(FLAGS.labeled_pairs) as fh:
    parts = (tf.compat.as_text(l).split('\t') for l in fh.read().splitlines())
    labeled_pairs = {(mod, head): rel for mod, head, rel in parts}

  # Group the modifiers by head word so that, for each sentence, only the
  # modifiers relevant to a matched head need to be checked.
  mods_for_head = {
      head: set(hm[1] for hm in head_mods)
      for head, head_mods in itertools.groupby(
          sorted((head, mod) for (mod, head) in labeled_pairs.keys()),
          lambda head_mod: head_mod[0])}

  heads = set(mods_for_head.keys())

  out_fh = sys.stdout if not FLAGS.output else tf.gfile.GFile(FLAGS.output, 'w')
  in_fh = sys.stdin if not FLAGS.corpus else tf.gfile.GFile(FLAGS.corpus)

  num_paths = 0
  for line_num, sen in enumerate(in_fh, start=1):
    if line_num % 100 == 0:
      print('\rProcessing line %d: %d paths' % (line_num, num_paths),
            end='', file=sys.stderr)

    sen = tf.compat.as_text(sen).strip()
    doc = nlp(sen)

    # Emit a dependency path for every labeled pair whose modifier and head
    # both occur in this sentence.
    for head_token in doc:
      head_text = head_token.text.lower()
      if head_text in heads:
        mods = mods_for_head[head_text]
        for mod_token in doc:
          mod_text = mod_token.text.lower()
          if mod_text in mods:
            path = get_path(mod_token, head_token)
            if path:
              label = labeled_pairs[(mod_text, head_text)]
              out_line = '\t'.join((mod_text, head_text, label, path, sen))
              print(tf.compat.as_str(out_line), file=out_fh)
              num_paths += 1

  if FLAGS.output:
    out_fh.close()


if __name__ == '__main__':
  tf.app.run()