Spaces:

ltg
/

ssa-perin

Runtime error

File size: 8,197 Bytes
import re;
import sys;

from graph import Graph;

# Comment line that carries the raw sentence: "# text = ...".
TEXT = re.compile(r"^# text = (.+)$")
# Comment line that carries the sentence identifier: "# sent_id = ...".
ID = re.compile(r"^# sent_id = (.+)$")
# ID column of a multi-word token, written as the range "i-j".
RANGE = re.compile(r"^([0-9]+)-([0-9]+)$")
# "TokenRange=from:to" entry, matched against the MISC column to obtain
# character anchors; the two groups are the start and end offsets.
ANCHOR = re.compile(r".*TokenRange=([0-9]+):([0-9]+)")

def read_tuples(stream):
  """ Iterate over the sentence blocks of a CoNLL-U stream.

  Yields one `(id, input, tuples)` triple per block, where `id` is taken
  from the `# sent_id = ...` comment (None if absent), `input` from the
  `# text = ...` comment, and `tuples` is the list of tab-split token lines.
  If a block has no `text` comment, `input` is reconstructed from the FORM
  column, since it is required in :construct_graph.

  Blocks are terminated by a blank line; a final block that is not followed
  by a blank line is yielded as well, so that the last sentence of a file
  lacking the trailing separator is not lost. """
  id, input = None, None
  tuples = []
  for line in stream:
    line = line.rstrip()
    if line.startswith("#"):
      match = TEXT.match(line)
      if match:
        input = match.group(1)
      else:
        match = ID.match(line)
        if match:
          id = match.group(1)
      # any other comment line is ignored
    elif not line:
      # A blank line with no pending tokens (e.g. repeated separators) must
      # not touch `input`: reconstructing from an empty block would set it
      # to '' and suppress reconstruction for the following block.
      if tuples:
        if input is None:
          input = reconstruct_input_from_tuples(tuples)
        yield id, input, tuples
        id, input = None, None
        tuples = []
    else:
      tuples.append(line.split("\t"))
  # flush a trailing block that was not closed by a blank line
  if tuples:
    if input is None:
      input = reconstruct_input_from_tuples(tuples)
    yield id, input, tuples

def reconstruct_input_from_tuples(tuples):
  """ Reconstruct input sentence from the CoNLL-U representation.

  Each element of `tuples` corresponds to one token line of a block, split
  into its columns. Only surface tokens contribute - empty nodes and words
  covered by a multi-word range are discarded. Forms are separated by a
  single space unless the MISC column (last column) of the preceding token
  contains "SpaceAfter=No". No space is ever added after the last surface
  token, even when further non-surface lines (e.g. the words of a final
  multi-word token) follow it in the block.

  Returns the reconstructed string, or '' for an empty block. """
  if not tuples: return ''
  # iterate only surface tokens - discard empty nodes and tokens included in ranges
  surface_indicator = get_is_surface_token_indicator(tuples)
  surface_tuples = [fields
                    for is_surface, fields in zip(surface_indicator, tuples)
                    if is_surface]
  last = len(surface_tuples) - 1
  pieces = []
  for index, fields in enumerate(surface_tuples):
    pieces.append(fields[1]) # FORM column
    # "last" is judged among the surface tokens, not among all lines
    if index < last and "SpaceAfter=No" not in fields[-1]:
      pieces.append(' ')
  return ''.join(pieces)

def get_ids2range_tuple(tuples):
  """
  Map every word id covered by a multi-word token to that token's line.

  Return Dict[int: tuple]: for a line whose ID column is a range "i-j",
  each integer k with i <= k <= j is mapped to the (column-split) line of
  the range itself. Lines whose ID is not a range contribute nothing. If
  several ranges cover the same k, the one appearing last wins.
  """
  covered = {}
  for fields in tuples:
    span = RANGE.match(fields[0])
    if span is None:
      continue
    first, last = int(span.group(1)), int(span.group(2))
    for word_id in range(first, last + 1):
      covered[word_id] = fields
  return covered

def get_is_surface_token_indicator(tuples):
  """
  Return a list of booleans, parallel to `tuples`, telling for each line
  whether it is a surface token, i.e. one needed to detokenize the sentence.
  see https://universaldependencies.org/format.html#words-tokens-and-empty-nodes

  A line is a surface token when
    1. it is not an empty node (id of the form "i.j"), and
    2. it is either a multi-word range itself (id of the form "i-j") or a
       word whose integer id is not covered by any such range.
  """
  ids2range_tuple = get_ids2range_tuple(tuples)

  def is_surface(tid):
    if "." in tid: # condition 1.
      return False
    if "-" in tid: # a range line is itself a surface token
      return True
    return int(tid) not in ids2range_tuple # condition 2.

  token_ids = [fields[0] for fields in tuples]
  return list(map(is_surface, token_ids))

def read_anchors(stream):
  """ Iterate over the anchor blocks of `stream`.

  If `stream` is None, yields `(None, None)` forever, so that callers can
  always ask for "the next block".

  Otherwise yields `(id, tokens)` per block. A line starting with "#" sets
  `id` to the remainder of that line; a line with exactly three
  tab-separated fields contributes `(int(field0), int(field1))` to
  `tokens` (lines with another number of fields are ignored); an empty line
  ends the block, even if no tokens were collected. A trailing block that
  is not closed by an empty line is yielded only if it contains tokens.

  Every yielded `tokens` list is a distinct object that is not touched
  again by the generator, so consumers may keep or mutate it freely. """
  if stream is None:
    while True: yield None, None
  else:
    id = None
    tokens = []
    for line in stream:
      line = line.rstrip("\n")
      if not line:
        yield id, tokens
        id = None
        # start a fresh list instead of clearing the one just handed out
        tokens = []
      elif line.startswith("#"):
        id = line[1:]
      else:
        fields = line.split("\t")
        if len(fields) == 3:
          tokens.append((int(fields[0]), int(fields[1])))
    if tokens:
      yield id, tokens

def construct_graph_nodes(id, input, tuples, framework, text, anchors):
  """ Create a graph holding one node per line of a CoNLL-U sentence block.

  Arguments:
    id -- graph identifier, passed to `Graph`.
    input -- sentence string; when None, `text` (if not None) is handed to
      `graph.add_input` instead. Anchoring then uses `graph.input`.
    tuples -- the block's token lines, each split into its columns
      (FORM, LEMMA, UPOS, XPOS, FEATS, HEAD are read from columns 1-6 and
      MISC from column 9).
    framework -- passed to `Graph`.
    text -- fallback argument for `graph.add_input`, see `input`.
    anchors -- stream given to :read_anchors, or None; one anchor block is
      consumed from it per call.

  Nodes are numbered 1, 2, ... in line order. Each node is labelled with
  its FORM, carries lemma/upos/xpos plus every FEATS entry as properties,
  and is marked `top` when its HEAD is "0". Non-surface lines (see
  :get_is_surface_token_indicator) get no anchors. Surface tokens take
  their anchor from, in order of preference: the pre-computed anchor block,
  a "TokenRange=from:to" entry in MISC, or a left-to-right search for the
  form in the input string.

  Returns `(graph, ids)`, where `ids` maps each ID-column string to the
  node id assigned to it.

  Raises Exception when a form cannot be located in the input. """
  # Typographic variants tried when a form does not occur verbatim in the
  # input. This is an ordered tuple rather than a set: when two variants
  # match at the same position (e.g. "--" and "---" for an en dash) the
  # earlier entry wins, so the outcome no longer depends on per-process
  # string hash randomization.
  variants = (("‘", "`"), ("‘", "'"), ("’", "'"), ("`", "'"),
              ("“", "\""), ("”", "\""),
              ("–", "--"), ("–", "---"), ("—", "---"),
              ("…", "..."), ("…", ". . ."))
  i = 0 # search cursor into `input`; only ever moves forward

  def compute(form):
    """ Locate `form` in `input` at or after the cursor, advance the cursor
    past the match, and return its {"from", "to"} character span. """
    nonlocal i
    m = None
    j = input.find(form, i)
    if j >= i:
      i, m = j, len(form)
    else:
      # fall back to the leftmost match among the typographic variants
      k, l = len(input), 0
      for old, new in variants:
        candidate = form.replace(old, new)
        j = input.find(candidate, i)
        if j >= i and j < k: k, l = j, len(candidate)
      if k < len(input): i, m = k, l
    if m:
      match = {"from": i, "to": i + m}
      i += m
      return match
    # report the original form, not the last variant that was tried
    raise Exception("[{}] failed to anchor |{}| in |{}|{}| ({})"
                    "".format(graph.id, form, input[:i], input[i:], i))

  graph = Graph(id, flavor = 0, framework = framework)
  if input is not None: graph.add_input(input)
  elif text is not None: graph.add_input(text)
  input = graph.input

  anchors_generator = read_anchors(anchors)
  _, anchors_tokens = next(anchors_generator)
  node_id, ids = 0, dict()
  ids2range_tuple = get_ids2range_tuple(tuples)
  for fields, is_surface_token in zip(tuples, get_is_surface_token_indicator(tuples)):
    node_id += 1
    ids[fields[0]] = node_id
    form, lemma, upos, xpos, features, head, misc = \
      fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[9]
    properties = {"lemma": lemma, "upos": upos, "xpos": xpos}
    if features != "_":
      for feature in features.split("|"):
        name, value = feature.split("=", 1)
        properties[name] = value
    # retrieve anchoring - only for surface tokens
    if not is_surface_token:
      node_anchors = []
    elif anchors_tokens is not None:
      start, end = anchors_tokens.pop(0)
      node_anchors = [{"from": start, "to": end}]
    else:
      tid = fields[0]
      # NOTE(review): a surface token with a plain integer id is by
      # definition not covered by a range, so this lookup looks like it can
      # never succeed here - confirm before relying on or removing it.
      if tid.isnumeric() and int(tid) in ids2range_tuple:
        range_tuple_misc = ids2range_tuple[int(tid)][9]
        if range_tuple_misc != "_":
          misc = range_tuple_misc
      match = ANCHOR.match(misc)
      if match:
        node_anchors = [{"from": int(match.group(1)), "to": int(match.group(2))}]
      else:
        node_anchors = [compute(form)]
    graph.add_node(node_id, label = form,
                   properties = list(properties.keys()),
                   values = list(properties.values()),
                   top = head == "0",
                   anchors = node_anchors)
  return graph, ids

def construct_graph_edges(tuples, graph, ids):
  """ Add the basic dependency edges of a sentence block to `graph`.

  For every line, the HEAD (column 6) and DEPREL (column 7) columns give
  one edge from the head's node to the line's own node; `ids` translates
  ID-column strings into node ids. Lines whose head is not a key of `ids`
  (such as the root's head "0") produce no edge.
  Modifies `graph` argument. """
  for fields in tuples:
    dependent, governor, relation = fields[0], fields[6], fields[7]
    if governor not in ids:
      continue
    graph.add_edge(ids[governor], ids[dependent], relation)

def construct_enhanced_graph_edges(tuples, graph, ids):
  """ Add the Enhanced UD edges of a sentence block to `graph`.

  Works like reading basic UD edges, except that the edges come from the
  DEPS column (column 8) instead of HEAD and DEPREL. DEPS is either "_"
  (no relations) or a bar-separated list of "head:type" items; only the
  first colon separates head from type, so the type may itself contain
  colons. `ids` translates ID-column strings into node ids, and items
  whose head is not a key of `ids` produce no edge.
  See https://universaldependencies.org/format.html#syntactic-annotation
  for the EUD format specification followed here.
  Modifies `graph` argument. """
  for fields in tuples:
    dependent, deps = fields[0], fields[8]
    if deps != "_": # "_" stands for an empty list of relations
      for relation in deps.split("|"):
        governor, label = relation.split(":", 1)
        if governor in ids:
          graph.add_edge(ids[governor], ids[dependent], label)


def construct_graph(id, input, tuples, framework = None, text = None, anchors = None, enhanced_graph=False):
  """ Build the graph of one sentence block: nodes first, then edges.

  Nodes come from :construct_graph_nodes. Edges are read from the basic UD
  columns by default, or from the DEPS column when `enhanced_graph` is
  true. Returns the finished graph. """
  graph, ids = construct_graph_nodes(id, input, tuples, framework, text, anchors)
  # pick the edge reader: Enhanced UD on request, basic UD otherwise
  add_edges = construct_enhanced_graph_edges if enhanced_graph else construct_graph_edges
  add_edges(tuples, graph, ids)
  return graph

def read(stream, framework = None, text = None, anchors = None, trace = 0, enhanced_graph=False):
  """ Read a CoNLL-U stream and yield `(graph, None)` for every sentence.

  Each block produced by :read_tuples is turned into a graph with
  :construct_graph, forwarding `framework`, `text`, `anchors` and
  `enhanced_graph`. With a truthy `trace`, a progress line naming the
  sentence id is written to standard error before each graph is built. """
  for id, input, tuples in read_tuples(stream):
    if trace:
      message = "conllu.read(): processing graph #{} ...".format(id)
      print(message, file = sys.stderr)
    yield construct_graph(id, input, tuples, framework, text, anchors, enhanced_graph), None