# Spaces: ltg / ssa-perin -- Runtime error
# File size: 19,205 Bytes
#!/usr/bin/env python3

# -*- coding: utf-8; -*-

import argparse;
import json;
import multiprocessing as mp;
import re;
import sys;
import time;
from pathlib import Path;
from zipfile import ZipFile;

import codec.amr;
import codec.conllu;
import codec.eds;
import codec.mrp;
import codec.norec;
import codec.pmb;
import codec.sdp;
import codec.treex;
import codec.ucca;
import inspector;
import score.edm;
import score.mces;
import score.sdp;
import score.smatch;
import score.ucca;
import validate.core;
from analyzer import analyze;

# module author tag
__author__ = "oe"

# character encoding handed to every argparse.FileType() stream in main()
ENCODING = "utf-8"

# names accepted by ‘--normalize’ (‘all’ selects the complete set)
NORMALIZATIONS = {
  "anchors",
  "case",
  "edges",
  "attributes",
}

# names accepted by ‘--validate’ (‘all’ selects the complete set)
VALIDATIONS = {
  "input",
  "anchors",
  "edges",
  "amr",
  "eds",
  "sdp",
  "ucca",
}

def read_graphs(stream, format = None,
                full = False, normalize = False, reify = False, node_centric = False,
                frameworks = None, prefix = None, text = None, filter = None,
                trace = 0, strict = 0, quiet = False, robust = False,
                alignment = None, anchors = None, pretty = False,
                id = None, n = None, i = None):
  """Read graphs from .stream. with the codec selected by .format.

  When the stream's name ends in ‘.zip’, the archive is opened instead and
  its single ‘.mrp’ member becomes the input stream; zero or several such
  members terminate the process with exit status 1.

  Parameters:
    stream: input stream handed to the codec reader.
    format: codec name; one of ‘amr’, ‘camr’, ‘ccd’, ‘dm’, ‘pas’, ‘psd’,
      ‘eds’, ‘mrp’, ‘norec’, ‘pmb’, ‘treex’, ‘ucca’, ‘conllu’, ‘ud’, ‘eud’.
      Any other value terminates the process with exit status 1.
    full, reify, node_centric, prefix, text, trace, strict, quiet, robust,
    alignment, anchors: forwarded unchanged to those codec readers that
      accept them (see the individual calls below).
    normalize: when truthy, passed to graph.normalize() for every graph kept.
    pretty: when truthy, graph.prettify() is called for every graph kept.
    frameworks: if not None, graphs whose .framework is not contained in
      it are skipped.
    filter: if not None, graphs whose .id is not contained in it are skipped.
    id: if not None, only graphs with this exact .id are kept; takes
      precedence over .i.
    n: if a positive integer, stop after .n graphs have passed the
      .frameworks and .filter tests.
    i: if a non-negative integer (and .id is None), keep only the graph at
      this zero-based position among those passing the tests.

  Returns:
    A pair (graphs, overlays) of parallel lists, or (None, None) when the
    codec reader returned None.

  Errors raised while reading are printed to standard error and reading
  continues; only StopIteration ends the loop.
  """

  #
  # a stream opened from a file descriptor carries an integer .name, so
  # only test the suffix when the name actually is a string
  #
  name = getattr(stream, "name", "")
  if isinstance(name, str) and name.endswith(".zip"):
    # NOTE(review): the member stream is consumed after the archive's
    # ‘with’ block has ended, exactly as before -- confirm zipfile supports it
    with ZipFile(name) as archive:
      entries = [entry for entry in archive.namelist()
                 if entry.endswith(".mrp")]
      if len(entries) > 1:
        print("read_graphs(): multiple MRP entries in ‘{}’; exit."
              "".format(name), file = sys.stderr)
        sys.exit(1)
      if not entries:
        print("read_graphs(): missing MRP entry in ‘{}’; exit."
              "".format(name), file = sys.stderr)
        sys.exit(1)
      stream = archive.open(entries[0])

  generator = None
  if format in {"amr", "camr"}:
    generator \
      = codec.amr.read(stream, full = full, reify = reify,
                       text = text, camr = format == "camr",
                       alignment = alignment, quiet = quiet, trace = trace)
  elif format in {"ccd", "dm", "pas", "psd"}:
    generator = codec.sdp.read(stream, framework = format, text = text)
  elif format == "eds":
    generator = codec.eds.read(stream, reify = reify, text = text)
  elif format == "mrp":
    generator = codec.mrp.read(stream, text = text, robust = robust)
  elif format == "norec":
    generator = codec.norec.read(stream, text = text,
                                 node_centric = node_centric)
  elif format == "pmb":
    generator = codec.pmb.read(stream, full = full,
                               reify = reify, text = text,
                               trace = trace, strict = strict)
  elif format == "treex":
    generator = codec.treex.read(stream)
  elif format == "ucca":
    generator = codec.ucca.read(stream, text = text, prefix = prefix)
  elif format == "conllu" or format == "ud":
    generator = codec.conllu.read(stream, framework = format, text = text,
                                  anchors = anchors, trace = trace)
  elif format == "eud":
    generator = codec.conllu.read(stream, framework = format, text = text,
                                  anchors = anchors, trace = trace,
                                  enhanced_graph = True)
  else:
    print("read_graphs(): invalid input codec {}; exit."
          "".format(format), file = sys.stderr)
    sys.exit(1)

  if generator is None:
    return None, None

  #
  # (for now) break out of the generators, for downstream simplicity;
  # .j counts the graphs that passed the framework and identifier filters
  #
  graphs = []
  overlays = []
  j = 0
  while n is None or n < 1 or j < n:
    try:
      graph, overlay = next(generator)
      if frameworks is not None and graph.framework not in frameworks: continue
      if filter is not None and graph.id not in filter: continue
      if id is not None:
        if graph.id == id:
          graphs.append(graph)
          overlays.append(overlay)
      elif i is not None and i >= 0:
        if j == i:
          graphs.append(graph)
          overlays.append(overlay)
          break
      else:
        graphs.append(graph)
        overlays.append(overlay)
      j += 1
    except StopIteration:
      break
    except Exception as error:
      #
      # deliberately best-effort: report the problem and try the next graph
      #
      print(error, file = sys.stderr)

  if pretty:
    for graph in graphs: graph.prettify(trace)
  if normalize:
    for graph in graphs: graph.normalize(normalize, trace)

  return graphs, overlays

def main(args=None):
  """Command-line entry point of the MRP Graph Toolkit.

  Parses the options, reads the input graphs through read_graphs(), and
  then, depending on the options, validates, analyzes, inspects, scores,
  or writes them.

  Parameters:
    args: list of command-line argument strings, not including the program
      name; when None, argparse falls back to sys.argv[1:].

  Returns:
    None.  The process is terminated through sys.exit() with status 1 on
    invalid options or unreadable input, and with status 0 after
    ‘--inspect’ or ‘--score’ processing has completed.
  """
  parser = argparse.ArgumentParser(description = "MRP Graph Toolkit")
  parser.add_argument("--inspect", action = "store_true")
  parser.add_argument("--analyze", action = "store_true")
  parser.add_argument("--normalize", action = "append", default = [])
  parser.add_argument("--full", action = "store_true")
  parser.add_argument("--reify", action = "store_true")
  parser.add_argument("--node_centric", action = "store_true")
  parser.add_argument("--unique", action = "store_true")
  parser.add_argument("--ids", action = "store_true")
  parser.add_argument("--strings", action = "store_true")
  parser.add_argument("--framework", action = "append", default = [])
  parser.add_argument("--gold",
                      type = argparse.FileType("r", encoding = ENCODING))
  parser.add_argument("--alignment",
                      type = argparse.FileType("r", encoding = ENCODING))
  parser.add_argument("--overlay",
                      type = argparse.FileType("w", encoding = ENCODING))
  parser.add_argument("--format")
  parser.add_argument("--score")
  parser.add_argument("--validate", action = "append", default = [])
  parser.add_argument("--limit")
  parser.add_argument("--read", required = True)
  parser.add_argument("--write")
  parser.add_argument("--text")
  parser.add_argument("--inverse", action = "store_true")
  parser.add_argument("--anchors",
                      type = argparse.FileType("r", encoding = ENCODING))
  parser.add_argument("--prefix")
  parser.add_argument("--source")
  parser.add_argument("--targets")
  parser.add_argument("--pretty", action = "store_true")
  parser.add_argument("--inject")
  parser.add_argument("--version", type = float, default = 1.1)
  parser.add_argument("--cores", type = int, default = 1)
  parser.add_argument("--i", type = int)
  parser.add_argument("--n", type = int)
  parser.add_argument("--id")
  parser.add_argument("--filter")
  parser.add_argument("--quiet", action = "store_true")
  parser.add_argument("--robust", action = "store_true")
  parser.add_argument("--trace", "-t", action = "count", default = 0)
  parser.add_argument("--strict", action = "count", default = 0)
  parser.add_argument("--errors",
                      type = argparse.FileType("w", encoding = ENCODING))
  parser.add_argument("input", nargs = "?",
                      type = argparse.FileType("r", encoding = ENCODING),
                      default = sys.stdin)
  parser.add_argument("output", nargs = "?",
                      type = argparse.FileType("w", encoding = ENCODING),
                      default = sys.stdout)
  #
  # with .args left at None, argparse reads sys.argv[1:]; passing the full
  # sys.argv would treat the program name as the positional ‘input’ file
  #
  arguments = parser.parse_args(args)

  #
  # ‘--text’ is either a tab-separated file of identifier and string pairs
  # (loaded into a dictionary, optionally inverted) or a directory, which
  # is passed on as a path
  #
  text = None
  if arguments.text is not None:
    path = Path(arguments.text)
    if path.is_file():
      text = {}
      with path.open(encoding = ENCODING) as stream:
        for line in stream:
          id, string = line.split("\t", maxsplit = 1)
          if string.endswith("\n"): string = string[:-1]
          if arguments.inverse: text[string] = id
          else: text[id] = string
    elif path.is_dir():
      text = path
  elif arguments.inverse:
    print("main.py(): option ‘--inverse’ requires ‘--text’; exit.",
          file = sys.stderr)
    sys.exit(1)

  if arguments.read not in {"mrp",
                            "ccd", "dm", "pas", "psd", "treex",
                            "eds", "ucca",
                            "amr", "camr", "pmb",
                            "conllu", "ud", "eud",
                            "norec"}:
    print("main.py(): invalid input format: {}; exit."
          "".format(arguments.read), file = sys.stderr)
    sys.exit(1)

  #
  # ‘--filter’ names a file whose first tab-separated column lists the
  # graph identifiers to keep; an empty file disables filtering
  #
  filter = None
  if arguments.filter is not None:
    try:
      path = Path(arguments.filter)
      filter = set()
      with path.open(encoding = ENCODING) as stream:
        for line in stream:
          #
          # drop the line terminator first, so that lines holding nothing
          # but an identifier do not end up with a trailing newline
          #
          filter.add(line.rstrip("\n").split("\t", maxsplit = 1)[0])
    except Exception:
      print("main.py(): invalid ‘--filter’: {}; exit."
            "".format(arguments.filter), file = sys.stderr)
      sys.exit(1)
    if filter is not None and len(filter) == 0: filter = None

  if arguments.write is not None and \
     arguments.write not in \
     {"dot", "tikz", "displacy", "evaluation", "id", "json", "mrp",
      "source", "targets", "txt", "ucca", "norec"}:
    print("main.py(): invalid output format: {}; exit."
          "".format(arguments.write), file = sys.stderr)
    sys.exit(1)

  #
  # backwards compatibility: desirable until august 2019, say
  #
  if arguments.score == "mces": arguments.score = "mrp"
  if arguments.score is not None and \
     arguments.score not in {"mrp", "sdp", "edm", "ucca", "smatch"}:
    print("main.py(): invalid evaluation metric: {}; exit."
          "".format(arguments.score), file = sys.stderr)
    sys.exit(1)

  if arguments.format and \
     arguments.format not in {"mrp",
                              "ccd", "dm", "pas", "psd",
                              "eds", "ucca",
                              "amr", "camr", "pmb",
                              "conllu", "ud", "eud"}:
    print("main.py(): invalid gold format: {}; exit."
          "".format(arguments.format), file = sys.stderr)
    sys.exit(1)

  if len(arguments.normalize) == 1 and arguments.normalize[0] == "all":
    normalize = NORMALIZATIONS
  else:
    normalize = set()
    for action in arguments.normalize:
      if action in NORMALIZATIONS:
        normalize.add(action)
      else:
        print("main.py(): invalid type of normalization: {}; exit."
              "".format(action), file = sys.stderr)
        sys.exit(1)
  #
  # scoring without explicit normalization applies all normalizations
  #
  if arguments.score is not None and len(normalize) == 0:
    normalize = NORMALIZATIONS

  if arguments.targets == "gather" and not arguments.unique:
    print("main.py(): option ‘--targets gather’ requires ‘--unique’; exit.",
          file = sys.stderr)
    sys.exit(1)

  if arguments.alignment is not None and arguments.overlay is None:
    print("main.py(): option ‘--alignment’ requires ‘--overlay’; exit.",
          file = sys.stderr)
    sys.exit(1)

  if len(arguments.framework) == 0: arguments.framework = None

  # ‘--cores 0’ selects all available cores
  if arguments.cores == 0: arguments.cores = mp.cpu_count()

  graphs, overlays \
    = read_graphs(arguments.input, format = arguments.read,
                  full = arguments.full, normalize = normalize,
                  reify = arguments.reify, frameworks = arguments.framework,
                  text = text, filter = filter,
                  alignment = arguments.alignment,
                  anchors = arguments.anchors, pretty = arguments.pretty,
                  trace = arguments.trace, strict = arguments.strict,
                  node_centric = arguments.node_centric,
                  quiet = arguments.quiet, robust = arguments.robust,
                  id = arguments.id, n = arguments.n, i = arguments.i)
  if graphs is None:
    print("main.py(): unable to read input graphs: {}; exit."
          "".format(arguments.input.name), file = sys.stderr)
    sys.exit(1)

  #
  # ‘--unique’ keeps the first graph per identifier; with ‘--targets gather’
  # the frameworks seen for each identifier are recorded as its targets
  #
  if arguments.unique:
    targets = dict()
    if arguments.targets == "gather":
      for graph in graphs:
        if graph.id in targets: targets[graph.id].add(graph.framework)
        else: targets[graph.id] = {graph.framework}
      arguments.targets = None
    unique = list()
    ids = set()
    for graph in graphs:
      id = graph.id
      if id in targets: graph.targets(list(targets[id]))
      if id not in ids:
        ids.add(id)
        unique.append(graph)
    graphs = unique

  #
  # inject any additional information provided on the command line
  #
  if arguments.source:
    for graph in graphs: graph.source(arguments.source)
  if arguments.inject:
    for graph in graphs: graph.inject(arguments.inject)

  if arguments.validate == ["all"]:
    actions = VALIDATIONS
  else:
    actions = set()
    for action in arguments.validate:
      if action in VALIDATIONS:
        actions.add(action)
      else:
        print("main.py(): invalid type of validation: {}; exit."
              "".format(action), file = sys.stderr)
        sys.exit(1)

  if arguments.quiet: arguments.trace = 0

  if actions:
    for graph in graphs:
      validate.core.test(graph, actions, stream = sys.stderr)

  if arguments.analyze:
    analyze(graphs)

  #
  # gold graphs are only read when a ‘--gold’ stream was actually given and
  # is needed; without the parentheses ‘--inspect’ alone would try to read
  # from a missing stream
  #
  gold = None
  if arguments.gold and (arguments.score or arguments.inspect):
    if arguments.format is None: arguments.format = arguments.read
    gold, _ = read_graphs(arguments.gold, format = arguments.format,
                          full = arguments.full, normalize = normalize,
                          reify = arguments.reify,
                          node_centric = arguments.node_centric,
                          frameworks = arguments.framework,
                          text = text, filter = filter,
                          trace = arguments.trace, quiet = arguments.quiet,
                          robust = arguments.robust,
                          id = arguments.id, n = arguments.n, i = arguments.i)
    if gold is None:
      print("main.py(): unable to read gold graphs: {}; exit."
            "".format(arguments.gold.name), file = sys.stderr)
      sys.exit(1)

  if arguments.inspect:
    result = inspector.summarize(graphs, gold)
    # the ‘or True’ makes JSON the unconditional output format here
    if arguments.write == "json" or True:
      json.dump(result, arguments.output, indent = None)
      print(file = arguments.output)
    sys.exit(0)

  if arguments.score:
    limits = {"rrhc": None, "mces": None}
    for metric in arguments.score.split(","):
      #
      # ‘--limit’ is either ‘RRHC:MCES’ or a single number, which applies
      # to the RRHC limit for smatch and to the MCES limit otherwise
      #
      if arguments.limit is not None:
        try:
          match = re.search(r"([0-9]+):([0-9]+)", arguments.limit)
          if match:
            limits["rrhc"] = int(match.group(1))
            limits["mces"] = int(match.group(2))
          elif metric == "smatch":
            limits["rrhc"] = int(arguments.limit)
          else:
            limits["mces"] = int(arguments.limit)
        except ValueError:
          print("main.py(): invalid ‘--limit’ {}; exit."
                "".format(arguments.limit),
                file = sys.stderr)
          sys.exit(1)
      errors = dict() if arguments.errors else None
      result = None
      # wall-clock and process time at the start of the evaluation
      launch = time.time(), time.process_time()
      if metric == "edm":
        result = score.edm.evaluate(gold, graphs,
                                    format = arguments.write,
                                    trace = arguments.trace)
      elif metric == "mrp":
        result = score.mces.evaluate(gold, graphs,
                                     format = arguments.write,
                                     limits = limits,
                                     cores = arguments.cores,
                                     trace = arguments.trace,
                                     errors = errors,
                                     quiet = arguments.quiet)
      elif metric == "sdp":
        result = score.sdp.evaluate(gold, graphs,
                                    format = arguments.write,
                                    trace = arguments.trace)
      elif metric == "smatch":
        result = score.smatch.evaluate(gold, graphs,
                                       format = arguments.write,
                                       limit = limits["rrhc"],
                                       values = {"tops", "labels",
                                                 "properties", "anchors",
                                                 "edges", "attributes"},
                                       trace = arguments.trace)
      elif metric == "ucca":
        result = score.ucca.evaluate(gold, graphs,
                                     format = arguments.write,
                                     trace = arguments.trace)

      if result is not None:
        result["time"] = time.time() - launch[0]
        result["cpu"] = time.process_time() - launch[1]
        if arguments.write == "json" or True:
          #
          # _fix_me_
          # we should write a genuine custom JSON encoder
          #
          print("{", file = arguments.output, end = "")
          start = True
          for key in result:
            if start: start = False
            else: print(",\n ", file = arguments.output, end = "")
            print("\"{}\": ".format(key), file = arguments.output, end = "")
            json.dump(result[key], arguments.output, indent = None)
          print("}", file = arguments.output)

      if errors is not None:
        if arguments.write == "dot":
          for graph in gold:
            graph.dot(arguments.errors,
                      ids = arguments.ids, strings = arguments.strings,
                      errors = errors[graph.framework][graph.id])
        elif arguments.write == "json" or True:
          json.dump(errors, arguments.errors, indent = None)
    sys.exit(0)

  for graph in graphs:
    if arguments.write in {"mrp", "evaluation"}:
      if arguments.write == "evaluation":
        # evaluation files carry no graph content
        graph.flavor = graph.framework = graph.nodes = graph.edges = None
        if arguments.targets is not None:
          graph.targets(arguments.targets.split(","))
      json.dump(graph.encode(arguments.version), arguments.output,
                indent = None, ensure_ascii = False)
      print(file = arguments.output)
    elif arguments.write == "dot":
      graph.dot(arguments.output,
                ids = arguments.ids, strings = arguments.strings)
      print(file = arguments.output)
    elif arguments.write == "tikz":
      graph.tikz(arguments.output)
    elif arguments.write == "displacy":
      graph.displacy(arguments.output)
    elif arguments.write == "id":
      print("{}".format(graph.id), file = arguments.output)
    elif arguments.write == "source":
      print("{}\t{}".format(graph.id, graph.source()), file = arguments.output)
    elif arguments.write == "targets":
      #
      # NOTE(review): (“”) is just the empty string, so nothing is printed
      # for a graph without targets -- confirm that (“”,) was not intended
      #
      for target in graph.targets() or (""):
        print("{}\t{}".format(graph.id, target), file = arguments.output)
    elif arguments.write == "txt":
      print("{}\t{}".format(graph.id, graph.input), file = arguments.output)
    elif arguments.write == "ucca":
      # Prints everything to one long file. To split to separate XML files, use, e.g.,
      # csplit -zk output.xml '/^<root/' -f '' -b '%02d.xml' {99}
      codec.ucca.write(graph, graph.input, file = arguments.output)

  # NoReC output is one JSON array for all graphs, hence outside the loop
  if arguments.write == "norec":
    norec_graphs = [codec.norec.write(graph, graph.input,
                                      node_centric = arguments.node_centric)
                    for graph in graphs]
    json.dump(norec_graphs, arguments.output, indent=None)

  if arguments.overlay:
    for graph in overlays:
      if graph:
        json.dump(graph.encode(arguments.version), arguments.overlay,
                  indent = None, ensure_ascii = False)
        print(file = arguments.overlay)

#
# script entry point: only run the toolkit when executed directly
#
if __name__ == "__main__":
  main()