from operator import itemgetter;
import os.path;
import re;
import xml.etree.ElementTree as ET;
from graph import Graph;
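#
# recursively walk one PML tree: collect (identifier, ‘ord’, element) triples
# for all nodes and, when requested, (parent, child, functor) triples for all
# edges.
#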
def walk(id, node, parent, nodes, edges, ns):
  i = node.get("id");
  o = node.findtext(ns + "ord");
  if i is None or (o is None and parent is not None):
    raise Exception("treex.walk(): "
                    "missing ‘id’ or ‘ord’ values while decoding tree #{}; exit."
                    "".format(id));
  nodes.append((i, int(o) if o is not None else 0, node));
  if edges is not None:
    functor = node.findtext(ns + "functor");
    if parent is not None and functor is not None:
      edges.append((parent, i, functor));
  children = node.find(ns + "children");
  if children is not None:
    for child in children:
      if child.tag == ns + "LM":
        walk(id, child, i, nodes, edges, ns);
    if children.find(ns + "LM") is None:
      walk(id, children, i, nodes, edges, ns);

def read(fp, text = None):
  ns = "{http://ufal.mff.cuni.cz/pdt/pml/}";
  #
  # _fix_me_
  # factor out the anchor()ing code into a reusable form. (oe; 4-apr-20)
  #
  n = None;
  i = 0;
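  #
  # two small helpers over graph.input: skip() advances the current position
  # i past whitespace, and scan() matches any one of a set of candidate
  # strings at that position, returning its length.
  #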
  def skip():
    nonlocal i;
    while i < n and graph.input[i] in {" ", "\t"}:
      i += 1;
  def scan(candidates):
    for candidate in candidates:
      if graph.input.startswith(candidate, i):
        return len(candidate);
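  #
  # anchor one surface form at the current position in graph.input, tolerating
  # common normalizations of quotes, dashes, and ellipses, and return the
  # resulting character span as a {"from", "to"} dictionary.
  #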
  def anchor(form):
    nonlocal i;
    skip();
    m = None;
    if graph.input.startswith(form, i):
      m = len(form);
    else:
      for old, new in {("‘", "`"), ("’", "'")}:
        form = form.replace(old, new);
        if graph.input.startswith(form, i):
          m = len(form);
          break;
    if not m:
      m = scan({"“", "\"", "``"}) or scan({"‘", "`"}) \
          or scan({"”", "\"", "''"}) or scan({"’", "'"}) \
          or scan({"—", "–", "---", "--"}) \
          or scan({"…", "...", ". . ."});
    if m:
      anchor = {"from": i, "to": i + m};
      i += m;
      skip();
      return anchor;
    else:
      raise Exception("{}: failed to anchor |{}| in |{}| ({})"
                      "".format(graph.id, form, graph.input, i));

  tree = ET.parse(fp).getroot();
  bundles = tree.find(ns + "bundles");
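  #
  # process each bundle in turn, building one graph per (English) sentence
  #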
  for item in bundles.findall(ns + "LM"):
    id = item.get("id");
    graph = Graph(id, flavor = 0, framework = "ptg");
    surface = list(); nodes = list(); edges = list();
    for zone in item.iter(ns + "zone"):
      if zone.get("language") == "en":
        sentence = zone.findtext(ns + "sentence");
        trees = zone.find(ns + "trees");
        if trees is not None:
          atree = trees.find(ns + "a_tree");
          ttree = trees.find(ns + "t_tree");
          root = atree.find(ns + "children");
          top = ttree.find(ns + "children");
          # print(id, sentence, atree, ttree, root, top);
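          #
          # sanity-check the two trees, then flatten them: analytical nodes
          # provide the surface tokens, tectogrammatical nodes and their
          # functors the graph proper.
          #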
          if root is None or top is None:
            raise Exception("treex.read(): "
                            "missing ‘a_tree’ or ‘t_tree’ values while decoding tree #{}; exit."
                            "".format(id));
          walk(id, root, None, surface, None, ns);
          walk(id, top, None, nodes, edges, ns);
          #
          # determine character-based anchors for all .surface. (analytical) tokens
          #
          anchoring = dict();
          if sentence is not None:
            graph.add_input(sentence);
            n = len(graph.input);
            i = 0;
            for node in sorted(surface, key = itemgetter(1)):
              anchoring[node[0]] = anchor(node[2].findtext(ns + "form"));
          #
          # now process tectogrammatical nodes in surface order (as indicated in the
          # annotations): map to consecutive numerical identifiers; retrieve anchors
          # from corresponding analytical nodes; and create actual (new) graph nodes.
          #
          mapping = {};
          to = 0;
          for node in sorted(nodes, key = itemgetter(1)):
            mapping[node[0]] = i = len(mapping);
            properties = dict();
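            #
            # resolve anchoring via the references to analytical nodes in the
            # ‘a’ element: each child either holds a single identifier as its
            # text or wraps a list of ‘LM’ elements.
            #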
            a = node[2].find(ns + "a");
            if a is not None:
              anchors = list();
              for lex in a:
                if len(lex) == 0:
                  anchors.append(anchoring[lex.text]);
                else:
                  for lm in lex.findall(ns + "LM"):
                    anchors.append(anchoring[lm.text]);
              anchors = sorted(anchors, key = itemgetter("to"));
              to = anchors[-1]["to"];
            else:
              #
              # _fix_me_
              # discuss anchoring of generated nodes: currently, for uniformity, we
              # anchor them to an empty string immediately after the final character
              # of the preceding non-generated node. but this arguably introduces a
              # vacuous piece of information, unless one were to argue that it rather
              # is an encoding of the node status for generated nodes? (oe; 4-apr-20)
              #
              anchors = [{"from": to, "to": to}];
            #
            # the node label comes from the tectogrammatical lemma
            #
            lemma = node[2].findtext(ns + "t_lemma");
            frame = node[2].findtext(ns + "val_frame.rf");
            #
            # where present (mostly on verbs), extract the valency frame identifier
            # _fix_me_
            # for compatibility with earlier PSD releases, strip the prefix that seems
            # to identify the valency dictionary. (oe; 4-apr-20)
            #
            if frame is not None:
              if "#" in frame:
                properties["frame"] = frame[frame.index("#") + 1:];
              else:
                properties["frame"] = frame;
            #
            # selectively expose grammatemes as node-local properties, but ignore
            # (vanilla but very high-frequency) default values
            #
            grammatemes = node[2].find(ns + "gram");
            if grammatemes is not None:
              for property, default in [("tense", {"nil"}), ("negation", {"neg0"})]:
                match = grammatemes.findtext(ns + property);
                if match is not None and match not in default:
                  properties[property] = match;
            graph.add_node(id = i, label = lemma, anchors = anchors,
                           properties = properties.keys(),
                           values = properties.values(),
                           top = node[0] == top.get("id"));
          #
          # similarly, record all edges, now using mapped identifiers
          #
          for source, target, label in edges:
            graph.add_edge(mapping[source], mapping[target], label);
          #
          # in a second pass (so that all internal identifiers are mapped already),
          # create edges reflecting coreference annotations.
          #
          for node in nodes:
            coref = node[2].findtext(ns + "coref_gram.rf");
            if coref is not None:
              graph.add_edge(mapping[node[0]], mapping[coref], "coref_gram");
          yield graph, None;
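
#
# example use (with a hypothetical file name): read() is a generator, yielding
# one (graph, None) pair per sentence.
#
#   with open("sample.treex") as stream:
#     for graph, _ in read(stream):
#       ...
#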