ssa-perin / mtool /codec /conllu.py
larkkin's picture
Add application code and models, update README
8044721
raw
history blame
8.2 kB
import re;
import sys;
from graph import Graph;
# Sentence comment carrying the raw text, e.g. "# text = Hello world."
TEXT = re.compile(r"^# text = (.+)$")
# Sentence comment carrying the sentence identifier, e.g. "# sent_id = 42"
ID = re.compile(r"^# sent_id = (.+)$")
# ID column of a range line such as "3-4"; the groups are the first and last ids
RANGE = re.compile(r"^([0-9]+)-([0-9]+)$")
# Character span embedded in a column value, e.g. "TokenRange=0:5"
ANCHOR = re.compile(r".*TokenRange=([0-9]+):([0-9]+)")
def read_tuples(stream):
    """Split a CoNLL-U stream into sentence blocks.

    Yields one ``(id, input, tuples)`` triple per block, where ``id`` is the
    value of the ``# sent_id = ...`` comment (or None), ``input`` is the value
    of the ``# text = ...`` comment, and ``tuples`` is the list of tab-split
    token lines of the block.  When a block has no ``text`` comment the input
    is rebuilt from the token lines via ``reconstruct_input_from_tuples``.

    Blocks are terminated by an empty line; a final block that is not
    followed by an empty line is still yielded when the stream ends.
    """
    sent_id, text, rows = None, None, []

    for line in stream:
        line = line.rstrip()
        if line.startswith("#"):
            # Only the `text` and `sent_id` comments are interpreted; any
            # other comment line is ignored.
            text_match = TEXT.match(line)
            if text_match:
                text = text_match.group(1)
            else:
                id_match = ID.match(line)
                if id_match:
                    sent_id = id_match.group(1)
        elif line:
            rows.append(line.split("\t"))
        else:
            if rows:
                # The input string is required downstream, so fall back to
                # the FORM column when there was no `text` comment.
                if text is None:
                    text = reconstruct_input_from_tuples(rows)
                yield sent_id, text, rows
            # A blank line always closes the current block, even an empty one.
            sent_id, text, rows = None, None, []

    # Do not lose the last sentence when the stream lacks a trailing blank line.
    if rows:
        if text is None:
            text = reconstruct_input_from_tuples(rows)
        yield sent_id, text, rows
def reconstruct_input_from_tuples(tuples):
    """Reconstruct the input sentence from its CoNLL-U representation.

    Each element of `tuples` corresponds to one token line of a block.  Only
    surface tokens (as determined by ``get_is_surface_token_indicator``) take
    part: their FORM column (index 1) is concatenated, with a single space
    after every token except the last surface token and except tokens whose
    last column contains ``SpaceAfter=No``.

    Returns the empty string when `tuples` is empty.
    """
    if not tuples:
        return ''
    # iterate only surface tokens - discard empty nodes and tokens included in ranges
    surface_flags = get_is_surface_token_indicator(tuples)
    surface_rows = [row for is_surface, row in zip(surface_flags, tuples) if is_surface]

    pieces = []
    last_position = len(surface_rows) - 1
    for position, row in enumerate(surface_rows):
        pieces.append(row[1])  # FORM column
        # The last *surface* token never gets a trailing space, even when it
        # is followed by non-surface lines (e.g. the words of a multi-word
        # token); the Misc. column is the last column of the line.
        if position != last_position and "SpaceAfter=No" not in row[-1]:
            pieces.append(' ')
    return ''.join(pieces)
def get_ids2range_tuple(tuples):
    """
    Map every node id covered by a range line to that range line.

    Return Dict[int: tuple].  For each line whose ID column has the form
    "i-j", every integer k with i <= k <= j becomes a key whose value is that
    line.  When several ranges cover the same k, the later line wins.
    """
    covered = {}
    for row in tuples:
        span = RANGE.match(row[0])
        if span is None:
            continue
        first, last = int(span.group(1)), int(span.group(2))
        covered.update(dict.fromkeys(range(first, last + 1), row))
    return covered
def get_is_surface_token_indicator(tuples):
    """
    Return a list of booleans, one per element of `tuples`, telling whether
    that line is a surface token, i.e. one needed to detokenize the input
    sentence.
    see https://universaldependencies.org/format.html#words-tokens-and-empty-nodes

    A line is a surface token when
     1. its id is not an empty-node id (contains no "."), and
     2. its id is either a range id (contains "-") or an integer that is not
        covered by any range id of the block.
    """
    covered_ids = get_ids2range_tuple(tuples)
    flags = []
    for row in tuples:
        tid = row[0]
        if "." in tid:
            # condition 1: empty nodes never surface
            flags.append(False)
        elif "-" in tid:
            # the range line itself stands for the surface token
            flags.append(True)
        else:
            # condition 2: plain words surface only outside of every range
            flags.append(int(tid) not in covered_ids)
    return flags
def read_anchors(stream):
    """Yield ``(id, tokens)`` pairs read from an anchoring stream.

    When `stream` is None the generator yields ``(None, None)`` forever, so
    callers can always pull one item per sentence.

    Otherwise the stream is read line by line:
      * a line starting with "#" sets the current id to the rest of the line;
      * a line with exactly three tab-separated fields contributes the pair
        ``(int(field0), int(field1))`` to the current token list (lines with
        any other number of fields are ignored);
      * an empty line yields the current ``(id, tokens)`` and starts a new
        block.
    A trailing block without a closing empty line is yielded if it has tokens.

    Every yielded token list is a distinct object that the generator never
    touches again, so callers may keep or mutate it.
    """
    if stream is None:
        while True:
            yield None, None
    else:
        block_id = None
        tokens = []
        for line in stream:
            # Strip "\r" too so that CRLF blank lines are seen as separators.
            line = line.rstrip("\r\n")
            if not line:
                yield block_id, tokens
                block_id = None
                # Start a fresh list rather than clearing the one just
                # handed to the caller.
                tokens = []
            elif line.startswith("#"):
                block_id = line[1:]
            else:
                fields = line.split("\t")
                if len(fields) == 3:
                    tokens.append((int(fields[0]), int(fields[1])))
        if tokens:
            yield block_id, tokens
def construct_graph_nodes(id, input, tuples, framework, text, anchors):
    """Create a Graph with one node per token line of a CoNLL-U block.

    Parameters:
      id, framework -- passed to the ``Graph`` constructor.
      input         -- sentence string; when None, `text` is used instead
                       (if that is not None either, no input is added).
      tuples        -- tab-split token lines of the block; columns 0-6 and 9
                       are read.
      anchors       -- anchoring stream handed to ``read_anchors`` (or None).

    Node ids are assigned consecutively from 1 in line order.  Each node gets
    the FORM as label and lemma/upos/xpos plus the FEATS entries as
    properties, and is marked as top when its HEAD column is "0".

    Anchoring is computed for surface tokens only (other lines get an empty
    anchor list), from the first available source:
      1. the next pair from the anchoring stream, if one was supplied;
      2. a ``TokenRange=from:to`` entry in the token's own column 9;
      3. searching the FORM in the input string, left to right.

    Returns ``(graph, ids)`` where `ids` maps each ID column value to the
    node id assigned to it.

    Raises Exception when a FORM cannot be located in the input string.
    """
    # Typographic characters that may appear in a FORM while the input string
    # uses an ASCII rendering.  Written as escapes so the table does not
    # depend on how this file is encoded, and kept in a tuple so the search
    # order is deterministic.
    replacements = (
        ("\u2018", "`"), ("\u2018", "'"), ("\u2019", "'"), ("`", "'"),
        ("\u201c", "\""), ("\u201d", "\""),
        ("\u2013", "--"), ("\u2013", "---"), ("\u2014", "---"),
        ("\u2026", "..."), ("\u2026", ". . ."),
    )

    i = 0  # search cursor into `input`, advanced past every anchored form

    def compute(form):
        """Locate `form` in `input` at or after the cursor; return its span."""
        nonlocal i
        m = None
        j = input.find(form, i)
        if j >= i:
            i, m = j, len(form)
        else:
            # Fall back to the replacement variants and take the earliest
            # match; when variants match at the same offset (e.g. "--" and
            # "---"), prefer the longest so the anchor covers the whole token.
            k, l = len(input), 0
            for old, new in replacements:
                variant = form.replace(old, new)
                j = input.find(variant, i)
                if j >= i and (j < k or (j == k and len(variant) > l)):
                    k, l = j, len(variant)
            if k < len(input):
                i, m = k, l
        if m:
            match = {"from": i, "to": i + m}
            i += m
            return match
        raise Exception("[{}] failed to anchor |{}| in |{}|{}| ({})"
                        "".format(graph.id, form, input[:i], input[i:], i))

    graph = Graph(id, flavor = 0, framework = framework)
    if input is not None:
        graph.add_input(input)
    elif text is not None:
        graph.add_input(text)
    # From here on, anchor against whatever string the graph itself holds.
    input = graph.input

    anchors_generator = read_anchors(anchors)
    _, anchors_tokens = next(anchors_generator)

    ids = dict()
    surface_flags = get_is_surface_token_indicator(tuples)
    for node_id, (row, is_surface_token) in enumerate(zip(tuples, surface_flags), start = 1):
        ids[row[0]] = node_id
        form, lemma, upos, xpos, features, head, misc = \
            row[1], row[2], row[3], row[4], row[5], row[6], row[9]
        properties = {"lemma": lemma, "upos": upos, "xpos": xpos}
        if features != "_":
            for feature in features.split("|"):
                name, value = feature.split("=", 1)
                properties[name] = value

        # retrieve anchoring - only for surface tokens
        if not is_surface_token:
            node_anchors = []
        elif anchors_tokens is not None:
            start, end = anchors_tokens.pop(0)
            node_anchors = [{"from": start, "to": end}]
        else:
            # NOTE: a surface token is by definition not covered by any
            # multi-word range, so only its own column 9 can hold a span.
            match = ANCHOR.match(misc)
            if match:
                node_anchors = [{"from": int(match.group(1)), "to": int(match.group(2))}]
            else:
                node_anchors = [compute(form)]

        graph.add_node(node_id, label = form,
                       properties = list(properties.keys()),
                       values = list(properties.values()),
                       top = head == "0",
                       anchors = node_anchors)
    return graph, ids
def construct_graph_edges(tuples, graph, ids):
    """ Add the basic dependency edges of a block to `graph`.

    For every line, the HEAD column (index 6) names the source, the ID column
    (index 0) the target and the DEPREL column (index 7) the edge label; both
    ends are translated through `ids`.  Lines whose head is not a key of
    `ids` contribute no edge.
    Modifies `graph` argument. """
    for row in tuples:
        dependent, head, relation = row[0], row[6], row[7]
        if head not in ids:
            continue
        graph.add_edge(ids[head], ids[dependent], relation)
def construct_enhanced_graph_edges(tuples, graph, ids):
    """ Add the Enhanced UD edges of a block to `graph`.

    Works like reading basic UD edges, except that the edges come from the
    DEPS column (index 8) instead of the HEAD and DEPREL columns.  DEPS is
    either "_" (no relations) or a "|"-separated list of "head:type" items;
    everything after the first ":" is the edge label.  Items whose head is
    not a key of `ids` contribute no edge.
    See https://universaldependencies.org/format.html#syntactic-annotation for EUD format specifications
    which we follow here.
    Modifies `graph` argument. """
    for row in tuples:
        dependent, deps = row[0], row[8]
        if deps != "_":  # "_" denotes an empty list of relations
            # relations are delimited with bar
            pairs = (relation.split(":", 1) for relation in deps.split("|"))
            for head, dep_type in pairs:
                if head in ids:
                    graph.add_edge(ids[head], ids[dependent], dep_type)
def construct_graph(id, input, tuples, framework = None, text = None, anchors = None, enhanced_graph=False):
    """Build the graph for one CoNLL-U block: nodes first, then edges.

    Nodes come from ``construct_graph_nodes``.  Edges are read from the basic
    UD columns by default, or from the enhanced-dependencies column when
    `enhanced_graph` is true.  Returns the graph.
    """
    graph, ids = construct_graph_nodes(id, input, tuples, framework, text, anchors)
    # Pick the edge reader once: Enhanced UD graphs or basic UD (default).
    add_edges = construct_enhanced_graph_edges if enhanced_graph else construct_graph_edges
    add_edges(tuples, graph, ids)
    return graph
def read(stream, framework = None, text = None, anchors = None, trace = 0, enhanced_graph=False):
    """Read a CoNLL-U stream and yield ``(graph, None)`` for every block.

    Blocks are produced by ``read_tuples`` and turned into graphs by
    ``construct_graph``.  When `trace` is truthy, a progress line naming the
    block id is written to standard error before each graph is built.
    """
    for sent_id, sentence, rows in read_tuples(stream):
        if trace:
            message = "conllu.read(): processing graph #{} ...".format(sent_id)
            print(message, file = sys.stderr)
        yield construct_graph(sent_id, sentence, rows, framework, text, anchors, enhanced_graph), None