Spaces:
Runtime error
Runtime error
import re; | |
import sys; | |
from graph import Graph; | |
# Comment line that carries the raw sentence text ("# text = ...").
TEXT = re.compile(r"^# text = (.+)$")

# Comment line that carries the sentence identifier ("# sent_id = ...").
ID = re.compile(r"^# sent_id = (.+)$")

# ID column of a multi-word token, written as a "first-last" range of word ids.
RANGE = re.compile(r"^([0-9]+)-([0-9]+)$")

# "TokenRange=from:to" entry, matched anywhere in the text it is applied to
# (the MISC column, in this file).
ANCHOR = re.compile(r".*TokenRange=([0-9]+):([0-9]+)")
def read_tuples(stream):
    """Yield one ``(id, input, tuples)`` triple per sentence block of *stream*.

    *stream* is iterated line by line.  Lines starting with ``#`` are
    comments: ``# text = ...`` supplies the sentence text and
    ``# sent_id = ...`` the sentence id; other comments are ignored.  Any
    other non-empty line is split on tabs and collected as one tuple (row).
    An empty line ends the current block.

    When a block has no ``# text`` comment, the text is reconstructed from the
    rows via ``reconstruct_input_from_tuples``, because graph construction
    needs it.  Blocks without any rows are skipped.

    A block that is still open when the stream ends (no terminating blank
    line) is yielded as well, rather than being silently dropped.
    """
    def finish(sent_id, text, rows):
        # Fall back to the FORM column when the block carried no text comment.
        if text is None:
            text = reconstruct_input_from_tuples(rows)
        return sent_id, text, rows

    sent_id, text = None, None
    rows = []
    for line in stream:
        line = line.rstrip()
        if line.startswith("#"):
            match = TEXT.match(line)
            if match:
                text = match.group(1)
                continue
            match = ID.match(line)
            if match:
                sent_id = match.group(1)
                continue
        elif not line:
            if rows:
                yield finish(sent_id, text, rows)
            # Reset the per-block state even when the block was empty, so a
            # stray comment does not leak into the next sentence.
            sent_id, text = None, None
            rows = []
        else:
            rows.append(line.split("\t"))
    # Flush the last sentence when the stream lacks a trailing blank line.
    if rows:
        yield finish(sent_id, text, rows)
def reconstruct_input_from_tuples(tuples):
    """Rebuild the sentence text from the rows of one CoNLL-U block.

    Each element of *tuples* is one row of the block (a list of columns).
    Only surface tokens, as reported by ``get_is_surface_token_indicator``,
    contribute: their FORM column (index 1) is emitted, followed by a single
    space unless the row's last column contains ``SpaceAfter=No``.

    No space is ever appended after the last *surface* token.  The check is
    made against the surface tokens rather than against the last row of the
    block, so a sentence that ends in a multi-word token (whose component
    words follow the range row) does not get a trailing space.

    Returns the empty string for an empty block.
    """
    if not tuples:
        return ''
    # Keep surface tokens only: drop empty nodes and words covered by a range.
    surface_indicator = get_is_surface_token_indicator(tuples)
    surface_rows = [row
                    for is_surface, row in zip(surface_indicator, tuples)
                    if is_surface]
    last = len(surface_rows) - 1
    pieces = []
    for position, row in enumerate(surface_rows):
        pieces.append(row[1])  # FORM column
        # MISC is the last column; never pad after the final surface token.
        if position != last and "SpaceAfter=No" not in row[-1]:
            pieces.append(' ')
    return ''.join(pieces)
def get_ids2range_tuple(tuples):
    """Map every word id covered by a multi-word token to that token's row.

    A row whose first column has the range form ``i-j`` is a multi-word
    token.  For each integer ``k`` with ``i <= k <= j`` the returned dict
    holds ``k -> row``.  Rows without a range id contribute nothing; when two
    ranges cover the same id, the later row wins.
    """
    covered = {}
    for row in tuples:
        span = RANGE.match(row[0])
        if span is None:
            continue
        first, last = int(span.group(1)), int(span.group(2))
        covered.update(dict.fromkeys(range(first, last + 1), row))
    return covered
def get_is_surface_token_indicator(tuples):
    """Flag, row by row, which rows of a block are surface tokens.

    Returns a list of booleans of the same length as *tuples*.  Surface
    tokens are the rows needed to detokenize the sentence; see
    https://universaldependencies.org/format.html#words-tokens-and-empty-nodes

    A row is a surface token when both of these hold:
      1. its id is not an empty-node id (no "." in it);
      2. it is either a range row itself ("-" in its id) or a word whose
         integer id is not covered by any range row.
    """
    covered = get_ids2range_tuple(tuples)
    token_ids = [row[0] for row in tuples]
    flags = []
    for tid in token_ids:
        if "." in tid:
            # Empty node: never part of the surface string.
            flags.append(False)
        elif "-" in tid:
            # The multi-word token itself carries the surface form.
            flags.append(True)
        else:
            # Plain word: surface only when no range swallows it.
            flags.append(int(tid) not in covered)
    return flags
def read_anchors(stream):
    """Yield ``(id, tokens)`` pairs, one per block of an anchor stream.

    When *stream* is ``None`` the generator yields ``(None, None)`` forever,
    so callers can draw from it unconditionally.

    Otherwise *stream* is iterated line by line.  A line starting with ``#``
    sets the block id to the rest of that line.  A line with exactly three
    tab-separated fields contributes ``(int(field0), int(field1))`` to the
    block's token list; lines with any other field count are ignored.  An
    empty line ends the block and yields it, even if it holds no tokens.  A
    non-empty block still open at the end of the stream is yielded too.

    Every yielded list is a fresh object.  The generator never touches it
    again, so results may be collected or mutated by the caller without being
    emptied or overwritten when the generator resumes.
    """
    if stream is None:
        while True:
            yield None, None
    else:
        sent_id = None
        tokens = []
        for line in stream:
            line = line.rstrip("\n")
            if not line:
                yield sent_id, tokens
                sent_id = None
                # Rebind rather than clear(): the list just yielded now
                # belongs to the caller.
                tokens = []
            elif line.startswith("#"):
                sent_id = line[1:]
            else:
                fields = line.split("\t")
                if len(fields) == 3:
                    tokens.append((int(fields[0]), int(fields[1])))
        if tokens:
            yield sent_id, tokens
def construct_graph_nodes(id, input, tuples, framework, text, anchors):
    """Create a ``Graph`` holding one node per row of a CoNLL-U block.

    Parameters:
      id         graph identifier handed to ``Graph``.
      input      sentence text; when ``None``, *text* is used instead.
      tuples     rows of the block, each a list of at least ten columns.
      framework  passed through to ``Graph``.
      text       fallback sentence text.
      anchors    stream for ``read_anchors`` (or ``None``); one block is
                 drawn from it per call.

    Returns ``(graph, ids)`` where ``ids`` maps each row's id column (a
    string) to the 1-based node id it received in the graph.

    Node anchors are chosen per row:
      * rows that are not surface tokens get no anchors;
      * if the anchor stream supplied a token list, the next entry is used;
      * otherwise a ``TokenRange=from:to`` entry in MISC is used when
        present, and failing that the form is located in the sentence text,
        searching left to right from the end of the previous match.

    Raises ``Exception`` when a form cannot be located in the text.
    """
    # Typographic punctuation paired with the ASCII spellings it may have in
    # the raw text: single and double curly quotes, en/em dashes, ellipsis.
    # Written as escapes so the table survives re-encoding of this file, and
    # kept as an ordered tuple so that ties between candidates are broken the
    # same way on every run (a set would iterate in hash order).
    substitutions = (
        ("\u2018", "`"), ("\u2018", "'"), ("\u2019", "'"), ("`", "'"),
        ("\u201c", "\""), ("\u201d", "\""),
        ("\u2013", "--"), ("\u2013", "---"), ("\u2014", "---"),
        ("\u2026", "..."), ("\u2026", ". . ."),
    )
    cursor = 0  # position in the text up to which forms have been anchored

    def compute(form):
        """Anchor *form* at or after the cursor and advance the cursor."""
        nonlocal cursor
        length = None
        start = input.find(form, cursor)
        if start >= cursor:
            cursor, length = start, len(form)
        else:
            # Literal form not found: try each respelling and keep the one
            # that matches earliest (first listed wins on equal positions).
            best_start, best_length = len(input), 0
            for old, new in substitutions:
                candidate = form.replace(old, new)
                start = input.find(candidate, cursor)
                if cursor <= start < best_start:
                    best_start, best_length = start, len(candidate)
            if best_start < len(input):
                cursor, length = best_start, best_length
        if length:
            match = {"from": cursor, "to": cursor + length}
            cursor += length
            return match
        raise Exception("[{}] failed to anchor |{}| in |{}|{}| ({})"
                        "".format(graph.id, form,
                                  input[:cursor], input[cursor:], cursor))

    graph = Graph(id, flavor=0, framework=framework)
    if input is not None:
        graph.add_input(input)
    elif text is not None:
        graph.add_input(text)
    # Work on the text as stored by the graph from here on.
    input = graph.input

    anchors_generator = read_anchors(anchors)
    _, anchors_tokens = next(anchors_generator)

    node_id, ids = 0, dict()
    ids2range_tuple = get_ids2range_tuple(tuples)
    surface_flags = get_is_surface_token_indicator(tuples)
    for row, is_surface_token in zip(tuples, surface_flags):
        node_id += 1
        ids[row[0]] = node_id
        form, lemma, upos, xpos, features, head, misc = \
            row[1], row[2], row[3], row[4], row[5], row[6], row[9]
        properties = {"lemma": lemma, "upos": upos, "xpos": xpos}
        if features != "_":
            for feature in features.split("|"):
                name, value = feature.split("=", 1)
                properties[name] = value

        # Anchoring applies to surface tokens only.
        if not is_surface_token:
            node_anchors = []
        elif anchors_tokens is not None:
            start, end = anchors_tokens.pop(0)
            node_anchors = [{"from": start, "to": end}]
        else:
            tid = row[0]
            # NOTE(review): a surface token is by construction never covered
            # by a range row, so this lookup cannot fire as written; kept
            # unchanged until the intended behaviour is confirmed.
            if tid.isnumeric() and int(tid) in ids2range_tuple:
                range_tuple_misc = ids2range_tuple[int(tid)][9]
                if range_tuple_misc != "_":
                    misc = range_tuple_misc
            match = ANCHOR.match(misc)
            if match:
                node_anchors = [{"from": int(match.group(1)),
                                 "to": int(match.group(2))}]
            else:
                node_anchors = [compute(form)]

        graph.add_node(node_id, label=form,
                       properties=list(properties.keys()),
                       values=list(properties.values()),
                       top=head == "0",
                       anchors=node_anchors)
    return graph, ids
def construct_graph_edges(tuples, graph, ids):
    """Add the basic dependency edges of a block to *graph* (in place).

    For every row, the HEAD column (index 6) names the governor and the
    DEPREL column (index 7) the edge label.  An edge from the governor's node
    to the row's own node is added whenever the governor id is a key of
    *ids*; rows whose head is not in *ids* produce no edge.
    """
    for row in tuples:
        dependent, governor, relation = row[0], row[6], row[7]
        if governor not in ids:
            continue
        graph.add_edge(ids[governor], ids[dependent], relation)
def construct_enhanced_graph_edges(tuples, graph, ids):
    """Add the Enhanced UD edges of a block to *graph* (in place).

    Differs from the basic reader only in where edges come from: the DEPS
    column (index 8) instead of HEAD and DEPREL.  DEPS is either "_" (no
    relations) or a "|"-separated list of "head:label" items; the label is
    everything after the first colon.  An edge from the head's node to the
    row's own node is added for every item whose head is a key of *ids*.

    Format reference:
    https://universaldependencies.org/format.html#syntactic-annotation
    """
    for row in tuples:
        dependent, deps = row[0], row[8]
        if deps != "_":
            for relation in deps.split("|"):
                governor, label = relation.split(":", 1)
                if governor in ids:
                    graph.add_edge(ids[governor], ids[dependent], label)
def construct_graph(id, input, tuples, framework=None, text=None, anchors=None, enhanced_graph=False):
    """Build the complete graph (nodes, then edges) for one CoNLL-U block.

    Nodes come from ``construct_graph_nodes``.  Edges are read from the
    HEAD/DEPREL columns by default, or from the DEPS column when
    *enhanced_graph* is true.  Returns the graph.
    """
    graph, ids = construct_graph_nodes(id, input, tuples, framework, text, anchors)
    # Pick the edge reader: Enhanced UD when requested, basic UD otherwise.
    add_edges = construct_enhanced_graph_edges if enhanced_graph else construct_graph_edges
    add_edges(tuples, graph, ids)
    return graph
def read(stream, framework=None, text=None, anchors=None, trace=0, enhanced_graph=False):
    """Yield ``(graph, None)`` for every sentence block found in *stream*.

    Blocks are split by ``read_tuples`` and turned into graphs by
    ``construct_graph``; *framework*, *text*, *anchors* and *enhanced_graph*
    are passed through to it.  When *trace* is truthy, a progress line naming
    the sentence id is written to standard error before each graph is built.
    """
    for sent_id, sentence, rows in read_tuples(stream):
        if trace:
            message = "conllu.read(): processing graph #{} ...".format(sent_id)
            print(message, file=sys.stderr)
        yield construct_graph(sent_id, sentence, rows, framework, text, anchors, enhanced_graph), None