Spaces:
Runtime error
Runtime error
File size: 8,197 Bytes
8044721 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
import re;
import sys;
from graph import Graph;
# Comment line carrying the raw sentence text; group 1 is the text.
TEXT = re.compile(r"^# text = (.+)$")

# Comment line carrying the sentence identifier; group 1 is the identifier.
ID = re.compile(r"^# sent_id = (.+)$")

# ID column written as an "i-j" range; groups 1 and 2 are the two bounds.
RANGE = re.compile(r"^([0-9]+)-([0-9]+)$")

# "TokenRange=from:to" anywhere in a string; groups 1 and 2 are the offsets.
ANCHOR = re.compile(r".*TokenRange=([0-9]+):([0-9]+)")
def read_tuples(stream):
    """Split a CoNLL-U stream into sentence blocks.

    Lines starting with "#" are comments: one matching ``TEXT`` supplies the
    sentence text, one matching ``ID`` supplies the sentence identifier, any
    other comment is ignored.  Every other non-empty line is split on tabs
    and collected as one row.  A blank line terminates the current block.

    Args:
        stream: iterable of lines (for example an open text file).

    Yields:
        ``(id, input, tuples)`` for every block that contains at least one
        row.  ``id`` is the ``sent_id`` value or ``None``; ``input`` is the
        ``text`` value or, when that comment is absent, the string rebuilt by
        ``reconstruct_input_from_tuples``; ``tuples`` is the list of rows.
    """
    from itertools import chain

    sentence_id, text, rows = None, None, []
    # A virtual blank line is appended so that a final block which is not
    # followed by a blank line in the stream is still emitted, not dropped.
    for line in chain(stream, ("",)):
        line = line.rstrip()
        if line.startswith("#"):
            match = TEXT.match(line)
            if match:
                text = match.group(1)
                continue
            match = ID.match(line)
            if match:
                sentence_id = match.group(1)
            continue
        if line:
            rows.append(line.split("\t"))
            continue
        # Blank line: close the current block.
        if rows:
            if text is None:
                # No `text` comment in this block: rebuild the sentence from
                # the FORM column, since graph construction needs it.
                text = reconstruct_input_from_tuples(rows)
            yield sentence_id, text, rows
        # Always start the next block from a clean state so that nothing
        # read before this blank line leaks into the following sentence.
        sentence_id, text, rows = None, None, []
def reconstruct_input_from_tuples(tuples):
    """Rebuild the sentence string from the rows of one CoNLL-U block.

    Only surface tokens (as decided by ``get_is_surface_token_indicator``)
    contribute: their FORM column (index 1) is concatenated, with a single
    space after each token unless its last column contains "SpaceAfter=No".
    No space is ever added after the last surface token.

    Args:
        tuples: list of rows, each row being the list of columns of a line.

    Returns:
        The reconstructed sentence, or '' when ``tuples`` is empty.
    """
    if not tuples:
        return ''
    # Keep only surface tokens - discard empty nodes and words that are
    # covered by a multi-word range.
    surface_flags = get_is_surface_token_indicator(tuples)
    surface_rows = [row for keep, row in zip(surface_flags, tuples) if keep]
    # The last *surface* row decides where the sentence ends; the last row of
    # the block may itself be a non-surface row (e.g. the final word of a
    # multi-word token), which must not cause a trailing space.
    final = len(surface_rows) - 1
    pieces = []
    for position, row in enumerate(surface_rows):
        pieces.append(row[1])  # FORM column
        if position != final and "SpaceAfter=No" not in row[-1]:  # last column
            pieces.append(' ')
    return ''.join(pieces)
def get_ids2range_tuple(tuples):
    """
    Map every node id covered by a range row to that range row.

    A row whose first column matches ``RANGE`` ("i-j") covers the integer
    ids i..j inclusive.  The result is a dict {k: row} for each covered k;
    when several range rows cover the same id, the later row wins.
    """
    covering = {}
    for row in tuples:
        span = RANGE.match(row[0])
        if span is None:
            continue
        first, last = int(span.group(1)), int(span.group(2))
        covering.update(dict.fromkeys(range(first, last + 1), row))
    return covering
def get_is_surface_token_indicator(tuples):
    """
    Return one boolean per row of `tuples`, in the same order, telling
    whether that row is a surface token, i.e. a token needed to detokenize
    the input sentence.
    see https://universaldependencies.org/format.html#words-tokens-and-empty-nodes

    A row is a surface token when both hold:
      1. its id is not an empty-node id (no "." in it);
      2. its id is either a range id (contains "-") or an integer that is
         not covered by any range id in the block.
    """
    covered = get_ids2range_tuple(tuples)
    token_ids = [row[0] for row in tuples]
    flags = []
    for token_id in token_ids:
        if "." in token_id:
            # condition 1 fails: empty node
            flags.append(False)
        elif "-" in token_id:
            # a range row is itself the surface token
            flags.append(True)
        else:
            # condition 2: plain word, surface only if outside every range
            flags.append(int(token_id) not in covered)
    return flags
def read_anchors(stream):
    """Iterate over blocks of character anchors read from ``stream``.

    Within the stream, a blank line ends a block, a line starting with "#"
    sets the block identifier (the rest of the line), and a line with exactly
    three tab-separated fields contributes the pair
    ``(int(field0), int(field1))``.  Any other line is ignored.

    Args:
        stream: iterable of lines, or ``None``.

    Yields:
        ``(id, tokens)`` per block, where ``tokens`` is a list of
        ``(from, to)`` integer pairs.  A trailing block without a closing
        blank line is yielded only if it has at least one pair.  When
        ``stream`` is ``None`` the generator yields ``(None, None)`` forever.
    """
    if stream is None:
        while True:
            yield None, None

    block_id, spans = None, []
    for line in stream:
        line = line.rstrip("\n")
        if not line:
            yield block_id, spans
            # Rebind to a fresh list instead of clearing in place: the list
            # just yielded belongs to the consumer and must not be emptied
            # or refilled by later blocks.
            block_id, spans = None, []
        elif line.startswith("#"):
            block_id = line[1:]
        else:
            fields = line.split("\t")
            if len(fields) == 3:
                spans.append((int(fields[0]), int(fields[1])))
    if spans:
        yield block_id, spans
def construct_graph_nodes(id, input, tuples, framework, text, anchors):
    """Create a graph holding one node per row of a CoNLL-U block.

    Args:
        id: graph identifier handed to ``Graph``.
        input: sentence string; when not ``None`` it is given to
            ``graph.add_input``.
        tuples: rows of the block; columns 0-6 and 9 are read here.
        framework: passed through to ``Graph``.
        text: given to ``graph.add_input`` when ``input`` is ``None``
            (NOTE(review): its exact type is decided by ``Graph`` - confirm).
        anchors: stream handed to ``read_anchors``, or ``None``.

    Returns:
        ``(graph, ids)`` where ``ids`` maps each row's ID column (a string)
        to the 1-based node id assigned in row order.

    Raises:
        Exception: when a surface token has no explicit anchor and its form
            cannot be located in the input string.
    """
    # Typographic characters and the ASCII renderings under which they may
    # appear in the input string.  Written as escapes so the table survives
    # re-encoding of this file: U+2018/U+2019 single quotes, U+201C/U+201D
    # double quotes, U+2013 en dash, U+2014 em dash, U+2026 ellipsis.
    # An ordered tuple (not a set) keeps the search deterministic.
    substitutions = (
        ("\u2018", "`"), ("\u2018", "'"), ("\u2019", "'"), ("`", "'"),
        ("\u201c", "\""), ("\u201d", "\""),
        ("\u2013", "--"), ("\u2013", "---"), ("\u2014", "---"),
        ("\u2026", "..."), ("\u2026", ". . ."),
    )
    cursor = 0  # position in `input` up to which forms have been anchored

    def compute(form):
        # Locate `form` in `input` at or after `cursor` and return its span.
        nonlocal cursor
        length = None
        start = input.find(form, cursor)
        if start >= cursor:
            cursor, length = start, len(form)
        else:
            # Literal form not found: try each substituted variant and keep
            # the earliest hit; among hits at the same position prefer the
            # longest so that e.g. "---" is not anchored as "--".
            best_start, best_length = len(input), 0
            for old, new in substitutions:
                variant = form.replace(old, new)
                start = input.find(variant, cursor)
                if start < cursor:
                    continue
                if start < best_start \
                   or (start == best_start and len(variant) > best_length):
                    best_start, best_length = start, len(variant)
            if best_start < len(input):
                cursor, length = best_start, best_length
        if length:
            match = {"from": cursor, "to": cursor + length}
            cursor += length
            return match
        # Report the form as given, not the last variant that was tried.
        raise Exception("[{}] failed to anchor |{}| in |{}|{}| ({})"
                        "".format(graph.id, form, input[:cursor], input[cursor:], cursor))

    graph = Graph(id, flavor = 0, framework = framework)
    if input is not None:
        graph.add_input(input)
    elif text is not None:
        graph.add_input(text)
    # From here on anchoring works against whatever string the graph holds.
    input = graph.input

    # Only the first block produced for the anchors stream is consumed here.
    anchors_generator = read_anchors(anchors)
    _, anchors_tokens = next(anchors_generator)

    node_id, ids = 0, dict()
    ids2range_tuple = get_ids2range_tuple(tuples)
    surface_flags = get_is_surface_token_indicator(tuples)
    for row, is_surface_token in zip(tuples, surface_flags):
        node_id += 1
        ids[row[0]] = node_id
        form, lemma, upos, xpos, features, head, misc = \
            row[1], row[2], row[3], row[4], row[5], row[6], row[9]
        properties = {"lemma": lemma, "upos": upos, "xpos": xpos}
        if features != "_":
            for feature in features.split("|"):
                name, value = feature.split("=", 1)
                properties[name] = value

        # retrieve anchoring - only for surface tokens
        if not is_surface_token:
            node_anchors = []
        elif anchors_tokens is not None:
            start, end = anchors_tokens.pop(0)
            node_anchors = [{"from": start, "to": end}]
        else:
            tid = row[0]
            # NOTE(review): with the surface-token test used above, a plain
            # numeric id reaching this branch is never covered by a range,
            # so this lookup appears never to fire - confirm the intent.
            if tid.isnumeric() and int(tid) in ids2range_tuple:
                range_tuple_misc = ids2range_tuple[int(tid)][9]
                if range_tuple_misc != "_":
                    misc = range_tuple_misc
            match = ANCHOR.match(misc)
            if match:
                node_anchors = [{"from": int(match.group(1)),
                                 "to": int(match.group(2))}]
            else:
                node_anchors = [compute(form)]

        graph.add_node(node_id, label = form,
                       properties = list(properties.keys()),
                       values = list(properties.values()),
                       top = head == "0",
                       anchors = node_anchors)
    return graph, ids
def construct_graph_edges(tuples, graph, ids):
    """ Add the basic dependency edges of a block to an existing graph.

    For each row, the HEAD (column 6) and DEPREL (column 7) columns describe
    one edge from the head's node to the row's own node; rows whose head is
    not a key of `ids` contribute nothing.
    Modifies `graph` argument. """
    for row in tuples:
        dependent, governor, label = row[0], row[6], row[7]
        if governor not in ids:
            continue
        graph.add_edge(ids[governor], ids[dependent], label)
def construct_enhanced_graph_edges(tuples, graph, ids):
    """ Add the Enhanced UD edges of a block to an existing graph.

    Differs from the basic-UD edge reader only in where edges come from:
    the DEPS column (index 8) instead of the HEAD and DEPREL columns.
    DEPS is either "_" (no relations) or a "|"-separated list of
    "head:relation" items; only the first ":" separates head from relation.
    Items whose head is not a key of `ids` contribute nothing.
    See https://universaldependencies.org/format.html#syntactic-annotation
    for the EUD format specification followed here.
    Modifies `graph` argument. """
    for row in tuples:
        dependent, deps = row[0], row[8]
        # "_" stands for an empty list of relations
        relations = [] if deps == "_" else deps.split("|")
        for governor, label in (item.split(":", 1) for item in relations):
            if governor not in ids:
                continue
            graph.add_edge(ids[governor], ids[dependent], label)
def construct_graph(id, input, tuples, framework = None, text = None, anchors = None, enhanced_graph=False):
    """ Build the graph for one block: nodes first, then edges.

    Edges are read from the HEAD/DEPREL columns by default (basic UD), or
    from the DEPS column when `enhanced_graph` is true (Enhanced UD).
    Returns the graph. """
    graph, ids = construct_graph_nodes(id, input, tuples, framework, text, anchors)
    if enhanced_graph:
        add_edges = construct_enhanced_graph_edges
    else:
        add_edges = construct_graph_edges
    add_edges(tuples, graph, ids)
    return graph
def read(stream, framework = None, text = None, anchors = None, trace = 0, enhanced_graph=False):
    """ Read a CoNLL-U stream and yield one `(graph, None)` pair per block.

    Each block produced by `read_tuples` is turned into a graph with
    `construct_graph`; when `trace` is truthy a progress line naming the
    block id is written to standard error first. """
    for sentence_id, sentence, rows in read_tuples(stream):
        if trace:
            message = "conllu.read(): processing graph #{} ...".format(sentence_id)
            print(message, file = sys.stderr)
        yield construct_graph(sentence_id, sentence, rows,
                              framework, text, anchors, enhanced_graph), None
|