File size: 8,197 Bytes
8044721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import re;
import sys;

from graph import Graph;

# "# text = ..." comment line; group 1 captures everything after "# text = ".
TEXT = re.compile(r"^# text = (.+)$")
# "# sent_id = ..." comment line; group 1 captures the sentence identifier.
ID = re.compile(r"^# sent_id = (.+)$")
# Range identifier "i-j" made of decimal digits; groups 1 and 2 capture i and j.
RANGE = re.compile(r"^([0-9]+)-([0-9]+)$")
# "TokenRange=a:b" anywhere in the string; groups 1 and 2 capture a and b.
ANCHOR = re.compile(r".*TokenRange=([0-9]+):([0-9]+)")

def read_tuples(stream):
  """Iterate over the sentence blocks of a CoNLL-U stream.

  Every line is right-stripped.  A comment line matching TEXT sets the
  sentence text and one matching ID sets the sentence id; any other line
  starting with "#" is ignored.  Every other non-empty line is split on
  tabs into a list of column values.  A block ends at an empty line or at
  the end of the stream.

  Yields:
    (id, input, tuples) for each block that contains at least one token
    line.  When the block had no text comment, `input` is rebuilt from the
    FORM column by reconstruct_input_from_tuples().
  """
  id, input = None, None
  tuples = []
  for line in stream:
    line = line.rstrip()
    if line.startswith("#"):
      match = TEXT.match(line)
      if match:
        input = match.group(1)
        continue
      match = ID.match(line)
      if match:
        id = match.group(1)
    elif line:
      tuples.append(line.split("\t"))
    elif tuples:
      # Only touch `input` when there is a block to emit: reconstructing on
      # a stray blank line would leave `input` as '' and suppress the
      # reconstruction for the following block.
      if input is None:
        input = reconstruct_input_from_tuples(tuples)
      yield id, input, tuples
      id, input = None, None
      tuples = []
  # A stream that does not end with a blank line still carries a final block.
  if tuples:
    if input is None:
      input = reconstruct_input_from_tuples(tuples)
    yield id, input, tuples

def reconstruct_input_from_tuples(tuples):
  """Reconstruct the input sentence from its CoNLL-U representation.

  Each element of `tuples` is the list of columns of one line in a block.
  Only surface tokens (as flagged by get_is_surface_token_indicator) take
  part: their FORM values (column 1) are concatenated, separated by a
  single space unless the token's last column (MISC) contains
  "SpaceAfter=No".  No space is ever appended after the last surface token.

  Returns the reconstructed string; '' when `tuples` is empty.
  """
  if not tuples:
    return ''
  # keep surface tokens only - drop empty nodes and words covered by a range
  surface_tuples = [columns
                    for is_surface, columns
                    in zip(get_is_surface_token_indicator(tuples), tuples)
                    if is_surface]
  pieces = []
  last = len(surface_tuples) - 1
  for position, columns in enumerate(surface_tuples):
    pieces.append(columns[1])  # FORM column
    # "Last" is judged among the surface tokens: the final line of a block
    # may be an empty node or a word inside a multi-word token.
    if position < last and "SpaceAfter=No" not in columns[-1]:  # MISC column
      pieces.append(' ')
  return ''.join(pieces)

def get_ids2range_tuple(tuples):
  """
  Map every word id covered by a multi-word token to that token's tuple.

  For each tuple whose first column is a range id "i-j" (as matched by
  RANGE), every integer k with i <= k <= j becomes a key whose value is
  that range tuple.  When ranges overlap, the tuple seen later wins.

  Returns Dict[int, tuple]; empty when no range id is present.
  """
  covered = {}
  for columns in tuples:
    span = RANGE.match(columns[0])
    if span is None:
      continue
    first, last = int(span.group(1)), int(span.group(2))
    covered.update(dict.fromkeys(range(first, last + 1), columns))
  return covered

def get_is_surface_token_indicator(tuples):
  """
  Flag which tuples correspond to surface tokens.

  Returns a list of booleans of the same length as `tuples`; entry i is
  True when tuples[i] is a surface token, i.e. a token needed to
  detokenize the input sentence.
  see https://universaldependencies.org/format.html#words-tokens-and-empty-nodes

  A tuple is a surface token when its id (column 0)
    1. is not an empty-node id (contains no "."), and
    2. is either a range id (contains "-") or an integer id that is not
       covered by any range id in `tuples`.
  """
  covered = get_ids2range_tuple(tuples)

  def is_surface(tid):
    if "." in tid:  # condition 1: empty node
      return False
    if "-" in tid:  # a multi-word token is itself a surface token
      return True
    return int(tid) not in covered  # condition 2

  return [is_surface(columns[0]) for columns in tuples]

def read_anchors(stream):
  """Iterate over the blocks of a token-anchoring stream.

  When `stream` is None the generator yields (None, None) forever, so a
  caller can always take one item per sentence.

  Otherwise lines are read with the trailing newline removed:
    - a line starting with "#" sets the block id to the text after the "#";
    - a line with exactly three tab-separated fields contributes the pair
      (int(field 0), int(field 1)); lines with another field count are
      ignored;
    - an empty line ends the block and yields (id, tokens).
  A final block that is not followed by an empty line is yielded when it
  holds at least one pair.

  Every yielded `tokens` list is a fresh object, so a consumer may keep or
  mutate it without being affected when the generator advances.
  """
  if stream is None:
    while True:
      yield None, None
  else:
    id = None
    tokens = []
    for line in stream:
      line = line.rstrip("\n")
      if not line:
        yield id, tokens
        id = None
        # Rebind instead of clear(): the previous list now belongs to the
        # consumer and must not be emptied behind its back.
        tokens = []
      elif line.startswith("#"):
        id = line[1:]
      else:
        fields = line.split("\t")
        if len(fields) == 3:
          tokens.append((int(fields[0]), int(fields[1])))
    if tokens:
      yield id, tokens

def construct_graph_nodes(id, input, tuples, framework, text, anchors):
  """Create a Graph with one node per CoNLL-U tuple and no edges.

  Parameters:
    id: graph identifier handed to Graph().
    input: sentence string; when not None it is passed to graph.add_input().
    tuples: column lists of one sentence; columns 0-6 and 9 are read
      (id, form, lemma, upos, xpos, features, head, misc).
    framework: handed to Graph().
    text: passed to graph.add_input() when `input` is None and `text` is not.
    anchors: stream handed to read_anchors(), or None.

  Each node gets the form as label, lemma/upos/xpos plus the parsed
  features as properties, and top = True when the head column is "0".
  Anchors are chosen per node as follows:
    - non-surface tokens get no anchors;
    - otherwise, when the anchors stream supplied a block, its next
      (from, to) pair is consumed;
    - otherwise a "TokenRange=from:to" entry in the MISC column is used;
    - otherwise the form is searched for in the input string, left to right.

  Returns:
    (graph, ids), where ids maps the id column of each tuple to the
    1-based node id used in the graph.

  Raises:
    Exception: when a form has to be searched for and cannot be located
      in the remaining input.
  """
  # Typographic characters and the ASCII renderings tried when a form does
  # not occur verbatim in the input.  An ordered tuple (not a set) keeps
  # the search independent of hash ordering.
  replacements = (("‘", "`"), ("‘", "'"), ("’", "'"), ("`", "'"),
                  ("“", "\""), ("”", "\""),
                  ("–", "--"), ("–", "---"), ("—", "---"),
                  ("…", "..."), ("…", ". . ."))
  i = 0  # offset into `input` just past the previously anchored form

  def compute(form):
    """Locate `form` in `input` at or after offset i; advance i past it."""
    nonlocal i
    m = None
    j = input.find(form, i)
    if j >= i:
      i, m = j, len(form)
    else:
      # k: leftmost match position found so far, l: length of that match
      k, l = len(input), 0
      for old, new in replacements:
        variant = form.replace(old, new)
        j = input.find(variant, i)
        # leftmost match wins; at the same position prefer the longer one
        # (e.g. "---" over "--") so the whole rendering is anchored
        if j >= i and (j < k or (j == k and len(variant) > l)):
          k, l = j, len(variant)
      if k < len(input):
        i, m = k, l
    if m:
      match = {"from": i, "to": i + m}
      i += m
      return match
    else:
      raise Exception("[{}] failed to anchor |{}| in |{}|{}| ({})"
                      "".format(graph.id, form, input[:i], input[i:], i))

  graph = Graph(id, flavor = 0, framework = framework)
  if input is not None:
    graph.add_input(input)
  elif text is not None:
    graph.add_input(text)
  # from here on work with whatever input string the graph ended up with
  input = graph.input

  # take one block from the anchors stream: (None, None) when there is none
  anchors_generator = read_anchors(anchors)
  _, anchors_tokens = next(anchors_generator)
  node_id, ids = 0, dict()
  ids2range_tuple = get_ids2range_tuple(tuples)
  for columns, is_surface_token in zip(tuples, get_is_surface_token_indicator(tuples)):
    node_id += 1
    ids[columns[0]] = node_id
    form, lemma, upos, xpos, features, head, misc = \
      columns[1], columns[2], columns[3], columns[4], columns[5], columns[6], columns[9]
    properties = {"lemma": lemma, "upos": upos, "xpos": xpos}
    if features != "_":
      for feature in features.split("|"):
        name, value = feature.split("=", 1)
        properties[name] = value
    # retrieve anchoring - only for surface tokens
    if not is_surface_token:
      node_anchors = []
    elif anchors_tokens is not None:
      start, end = anchors_tokens.pop(0)
      node_anchors = [{"from": start, "to": end}]
    else:
      tid = columns[0]
      # NOTE(review): a surface token with a plain numeric id is by
      # definition not covered by a range, so this lookup looks
      # unreachable here - confirm before relying on it.
      if tid.isnumeric() and int(tid) in ids2range_tuple:
        range_tuple_misc = ids2range_tuple[int(tid)][9]
        if range_tuple_misc != "_":
          misc = range_tuple_misc
      match = ANCHOR.match(misc)
      if match:
        node_anchors = [{"from": int(match.group(1)), "to": int(match.group(2))}]
      else:
        node_anchors = [compute(form)]
    graph.add_node(node_id, label = form,
                   properties = list(properties.keys()),
                   values = list(properties.values()),
                   top = head == "0",
                   anchors = node_anchors)
  return graph, ids

def construct_graph_edges(tuples, graph, ids):
  """ Add the basic dependency edges of a sentence to `graph`.
  For every tuple, column 6 (head) and column 7 (relation type) describe
  one edge from the head to the tuple's own id (column 0); both ids are
  translated through `ids`.  Tuples whose head is not a key of `ids` are
  skipped.
  Modifies `graph` argument. """
  for columns in tuples:
    dependent, governor, relation = columns[0], columns[6], columns[7]
    if governor not in ids:
      continue
    graph.add_edge(ids[governor], ids[dependent], relation)

def construct_enhanced_graph_edges(tuples, graph, ids):
  """ Add the Enhanced UD edges of a sentence to `graph`.
  Unlike the basic UD reader, edges come from the DEPS column (column 8)
  rather than from the HEAD and DEPREL columns.  DEPS is either "_" (no
  relations) or a "|"-separated list of "head:type" items; each item is
  split at its first ":" only, so the type may itself contain colons.
  Items whose head is not a key of `ids` are skipped.
  See https://universaldependencies.org/format.html#syntactic-annotation
  for the EUD format specification followed here.
  Modifies `graph` argument. """
  for columns in tuples:
    dependent, deps = columns[0], columns[8]
    # "_" denotes an empty list of relations
    relations = () if deps == "_" else deps.split("|")
    for relation in relations:
      governor, label = relation.split(":", 1)
      if governor in ids:
        graph.add_edge(ids[governor], ids[dependent], label)


def construct_graph(id, input, tuples, framework = None, text = None, anchors = None, enhanced_graph=False):
  """Build the complete graph (nodes and edges) for one sentence.

  Nodes come from construct_graph_nodes().  Edges are then added by
  construct_enhanced_graph_edges() when `enhanced_graph` is truthy, and by
  construct_graph_edges() (basic UD, the default) otherwise.

  Returns the populated graph.
  """
  graph, ids = construct_graph_nodes(id, input, tuples, framework, text, anchors)
  # pick the edge reader: Enhanced UD on request, basic UD by default
  add_edges = construct_enhanced_graph_edges if enhanced_graph else construct_graph_edges
  add_edges(tuples, graph, ids)
  return graph

def read(stream, framework = None, text = None, anchors = None, trace = 0, enhanced_graph=False):
  """Read a CoNLL-U stream and yield one graph per sentence block.

  Blocks are taken from read_tuples(stream) and turned into graphs by
  construct_graph(), to which `framework`, `text`, `anchors` and
  `enhanced_graph` are forwarded.  When `trace` is truthy a progress line
  naming the sentence id is printed to standard error for each block.

  Yields (graph, None) pairs.
  """
  for id, input, tuples in read_tuples(stream):
    if trace:
      progress = "conllu.read(): processing graph #{} ...".format(id)
      print(progress, file = sys.stderr)
    yield construct_graph(id, input, tuples,
                          framework = framework, text = text,
                          anchors = anchors,
                          enhanced_graph = enhanced_graph), None