File size: 15,222 Bytes
8044721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
"""Utility functions for UCCA package."""
import os
import time
from collections import OrderedDict
from collections import deque
from enum import Enum
from itertools import groupby, islice
from operator import attrgetter, itemgetter

import numpy as np

from ucca import layer0, layer1

MODEL_ENV_VAR = "SPACY_MODEL"  # Determines the default spaCy model to load
DEFAULT_MODEL = {"en": "en_core_web_md", "fr": "fr_core_news_md", "de": "de_core_news_md"}

N_THREADS = 4
BATCH_SIZE = 50


class Attr(Enum):
    """Wrapper for spaCy Attr, determining order for saving in layer0.extra per token when as_array=True"""
    ORTH = 0
    LEMMA = 1
    TAG = 2
    POS = 3
    ENT_TYPE = 4
    ENT_IOB = 5
    DEP = 6
    HEAD = 7
    SHAPE = 8
    PREFIX = 9
    SUFFIX = 10

    def __call__(self, value, vocab=None, as_array=False, lang=None):
        """Resolve numeric ID of attribute value to string (if as_array=False) or to int (if as_array=True)"""
        if value is None:
            return None
        if self in (Attr.ENT_IOB, Attr.HEAD):
            return int(np.int64(value))
        if as_array:
            is_str = isinstance(value, str)
            if is_str or self in (Attr.ORTH, Attr.LEMMA):
                try:  # Will find the value even if it's a new string, but that's OK since the hash is deterministic
                    i = get_vocab(vocab, lang).strings[value]
                    if is_str:  # Replace with numeric ID since as_array=True
                        value = i
                except KeyError:
                    value = None
            return value if value is None or isinstance(value, str) else int(value)
        try:
            return get_vocab(vocab, lang)[value].text
        except KeyError:
            return None
    
    @property
    def key(self):
        """String used in `extra' dict of Terminals to store this attribute when as_array=False"""
        return self.name.lower()


def get_nlp(lang="en"):
    """Load spaCy model for a given language, determined by `models' dict or by MODEL_ENV_VAR"""
    instance = nlp.get(lang)
    if instance is None:
        import spacy
        model = models.get(lang)
        if not model:
            models[lang] = model = os.environ.get("_".join((MODEL_ENV_VAR, lang.upper()))) or \
                                   os.environ.get(MODEL_ENV_VAR) or DEFAULT_MODEL.get(lang, "xx")
        started = time.time()
        with external_write_mode():
            print("Loading spaCy model '%s'... " % model, end="", flush=True)
            try:
                nlp[lang] = instance = spacy.load(model)
            except OSError:
                spacy.cli.download(model)
                try:
                    nlp[lang] = instance = spacy.load(model)
                except OSError as e:
                    raise OSError("Failed to get spaCy model. Download it manually using "
                                  "`python -m spacy download %s`." % model) from e
            tokenizer[lang] = instance.tokenizer
            instance.tokenizer = lambda words: spacy.tokens.Doc(instance.vocab, words=words)
            print("Done (%.3fs)." % (time.time() - started))
    return instance


models = {}  # maps language two-letter code to name of spaCy model
nlp = {}  # maps language two-letter code to actual loaded spaCy model
tokenizer = {}  # maps language two-letter code to tokenizer of spaCy model


def get_tokenizer(tokenized=False, lang="en"):
    instance = get_nlp(lang)
    return instance.tokenizer if tokenized else tokenizer[lang]


def get_vocab(vocab=None, lang=None):
    if vocab is not None:
        return vocab
    return (get_nlp(lang) if lang else get_nlp()).vocab


def get_word_vectors(dim=None, size=None, filename=None, vocab=None):
    """
    Get word vectors from spaCy model or from text file
    :param dim: dimension to trim vectors to (default: keep original)
    :param size: maximum number of vectors to load (default: all)
    :param filename: text file to load vectors from (default: from spaCy model)
    :param vocab: instead of strings, look up keys of returned dict in vocab (use lang str, e.g. "en", for spaCy vocab)
    :return: tuple of (dict of word [string or integer] -> vector [NumPy array], dimension)
    """
    orig_keys = vocab is None
    if isinstance(vocab, str) or not filename:
        vocab = get_nlp(vocab if isinstance(vocab, str) else "en").vocab

    def _lookup(word):
        try:
            return word.orth_ if orig_keys else word.orth
        except AttributeError:
            if orig_keys:
                return word
        lex = vocab[word]
        return getattr(lex, "orth", lex)

    if filename:
        it = read_word_vectors(dim, size, filename)
        nr_row, nr_dim = next(it)
        vectors = OrderedDict(islice(((_lookup(w), v) for w, v in it if orig_keys or w in vocab), nr_row))
    else:  # return spaCy vectors
        nr_row, nr_dim = vocab.vectors.shape
        if dim is not None and dim < nr_dim:
            nr_dim = int(dim)
            vocab.vectors.resize(shape=(int(size or nr_row), nr_dim))
        lexemes = sorted([l for l in vocab if l.has_vector], key=attrgetter("prob"), reverse=True)[:size]
        vectors = OrderedDict((_lookup(l), l.vector) for l in lexemes)
    return vectors, nr_dim


def read_word_vectors(dim, size, filename):
    """
    Read word vectors from text file, with an optional first row indicating size and dimension
    :param dim: dimension to trim vectors to
    :param size: maximum number of vectors to load
    :param filename: text file to load vectors from
    :return: generator: first element is (#vectors, #dims); and all the rest are (word [string], vector [NumPy array])
    """
    try:
        first_line = True
        nr_row = nr_dim = None
        with open(filename, encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if first_line:
                    first_line = False
                    try:
                        nr_row, nr_dim = map(int, fields)
                        is_header = True
                    except ValueError:
                        nr_dim = len(fields) - 1  # No header, just get vector length from first one
                        is_header = False
                    if dim and dim < nr_dim:
                        nr_dim = dim
                    yield size or nr_row, nr_dim
                    if is_header:
                        continue  # Read next line
                word, *vector = fields
                if len(vector) >= nr_dim:  # May not be equal if word is whitespace
                    yield word, np.asarray(vector[-nr_dim:], dtype="f")
    except OSError as e:
        raise IOError("Failed loading word vectors from '%s'" % filename) from e


def annotate(passage, *args, **kwargs):
    """
    Run spaCy pipeline on the given passage, unless already annotated
    :param passage: Passage object, whose layer 0 nodes will be added entries in the `extra' dict
    """
    list(annotate_all([passage], *args, **kwargs))


def annotate_as_tuples(passages, replace=False, as_array=False, lang="en", vocab=None, verbose=False):
    for passage_lang, passages_by_lang in groupby(passages, get_lang):
        for need_annotation, stream in groupby(to_annotate(passages_by_lang, replace, as_array), lambda x: bool(x[0])):
            annotated = get_nlp(passage_lang or lang).pipe(
                stream, as_tuples=True, n_threads=N_THREADS, batch_size=BATCH_SIZE) if need_annotation else stream
            annotated = set_docs(annotated, as_array, passage_lang or lang, vocab, replace, verbose)
            for passage, passages in groupby(annotated, itemgetter(0)):
                yield deque(passages, maxlen=1).pop()  # Wait until all paragraphs have been annotated


def annotate_all(passages, replace=False, as_array=False, as_tuples=False, lang="en", vocab=None, verbose=False):
    """
    Run spaCy pipeline on the given passages, unless already annotated
    :param passages: iterable of Passage objects, whose layer 0 nodes will be added entries in the `extra' dict
    :param replace: even if a given passage is already annotated, replace with new annotation
    :param as_array: instead of adding `extra' entries to each terminal, set layer 0 extra["doc"] to array of ids
    :param as_tuples: treat input as tuples of (passage text, context), and return context for each passage as-is
    :param lang: optional two-letter language code, will be overridden if passage has "lang" attrib
    :param vocab: optional dictionary of vocabulary IDs to string values, to avoid loading spaCy model
    :param verbose: whether to print annotated text
    :return: generator of annotated passages, which are actually modified in-place (same objects as input)
    """
    if not as_tuples:
        passages = ((p,) for p in passages)
    for t in annotate_as_tuples(passages, replace=replace, as_array=as_array, lang=lang, vocab=vocab, verbose=verbose):
        yield t if as_tuples else t[0]


def get_lang(passage_context):
    return passage_context[0].attrib.get("lang")


def to_annotate(passage_contexts, replace, as_array):
    """Filter passages to get only those that require annotation; split to paragraphs and return generator of
    (list of tokens, (paragraph index, list of Terminals, Passage) + original context appended) tuples"""
    return (([t.text for t in terminals] if replace or not is_annotated(passage, as_array) else (),
             (i, terminals, passage) + tuple(context)) for passage, *context in passage_contexts
            for i, terminals in enumerate(break2paragraphs(passage, return_terminals=True)))


def is_annotated(passage, as_array):
    """Whether the passage is already annotated or only partially annotated"""
    l0 = passage.layer(layer0.LAYER_ID)
    if as_array:
        docs = l0.extra.get("doc")
        return not l0.all or docs is not None and len(docs) == max(t.paragraph for t in l0.all) and \
            sum(map(len, docs)) == len(l0.all) and \
            all(i is None or isinstance(i, int) for l in docs for t in l for i in t)
    return all(a.key in t.extra for t in l0.all for a in Attr)


def set_docs(annotated, as_array, lang, vocab, replace, verbose):
    """Given spaCy annotations, set values in layer0.extra per paragraph if as_array=True, or else in Terminal.extra"""
    for doc, (i, terminals, passage, *context) in annotated:
        if doc:  # Not empty, so copy values
            from spacy import attrs
            arr = doc.to_array([getattr(attrs, a.name) for a in Attr])
            if as_array:
                docs = passage.layer(layer0.LAYER_ID).docs(i + 1)
                existing = docs[i] + (len(arr) - len(docs[i])) * [len(Attr) * [None]]
                docs[i] = [[a(v if e is None or replace else e, get_vocab(vocab, lang), as_array=True)
                            for a, v, e in zip(Attr, values, es)] for values, es in zip(arr, existing)]
            else:
                for terminal, values in zip(terminals, arr):
                    for attr, value in zip(Attr, values):
                        if replace or not terminal.extra.get(attr.key):
                            terminal.extra[attr.key] = attr(value, get_vocab(vocab, lang))
        if verbose:
            data = [[a.key for a in Attr]] + \
                   [[str(a(t.tok[a.value], get_vocab(vocab, lang)) if as_array else t.extra[a.key])
                     for a in Attr] for j, t in enumerate(terminals)]
            width = [max(len(f) for f in t) for t in data]
            for j in range(len(Attr)):
                try:
                    print(" ".join("%-*s" % (w, f[j]) for f, w in zip(data, width)))
                except UnicodeEncodeError:
                    pass
            print()
        yield (passage,) + tuple(context)


SENTENCE_END_MARKS = ('.', '?', '!')


def break2sentences(passage, lang="en", *args, **kwargs):
    """
    Breaks paragraphs into sentences according to the annotation.

    A sentence is a list of terminals which ends with a mark from
    SENTENCE_END_MARKS, and is also the end of a paragraph or parallel scene.
    :param passage: the Passage object to operate on
    :param lang: optional two-letter language code
    :return: a list of positions in the Passage, each denotes a closing Terminal of a sentence.
    """
    del args, kwargs
    l1 = passage.layer(layer1.LAYER_ID)
    terminals = extract_terminals(passage)
    if not terminals:
        return []
    if any(n.outgoing for n in l1.all):  # Passage is labeled
        ps_ends = [ps.end_position for ps in l1.top_scenes]
        ps_starts = [ps.start_position for ps in l1.top_scenes]
        marks = [t.position for t in terminals if t.text in SENTENCE_END_MARKS]
        # Annotations doesn't always include the ending period (or other mark)
        # with the parallel scene it closes. Hence, if the terminal before the
        # mark closed the parallel scene, and this mark doesn't open a scene
        # in any way (hence it probably just "hangs" there), it's a sentence end
        marks = [x for x in marks if x in ps_ends or ((x - 1) in ps_ends and x not in ps_starts)]
    else:  # Not labeled, split using spaCy
        annotated = get_nlp(lang=lang)([t.text for t in terminals])
        marks = [span.end for span in annotated.sents]
    marks = sorted(set(marks + break2paragraphs(passage)))
    # Avoid punctuation-only sentences
    if len(marks) > 1:
        marks = [x for x, y in zip(marks[:-1], marks[1:]) if not all(layer0.is_punct(t) for t in terminals[x:y])] + \
                [marks[-1]]
    return marks


def extract_terminals(p):
    """returns an iterator of the terminals of the passage p"""
    return p.layer(layer0.LAYER_ID).all


def break2paragraphs(passage, return_terminals=False, *args, **kwargs):
    """
    Breaks into paragraphs according to the annotation.

    Uses the `paragraph' attribute of layer 0 to find paragraphs.
    :param passage: the Passage object to operate on
    :param return_terminals: whether to return actual Terminal objects of all terminals rather than just end positions
    :return: a list of positions in the Passage, each denotes a closing Terminal of a paragraph.
    """
    del args, kwargs
    terminals = list(extract_terminals(passage))
    if not terminals:
        return []
    return [list(p) for _, p in groupby(terminals, key=attrgetter("paragraph"))] if return_terminals else \
        [t.position - 1 for t in terminals if t.position > 1 and t.para_pos == 1] + [terminals[-1].position]


def indent_xml(xml_as_string):
    """
    Indents a string of XML-like objects.

    This works only for units with no text or tail members, and only for
    strings whose leaves are written as <tag /> and not <tag></tag>.
    :param xml_as_string: XML string to indent
    :return: indented XML string
    """
    tabs = 0
    lines = str(xml_as_string).replace('><', '>\n<').splitlines()
    s = ''
    for line in lines:
        if line.startswith('</'):
            tabs -= 1
        s += ("  " * tabs) + line + '\n'
        if not (line.endswith('/>') or line.startswith('</')):
            tabs += 1
    return s