File size: 10,280 Bytes
d61b9c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/usr/bin/env python3
from collections import defaultdict

import torch
from pytext.models.embeddings.dict_embedding import DictEmbedding
from pytext.models.embeddings.word_embedding import WordEmbedding
from pytext.models.model import EmbeddingBase, EmbeddingList


class PyTextInterpretableEmbedding(EmbeddingBase):
    r"""
    Pass-through replacement for the embedding layer of a PyText DocNN model.

    To compute attributions we need access to the embedding vectors so that
    a baseline can be subtracted from them. The embeddings are therefore
    computed outside of the model, and this layer takes the place of the
    original embedding layer: it simply hands the precomputed embedding
    vectors to the lower layers.

    The wrapped embeddings and their individual dimensions are kept on the
    instance so that an attribution tensor can later be split per feature type.
    """

    def __init__(self, embeddings) -> None:
        # Record the width of every wrapped embedding; the reported
        # embedding dimension of this layer is their sum.
        self.embedding_dims = [layer.embedding_dim for layer in embeddings]
        total_dim = sum(self.embedding_dims)
        super().__init__(total_dim)
        self.embeddings = embeddings

    def forward(self, input):
        r"""
        Forward pass of the embedding layer, for text or any other type of
        embedding.

        Args

           input: Input embeddings tensor

        Return

           output: The input itself. Embedding tensors are handed to the
                   lower layers without any modification
        """
        return input

    def get_attribution_map(self, attributions):
        r"""
        Splits an attribution tensor, computed for a whole input embedding
        vector, into one sub tensor per feature type (word, dict).

        The last axis of `attributions` is sliced into consecutive ranges
        whose widths are taken from `self.embedding_dims`, in the order of
        `self.embeddings`.

        TODO: we can potentially also output tuples of attributions. This might
        be a better option. We'll work on this in a separate diff.

        Args

           attributions: A tensor that contains attribution values for each
                         input field. It is indexed with three axes here and
                         usually has the same dimensions as the input tensor

        Return

           attribution_map: A dictionary keyed by feature type ("word",
                            "dict") holding the matching attribution slice

        Raises

           NotImplementedError: if a wrapped embedding is neither a
                                `WordEmbedding` nor a `DictEmbedding`
        """
        # No default factory is given, so this behaves like a plain dict.
        attribution_map = defaultdict()
        offset = 0
        for layer, width in zip(self.embeddings, self.embedding_dims):
            if isinstance(layer, WordEmbedding):
                feature_type = "word"
            elif isinstance(layer, DictEmbedding):
                feature_type = "dict"
            else:
                raise NotImplementedError(
                    "Currently only word and dict embeddings are supported"
                )
            upper = offset + width
            attribution_map[feature_type] = attributions[:, :, offset:upper]
            # The next embedding starts where this one ended.
            offset = upper

        return attribution_map


class BaselineGenerator:
    r"""
    Example input baseline generator for a DocNN model that uses word and
    dict features. All baselines are built from the `<pad>` token.

    The baseline for a single token is computed once in the constructor and
    tiled to the requested sequence length on demand.
    """
    PAD = "<pad>"

    def __init__(self, model, data_handler, device) -> None:
        self.model = model
        self.data_handler = data_handler
        # A vocabulary attribute is only set when the data handler provides
        # the matching feature; the helpers below probe with `hasattr`.
        features = data_handler.features
        if "dict_feat" in features:
            self.vocab_dict = features["dict_feat"].vocab
        if "word_feat" in features:
            self.vocab_word = features["word_feat"].vocab

        self.baseline_single_word_feature = (
            self._generate_baseline_single_word_feature(device)
        )
        self.baseline_single_dict_feature = (
            self._generate_baseline_single_dict_feature(device)
        )

    def generate_baseline(self, integ_grads_embeddings, seq_length):
        r"""
        Generates the baseline for input word and dict features. In the
        future this will be extended to char and other features as well.
        The baseline is entirely based on the `<pad>` token.

        Args

            integ_grads_embeddings: A reference to the integrated gradients
                                    embedding layer; its `embeddings`
                                    attribute is iterated
            seq_length: The number of tokens the baseline is tiled to

        Return
                baseline: A tuple of feature baselines, one entry per
                          embedding, in the order of the embeddings.
                          Currently only Dict and Word feature types are
                          supported

        Raises
                NotImplementedError: if an embedding is neither a
                                     `WordEmbedding` nor a `DictEmbedding`
        """
        per_feature = []
        for layer in integ_grads_embeddings.embeddings:
            if isinstance(layer, WordEmbedding):
                build = self._generate_word_baseline
            elif isinstance(layer, DictEmbedding):
                build = self._generate_dict_baseline
            else:
                raise NotImplementedError(
                    "Currently only word and dict embeddings are supported"
                )
            per_feature.append(build(seq_length))
        return tuple(per_feature)

    def _generate_baseline_single_word_feature(self, device):
        r"""
        Builds the 1 x 1 tensor holding the word-vocabulary index of the
        `<pad>` token, or 0 when no word vocabulary was attached.
        """
        if hasattr(self, "vocab_word"):
            pad_index = self.vocab_word.stoi[self.PAD]
        else:
            pad_index = 0
        pad_tensor = torch.tensor([pad_index])
        return pad_tensor.unsqueeze(0).to(device)

    def _generate_baseline_single_dict_feature(self, device):
        r"""Builds the dict feature baseline for a single `<pad>` token.

         The values follow Assistant's case study using sia_transformer:
         fbcode/assistant/sia/transformer/sia_transformer.py
         sia_transformer generates dict features in a special gazetteer format
         See `fbsource/fbcode/pytext/models/embeddings/dict_embedding.py`

         The output of SIATransformer after running it on the `<pad>` token
         looks as follows:
        OutputRecord(tokens=['<', 'pad', '>'],
                     token_ranges=[(0, 1), (1, 4), (4, 5)],
                     gazetteer_feats=['<pad>', '<pad>', '<pad>'],
                     gazetteer_feat_lengths=[1, 1, 1],
                     gazetteer_feat_weights=[0.0, 0.0, 0.0],
                     characters=[['<', '<pad>', '<pad>'],
                                ['p', 'a', 'd'], ['>', '<pad>', '<pad>']],
                     pretrained_token_embedding=[ ], dense_feats=None)

        Return
            A tuple (ids, weights, lengths): `ids` and `weights` are 1 x 3
            tensors, `lengths` is a one-element tensor.
        """
        pad_feats = [self.PAD] * 3
        # Map every gazetteer feature to its dict-vocabulary index; fall back
        # to index 0 when no dict vocabulary was attached.
        if hasattr(self, "vocab_dict"):
            feat_ids = [self.vocab_dict.stoi[feat] for feat in pad_feats]
        else:
            feat_ids = [0] * len(pad_feats)

        id_tensor = torch.tensor(feat_ids).unsqueeze(0).to(device)
        weight_tensor = torch.tensor([0.0, 0.0, 0.0]).unsqueeze(0).to(device)
        # Only the entry at column 1 of the 1 x 3 length row is kept, which
        # leaves a single length value for the token.
        length_row = torch.tensor([1, 1, 1]).to(device).view(1, -1)
        length_tensor = length_row[:, 1]

        return (id_tensor, weight_tensor, length_tensor)

    def _generate_word_baseline(self, seq_length):
        r"""
        Tiles the single-token word baseline `seq_length` times along the
        last axis.
        """
        single = self.baseline_single_word_feature
        return single.repeat(1, seq_length)

    def _generate_dict_baseline(self, seq_length):
        r"""
        Tiles each of the three single-token dict baseline tensors (ids,
        weights, lengths) `seq_length` times along the last axis.
        """
        single = self.baseline_single_dict_feature
        ids = single[0].repeat(1, seq_length)
        weights = single[1].repeat(1, seq_length)
        lengths = single[2].repeat(1, seq_length)
        return (ids, weights, lengths)


def configure_task_integ_grads_embeddings(task):
    r"""
    Replaces the embedding of the model of a PyText DocNN task with the
    wrapped embedding list built by `configure_model_integ_grads_embeddings`.
    The wrapper makes it possible to perform baseline related operations
    outside of the model.

    Note that `task.model.embedding` is overwritten in place.

    Args

        task: DocNN task reference; its `model.embedding` is read and replaced

    Returns

        The first entry of the newly installed embedding list, i.e. the
        wrapper placed around the original embeddings of the model

    """
    model = task.model
    wrapped_embeddings = configure_model_integ_grads_embeddings(model)
    model.embedding = wrapped_embeddings
    return wrapped_embeddings[0]


def configure_model_integ_grads_embeddings(model):
    r"""
    Wraps the embedding of a PyText DocNN model in a
    `PyTextInterpretableEmbedding` and returns it inside a new
    `EmbeddingList`. The model itself is not modified.

    Args

        model: a reference to DocModel; only `model.embedding` is read

    Returns

        integrated_gradients_embedding_lst: An `EmbeddingList` whose single
                    entry is the `PyTextInterpretableEmbedding` wrapper over
                    the original embeddings of the model

    """
    wrapper = PyTextInterpretableEmbedding(model.embedding)
    # NOTE(review): the second positional argument is presumably the
    # `concat` flag of EmbeddingList - confirm against pytext.
    return EmbeddingList([wrapper], False)


def reshape_word_features(word_features):
    r"""
    Turns the word features of one sample into a one-sample batch, for
    sanity check purposes, by adding a leading axis of size 1.

    Args

        word_features: A tensor of dimensions #words x #embeddings

    Return

        word_features: A tensor of dimensions 1 x #words x #embeddings

    """
    batch_axis = 0
    one_sample_batch = word_features.unsqueeze(batch_axis)
    return one_sample_batch


def reshape_dict_features(
    dict_feature_id_batch, dict_weight_batch, dict_seq_len_batch, seq_length, idx
):
    r"""
    Creates a one-sample batch of dict features for sanity check purposes.
    The id, weight and sequence length entries of sample `idx` are read from
    the input batch and each gets a leading axis of size 1.

    Args

        dict_feature_id_batch: The batch tensor for ids
        dict_weight_batch: The batch tensor for weights
        dict_seq_len_batch: The batch tensor for sequence length
        seq_length: The number of tokens per sequence (not used by this
                    function)
        idx: The index of sample in the batch

    Return

        A tuple (dict_feature_ids, dict_feature_weights, dict_feature_lens)
        holding the selected row of each input batch, in that order, each
        with a leading axis of size 1

    """
    batches = (dict_feature_id_batch, dict_weight_batch, dict_seq_len_batch)
    return tuple(batch[idx].unsqueeze(0) for batch in batches)