Spaces:
Build error
Build error
File size: 10,280 Bytes
d61b9c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 |
#!/usr/bin/env python3
from collections import defaultdict
import torch
from pytext.models.embeddings.dict_embedding import DictEmbedding
from pytext.models.embeddings.word_embedding import WordEmbedding
from pytext.models.model import EmbeddingBase, EmbeddingList
class PyTextInterpretableEmbedding(EmbeddingBase):
    r"""
    Pass-through stand-in for the embedding layer of a PyText DocNN model.

    To attribute to word embeddings we need direct access to the embedding
    layers: the embeddings are generated outside of the model, the baseline is
    subtracted there, and any other required operations happen there as well.
    This layer replaces the model's original embedding layer and simply hands
    the precomputed embedding vectors on to the lower layers.
    """

    def __init__(self, embeddings) -> None:
        r"""
        Args

            embeddings: An iterable of the wrapped embedding layers. Each
                    element must expose an `embedding_dim` attribute.
        """
        # Width of every wrapped layer, in iteration order. These widths are
        # what `get_attribution_map` uses to cut the attribution tensor apart.
        self.embedding_dims = [layer.embedding_dim for layer in embeddings]
        # The base class receives the total width of all wrapped layers.
        super().__init__(sum(self.embedding_dims))
        self.embeddings = embeddings

    def forward(self, input):
        r"""
        Forward pass of the layer. Works for text or any other embedding type.

        Args

            input: Input embeddings tensor (already computed by the caller)

        Return

            output: The very same object that was passed in; nothing is
                    computed or modified here.
        """
        return input

    def get_attribution_map(self, attributions):
        r"""
        Splits attribution scores computed for an input embedding vector into
        one sub tensor per feature type (word, dict).

        The third axis of `attributions` is sliced into consecutive chunks, one
        per wrapped embedding, using the widths stored in `embedding_dims`.

        TODO: we can potentially also output tuples of attributions. This might
        be a better option. We'll work on this in a separate diff.

        Args

            attributions: A tensor that contains attribution values for each
                    input field. It is indexed with three indices here, so it
                    must have at least three dimensions; it usually has the
                    same dimensions as the input tensor.

        Return

            attribution_map: A dictionary from feature type (`"word"` or
                    `"dict"`) to the matching slice of `attributions`. If
                    several wrapped embeddings share a type, the last one
                    wins. The object is a `defaultdict` created without a
                    default factory, so missing keys still raise `KeyError`.

        Raises

            NotImplementedError: if a wrapped embedding is neither a
                    `WordEmbedding` nor a `DictEmbedding`.
        """
        attribution_map = defaultdict()
        offset = 0
        for layer, width in zip(self.embeddings, self.embedding_dims):
            if isinstance(layer, WordEmbedding):
                feature_type = "word"
            elif isinstance(layer, DictEmbedding):
                feature_type = "dict"
            else:
                raise NotImplementedError(
                    "Currently only word and dict embeddings are supported"
                )
            upper = offset + width
            attribution_map[feature_type] = attributions[:, :, offset:upper]
            # The next layer's chunk starts where this one stopped.
            offset = upper
        return attribution_map
class BaselineGenerator:
    r"""
    Example input baseline generator for a DocNN model that uses word and
    dict features. All baselines it builds are made of the `<pad>` token.
    """

    PAD = "<pad>"

    def __init__(self, model, data_handler, device) -> None:
        r"""
        Args

            model: Reference to the model; only stored on the instance here.
            data_handler: Object with a `features` mapping. The entries
                    `"dict_feat"` and `"word_feat"` are optional; when present
                    their `vocab` is kept as `vocab_dict` / `vocab_word`.
            device: Device the single-token baseline tensors are moved to.
        """
        self.model = model
        self.data_handler = data_handler

        # The vocab attributes are only created when the feature exists; the
        # single-token builders below check for them with `hasattr`.
        features = data_handler.features
        if "dict_feat" in features:
            self.vocab_dict = features["dict_feat"].vocab
        if "word_feat" in features:
            self.vocab_word = features["word_feat"].vocab

        # Single-token baselines are built once and repeated on demand.
        self.baseline_single_word_feature = (
            self._generate_baseline_single_word_feature(device)
        )
        self.baseline_single_dict_feature = (
            self._generate_baseline_single_dict_feature(device)
        )

    def generate_baseline(self, integ_grads_embeddings, seq_length):
        r"""
        Generates the baseline for input word and dict features. In the future
        this is meant to be extended to char and other features as well.

        The baseline is entirely based on the `<pad>` token.

        Args

            integ_grads_embeddings: A reference to the integrated gradients
                    embedding layer; its `embeddings` attribute is iterated.
            seq_length: How many times the single-token baseline is repeated
                    along the second dimension.

        Return

            baseline: A tuple with one entry per embedding, in order.
                    A word embedding contributes one tensor, a dict embedding
                    contributes a tuple of three tensors (ids, weights,
                    lengths). Only Dict and Word feature types are supported.

        Raises

            NotImplementedError: if an embedding is neither a `WordEmbedding`
                    nor a `DictEmbedding`.
        """
        baselines = []
        for layer in integ_grads_embeddings.embeddings:
            if isinstance(layer, WordEmbedding):
                feature_baseline = self._generate_word_baseline(seq_length)
            elif isinstance(layer, DictEmbedding):
                feature_baseline = self._generate_dict_baseline(seq_length)
            else:
                raise NotImplementedError(
                    "Currently only word and dict embeddings are supported"
                )
            baselines.append(feature_baseline)
        return tuple(baselines)

    def _generate_baseline_single_word_feature(self, device):
        r"""
        Builds the 1 x 1 tensor holding the `<pad>` index of the word vocab,
        or 0 when no word vocab was registered, and moves it to `device`.
        """
        if hasattr(self, "vocab_word"):
            pad_index = self.vocab_word.stoi[self.PAD]
        else:
            pad_index = 0
        return torch.tensor([pad_index]).unsqueeze(0).to(device)

    def _generate_baseline_single_dict_feature(self, device):
        r"""Generate dict features based on Assistant's case study by using
        sia_transformer:
        fbcode/assistant/sia/transformer/sia_transformer.py
        sia_transformer generates dict features in a special gazetteer format
        See `fbsource/fbcode/pytext/models/embeddings/dict_embedding.py`

        It generates word dict feature embeddings for each word token.

        The output of SIATransformer after running it on `<pad>` token
        looks as following:
        OutputRecord(tokens=['<', 'pad', '>'],
                     token_ranges=[(0, 1), (1, 4), (4, 5)],
                     gazetteer_feats=['<pad>', '<pad>', '<pad>'],
                     gazetteer_feat_lengths=[1, 1, 1],
                     gazetteer_feat_weights=[0.0, 0.0, 0.0],
                     characters=[['<', '<pad>', '<pad>'],
                                ['p', 'a', 'd'], ['>', '<pad>', '<pad>']],
                     pretrained_token_embedding=[ ], dense_feats=None)

        Return

            A tuple `(ids, weights, lengths)` of tensors on `device`:
            ids is 1 x 3, weights is 1 x 3 and lengths has a single element.
        """
        pad_feats = [self.PAD, self.PAD, self.PAD]
        pad_lengths = [1, 1, 1]
        pad_weights = [0.0, 0.0, 0.0]

        # Without a dict vocab every feature id falls back to 0.
        if hasattr(self, "vocab_dict"):
            feat_ids = [self.vocab_dict.stoi[feat] for feat in pad_feats]
        else:
            feat_ids = [0 for _ in pad_feats]

        id_tensor = torch.tensor(feat_ids).unsqueeze(0).to(device)
        weight_tensor = torch.tensor(pad_weights).unsqueeze(0).to(device)

        # Column 1 of the 1 x 3 view is selected, which leaves a one-element
        # tensor. NOTE(review): presumably one length entry per token while
        # ids/weights carry three entries per token; confirm with PyText.
        length_row = torch.tensor(pad_lengths).to(device).view(1, -1)
        length_tensor = length_row[:, 1]

        return (id_tensor, weight_tensor, length_tensor)

    def _generate_word_baseline(self, seq_length):
        r"""
        Repeats the single-token word baseline `seq_length` times along the
        second dimension.
        """
        single_token = self.baseline_single_word_feature
        return single_token.repeat(1, seq_length)

    def _generate_dict_baseline(self, seq_length):
        r"""
        Repeats each of the three single-token dict baseline tensors (ids,
        weights, lengths) `seq_length` times along the second dimension and
        returns them as a tuple in the same order.
        """
        ids, weights, lengths = self.baseline_single_dict_feature
        return (
            ids.repeat(1, seq_length),
            weights.repeat(1, seq_length),
            lengths.repeat(1, seq_length),
        )
def configure_task_integ_grads_embeddings(task):
    r"""
    Wraps the embedding of a PyText DocNN task's model with
    `PyTextInterpretableEmbedding`, which allows baseline related operations
    to be performed outside of the model.

    The task's model is modified in place: `task.model.embedding` is replaced
    by the list returned from `configure_model_integ_grads_embeddings`.

    Args

        task: DocNN task reference; its `model` attribute is read and its
                `model.embedding` attribute is overwritten.

    Returns

        The element at index 0 of the newly installed embedding list, i.e.
        the wrapper layer placed around the model's original embeddings.
    """
    wrapped_embedding_list = configure_model_integ_grads_embeddings(task.model)
    task.model.embedding = wrapped_embedding_list
    return wrapped_embedding_list[0]
def configure_model_integ_grads_embeddings(model):
    r"""
    Wraps the embedding of a PyText DocNN model with
    `PyTextInterpretableEmbedding`, which allows baseline related operations
    to be performed outside of the model.

    The model itself is not modified here; only `model.embedding` is read.

    Args

        model: a reference to DocModel

    Returns

        An `EmbeddingList` whose single element is a
        `PyTextInterpretableEmbedding` wrapping the model's original
        embeddings.
    """
    wrapper = PyTextInterpretableEmbedding(model.embedding)
    # NOTE(review): the second positional argument is presumably the
    # list's `concat` flag; confirm against PyText's EmbeddingList.
    return EmbeddingList([wrapper], False)
def reshape_word_features(word_features):
    r"""
    Creates a one-sample batch of word features for sanity check purposes.

    Args

        word_features: A tensor for a single sample (described by the
                authors as #words x #embeddings)

    Return

        word_features: The same data with a new leading dimension of size 1
                (1 x #words x #embeddings for the input described above)
    """
    one_sample_batch = word_features.unsqueeze(0)
    return one_sample_batch
def reshape_dict_features(
    dict_feature_id_batch, dict_weight_batch, dict_seq_len_batch, seq_length, idx
):
    r"""
    Creates a one-sample batch of dict features for sanity check purposes.

    Picks entry `idx` from each of the id, weight and sequence-length batch
    tensors and gives every picked entry a new leading dimension of size 1.

    Args

        dict_feature_id_batch: The batch tensor for ids
        dict_weight_batch: The batch tensor for weights
        dict_seq_len_batch: The batch tensor for sequence length
        seq_length: The number of tokens per sequence (accepted but not used
                by this function)
        idx: The index of sample in the batch

    Return

        dict_feature_ids: Entry `idx` of the id batch, with a leading
                dimension of size 1
        dict_feature_weights: Entry `idx` of the weight batch, with a leading
                dimension of size 1
        dict_feature_lens: Entry `idx` of the sequence-length batch, with a
                leading dimension of size 1
    """
    sample_ids, sample_weights, sample_lens = (
        batch[idx].unsqueeze(0)
        for batch in (dict_feature_id_batch, dict_weight_batch, dict_seq_len_batch)
    )
    return (sample_ids, sample_weights, sample_lens)
|