#!/usr/bin/env python3 | |
from collections import defaultdict | |
import torch | |
from pytext.models.embeddings.dict_embedding import DictEmbedding | |
from pytext.models.embeddings.word_embedding import WordEmbedding | |
from pytext.models.model import EmbeddingBase, EmbeddingList | |
class PyTextInterpretableEmbedding(EmbeddingBase):
    r"""
    In PyText DocNN models we need a way to access word embedding layers,
    generate the embeddings and subtract the baseline.
    To do so, we separate embedding layers from the model, compute the embeddings
    separately and do all operations needed outside of the model.
    The original embedding layer is being replaced by `PyTextInterpretableEmbedding`
    layer which passes precomputed embedding vectors to lower layers.
    """

    def __init__(self, embeddings) -> None:
        # Per-feature embedding widths, in the same order as `embeddings`;
        # used later to slice the concatenated attribution tensor back apart.
        self.embedding_dims = [embedding.embedding_dim for embedding in embeddings]
        super().__init__(sum(self.embedding_dims))
        self.embeddings = embeddings

    def forward(self, input):
        r"""
        The forward pass of embedding layer. This can be for the text or any
        type of embedding.

        Args
            input: Input embeddings tensor

        Return
            output: Output tensor is the same as input. It passes through
                    the embedding tensors to lower layers without any
                    modifications
        """
        return input

    def get_attribution_map(self, attributions):
        r"""
        After attribution scores are computed for an input embedding vector
        we need to split it up into attribution sub tensors for each
        feature type: word, dict and other types

        TODO: we can potentally also output tuples of attributions. This might be
        a better option. We'll work on this in a separate diff.

        Args
            attributions: A tensor that contains attribution values for each input
                          field. It usually has the same dimensions as the input
                          tensor

        Return
            attribution_map: A dictionary of feature_type and attribution values

        Raises
            NotImplementedError: if an embedding other than word or dict
                                 embeddings is encountered
        """
        begin = 0
        # A plain dict suffices: the previous `defaultdict()` had no
        # default_factory, which makes it behave exactly like dict and
        # only obscures intent.
        attribution_map = {}
        for embedding, embedding_size in zip(self.embeddings, self.embedding_dims):
            end = begin + embedding_size
            if isinstance(embedding, WordEmbedding):
                attribution_map["word"] = attributions[:, :, begin:end]
            elif isinstance(embedding, DictEmbedding):
                attribution_map["dict"] = attributions[:, :, begin:end]
            else:
                raise NotImplementedError(
                    "Currently only word and dict " "embeddings are supported"
                )
            begin = end
        return attribution_map
class BaselineGenerator:
    r"""
    This is an example input baseline generator for DocNN model which uses
    word and dict features.
    """

    PAD = "<pad>"

    def __init__(self, model, data_handler, device) -> None:
        self.model = model
        self.data_handler = data_handler
        features = data_handler.features
        # Vocab attributes are set only when the corresponding feature is
        # configured; the baseline helpers fall back to token id 0 otherwise.
        if "dict_feat" in features:
            self.vocab_dict = features["dict_feat"].vocab
        if "word_feat" in features:
            self.vocab_word = features["word_feat"].vocab

        self.baseline_single_word_feature = self._generate_baseline_single_word_feature(
            device
        )
        self.baseline_single_dict_feature = self._generate_baseline_single_dict_feature(
            device
        )

    def generate_baseline(self, integ_grads_embeddings, seq_length):
        r"""
        Generates baseline for input word and dict features. In the future we
        will extend it to support char and other features as well.
        This baseline is entirely based on the `<pad>` token.

        Args
            integ_grads_embeddings: A reference to integrated gradients embedding
                                    layer
            seq_length: The length of each sequence which depends on batch size

        Return
            baseline: A tuple of feature baselines, one entry per feature type.
                      Currently only Dict and Word feature types are supported
        """
        return tuple(
            self._baseline_for(embedding, seq_length)
            for embedding in integ_grads_embeddings.embeddings
        )

    def _baseline_for(self, embedding, seq_length):
        # Dispatch on embedding type; anything but word/dict is unsupported.
        if isinstance(embedding, WordEmbedding):
            return self._generate_word_baseline(seq_length)
        if isinstance(embedding, DictEmbedding):
            return self._generate_dict_baseline(seq_length)
        raise NotImplementedError(
            "Currently only word and dict " "embeddings are supported"
        )

    def _generate_baseline_single_word_feature(self, device):
        # Single `<pad>`-token id as a 1 x 1 tensor; id 0 when no word vocab.
        pad_id = self.vocab_word.stoi[self.PAD] if hasattr(self, "vocab_word") else 0
        return torch.tensor([pad_id]).unsqueeze(0).to(device)

    def _generate_baseline_single_dict_feature(self, device):
        r"""Generate dict features based on Assistant's case study by using
        sia_transformer:
        fbcode/assistant/sia/transformer/sia_transformer.py

        sia_transformer emits dict features in a special gazetteer format
        (see `fbsource/fbcode/pytext/models/embeddings/dict_embedding.py`):
        one dict-feature embedding per word token. Running SIATransformer on
        the `<pad>` token yields three gazetteer feats (one per sub-token
        '<', 'pad', '>'), each with length 1 and weight 0.0; this method
        reproduces that baseline directly.
        """
        pad_feats = [self.PAD, self.PAD, self.PAD]
        # Hoisted vocab check: same result for every pad token.
        has_dict_vocab = hasattr(self, "vocab_dict")
        feat_ids = [
            self.vocab_dict.stoi[feat] if has_dict_vocab else 0 for feat in pad_feats
        ]
        gazetteer_feat_id = torch.tensor(feat_ids).unsqueeze(0).to(device)
        gazetteer_feat_weights = torch.tensor([0.0, 0.0, 0.0]).unsqueeze(0).to(device)
        gazetteer_feat_lengths = (
            torch.tensor([1, 1, 1]).to(device).view(1, -1)[:, 1]
        )
        return (gazetteer_feat_id, gazetteer_feat_weights, gazetteer_feat_lengths)

    def _generate_word_baseline(self, seq_length):
        # Tile the single-token word baseline across the whole sequence.
        return self.baseline_single_word_feature.repeat(1, seq_length)

    def _generate_dict_baseline(self, seq_length):
        # Tile each component of the single-token dict baseline.
        ids, weights, lengths = self.baseline_single_dict_feature
        return (
            ids.repeat(1, seq_length),
            weights.repeat(1, seq_length),
            lengths.repeat(1, seq_length),
        )
def configure_task_integ_grads_embeddings(task):
    r"""
    Wraps Pytext's DocNN model embedding with `IntegratedGradientsEmbedding` for
    a given input task.
    IntegratedGradientsEmbedding allows to perform baseline related operations

    Args
        task: DocNN task reference

    Returns
        integrated_gradients_embedding_lst: The embedding layer which contains
            IntegratedGradientsEmbedding as a wrapper over the original
            embeddings of the model
    """
    embedding_list = configure_model_integ_grads_embeddings(task.model)
    # Install the wrapped embedding list on the task's model in place of the
    # original embedding layer.
    task.model.embedding = embedding_list
    # Callers interact with the single interpretable embedding, not the
    # EmbeddingList wrapper around it.
    return embedding_list[0]
def configure_model_integ_grads_embeddings(model):
    r"""
    Wraps Pytext's DocNN model embedding with `IntegratedGradientsEmbedding`
    IntegratedGradientsEmbedding allows to perform baseline related operations

    Args
        model: a reference to DocModel

    Returns
        integrated_gradients_embedding_lst: The embedding layer which contains
            IntegratedGradientsEmbedding as a wrapper over the original
            embeddings of the model
    """
    # Wrap the model's existing embeddings so that precomputed embedding
    # vectors can be fed straight through to lower layers.
    wrapped_embedding = PyTextInterpretableEmbedding(model.embedding)
    return EmbeddingList([wrapped_embedding], False)
def reshape_word_features(word_features):
    r"""
    Creates one-sample batch for word features for sanity check purposes

    Args
        word_features: A tensor of dimensions #words x #embeddings

    Return
        word_features: A tensor of dimensions 1 x #words x #embeddings
    """
    # Prepend a singleton batch dimension.
    return torch.unsqueeze(word_features, 0)
def reshape_dict_features(
    dict_feature_id_batch, dict_weight_batch, dict_seq_len_batch, seq_length, idx
):
    r"""
    Creates one-sample batch for dict features for sanity check purposes
    It reads and reshapes id, weight and seq_length feature arrays for given
    input index `idx` from the input batch

    Args
        dict_feature_id_batch: The batch tensor for ids
        dict_weight_batch: The batch tensor for weights
        dict_seq_len_batch: The batch tensor for sequence length
        seq_length: The number of tokens per sequence (not used by this helper)
        idx: The index of sample in the batch

    Return
        dict_feature_ids: A tensor of dimensions [ bsz x # dict feature embeddings]
        dict_feature_weights: [ bsz x # dict feature embeddings]
        dict_feature_lens: [ bsz * seq_length ]
    """
    sample = (
        dict_feature_id_batch[idx],
        dict_weight_batch[idx],
        dict_seq_len_batch[idx],
    )
    # Indexing a single sample dropped the batch dimension; restore it.
    return tuple(feature.unsqueeze(0) for feature in sample)