PyTorch · ssl-aasist · custom_code
ash56 committed · verified · Commit 878264b · 1 Parent(s): d28af7f

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +2 -0
  2. fairseq/examples/MMPT/projects/retri/videoclip/test_didemo_zs.yaml +31 -0
  3. fairseq/examples/MMPT/projects/retri/videoclip/vttqa_videoclip.yaml +49 -0
  4. fairseq/examples/MMPT/projects/retri/videoclip/youcook_videoclip.yaml +49 -0
  5. fairseq/examples/MMPT/projects/task/coin.yaml +25 -0
  6. fairseq/examples/MMPT/projects/task/coin_videoclip.yaml +7 -0
  7. fairseq/examples/MMPT/projects/task/test.yaml +13 -0
  8. fairseq/examples/MMPT/projects/task/test_vtt.yaml +19 -0
  9. fairseq/examples/MMPT/projects/task/test_youcook.yaml +22 -0
  10. fairseq/examples/MMPT/projects/task/test_youcookcap.yaml +23 -0
  11. fairseq/examples/MMPT/projects/task/vtt.yaml +25 -0
  12. fairseq/examples/MMPT/projects/task/vtt_videoclip.yaml +12 -0
  13. fairseq/examples/MMPT/projects/task/vttqa_videoclip.yaml +10 -0
  14. fairseq/examples/MMPT/projects/task/youcook.yaml +25 -0
  15. fairseq/examples/MMPT/projects/task/youcook_videoclip.yaml +9 -0
  16. fairseq/examples/MMPT/projects/task/youcookcap.yaml +23 -0
  17. fairseq/examples/MMPT/scripts/text_token_extractor/configs/bert-base-uncased.yaml +5 -0
  18. fairseq/examples/MMPT/scripts/text_token_extractor/pretokenization.py +106 -0
  19. fairseq/examples/MMPT/scripts/video_feature_extractor/extract.py +157 -0
  20. fairseq/examples/MMPT/scripts/video_feature_extractor/how2/s3d.sh +8 -0
  21. fairseq/examples/MMPT/scripts/video_feature_extractor/model.py +58 -0
  22. fairseq/examples/MMPT/scripts/video_feature_extractor/pathbuilder.py +89 -0
  23. fairseq/examples/MMPT/scripts/video_feature_extractor/preprocessing.py +57 -0
  24. fairseq/examples/MMPT/scripts/video_feature_extractor/random_sequence_shuffler.py +29 -0
  25. fairseq/examples/MMPT/scripts/video_feature_extractor/shard_feature.py +64 -0
  26. fairseq/examples/MMPT/scripts/video_feature_extractor/videoreader.py +242 -0
  27. fairseq/examples/MMPT/videoclip.png +3 -0
  28. fairseq/examples/MMPT/vlm.png +3 -0
  29. fairseq/examples/adaptive_span/README.md +90 -0
  30. fairseq/examples/adaptive_span/__init__.py +19 -0
  31. fairseq/examples/adaptive_span/adagrad_with_grad_clip.py +128 -0
  32. fairseq/examples/adaptive_span/adaptive_span_attention.py +160 -0
  33. fairseq/examples/adaptive_span/adaptive_span_loss.py +107 -0
  34. fairseq/examples/adaptive_span/adaptive_span_model.py +263 -0
  35. fairseq/examples/adaptive_span/adaptive_span_model_wrapper.py +145 -0
  36. fairseq/examples/adaptive_span/truncated_bptt_lm_task.py +285 -0
  37. fairseq/examples/attention_head_selection/README.md +161 -0
  38. fairseq/examples/attention_head_selection/src/__init__.py +0 -0
  39. fairseq/examples/attention_head_selection/src/data/__init__.py +0 -0
  40. fairseq/examples/attention_head_selection/src/data/speech_to_text_dataset_with_domain.py +242 -0
  41. fairseq/examples/attention_head_selection/src/loss/__init__.py +0 -0
  42. fairseq/examples/attention_head_selection/src/loss/attention_head_selection.py +27 -0
  43. fairseq/examples/attention_head_selection/src/models/__init__.py +0 -0
  44. fairseq/examples/attention_head_selection/src/models/head_selection_s2t_transformer.py +170 -0
  45. fairseq/examples/attention_head_selection/src/models/head_selection_transformer.py +215 -0
  46. fairseq/examples/attention_head_selection/src/modules/__init__.py +0 -0
  47. fairseq/examples/attention_head_selection/src/modules/attn_head_selector.py +81 -0
  48. fairseq/examples/attention_head_selection/src/modules/head_selection_transformer_layer.py +92 -0
  49. fairseq/examples/attention_head_selection/src/modules/multihead_attention_selection.py +355 -0
  50. fairseq/examples/attention_head_selection/src/modules/multihead_functional.py +278 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ fairseq/examples/MMPT/vlm.png filter=lfs diff=lfs merge=lfs -text
+ fairseq/examples/MMPT/videoclip.png filter=lfs diff=lfs merge=lfs -text
fairseq/examples/MMPT/projects/retri/videoclip/test_didemo_zs.yaml ADDED
@@ -0,0 +1,31 @@
+ slurm_config: big
+ task_type: local_predict
+ dataset:
+   split: test
+   video_processor: VideoProcessor
+   aligner: DiDeMoAligner
+   bert_name: bert-base-uncased
+   meta_processor: DiDeMoMetaProcessor
+   test_path: data/didemo/test_data.json
+   vfeat_dir: data/feat/feat_didemo_s3d
+   text_processor: DiDeMoTextProcessor
+   num_iso_layer: 12
+   max_video_len: 32
+   max_len: 96
+ fairseq:
+   dataset:
+     batch_size: 256
+     valid_subset: test
+     num_workers: 2
+   common_eval:
+     path: runs/retri/videoclip/checkpoint_best.pt
+ model:
+   model_cls: MMFusionSeparate
+   mm_encoder_cls: null
+   video_encoder_cls: MMBertForEncoder
+   text_encoder_cls: BertModel
+   num_hidden_video_layers: 6
+ eval:
+   save_path: runs/retri/videoclip/didemo_zs/eval
+ metric: DiDeMoMetric
+ predictor: DiDeMoPredictor
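A quick way to sanity-check a config like the one above is a small sketch (assuming it is run from `fairseq/examples/MMPT`, where `mmpt` and the `projects/` tree are importable; `recursive_config` is the same loader used by `scripts/text_token_extractor/pretokenization.py` later in this commit):

```python
# Minimal sketch: load the zero-shot DiDeMo config added above and inspect it.
from mmpt.utils import recursive_config

config = recursive_config("projects/retri/videoclip/test_didemo_zs.yaml")
print(config.task_type)                 # local_predict
print(config.dataset.meta_processor)    # DiDeMoMetaProcessor
print(config.fairseq.common_eval.path)  # runs/retri/videoclip/checkpoint_best.pt
```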
fairseq/examples/MMPT/projects/retri/videoclip/vttqa_videoclip.yaml ADDED
@@ -0,0 +1,49 @@
+ dataset:
+   video_processor: VideoProcessor
+   bert_name: bert-base-uncased
+   meta_processor: MSRVTTMetaProcessor
+   train_path: data/msrvtt/MSRVTT_train.csv
+   dup: 20
+   val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
+   vfeat_dir: data/feat/feat_vtt_s3d
+   text_processor: MSRVTTTextProcessor
+   json_path: data/msrvtt/MSRVTT_data.json
+   aligner: DSAligner
+   num_iso_layer: 12
+   max_video_len: 32
+   max_len: 96
+ fairseq:
+   common:
+     tensorboard_logdir: run
+     log_interval: 1000
+     fp16: true
+   dataset:
+     num_workers: 4
+     batch_size: 128
+   optimization:
+     lr:
+     - 5.0e-05
+     clip_norm: 2.0
+     optimizer: adam
+     adam_betas: (0.9, 0.98)
+     lr_scheduler: polynomial_decay
+     total_num_update: 1000000
+     warmup_updates: 122
+     weight_decay: 0.0
+     ddp_backend: no_c10d
+     max_epoch: 5
+   checkpoint:
+     restore_file: runs/retri/videoclip/checkpoint_best.pt
+     reset_optimizer: true
+     reset_dataloader: true
+     reset_meters: true
+     save_dir: runs/retri/videoclip/vttqa
+ task_type: sweep_small
+ model:
+   model_cls: MMFusionSeparate
+   mm_encoder_cls: null
+   video_encoder_cls: MMBertForEncoder
+   text_encoder_cls: BertModel
+   num_hidden_video_layers: 6
+ loss:
+   loss_cls: V2TContraLoss
fairseq/examples/MMPT/projects/retri/videoclip/youcook_videoclip.yaml ADDED
@@ -0,0 +1,49 @@
+ dataset:
+   video_processor: YoucookVideoProcessor
+   bert_name: bert-base-uncased
+   meta_processor: YoucookMetaProcessor
+   train_path: data/youcook/youcook_train.pkl
+   val_path: data/youcook/youcook_val.pkl
+   trainval_annotation: data/youcook/youcookii_annotations_trainval.json
+   use_annotation_text: true
+   vfeat_dir: data/feat/feat_youcook_s3d
+   text_processor: TextProcessor
+   aligner: DSAligner
+   num_iso_layer: 12
+   max_video_len: 32
+   max_len: 96
+ fairseq:
+   common:
+     tensorboard_logdir: run
+     log_interval: 1000
+     fp16: true
+   dataset:
+     num_workers: 4
+     batch_size: 128
+   optimization:
+     lr:
+     - 5.0e-05
+     clip_norm: 2.0
+     optimizer: adam
+     adam_betas: (0.9, 0.98)
+     lr_scheduler: polynomial_decay
+     total_num_update: 1000000
+     warmup_updates: 122
+     weight_decay: 0.0
+     ddp_backend: no_c10d
+     max_epoch: 10
+   checkpoint:
+     restore_file: runs/retri/videoclip/checkpoint_best.pt
+     reset_optimizer: true
+     reset_dataloader: true
+     reset_meters: true
+     save_dir: runs/retri/videoclip/youcook
+ task_type: sweep_small
+ model:
+   model_cls: MMFusionSeparate
+   mm_encoder_cls: null
+   video_encoder_cls: MMBertForEncoder
+   text_encoder_cls: BertModel
+   num_hidden_video_layers: 6
+ loss:
+   loss_cls: T2VContraLoss
fairseq/examples/MMPT/projects/task/coin.yaml ADDED
@@ -0,0 +1,25 @@
+ includes: projects/task/ft.yaml
+ task_type: sweep_big
+ dataset:
+   meta_processor: COINActionSegmentationMetaProcessor
+   train_path: data/coin/COIN.json
+   val_path: data/coin/COIN.json
+   vfeat_dir: data/feat/feat_coin_s3d
+   video_processor: VideoProcessor
+   text_processor: COINActionSegmentationTextProcessor
+   aligner: COINActionSegmentationAligner
+   num_iso_layer: 12
+   sliding_window: 8
+   sliding_window_size: 32
+ model:
+   model_cls: MMFusionActionSegmentation
+   mm_encoder_cls: MMBertForTokenClassification
+ loss:
+   loss_cls: CrossEntropy
+ fairseq:
+   dataset:
+     batch_size: 1
+   optimization:
+     max_epoch: 8
+   checkpoint:
+     save_dir: runs/task/coin
fairseq/examples/MMPT/projects/task/coin_videoclip.yaml ADDED
@@ -0,0 +1,7 @@
+ includes: projects/task/coin.yaml
+ model:
+   model_cls: MMFusionSeparateActionSegmentation
+   mm_encoder_cls:
+   video_encoder_cls: MMBertForTokenClassification
+   text_encoder_cls: BertModel # dummy, not used.
+   num_hidden_video_layers: 6
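The `*_videoclip.yaml` files in this commit are thin overrides layered on a base config through the `includes` key. A minimal sketch of how such an override can be resolved outside MMPT's own loader (this is not MMPT's code; it assumes `omegaconf` is installed and paths are relative to `fairseq/examples/MMPT`):

```python
# Hypothetical resolver for an `includes:` override such as coin_videoclip.yaml:
# load the child, load the base it points at, and merge the child on top.
from omegaconf import OmegaConf

child = OmegaConf.to_container(OmegaConf.load("projects/task/coin_videoclip.yaml"))
base = OmegaConf.load(child.pop("includes"))           # projects/task/coin.yaml
merged = OmegaConf.merge(base, OmegaConf.create(child))
print(merged.model.model_cls)     # MMFusionSeparateActionSegmentation (override)
print(merged.dataset.train_path)  # data/coin/COIN.json (inherited from the base)
```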
fairseq/examples/MMPT/projects/task/test.yaml ADDED
@@ -0,0 +1,13 @@
+ # this yaml cannot be run alone: implement a test_${dataset}.yaml
+ slurm_config: big
+ task_type: local_predict
+ dataset:
+   split: test
+   video_processor: VideoProcessor
+   aligner: DSAligner
+   bert_name: bert-base-uncased
+ fairseq:
+   dataset:
+     batch_size: 256
+     valid_subset: test
+     num_workers: 2
fairseq/examples/MMPT/projects/task/test_vtt.yaml ADDED
@@ -0,0 +1,19 @@
+ includes: projects/task/test.yaml
+ dataset:
+   meta_processor: MSRVTTMetaProcessor
+   test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
+   video_processor: VideoProcessor
+   vfeat_dir: data/feat/feat_vtt_s3d
+   text_processor: MSRVTTTextProcessor
+   num_iso_layer: 12
+ model:
+   model_cls: MMFusionJoint
+   mm_encoder_cls: MMBertForJoint
+ eval:
+   save_path: runs/task/vtt/eval
+ fairseq:
+   # read code and find what is the checkpoint arg.
+   common_eval:
+     path: runs/task/vtt/checkpoint_last.pt
+ metric: RetrievalMetric
+ predictor: RetrievalPredictor
fairseq/examples/MMPT/projects/task/test_youcook.yaml ADDED
@@ -0,0 +1,22 @@
+ includes: projects/task/test.yaml
+ dataset:
+   meta_processor: YoucookMetaProcessor
+   test_path: data/youcook/youcook_val.pkl
+   trainval_annotation: data/youcook/youcookii_annotations_trainval.json
+   use_annotation_text: True
+   video_processor: YoucookVideoProcessor
+   vfeat_dir: data/feat/feat_youcook_s3d # /checkpoint/huxu/feat/youcook_vmz # /checkpoint/prarora/berniehuang/feat_youcook_vmz
+   text_processor: TextProcessor
+   aligner: DSAligner
+   num_iso_layer: 12
+ model:
+   model_cls: MMFusionJoint
+   mm_encoder_cls: MMBertForJoint
+ eval:
+   save_path: runs/task/youcook/eval
+ fairseq:
+   # read code and find what is the checkpoint arg.
+   common_eval:
+     path: runs/task/youcook/checkpoint_last.pt
+ metric: RetrievalMetric
+ predictor: RetrievalPredictor
fairseq/examples/MMPT/projects/task/test_youcookcap.yaml ADDED
@@ -0,0 +1,23 @@
+ includes: projects/task/test.yaml
+ dataset:
+   meta_processor: YoucookNLGMetaProcessor
+   test_path: data/youcook/val_list.txt
+   trainval_annotation: data/youcook/youcookii_annotations_trainval.json
+   video_processor: YoucookVideoProcessor
+   vfeat_dir: data/feat/feat_youcook_s3d
+   text_processor: NLGTextProcessor
+   aligner: DSNLGAligner
+ model:
+   model_cls: MMFusionNLG
+   mm_encoder_cls: MMBertForNLG
+   max_decode_length: 24
+ eval:
+   save_path: runs/task/youcookcap/eval
+ fairseq:
+   # read code and find what is the checkpoint arg.
+   common_eval:
+     path: runs/task/youcookcap/checkpoint_best.pt
+ metric: NLGMetric
+ predictor: NLGPredictor
+ gen_param:
+   num_beams: 5
fairseq/examples/MMPT/projects/task/vtt.yaml ADDED
@@ -0,0 +1,25 @@
+ includes: projects/task/ft.yaml
+ dataset:
+   meta_processor: MSRVTTMetaProcessor
+   train_path: data/msrvtt/MSRVTT_train.csv
+   jsfusion_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
+   full_test_path: data/msrvtt/MSRVTT_FULL_test.csv
+   dup: 20
+   val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
+   vfeat_dir: data/feat/feat_vtt_s3d
+   text_processor: MSRVTTTextProcessor
+   json_path: data/msrvtt/MSRVTT_data.json
+   aligner: DSAligner
+   num_iso_layer: 12
+ model:
+   model_cls: MMFusionJoint
+   mm_encoder_cls: MMBertForJoint
+ loss:
+   loss_cls: T2VContraLoss
+ fairseq:
+   dataset:
+     batch_size: 256
+   optimization:
+     max_epoch: 10
+   checkpoint:
+     save_dir: runs/task/vtt
fairseq/examples/MMPT/projects/task/vtt_videoclip.yaml ADDED
@@ -0,0 +1,12 @@
+ includes: projects/task/vtt.yaml
+ model:
+   model_cls: MMFusionSeparate
+   mm_encoder_cls:
+   video_encoder_cls: MMBertForEncoder
+   text_encoder_cls: BertModel
+   num_hidden_video_layers: 6
+ fairseq:
+   dataset:
+     batch_size: 224
+ # model_cls: MMFusionShare
+ # mm_encoder_cls: MMBertForEncoder
fairseq/examples/MMPT/projects/task/vttqa_videoclip.yaml ADDED
@@ -0,0 +1,10 @@
+ includes: projects/task/vttqa.yaml
+ model:
+   model_cls: MMFusionSeparate
+   mm_encoder_cls:
+   video_encoder_cls: MMBertForEncoder
+   text_encoder_cls: BertModel
+   num_hidden_video_layers: 6
+
+ # model_cls: MMFusionShare
+ # mm_encoder_cls: MMBertForEncoder
fairseq/examples/MMPT/projects/task/youcook.yaml ADDED
@@ -0,0 +1,25 @@
+ includes: projects/task/ft.yaml
+ dataset:
+   meta_processor: YoucookMetaProcessor
+   train_path: data/youcook/youcook_train.pkl
+   val_path: data/youcook/youcook_val.pkl
+   trainval_annotation: data/youcook/youcookii_annotations_trainval.json
+   use_annotation_text: True
+   video_processor: YoucookVideoProcessor
+   vfeat_dir: data/feat/feat_youcook_s3d # /checkpoint/huxu/feat/youcook_vmz # /checkpoint/prarora/berniehuang/feat_youcook_vmz
+   text_processor: TextProcessor
+   aligner: DSAligner
+   num_iso_layer: 12
+ model:
+   model_cls: MMFusionJoint
+   mm_encoder_cls: MMBertForJoint
+ loss:
+   loss_cls: T2VContraLoss
+ fairseq:
+   dataset:
+     batch_size: 128
+   optimization:
+     max_epoch: 10
+   checkpoint:
+     save_dir: runs/task/youcook
+
fairseq/examples/MMPT/projects/task/youcook_videoclip.yaml ADDED
@@ -0,0 +1,9 @@
+ includes: projects/task/youcook.yaml
+ model:
+   model_cls: MMFusionSeparate
+   mm_encoder_cls:
+   video_encoder_cls: MMBertForEncoder
+   text_encoder_cls: BertModel
+   num_hidden_video_layers: 6
+ # model_cls: MMFusionShare
+ # mm_encoder_cls: MMBertForEncoder
fairseq/examples/MMPT/projects/task/youcookcap.yaml ADDED
@@ -0,0 +1,23 @@
+ # finetuning for youcook captioning.
+ includes: projects/task/ft.yaml
+ dataset:
+   meta_processor: YoucookNLGMetaProcessor
+   train_path: data/youcook/train_list.txt
+   val_path: data/youcook/val_list.txt
+   trainval_annotation: data/youcook/youcookii_annotations_trainval.json
+   video_processor: YoucookVideoProcessor
+   vfeat_dir: data/feat/feat_youcook_s3d
+   text_processor: NLGTextProcessor
+   aligner: DSNLGAligner
+ model:
+   model_cls: MMFusionNLG
+   mm_encoder_cls: MMBertForNLG
+ loss:
+   loss_cls: NLGLoss
+ fairseq:
+   dataset:
+     batch_size: 128
+   optimization:
+     max_epoch: 10
+   checkpoint:
+     save_dir: runs/task/youcookcap
fairseq/examples/MMPT/scripts/text_token_extractor/configs/bert-base-uncased.yaml ADDED
@@ -0,0 +1,5 @@
+ dataset:
+   bert_name: bert-base-uncased
+   caption_pkl_path: data/how2/raw_caption_dedup.pkl
+   use_fast: true
+   target_dir: data/feat/feat_how2_s3d_shard_small
fairseq/examples/MMPT/scripts/text_token_extractor/pretokenization.py ADDED
@@ -0,0 +1,106 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import pickle
+ import os
+ import argparse
+ import numpy as np
+
+ from torch.utils.data import Dataset, DataLoader
+ from mmpt.processors import PKLJSONStrTextProcessor
+ from mmpt.utils import ShardedTensor, recursive_config
+
+
+ class TokenizerDataset(Dataset):
+     def __init__(self, config):
+         self.text_processor = PKLJSONStrTextProcessor(config)
+         self.video_ids = list(self.text_processor.data.keys())
+
+     def __getitem__(self, idx):
+         video_id = self.video_ids[idx]
+         return video_id, self.text_processor(video_id)
+
+     def __len__(self):
+         return len(self.video_ids)
+
+
+ def numpify(shard_idx, video_ids, captions, target_dir, split, prefix, max_cap_len=32):
+     startends = []
+     caps_ids = []
+     for video_id in video_ids:
+         caption = captions[video_id]
+         startend = []
+         cap_ids = []
+         for start, end, cap in zip(
+                 caption["start"], caption["end"], caption["cap"]):
+             startend.append(np.array([start, end]).astype("float32"))
+             cap_id = np.full((max_cap_len,), -1, dtype=np.int32)
+             cap = cap[:max_cap_len]
+             cap_id[:len(cap)] = cap
+             cap_ids.append(cap_id)
+         startends.append(np.stack(startend))
+         caps_ids.append(np.stack(cap_ids))
+
+     startends = ShardedTensor.from_list(startends)
+     target_path = os.path.join(
+         target_dir,
+         prefix + split + "_" + str(shard_idx)
+     )
+     print("save to", target_path)
+     startends.save(target_path + ".startends")
+     caps_ids = ShardedTensor.from_list(caps_ids)
+     caps_ids.save(target_path + ".caps_ids")
+
+
+ def sharding(config, out_file):
+     with open(out_file, "rb") as fr:
+         captions = pickle.load(fr)
+     target_dir = config.target_dir
+     prefix = os.path.basename(
+         os.path.splitext(config.caption_pkl_path)[0]
+     ) + "." + config.bert_name + "."
+     for split in ["train", "val"]:
+         target_path = os.path.join(target_dir, split + "_meta")
+         with open(target_path + ".pkl", "rb") as fr:
+             meta = pickle.load(fr)
+         print("load meta", target_path, len(meta))
+         for shard_id in meta:
+             numpify(
+                 shard_id, meta[shard_id], captions,
+                 target_dir, split, prefix
+             )
+
+
+ def tokenize(config, out_file):
+     def collator(samples):
+         return samples
+     dataset = TokenizerDataset(config)
+     data = {}
+     for idx, batch in enumerate(
+             DataLoader(dataset, collate_fn=collator, num_workers=16)):
+         for video_id, caption in batch:
+             data[video_id] = caption
+         if idx % 5000 == 0:
+             print(idx)
+     with open(out_file, "wb") as fw:
+         pickle.dump(data, fw, pickle.HIGHEST_PROTOCOL)
+
+
+ def main(args):
+     config = recursive_config(args.config).dataset
+
+     out_file = os.path.splitext(config.caption_pkl_path)[0] \
+         + "." + config.bert_name + ".pkl"
+     if not os.path.isfile(out_file):
+         tokenize(config, out_file)
+     sharding(config, out_file)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="pretokenize (raw_)caption.json into pkl.")
+     parser.add_argument('config', type=str)
+     args = parser.parse_args()
+     main(args)
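The script above takes a single positional config argument. A sketch of driving it with the `bert-base-uncased.yaml` added earlier in this commit (paths assumed relative to `fairseq/examples/MMPT`):

```python
# Sketch: invoke the pretokenizer exactly as its argparse interface expects,
# pointing it at the text_token_extractor config from this commit.
import subprocess

subprocess.run(
    [
        "python",
        "scripts/text_token_extractor/pretokenization.py",
        "scripts/text_token_extractor/configs/bert-base-uncased.yaml",
    ],
    check=True,
)
```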
fairseq/examples/MMPT/scripts/video_feature_extractor/extract.py ADDED
@@ -0,0 +1,157 @@
+ # Copyright Howto100M authors.
+ # Copyright (c) Facebook, Inc. All Rights Reserved
+
+ import torch as th
+ import torch.nn.functional as F
+ import math
+ import numpy as np
+ import argparse
+
+ from torch.utils.data import DataLoader
+ from model import get_model
+ from preprocessing import Preprocessing
+ from random_sequence_shuffler import RandomSequenceSampler
+
+ from tqdm import tqdm
+ from pathbuilder import PathBuilder
+ from videoreader import VideoLoader
+
+
+ parser = argparse.ArgumentParser(description='Easy video feature extractor')
+
+ parser.add_argument('--vdir', type=str)
+ parser.add_argument('--fdir', type=str)
+ parser.add_argument('--hflip', type=int, default=0)
+
+ parser.add_argument('--batch_size', type=int, default=64,
+                     help='batch size')
+ parser.add_argument('--type', type=str, default='2d',
+                     help='CNN type')
+ parser.add_argument('--half_precision', type=int, default=0,
+                     help='output half precision float')
+ parser.add_argument('--num_decoding_thread', type=int, default=4,
+                     help='Num parallel thread for video decoding')
+ parser.add_argument('--l2_normalize', type=int, default=1,
+                     help='l2 normalize feature')
+ parser.add_argument('--resnext101_model_path', type=str, default='model/resnext101.pth',
+                     help='Resnext model path')
+ parser.add_argument('--vmz_model_path', type=str, default='model/r2plus1d_34_clip8_ig65m_from_scratch-9bae36ae.pth',
+                     help='vmz model path')
+
+ args = parser.parse_args()
+
+
+ # TODO: refactor all args into config. (current code is from different people.)
+ CONFIGS = {
+     "2d": {
+         "fps": 1,
+         "size": 224,
+         "centercrop": False,
+         "shards": 0,
+     },
+     "3d": {
+         "fps": 24,
+         "size": 112,
+         "centercrop": True,
+         "shards": 0,
+     },
+     "s3d": {
+         "fps": 30,
+         "size": 224,
+         "centercrop": True,
+         "shards": 0,
+     },
+     "vmz": {
+         "fps": 24,
+         "size": 112,
+         "centercrop": True,
+         "shards": 0,
+     },
+     "vae": {
+         "fps": 2,
+         "size": 256,
+         "centercrop": True,
+         "shards": 100,
+     }
+ }
+
+ config = CONFIGS[args.type]
+
+
+ video_dirs = args.vdir
+ feature_dir = args.fdir
+
+ video_dict = PathBuilder.build(video_dirs, feature_dir, ".npy", config["shards"])
+
+ dataset = VideoLoader(
+     video_dict=video_dict,
+     framerate=config["fps"],
+     size=config["size"],
+     centercrop=config["centercrop"],
+     hflip=args.hflip
+ )
+ n_dataset = len(dataset)
+ sampler = RandomSequenceSampler(n_dataset, 10)
+ loader = DataLoader(
+     dataset,
+     batch_size=1,
+     shuffle=False,
+     num_workers=args.num_decoding_thread,
+     sampler=sampler if n_dataset > 10 else None,
+ )
+ preprocess = Preprocessing(args.type)
+ model = get_model(args)
+
+ with th.no_grad():
+     for k, data in tqdm(enumerate(loader), total=loader.__len__(), ascii=True):
+         input_file = data['input'][0]
+         output_file = data['output'][0]
+         if len(data['video'].shape) > 3:
+             video = data['video'].squeeze()
+             if len(video.shape) == 4:
+                 video = preprocess(video)
+                 n_chunk = len(video)
+                 if args.type == 'vmz':
+                     n_chunk = math.ceil(n_chunk/float(3))
+                     features = th.cuda.FloatTensor(n_chunk, 512).fill_(0)
+                 elif args.type == 's3d':
+                     features = th.cuda.FloatTensor(n_chunk, 512).fill_(0)
+                 elif args.type == "vae":
+                     features = th.cuda.LongTensor(n_chunk, 1024).fill_(0)
+                 else:
+                     features = th.cuda.FloatTensor(n_chunk, 2048).fill_(0)
+                 n_iter = int(math.ceil(n_chunk / float(args.batch_size)))
+                 for i in range(n_iter):
+                     factor = 1
+                     if args.type == 'vmz':
+                         factor = 3
+                     min_ind = factor * i * args.batch_size
+                     max_ind = factor * (i + 1) * args.batch_size
+                     video_batch = video[min_ind:max_ind:factor].cuda()
+                     if args.type == '2d':
+                         batch_features = model(video_batch)  # (51, 487), (51, 512)
+                     elif args.type == 's3d':
+                         batch_features = model(video_batch)
+                         batch_features = batch_features['video_embedding']
+                     elif args.type == "vae":
+                         # image_code.
+                         batch_features = model(video_batch)
+                     else:
+                         batch_pred, batch_features = model(video_batch)  # (51, 487), (51, 512)
+                     if args.l2_normalize:
+                         batch_features = F.normalize(batch_features, dim=1)
+                     features[i*args.batch_size:(i+1)*args.batch_size] = batch_features
+                 features = features.cpu().numpy()
+                 if args.half_precision:
+                     if args.type == "vae":
+                         features = features.astype(np.int16)
+                     else:
+                         features = features.astype('float16')
+                 else:
+                     if args.type == "vae":
+                         features = features.astype(np.int32)
+                     else:
+                         features = features.astype('float32')
+                 np.save(output_file, features)
+         else:
+             print('Video {} error.'.format(input_file))
fairseq/examples/MMPT/scripts/video_feature_extractor/how2/s3d.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/bash
+
+
+ python scripts/video_feature_extractor/extract.py \
+   --vdir <path_to_video_folder> \
+   --fdir data/feat/feat_how2_s3d \
+   --type=s3d --num_decoding_thread=4 \
+   --batch_size 32 --half_precision 1
fairseq/examples/MMPT/scripts/video_feature_extractor/model.py ADDED
@@ -0,0 +1,58 @@
+ # Copyright (c) Howto100M authors and Facebook, Inc. All Rights Reserved
+
+ import torch as th
+
+ from torch import nn
+
+
+ class GlobalAvgPool(nn.Module):
+     def __init__(self):
+         super(GlobalAvgPool, self).__init__()
+
+     def forward(self, x):
+         return th.mean(x, dim=[-2, -1])
+
+
+ def get_model(args):
+     assert args.type in ['2d', '3d', 'vmz', 's3d', 'vae']
+     if args.type == '2d':
+         print('Loading 2D-ResNet-152 ...')
+         import torchvision.models as models
+         model = models.resnet152(pretrained=True)
+         model = nn.Sequential(*list(model.children())[:-2], GlobalAvgPool())
+         model = model.cuda()
+     elif args.type == 'vmz':
+         print('Loading VMZ ...')
+         from vmz34 import r2plus1d_34
+         model = r2plus1d_34(pretrained_path=args.vmz_model_path, pretrained_num_classes=487)
+         model = model.cuda()
+     elif args.type == 's3d':
+         # we use one copy of s3d instead of dup another one for feature extraction.
+         from mmpt.processors.models.s3dg import S3D
+         model = S3D('pretrained_models/s3d_dict.npy', 512)
+         model.load_state_dict(th.load('pretrained_models/s3d_howto100m.pth'))
+         model = model.cuda()
+
+     elif args.type == '3d':
+         print('Loading 3D-ResneXt-101 ...')
+         from videocnn.models import resnext
+         model = resnext.resnet101(
+             num_classes=400,
+             shortcut_type='B',
+             cardinality=32,
+             sample_size=112,
+             sample_duration=16,
+             last_fc=False)
+         model = model.cuda()
+         model_data = th.load(args.resnext101_model_path)
+         model.load_state_dict(model_data)
+     elif args.type == 'vae':
+         from openaivae import OpenAIParallelDiscreteVAE
+         model = OpenAIParallelDiscreteVAE()
+         model = model.cuda()
+     else:
+         raise ValueError("model not supported yet.")
+
+     model.eval()
+     print('loaded')
+     return model
fairseq/examples/MMPT/scripts/video_feature_extractor/pathbuilder.py ADDED
@@ -0,0 +1,89 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+ import os
+ import urllib.parse
+ import json
+ import pandas as pd
+
+ from tqdm import tqdm
+
+
+ # TODO: extending to other datasets.
+ supported_formats = {}
+
+
+ class PathBuilder(object):
+     @classmethod
+     def build(cls, video_dirs, feature_dir, ext, shards=0, split=None):
+         meta_fn = os.path.join(feature_dir, "meta_plan.json")
+         os.makedirs(feature_dir, exist_ok=True)
+         if os.path.isfile(meta_fn):
+             with open(meta_fn) as fr:
+                 meta = json.load(fr)
+             return meta
+         print("searching videos...")
+
+         video_id_to_path = {}
+         for video_dir in video_dirs.split(","):
+             # TODO: add supports of recursive listdir.
+             if video_dir in supported_formats:
+                 supported_formats[video_dir].load(video_dir, video_id_to_path)
+             else:
+                 for idx, fn in enumerate(tqdm(os.listdir(video_dir))):
+                     video_fn = os.path.join(video_dir, fn)
+                     if os.path.isfile(video_fn):
+                         video_id = os.path.splitext(fn)[0]
+                         video_id_to_path[video_id] = video_fn
+                     elif os.path.isdir(video_fn):
+                         # shards of folders.
+                         shard_dir = video_fn
+                         for idx, fn in enumerate(os.listdir(shard_dir)):
+                             video_fn = os.path.join(shard_dir, fn)
+                             if os.path.isfile(video_fn):
+                                 video_id = os.path.splitext(fn)[0]
+                                 video_id_to_path[video_id] = video_fn
+
+         video_path, feature_path = [], []
+         valid_ext = set()
+         for idx, video_id in enumerate(video_id_to_path):
+             video_path.append(video_id_to_path[video_id])
+             if ext is None:
+                 # use original file ext for format compatibility.
+                 video_id_to_path[video_id]
+                 path = urllib.parse.urlparse(video_id_to_path[video_id]).path
+                 ext = os.path.splitext(path)[1]
+             if ext not in valid_ext:
+                 valid_ext.add(ext)
+                 print("adding", ext)
+             if shards:
+                 shard_id = str(idx % shards)
+                 feature_fn = os.path.join(
+                     feature_dir, shard_id, video_id + ext)
+             else:
+                 feature_fn = os.path.join(
+                     feature_dir, video_id + ext)
+             feature_path.append(feature_fn)
+
+         print("targeting", len(feature_path), "videos")
+         meta = {
+             "video_path": video_path, "feature_path": feature_path}
+         with open(meta_fn, "w") as fw:
+             json.dump(meta, fw)
+
+         if split is not None:
+             splits = split.split("/")
+             assert len(splits) == 2
+             cur, total = int(splits[0]), int(splits[1])
+             assert cur < total
+             import math
+             chunk = math.ceil(len(meta["video_path"]) / total)
+             start = cur * chunk
+             end = (cur + 1) * chunk
+             meta = {
+                 "video_path": meta["video_path"][start:end],
+                 "feature_path": meta["feature_path"][start:end]
+             }
+
+         return meta
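A sketch of calling `PathBuilder` directly, mirroring how `extract.py` uses it below; the input video directories are placeholders, not real paths:

```python
# Sketch mirroring extract.py: plan output .npy paths for every video found
# under the (hypothetical) input directories; the plan is cached as
# meta_plan.json inside feature_dir on the first call.
from pathbuilder import PathBuilder

meta = PathBuilder.build(
    "<path_to_video_folder_a>,<path_to_video_folder_b>",  # comma-separated dirs
    "data/feat/feat_how2_s3d",
    ".npy",
    shards=0,
)
print(len(meta["video_path"]), "videos planned")
```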
fairseq/examples/MMPT/scripts/video_feature_extractor/preprocessing.py ADDED
@@ -0,0 +1,57 @@
+ # Copyright Howto100m authors.
+ # Copyright (c) Facebook, Inc. All Rights Reserved
+
+ import torch as th
+
+ class Normalize(object):
+
+     def __init__(self, mean, std):
+         self.mean = th.FloatTensor(mean).view(1, 3, 1, 1)
+         self.std = th.FloatTensor(std).view(1, 3, 1, 1)
+
+     def __call__(self, tensor):
+         tensor = (tensor - self.mean) / (self.std + 1e-8)
+         return tensor
+
+ class Preprocessing(object):
+
+     def __init__(self, type):
+         self.type = type
+         if type == '2d':
+             self.norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+         elif type == '3d':
+             self.norm = Normalize(mean=[110.6, 103.2, 96.3], std=[1.0, 1.0, 1.0])
+         elif type == 'vmz':
+             self.norm = Normalize(mean=[110.201, 100.64, 95.997], std=[58.1489, 56.4701, 55.3324])
+
+     def _zero_pad(self, tensor, size):
+         n = size - len(tensor) % size
+         if n == size:
+             return tensor
+         else:
+             z = th.zeros(n, tensor.shape[1], tensor.shape[2], tensor.shape[3])
+             return th.cat((tensor, z), 0)
+
+     def __call__(self, tensor):
+         if self.type == '2d':
+             tensor = tensor / 255.0
+             tensor = self.norm(tensor)
+         elif self.type == 'vmz':
+             #tensor = self._zero_pad(tensor, 8)
+             tensor = self._zero_pad(tensor, 10)
+             tensor = self.norm(tensor)
+             #tensor = tensor.view(-1, 8, 3, 112, 112)
+             tensor = tensor.view(-1, 10, 3, 112, 112)
+             tensor = tensor.transpose(1, 2)
+         elif self.type == '3d':
+             tensor = self._zero_pad(tensor, 16)
+             tensor = self.norm(tensor)
+             tensor = tensor.view(-1, 16, 3, 112, 112)
+             tensor = tensor.transpose(1, 2)
+         elif self.type == 's3d':
+             tensor = tensor / 255.0
+             tensor = self._zero_pad(tensor, 30)
+             tensor = tensor.view(-1, 30, 3, 224, 224)  # N x 30 x 3 x H x W
+             tensor = tensor.transpose(1, 2)  # N x 3 x 30 x H x W
+         # for vae do nothing
+         return tensor
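A small shape check of the s3d branch above (a sketch; random frames stand in for decoded video):

```python
# The s3d path zero-pads the frame count to a multiple of 30 and regroups
# frames into clips: (T, 3, 224, 224) -> (N, 3, 30, 224, 224).
import torch as th
from preprocessing import Preprocessing

frames = th.randint(0, 256, (45, 3, 224, 224)).float()  # fake decoded frames
clips = Preprocessing("s3d")(frames)                     # pads 45 -> 60 frames
print(tuple(clips.shape))                                # (2, 3, 30, 224, 224)
```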
fairseq/examples/MMPT/scripts/video_feature_extractor/random_sequence_shuffler.py ADDED
@@ -0,0 +1,29 @@
+ # Copyright (c) Facebook, Inc. All Rights Reserved
+
+ import numpy as np
+
+ from torch.utils.data.sampler import Sampler
+
+
+ class RandomSequenceSampler(Sampler):
+
+     def __init__(self, n_sample, seq_len):
+         self.n_sample = n_sample
+         self.seq_len = seq_len
+
+     def _pad_ind(self, ind):
+         zeros = np.zeros(self.seq_len - self.n_sample % self.seq_len)
+         ind = np.concatenate((ind, zeros))
+         return ind
+
+     def __iter__(self):
+         idx = np.arange(self.n_sample)
+         if self.n_sample % self.seq_len != 0:
+             idx = self._pad_ind(idx)
+         idx = np.reshape(idx, (-1, self.seq_len))
+         np.random.shuffle(idx)
+         idx = np.reshape(idx, (-1))
+         return iter(idx.astype(int))
+
+     def __len__(self):
+         return self.n_sample + (self.seq_len - self.n_sample % self.seq_len)
fairseq/examples/MMPT/scripts/video_feature_extractor/shard_feature.py ADDED
@@ -0,0 +1,64 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+ import numpy as np
+ import os
+ import pickle
+
+ from mmpt.utils import ShardedTensor
+
+
+ class Shard(object):
+     def __init__(
+         self,
+         vfeat_dir,
+         tfeat_dir,
+         target_dir,
+         file_paths,
+         shard_size=4096
+     ):
+         self.vfeat_dir = vfeat_dir
+         self.tfeat_dir = tfeat_dir
+         self.target_dir = target_dir
+         self.video_ids = {}
+         for split, file_path in zip(["train", "val"], file_paths):
+             with open(file_path) as fr:
+                 self.video_ids[split] = [
+                     line.strip() for line in fr.readlines()]
+         self.shard_size = shard_size
+
+     def __call__(self, split="train"):
+         for split in ["train", "val"]:
+             meta = {}
+             for shard_idx, shard_offset in enumerate(
+                 range(0, len(self.video_ids[split]), self.shard_size)
+             ):
+                 print(shard_idx)
+                 meta_shard = []
+                 video_shard = []
+                 for video_id in self.video_ids[split][shard_offset:shard_offset+self.shard_size]:
+                     meta_shard.append(video_id)
+                     npy_file = os.path.join(self.vfeat_dir, video_id + ".npy")
+                     video_shard.append(np.load(npy_file))
+
+                 meta[shard_idx] = meta_shard
+                 video_shard = ShardedTensor.from_list(video_shard)
+                 target_path = os.path.join(
+                     self.target_dir, split + "_" + str(shard_idx))
+                 video_shard.save(target_path)
+
+             target_path = os.path.join(self.target_dir, split + "_meta")
+             with open(target_path + ".pkl", "wb") as fw:
+                 pickle.dump(meta, fw, pickle.HIGHEST_PROTOCOL)
+
+
+ if __name__ == "__main__":
+     shard = Shard(
+         "data/feat/feat_how2_s3d",
+         "data/how2/raw_caption_dedup.bert-base-uncased",
+         "data/feat/feat_how2_s3d_shard_small",
+         ["data/how2/how2_s3d_train.lst", "data/how2/how2_s3d_val.lst"]
+     )
+
+     shard()
fairseq/examples/MMPT/scripts/video_feature_extractor/videoreader.py ADDED
@@ -0,0 +1,242 @@
+ # Copyright Howto100M authors.
+ # Copyright (c) Facebook, Inc. All Rights Reserved
+
+ import torch as th
+ import pandas as pd
+ import os
+ import numpy as np
+ import ffmpeg
+ import random
+
+ from torch.utils.data import Dataset
+
+
+ class VideoLoader(Dataset):
+     """modified from how2's video_feature_extractor."""
+     def __init__(
+         self,
+         csv=None,
+         video_dict=None,
+         framerate=1,
+         size=112,
+         centercrop=False,
+         hflip=False,
+         **kwargs
+     ):
+         if csv is None and video_dict is None:
+             raise ValueError("csv and video_dict cannot be both None.")
+         if csv is not None:
+             self.csv = pd.read_csv(csv)
+         if video_dict is not None:
+             self.csv = pd.DataFrame.from_dict(video_dict)
+
+         self.centercrop = centercrop
+         self.size = size
+         self.framerate = framerate
+         self.hflip = hflip
+
+     def __len__(self):
+         return len(self.csv)
+
+     def _get_video_dim(self, video_path):
+         probe = ffmpeg.probe(video_path)
+         video_stream = next((stream for stream in probe['streams']
+                              if stream['codec_type'] == 'video'), None)
+         width = int(video_stream['width'])
+         height = int(video_stream['height'])
+         return height, width
+
+     def _get_video_info(self, video_path):
+         probe = ffmpeg.probe(video_path)
+         video_stream = next((stream for stream in probe['streams']
+                              if stream['codec_type'] == 'video'), None)
+         return video_stream
+
+     def _get_output_dim(self, h, w):
+         if isinstance(self.size, tuple) and len(self.size) == 2:
+             return self.size
+         elif h >= w:
+             return int(h * self.size / w), self.size
+         else:
+             return self.size, int(w * self.size / h)
+
+     def __getitem__(self, idx):
+         video_path = self.csv['video_path'].values[idx]
+         output_file = self.csv['feature_path'].values[idx]
+         return self._decode(output_file, video_path)
+
+     def _decode(self, output_file, video_path):
+         if not(os.path.isfile(output_file)) and os.path.isfile(video_path):
+             try:
+                 h, w = self._get_video_dim(video_path)
+             except Exception:
+                 print('ffprobe failed at: {}'.format(video_path))
+                 return {'video': th.zeros(1), 'input': video_path,
+                         'output': output_file}
+             try:
+                 os.makedirs(os.path.dirname(output_file), exist_ok=True)
+                 height, width = self._get_output_dim(h, w)
+
+                 cmd = (
+                     ffmpeg
+                     .input(video_path)
+                     .filter('fps', fps=self.framerate)
+                     .filter('scale', width, height)
+                 )
+                 if self.hflip:
+                     cmd = cmd.filter('hflip')
+
+                 if self.centercrop:
+                     x = int((width - self.size) / 2.0)
+                     y = int((height - self.size) / 2.0)
+                     cmd = cmd.crop(x, y, self.size, self.size)
+                 video = self._run(cmd, output_file)
+             except Exception:
+                 video = th.zeros(1)
+         else:
+             video = th.zeros(1)
+
+         return {'video': video, 'input': video_path, 'output': output_file}
+
+     def _run(self, cmd, output_file):
+         out, _ = (
+             cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24')
+             .run(capture_stdout=True, quiet=True)
+         )
+         if self.centercrop and isinstance(self.size, int):
+             height, width = self.size, self.size
+         video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
+         video = th.from_numpy(video.astype('float32'))
+         return video.permute(0, 3, 1, 2)
+
+
+ class VideoVerifier(VideoLoader):
+     def __getitem__(self, idx):
+         video_path = self.csv['video_path'].values[idx]
+         try:
+             return self._get_video_info(video_path)
+         except Exception:
+             # print('ffprobe failed at: {}'.format(video_path))
+             return None
+
+
+ class VideoCompressor(VideoLoader):
+     def __init__(
+         self,
+         csv=None,
+         video_dict=None,
+         framerate=1,
+         size=112,
+         centercrop=False,
+         hflip=False,
+         crf=32,
+         **kwargs
+     ):
+         super().__init__(
+             csv,
+             video_dict,
+             framerate,
+             size,
+             centercrop,
+             hflip
+         )
+         self.crf = crf
+
+     def _run(self, cmd, output_file):
+         out, _ = (
+             cmd.output(filename=output_file, crf=self.crf)
+             .run(quiet=True)
+         )
+         video = None
+         return video
+
+
+ class VideoDownloader(VideoCompressor):
+     """download"""
+     def __getitem__(self, idx):
+         video_path = self.csv['video_path'].values[idx]
+         output_file = self.csv['feature_path'].values[idx]
+         if not(os.path.isfile(output_file)):
+             os.makedirs(os.path.dirname(output_file), exist_ok=True)
+             cmd = "wget -O" + output_file + " " + video_path
+             # import subprocess
+             # subprocess.check_output(
+             #     cmd,
+             #     stderr=subprocess.STDOUT, shell=True)
+             os.system(cmd)
+         return {'video': None, 'input': video_path, 'output': output_file}
+
+
+ class AvKeyframeVideoCompressor(VideoLoader):
+     """extract keyframes from a video and save them as jpg.
+     TODO: consider to merge with `CodecProcessor`.
+     """
+     def __init__(
+         self,
+         csv=None,
+         video_dict=None,
+         framerate=1,
+         size=112,
+         centercrop=False,
+         max_num_frames=5,
+         **kwargs
+     ):
+         super().__init__(csv, video_dict, framerate, size, centercrop)
+         self.max_num_frames = max_num_frames
+
+     def _get_video_dim(self, video_fn):
+         """decord cannot probe the size of a video, we use pyav instead."""
+         import av
+         with av.open(video_fn) as container:
+             height = container.streams.video[0].codec_context.height
+             width = container.streams.video[0].codec_context.width
+         return height, width
+
+     def _get_output_dim(self, height, width):
+         """
+         keep the shorter side at `self.size` and stretch the other.
+         """
+         if height >= width:
+             return int(height * self.size / width), self.size
+         else:
+             return self.size, int(width * self.size / height)
+
+     def __getitem__(self, idx):
+         import av
+         video_path = self.csv['video_path'].values[idx]
+         output_file = self.csv['feature_path'].values[idx]
+         if not(os.path.isdir(output_file)) and os.path.isfile(video_path):
+             try:
+                 h, w = self._get_video_dim(video_path)
+             except Exception:
+                 print('probe failed at: {}'.format(video_path))
+                 return {'video': th.zeros(1), 'input': video_path,
+                         'output': output_file}
+
+             try:
+                 height, width = self._get_output_dim(h, w)
+
+                 # new for av.
+                 with av.open(video_path) as container:
+                     container.streams.video[0].thread_type = "AUTO"
+                     container.streams.video[0].codec_context.height = height
+                     container.streams.video[0].codec_context.width = width
+                     if self.framerate == 0:  # keyframe.
+                         container.streams.video[0].codec_context.skip_frame = 'NONKEY'
+                     frames = []
+                     for frame in container.decode(video=0):
+                         frames.append(frame)
+                     frames = random.sample(frames, self.max_num_frames)
+
+                 os.makedirs(output_file, exist_ok=True)
+                 for frame in frames:
+                     frame.to_image().save(
+                         os.path.join(
+                             output_file,
+                             "%04d.jpg" % frame.index))
+             except Exception:
+                 print('extract failed at: {}'.format(video_path))
+                 return {'video': th.zeros(1), 'input': video_path,
+                         'output': output_file}
+         video = th.zeros(1)
+         return {'video': video, 'input': video_path, 'output': output_file}
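Besides the `PathBuilder` dict used by `extract.py`, `VideoLoader` also accepts a csv with `video_path`/`feature_path` columns. A sketch (the csv filename is hypothetical):

```python
# Sketch: drive VideoLoader from a csv listing video_path / feature_path pairs.
from torch.utils.data import DataLoader
from videoreader import VideoLoader

loader = DataLoader(
    VideoLoader(csv="videos.csv", framerate=30, size=224, centercrop=True),
    batch_size=1,
    num_workers=2,
)
for batch in loader:
    # 'video' is a (T, 3, H, W) float tensor, or a th.zeros(1) stub on decode failure.
    print(batch["input"][0], tuple(batch["video"][0].shape))
```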
fairseq/examples/MMPT/videoclip.png ADDED

Git LFS Details

  • SHA256: 1d54fe18d1259ade9332e78fdb74f834fdfbdb0b0486517e6a7cd48956b30663
  • Pointer size: 131 Bytes
  • Size of remote file: 386 kB
fairseq/examples/MMPT/vlm.png ADDED

Git LFS Details

  • SHA256: 722852ed6258ac9f7ffd3e3913fa1a370702c4d989ef6d881847432d59ade4e5
  • Pointer size: 131 Bytes
  • Size of remote file: 418 kB
fairseq/examples/adaptive_span/README.md ADDED
@@ -0,0 +1,90 @@
+ # Adaptive Span
+
+ Adaptive Span is a novel self-attention mechanism that can learn its optimal
+ attention span. This allows us to significantly extend the maximum context size
+ used in Transformers, while maintaining control over their memory footprint
+ and computational time. It uses the Truncated BPTT technique for training,
+ as in [transformerXL](https://github.com/pytorch/fairseq/blob/main/examples/truncated_bptt/README.md).
+
+ Adaptive Span was introduced in the paper
+ [Adaptive Attention Span in Transformers](https://arxiv.org/abs/1905.07799),
+ which achieved state-of-the-art language modeling results at the time of publication.
+
+ We manage to reproduce their result in fairseq and keep most of the
+ [original implementation](https://github.com/facebookresearch/adaptive-span) untouched.
+ You can refer to their sweep file as well if any combination of hyperparameters is not clear.
+
+ ##### 0. Setup
+
+ First you need to process the Enwik8 dataset; we use the pre-tokenized dataset
+ from the [adaptive span paper](https://github.com/facebookresearch/adaptive-span/blob/master/get_data.sh).
+ You can download the dataset, and then run:
+ ```bash
+ fairseq-preprocess --only-source --trainpref ~/data/enwik8/train.txt \
+     --validpref ~/data/enwik8/valid.txt --testpref ~/data/enwik8/test.txt \
+     --destdir ~/data/enwik8/data-bin/ --joined-dictionary --workers 20
+ ```
+
+ ##### 1. Train an Adaptive Span model on Enwik8
+
+ We will train a 12-layer Adaptive Span model following the [hyperparameters
+ used in the original
+ paper](https://github.com/facebookresearch/adaptive-span/blob/master/experiments/enwik8.sh).
+
+ The following command assumes 4 GPUs, so that the total batch size is 64
+ sequences (4 x 16). Training should take 2-3 days on 4 V100 GPUs:
+ ```bash
+ CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train \
+     --user-dir examples/adaptive_span \
+     --data ~/data/enwik8/data-bin/ \
+     --fp16 --fp16-no-flatten-grads --max-update 600000 \
+     --task truncated_bptt_lm --tokens-per-sample 512 --arch adaptive_span \
+     --n-layer 12 --d-model 512 --n-head 8 --d-inner 2048 --dropout 0.3 \
+     --attn-span 8192 --optimizer adagrad_with_grad_clip --adagrad-clip 0.03 \
+     --validate-interval-updates 1000 \
+     --lr-scheduler fixed --warmup-updates 32000 --batch-size-valid 32 \
+     --lr 0.07 --criterion adaptive_span_loss --batch-size 16 --update-freq 1 \
+     --seed 2 --log-format json --log-interval 25 --aux-loss-scaler 5e-07
+ ```
+ This should land around 1.05 on validation and 1.03 on test. You can lower
+ --aux-loss-scaler for better performance (a longer span); it gives roughly a 0.03 bpc
+ improvement over the transformerXL baseline here.
+ If training on a single GPU, set `--update-freq=4` to accumulate 4x gradients
+ and simulate training on 4 GPUs.
+ You can also reproduce the transformerXL result on enwik8 using this code base.
+ It should land around 1.06 on test, matching the [original paper](https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/run_enwik8_base.sh).
+ You can try it with:
+ ```bash
+ CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train \
+     --user-dir examples/truncated_bptt \
+     ~/data/enwik8/data-bin/ \
+     --task truncated_bptt_lm --fp16 --max-update 400000 \
+     --tokens-per-sample 512 --arch transformer_xl --n-layer 12 \
+     --d-model 512 --n-head 8 --d-head 64 --d-inner 2048 --dropout 0.1 \
+     --dropatt 0.0 --mem-len 512 --optimizer adam --clip-norm 0.25 \
+     --lr-scheduler cosine --warmup-updates 0 \
+     --lr 0.0 --lr 0.00025 --batch-size 15 \
+     --update-freq 1 --seed 2 --log-format json --log-interval 25 \
+     --fp16
+ ```
+
+ ##### 2. Evaluate
+ For Adaptive Span:
+ ```bash
+ fairseq-eval-lm ~/data/enwik8/data-bin/ --path model/checkpoint_best.pt \
+     --user-dir examples/adaptive_span \
+     --task truncated_bptt_lm --batch-size 8 --tokens-per-sample 512 --gen-subset test
+ ```
+ For Transformer-XL evaluation:
+ ```bash
+ fairseq-eval-lm ~/data/enwik8/data-bin/ --path model/checkpoint_best.pt \
+     --user-dir examples/truncated_bptt/ --task truncated_bptt_lm --batch-size 8 \
+     --tokens-per-sample 80 \
+     --model-overrides '{"mem_len":2100,"clamp_len":820,"same_length":True}' \
+     --gen-subset valid
+ ```
+
+ *Note:* During training the model saw 512 tokens of context
+ (``--tokens-per-sample=512``), with batch size 8. These settings match the evaluation
+ settings from [the original
+ paper](https://github.com/facebookresearch/adaptive-span/blob/master/experiments/enwik8.sh).
fairseq/examples/adaptive_span/__init__.py ADDED
@@ -0,0 +1,19 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import importlib
+ import os
+
+ # automatically import any Python files in the current directory
+ cur_dir = os.path.dirname(__file__)
+ for file in os.listdir(cur_dir):
+     path = os.path.join(cur_dir, file)
+     if (
+         not file.startswith("_")
+         and not file.startswith(".")
+         and (file.endswith(".py") or os.path.isdir(path))
+     ):
+         mod_name = file[: file.find(".py")] if file.endswith(".py") else file
+         module = importlib.import_module(__name__ + "." + mod_name)
fairseq/examples/adaptive_span/adagrad_with_grad_clip.py ADDED
@@ -0,0 +1,128 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from torch.optim import Adagrad
+
+ from fairseq.optim import LegacyFairseqOptimizer, register_optimizer
+
+
+ @register_optimizer("adagrad_with_grad_clip")
+ class FairseqAdagradWithGradClip(LegacyFairseqOptimizer):
+     def __init__(self, args, params):
+         super().__init__(args)
+         self._optimizer = AdagradWithGradClip(params, **self.optimizer_config)
+
+     @staticmethod
+     def add_args(parser):
+         """Add optimizer-specific arguments to the parser."""
+         # fmt: off
+         parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                             help='weight decay')
+         parser.add_argument('--adagrad-clip', default=0.0, type=float, metavar='D',
+                             help='internal grad clip')
+         # fmt: on
+
+     @property
+     def optimizer_config(self):
+         """
+         Return a kwarg dictionary that will be used to override optimizer
+         args stored in checkpoints. This allows us to load a checkpoint and
+         resume training using a different set of optimizer args, e.g., with a
+         different learning rate.
+         """
+         return {
+             "lr": self.args.lr[0],
+             "weight_decay": self.args.weight_decay,
+             "grad_clip": self.args.adagrad_clip,
+         }
+
+     @property
+     def supports_flat_params(self):
+         return False
+
+
+ def _clip_grad(clr, grad, group_grad_clip):
+     if group_grad_clip > 0:
+         norm = grad.norm(2).item()
+         if norm > group_grad_clip:
+             clr *= group_grad_clip / (norm + 1e-10)
+     return clr
+
+
+ class AdagradWithGradClip(Adagrad):
+     """Adagrad algorithm with custom gradient clipping"""
+
+     def __init__(
+         self,
+         params,
+         lr=1e-2,
+         lr_decay=0,
+         weight_decay=0,
+         initial_accumulator_value=0,
+         grad_clip=0,
+     ):
+         Adagrad.__init__(
+             self,
+             params,
+             lr=lr,
+             lr_decay=lr_decay,
+             weight_decay=weight_decay,
+             initial_accumulator_value=initial_accumulator_value,
+         )
+         self.defaults["grad_clip"] = grad_clip
+         self.param_groups[0].setdefault("grad_clip", grad_clip)
+
+     def step(self, closure=None):
+         loss = None
+         if closure is not None:
+             loss = closure()
+
+         for group in self.param_groups:
+             for p in group["params"]:
+                 if p.grad is None:
+                     continue
+
+                 grad = p.grad.data
+                 state = self.state[p]
+
+                 state["step"] += 1
+
+                 if group["weight_decay"] != 0:
+                     if p.grad.data.is_sparse:
+                         raise RuntimeError(
+                             "weight_decay option is "
+                             "not compatible with sparse "
+                             "gradients"
+                         )
+                     grad = grad.add(group["weight_decay"], p.data)
+
+                 clr = group["lr"] / (1 + (state["step"] - 1) * group["lr_decay"])
+
+                 # clip
+                 clr = _clip_grad(clr=clr, grad=grad, group_grad_clip=group["grad_clip"])
+
+                 if grad.is_sparse:
+                     # the update is non-linear so indices must be unique
+                     grad = grad.coalesce()
+                     grad_indices = grad._indices()
+                     grad_values = grad._values()
+                     size = grad.size()
+
+                     def make_sparse(values):
+                         constructor = grad.new
+                         if grad_indices.dim() == 0 or values.dim() == 0:
+                             return constructor().resize_as_(grad)
+                         return constructor(grad_indices, values, size)
+
+                     state["sum"].add_(make_sparse(grad_values.pow(2)))
+                     std = state["sum"]._sparse_mask(grad)
+                     std_values = std._values().sqrt_().add_(1e-10)
+                     p.data.add_(-clr, make_sparse(grad_values / std_values))
+                 else:
+                     state["sum"].addcmul_(1, grad, grad)
+                     std = state["sum"].sqrt().add_(1e-10)
+                     p.data.addcdiv_(-clr, grad, std)
+
+         return loss
fairseq/examples/adaptive_span/adaptive_span_attention.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import math
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+
12
+ class AdaptiveMask(nn.Module):
13
+ """Soft masking function for adaptive size.
14
+ It masks out the last K values of an input. The masking value
15
+ goes from 1 to 0 gradually, so K can be learned with
16
+ back-propagation.
17
+ Args:
18
+ max_size: maximum size (i.e. input dimension)
19
+ ramp_size: size of the ramp going from 0 to 1
20
+ init_val: initial size proportion not to be masked out
21
+ shape: learn multiple sizes independent of each other
22
+ """
23
+
24
+ def __init__(self, max_size, ramp_size, init_val=0, shape=(1,)):
25
+ nn.Module.__init__(self)
26
+ self._max_size = max_size
27
+ self._ramp_size = ramp_size
28
+ self.current_val = nn.Parameter(torch.zeros(*shape) + init_val)
29
+ mask_template = torch.linspace(1 - max_size, 0, steps=max_size)
30
+ self.register_buffer("mask_template", mask_template)
31
+
32
+ def forward(self, x):
33
+ mask = self.mask_template.float() + self.current_val.float() * self._max_size
34
+ mask = mask / self._ramp_size + 1
35
+ mask = mask.clamp(0, 1)
36
+ if x.size(-1) < self._max_size:
37
+ # the input could have been trimmed beforehand to save computation
38
+ mask = mask.narrow(-1, self._max_size - x.size(-1), x.size(-1))
39
+ x = (x * mask).type_as(x)
40
+ return x
41
+
42
+ def get_current_max_size(self, include_ramp=True):
43
+ current_size = math.ceil(self.current_val.max().item() * self._max_size)
44
+ if include_ramp:
45
+ current_size += self._ramp_size
46
+ current_size = max(0, min(self._max_size, current_size))
47
+ return current_size
48
+
49
+ def get_current_avg_size(self, include_ramp=True):
50
+ current_size = math.ceil(
51
+ self.current_val.float().mean().item() * self._max_size
52
+ )
53
+ if include_ramp:
54
+ current_size += self._ramp_size
55
+ current_size = max(0, min(self._max_size, current_size))
56
+ return current_size
57
+
58
+ def clamp_param(self):
59
+ """this need to be called after each update"""
60
+ self.current_val.data.clamp_(0, 1)
61
+
62
+
63
+ class AdaptiveSpan(nn.Module):
64
+ """Adaptive attention span for Transformerself.
65
+ This module learns an attention span length from data for each
66
+ self-attention head.
67
+ Args:
68
+ attn_span: maximum attention span
69
+ adapt_span_loss: loss coefficient for the span length
70
+ adapt_span_ramp: length of the masking ramp
71
+ adapt_span_init: initial size ratio
72
+ adapt_span_cache: adapt cache size to reduce memory usage
73
+ """
74
+
75
+ def __init__(
76
+ self,
77
+ attn_span,
78
+ adapt_span_ramp,
79
+ adapt_span_init,
80
+ n_head,
81
+ adapt_span_layer,
82
+ **kargs
83
+ ):
84
+ nn.Module.__init__(self)
85
+ self._max_span = attn_span
86
+ self._n_head = n_head
87
+ self._adapt_span_layer = adapt_span_layer
88
+ if self._adapt_span_layer:
89
+ self._mask = AdaptiveMask(
90
+ max_size=self._max_span,
91
+ ramp_size=adapt_span_ramp,
92
+ init_val=adapt_span_init,
93
+ )
94
+ else:
95
+ self._mask = AdaptiveMask(
96
+ max_size=self._max_span,
97
+ ramp_size=adapt_span_ramp,
98
+ init_val=adapt_span_init,
99
+ shape=(n_head, 1, 1),
100
+ )
101
+
102
+ def forward(self, attn, normalize=True):
103
+ """mask attention with the right span"""
104
+ # batch and head dimensions are merged together, so separate them first
105
+ self.clamp_param()
106
+ if self._adapt_span_layer:
107
+ attn = self._mask(attn)
108
+ else:
109
+ B = attn.size(0) # batch size
110
+ M = attn.size(1) # block size
111
+ attn = attn.reshape(B // self._n_head, self._n_head, M, -1)
112
+ attn = self._mask(attn)
113
+ attn = attn.view(B, M, -1)
114
+ return attn
115
+
116
+ def get_trim_len(self):
117
+ """how much of memory can be trimmed to reduce computation"""
118
+ L = self._max_span
119
+ trim_len = min(L - 1, L - self._mask.get_current_max_size())
120
+ # too fine granularity might be bad for the memory management
121
+ trim_len = math.floor(trim_len / 64) * 64
122
+ return trim_len
123
+
124
+ def trim_memory(self, query, key, value, key_pe):
125
+ """trim out unnecessary memory beforehand to reduce computation"""
126
+ trim_len = self.get_trim_len()
127
+ cache_size = key.size(1) - query.size(1)
128
+ trim_len_cache = trim_len - (self._max_span - cache_size)
129
+ if trim_len_cache > 0:
130
+ key = key[:, trim_len_cache:, :]
131
+ value = value[:, trim_len_cache:, :]
132
+ elif trim_len_cache < 0:
133
+ # cache is too short! this happens when validation resumes
134
+ # after a lot of updates.
135
+ key = F.pad(key, [0, 0, -trim_len_cache, 0])
136
+ value = F.pad(value, [0, 0, -trim_len_cache, 0])
137
+ if trim_len > 0:
138
+ if key_pe is not None:
139
+ key_pe = key_pe[:, :, trim_len:]
140
+ return key, value, key_pe
141
+
142
+ def get_cache_size(self):
143
+ """determine how long the cache should be"""
144
+ trim_len = self.get_trim_len()
145
+ # give a buffer of 64 steps since a span might increase
146
+ # in future updates
147
+ return min(self._max_span, self._max_span - trim_len + 64)
148
+
149
+ def get_loss(self):
150
+ """a loss term for regularizing the span length"""
151
+ return self._max_span * self._mask.current_val.float().mean()
152
+
153
+ def get_current_max_span(self):
154
+ return self._mask.get_current_max_size()
155
+
156
+ def get_current_avg_span(self):
157
+ return self._mask.get_current_avg_size()
158
+
159
+ def clamp_param(self):
160
+ self._mask.clamp_param()
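A small sketch (not part of the commit) of how `AdaptiveMask` behaves on a toy tensor; the flat import path is an assumption made for illustration.

```python
import torch
from adaptive_span_attention import AdaptiveMask  # assumed flat import path

# max_size=8, ramp_size=4, current_val initialised to 0.5
mask = AdaptiveMask(max_size=8, ramp_size=4, init_val=0.5)
x = torch.ones(1, 8)
print(mask(x))
# the oldest positions are softly masked, recent ones pass through unchanged:
# tensor([[0.2500, 0.5000, 0.7500, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], ...)
print(mask.get_current_max_size())  # ceil(0.5 * 8) + ramp_size = 8, clamped to max_size
```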
fairseq/examples/adaptive_span/adaptive_span_loss.py ADDED
@@ -0,0 +1,107 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ from dataclasses import dataclass
8
+
9
+ import torch.nn.functional as F
10
+ from fairseq import utils
11
+ from fairseq.logging import metrics
12
+ from fairseq.criterions import register_criterion
13
+ from fairseq.criterions.cross_entropy import CrossEntropyCriterion
14
+ from fairseq.dataclass import FairseqDataclass
15
+ from omegaconf import II
16
+
17
+
18
+ @dataclass
19
+ class AdaptiveSpanCriterionConfig(FairseqDataclass):
20
+ sentence_avg: bool = II("optimization.sentence_avg")
21
+
22
+
23
+ @register_criterion("adaptive_span_loss", dataclass=AdaptiveSpanCriterionConfig)
24
+ class AdaptiveSpanCriterion(CrossEntropyCriterion):
25
+ def __init__(self, task, sentence_avg):
26
+ super().__init__(task, sentence_avg)
27
+
28
+ def forward(self, model, sample, reduce=True):
29
+ """Compute the loss for the given sample.
30
+
31
+ Returns a tuple with three elements:
32
+ 1) the loss; here it is summed, unlike in the original adaptive-span code
33
+ 2) the sample size, which is used as the denominator for the gradient
34
+ 3) logging outputs to display while training
35
+ """
36
+ net_output = model(**sample["net_input"])
37
+ loss, aux_loss, avg_span, max_span = self.compute_loss(
38
+ model, net_output, sample, reduce=reduce
39
+ )
40
+ sample_size = (
41
+ sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
42
+ )
43
+ loss /= sample_size
44
+ total_loss = loss + aux_loss
45
+ sample_size = 1
46
+
47
+ logging_output = {
48
+ "loss": loss.data,
49
+ "ntokens": sample["ntokens"],
50
+ "nsentences": sample["target"].size(0),
51
+ "sample_size": sample_size,
52
+ "total_loss": total_loss.data,
53
+ "avg_span": avg_span * sample_size,
54
+ "max_span": max_span * sample_size,
55
+ }
56
+ return total_loss, sample_size, logging_output
57
+
58
+ def compute_loss(self, model, net_output, sample, reduce=True):
59
+ loss, _ = super().compute_loss(model, net_output, sample, reduce)
60
+ aux_loss = model.get_aux_loss()
61
+ avg_span = model.get_current_avg_span()
62
+ max_span = model.get_current_max_span()
63
+ return loss, aux_loss, avg_span, max_span
64
+
65
+ @staticmethod
66
+ def reduce_metrics(logging_outputs) -> None:
67
+ """Aggregate logging outputs from data parallel training."""
68
+ loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
69
+ ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
70
+ sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
71
+ total_loss_sum = sum(log.get("total_loss", 0) for log in logging_outputs)
72
+ avg_span_sum = sum(log.get("avg_span", 0) for log in logging_outputs)
73
+ max_span_sum = sum(log.get("max_span", 0) for log in logging_outputs)
74
+
75
+ # we divide by log(2) to convert the loss from base e to base 2
76
+ metrics.log_scalar(
77
+ "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
78
+ )
79
+ metrics.log_scalar("avg_span", avg_span_sum / sample_size, sample_size, round=3)
80
+ metrics.log_scalar("max_span", max_span_sum / sample_size, sample_size, round=3)
81
+ # total loss contains the L1 norm on adaptive-span
82
+ metrics.log_scalar(
83
+ "total_loss",
84
+ total_loss_sum / sample_size / math.log(2),
85
+ sample_size,
86
+ round=3,
87
+ )
88
+ if sample_size != ntokens:
89
+ metrics.log_scalar(
90
+ "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
91
+ )
92
+ metrics.log_derived(
93
+ "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
94
+ )
95
+ else:
96
+ metrics.log_derived(
97
+ "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
98
+ )
99
+
100
+ @staticmethod
101
+ def logging_outputs_can_be_summed() -> bool:
102
+ """
103
+ Whether the logging outputs returned by `forward` can be summed
104
+ across workers prior to calling `reduce_metrics`. Setting this
105
+ to True will improve distributed training speed.
106
+ """
107
+ return True
fairseq/examples/adaptive_span/adaptive_span_model.py ADDED
@@ -0,0 +1,263 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from fairseq.modules.layer_norm import LayerNorm
14
+
15
+ from .adaptive_span_attention import AdaptiveSpan
16
+
17
+ # Size notations:
18
+ # B = batch_size, H = d_model, M = block_size, L = attn_span
19
+
20
+
21
+ def _skew(X, pad_value):
22
+ """shift every row 1 step to right"""
23
+ # X = B x M x L
24
+ B, M, L = X.size()
25
+ X = F.pad(X, (0, M + 1), value=pad_value) # B x M x (L+M+1)
26
+ X = X.view(B, -1) # B x ML+MM+M
27
+ X = X[:, :-M] # B x ML+MM
28
+ X = X.view(B, M, M + L) # B x M x L+M
29
+ return X
30
+
31
+
32
+ def _unskew(X):
33
+ """reverse _skew operation"""
34
+ # X = B x M x L+M
35
+ B, M, L = X.size()
36
+ L -= M
37
+ X = X.view(B, -1) # B x ML+MM
38
+ X = F.pad(X, (0, M)) # B x ML+MM+M
39
+ X = X.view(B, M, M + L + 1) # B x M x L+M+1
40
+ X = X[:, :, :L] # B x M x L
41
+ return X
42
+
43
+
44
+ class SeqAttention(nn.Module):
45
+ """Sequential self-attention layer.
46
+ Each token will attend to a fixed number of previous steps.
47
+ Note that attention doesn't include the current step itself.
48
+ """
49
+
50
+ def __init__(self, d_model, n_head, attn_span, dropout, adapt_span_layer, **kargs):
51
+ nn.Module.__init__(self)
52
+ self.dropout = nn.Dropout(dropout)
53
+ self.d_model = d_model # size of a single head
54
+ self.attn_span = attn_span
55
+ self.adaptive_span = AdaptiveSpan(
56
+ attn_span=attn_span,
57
+ n_head=n_head,
58
+ adapt_span_layer=adapt_span_layer,
59
+ **kargs
60
+ )
61
+
62
+ def forward(self, query, key, value, key_pe):
63
+ # query size = B x M x H
64
+ # key, value sizes = B x (M+L) x H
65
+
66
+ key, value, key_pe = self.adaptive_span.trim_memory(query, key, value, key_pe)
67
+
68
+ # compute attention from context
69
+ # B x M (dest) x (M+L) (src)
70
+ attn_cont = torch.matmul(query, key.transpose(-1, -2))
71
+ attn_cont = _unskew(attn_cont) # B x M x L
72
+
73
+ # compute the effect of position embedding
74
+ attn_pos = torch.matmul(query, key_pe) # B x M x L_pos
75
+ attn = attn_cont + attn_pos
76
+
77
+ attn = attn / math.sqrt(self.d_model) # B x M X L_pos
78
+
79
+ attn = F.softmax(attn.float(), dim=-1).type_as(attn)
80
+
81
+ # trim attention lengths according to the learned span
82
+ attn = self.adaptive_span(attn)
83
+
84
+ attn = self.dropout(attn) # B x M X L_pos
85
+
86
+ attn_cont = _skew(attn, 0) # B x M X (L+M)
87
+ out = torch.matmul(attn_cont, value) # B x M x H
88
+ return out
89
+
90
+ def get_cache_size(self):
91
+ return self.adaptive_span.get_cache_size()
92
+
93
+
94
+ class MultiHeadSeqAttention(nn.Module):
95
+ def __init__(self, d_model, n_head, **kargs):
96
+ nn.Module.__init__(self)
97
+ assert d_model % n_head == 0
98
+ self.n_head = n_head
99
+ self.head_dim = d_model // n_head
100
+ self.attn = SeqAttention(d_model=self.head_dim, n_head=n_head, **kargs)
101
+ self.proj_query = nn.Linear(d_model, d_model, bias=False)
102
+ nn.init.xavier_normal_(self.proj_query.weight)
103
+ self.proj_out = nn.Linear(d_model, d_model, bias=False)
104
+ nn.init.xavier_normal_(self.proj_out.weight)
105
+ self.proj_val = nn.Linear(d_model, d_model, bias=False)
106
+ nn.init.xavier_normal_(self.proj_val.weight)
107
+ self.proj_key = nn.Linear(d_model, d_model, bias=False)
108
+ nn.init.xavier_normal_(self.proj_key.weight)
109
+
110
+ def head_reshape(self, x):
111
+ K = self.n_head
112
+ D = self.head_dim
113
+ x = x.view(x.size()[:-1] + (K, D)) # B x (M+L) x K x D
114
+ x = x.transpose(1, 2).contiguous() # B x K x (M+L) x D
115
+ x = x.view(-1, x.size(-2), x.size(-1)) # B_K x (M+L) x D
116
+ return x
117
+
118
+ def forward(self, query, key, value, key_pe):
119
+ B = query.size(0)
120
+ K = self.n_head
121
+ D = self.head_dim
122
+ M = query.size(1)
123
+
124
+ query = self.proj_query(query)
125
+ query = self.head_reshape(query)
126
+ value = self.proj_val(value)
127
+ value = self.head_reshape(value)
128
+ key = self.proj_key(key)
129
+ key = self.head_reshape(key)
130
+
131
+ out = self.attn(query, key, value, key_pe) # B_K x M x D
132
+ out = out.view(B, K, M, D) # B x K x M x D
133
+ out = out.transpose(1, 2).contiguous() # B x M x K x D
134
+ out = out.view(B, M, -1) # B x M x K_D
135
+ out = self.proj_out(out)
136
+ return out
137
+
138
+
139
+ class FeedForwardLayer(nn.Module):
140
+ def __init__(self, d_model, d_inner, dropout, **kargs):
141
+ nn.Module.__init__(self)
142
+ self.fc1 = nn.Linear(d_model, d_inner)
143
+ self.fc2 = nn.Linear(d_inner, d_model)
144
+ nn.init.xavier_uniform_(self.fc1.weight)
145
+ nn.init.xavier_uniform_(self.fc2.weight)
146
+ self.dropout = nn.Dropout(dropout)
147
+
148
+ def forward(self, h):
149
+ h1 = F.relu(self.fc1(h))
150
+ h1 = self.dropout(h1)
151
+ h2 = self.fc2(h1)
152
+ return h2
153
+
154
+
155
+ class TransformerSeqLayer(nn.Module):
156
+ def __init__(self, d_model, **kargs):
157
+ nn.Module.__init__(self)
158
+ self.attn = MultiHeadSeqAttention(d_model=d_model, **kargs)
159
+ self.norm1 = LayerNorm(d_model)
160
+ self.ff = FeedForwardLayer(d_model=d_model, **kargs)
161
+ self.norm2 = LayerNorm(d_model)
162
+
163
+ def forward(self, h, h_cache, key_pe):
164
+ # h = B x M x H
165
+ # h_cache = B x L x H
166
+ h_all = torch.cat([h_cache, h], dim=1) # B x (M+L) x H
167
+ attn_out = self.attn(h, h_all, h_all, key_pe)
168
+ h = self.norm1(h + attn_out) # B x M x H
169
+ if self.ff is not None:
170
+ ff_out = self.ff(h)
171
+ out = self.norm2(h + ff_out) # B x M x H
172
+ else:
173
+ out = h
174
+ return out
175
+
176
+ def get_cache_size(self):
177
+ return self.attn.attn.get_cache_size()
178
+
179
+
180
+ class TransformerSeq(nn.Module):
181
+ def __init__(
182
+ self,
183
+ vocab_size,
184
+ d_model,
185
+ n_head,
186
+ n_layer,
187
+ attn_span,
188
+ emb_dropout,
189
+ aux_loss_scaler,
190
+ adapt_span_layer,
191
+ **kargs
192
+ ):
193
+ nn.Module.__init__(self)
194
+ # token embeddings
195
+ self.in_emb = nn.Embedding(vocab_size, d_model)
196
+ nn.init.normal_(self.in_emb.weight, mean=0, std=d_model ** -0.5)
197
+ self.out_emb = nn.Linear(d_model, vocab_size)
198
+ self.aux_loss_scaler = aux_loss_scaler
199
+ if emb_dropout > 0:
200
+ self.emb_dropout = nn.Dropout(emb_dropout)
201
+ else:
202
+ self.emb_dropout = None
203
+ # position embeddings
204
+ self.key_pe = nn.Parameter(torch.randn(1, d_model // n_head, attn_span))
205
+
206
+ self.layers = nn.ModuleList()
207
+ self.layers.extend(
208
+ TransformerSeqLayer(
209
+ d_model=d_model,
210
+ n_head=n_head,
211
+ attn_span=attn_span,
212
+ adapt_span_layer=adapt_span_layer,
213
+ **kargs
214
+ )
215
+ for _ in range(n_layer)
216
+ )
217
+
218
+ def forward(self, x, h_cache, target=None):
219
+ # x size = B x M
220
+ block_size = x.size(1)
221
+ h = self.in_emb(x) # B x M x H
222
+ if self.emb_dropout is not None:
223
+ h = self.emb_dropout(h)
224
+
225
+ h_cache_next = []
226
+ for l, layer in enumerate(self.layers):
227
+ cache_size = layer.attn.attn.get_cache_size()
228
+ if cache_size > block_size:
229
+ h_cache_next_l = torch.cat(
230
+ [h_cache[l][:, -cache_size + block_size :, :], h], dim=1
231
+ ).detach()
232
+ else:
233
+ h_cache_next_l = h[:, -cache_size:, :].detach()
234
+ h_cache_next.append(h_cache_next_l)
235
+ h = layer(h, h_cache[l], self.key_pe) # B x M x H
236
+
237
+ if self.emb_dropout is not None:
238
+ h = self.emb_dropout(h)
239
+
240
+ out = F.log_softmax(self.out_emb(h).float(), dim=-1).type_as(h)
241
+ dummy_loss = None
242
+
243
+ return out, h_cache_next, dummy_loss
244
+
245
+ def get_aux_loss(self):
246
+ loss = 0.0
247
+ for layer in self.layers:
248
+ loss += layer.attn.attn.adaptive_span.get_loss()
249
+ return self.aux_loss_scaler * loss
250
+
251
+ def get_current_max_span(self):
252
+ max_span = 0.0
253
+ for layer in self.layers:
254
+ max_span = max(
255
+ max_span, layer.attn.attn.adaptive_span.get_current_max_span()
256
+ )
257
+ return max_span
258
+
259
+ def get_current_avg_span(self):
260
+ avg_span = 0.0
261
+ for layer in self.layers:
262
+ avg_span += layer.attn.attn.adaptive_span.get_current_avg_span()
263
+ return avg_span / len(self.layers)
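The `_skew`/`_unskew` pair above is easiest to see on a tiny tensor. Below is a self-contained sketch (not part of the commit) with `_skew` reproduced from this file.

```python
import torch
import torch.nn.functional as F

def _skew(X, pad_value):
    """shift row i of a B x M x L tensor i steps to the right"""
    B, M, L = X.size()
    X = F.pad(X, (0, M + 1), value=pad_value)  # B x M x (L+M+1)
    X = X.view(B, -1)[:, :-M]                  # drop the trailing M pad entries
    return X.view(B, M, M + L)                 # B x M x (L+M)

X = torch.arange(6.0).view(1, 2, 3)  # B=1, M=2, L=3
print(_skew(X, 0))
# row i is shifted i steps to the right, which aligns relative positions:
# tensor([[[0., 1., 2., 0., 0.],
#          [0., 3., 4., 5., 0.]]])
```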
fairseq/examples/adaptive_span/adaptive_span_model_wrapper.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from dataclasses import dataclass
8
+ from typing import Dict, List, Optional
9
+
10
+ import torch
11
+ from fairseq.dataclass import FairseqDataclass
12
+ from fairseq.models import (
13
+ FairseqIncrementalDecoder,
14
+ FairseqLanguageModel,
15
+ register_model,
16
+ )
17
+ from .adaptive_span_model import TransformerSeq as AdaptiveSpanTransformerModel
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ @dataclass
24
+ class AdaptiveSpanSmallConfig(FairseqDataclass):
25
+ # defaults come from https://github.com/facebookresearch/adaptive-span/blob/master/experiments/enwik8_small.sh
26
+ vocab_size: int = 50
27
+ d_model: int = 256
28
+ n_head: int = 4
29
+ d_inner: int = 1024
30
+ n_layer: int = 8
31
+ attn_span: int = 1024
32
+ dropout: float = 0.0
33
+ emb_dropout: float = 0.0
34
+ adapt_span_ramp: int = 32
35
+ adapt_span_init: float = 0.0
36
+ aux_loss_scaler: float = 0.000002
37
+ adapt_span_layer: bool = False
38
+
39
+
40
+ @register_model("adaptive_span", dataclass=AdaptiveSpanSmallConfig)
41
+ class AdaptiveSpanTransformer(FairseqLanguageModel):
42
+ @classmethod
43
+ def build_model(cls, cfg: AdaptiveSpanSmallConfig, task):
44
+ return cls(AdaptiveSpanDecoder(cfg, task))
45
+
46
+ def get_aux_loss(self):
47
+ return self.decoder.get_aux_loss()
48
+
49
+ def get_current_max_span(self):
50
+ return self.decoder.get_current_max_span()
51
+
52
+ def get_current_avg_span(self):
53
+ return self.decoder.get_current_avg_span()
54
+
55
+
56
+ class AdaptiveSpanDecoder(FairseqIncrementalDecoder):
57
+ def __init__(self, cfg, task):
58
+
59
+ super().__init__(task.target_dictionary)
60
+
61
+ self.config = cfg
62
+ config = AdaptiveSpanSmallConfig(
63
+ vocab_size=len(task.target_dictionary),
64
+ d_model=cfg.d_model,
65
+ n_head=cfg.n_head,
66
+ d_inner=cfg.d_inner,
67
+ n_layer=cfg.n_layer,
68
+ attn_span=cfg.attn_span,
69
+ dropout=cfg.dropout,
70
+ emb_dropout=cfg.emb_dropout,
71
+ adapt_span_ramp=cfg.adapt_span_ramp,
72
+ adapt_span_init=cfg.adapt_span_init,
73
+ aux_loss_scaler=cfg.aux_loss_scaler,
74
+ adapt_span_layer=cfg.adapt_span_layer,
75
+ )
76
+ logger.info(config)
77
+ self.model = AdaptiveSpanTransformerModel(**config.__dict__)
78
+
79
+ self._mems = None
80
+
81
+ def forward(
82
+ self,
83
+ src_tokens,
84
+ incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
85
+ encoder_out=None,
86
+ ):
87
+ bsz = src_tokens.size(0)
88
+ if incremental_state is not None: # used during inference
89
+ mems = self.get_incremental_state("mems")
90
+ src_tokens = src_tokens[:, -1:] # only keep the most recent token
91
+ else:
92
+ mems = self._mems
93
+
94
+ if mems is None:
95
+ # first time init
96
+ mems = self.init_hid_cache(bsz)
97
+ output = self.model(x=src_tokens, h_cache=mems,)
98
+ if incremental_state is not None:
99
+ self.set_incremental_state(incremental_state, "mems", output[1])
100
+ else:
101
+ self._mems = output[1]
102
+ return (output[0],)
103
+
104
+ def max_positions(self):
105
+ return self.config.attn_span
106
+
107
+ def init_hid_cache(self, batch_sz):
108
+ hid = []
109
+ for layer in self.model.layers:
110
+ param = next(self.model.parameters())
111
+ h = torch.zeros(
112
+ batch_sz,
113
+ layer.get_cache_size(),
114
+ self.config.d_model,
115
+ dtype=param.dtype,
116
+ device=param.device,
117
+ )
118
+ hid.append(h)
119
+ return hid
120
+
121
+ def get_aux_loss(self):
122
+ return self.model.get_aux_loss()
123
+
124
+ def get_current_max_span(self):
125
+ return self.model.get_current_max_span()
126
+
127
+ def get_current_avg_span(self):
128
+ return self.model.get_current_avg_span()
129
+
130
+ def reorder_incremental_state(
131
+ self,
132
+ incremental_state: Dict[str, Dict[str, Optional[torch.Tensor]]],
133
+ new_order: torch.Tensor,
134
+ ):
135
+ """Reorder incremental state.
136
+
137
+ This will be called when the order of the input has changed from the
138
+ previous time step. A typical use case is beam search, where the input
139
+ order changes between time steps based on the selection of beams.
140
+ """
141
+ raise NotImplementedError("This is required for generation/beam search")
142
+ # mems = self.get_incremental_state(incremental_state, "mems")
143
+ # if mems is not None:
144
+ # new_mems = [mems_i.index_select(1, new_order) for mems_i in mems]
145
+ # self.set_incremental_state(incremental_state, "mems", new_mems)
fairseq/examples/adaptive_span/truncated_bptt_lm_task.py ADDED
@@ -0,0 +1,285 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import os
8
+ from dataclasses import dataclass, field
9
+ from typing import List, Optional, Tuple
10
+
11
+ import torch
12
+ from fairseq import utils
13
+ from fairseq.data import (
14
+ Dictionary,
15
+ TokenBlockDataset,
16
+ data_utils,
17
+ iterators,
18
+ )
19
+ from fairseq.dataclass import FairseqDataclass
20
+ from fairseq.distributed import utils as dist_utils
21
+ from fairseq.tasks import FairseqTask, register_task
22
+ from omegaconf import II
23
+
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ @dataclass
29
+ class TruncatedBPTTLMConfig(FairseqDataclass):
30
+ data: str = field(default="???", metadata={"help": "path to data directory"})
31
+ tokens_per_sample: int = field(
32
+ default=1024, metadata={"help": "max number of tokens per sequence"},
33
+ )
34
+ batch_size: int = II("dataset.batch_size")
35
+ # Some models use *max_target_positions* to know how many positional
36
+ # embeddings to learn. We use II(...) to make it default to
37
+ # *tokens_per_sample*, but in principle there could be more positional
38
+ # embeddings than tokens in a single batch. This may also be irrelevant for
39
+ # custom model implementations.
40
+ max_target_positions: int = II("task.tokens_per_sample")
41
+ # these will be populated automatically if not provided
42
+ data_parallel_rank: Optional[int] = None
43
+ data_parallel_size: Optional[int] = None
44
+
45
+
46
+ @register_task("truncated_bptt_lm", dataclass=TruncatedBPTTLMConfig)
47
+ class TruncatedBPTTLMTask(FairseqTask):
48
+ def __init__(self, cfg: TruncatedBPTTLMConfig):
49
+ super().__init__(cfg)
50
+
51
+ if cfg.data_parallel_rank is None or cfg.data_parallel_size is None:
52
+ if torch.distributed.is_initialized():
53
+ cfg.data_parallel_rank = dist_utils.get_data_parallel_rank()
54
+ cfg.data_parallel_size = dist_utils.get_data_parallel_world_size()
55
+ else:
56
+ cfg.data_parallel_rank = 0
57
+ cfg.data_parallel_size = 1
58
+
59
+ # load the dictionary
60
+ paths = utils.split_paths(cfg.data)
61
+ assert len(paths) > 0
62
+ self.dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
63
+ logger.info("dictionary: {} types".format(len(self.dictionary)))
64
+
65
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
66
+ """Load a given dataset split (e.g., train, valid, test)"""
67
+
68
+ # support sharded datasets
69
+ paths = utils.split_paths(self.cfg.data)
70
+ assert len(paths) > 0
71
+ data_path = paths[(epoch - 1) % len(paths)]
72
+ split_path = os.path.join(data_path, split)
73
+
74
+ # each element of *data* will be a tensorized line from the original
75
+ # text dataset, similar to ``open(split_path).readlines()``
76
+ data = data_utils.load_indexed_dataset(
77
+ split_path, self.dictionary, combine=combine
78
+ )
79
+ if data is None:
80
+ raise FileNotFoundError(
81
+ "Dataset not found: {} ({})".format(split, split_path)
82
+ )
83
+
84
+ # this is similar to ``data.view(-1).split(tokens_per_sample)``
85
+ data = TokenBlockDataset(
86
+ data,
87
+ data.sizes,
88
+ block_size=self.cfg.tokens_per_sample,
89
+ pad=None, # unused
90
+ eos=None, # unused
91
+ break_mode="none",
92
+ )
93
+
94
+ self.datasets[split] = TruncatedBPTTDataset(
95
+ data=data,
96
+ bsz_per_shard=self.cfg.batch_size,
97
+ shard_id=self.cfg.data_parallel_rank,
98
+ num_shards=self.cfg.data_parallel_size,
99
+ )
100
+
101
+ def dataset(self, split):
102
+ return self.datasets[split]
103
+
104
+ def get_batch_iterator(
105
+ self,
106
+ dataset,
107
+ num_workers=0,
108
+ epoch=1,
109
+ data_buffer_size=0,
110
+ skip_remainder_batch=False,
111
+ **kwargs
112
+ ):
113
+ return iterators.EpochBatchIterator(
114
+ dataset=dataset,
115
+ collate_fn=self._collate_fn,
116
+ num_workers=num_workers,
117
+ epoch=epoch,
118
+ buffer_size=data_buffer_size,
119
+ # we don't use the batching functionality from EpochBatchIterator;
120
+ # instead every item in *dataset* is a whole batch
121
+ batch_sampler=[[i] for i in range(len(dataset))],
122
+ disable_shuffling=True,
123
+ skip_remainder_batch=skip_remainder_batch,
124
+ )
125
+
126
+ def _collate_fn(self, items: List[List[torch.Tensor]]):
127
+ # we don't use fairseq's batching functionality, so we expect a single
128
+ # item, which is an (id, List[torch.Tensor]) tuple
129
+ assert len(items) == 1
130
+
131
+ # item will have shape B x T (the last batch may have length < T)
132
+ id, item = items[0]
133
+ item = data_utils.collate_tokens(item, pad_idx=self.source_dictionary.pad())
134
+ B, T = item.size()
135
+
136
+ # shift item one position over and append a padding token for the target
137
+ target = torch.nn.functional.pad(
138
+ item[:, 1:], (0, 1, 0, 0), value=self.target_dictionary.pad()
139
+ )
140
+
141
+ # fairseq expects batches to have the following structure
142
+ return {
143
+ "id": torch.tensor([id] * item.size(0)),
144
+ "net_input": {"src_tokens": item,},
145
+ "target": target,
146
+ "nsentences": item.size(0),
147
+ "ntokens": item.numel(),
148
+ }
149
+
150
+ def build_dataset_for_inference(
151
+ self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs
152
+ ) -> torch.utils.data.Dataset:
153
+ eos = self.source_dictionary.eos()
154
+ dataset = TokenBlockDataset(
155
+ src_tokens,
156
+ src_lengths,
157
+ block_size=None, # ignored for "eos" break mode
158
+ pad=self.source_dictionary.pad(),
159
+ eos=eos,
160
+ break_mode="eos",
161
+ )
162
+
163
+ class Dataset(torch.utils.data.Dataset):
164
+ def __getitem__(self, i):
165
+ item = dataset[i]
166
+ if item[-1] == eos:
167
+ # remove eos to support generating with a prefix
168
+ item = item[:-1]
169
+ return (i, [item])
170
+
171
+ def __len__(self):
172
+ return len(dataset)
173
+
174
+ return Dataset()
175
+
176
+ def inference_step(
177
+ self, generator, models, sample, prefix_tokens=None, constraints=None
178
+ ):
179
+ with torch.no_grad():
180
+ if constraints is not None:
181
+ raise NotImplementedError
182
+
183
+ # SequenceGenerator doesn't use *src_tokens* directly; we need to
184
+ # pass the *prefix_tokens* argument instead.
185
+ if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
186
+ prefix_tokens = sample["net_input"]["src_tokens"]
187
+
188
+ # begin generation with the end-of-sentence token
189
+ bos_token = self.source_dictionary.eos()
190
+
191
+ return generator.generate(
192
+ models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
193
+ )
194
+
195
+ def eval_lm_dataloader(
196
+ self,
197
+ dataset,
198
+ max_tokens: Optional[int] = 36000,
199
+ batch_size: Optional[int] = None,
200
+ max_positions: Optional[int] = None,
201
+ num_shards: int = 1,
202
+ shard_id: int = 0,
203
+ num_workers: int = 1,
204
+ data_buffer_size: int = 10,
205
+ context_window: int = 0,
206
+ ):
207
+ if context_window > 0:
208
+ raise NotImplementedError(
209
+ "Transformer-XL doesn't need --context-window, try "
210
+ "--model-overrides '{\"mem_len\":42}' instead "
211
+ )
212
+ return self.get_batch_iterator(
213
+ dataset=dataset,
214
+ max_tokens=max_tokens,
215
+ max_sentences=batch_size,
216
+ max_positions=max_positions,
217
+ ignore_invalid_inputs=True,
218
+ num_shards=num_shards,
219
+ shard_id=shard_id,
220
+ num_workers=num_workers,
221
+ data_buffer_size=data_buffer_size,
222
+ ).next_epoch_itr(shuffle=False)
223
+
224
+ @property
225
+ def source_dictionary(self):
226
+ return self.dictionary
227
+
228
+ @property
229
+ def target_dictionary(self):
230
+ return self.dictionary
231
+
232
+
233
+ class TruncatedBPTTDataset(torch.utils.data.Dataset):
234
+ def __init__(
235
+ self,
236
+ data: List[torch.Tensor], # ordered list of items
237
+ bsz_per_shard, # number of items processed per GPU per forward pass
238
+ shard_id, # current GPU ID
239
+ num_shards, # number of GPUs
240
+ ):
241
+ super().__init__()
242
+ self.data = data
243
+
244
+ def batchify(data, bsz):
245
+ # Work out how cleanly we can divide the dataset into bsz parts.
246
+ nbatch = data.size(0) // bsz
247
+ # Trim off any extra elements that wouldn't cleanly fit (remainders).
248
+ data = data.narrow(0, 0, nbatch * bsz)
249
+ # Evenly divide the data across the bsz batches.
250
+ data = data.view(bsz, -1).contiguous()
251
+ return data
252
+
253
+ # total number of sequences processed by all GPUs in each forward pass
254
+ global_batch_size = bsz_per_shard * num_shards
255
+
256
+ """
257
+ With a 16 item dataset, bsz_per_shard=2 and num_shards=3,
258
+ *indices* might look like:
259
+
260
+ indices = [[0, 1],
261
+ [2, 3],
262
+ [4, 5],
263
+ [6, 7],
264
+ [8, 9],
265
+ [10, 11]]
266
+
267
+ The size of the TruncatedBPTTDataset instance will be 2,
268
+ and shard 1 will see items:
269
+
270
+ [(0, [data[4], data[6]]),
271
+ (1, [data[5], data[7]])]
272
+ """
273
+ indices = batchify(torch.arange(len(data)), global_batch_size)
274
+ assert indices.size(0) == global_batch_size
275
+
276
+ self.my_indices = indices[
277
+ shard_id * bsz_per_shard : (shard_id + 1) * bsz_per_shard
278
+ ]
279
+ assert self.my_indices.size(0) == bsz_per_shard
280
+
281
+ def __len__(self):
282
+ return self.my_indices.size(1)
283
+
284
+ def __getitem__(self, i) -> Tuple[int, List[torch.Tensor]]:
285
+ return (i, [self.data[idx] for idx in self.my_indices[:, i]])
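A standalone sketch (not part of the commit) of the sharding index math described in the docstring above, using the same `batchify` helper.

```python
import torch

def batchify(data, bsz):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)  # drop the remainder
    return data.view(bsz, -1).contiguous()

bsz_per_shard, num_shards, shard_id = 2, 3, 1
indices = batchify(torch.arange(16), bsz_per_shard * num_shards)
my_indices = indices[shard_id * bsz_per_shard:(shard_id + 1) * bsz_per_shard]
print(my_indices)        # tensor([[4, 5],
                         #         [6, 7]])
print(my_indices[:, 0])  # batch 0 on shard 1 -> items data[4] and data[6]
```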
fairseq/examples/attention_head_selection/README.md ADDED
@@ -0,0 +1,161 @@
1
+ # Pay Better Attention to Attention: Head Selection in Multilingual and Multi-Domain Sequence Modeling (Gong et al., 2021)
2
+
3
+ [https://arxiv.org/pdf/2106.10840.pdf](https://arxiv.org/pdf/2106.10840.pdf)
4
+
5
+ ## Introduction
6
+
7
+ We present attention head selection strategies in multilingual and multi-domain sequence modeling including text translation, speech recognition and speech translation tasks.
8
+
9
+ Below is an example of training multilingual/multi-domain speech recognition models.
10
+
11
+ ## Data Preparation
12
+ Prepare mTEDx data as in [mTEDx example](https://github.com/fairinternal/fairseq-py/blob/0d9c5851e6fac40f9e366b3633ccd615c2901788/examples/speech_to_text/docs/mtedx_example.md) and CoVoST data as in [CoVoST example](https://github.com/fairinternal/fairseq-py/blob/0d9c5851e6fac40f9e366b3633ccd615c2901788/examples/speech_to_text/docs/covost_example.md). Similarly prepare EuroParl data.
13
+
14
+
15
+ ## Training a multilingual ASR model with attention head selection
16
+
17
+ ```bash
18
+ data_dir=<path to mtedx data>
19
+ train_subset="train_ar_ar_tedx,train_de_de_tedx,train_el_el_tedx,train_es_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_pt_tedx,train_ru_ru_tedx"
20
+ valid_subset="valid_ar_ar_tedx,valid_de_de_tedx,valid_el_el_tedx,valid_es_es_tedx,valid_fr_fr_tedx,valid_it_it_tedx,valid_pt_pt_tedx,valid_ru_ru_tedx"
21
+ strategy=<subset or group>
22
+
23
+ fairseq-train ${data_dir} \
24
+ --user-dir examples/attention_head_selection/src \
25
+ --train-subset "${train_subset}" \
26
+ --valid-subset "${valid_subset}" \
27
+ --config-yaml 'config_asr.yaml' \
28
+ --arch 'head_selection_s2t_transformer_s' \
29
+ --task 'speech_to_text_head_selection' \
30
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
31
+ --lr-scheduler 'inverse_sqrt' --stop-min-lr -1.0 --warmup-updates 10000 \
32
+ --lr 5e-4 \
33
+ --clip-norm 10.0 \
34
+ --seed 1 \
35
+ --max-epoch 400 \
36
+ --max-tokens 32000 \
37
+ --ignore-prefix-size 1 \
38
+ --dropout 0.3 \
39
+ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
40
+ --skip-invalid-size-inputs-valid-test \
41
+ --encoder-attn-head-select \
42
+ --total-encoder-attention-heads 8 \
43
+ --decoder-self-attn-head-select \
44
+ --total-decoder-attention-heads 8 \
45
+ --attn-head-select-strategy ${strategy} \
46
+ --task-type lang \
47
+ ```
48
+
49
+ ## Training a multi-domain ASR model with attention head selection
50
+
51
+ ```bash
52
+ data_dir=<path to multi-domain data>
53
+ train_subset="train_es_es_tedx,train_fr_fr_tedx,train_pt_pt_tedx,train_it_it_tedx,train_ru_ru_tedx,train_el_el_tedx,train_ar_ar_tedx,train_de_de_tedx,train_ar_ar_cv,train_de_de_cv,train_es_es_cv,train_fr_fr_cv,train_it_it_cv,train_pt_pt_cv,train_ru_ru_cv,train_de_de_ep,train_es_es_ep,train_fr_fr_ep,train_it_it_ep,train_pt_pt_ep"
54
+ valid_subset="dev_es_es_tedx,dev_fr_fr_tedx,dev_pt_pt_tedx,dev_it_it_tedx,dev_ru_ru_tedx,dev_el_el_tedx,dev_ar_ar_tedx,dev_de_de_tedx,dev_ar_ar_cv,dev_de_de_cv,dev_es_es_cv,dev_fr_fr_cv,dev_it_it_cv,dev_pt_pt_cv,dev_ru_ru_cv,dev_de_de_ep,dev_es_es_ep,dev_fr_fr_ep,dev_it_it_ep,dev_pt_pt_ep"
55
+ strategy=<subset or group>
56
+
57
+ fairseq-train ${data_dir} \
58
+ --user-dir examples/attention_head_selection/src \
59
+ --train-subset "${train_subset}" \
60
+ --valid-subset "${valid_subset}" \
61
+ --config-yaml 'config_asr.yaml' \
62
+ --arch head_selection_s2t_transformer_s \
63
+ --task speech_to_text_head_selection \
64
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
65
+ --lr-scheduler 'inverse_sqrt' --stop-min-lr -1.0 --warmup-updates 10000 \
66
+ --lr 5e-4 \
67
+ --clip-norm 10.0 \
68
+ --seed 1 \
69
+ --max-epoch 400 \
70
+ --max-tokens 32000 \
71
+ --ignore-prefix-size 1 \
72
+ --dropout 0.3 \
73
+ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
74
+ --skip-invalid-size-inputs-valid-test \
75
+ --encoder-attn-head-select \
76
+ --total-encoder-attention-heads 8 \
77
+ --decoder-self-attn-head-select \
78
+ --total-decoder-attention-heads 8 \
79
+ --attn-head-select-strategy ${strategy} \
80
+ --task-type domain
81
+ ```
82
+
83
+ ## Inference in multilingual setting
84
+
85
+ ```bash
86
+ MODEL_DIR=<checkpoint directory>
87
+ data_dir=<path to mtedx data>
88
+ gen_subset=<data to test, e.g., test_ar_ar_tedx>
89
+ train_subset="train_ar_ar_tedx,train_de_de_tedx,train_el_el_tedx,train_es_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_pt_tedx,train_ru_ru_tedx"
90
+ last_n=10
91
+ CHECKPOINT_FILENAME="avg_last_${last_n}_checkpoint.pt"
92
+ CHECKPOINT="_avg"
93
+ RESULTS="${MODEL_DIR}/ckpt${CHECKPOINT}"
94
+ if [ ! -d $RESULTS ]; then
95
+ mkdir -p $RESULTS
96
+ fi;
97
+
98
+ python scripts/average_checkpoints.py \
99
+ --inputs ${MODEL_DIR} --num-epoch-checkpoints ${last_n} \
100
+ --output "${MODEL_DIR}/${CHECKPOINT_FILENAME}"
101
+
102
+ fairseq-generate ${data_dir} \
103
+ --user-dir examples/attention_head_selection/src \
104
+ --arch 'head_selection_s2t_transformer_s' \
105
+ --task 'speech_to_text_head_selection' \
106
+ --train-subset ${train_subset} \
107
+ --gen-subset ${gen_subset} \
108
+ --path "${MODEL_DIR}/${CHECKPOINT_FILENAME}" \
109
+ --config-yaml 'config_asr.yaml' \
110
+ --prefix-size 1 \
111
+ --max-tokens 40000 --beam 5 \
112
+ --skip-invalid-size-inputs-valid-test \
113
+ --results-path ${RESULTS} \
114
+ --scoring wer --wer-tokenizer 13a \
115
+ --wer-lowercase --wer-remove-punct --remove-bpe
116
+ ```
117
+
118
+ ## Inference in multi-domain setting
119
+
120
+ ```bash
121
+ MODEL_DIR=<checkpoint directory>
122
+ data_dir=<path to multi-domain data>
123
+ gen_subset=<data to test, e.g., test_pt_pt_cv>
124
+ train_subset="train_es_es_tedx,train_fr_fr_tedx,train_pt_pt_tedx,train_it_it_tedx,train_ru_ru_tedx,train_el_el_tedx,train_ar_ar_tedx,train_de_de_tedx,train_ar_ar_cv,train_de_de_cv,train_es_es_cv,train_fr_fr_cv,train_it_it_cv,train_pt_pt_cv,train_ru_ru_cv,train_de_de_ep,train_es_es_ep,train_fr_fr_ep,train_it_it_ep,train_pt_pt_ep"
125
+ last_n=10
126
+ CHECKPOINT_FILENAME="avg_last_${last_n}_checkpoint.pt"
127
+ CHECKPOINT="_avg"
128
+ RESULTS="${MODEL_DIR}/ckpt${CHECKPOINT}"
129
+ if [ ! -d $RESULTS ]; then
130
+ mkdir -p $RESULTS
131
+ fi;
132
+
133
+ python scripts/average_checkpoints.py \
134
+ --inputs ${MODEL_DIR} --num-epoch-checkpoints ${last_n} \
135
+ --output "${MODEL_DIR}/${CHECKPOINT_FILENAME}"
136
+
137
+ fairseq-generate ${data_dir} \
138
+ --user-dir examples/attention_head_selection/src \
139
+ --arch 'head_selection_s2t_transformer_s' \
140
+ --task 'speech_to_text_head_selection' \
141
+ --train-subset ${train_subset} \
142
+ --gen-subset ${gen_subset} \
143
+ --path "${MODEL_DIR}/${CHECKPOINT_FILENAME}" \
144
+ --config-yaml 'config_asr.yaml' \
145
+ --prefix-size 1 \
146
+ --max-tokens 40000 --beam 5 \
147
+ --skip-invalid-size-inputs-valid-test \
148
+ --results-path ${RESULTS} \
149
+ --scoring wer --wer-tokenizer 13a \
150
+ --wer-lowercase --wer-remove-punct --remove-bpe
151
+ ```
152
+
153
+ ## Citation
154
+ ```bibtex
155
+ @article{gong2021pay,
156
+ title={Pay Better Attention to Attention: Head Selection in Multilingual and Multi-Domain Sequence Modeling},
157
+ author={Gong, Hongyu and Tang, Yun and Pino, Juan and Li, Xian},
158
+ journal={arXiv preprint arXiv:2106.10840},
159
+ year={2021}
160
+ }
161
+ ```
fairseq/examples/attention_head_selection/src/__init__.py ADDED
File without changes
fairseq/examples/attention_head_selection/src/data/__init__.py ADDED
File without changes
fairseq/examples/attention_head_selection/src/data/speech_to_text_dataset_with_domain.py ADDED
@@ -0,0 +1,242 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import Dict, List, Optional
9
+ from dataclasses import dataclass
10
+
11
+ import torch
12
+ from fairseq.data import (
13
+ ConcatDataset,
14
+ Dictionary,
15
+ FairseqDataset,
16
+ ResamplingDataset
17
+ )
18
+ from fairseq.data.audio.data_cfg import S2TDataConfig
19
+ from fairseq.data.audio.speech_to_text_dataset import (
20
+ SpeechToTextDatasetItem,
21
+ SpeechToTextDataset,
22
+ SpeechToTextDatasetCreator
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ @dataclass
29
+ class SpeechToTextDatasetItemWithDomain(SpeechToTextDatasetItem):
30
+ src_lang_id: Optional[torch.Tensor] = None
31
+ tgt_lang_id: Optional[torch.Tensor] = None
32
+ domain_id: Optional[torch.Tensor] = None
33
+
34
+
35
+ class SpeechToTextDatasetWithDomain(SpeechToTextDataset):
36
+
37
+ def __init__(
38
+ self,
39
+ split: str,
40
+ is_train_split: bool,
41
+ cfg: S2TDataConfig,
42
+ audio_paths: List[str],
43
+ n_frames: List[int],
44
+ src_texts: Optional[List[str]] = None,
45
+ tgt_texts: Optional[List[str]] = None,
46
+ speakers: Optional[List[str]] = None,
47
+ src_langs: Optional[List[str]] = None,
48
+ tgt_langs: Optional[List[str]] = None,
49
+ ids: Optional[List[str]] = None,
50
+ tgt_dict: Optional[Dictionary] = None,
51
+ pre_tokenizer=None,
52
+ bpe_tokenizer=None,
53
+ n_frames_per_step=1,
54
+ speaker_to_id=None,
55
+ src_lang_ids: Optional[List[int]] = None,
56
+ tgt_lang_ids: Optional[List[int]] = None,
57
+ domain_ids: Optional[List[int]] = None
58
+ ):
59
+ super().__init__(
60
+ split, is_train_split, cfg, audio_paths, n_frames,
61
+ src_texts, tgt_texts, speakers, src_langs, tgt_langs,
62
+ ids, tgt_dict, pre_tokenizer, bpe_tokenizer,
63
+ n_frames_per_step, speaker_to_id
64
+ )
65
+ assert src_lang_ids is None or len(src_lang_ids) == self.n_samples
66
+ assert tgt_lang_ids is None or len(tgt_lang_ids) == self.n_samples
67
+ assert domain_ids is None or len(domain_ids) == self.n_samples
68
+
69
+ self.src_lang_ids = src_lang_ids
70
+ self.tgt_lang_ids = tgt_lang_ids
71
+ self.domain_ids = domain_ids
72
+
73
+ def __getitem__(self, index: int) -> SpeechToTextDatasetItemWithDomain:
74
+ item = super().__getitem__(index)
75
+ src_lang_id = self.src_lang_ids[index]
76
+ tgt_lang_id = self.tgt_lang_ids[index]
77
+ domain_id = self.domain_ids[index]
78
+ return SpeechToTextDatasetItemWithDomain(
79
+ index=item.index, source=item.source,
80
+ target=item.target, speaker_id=item.speaker_id,
81
+ src_lang_id=src_lang_id,
82
+ tgt_lang_id=tgt_lang_id,
83
+ domain_id=domain_id
84
+ )
85
+
86
+ def collater(
87
+ self, samples: List[SpeechToTextDatasetItem], return_order: bool = False
88
+ ) -> Dict:
89
+ if len(samples) == 0:
90
+ return {}
91
+ out = super().collater(samples, return_order=True)
92
+ order = out["order"]
93
+ src_lang_ids = torch.tensor([x.src_lang_id for x in samples], dtype=torch.long).index_select(0, order)
94
+ tgt_lang_ids = torch.tensor([x.tgt_lang_id for x in samples], dtype=torch.long).index_select(0, order)
95
+ domain_ids = torch.tensor([x.domain_id for x in samples], dtype=torch.long).index_select(0, order)
96
+
97
+ out["src_lang_ids"] = src_lang_ids
98
+ out["tgt_lang_ids"] = tgt_lang_ids
99
+ out["domain_ids"] = domain_ids
100
+ if not return_order:
101
+ del out["order"]
102
+ return out
103
+
104
+
105
+ class SpeechToTextDatasetCreatorWithDomain(SpeechToTextDatasetCreator):
106
+ KEY_SRC_LANG_ID, KEY_TGT_LANG_ID = "src_lang_id", "tgt_lang_id"
107
+ KEY_DOMAIN_ID = "domain_id"
108
+ # default values
109
+ DEFAULT_SRC_LANG_ID, DEFAULT_TGT_LANG_ID, DEFAULT_DOMAIN_ID = 0, 0, 0
110
+
111
+ @classmethod
112
+ def _from_list(
113
+ cls,
114
+ split_name: str,
115
+ is_train_split,
116
+ samples: List[Dict],
117
+ cfg: S2TDataConfig,
118
+ tgt_dict,
119
+ pre_tokenizer,
120
+ bpe_tokenizer,
121
+ n_frames_per_step,
122
+ speaker_to_id
123
+ ) -> SpeechToTextDatasetWithDomain:
124
+ audio_root = Path(cfg.audio_root)
125
+ ids = [s[cls.KEY_ID] for s in samples]
126
+ audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]
127
+ n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]
128
+ tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]
129
+ src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]
130
+ speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]
131
+ src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
132
+ tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
133
+ src_lang_ids = [s.get(cls.KEY_SRC_LANG_ID, cls.DEFAULT_SRC_LANG_ID) for s in samples]
134
+ tgt_lang_ids = [s.get(cls.KEY_TGT_LANG_ID, cls.DEFAULT_TGT_LANG_ID) for s in samples]
135
+ domain_ids = [s.get(cls.KEY_DOMAIN_ID, cls.DEFAULT_DOMAIN_ID) for s in samples]
136
+ return SpeechToTextDatasetWithDomain(
137
+ split_name,
138
+ is_train_split,
139
+ cfg,
140
+ audio_paths,
141
+ n_frames,
142
+ src_texts=src_texts,
143
+ tgt_texts=tgt_texts,
144
+ speakers=speakers,
145
+ src_langs=src_langs,
146
+ tgt_langs=tgt_langs,
147
+ ids=ids,
148
+ tgt_dict=tgt_dict,
149
+ pre_tokenizer=pre_tokenizer,
150
+ bpe_tokenizer=bpe_tokenizer,
151
+ n_frames_per_step=n_frames_per_step,
152
+ speaker_to_id=speaker_to_id,
153
+ src_lang_ids=src_lang_ids,
154
+ tgt_lang_ids=tgt_lang_ids,
155
+ domain_ids=domain_ids
156
+ )
157
+
158
+ @classmethod
159
+ def _load_samples_from_tsv(
160
+ cls,
161
+ root: str,
162
+ split: str,
163
+ src_lang_map,
164
+ tgt_lang_map,
165
+ domain_map
166
+ ):
167
+ # metadata from split
168
+ _, src_lang, tgt_lang, domain = split.split("_")
169
+ src_lang_id = src_lang_map[src_lang]
170
+ tgt_lang_id = tgt_lang_map[tgt_lang]
171
+ domain_id = domain_map[domain]
172
+
173
+ samples = SpeechToTextDatasetCreator._load_samples_from_tsv(root, split)
174
+ for s in samples:
175
+ s.update({
176
+ cls.KEY_SRC_LANG_ID: src_lang_id,
177
+ cls.KEY_TGT_LANG_ID: tgt_lang_id,
178
+ cls.KEY_DOMAIN_ID: domain_id
179
+ })
180
+ return samples
181
+
182
+ @classmethod
183
+ def _from_tsv(
184
+ cls,
185
+ root: str,
186
+ cfg: S2TDataConfig,
187
+ split: str,
188
+ tgt_dict,
189
+ is_train_split: bool,
190
+ pre_tokenizer,
191
+ bpe_tokenizer,
192
+ n_frames_per_step,
193
+ speaker_to_id,
194
+ src_lang_map: Dict[str, int],
195
+ tgt_lang_map: Dict[str, int],
196
+ domain_map: Dict[str, int]
197
+ ) -> SpeechToTextDatasetItemWithDomain:
198
+ samples = cls._load_samples_from_tsv(
199
+ root, split, src_lang_map,
200
+ tgt_lang_map, domain_map
201
+ )
202
+ return cls._from_list(
203
+ split, is_train_split, samples, cfg, tgt_dict, pre_tokenizer,
204
+ bpe_tokenizer, n_frames_per_step, speaker_to_id
205
+ )
206
+
207
+ @classmethod
208
+ def from_tsv(
209
+ cls,
210
+ root: str,
211
+ cfg: S2TDataConfig,
212
+ splits: str,
213
+ tgt_dict,
214
+ pre_tokenizer,
215
+ bpe_tokenizer,
216
+ is_train_split: bool,
217
+ epoch: int,
218
+ seed: int,
219
+ src_lang_map: Dict[str, int],
220
+ tgt_lang_map: Dict[str, int],
221
+ domain_map: Dict[str, int],
222
+ n_frames_per_step: int = 1,
223
+ speaker_to_id=None
224
+ ) -> SpeechToTextDatasetWithDomain:
225
+ datasets = [
226
+ cls._from_tsv(
227
+ root, cfg, split, tgt_dict, is_train_split, pre_tokenizer, bpe_tokenizer, n_frames_per_step, speaker_to_id, src_lang_map, tgt_lang_map, domain_map
228
+ )
229
+ for split in splits.split(",")
230
+ ]
231
+
232
+ if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0:
233
+ # temperature-based sampling
234
+ size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha)
235
+ datasets = [
236
+ ResamplingDataset(
237
+ d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0)
238
+ )
239
+ for r, d in zip(size_ratios, datasets)
240
+ ]
241
+
242
+ return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]
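The split-name convention that `_load_samples_from_tsv` relies on is `<split>_<src>_<tgt>_<domain>`; a tiny sketch (not part of the commit), where the lang/domain maps are made-up placeholders:

```python
# Hypothetical maps for illustration; the real ones come from the task configuration.
src_lang_map = {"es": 0, "fr": 1}
tgt_lang_map = {"es": 0, "fr": 1}
domain_map = {"tedx": 0, "cv": 1, "ep": 2}

split = "train_es_es_tedx"  # same naming as the subsets in the README above
_, src_lang, tgt_lang, domain = split.split("_")
print(src_lang_map[src_lang], tgt_lang_map[tgt_lang], domain_map[domain])  # 0 0 0
```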
fairseq/examples/attention_head_selection/src/loss/__init__.py ADDED
File without changes
fairseq/examples/attention_head_selection/src/loss/attention_head_selection.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch
9
+ from torch.nn.modules.loss import _Loss
10
+
11
+
12
+ class HeadSelectionLoss(_Loss):
13
+
14
+ def __init__(self, args):
15
+ super().__init__()
16
+ self.args = args
17
+ self.kl_weight = getattr(args, "kl_weight", 0.0)
18
+
19
+ def forward(self, head_samples, sample_sizes, prior=0.5, eps=1e-7):
20
+ """
21
+ head_scores: (num_tasks, num_layers, num_heads)
22
+ sample_sizes: (num_tasks, )
23
+ """
24
+ kl_loss = (head_samples * (torch.log(head_samples + eps) - math.log(prior))).sum(-1).sum(-1)
25
+ kl_loss /= (torch.numel(head_samples) / head_samples.size(0))
26
+ kl_loss = self.kl_weight * torch.matmul(kl_loss, sample_sizes)
27
+ return kl_loss
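A quick numeric sketch (not part of the commit) of the KL regularizer above; the flat import path and the `Namespace` config object are assumptions for illustration.

```python
import torch
from argparse import Namespace
from attention_head_selection import HeadSelectionLoss  # assumed flat import path

loss_fn = HeadSelectionLoss(Namespace(kl_weight=0.01))
head_samples = torch.full((2, 6, 8), 0.5)    # (num_tasks, num_layers, num_heads)
sample_sizes = torch.tensor([100.0, 50.0])   # e.g. number of tokens per task in the batch
print(loss_fn(head_samples, sample_sizes, prior=0.5))
# selection probabilities equal to the prior give a KL term of (almost) zero
```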
fairseq/examples/attention_head_selection/src/models/__init__.py ADDED
File without changes
fairseq/examples/attention_head_selection/src/models/head_selection_s2t_transformer.py ADDED
@@ -0,0 +1,170 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from typing import Dict, List, Optional
8
+ from pathlib import Path
9
+ import torch.nn as nn
10
+ from torch import Tensor
11
+ from fairseq import checkpoint_utils
12
+
13
+ from fairseq.models import register_model, register_model_architecture
14
+ from fairseq.utils import safe_hasattr
15
+ from fairseq.models.speech_to_text.s2t_transformer import (
16
+ S2TTransformerModel,
17
+ S2TTransformerEncoder,
18
+ TransformerDecoderScriptable
19
+ )
20
+ from fairseq.models.speech_to_text.s2t_transformer import base_architecture as s2t_base_architecture
21
+
22
+ from ..modules.attn_head_selector import AttnHeadSelector
23
+ from ..modules.head_selection_transformer_layer import HeadSelectionTransformerEncoderLayer
24
+ from .head_selection_transformer import HeadSelectionTransformerDecoder
25
+
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ @register_model("head_selection_s2t_transformer")
31
+ class HeadSelectionS2TTransformerModel(S2TTransformerModel):
32
+ """
33
+ Head selection implemented in S2TTransformer
34
+ """
35
+ def __init__(self, encoder, decoder):
36
+ super().__init__(encoder, decoder)
37
+
38
+ @staticmethod
39
+ def add_args(parser):
40
+ S2TTransformerModel.add_args(parser)
41
+ # encoder head selection
42
+ parser.add_argument(
43
+ "--encoder-attn-head-select",
44
+ action="store_true",
45
+ default=False,
46
+ help="encoder head selection"
47
+ )
48
+ parser.add_argument(
49
+ "--total-encoder-attention-heads",
50
+ type=int,
51
+ help="total number of encoder attention heads"
52
+ )
53
+ # decoder self attention selection
54
+ parser.add_argument(
55
+ "--decoder-self-attn-head-select",
56
+ action="store_true",
57
+ default=False,
58
+ help="decoder self-attention head selection"
59
+ )
60
+ # decoder-encoder attention selection
61
+ parser.add_argument(
62
+ "--dec-enc-attn-head-select",
63
+ action="store_true",
64
+ default=False,
65
+ help="decoder-encoder attention head selection"
66
+ )
67
+ parser.add_argument(
68
+ "--total-decoder-attention-heads",
69
+ type=int,
70
+ help="total number of decoder attention heads"
71
+ )
72
+ # selection strategy
73
+ parser.add_argument(
74
+ "--attn-head-select-strategy",
75
+ type=str,
76
+ help="attention head selection strategy, subset or group"
77
+ )
78
+
79
+ @classmethod
80
+ def build_encoder(cls, args):
81
+ if safe_hasattr(args, "encoder_attn_head_select") and args.encoder_attn_head_select:
82
+ encoder = HeadSelectionS2TTransformerEncoder(args)
83
+ else:
84
+ encoder = S2TTransformerEncoder(args)
85
+ pretraining_path = getattr(args, "load_pretrained_encoder_from", None)
86
+ if pretraining_path is not None:
87
+ if not Path(pretraining_path).exists():
88
+ logger.warning(
89
+ f"skipped pretraining because {pretraining_path} does not exist"
90
+ )
91
+ else:
92
+ encoder = checkpoint_utils.load_pretrained_component_from_model(
93
+ component=encoder, checkpoint=pretraining_path
94
+ )
95
+ logger.info(f"loaded pretrained encoder from: {pretraining_path}")
96
+ return encoder
97
+
98
+ @classmethod
99
+ def build_decoder(cls, args, task, embed_tokens):
100
+ if (safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select) or (safe_hasattr(args, "dec_enc_attn_head_select") and args.dec_enc_attn_head_select):
101
+ return HeadSelectionTransformerDecoderScriptable(args, task.target_dictionary, embed_tokens)
102
+ else:
103
+ return TransformerDecoderScriptable(args, task.target_dictionary, embed_tokens)
104
+
105
+
106
+ class HeadSelectionS2TTransformerEncoder(S2TTransformerEncoder):
107
+
108
+ def __init__(self, args):
109
+ super().__init__(args)
110
+ self.attn_head_selector = AttnHeadSelector(
111
+ args.encoder_tasks,
112
+ args.encoder_layers,
113
+ args.total_encoder_attention_heads,
114
+ args.encoder_attention_heads,
115
+ args.attn_head_select_strategy,
116
+ )
117
+ self.task_ids = None
118
+ self.transformer_layers = nn.ModuleList([
119
+ HeadSelectionTransformerEncoderLayer(args, layer_idx, attn_head_selector=self.attn_head_selector) for layer_idx in range(args.encoder_layers)
120
+ ])
121
+
122
+ def set_task_ids(self, task_ids):
123
+ self.task_ids = task_ids
124
+
125
+ def _forward(self, src_tokens, src_lengths, return_all_hiddens=False):
126
+ self.attn_head_selector.head_select(self.task_ids)
127
+ return super()._forward(src_tokens, src_lengths, return_all_hiddens)
128
+
129
+
130
+ class HeadSelectionTransformerDecoderScriptable(HeadSelectionTransformerDecoder):
131
+ def extract_features(
132
+ self,
133
+ prev_output_tokens,
134
+ encoder_out: Optional[Dict[str, List[Tensor]]] = None,
135
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
136
+ full_context_alignment: bool = False,
137
+ alignment_layer: Optional[int] = None,
138
+ alignment_heads: Optional[int] = None,
139
+ ):
140
+ # call scriptable method from parent class
141
+ x, _ = self.extract_features_scriptable(
142
+ prev_output_tokens,
143
+ encoder_out,
144
+ incremental_state,
145
+ full_context_alignment,
146
+ alignment_layer,
147
+ alignment_heads,
148
+ )
149
+ return x, None
150
+
151
+
152
+ @register_model_architecture(model_name="head_selection_s2t_transformer", arch_name="head_selection_s2t_transformer")
153
+ def base_architecture(args):
154
+ s2t_base_architecture(args)
155
+ args.encoder_attn_head_select = getattr(args, "encoder_attn_head_select", False)
156
+ args.decoder_self_attn_head_select = getattr(args, "decoder_self_attn_head_select", False)
157
+ args.dec_enc_attn_head_select = getattr(args, "dec_enc_attn_head_select", False)
158
+ args.total_encoder_attention_heads = getattr(args, "total_encoder_attention_heads", 8)
159
+ args.total_decoder_attention_heads = getattr(args, "total_decoder_attention_heads", 8)
160
+ args.attn_head_select_strategy = getattr(args, "attn_head_select_strategy", "group")
161
+
162
+
163
+ @register_model_architecture("head_selection_s2t_transformer", "head_selection_s2t_transformer_s")
164
+ def head_selection_s2t_transformer_s(args):
165
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
166
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 8)
167
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
168
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
169
+ args.dropout = getattr(args, "dropout", 0.1)
170
+ base_architecture(args)
fairseq/examples/attention_head_selection/src/models/head_selection_transformer.py ADDED
@@ -0,0 +1,215 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Any, List, Dict, Optional
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch import Tensor
10
+
11
+ from fairseq.utils import safe_hasattr
12
+ from fairseq.models.transformer import (
13
+ TransformerModel,
14
+ TransformerEncoder,
15
+ TransformerDecoder
16
+ )
17
+
18
+ from ..modules.attn_head_selector import AttnHeadSelector
19
+ from ..modules.head_selection_transformer_layer import (
20
+ HeadSelectionTransformerEncoderLayer,
21
+ HeadSelectionTransformerDecoderLayer
22
+ )
23
+
24
+
25
+ class HeadSelectionTransformerModel(TransformerModel):
26
+ def __init__(self, args, encoder, decoder):
27
+ super().__init__(args, encoder, decoder)
28
+
29
+ @staticmethod
30
+ def add_args(parser):
31
+ TransformerModel.add_args(parser)
32
+ # encoder head selection
33
+ parser.add_argument(
34
+ "--encoder-attn-head-select",
35
+ action="store_true",
36
+ default=False,
37
+ help="encoder head selection"
38
+ )
39
+ parser.add_argument(
40
+ "--total-encoder-attention-heads",
41
+ type=int,
42
+ help="total number of encoder attention heads"
43
+ )
44
+ # decoder self attention
45
+ parser.add_argument(
46
+ "--decoder-self-attn-head-select",
47
+ action="store_true",
48
+ default=False,
49
+ help="decoder self-attention head selection"
50
+ )
51
+ # decoder-encoder attention
52
+ parser.add_argument(
53
+ "--dec-enc-attn-head-select",
54
+ action="store_true",
55
+ default=False,
56
+ help="decoder-encoder attention head selection"
57
+ )
58
+ parser.add_argument(
59
+ "--total-decoder-attention-heads",
60
+ type=int,
61
+ help="total number of decoder attention heads"
62
+ )
63
+ # selection strategy
64
+ parser.add_argument(
65
+ "--attn-head-select-strategy",
66
+ type=str,
67
+ help="attention head selection strategy, subset or group"
68
+ )
69
+
70
+ @classmethod
71
+ def build_encoder(cls, args, src_dict, embed_tokens):
72
+ if safe_hasattr(args, "encoder_attn_head_select") and args.encoder_attn_head_select:
73
+ return HeadSelectionTransformerEncoder(
74
+ args, src_dict, embed_tokens
75
+ )
76
+ else:
77
+ return TransformerEncoder(args, src_dict, embed_tokens)
78
+
79
+ @classmethod
80
+ def build_decoder(cls, args, tgt_dict, embed_tokens):
81
+ if (safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select) or (safe_hasattr(args, "dec_enc_attn_head_select") and args.dec_enc_attn_head_select):
82
+ return HeadSelectionTransformerDecoder(
83
+ args, tgt_dict, embed_tokens
84
+ )
85
+ else:
86
+ return TransformerDecoder(args, tgt_dict, embed_tokens)
87
+
88
+
89
+ class HeadSelectionTransformerEncoder(TransformerEncoder):
90
+
91
+ def __init__(self, args, dictionary, embed_tokens):
92
+ self.num_tasks = args.encoder_tasks
93
+ self.num_layers = args.encoder_layers
94
+ self.total_num_heads = args.total_encoder_attention_heads
95
+ self.num_heads = args.encoder_attention_heads
96
+ self.select_strategy = args.attn_head_select_strategy
97
+
98
+ super().__init__(args, dictionary, embed_tokens)
99
+ self.attn_head_selector = AttnHeadSelector(
100
+ self.num_tasks,
101
+ self.num_layers,
102
+ self.total_num_heads,
103
+ self.num_heads,
104
+ self.select_strategy
105
+ )
106
+ self.task_ids = None
107
+ self.layers = nn.ModuleList(
108
+ [self.build_encoder_layer(args, i) for i in range(args.encoder_layers)]
109
+ )
110
+
111
+ def set_task_ids(self, task_ids):
112
+ self.task_ids = task_ids
113
+
114
+ def build_encoder_layer(self, args, layer_idx=None):
115
+ return HeadSelectionTransformerEncoderLayer(
116
+ args,
117
+ layer_idx,
118
+ attn_head_selector=self.attn_head_selector
119
+ )
120
+
121
+ def forward(
122
+ self,
123
+ src_tokens,
124
+ src_lengths: Optional[torch.Tensor] = None,
125
+ return_all_hiddens: bool = False,
126
+ token_embeddings: Optional[torch.Tensor] = None,
127
+ ):
128
+ self.attn_head_selector.head_select(self.task_ids)
129
+ return super().forward(src_tokens, src_lengths, return_all_hiddens, token_embeddings)
130
+
131
+
132
+ class HeadSelectionTransformerDecoder(TransformerDecoder):
133
+
134
+ def __init__(
135
+ self,
136
+ args,
137
+ dictionary,
138
+ embed_tokens,
139
+ no_encoder_attn=False,
140
+ output_projection=None,
141
+ ):
142
+ self.num_tasks = args.decoder_tasks
143
+ self.num_layers = args.decoder_layers
144
+ self.total_num_heads = args.total_decoder_attention_heads
145
+ self.num_heads = args.decoder_attention_heads
146
+ self.select_strategy = args.attn_head_select_strategy
147
+ super().__init__(
148
+ args, dictionary, embed_tokens,
149
+ no_encoder_attn=no_encoder_attn,
150
+ output_projection=output_projection
151
+ )
152
+ self.self_attn_head_selector = None
153
+ self.enc_attn_head_selector = None
154
+ if safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select:
155
+ self.self_attn_head_selector = AttnHeadSelector(
156
+ self.num_tasks,
157
+ self.num_layers,
158
+ self.total_num_heads,
159
+ self.num_heads,
160
+ self.select_strategy
161
+ )
162
+ if safe_hasattr(args, "dec_enc_attn_head_select") and args.dec_enc_attn_head_select:
163
+ self.enc_attn_head_selector = AttnHeadSelector(
164
+ self.num_tasks,
165
+ self.num_layers,
166
+ self.total_num_heads,
167
+ self.num_heads,
168
+ self.select_strategy
169
+ )
170
+ self.task_ids = None
171
+ self.layers = nn.ModuleList(
172
+ [
173
+ self.build_head_selection_decoder_layer(args, no_encoder_attn, idx) for idx in range(args.decoder_layers)
174
+ ]
175
+ )
176
+
177
+ def set_task_ids(self, task_ids):
178
+ self.task_ids = task_ids
179
+
180
+ def build_head_selection_decoder_layer(self, args, no_encoder_attn=False, layer_idx=None):
181
+ return HeadSelectionTransformerDecoderLayer(
182
+ args,
183
+ layer_idx,
184
+ self.self_attn_head_selector,
185
+ self.enc_attn_head_selector,
186
+ no_encoder_attn=no_encoder_attn
187
+ )
188
+
189
+ def forward(
190
+ self,
191
+ prev_output_tokens,
192
+ encoder_out: Optional[Dict[str, List[Tensor]]] = None,
193
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
194
+ features_only: bool = False,
195
+ full_context_alignment: bool = False,
196
+ alignment_layer: Optional[int] = None,
197
+ alignment_heads: Optional[int] = None,
198
+ src_lengths: Optional[Any] = None,
199
+ return_all_hiddens: bool = False,
200
+ ):
201
+ if self.self_attn_head_selector is not None:
202
+ self.self_attn_head_selector.head_select(self.task_ids)
203
+ if self.enc_attn_head_selector is not None:
204
+ self.enc_attn_head_selector.head_select(self.task_ids)
205
+ return super().forward(
206
+ prev_output_tokens=prev_output_tokens,
207
+ encoder_out=encoder_out,
208
+ incremental_state=incremental_state,
209
+ features_only=features_only,
210
+ full_context_alignment=full_context_alignment,
211
+ alignment_layer=alignment_layer,
212
+ alignment_heads=alignment_heads,
213
+ src_lengths=src_lengths,
214
+ return_all_hiddens=return_all_hiddens
215
+ )
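Both classes above follow the same pattern: the trainer binds per-sentence task ids via `set_task_ids`, and every `forward` call first refreshes the sampled head subset through `AttnHeadSelector.head_select`. The toy sketch below (hypothetical class names, not part of fairseq) isolates just that calling order so the control flow is easy to see.

```python
import torch
import torch.nn as nn


class ToySelector(nn.Module):
    """Stand-in for AttnHeadSelector: only records which task ids were used."""
    def head_select(self, task_ids):
        self.last_task_ids = task_ids


class ToyHeadSelectionEncoder(nn.Module):
    """Mimics the set_task_ids -> forward control flow of the encoders above."""
    def __init__(self):
        super().__init__()
        self.attn_head_selector = ToySelector()
        self.task_ids = None

    def set_task_ids(self, task_ids):
        self.task_ids = task_ids

    def forward(self, src_tokens):
        # Selection is refreshed exactly once per forward pass, before the layers run.
        self.attn_head_selector.head_select(self.task_ids)
        return src_tokens


encoder = ToyHeadSelectionEncoder()
encoder.set_task_ids(torch.tensor([0, 2, 1]))  # one task id per sentence in the batch
_ = encoder(torch.randn(3, 5))
assert torch.equal(encoder.attn_head_selector.last_task_ids, torch.tensor([0, 2, 1]))
```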
fairseq/examples/attention_head_selection/src/modules/__init__.py ADDED
File without changes
fairseq/examples/attention_head_selection/src/modules/attn_head_selector.py ADDED
@@ -0,0 +1,81 @@
1
+ # This source code is licensed under the MIT license found in the
2
+ # LICENSE file in the root directory of this source tree.
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import math
7
+
8
+
9
+ class AttnHeadSelector(nn.Module):
10
+ """
11
+ Latent variable modeling of attention head selection
12
+ """
13
+ def __init__(
14
+ self, num_tasks, num_layers,
15
+ total_num_heads, num_heads,
16
+ select_strategy="group",
17
+ head_select_temp=5.0
18
+ ):
19
+ super(AttnHeadSelector, self).__init__()
20
+ self.num_tasks = num_tasks
21
+ self.num_layers = num_layers
22
+ self.total_num_heads = total_num_heads
23
+ self.num_heads = num_heads
24
+ self.select_strategy = select_strategy
25
+ self.temp = head_select_temp
26
+
27
+ self.head_logits = torch.nn.Parameter(
28
+ torch.Tensor(self.num_tasks, self.num_layers, total_num_heads),
29
+ requires_grad=True
30
+ )
31
+ nn.init.uniform_(
32
+ self.head_logits, a=math.log(0.01),
33
+ b=math.log(1.0)
34
+ )
35
+
36
+ def gumbel_sample(self, logits, tau=1.0):
37
+ gumbels1 = -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_().log()
38
+ gumbels2 = -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_().log()
39
+ gumbels1 = (logits + gumbels1 - gumbels2) / tau
40
+ y_soft = gumbels1.sigmoid()
41
+ return y_soft
42
+
43
+ def subset_select(self, y_soft, topk, dim=-1):
44
+ top_values, top_inds = torch.topk(y_soft, k=topk, dim=dim)
45
+ top_ret = 1.0 - top_values.detach() + top_values
46
+ return top_inds.detach(), top_ret
47
+
48
+ def group_select(self, y_soft, topk, dim=-1):
49
+ # top_values: (num_tasks, num_layers, topk)
50
+ top_values, top_inds = torch.max(
51
+ y_soft.view(self.num_tasks, self.num_layers, -1, topk), dim=2
52
+ )
53
+ top_inds = top_inds * topk + torch.arange(topk, device=top_inds.device).unsqueeze(0).unsqueeze(1)
54
+ top_ret = 1.0 - top_values.detach() + top_values
55
+ return top_inds.detach(), top_ret
56
+
57
+ def head_select(self, task_ids=None):
58
+ # gumbel_sample
59
+ self.head_samples = self.gumbel_sample(self.head_logits, tau=self.temp)
60
+ # head select
61
+ if self.select_strategy == "subset":
62
+ self.subset_heads, self.subset_weights = self.subset_select(
63
+ self.head_samples,
64
+ topk=self.num_heads,
65
+ )
66
+ elif self.select_strategy == "group":
67
+ self.subset_heads, self.subset_weights = self.group_select(
68
+ self.head_samples,
69
+ topk=self.num_heads,
70
+ )
71
+ else:
72
+ raise ValueError("{} is not supported".format(self.select_strategy))
73
+
74
+ self.batch_subset = self.subset_heads[task_ids, :, :]
75
+ self.batch_weights = self.subset_weights[task_ids, :, :]
76
+
77
+ def forward(self, layer_idx):
78
+ assert layer_idx is not None
79
+ batch_subset = self.batch_subset[:, layer_idx, :]
80
+ batch_weights = self.batch_weights[:, layer_idx, :]
81
+ return batch_subset, batch_weights
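To make the sampling above concrete, here is a minimal standalone sketch (illustrative shapes only) of the Gumbel-sigmoid draw and the straight-through trick used by the selector: the forward value of the returned weights is (numerically) 1.0, while the gradient still flows through the soft samples into `head_logits`.

```python
import math
import torch

num_tasks, num_layers, total_heads, k = 3, 2, 8, 4
logits = torch.empty(num_tasks, num_layers, total_heads).uniform_(math.log(0.01), math.log(1.0))
logits.requires_grad_(True)

# Gumbel-sigmoid sample: difference of two Gumbel noises passed through a sigmoid.
g1 = -torch.empty_like(logits).exponential_().log()
g2 = -torch.empty_like(logits).exponential_().log()
y_soft = ((logits + g1 - g2) / 5.0).sigmoid()  # temperature 5.0, as in the module above

# "subset" strategy: keep the top-k heads per (task, layer).
top_values, top_inds = torch.topk(y_soft, k=k, dim=-1)

# Straight-through: forward value is ~1.0, but the gradient flows through top_values.
top_ret = 1.0 - top_values.detach() + top_values
assert torch.allclose(top_ret, torch.ones_like(top_ret))

top_ret.sum().backward()
assert logits.grad is not None  # the underlying logits still receive gradient
```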
fairseq/examples/attention_head_selection/src/modules/head_selection_transformer_layer.py ADDED
@@ -0,0 +1,92 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from fairseq.utils import safe_getattr
7
+ from fairseq.modules import TransformerEncoderLayer, TransformerDecoderLayer
8
+ from ..modules.multihead_attention_selection import MultiheadAttentionSelection
9
+
10
+
11
+ class HeadSelectionTransformerEncoderLayer(TransformerEncoderLayer):
12
+
13
+ def __init__(self, args, layer_idx, attn_head_selector=None):
14
+ super().__init__(args)
15
+ self.layer_idx = layer_idx
16
+ self.self_attn = self.build_self_attention_selection(
17
+ self.embed_dim, args, attn_head_selector
18
+ )
19
+
20
+ def build_self_attention_selection(self, embed_dim, args, attn_head_selector=None):
21
+ return MultiheadAttentionSelection(
22
+ embed_dim,
23
+ args.total_encoder_attention_heads,
24
+ args.encoder_attention_heads,
25
+ dropout=args.attention_dropout,
26
+ self_attention=True,
27
+ q_noise=self.quant_noise,
28
+ qn_block_size=self.quant_noise_block_size,
29
+ layer_idx=self.layer_idx,
30
+ attn_head_selector=attn_head_selector
31
+ )
32
+
33
+
34
+ class HeadSelectionTransformerDecoderLayer(TransformerDecoderLayer):
35
+
36
+ def __init__(
37
+ self,
38
+ args,
39
+ layer_idx,
40
+ self_attn_head_selector=None,
41
+ enc_attn_head_selector=None,
42
+ no_encoder_attn=False,
43
+ add_bias_kv=False,
44
+ add_zero_attn=False,
45
+ ):
46
+ self.layer_idx = layer_idx
47
+ super().__init__(args, no_encoder_attn, add_bias_kv, add_zero_attn)
48
+ if self_attn_head_selector is not None:
49
+ self.self_attn = self.build_self_attention_selection(
50
+ self.embed_dim, args,
51
+ self_attn_head_selector=self_attn_head_selector,
52
+ add_bias_kv=add_bias_kv,
53
+ add_zero_attn=add_zero_attn
54
+ )
55
+ if enc_attn_head_selector is not None:
56
+ self.encoder_attn = self.build_encoder_attention_selection(
57
+ self.embed_dim, args,
58
+ enc_attn_head_selector=enc_attn_head_selector
59
+ )
60
+
61
+ def build_self_attention_selection(
62
+ self, embed_dim, args, self_attn_head_selector=None,
63
+ add_bias_kv=False, add_zero_attn=False
64
+ ):
65
+ return MultiheadAttentionSelection(
66
+ embed_dim,
67
+ args.total_decoder_attention_heads,
68
+ args.decoder_attention_heads,
69
+ dropout=args.attention_dropout,
70
+ add_bias_kv=add_bias_kv,
71
+ add_zero_attn=add_zero_attn,
72
+ self_attention=not safe_getattr(args, "cross_self_attention"),
73
+ q_noise=self.quant_noise,
74
+ qn_block_size=self.quant_noise_block_size,
75
+ layer_idx=self.layer_idx,
76
+ attn_head_selector=self_attn_head_selector,
77
+ )
78
+
79
+ def build_encoder_attention_selection(self, embed_dim, args, enc_attn_head_selector=None):
80
+ return MultiheadAttentionSelection(
81
+ embed_dim,
82
+ args.total_decoder_attention_heads,
83
+ args.decoder_attention_heads,
84
+ kdim=args.encoder_embed_dim,
85
+ vdim=args.encoder_embed_dim,
86
+ dropout=args.attention_dropout,
87
+ encoder_decoder_attention=True,
88
+ q_noise=self.quant_noise,
89
+ qn_block_size=self.quant_noise_block_size,
90
+ layer_idx=self.layer_idx,
91
+ attn_head_selector=enc_attn_head_selector,
92
+ )
fairseq/examples/attention_head_selection/src/modules/multihead_attention_selection.py ADDED
@@ -0,0 +1,355 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Dict, Optional, Tuple
7
+ import torch
8
+ from fairseq import utils
9
+ from fairseq.modules.quant_noise import quant_noise
10
+ from torch import Tensor, nn
11
+ from torch.nn import Parameter
12
+
13
+ from fairseq.modules.multihead_attention import MultiheadAttention
14
+ from ..modules.multihead_functional import multi_head_attention_forward
15
+
16
+
17
+ class MultiheadAttentionSelection(MultiheadAttention):
18
+
19
+ def __init__(
20
+ self,
21
+ embed_dim,
22
+ total_num_heads,
23
+ num_heads,
24
+ kdim=None,
25
+ vdim=None,
26
+ dropout=0.0,
27
+ bias=True,
28
+ add_bias_kv=False,
29
+ add_zero_attn=False,
30
+ self_attention=False,
31
+ encoder_decoder_attention=False,
32
+ q_noise=0.0,
33
+ qn_block_size=8,
34
+ layer_idx=0,
35
+ attn_head_selector=None
36
+ ):
37
+ super().__init__(
38
+ embed_dim,
39
+ num_heads,
40
+ kdim=kdim,
41
+ vdim=vdim,
42
+ dropout=dropout,
43
+ bias=bias,
44
+ add_bias_kv=add_bias_kv,
45
+ add_zero_attn=add_zero_attn,
46
+ self_attention=self_attention,
47
+ encoder_decoder_attention=encoder_decoder_attention,
48
+ q_noise=q_noise,
49
+ qn_block_size=qn_block_size,
50
+ )
51
+ self.layer_idx = layer_idx
52
+ self.attn_head_selector = attn_head_selector
53
+ self.total_num_heads = total_num_heads
54
+ self.total_embed_dim = self.head_dim * total_num_heads
55
+ self.k_proj = quant_noise(
56
+ nn.Linear(self.kdim, self.total_embed_dim, bias=bias), q_noise, qn_block_size
57
+ )
58
+ self.v_proj = quant_noise(
59
+ nn.Linear(self.vdim, self.total_embed_dim, bias=bias), q_noise, qn_block_size
60
+ )
61
+ self.q_proj = quant_noise(
62
+ nn.Linear(embed_dim, self.total_embed_dim, bias=bias), q_noise, qn_block_size
63
+ )
64
+ if add_bias_kv:
65
+ self.bias_k = Parameter(torch.Tensor(1, 1, self.total_embed_dim))
66
+ self.bias_v = Parameter(torch.Tensor(1, 1, self.total_embed_dim))
67
+ else:
68
+ self.bias_k = self.bias_v = None
69
+ self.reset_parameters()
70
+
71
+ def forward(
72
+ self,
73
+ query,
74
+ key: Optional[Tensor],
75
+ value: Optional[Tensor],
76
+ key_padding_mask: Optional[Tensor] = None,
77
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
78
+ need_weights: bool = True,
79
+ static_kv: bool = False,
80
+ attn_mask: Optional[Tensor] = None,
81
+ before_softmax: bool = False,
82
+ need_head_weights: bool = False,
83
+ # subset_heads: Optional[Tensor] = None,
84
+ # subset_weights: Optional[Tensor] = None
85
+ ) -> Tuple[Tensor, Optional[Tensor]]:
86
+ if need_head_weights:
87
+ need_weights = True
88
+
89
+ is_tpu = query.device.type == "xla"
90
+
91
+ subset_heads, subset_weights = self.attn_head_selector(self.layer_idx)
92
+
93
+ tgt_len, bsz, embed_dim = query.size()
94
+ src_len = tgt_len
95
+ assert list(query.size()) == [tgt_len, bsz, self.embed_dim]
96
+ if key is not None:
97
+ src_len, key_bsz, _ = key.size()
98
+ if not torch.jit.is_scripting():
99
+ assert key_bsz == bsz
100
+ assert value is not None
101
+ assert src_len, bsz == value.shape[:2]
102
+
103
+ if (
104
+ not self.onnx_trace
105
+ and not is_tpu # don't use PyTorch version on TPUs
106
+ and incremental_state is None
107
+ and not static_kv
108
+ # A workaround for quantization to work. Otherwise JIT compilation
109
+ # treats bias in linear module as method.
110
+ and not torch.jit.is_scripting()
111
+ ):
112
+ assert key is not None and value is not None
113
+ return multi_head_attention_forward(
114
+ query,
115
+ key,
116
+ value,
117
+ self.embed_dim,
118
+ self.total_num_heads,
119
+ self.num_heads,
120
+ torch.empty([0]),
121
+ torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
122
+ self.bias_k,
123
+ self.bias_v,
124
+ self.add_zero_attn,
125
+ self.dropout_module.p,
126
+ self.out_proj.weight,
127
+ self.out_proj.bias,
128
+ self.training or self.dropout_module.apply_during_inference,
129
+ key_padding_mask,
130
+ need_weights,
131
+ attn_mask,
132
+ use_separate_proj_weight=True,
133
+ q_proj_weight=self.q_proj.weight,
134
+ k_proj_weight=self.k_proj.weight,
135
+ v_proj_weight=self.v_proj.weight,
136
+ subset_heads=subset_heads,
137
+ subset_weights=subset_weights
138
+ )
139
+
140
+ if incremental_state is not None:
141
+ saved_state = self._get_input_buffer(incremental_state)
142
+ if saved_state is not None and "prev_key" in saved_state:
143
+ # previous time steps are cached - no need to recompute
144
+ # key and value if they are static
145
+ if static_kv:
146
+ assert self.encoder_decoder_attention and not self.self_attention
147
+ key = value = None
148
+ else:
149
+ saved_state = None
150
+
151
+ if self.self_attention:
152
+ q = self.q_proj(query)
153
+ k = self.k_proj(query)
154
+ v = self.v_proj(query)
155
+ elif self.encoder_decoder_attention:
156
+ # encoder-decoder attention
157
+ q = self.q_proj(query)
158
+ if key is None:
159
+ assert value is None
160
+ k = v = None
161
+ else:
162
+ k = self.k_proj(key)
163
+ v = self.v_proj(key)
164
+
165
+ else:
166
+ assert key is not None and value is not None
167
+ q = self.q_proj(query)
168
+ k = self.k_proj(key)
169
+ v = self.v_proj(value)
170
+ q *= self.scaling
171
+
172
+ if self.bias_k is not None:
173
+ assert self.bias_v is not None
174
+ k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
175
+ v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
176
+ if attn_mask is not None:
177
+ attn_mask = torch.cat(
178
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
179
+ )
180
+ if key_padding_mask is not None:
181
+ key_padding_mask = torch.cat(
182
+ [
183
+ key_padding_mask,
184
+ key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
185
+ ],
186
+ dim=1,
187
+ )
188
+
189
+ q = (
190
+ q.contiguous()
191
+ .view(tgt_len, bsz * self.total_num_heads, self.head_dim)
192
+ .transpose(0, 1)
193
+ )
194
+ if k is not None:
195
+ k = (
196
+ k.contiguous()
197
+ .view(-1, bsz * self.total_num_heads, self.head_dim)
198
+ .transpose(0, 1)
199
+ )
200
+ if v is not None:
201
+ v = (
202
+ v.contiguous()
203
+ .view(-1, bsz * self.total_num_heads, self.head_dim)
204
+ .transpose(0, 1)
205
+ )
206
+
207
+ if saved_state is not None:
208
+ # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
209
+ if "prev_key" in saved_state:
210
+ _prev_key = saved_state["prev_key"]
211
+ assert _prev_key is not None
212
+ prev_key = _prev_key.view(bsz * self.total_num_heads, -1, self.head_dim)
213
+ if static_kv:
214
+ k = prev_key
215
+ else:
216
+ assert k is not None
217
+ k = torch.cat([prev_key, k], dim=1)
218
+ src_len = k.size(1)
219
+ if "prev_value" in saved_state:
220
+ _prev_value = saved_state["prev_value"]
221
+ assert _prev_value is not None
222
+ prev_value = _prev_value.view(bsz * self.total_num_heads, -1, self.head_dim)
223
+ if static_kv:
224
+ v = prev_value
225
+ else:
226
+ assert v is not None
227
+ v = torch.cat([prev_value, v], dim=1)
228
+ prev_key_padding_mask: Optional[Tensor] = None
229
+ if "prev_key_padding_mask" in saved_state:
230
+ prev_key_padding_mask = saved_state["prev_key_padding_mask"]
231
+ assert k is not None and v is not None
232
+ key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
233
+ key_padding_mask=key_padding_mask,
234
+ prev_key_padding_mask=prev_key_padding_mask,
235
+ batch_size=bsz,
236
+ src_len=k.size(1),
237
+ static_kv=static_kv,
238
+ )
239
+
240
+ saved_state["prev_key"] = k.view(bsz, self.total_num_heads, -1, self.head_dim)
241
+ saved_state["prev_value"] = v.view(bsz, self.total_num_heads, -1, self.head_dim)
242
+ saved_state["prev_key_padding_mask"] = key_padding_mask
243
+ # In this branch incremental_state is never None
244
+ assert incremental_state is not None
245
+ incremental_state = self._set_input_buffer(incremental_state, saved_state)
246
+ assert k is not None
247
+ assert k.size(1) == src_len
248
+
249
+ # This is part of a workaround to get around fork/join parallelism
250
+ # not supporting Optional types.
251
+ if key_padding_mask is not None and key_padding_mask.dim() == 0:
252
+ key_padding_mask = None
253
+
254
+ if key_padding_mask is not None:
255
+ assert key_padding_mask.size(0) == bsz
256
+ assert key_padding_mask.size(1) == src_len
257
+
258
+ if self.add_zero_attn:
259
+ assert v is not None
260
+ src_len += 1
261
+ k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
262
+ v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
263
+ if attn_mask is not None:
264
+ attn_mask = torch.cat(
265
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
266
+ )
267
+ if key_padding_mask is not None:
268
+ key_padding_mask = torch.cat(
269
+ [
270
+ key_padding_mask,
271
+ torch.zeros(key_padding_mask.size(0), 1).type_as(
272
+ key_padding_mask
273
+ ),
274
+ ],
275
+ dim=1,
276
+ )
277
+
278
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
279
+ attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
280
+
281
+ assert list(attn_weights.size()) == [bsz * self.total_num_heads, tgt_len, src_len]
282
+
283
+ if attn_mask is not None:
284
+ attn_mask = attn_mask.unsqueeze(0)
285
+ if self.onnx_trace:
286
+ attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
287
+ attn_weights += attn_mask
288
+
289
+ if key_padding_mask is not None:
290
+ # don't attend to padding symbols
291
+ attn_weights = attn_weights.view(bsz, self.total_num_heads, tgt_len, src_len)
292
+ if not is_tpu:
293
+ attn_weights = attn_weights.masked_fill(
294
+ key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
295
+ float("-inf"),
296
+ )
297
+ else:
298
+ attn_weights = attn_weights.transpose(0, 2)
299
+ attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
300
+ attn_weights = attn_weights.transpose(0, 2)
301
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
302
+
303
+ if before_softmax:
304
+ return attn_weights, v
305
+
306
+ attn_weights_float = utils.softmax(
307
+ attn_weights, dim=-1, onnx_trace=self.onnx_trace
308
+ )
309
+ attn_weights = attn_weights_float.type_as(attn_weights)
310
+ attn_probs = self.dropout_module(attn_weights)
311
+
312
+ assert v is not None
313
+
314
+ # evaluation
315
+ if subset_heads is not None and subset_heads.numel() == 1:
316
+ subset_heads = subset_heads.repeat(bsz)
317
+ subset_weights = subset_weights.repeat(bsz)
318
+
319
+ if subset_heads is None:
320
+ attn = torch.bmm(attn_probs, v)
321
+ else:
322
+ # training with head selection
323
+ mixed_attn = torch.bmm(attn_probs, v).contiguous().view(bsz, self.total_num_heads, tgt_len, self.head_dim)
324
+ attn = torch.stack(
325
+ [mixed_attn[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1
326
+ )
327
+ attn = attn * subset_weights.unsqueeze(2).unsqueeze(3)
328
+ attn = attn.contiguous().view(bsz * self.num_heads, tgt_len, self.head_dim)
329
+
330
+ assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
331
+ if self.onnx_trace and attn.size(1) == 1:
332
+ # when ONNX tracing a single decoder step (sequence length == 1)
333
+ # the transpose is a no-op copy before view, thus unnecessary
334
+ attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
335
+ else:
336
+ attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
337
+ attn = self.out_proj(attn)
338
+ attn_weights: Optional[Tensor] = None
339
+ if need_weights:
340
+ if subset_heads is None:
341
+ attn_weights = attn_weights_float.view(
342
+ bsz, self.num_heads, tgt_len, src_len
343
+ ).transpose(1, 0)
344
+ else:
345
+ mixed_attn_weights = attn_weights_float.view(
346
+ bsz, self.total_num_heads, tgt_len, src_len
347
+ )
348
+ attn_weights = torch.stack(
349
+ [mixed_attn_weights[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1
350
+ ).transpose(1, 0)
351
+ if not need_head_weights:
352
+ # average attention weights over heads
353
+ attn_weights = attn_weights.mean(dim=0)
354
+
355
+ return attn, attn_weights
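The gather step above (stacking `mixed_attn[torch.arange(bsz), subset_heads[:, col]]` over columns) is easy to check in isolation. The sketch below uses made-up shapes to confirm that, for each sentence in the batch, it picks exactly the `num_heads` per-head outputs chosen by the selector and scales them by the straight-through weights.

```python
import torch

bsz, total_heads, num_heads, tgt_len, head_dim = 2, 8, 4, 3, 5
mixed_attn = torch.randn(bsz, total_heads, tgt_len, head_dim)

# One selected head index per (sentence, slot); weights are ~1.0 at evaluation time.
subset_heads = torch.tensor([[0, 2, 5, 7],
                             [1, 3, 4, 6]])
subset_weights = torch.ones(bsz, num_heads)

attn = torch.stack(
    [mixed_attn[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))],
    dim=1,
)
attn = attn * subset_weights.unsqueeze(2).unsqueeze(3)

# Sentence 0 / slot 1 should be head 2 of sentence 0, untouched apart from the unit weight.
assert torch.allclose(attn[0, 1], mixed_attn[0, 2])
assert attn.shape == (bsz, num_heads, tgt_len, head_dim)
```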
fairseq/examples/attention_head_selection/src/modules/multihead_functional.py ADDED
@@ -0,0 +1,278 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Optional, Tuple
7
+ import torch
8
+ from torch import Tensor
9
+ from torch.nn.functional import (
10
+ linear, softmax, dropout, pad,
11
+ has_torch_function,
12
+ handle_torch_function,
13
+ _in_projection_packed,
14
+ )
15
+ import math
16
+ import warnings
17
+
18
+
19
+ def _scaled_dot_product_attention(
20
+ q: Tensor,
21
+ k: Tensor,
22
+ v: Tensor,
23
+ attn_mask: Optional[Tensor] = None,
24
+ dropout_p: float = 0.0,
25
+ bsz: int = 1,
26
+ subset_heads: Optional[Tensor] = None,
27
+ subset_weights: Optional[Tensor] = None,
28
+ ) -> Tuple[Tensor, Tensor]:
29
+ B, Nt, E = q.shape
30
+ q = q / math.sqrt(E)
31
+ # B: bsz * total_num_heads
32
+ # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
33
+ attn = torch.bmm(q, k.transpose(-2, -1))
34
+ if attn_mask is not None:
35
+ attn += attn_mask
36
+ attn = softmax(attn, dim=-1)
37
+ if dropout_p > 0.0:
38
+ attn = dropout(attn, p=dropout_p)
39
+ if subset_heads is None:
40
+ # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
41
+ output = torch.bmm(attn, v)
42
+ else:
43
+ mixed_output = torch.bmm(attn, v).contiguous().view(bsz, -1, Nt, E)
44
+ output = torch.stack(
45
+ [mixed_output[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))],
46
+ dim=1
47
+ )
48
+ output = output * subset_weights.unsqueeze(2).unsqueeze(3)
49
+ output = output.contiguous().view(-1, Nt, E)
50
+ if subset_heads is not None:
51
+ _, Nt, Ns = attn.size()
52
+ mixed_attn = attn.view(bsz, -1, Nt, Ns)
53
+ attn = torch.stack(
54
+ [mixed_attn[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1
55
+ )
56
+ return output, attn
57
+
58
+
59
+ def _in_projection(
60
+ q: Tensor,
61
+ k: Tensor,
62
+ v: Tensor,
63
+ w_q: Tensor,
64
+ w_k: Tensor,
65
+ w_v: Tensor,
66
+ b_q: Optional[Tensor] = None,
67
+ b_k: Optional[Tensor] = None,
68
+ b_v: Optional[Tensor] = None,
69
+ ) -> Tuple[Tensor, Tensor, Tensor]:
70
+ return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
71
+
72
+
73
+ def multi_head_attention_forward(
74
+ query: Tensor,
75
+ key: Tensor,
76
+ value: Tensor,
77
+ embed_dim_to_check: int,
78
+ total_num_heads: int,
79
+ num_heads: int,
80
+ in_proj_weight: Tensor,
81
+ in_proj_bias: Optional[Tensor],
82
+ bias_k: Optional[Tensor],
83
+ bias_v: Optional[Tensor],
84
+ add_zero_attn: bool,
85
+ dropout_p: float,
86
+ out_proj_weight: Tensor,
87
+ out_proj_bias: Optional[Tensor],
88
+ training: bool = True,
89
+ key_padding_mask: Optional[Tensor] = None,
90
+ need_weights: bool = True,
91
+ attn_mask: Optional[Tensor] = None,
92
+ use_separate_proj_weight: bool = False,
93
+ q_proj_weight: Optional[Tensor] = None,
94
+ k_proj_weight: Optional[Tensor] = None,
95
+ v_proj_weight: Optional[Tensor] = None,
96
+ static_k: Optional[Tensor] = None,
97
+ static_v: Optional[Tensor] = None,
98
+ subset_heads: Optional[Tensor] = None,
99
+ subset_weights: Optional[Tensor] = None,
100
+ ):
101
+ tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
102
+ if has_torch_function(tens_ops):
103
+ return handle_torch_function(
104
+ multi_head_attention_forward,
105
+ tens_ops,
106
+ query,
107
+ key,
108
+ value,
109
+ embed_dim_to_check,
110
+ total_num_heads,
111
+ num_heads,
112
+ in_proj_weight,
113
+ in_proj_bias,
114
+ bias_k,
115
+ bias_v,
116
+ add_zero_attn,
117
+ dropout_p,
118
+ out_proj_weight,
119
+ out_proj_bias,
120
+ training=training,
121
+ key_padding_mask=key_padding_mask,
122
+ need_weights=need_weights,
123
+ attn_mask=attn_mask,
124
+ use_separate_proj_weight=use_separate_proj_weight,
125
+ q_proj_weight=q_proj_weight,
126
+ k_proj_weight=k_proj_weight,
127
+ v_proj_weight=v_proj_weight,
128
+ static_k=static_k,
129
+ static_v=static_v,
130
+ subset_heads=subset_heads,
131
+ subset_weights=subset_weights
132
+ )
133
+
134
+ # set up shape vars
135
+ tgt_len, bsz, embed_dim = query.shape
136
+ src_len, _, _ = key.shape
137
+ assert embed_dim == embed_dim_to_check, \
138
+ f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
139
+ if isinstance(embed_dim, torch.Tensor):
140
+ # embed_dim can be a tensor when JIT tracing
141
+ head_dim = embed_dim.div(num_heads, rounding_mode='trunc')
142
+ else:
143
+ head_dim = embed_dim // num_heads
144
+ assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
145
+ if use_separate_proj_weight:
146
+ # allow MHA to have different embedding dimensions when separate projection weights are used
147
+ assert key.shape[:2] == value.shape[:2], \
148
+ f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
149
+ else:
150
+ assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
151
+
152
+ #
153
+ # compute in-projection
154
+ #
155
+ if not use_separate_proj_weight:
156
+ q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
157
+ else:
158
+ assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
159
+ assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
160
+ assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
161
+ if in_proj_bias is None:
162
+ b_q = b_k = b_v = None
163
+ else:
164
+ b_q, b_k, b_v = in_proj_bias.chunk(3)
165
+ q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
166
+
167
+ # prep attention mask
168
+ if attn_mask is not None:
169
+ if attn_mask.dtype == torch.uint8:
170
+ warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
171
+ attn_mask = attn_mask.to(torch.bool)
172
+ else:
173
+ assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, \
174
+ f"Only float, byte, and bool types are supported for attn_mask, not {attn_mask.dtype}"
175
+ # ensure attn_mask's dim is 3
176
+ if attn_mask.dim() == 2:
177
+ correct_2d_size = (tgt_len, src_len)
178
+ if attn_mask.shape != correct_2d_size:
179
+ raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
180
+ attn_mask = attn_mask.unsqueeze(0)
181
+ elif attn_mask.dim() == 3:
182
+ correct_3d_size = (bsz * total_num_heads, tgt_len, src_len)
183
+ if attn_mask.shape != correct_3d_size:
184
+ raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.")
185
+ else:
186
+ raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
187
+
188
+ # prep key padding mask
189
+ if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
190
+ warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
191
+ key_padding_mask = key_padding_mask.to(torch.bool)
192
+
193
+ # add bias along batch dimension (currently second)
194
+ if bias_k is not None and bias_v is not None:
195
+ assert static_k is None, "bias cannot be added to static key."
196
+ assert static_v is None, "bias cannot be added to static value."
197
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
198
+ v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
199
+ if attn_mask is not None:
200
+ attn_mask = pad(attn_mask, (0, 1))
201
+ if key_padding_mask is not None:
202
+ key_padding_mask = pad(key_padding_mask, (0, 1))
203
+ else:
204
+ assert bias_k is None
205
+ assert bias_v is None
206
+
207
+ #
208
+ # reshape q, k, v for multihead attention and make em batch first
209
+ #
210
+ q = q.contiguous().view(tgt_len, bsz * total_num_heads, head_dim).transpose(0, 1)
211
+ if static_k is None:
212
+ k = k.contiguous().view(k.shape[0], bsz * total_num_heads, head_dim).transpose(0, 1)
213
+ else:
214
+ # TODO finish disentangling control flow so we don't do in-projections when statics are passed
215
+ assert static_k.size(0) == bsz * total_num_heads, \
216
+ f"expecting static_k.size(0) of {bsz * total_num_heads}, but got {static_k.size(0)}"
217
+ assert static_k.size(2) == head_dim, \
218
+ f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
219
+ k = static_k
220
+ if static_v is None:
221
+ v = v.contiguous().view(v.shape[0], bsz * total_num_heads, head_dim).transpose(0, 1)
222
+ else:
223
+ # TODO finish disentangling control flow so we don't do in-projections when statics are passed
224
+ assert static_v.size(0) == bsz * total_num_heads, \
225
+ f"expecting static_v.size(0) of {bsz * total_num_heads}, but got {static_v.size(0)}"
226
+ assert static_v.size(2) == head_dim, \
227
+ f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
228
+ v = static_v
229
+
230
+ # add zero attention along batch dimension (now first)
231
+ if add_zero_attn:
232
+ zero_attn_shape = (bsz * total_num_heads, 1, head_dim)
233
+ k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
234
+ v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
235
+ if attn_mask is not None:
236
+ attn_mask = pad(attn_mask, (0, 1))
237
+ if key_padding_mask is not None:
238
+ key_padding_mask = pad(key_padding_mask, (0, 1))
239
+
240
+ # update source sequence length after adjustments
241
+ src_len = k.size(1)
242
+
243
+ # merge key padding and attention masks
244
+ if key_padding_mask is not None:
245
+ assert key_padding_mask.shape == (bsz, src_len), \
246
+ f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
247
+ key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). \
248
+ expand(-1, total_num_heads, -1, -1).reshape(bsz * total_num_heads, 1, src_len)
249
+ if attn_mask is None:
250
+ attn_mask = key_padding_mask
251
+ elif attn_mask.dtype == torch.bool:
252
+ attn_mask = attn_mask.logical_or(key_padding_mask)
253
+ else:
254
+ attn_mask = attn_mask.masked_fill(key_padding_mask, float("-inf"))
255
+
256
+ # convert mask to float
257
+ if attn_mask is not None and attn_mask.dtype == torch.bool:
258
+ new_attn_mask = torch.zeros_like(attn_mask, dtype=torch.float)
259
+ new_attn_mask.masked_fill_(attn_mask, float("-inf"))
260
+ attn_mask = new_attn_mask
261
+
262
+ # adjust dropout probability
263
+ if not training:
264
+ dropout_p = 0.0
265
+
266
+ #
267
+ # (deep breath) calculate attention and out projection
268
+ #
269
+ attn_output, attn_output_weights = _scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, bsz, subset_heads, subset_weights)
270
+ attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
271
+ attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
272
+
273
+ if need_weights:
274
+ # average attention weights over heads
275
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
276
+ return attn_output, attn_output_weights.sum(dim=1) / num_heads
277
+ else:
278
+ return attn_output, None
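Near the end of `multi_head_attention_forward`, the key padding mask is expanded to one row per (sentence, head) and folded into an additive float mask. The following self-contained check (made-up shapes, not fairseq code) shows that conversion: padded positions end up as `-inf`, everything else as `0.0`.

```python
import torch

bsz, total_heads, src_len = 2, 4, 3
key_padding_mask = torch.tensor([[False, False, True],
                                 [False, True, True]])

# Expand to one row per (sentence, head), as done before merging with attn_mask.
expanded = (
    key_padding_mask.view(bsz, 1, 1, src_len)
    .expand(-1, total_heads, -1, -1)
    .reshape(bsz * total_heads, 1, src_len)
)

# Boolean mask -> additive float mask: masked positions become -inf.
float_mask = torch.zeros_like(expanded, dtype=torch.float)
float_mask.masked_fill_(expanded, float("-inf"))

assert float_mask.shape == (bsz * total_heads, 1, src_len)
assert torch.isinf(float_mask[0, 0, 2]) and float_mask[0, 0, 0] == 0.0
```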