]
+ # so source tokens can be used to predict P(S|T)
+ return torch.stack([reorder_tokens(token, length, eos) for token, length in zip(tokens, lengths)])
+
+
+def normalized_scores_with_batch_vocab(
+ model_decoder, features, target_ids, k, bsz, beam_size,
+ pad_idx, top_k=0, vocab_size_meter=None, start_idx=None,
+ end_idx=None, **kwargs):
+ """
+ Get normalized probabilities (or log probs) from a net's output
+ w.r.t. vocab consisting of target IDs in the batch
+ """
+ if model_decoder.adaptive_softmax is None:
+ weight = model_decoder.output_projection.weight
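+ # Restrict the output projection to the token IDs that appear in the batch
+ # (plus the first `top_k` vocabulary IDs); targets are remapped to indices
+ # into this reduced vocabulary before their log-probabilities are gathered.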
+ vocab_ids = torch.unique(
+ torch.cat(
+ (torch.unique(target_ids), torch.arange(top_k, device=target_ids.device))
+ )
+ )
+ id_map = dict(zip(vocab_ids.tolist(), range(len(vocab_ids))))
+ mapped_target_ids = target_ids.cpu().apply_(
+ lambda x, id_map=id_map: id_map[x]
+ ).to(target_ids.device)
+ expanded_target_ids = mapped_target_ids[:, :].repeat(1, k).view(bsz*beam_size*k, -1)
+ if start_idx is not None and end_idx is not None:
+ expanded_target_ids = expanded_target_ids[start_idx:end_idx, :]
+ logits = F.linear(features, weight[vocab_ids, :])
+ log_softmax = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+ intermed_scores = torch.gather(
+ log_softmax[:, :-1, :],
+ 2,
+ expanded_target_ids[:, 1:].unsqueeze(2),
+ ).squeeze()
+ not_padding = expanded_target_ids[:, 1:] != pad_idx
+ intermed_scores *= not_padding.float()
+ return intermed_scores
+ else:
+ raise ValueError("adaptive softmax doesn't work with " +
+ "`normalized_scores_with_batch_vocab()`")
diff --git a/fairseq/examples/fast_noisy_channel/noisy_channel_translation.py b/fairseq/examples/fast_noisy_channel/noisy_channel_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..b74bdfd456f9b7c546ce528173c77431b4f57ac1
--- /dev/null
+++ b/fairseq/examples/fast_noisy_channel/noisy_channel_translation.py
@@ -0,0 +1,127 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.tasks.translation import TranslationTask
+from fairseq.tasks.language_modeling import LanguageModelingTask
+from fairseq import checkpoint_utils
+import argparse
+from fairseq.tasks import register_task
+import torch
+
+
+@register_task("noisy_channel_translation")
+class NoisyChannelTranslation(TranslationTask):
+ """
+ Rescore the top k candidates from each beam using noisy channel modeling
+ """
+
+ @staticmethod
+ def add_args(parser):
+ """Add task-specific arguments to the parser."""
+ TranslationTask.add_args(parser)
+ # fmt: off
+ parser.add_argument('--channel-model', metavar='FILE',
+ help='path to P(S|T) model. P(S|T) and P(T|S) must share source and target dictionaries.')
+ parser.add_argument('--combine-method', default='lm_only',
+ choices=['lm_only', 'noisy_channel'],
+ help="""method for combining direct and channel model scores.
+ lm_only: decode with P(T|S)P(T)
+ noisy_channel: decode with 1/t P(T|S) + 1/s(P(S|T)P(T))""")
+ parser.add_argument('--normalize-lm-scores-by-tgt-len', action='store_true', default=False,
+ help='normalize lm score by target length instead of source length')
+ parser.add_argument('--channel-scoring-type', default='log_norm', choices=['unnormalized', 'log_norm', 'k2_separate', 'src_vocab', 'src_vocab_batched'],
+ help="Normalize bw scores with log softmax or return bw scores without log softmax")
+ parser.add_argument('--top-k-vocab', default=0, type=int,
+ help='top k vocab IDs to use with `src_vocab` in channel model scoring')
+ parser.add_argument('--k2', default=50, type=int,
+ help='the top k2 candidates to rescore with the noisy channel model for each beam')
+ parser.add_argument('--ch-wt', default=1, type=float,
+ help='weight for the channel model')
+ parser.add_argument('--lm-model', metavar='FILE',
+ help='path to lm model file, to model P(T). P(T) must share the same vocab as the direct model on the target side')
+ parser.add_argument('--lm-data', metavar='FILE',
+ help='path to lm model training data for target language, used to properly load LM with correct dictionary')
+ parser.add_argument('--lm-wt', default=1, type=float,
+ help='the weight of the lm in joint decoding')
+ # fmt: on
+
+ def build_generator(
+ self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None
+ ):
+ if getattr(args, "score_reference", False):
+ raise NotImplementedError()
+ else:
+ from .noisy_channel_sequence_generator import NoisyChannelSequenceGenerator
+ use_cuda = torch.cuda.is_available() and not self.args.cpu
+ assert self.args.lm_model is not None, '--lm-model required for noisy channel generation!'
+ assert self.args.lm_data is not None, '--lm-data required for noisy channel generation to map between LM and bitext vocabs'
+ if self.args.channel_model is not None:
+ import copy
+ ch_args_task = copy.deepcopy(self.args)
+ tmp = ch_args_task.source_lang
+ ch_args_task.source_lang = ch_args_task.target_lang
+ ch_args_task.target_lang = tmp
+ ch_args_task._name = 'translation'
+ channel_task = TranslationTask.setup_task(ch_args_task)
+
+ arg_dict = {}
+ arg_dict['task'] = 'language_modeling'
+ arg_dict['sample_break_mode'] = 'eos'
+ arg_dict['data'] = self.args.lm_data
+ arg_dict['output_dictionary_size'] = -1
+ lm_args = argparse.Namespace(**arg_dict)
+ lm_task = LanguageModelingTask.setup_task(lm_args)
+ lm_dict = lm_task.output_dictionary
+
+ if self.args.channel_model is not None:
+ channel_models, _ = checkpoint_utils.load_model_ensemble(self.args.channel_model.split(':'), task=channel_task)
+
+ for model in channel_models:
+ model.make_generation_fast_(
+ beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
+ need_attn=args.print_alignment,
+ )
+ if self.args.fp16:
+ model.half()
+ if use_cuda:
+ model.cuda()
+ else:
+ channel_models = None
+
+ lm_models, _ = checkpoint_utils.load_model_ensemble(self.args.lm_model.split(':'), task=lm_task)
+
+ for model in lm_models:
+ model.make_generation_fast_(
+ beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
+ need_attn=args.print_alignment,
+ )
+ if self.args.fp16:
+ model.half()
+ if use_cuda:
+ model.cuda()
+ return NoisyChannelSequenceGenerator(
+ combine_method=self.args.combine_method,
+ tgt_dict=self.target_dictionary,
+ src_dict=self.source_dictionary,
+ beam_size=getattr(args, 'beam', 5),
+ max_len_a=getattr(args, 'max_len_a', 0),
+ max_len_b=getattr(args, 'max_len_b', 200),
+ min_len=getattr(args, 'min_len', 1),
+ len_penalty=getattr(args, 'lenpen', 1),
+ unk_penalty=getattr(args, 'unkpen', 0),
+ temperature=getattr(args, 'temperature', 1.),
+ match_source_len=getattr(args, 'match_source_len', False),
+ no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0),
+ normalize_scores=(not getattr(args, 'unnormalized', False)),
+ channel_models=channel_models,
+ k2=getattr(self.args, 'k2', 50),
+ ch_weight=getattr(self.args, 'ch_wt', 1),
+ channel_scoring_type=self.args.channel_scoring_type,
+ top_k_vocab=self.args.top_k_vocab,
+ lm_models=lm_models,
+ lm_dict=lm_dict,
+ lm_weight=getattr(self.args, 'lm_wt', 1),
+ normalize_lm_scores_by_tgt_len=getattr(self.args, 'normalize_lm_scores_by_tgt_len', False),
+ )
diff --git a/fairseq/examples/flores101/README.md b/fairseq/examples/flores101/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..635c13f40bd0ccab704735bc5c26ea0192ea98cd
--- /dev/null
+++ b/fairseq/examples/flores101/README.md
@@ -0,0 +1,223 @@
+# Flores101: Large-Scale Multilingual Machine Translation
+
+## Introduction
+
+Baseline pretrained models for the Small and Large tracks of the WMT 21 Large-Scale Multilingual Machine Translation competition.
+
+Flores Task at WMT 21: http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html
+
+Flores announcement blog post: https://ai.facebook.com/blog/flores-researchers-kick-off-multilingual-translation-challenge-at-wmt-and-call-for-compute-grants/
+
+
+
+## Pretrained models
+
+Model | Num layers | Embed dimension | FFN dimension | Vocab size | #params | Download
+---|---|---|---|---|---|---
+`flores101_mm100_615M` | 12 | 1024 | 4096 | 256,000 | 615M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz
+`flores101_mm100_175M` | 6 | 512 | 2048 | 256,000 | 175M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz
+
+
+These models are trained similarly to [M2M-100](https://arxiv.org/abs/2010.11125), with additional support for the languages that are part of the WMT Large-Scale Multilingual Machine Translation track. The full list of languages can be found at the bottom of this page.
+
+
+## Example Generation code
+
+### Download model, sentencepiece vocab
+
+```bash
+fairseq=/path/to/fairseq
+cd $fairseq
+
+# Download 615M param model.
+wget https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz
+
+# Extract
+tar -xvzf flores101_mm100_615M.tar.gz
+```
+
+### Encode using our SentencePiece Model
+Note: Install SentencePiece from [here](https://github.com/google/sentencepiece)
+
+
+```bash
+fairseq=/path/to/fairseq
+cd $fairseq
+
+# Download an example dataset from German to French (first 20 lines of the WMT19 test set)
+sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de
+sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr
+
+for lang in de fr ; do
+ python scripts/spm_encode.py \
+ --model flores101_mm100_615M/sentencepiece.bpe.model \
+ --output_format=piece \
+ --inputs=raw_input.de-fr.${lang} \
+ --outputs=spm.de-fr.${lang}
+done
+```
+
+### Binarization
+
+```bash
+fairseq-preprocess \
+ --source-lang de --target-lang fr \
+ --testpref spm.de-fr \
+ --thresholdsrc 0 --thresholdtgt 0 \
+ --destdir data_bin \
+ --srcdict flores101_mm100_615M/dict.txt --tgtdict flores101_mm100_615M/dict.txt
+```
+
+### Generation
+
+
+```bash
+fairseq-generate \
+ data_bin \
+ --batch-size 1 \
+ --path flores101_mm100_615M/model.pt \
+ --fixed-dictionary flores101_mm100_615M/dict.txt \
+ -s de -t fr \
+ --remove-bpe 'sentencepiece' \
+ --beam 5 \
+ --task translation_multi_simple_epoch \
+ --lang-pairs flores101_mm100_615M/language_pairs.txt \
+ --decoder-langtok --encoder-langtok src \
+ --gen-subset test \
+ --fp16 \
+ --dataset-impl mmap \
+ --distributed-world-size 1 --distributed-no-spawn
+```
+
+### Supported Languages and lang code
+
+Language | lang code
+---|---
+Afrikaans | af
+Amharic | am
+Arabic | ar
+Assamese | as
+Asturian | ast
+Aymara | ay
+Azerbaijani | az
+Bashkir | ba
+Belarusian | be
+Bulgarian | bg
+Bengali | bn
+Breton | br
+Bosnian | bs
+Catalan | ca
+Cebuano | ceb
+Chokwe | cjk
+Czech | cs
+Welsh | cy
+Danish | da
+German | de
+Dyula | dyu
+Greek | el
+English | en
+Spanish | es
+Estonian | et
+Persian | fa
+Fulah | ff
+Finnish | fi
+French | fr
+Western Frisian | fy
+Irish | ga
+Scottish Gaelic | gd
+Galician | gl
+Gujarati | gu
+Hausa | ha
+Hebrew | he
+Hindi | hi
+Croatian | hr
+Haitian Creole | ht
+Hungarian | hu
+Armenian | hy
+Indonesian | id
+Igbo | ig
+Iloko | ilo
+Icelandic | is
+Italian | it
+Japanese | ja
+Javanese | jv
+Georgian | ka
+Kachin | kac
+Kamba | kam
+Kabuverdianu | kea
+Kongo | kg
+Kazakh | kk
+Central Khmer | km
+Kimbundu | kmb
+Northern Kurdish | kmr
+Kannada | kn
+Korean | ko
+Kurdish | ku
+Kyrgyz | ky
+Luxembourgish | lb
+Ganda | lg
+Lingala | ln
+Lao | lo
+Lithuanian | lt
+Luo | luo
+Latvian | lv
+Malagasy | mg
+Maori | mi
+Macedonian | mk
+Malayalam | ml
+Mongolian | mn
+Marathi | mr
+Malay | ms
+Maltese | mt
+Burmese | my
+Nepali | ne
+Dutch | nl
+Norwegian | no
+Northern Sotho | ns
+Nyanja | ny
+Occitan | oc
+Oromo | om
+Oriya | or
+Punjabi | pa
+Polish | pl
+Pashto | ps
+Portuguese | pt
+Quechua | qu
+Romanian | ro
+Russian | ru
+Sindhi | sd
+Shan | shn
+Sinhala | si
+Slovak | sk
+Slovenian | sl
+Shona | sn
+Somali | so
+Albanian | sq
+Serbian | sr
+Swati | ss
+Sundanese | su
+Swedish | sv
+Swahili | sw
+Tamil | ta
+Telugu | te
+Tajik | tg
+Thai | th
+Tigrinya | ti
+Tagalog | tl
+Tswana | tn
+Turkish | tr
+Ukrainian | uk
+Umbundu | umb
+Urdu | ur
+Uzbek | uz
+Vietnamese | vi
+Wolof | wo
+Xhosa | xh
+Yiddish | yi
+Yoruba | yo
+Chinese | zh
+Zulu | zu
diff --git a/fairseq/examples/flores101/flores_logo.png b/fairseq/examples/flores101/flores_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..d4d1455c6eab608ff5317ce885183cd213564273
Binary files /dev/null and b/fairseq/examples/flores101/flores_logo.png differ
diff --git a/fairseq/examples/fully_sharded_data_parallel/README.md b/fairseq/examples/fully_sharded_data_parallel/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b9e44fef48bee5faeee27b3d1d1b1eb96b6a477f
--- /dev/null
+++ b/fairseq/examples/fully_sharded_data_parallel/README.md
@@ -0,0 +1,177 @@
+# Fully Sharded Data Parallel (FSDP)
+
+## Overview
+Recent work by [Microsoft](https://arxiv.org/abs/1910.02054) and
+[Google](https://arxiv.org/abs/2004.13336) has shown that data parallel
+training can be made significantly more efficient by sharding the model
+parameters and optimizer state across data parallel workers. These ideas are
+encapsulated in the new **`FullyShardedDataParallel` (FSDP)** wrapper provided
+by [fairscale](https://github.com/facebookresearch/fairscale/).
+
+Compared to PyTorch DDP:
+* FSDP produces the same results as PyTorch DDP (it's still synchronous data parallel training)
+* FSDP shards parameters (FP16 + FP32) and optimizer state across data parallel GPUs
+* FSDP is faster than PyTorch DDP because the optimizer step is sharded, and the communication can be overlapped with the forward pass
+* FSDP enables training 13B parameter models on 8 GPUs and 175B parameter models on 128 GPUs
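+
+For intuition, the following is a minimal sketch (not part of fairseq) of wrapping a plain PyTorch module directly with fairscale's `FullyShardedDataParallel`; the module, sizes, and optimizer are illustrative, and it assumes a `torch.distributed` process group has already been initialized on each rank (e.g. via `torchrun`):
+
+```python
+import torch
+import torch.nn as nn
+from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
+
+# assumes torch.distributed.init_process_group(...) has already run on each rank
+model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024)).cuda()
+model = FSDP(model)  # parameters are flattened and sharded across data parallel ranks
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # sees only the local shard
+
+x = torch.randn(8, 1024, device="cuda")
+model(x).sum().backward()  # params are all-gathered for compute, grads reduce-scattered
+optimizer.step()           # the Adam update runs on each rank's shard only
+```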
+
+FSDP is fully supported in fairseq via the following new arguments:
+* `--ddp-backend=fully_sharded`: enables full sharding via FSDP
+* `--cpu-offload`: offloads the optimizer state and FP32 model copy to CPU (combine with `--optimizer=cpu_adam`)
+* `--no-reshard-after-forward`: increases training speed for large models (1B+ params) and is similar to ZeRO stage 2
+* other popular options (`--fp16`, `--update-freq`, `--checkpoint-activations`, `--offload-activations`, etc.) continue to work as normal
+
+### Limitations
+
+FSDP currently has several limitations compared to fairseq's default DDP backend (PyTorch DDP):
+* while FSDP is fully compatible with pointwise optimizers (e.g., Adam, AdamW, Adadelta, Adamax, SGD, etc.), it is not currently compatible with non-pointwise optimizers (e.g., Adagrad, Adafactor, LAMB, etc.)
+* FSDP depends on flattening the parameters, so models that currently require `--fp16-no-flatten-grads` may not be supported
+
+See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed
+explanation of these and other limitations.
+
+
+### How it works
+
+See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed
+explanation of how FSDP works.
+
+
+
+## Example usage
+
+The following examples illustrate how to train a very large language model with
+13 billion parameters on 1 GPU by offloading parameters and optimizer states to
+CPU, or on 8 GPUs by fully sharding the params and optimizer states across GPUs.
+
+These examples use the WikiText-103 dataset for demonstration purposes, but
+in practice a much larger dataset will be needed to achieve good results.
+Follow the [instructions here](https://github.com/pytorch/fairseq/blob/main/examples/roberta/README.pretraining.md#1-preprocess-the-data)
+to preprocess the WikiText-103 dataset using the GPT-2/RoBERTa vocabulary.
+
+### 13B params on 1 V100 GPU (with CPU offloading)
+
+The following command trains a 13B parameter GPT-3 model on a single V100 GPU
+using the `--cpu-offload` feature to offload parameters and optimizer states to
+CPU. In this setting, the optimizer step (Adam) happens on CPU. We also use the
+`--checkpoint-activations` feature (sometimes called [gradient checkpointing](https://pytorch.org/docs/stable/checkpoint.html)),
+which further saves memory in exchange for a small increase in computation.
+
+**Requirements:**
+- Install the latest master version of fairscale: `pip install git+https://github.com/facebookresearch/fairscale.git@master`
+- You'll need 32GB of GPU memory and ~256GB of system memory to train the 13B param model.
+- If you have less system memory, the 6.7B param model can be trained with ~128GB of system memory; just set `--arch transformer_lm_gpt3_6_7`.
+- We use the CPU Adam optimizer from [DeepSpeed](https://github.com/microsoft/DeepSpeed), so you'll need to `pip install deepspeed` before running the command.
+
+**Notes:**
+- The command will take ~5 minutes to start training, during which time it will appear to be hung, since randomly initializing 13B weights can be slow.
+- The `--cpu-offload` feature requires training in mixed precision (`--fp16`).
+- Tune the `OMP_NUM_THREADS` env variable for best performance with CPU offloading.
+- The example command below stops training after 10 steps (`--max-update 10`) and does not save checkpoints (`--no-save`).
+
+```bash
+OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0 \
+ fairseq-train data-bin/wikitext-103-roberta-bpe-bin \
+ --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
+ --cpu-offload --checkpoint-activations \
+ --task language_modeling --tokens-per-sample 2048 --batch-size 8 \
+ --arch transformer_lm_gpt3_13 \
+ --optimizer cpu_adam --adam-betas "(0.9,0.98)" \
+ --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
+ --max-update 10 --no-save --log-format json --log-interval 1
+```
+
+**Example output:**
+
+```
+(...)
+2021-03-08 12:29:51 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920)
+(...)
+2021-03-08 12:29:51 | INFO | fairseq_cli.train | training on 1 devices (GPUs/TPUs)
+2021-03-08 12:29:51 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8
+(...)
+Adam Optimizer #0 is created with AVX2 arithmetic capability.
+Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1
+(...)
+2021-03-08 12:31:36 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.475", "ppl": "91120.8", "wps": "0", "ups": "0", "wpb": "16384", "bsz": "8", "num_updates": "1", "lr": "2e-05", "gnorm": "20.751", "loss_scale": "4", "train_wall": "99", "gb_free": "9.3", "wall": "105"}
+2021-03-08 12:32:33 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.446", "ppl": "89281.6", "wps": "288.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "2", "lr": "4e-05", "gnorm": "19.777", "loss_scale": "4", "train_wall": "57", "gb_free": "9.3", "wall": "161"}
+2021-03-08 12:33:12 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0
+2021-03-08 12:33:51 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0
+2021-03-08 12:34:45 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "25.22", "ppl": "3.90691e+07", "wps": "123.4", "ups": "0.01", "wpb": "16384", "bsz": "8", "num_updates": "3", "lr": "6e-05", "gnorm": "131.281", "loss_scale": "1", "train_wall": "133", "gb_free": "9.3", "wall": "294"}
+2021-03-08 12:35:43 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.079", "ppl": "276809", "wps": "285.5", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "4", "lr": "8e-05", "gnorm": "13.776", "loss_scale": "1", "train_wall": "57", "gb_free": "9.3", "wall": "351"}
+2021-03-08 12:36:35 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "23.729", "ppl": "1.39088e+07", "wps": "316.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "5", "lr": "0.0001", "gnorm": "72.774", "loss_scale": "1", "train_wall": "52", "gb_free": "9.3", "wall": "403"}
+2021-03-08 12:37:28 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "20.429", "ppl": "1.41203e+06", "wps": "307.6", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "6", "lr": "8e-05", "gnorm": "60.846", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "456"}
+2021-03-08 12:38:27 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.965", "ppl": "511684", "wps": "279.4", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "7", "lr": "6e-05", "gnorm": "22.687", "loss_scale": "1", "train_wall": "59", "gb_free": "9.3", "wall": "515"}
+2021-03-08 12:39:18 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.345", "ppl": "332887", "wps": "319.1", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "8", "lr": "4e-05", "gnorm": "8.451", "loss_scale": "1", "train_wall": "51", "gb_free": "9.3", "wall": "566"}
+2021-03-08 12:40:11 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "18.262", "ppl": "314336", "wps": "305.9", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "9", "lr": "2e-05", "gnorm": "6.457", "loss_scale": "1", "train_wall": "54", "gb_free": "9.3", "wall": "620"}
+2021-03-08 12:41:04 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "17.556", "ppl": "192686", "wps": "311.8", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "10", "lr": "0", "gnorm": "5.796", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "673"}
+2021-03-08 12:41:04 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10
+2021-03-08 12:41:04 | INFO | fairseq_cli.train | begin validation on "valid" subset
+2021-03-08 12:43:15 | INFO | valid | {"epoch": 1, "valid_loss": "17.953", "valid_ppl": "253807", "valid_wps": "1868.4", "valid_wpb": "15400.2", "valid_bsz": "7.6", "valid_num_updates": "10"}
+2021-03-08 12:43:15 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below)
+2021-03-08 12:43:15 | INFO | train | {"epoch": 1, "train_loss": "19.351", "train_ppl": "668509", "train_wps": "210.9", "train_ups": "0.01", "train_wpb": "16384", "train_bsz": "8", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "36.26", "train_loss_scale": "1", "train_train_wall": "667", "train_gb_free": "9.3", "train_wall": "804"}
+2021-03-08 12:43:15 | INFO | fairseq_cli.train | done training in 798.6 seconds
+```
+
+
+
+### 13B params on 8 V100 GPUs (with full parameter + optimizer state sharding)
+
+FSDP can also shard the parameters and optimizer states across multiple GPUs,
+reducing memory requirements significantly. On 8 x 32GB GPUs, sharding enables
+training the same 13B parameter model *without offloading the parameters to
+CPU*. However, without CPU offloading we'd only be able to fit a batch size of
+1 per GPU, which would cause training speed to suffer.
+
+We obtain the best performance on 8 GPUs by combining full sharding and CPU
+offloading. The following command trains the same 13B parameter GPT-3 model as
+before on 8 x 32GB V100 GPUs; training speed increases superlinearly from ~310
+words per second to ~3200 words per second.
+
+```bash
+OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+ fairseq-train data-bin/wikitext-103-roberta-bpe-bin \
+ --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
+ --cpu-offload --checkpoint-activations \
+ --task language_modeling --tokens-per-sample 2048 --batch-size 8 \
+ --arch transformer_lm_gpt3_13 \
+ --optimizer cpu_adam --adam-betas "(0.9,0.98)" \
+ --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
+ --max-update 10 --no-save --log-format json --log-interval 1
+```
+
+**Example output:**
+
+```
+(...)
+2021-03-08 18:04:09 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920)
+(...)
+2021-03-08 18:04:09 | INFO | fairseq_cli.train | training on 8 devices (GPUs/TPUs)
+2021-03-08 18:04:09 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8
+(...)
+Adam Optimizer #0 is created with AVX2 arithmetic capability.
+Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1
+(...)
+2021-03-08 18:05:06 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "16.408", "ppl": "86945.6", "wps": "0", "ups": "0", "wpb": "131072", "bsz": "64", "num_updates": "1", "lr": "2e-05", "gnorm": "18.27", "loss_scale": "4", "train_wall": "47", "gb_free": "9.3", "wall": "56"}
+2021-03-08 18:05:45 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "16.352", "ppl": "83644.3", "wps": "3283.4", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "2", "lr": "4e-05", "gnorm": "18.411", "loss_scale": "4", "train_wall": "40", "gb_free": "9.3", "wall": "96"}
+2021-03-08 18:06:21 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0
+2021-03-08 18:06:56 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0
+2021-03-08 18:07:37 | INFO | train_inner | {"epoch": 1, "update": 0.006, "loss": "23.682", "ppl": "1.34537e+07", "wps": "1176.6", "ups": "0.01", "wpb": "131072", "bsz": "64", "num_updates": "3", "lr": "6e-05", "gnorm": "119.682", "loss_scale": "1", "train_wall": "111", "gb_free": "9.3", "wall": "208"}
+2021-03-08 18:08:18 | INFO | train_inner | {"epoch": 1, "update": 0.007, "loss": "18.988", "ppl": "519921", "wps": "3189.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "4", "lr": "8e-05", "gnorm": "14.934", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "249"}
+2021-03-08 18:08:59 | INFO | train_inner | {"epoch": 1, "update": 0.008, "loss": "20.08", "ppl": "1.10798e+06", "wps": "3223.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "5", "lr": "0.0001", "gnorm": "59.92", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "289"}
+2021-03-08 18:09:39 | INFO | train_inner | {"epoch": 1, "update": 0.009, "loss": "18.323", "ppl": "327980", "wps": "3256.6", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "6", "lr": "8e-05", "gnorm": "37.425", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "330"}
+2021-03-08 18:10:20 | INFO | train_inner | {"epoch": 1, "update": 0.01, "loss": "17.264", "ppl": "157354", "wps": "3188.7", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "7", "lr": "6e-05", "gnorm": "10.824", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "371"}
+2021-03-08 18:11:01 | INFO | train_inner | {"epoch": 1, "update": 0.011, "loss": "16.794", "ppl": "113647", "wps": "3230", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "8", "lr": "4e-05", "gnorm": "5.616", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "411"}
+2021-03-08 18:11:39 | INFO | train_inner | {"epoch": 1, "update": 0.012, "loss": "16.706", "ppl": "106938", "wps": "3384", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "9", "lr": "2e-05", "gnorm": "5.318", "loss_scale": "1", "train_wall": "39", "gb_free": "9.3", "wall": "450"}
+2021-03-08 18:12:19 | INFO | train_inner | {"epoch": 1, "update": 0.013, "loss": "16.548", "ppl": "95796.2", "wps": "3274.4", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "10", "lr": "0", "gnorm": "5.22", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "490"}
+2021-03-08 18:12:19 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10
+2021-03-08 18:12:19 | INFO | fairseq_cli.train | begin validation on "valid" subset
+2021-03-08 18:12:45 | INFO | valid | {"epoch": 1, "valid_loss": "16.624", "valid_ppl": "101000", "valid_wps": "10855.9", "valid_wpb": "123202", "valid_bsz": "60.5", "valid_num_updates": "10"}
+2021-03-08 18:12:45 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below)
+2021-03-08 18:12:45 | INFO | train | {"epoch": 1, "train_loss": "18.114", "train_ppl": "283776", "train_wps": "2567.8", "train_ups": "0.02", "train_wpb": "131072", "train_bsz": "64", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "29.562", "train_loss_scale": "1", "train_train_wall": "480", "train_gb_free": "9.3", "train_wall": "516"}
+2021-03-08 18:12:45 | INFO | fairseq_cli.train | done training in 509.9 seconds
+```
+
+
diff --git a/fairseq/examples/gottbert/README.md b/fairseq/examples/gottbert/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1d58feb279a4a50222290546c3bb285d3cea98e6
--- /dev/null
+++ b/fairseq/examples/gottbert/README.md
@@ -0,0 +1,64 @@
+# GottBERT: a pure German language model
+
+## Introduction
+
+[GottBERT](http://arxiv.org/abs/2012.02110) is a pretrained German language model based on RoBERTa, trained on 145GB of German text.
+
+## Example usage
+
+### fairseq
+##### Load GottBERT from torch.hub (PyTorch >= 1.1):
+```python
+import torch
+gottbert = torch.hub.load('pytorch/fairseq', 'gottbert-base')
+gottbert.eval() # disable dropout (or leave in train mode to finetune)
+```
+
+##### Load GottBERT (for PyTorch 1.0 or custom models):
+```python
+# Download gottbert model
+wget https://dl.gottbert.de/fairseq/models/gottbert-base.tar.gz
+tar -xzvf gottbert-base.tar.gz
+
+# Load the model in fairseq
+from fairseq.models.roberta import GottbertModel
+gottbert = GottbertModel.from_pretrained('/path/to/gottbert')
+gottbert.eval() # disable dropout (or leave in train mode to finetune)
+```
+
+##### Filling masks:
+```python
+masked_line = 'Gott ist <mask> ! :)'
+gottbert.fill_mask(masked_line, topk=3)
+# [('Gott ist gut ! :)', 0.3642110526561737, ' gut'),
+# ('Gott ist überall ! :)', 0.06009674072265625, ' überall'),
+# ('Gott ist großartig ! :)', 0.0370681993663311, ' großartig')]
+```
+
+##### Extract features from GottBERT
+
+```python
+# Extract the last layer's features
+line = "Der erste Schluck aus dem Becher der Naturwissenschaft macht atheistisch , aber auf dem Grunde des Bechers wartet Gott !"
+tokens = gottbert.encode(line)
+last_layer_features = gottbert.extract_features(tokens)
+assert last_layer_features.size() == torch.Size([1, 27, 768])
+
+# Extract all layers' features (layer 0 is the embedding layer)
+all_layers = gottbert.extract_features(tokens, return_all_hiddens=True)
+assert len(all_layers) == 13
+assert torch.all(all_layers[-1] == last_layer_features)
+```
+## Citation
+If you use our work, please cite:
+
+```bibtex
+@misc{scheible2020gottbert,
+ title={GottBERT: a pure German Language Model},
+ author={Raphael Scheible and Fabian Thomczyk and Patric Tippmann and Victor Jaravine and Martin Boeker},
+ year={2020},
+ eprint={2012.02110},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
diff --git a/fairseq/examples/hubert/README.md b/fairseq/examples/hubert/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6695d819713b3e2077cf0fab30469b237e1cf1be
--- /dev/null
+++ b/fairseq/examples/hubert/README.md
@@ -0,0 +1,116 @@
+# HuBERT
+
+## Pre-trained and fine-tuned (ASR) models
+Model | Pretraining Data | Finetuning Dataset | Download | Quantizer
+|---|---|---|---|---
+HuBERT Base (~95M params) | [Librispeech](http://www.openslr.org/12) 960 hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | [L9 km500](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960_L9_km500.bin)
+HuBERT Large (~316M params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt)
+HuBERT Extra Large (~1B params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k.pt)
+HuBERT Large | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k_finetune_ls960.pt)
+HuBERT Extra Large | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt)
+
+## Load a model
+```python
+import fairseq.checkpoint_utils
+
+ckpt_path = "/path/to/the/checkpoint.pt"
+models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+model = models[0]
+```
+
+## Train a new model
+
+### Data preparation
+
+Follow the steps in `./simple_kmeans` to create:
+- `{train,valid}.tsv` waveform list files
+- `{train,valid}.km` frame-aligned pseudo label files
+- `dict.km.txt` a dummy dictionary
+
+The `label_rate` is the same as the feature frame rate used for clustering,
+which is 100 Hz for MFCC features and 50 Hz for HuBERT features by default.
+
+### Pre-train a HuBERT model
+
+Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km`
+are saved at `/path/to/labels`, and the label rate is 100Hz.
+
+To train a base model (a 12-layer transformer), run:
+```sh
+$ python fairseq_cli/hydra_train.py \
+ --config-dir /path/to/fairseq-py/examples/hubert/config/pretrain \
+ --config-name hubert_base_librispeech \
+ task.data=/path/to/data task.label_dir=/path/to/labels task.labels='["km"]' model.label_rate=100
+```
+
+### Fine-tune a HuBERT model with a CTC loss
+
+Suppose `{train,valid}.tsv` are saved at `/path/to/data`, and their
+corresponding character transcripts `{train,valid}.ltr` are saved at
+`/path/to/trans`.
+
+To fine-tune a pre-trained HuBERT model at `/path/to/checkpoint`, run
+```sh
+$ python fairseq_cli/hydra_train.py \
+ --config-dir /path/to/fairseq-py/examples/hubert/config/finetune \
+ --config-name base_10h \
+ task.data=/path/to/data task.label_dir=/path/to/trans \
+ model.w2v_path=/path/to/checkpoint
+```
+
+### Decode a HuBERT model
+
+Suppose the `test.tsv` and `test.ltr` are the waveform list and transcripts of
+the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is
+saved at `/path/to/checkpoint`. We support three decoding modes:
+- Viterbi decoding: greedy decoding without a language model
+- KenLM decoding: decoding with an arpa-format KenLM n-gram language model
+- Fairseq-LM decoding: decoding with a Fairseq neural language model
+
+
+#### Viterbi decoding
+
+`task.normalize` needs to be consistent with the value used during fine-tuning.
+Decoding results will be saved at
+`/path/to/experiment/directory/decode/viterbi/test`.
+
+```sh
+$ python examples/speech_recognition/new/infer.py \
+ --config-dir /path/to/fairseq-py/examples/hubert/config/decode \
+ --config-name infer_viterbi \
+ task.data=/path/to/data \
+ task.normalize=[true|false] \
+ decoding.exp_dir=/path/to/experiment/directory \
+ common_eval.path=/path/to/checkpoint \
+ dataset.gen_subset=test
+```
+
+#### KenLM / Fairseq-LM decoding
+
+Suppose the pronunciation lexicon and the n-gram LM are saved at
+`/path/to/lexicon` and `/path/to/arpa`, respectively. Decoding results will be
+saved at `/path/to/experiment/directory/decode/kenlm/test`.
+
+```sh
+$ python examples/speech_recognition/new/infer.py \
+ --config-dir /path/to/fairseq-py/examples/hubert/config/decode \
+ --config-name infer_kenlm \
+ task.data=/path/to/data \
+ task.normalize=[true|false] \
+ decoding.exp_dir=/path/to/experiment/directory \
+ common_eval.path=/path/to/checkpoint \
+ dataset.gen_subset=test \
+ decoding.decoder.lexicon=/path/to/lexicon \
+ decoding.decoder.lmpath=/path/to/arpa
+```
+
+The command above uses the default decoding hyperparameters, which can be found
+in `examples/speech_recognition/hydra/decoder.py`. These parameters can be
+overridden from the command line. For example, to search with a beam size of
+500, append `decoding.decoder.beam=500` to the command above.
+Important parameters include:
+- decoding.decoder.beam
+- decoding.decoder.beamthreshold
+- decoding.decoder.lmweight
+- decoding.decoder.wordscore
+- decoding.decoder.silweight
+
+To decode with a Fairseq LM, use `--config-name infer_fsqlm` instead, and
+change the lexicon and LM paths accordingly.
diff --git a/fairseq/examples/hubert/config/decode/ax_sweep/ngram.yaml b/fairseq/examples/hubert/config/decode/ax_sweep/ngram.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5a02df1f7da7eebfebe4018ef2758a716fbab646
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/ax_sweep/ngram.yaml
@@ -0,0 +1,33 @@
+# @package _global_
+
+common_eval:
+ results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset}
+
+hydra:
+ sweeper:
+ ax_config:
+ max_trials: 60
+ early_stop:
+ minimize: true
+ max_epochs_without_improvement: 10
+ epsilon: 0.025
+ experiment:
+ name: ${dataset.gen_subset}
+ objective_name: wer
+ minimize: true
+ parameter_constraints: null
+ outcome_constraints: null
+ status_quo: null
+ client:
+ verbose_logging: false
+ random_seed: null
+ params:
+ decoding.decoder.lmweight:
+ type: range
+ bounds: [0.0, 8.0]
+ decoding.decoder.wordscore:
+ type: range
+ bounds: [-5.0, 5.0]
+ decoding.decoder.silweight:
+ type: range
+ bounds: [-10.0, 0.0]
diff --git a/fairseq/examples/hubert/config/decode/ax_sweep/transformer.yaml b/fairseq/examples/hubert/config/decode/ax_sweep/transformer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..85ed3bd1a5a44871260f572786044c28f441add6
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/ax_sweep/transformer.yaml
@@ -0,0 +1,33 @@
+# @package _global_
+
+common_eval:
+ results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset}
+
+hydra:
+ sweeper:
+ ax_config:
+ max_trials: 60
+ early_stop:
+ minimize: true
+ max_epochs_without_improvement: 10
+ epsilon: 0.025
+ experiment:
+ name: ${dataset.gen_subset}
+ objective_name: wer
+ minimize: true
+ parameter_constraints: null
+ outcome_constraints: null
+ status_quo: null
+ client:
+ verbose_logging: false
+ random_seed: null
+ params:
+ decoding.decoder.lmweight:
+ type: range
+ bounds: [0.0, 4.0]
+ decoding.decoder.wordscore:
+ type: range
+ bounds: [-5.0, 5.0]
+ decoding.decoder.silweight:
+ type: range
+ bounds: [-8.0, 0.0]
diff --git a/fairseq/examples/hubert/config/decode/infer_fsqlm.yaml b/fairseq/examples/hubert/config/decode/infer_fsqlm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..026ad8db89a0673969a99fed6e1e84fc41fc7a1a
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/infer_fsqlm.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+
+defaults:
+ - model: null
+
+hydra:
+ run:
+ dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+ sweep:
+ dir: ${common_eval.results_path}
+ subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+
+task:
+ _name: hubert_pretraining
+ single_target: true
+ fine_tuning: true
+ data: ???
+ normalize: ???
+
+decoding:
+ type: fairseqlm
+ lexicon: ???
+ lmpath: ???
+ beamthreshold: 25
+ beam: 500
+ lmweight: 2
+ wordscore: -1
+ silweight: 0
+ unique_wer_file: true
+common_eval:
+ results_path: ???
+ path: ???
+ post_process: letter
+dataset:
+ max_tokens: 1100000
+ gen_subset: ???
diff --git a/fairseq/examples/hubert/config/decode/infer_kenlm.yaml b/fairseq/examples/hubert/config/decode/infer_kenlm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..04642aeb6530133ab44e12e11e3d1661e3b9c32c
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/infer_kenlm.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+
+defaults:
+ - model: null
+
+hydra:
+ run:
+ dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+ sweep:
+ dir: ${common_eval.results_path}
+ subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+
+task:
+ _name: hubert_pretraining
+ single_target: true
+ fine_tuning: true
+ data: ???
+ normalize: ???
+
+decoding:
+ type: kenlm
+ lexicon: ???
+ lmpath: ???
+ beamthreshold: 100
+ beam: 500
+ lmweight: 2
+ wordscore: -1
+ silweight: 0
+ unique_wer_file: true
+common_eval:
+ results_path: ???
+ path: ???
+ post_process: letter
+dataset:
+ max_tokens: 1100000
+ gen_subset: ???
diff --git a/fairseq/examples/hubert/config/decode/infer_viterbi.yaml b/fairseq/examples/hubert/config/decode/infer_viterbi.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4afc74c18ca890e1a20c6beabeb9059dd0f480f4
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/infer_viterbi.yaml
@@ -0,0 +1,29 @@
+# @package _group_
+
+defaults:
+ - model: null
+
+hydra:
+ run:
+ dir: ${common_eval.results_path}/viterbi
+ sweep:
+ dir: ${common_eval.results_path}
+ subdir: viterbi
+
+task:
+ _name: hubert_pretraining
+ single_target: true
+ fine_tuning: true
+ data: ???
+ normalize: ???
+
+decoding:
+ type: viterbi
+ unique_wer_file: true
+common_eval:
+ results_path: ???
+ path: ???
+ post_process: letter
+dataset:
+ max_tokens: 1100000
+ gen_subset: ???
diff --git a/fairseq/examples/hubert/config/decode/run/submitit_slurm.yaml b/fairseq/examples/hubert/config/decode/run/submitit_slurm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0b8065832ecacf9dd4fe4e99c87941e00fb3ef7f
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/run/submitit_slurm.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+hydra:
+ launcher:
+ cpus_per_task: ${distributed_training.distributed_world_size}
+ gpus_per_node: ${distributed_training.distributed_world_size}
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
+ nodes: 1
+ mem_gb: 200
+ timeout_min: 4320
+ max_num_timeout: 50
+ name: ${hydra.job.config_name}
+ submitit_folder: ${hydra.sweep.dir}/submitit
+
+distributed_training:
+ distributed_world_size: 1
+ distributed_no_spawn: true
+ distributed_port: 29761
diff --git a/fairseq/examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml b/fairseq/examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2f669f376312dbfe4611cc08f4996a314155fb87
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+hydra:
+ launcher:
+ cpus_per_task: ${distributed_training.distributed_world_size}
+ gpus_per_node: ${distributed_training.distributed_world_size}
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
+ nodes: 1
+ mem_gb: 200
+ timeout_min: 4320
+ max_num_timeout: 50
+ name: ${hydra.job.config_name}
+ submitit_folder: ${hydra.sweep.dir}/submitit
+
+distributed_training:
+ distributed_world_size: 8
+ distributed_no_spawn: true
+ distributed_port: 29761
diff --git a/fairseq/examples/hubert/config/finetune/base_10h.yaml b/fairseq/examples/hubert/config/finetune/base_10h.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a22c7c0347f792221f209bcfba7ba380a69f90a8
--- /dev/null
+++ b/fairseq/examples/hubert/config/finetune/base_10h.yaml
@@ -0,0 +1,100 @@
+# @package _group_
+
+common:
+ fp16: true
+ log_format: json
+ log_interval: 200
+ tensorboard_logdir: tblog
+ seed: 1337
+
+checkpoint:
+ save_interval: 5
+ keep_interval_updates: 1
+ no_epoch_checkpoints: true
+ best_checkpoint_metric: wer
+
+distributed_training:
+ ddp_backend: c10d
+ find_unused_parameters: true
+ distributed_world_size: 1
+ distributed_port: 29671
+ nprocs_per_node: 8
+
+task:
+ _name: hubert_pretraining
+ data: ???
+ fine_tuning: true
+ label_dir: ???
+ normalize: false # must be consistent with pre-training
+ labels: ["ltr"]
+ single_target: true
+
+dataset:
+ num_workers: 0
+ max_tokens: 3200000
+ validate_after_updates: ${model.freeze_finetune_updates}
+ validate_interval: 5
+ train_subset: train
+ valid_subset: valid
+
+criterion:
+ _name: ctc
+ zero_infinity: true
+
+optimization:
+ max_update: 25000
+ lr: [2e-5]
+ sentence_avg: true
+ update_freq: [1]
+
+optimizer:
+ _name: adam
+ adam_betas: (0.9,0.98)
+ adam_eps: 1e-08
+
+lr_scheduler:
+ _name: tri_stage
+ warmup_steps: 8000
+ hold_steps: 0
+ decay_steps: 72000
+ final_lr_scale: 0.05
+
+model:
+ _name: hubert_ctc
+ w2v_path: ???
+ apply_mask: true
+ mask_selection: static
+ mask_length: 10
+ mask_other: 0
+ mask_prob: 0.75
+ mask_channel_selection: static
+ mask_channel_length: 64
+ mask_channel_other: 0
+ mask_channel_prob: 0.5
+ layerdrop: 0.1
+ dropout: 0.0
+ activation_dropout: 0.1
+ attention_dropout: 0.0
+ feature_grad_mult: 0.0
+ freeze_finetune_updates: 10000
+
+hydra:
+ job:
+ config:
+ override_dirname:
+ kv_sep: '-'
+ item_sep: '__'
+ exclude_keys:
+ - run
+ - task.data
+ - task.label_dir
+ - model.w2v_path
+ - dataset.train_subset
+ - dataset.valid_subset
+ - criterion.wer_kenlm_model
+ - criterion.wer_lexicon
+ run:
+ dir: ???
+ sweep:
+ dir: ???
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
diff --git a/fairseq/examples/hubert/config/finetune/ckpt/it1.yaml b/fairseq/examples/hubert/config/finetune/ckpt/it1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2af96b3f72746f85feb13e7efcbdab6602b293de
--- /dev/null
+++ b/fairseq/examples/hubert/config/finetune/ckpt/it1.yaml
@@ -0,0 +1,7 @@
+# @package _global_
+
+task:
+ normalize: false
+
+model:
+ w2v_path: /checkpoint/wnhsu/w2v/hubert_final/iter1/hubert.km.randcrop.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU400k.s1337.ngpu32/checkpoint_last.pt
diff --git a/fairseq/examples/hubert/config/finetune/lm/ls_4gram.yaml b/fairseq/examples/hubert/config/finetune/lm/ls_4gram.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8c7728ad29965d3cf18605808a893bc442afd56b
--- /dev/null
+++ b/fairseq/examples/hubert/config/finetune/lm/ls_4gram.yaml
@@ -0,0 +1,7 @@
+# @package _global_
+
+criterion:
+ wer_kenlm_model: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/4-gram.bin
+ wer_lexicon: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/10h/raw/lexicon_ltr.lst
+ wer_lm_weight: 2.0
+ wer_word_score: -1.0
diff --git a/fairseq/examples/hubert/config/finetune/run/submitit_reg.yaml b/fairseq/examples/hubert/config/finetune/run/submitit_reg.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..27509503e7b306c07742fbed2fc5726d001bb7df
--- /dev/null
+++ b/fairseq/examples/hubert/config/finetune/run/submitit_reg.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+
+hydra:
+ launcher:
+ cpus_per_task: 8
+ gpus_per_node: 8
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
+ nodes: 1
+ comment: null
+ mem_gb: 384
+ timeout_min: 4320
+ max_num_timeout: 100
+ constraint: volta32gb
+ name: ${hydra.job.config_name}/${hydra.job.override_dirname}
+ submitit_folder: ${hydra.sweep.dir}/submitit/%j
+
+distributed_training:
+ distributed_world_size: 8
+ distributed_port: 29671
+ nprocs_per_node: 8
diff --git a/fairseq/examples/hubert/config/pretrain/data/iter1.yaml b/fairseq/examples/hubert/config/pretrain/data/iter1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0a1b65d802c83128c53f32b21807fa5e51da6cc9
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/data/iter1.yaml
@@ -0,0 +1,8 @@
+# @package _global_
+
+task:
+ label_dir: ???
+ labels: ["km"]
+
+model:
+ label_rate: 100
diff --git a/fairseq/examples/hubert/config/pretrain/data/iter2.yaml b/fairseq/examples/hubert/config/pretrain/data/iter2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2d4bfe61cc638af9de48e92c58994e435fba2abf
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/data/iter2.yaml
@@ -0,0 +1,8 @@
+# @package _global_
+
+task:
+ label_dir: ???
+ labels: ["km"]
+
+model:
+ label_rate: 50
diff --git a/fairseq/examples/hubert/config/pretrain/hubert_base_librispeech.yaml b/fairseq/examples/hubert/config/pretrain/hubert_base_librispeech.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bd84461a163866f622b01bf6d36b4de6215f3d97
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/hubert_base_librispeech.yaml
@@ -0,0 +1,97 @@
+# @package _group_
+
+common:
+ fp16: true
+ log_format: json
+ log_interval: 200
+ seed: 1337
+ tensorboard_logdir: tblog
+
+checkpoint:
+ save_interval_updates: 25000
+ keep_interval_updates: 1
+ no_epoch_checkpoints: true
+
+
+distributed_training:
+ ddp_backend: no_c10d
+ distributed_backend: 'nccl'
+ distributed_world_size: 32
+ distributed_port: 29671
+ nprocs_per_node: 8
+ find_unused_parameters: true
+
+task:
+ _name: hubert_pretraining
+ data: ???
+ label_dir: ???
+ labels: ???
+ label_rate: ${model.label_rate}
+ sample_rate: 16000
+ max_sample_size: 250000
+ min_sample_size: 32000
+ pad_audio: false
+ random_crop: true
+ normalize: false # must be consistent with extractor
+
+dataset:
+ num_workers: 6
+ max_tokens: 1400000
+ skip_invalid_size_inputs_valid_test: true
+ validate_interval: 5
+ validate_interval_updates: 10000
+
+criterion:
+ _name: hubert
+ pred_masked_weight: 1.0
+ pred_nomask_weight: 0.0
+ loss_weights: [10,]
+
+optimization:
+ max_update: 400000
+ lr: [0.0005]
+ clip_norm: 10.0
+
+optimizer:
+ _name: adam
+ adam_betas: (0.9,0.98)
+ adam_eps: 1e-06
+ weight_decay: 0.01
+
+lr_scheduler:
+ _name: polynomial_decay
+ warmup_updates: 32000
+
+model:
+ _name: hubert
+ label_rate: ???
+ skip_masked: false
+ skip_nomask: false
+ mask_prob: 0.80
+ extractor_mode: default
+ conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+ final_dim: 256
+ encoder_layerdrop: 0.05
+ dropout_input: 0.1
+ dropout_features: 0.1
+ dropout: 0.1
+ attention_dropout: 0.1
+ feature_grad_mult: 0.1
+ untie_final_proj: true
+ activation_dropout: 0.0
+
+hydra:
+ job:
+ config:
+ override_dirname:
+ kv_sep: '-'
+ item_sep: '__'
+ exclude_keys:
+ - run
+ - task.data
+ - task.label_dir
+ run:
+ dir: ???
+ sweep:
+ dir: ???
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
diff --git a/fairseq/examples/hubert/config/pretrain/hubert_large_librivox.yaml b/fairseq/examples/hubert/config/pretrain/hubert_large_librivox.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a5192b5f29b53aa8391a0ab67b6238c0d0b4985e
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/hubert_large_librivox.yaml
@@ -0,0 +1,101 @@
+# @package _group_
+
+common:
+ fp16: true
+ log_format: json
+ log_interval: 200
+ seed: 1337
+ tensorboard_logdir: tblog
+
+checkpoint:
+ save_interval_updates: 25000
+ keep_interval_updates: 1
+ no_epoch_checkpoints: true
+
+
+distributed_training:
+ ddp_backend: no_c10d
+ distributed_backend: 'nccl'
+ distributed_world_size: 128
+ distributed_port: 29671
+ nprocs_per_node: 8
+ find_unused_parameters: true
+
+task:
+ _name: hubert_pretraining
+ data: ???
+ label_dir: ???
+ labels: ???
+ label_rate: ${model.label_rate}
+ sample_rate: 16000
+ max_sample_size: 250000
+ min_sample_size: 32000
+ pad_audio: false
+ random_crop: true
+ normalize: true # must be consistent with extractor
+
+dataset:
+ num_workers: 6
+ max_tokens: 900000
+ skip_invalid_size_inputs_valid_test: true
+ validate_interval: 5
+ validate_interval_updates: 10000
+
+criterion:
+ _name: hubert
+ pred_masked_weight: 1.0
+ pred_nomask_weight: 0.0
+ loss_weights: [10,]
+
+optimization:
+ max_update: 400000
+ lr: [0.0015]
+ clip_norm: 1.0
+
+optimizer:
+ _name: adam
+ adam_betas: (0.9,0.98)
+ adam_eps: 1e-06
+ weight_decay: 0.01
+
+lr_scheduler:
+ _name: polynomial_decay
+ warmup_updates: 32000
+
+model:
+ _name: hubert
+ label_rate: ???
+ encoder_layers: 24
+ encoder_embed_dim: 1024
+ encoder_ffn_embed_dim: 4096
+ encoder_attention_heads: 16
+ final_dim: 768
+ skip_masked: false
+ skip_nomask: false
+ mask_prob: 0.80
+ extractor_mode: layer_norm
+ conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+ encoder_layerdrop: 0.0
+ dropout_input: 0.0
+ dropout_features: 0.0
+ dropout: 0.0
+ attention_dropout: 0.0
+ layer_norm_first: true
+ feature_grad_mult: 1.0
+ untie_final_proj: true
+ activation_dropout: 0.0
+
+hydra:
+ job:
+ config:
+ override_dirname:
+ kv_sep: '-'
+ item_sep: '__'
+ exclude_keys:
+ - run
+ - task.data
+ run:
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
+ sweep:
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
diff --git a/fairseq/examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml b/fairseq/examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..34e8f2bfb93863db122f694785b80857713ceb05
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml
@@ -0,0 +1,101 @@
+# @package _group_
+
+common:
+ fp16: true
+ log_format: json
+ log_interval: 200
+ seed: 1337
+ tensorboard_logdir: tblog
+
+checkpoint:
+ save_interval_updates: 25000
+ keep_interval_updates: 1
+ no_epoch_checkpoints: true
+
+
+distributed_training:
+ ddp_backend: no_c10d
+ distributed_backend: 'nccl'
+ distributed_world_size: 256
+ distributed_port: 29671
+ nprocs_per_node: 8
+ find_unused_parameters: true
+
+task:
+ _name: hubert_pretraining
+ data: ???
+ label_dir: ???
+ labels: ???
+ label_rate: ${model.label_rate}
+ sample_rate: 16000
+ max_sample_size: 250000
+ min_sample_size: 32000
+ pad_audio: false
+ random_crop: true
+ normalize: true # must be consistent with extractor
+
+dataset:
+ num_workers: 6
+ max_tokens: 360000
+ skip_invalid_size_inputs_valid_test: true
+ validate_interval: 5
+ validate_interval_updates: 10000
+
+criterion:
+ _name: hubert
+ pred_masked_weight: 1.0
+ pred_nomask_weight: 0.0
+ loss_weights: [10,]
+
+optimization:
+ max_update: 400000
+ lr: [0.003]
+ clip_norm: 1.0
+
+optimizer:
+ _name: adam
+ adam_betas: (0.9,0.98)
+ adam_eps: 1e-06
+ weight_decay: 0.01
+
+lr_scheduler:
+ _name: polynomial_decay
+ warmup_updates: 32000
+
+model:
+ _name: hubert
+ label_rate: ???
+ encoder_layers: 48
+ encoder_embed_dim: 1280
+ encoder_ffn_embed_dim: 5120
+ encoder_attention_heads: 16
+ final_dim: 1024
+ skip_masked: false
+ skip_nomask: false
+ mask_prob: 0.80
+ extractor_mode: layer_norm
+ conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+ encoder_layerdrop: 0.0
+ dropout_input: 0.0
+ dropout_features: 0.0
+ dropout: 0.0
+ attention_dropout: 0.0
+ layer_norm_first: true
+ feature_grad_mult: 1.0
+ untie_final_proj: true
+ activation_dropout: 0.0
+
+hydra:
+ job:
+ config:
+ override_dirname:
+ kv_sep: '-'
+ item_sep: '__'
+ exclude_keys:
+ - run
+ - task.data
+ run:
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
+ sweep:
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
diff --git a/fairseq/examples/hubert/config/pretrain/run/submitit_reg.yaml b/fairseq/examples/hubert/config/pretrain/run/submitit_reg.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..46c979cd2835fe026b0a532a54533904d1001e54
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/run/submitit_reg.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+
+hydra:
+ launcher:
+ cpus_per_task: 8
+ gpus_per_node: 8
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
+ nodes: 4
+ comment: null
+ mem_gb: 384
+ timeout_min: 4320
+ max_num_timeout: 100
+ constraint: volta32gb
+ name: ${hydra.job.config_name}/${hydra.job.override_dirname}
+ submitit_folder: ${hydra.sweep.dir}/submitit/%j
+
+distributed_training:
+ distributed_world_size: 32
+ distributed_port: 29671
+ nprocs_per_node: 8
diff --git a/fairseq/examples/hubert/measure_teacher_quality.py b/fairseq/examples/hubert/measure_teacher_quality.py
new file mode 100644
index 0000000000000000000000000000000000000000..92279b2214bb2ba4a99aea92098907ef4f55821b
--- /dev/null
+++ b/fairseq/examples/hubert/measure_teacher_quality.py
@@ -0,0 +1,241 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import os.path as op
+import re
+from tabulate import tabulate
+from collections import Counter
+
+
+def comp_purity(p_xy, axis):
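+ # For each bin along the non-reduced axis, purity is the fraction of its
+ # probability mass captured by its single best-matching label; aggr_pur
+ # sums the per-bin maxima, i.e. the mass-weighted average purity.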
+ max_p = p_xy.max(axis=axis)
+ marg_p = p_xy.sum(axis=axis)
+ indv_pur = max_p / marg_p
+ aggr_pur = max_p.sum()
+ return indv_pur, aggr_pur
+
+
+def comp_entropy(p):
+ return (-p * np.log(p + 1e-8)).sum()
+
+
+def comp_norm_mutual_info(p_xy):
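+ # Mutual information of the joint (reference, hypothesis) distribution,
+ # together with MI normalized by each marginal entropy and the two
+ # marginal entropies themselves.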
+ p_x = p_xy.sum(axis=1, keepdims=True)
+ p_y = p_xy.sum(axis=0, keepdims=True)
+ pmi = np.log(p_xy / np.matmul(p_x, p_y) + 1e-8)
+ mi = (p_xy * pmi).sum()
+ h_x = comp_entropy(p_x)
+ h_y = comp_entropy(p_y)
+ return mi, mi / h_x, mi / h_y, h_x, h_y
+
+
+def pad(labs, n):
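+ # Replicate the first/last label n times on each side, to account for
+ # labels derived from spliced (context-padded) features.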
+ if n == 0:
+ return np.array(labs)
+ return np.concatenate([[labs[0]] * n, labs, [labs[-1]] * n])
+
+
+def comp_avg_seg_dur(labs_list):
+ n_frms = 0
+ n_segs = 0
+ for labs in labs_list:
+ labs = np.array(labs)
+ edges = np.zeros(len(labs)).astype(bool)
+ edges[0] = True
+ edges[1:] = labs[1:] != labs[:-1]
+ n_frms += len(edges)
+ n_segs += edges.astype(int).sum()
+ return n_frms / n_segs
+
+
+def comp_joint_prob(uid2refs, uid2hyps):
+ """
+ Args:
+ pad: padding for spliced-feature derived labels
+ """
+ cnts = Counter()
+ skipped = []
+ abs_frmdiff = 0
+ for uid in uid2refs:
+ if uid not in uid2hyps:
+ skipped.append(uid)
+ continue
+ refs = uid2refs[uid]
+ hyps = uid2hyps[uid]
+ abs_frmdiff += abs(len(refs) - len(hyps))
+ min_len = min(len(refs), len(hyps))
+ refs = refs[:min_len]
+ hyps = hyps[:min_len]
+ cnts.update(zip(refs, hyps))
+ tot = sum(cnts.values())
+
+ ref_set = sorted({ref for ref, _ in cnts.keys()})
+ hyp_set = sorted({hyp for _, hyp in cnts.keys()})
+ ref2pid = dict(zip(ref_set, range(len(ref_set))))
+ hyp2lid = dict(zip(hyp_set, range(len(hyp_set))))
+ # print(hyp_set)
+ p_xy = np.zeros((len(ref2pid), len(hyp2lid)), dtype=float)
+ for (ref, hyp), cnt in cnts.items():
+ p_xy[ref2pid[ref], hyp2lid[hyp]] = cnt
+ p_xy /= p_xy.sum()
+ return p_xy, ref2pid, hyp2lid, tot, abs_frmdiff, skipped
+
+
+def read_phn(tsv_path, rm_stress=True):
+ uid2phns = {}
+ with open(tsv_path) as f:
+ for line in f:
+ uid, phns = line.rstrip().split("\t")
+ phns = phns.split(",")
+ if rm_stress:
+ phns = [re.sub("[0-9]", "", phn) for phn in phns]
+ uid2phns[uid] = phns
+ return uid2phns
+
+
+def read_lab(tsv_path, lab_path, pad_len=0, upsample=1):
+ """
+ tsv is needed to retrieve the uids for the labels
+ """
+ with open(tsv_path) as f:
+ f.readline()
+ uids = [op.splitext(op.basename(line.rstrip().split()[0]))[0] for line in f]
+ with open(lab_path) as f:
+ labs_list = [pad(line.rstrip().split(), pad_len).repeat(upsample) for line in f]
+ assert len(uids) == len(labs_list)
+ return dict(zip(uids, labs_list))
+
+
+def main_lab_lab(
+ tsv_dir,
+ lab_dir,
+ lab_name,
+ lab_sets,
+ ref_dir,
+ ref_name,
+ pad_len=0,
+ upsample=1,
+ verbose=False,
+):
+ # assume tsv_dir is the same for both the reference and the hypotheses
+ tsv_dir = lab_dir if tsv_dir is None else tsv_dir
+
+ uid2refs = {}
+ for s in lab_sets:
+ uid2refs.update(read_lab(f"{tsv_dir}/{s}.tsv", f"{ref_dir}/{s}.{ref_name}"))
+
+ uid2hyps = {}
+ for s in lab_sets:
+ uid2hyps.update(
+ read_lab(
+ f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample
+ )
+ )
+ _main(uid2refs, uid2hyps, verbose)
+
+
+def main_phn_lab(
+ tsv_dir,
+ lab_dir,
+ lab_name,
+ lab_sets,
+ phn_dir,
+ phn_sets,
+ pad_len=0,
+ upsample=1,
+ verbose=False,
+):
+ uid2refs = {}
+ for s in phn_sets:
+ uid2refs.update(read_phn(f"{phn_dir}/{s}.tsv"))
+
+ uid2hyps = {}
+ tsv_dir = lab_dir if tsv_dir is None else tsv_dir
+ for s in lab_sets:
+ uid2hyps.update(
+ read_lab(
+ f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample
+ )
+ )
+ _main(uid2refs, uid2hyps, verbose)
+
+
+def _main(uid2refs, uid2hyps, verbose):
+ (p_xy, ref2pid, hyp2lid, tot, frmdiff, skipped) = comp_joint_prob(
+ uid2refs, uid2hyps
+ )
+ ref_pur_by_hyp, ref_pur = comp_purity(p_xy, axis=0)
+ hyp_pur_by_ref, hyp_pur = comp_purity(p_xy, axis=1)
+ (mi, mi_norm_by_ref, mi_norm_by_hyp, h_ref, h_hyp) = comp_norm_mutual_info(p_xy)
+ outputs = {
+ "ref pur": ref_pur,
+ "hyp pur": hyp_pur,
+ "H(ref)": h_ref,
+ "H(hyp)": h_hyp,
+ "MI": mi,
+ "MI/H(ref)": mi_norm_by_ref,
+ "ref segL": comp_avg_seg_dur(uid2refs.values()),
+ "hyp segL": comp_avg_seg_dur(uid2hyps.values()),
+ "p_xy shape": p_xy.shape,
+ "frm tot": tot,
+ "frm diff": frmdiff,
+ "utt tot": len(uid2refs),
+ "utt miss": len(skipped),
+ }
+ print(tabulate([outputs.values()], outputs.keys(), floatfmt=".4f"))
+
+
+if __name__ == "__main__":
+ """
+    Compute the quality of the hypothesized labels against frame-level phone
+    labels, or against another label set if --ref_lab_dir/--ref_lab_name are given.
+ """
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("tsv_dir")
+ parser.add_argument("lab_dir")
+ parser.add_argument("lab_name")
+ parser.add_argument("--lab_sets", default=["valid"], type=str, nargs="+")
+ parser.add_argument(
+ "--phn_dir",
+ default="/checkpoint/wnhsu/data/librispeech/960h/fa/raw_phn/phone_frame_align_v1",
+ )
+ parser.add_argument(
+ "--phn_sets", default=["dev-clean", "dev-other"], type=str, nargs="+"
+ )
+ parser.add_argument("--pad_len", default=0, type=int, help="padding for hypotheses")
+ parser.add_argument(
+ "--upsample", default=1, type=int, help="upsample factor for hypotheses"
+ )
+ parser.add_argument("--ref_lab_dir", default="")
+ parser.add_argument("--ref_lab_name", default="")
+ parser.add_argument("--verbose", action="store_true")
+ args = parser.parse_args()
+
+ if args.ref_lab_dir and args.ref_lab_name:
+ main_lab_lab(
+ args.tsv_dir,
+ args.lab_dir,
+ args.lab_name,
+ args.lab_sets,
+ args.ref_lab_dir,
+ args.ref_lab_name,
+ args.pad_len,
+ args.upsample,
+ args.verbose,
+ )
+ else:
+ main_phn_lab(
+ args.tsv_dir,
+ args.lab_dir,
+ args.lab_name,
+ args.lab_sets,
+ args.phn_dir,
+ args.phn_sets,
+ args.pad_len,
+ args.upsample,
+ args.verbose,
+ )
diff --git a/fairseq/examples/hubert/simple_kmeans/README.md b/fairseq/examples/hubert/simple_kmeans/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..847475c23f8a6a47bb25cba83466ddd9eba167b8
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/README.md
@@ -0,0 +1,80 @@
+# Sharded Feature Extraction and K-means Application
+
+This folder contains scripts for preparing HUBERT labels from tsv files. The
+steps are:
+1. feature extraction
+2. k-means clustering
+3. k-means application
+
+
+## Data preparation
+
+`*.tsv` files contain a list of audio files: the first line is the root
+directory, and each following line gives the subpath of an audio file and its
+number of samples, tab-separated:
+```
+<root-dir>
+<audio-subpath-1>  <num-samples-1>
+<audio-subpath-2>  <num-samples-2>
+...
+```
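+
+A minimal sketch (not part of these scripts) for writing such a manifest with
+`soundfile`, assuming a hypothetical `audio_root` directory of 16 kHz wav files:
+```python
+import os
+
+import soundfile as sf
+
+audio_root = "/path/to/audio"  # placeholder root directory
+with open("train.tsv", "w") as f:
+    f.write(audio_root + "\n")
+    for dirpath, _, filenames in os.walk(audio_root):
+        for name in sorted(filenames):
+            if not name.endswith(".wav"):
+                continue
+            path = os.path.join(dirpath, name)
+            # number of samples per file, read by the feature extraction scripts
+            nsample = sf.info(path).frames
+            f.write(f"{os.path.relpath(path, audio_root)}\t{nsample}\n")
+```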
+
+
+## Feature extraction
+
+### MFCC feature
+Suppose the tsv file is at `${tsv_dir}/${split}.tsv`. To extract 39-D
+mfcc+delta+ddelta features for the 1st iteration HUBERT training, run:
+```sh
+python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir}
+```
+This shards the tsv file into `${nshard}` shards and extracts features for the
+`${rank}`-th shard, where `rank` is an integer in `[0, nshard-1]`. Features are
+saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
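+
+Each `.npy` shard stores the frame-level features of all utterances in the
+shard concatenated along the time axis, and the matching `.len` file stores one
+frame count per utterance. A small sketch (mirroring `get_feat_iterator` in
+`dump_km_label.py`; the shard paths are placeholders) for reading one utterance
+back:
+```python
+import numpy as np
+
+feat_path = "feat_dir/train_0_1.npy"  # placeholder shard paths
+leng_path = "feat_dir/train_0_1.len"
+
+with open(leng_path) as f:
+    lengs = [int(line.rstrip()) for line in f]
+offsets = [0] + np.cumsum(lengs[:-1]).tolist()
+
+feat = np.load(feat_path, mmap_mode="r")
+first_utt = feat[offsets[0]: offsets[0] + lengs[0]]  # (n_frames, feat_dim)
+```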
+
+
+### HUBERT feature
+To extract features from the `${layer}`-th transformer layer of a trained
+HUBERT model saved at `${ckpt_path}`, run:
+```sh
+python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir}
+```
+Features are also saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
+
+- if you run out of memory, decrease the chunk size with `--max_chunk`
+
+
+## K-means clustering
+To fit a k-means model with `${n_clusters}` clusters on 10% of the `${split}` data, run
+```sh
+python learn_kmeans.py ${feat_dir} ${split} ${nshard} ${km_path} ${n_clusters} --percent 0.1
+```
+This saves the k-means model to `${km_path}`.
+
+- set `--percent -1` to use all data
+- more k-means options can be found with the `-h` flag
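+
+The saved model is a pickled scikit-learn `MiniBatchKMeans`; a quick sketch for
+loading and inspecting it with `joblib` (as `dump_km_label.py` does before
+assigning labels), with a placeholder path:
+```python
+import joblib
+
+km_model = joblib.load("/path/to/km_model")  # placeholder path
+print(km_model.n_clusters, km_model.cluster_centers_.shape)  # centers: (n_clusters, feat_dim)
+```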
+
+
+## K-means application
+To apply a trained k-means model `${km_path}` to obtain labels for `${split}`, run
+```sh
+python dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}
+```
+This extracts labels for the `${rank}`-th shard out of `${nshard}` shards
+and dumps them to `${lab_dir}/${split}_${rank}_${nshard}.km`.
+
+
+Finally, merge shards for `${split}` by running
+```sh
+for rank in $(seq 0 $((nshard - 1))); do
+ cat $lab_dir/${split}_${rank}_${nshard}.km
+done > $lab_dir/${split}.km
+```
+
+
+## Create a dummy dict
+To create a dummy dictionary, run
+```sh
+for x in $(seq 0 $((n_clusters - 1))); do
+ echo "$x 1"
+done >> $lab_dir/dict.km.txt
+```
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature.py b/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ea4ea0aa93046a133722511311a2735796cefeb
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature.py
@@ -0,0 +1,93 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import fairseq
+import soundfile as sf
+import torch
+import torch.nn.functional as F
+
+from feature_utils import get_path_iterator, dump_feature
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_hubert_feature")
+
+
+class HubertFeatureReader(object):
+ def __init__(self, ckpt_path, layer, max_chunk=1600000):
+ (
+ model,
+ cfg,
+ task,
+ ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+ self.model = model[0].eval().cuda()
+ self.task = task
+ self.layer = layer
+ self.max_chunk = max_chunk
+ logger.info(f"TASK CONFIG:\n{self.task.cfg}")
+ logger.info(f" max_chunk = {self.max_chunk}")
+
+ def read_audio(self, path, ref_len=None):
+ wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
+ if wav.ndim == 2:
+ wav = wav.mean(-1)
+ assert wav.ndim == 1, wav.ndim
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
+ logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
+ return wav
+
+ def get_feats(self, path, ref_len=None):
+ x = self.read_audio(path, ref_len=ref_len)
+ with torch.no_grad():
+ x = torch.from_numpy(x).float().cuda()
+ if self.task.cfg.normalize:
+ x = F.layer_norm(x, x.shape)
+ x = x.view(1, -1)
+
+ feat = []
+ for start in range(0, x.size(1), self.max_chunk):
+ x_chunk = x[:, start : start + self.max_chunk]
+ feat_chunk, _ = self.model.extract_features(
+ source=x_chunk,
+ padding_mask=None,
+ mask=False,
+ output_layer=self.layer,
+ )
+ feat.append(feat_chunk)
+ return torch.cat(feat, 1).squeeze(0)
+
+
+def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk):
+ reader = HubertFeatureReader(ckpt_path, layer, max_chunk)
+ generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
+ dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("tsv_dir")
+ parser.add_argument("split")
+ parser.add_argument("ckpt_path")
+ parser.add_argument("layer", type=int)
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("feat_dir")
+ parser.add_argument("--max_chunk", type=int, default=1600000)
+ args = parser.parse_args()
+ logger.info(args)
+
+ main(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py b/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py
new file mode 100644
index 0000000000000000000000000000000000000000..941bc1b675459b800b7e006f2ff9c2305c0dd8e8
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py
@@ -0,0 +1,95 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import csv
+import io
+import logging
+import os
+import os.path as op
+import sys
+
+from dump_hubert_feature import HubertFeatureReader
+from feature_utils import get_shard_range, dump_feature
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_hubert_feature_s2t")
+
+
+class HubertFeatureReaderS2T(HubertFeatureReader):
+ def read_audio(self, path, ref_len=None):
+ wav = get_features_or_waveform(
+ path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate
+ )
+ if wav.ndim == 2:
+ wav = wav.mean(-1)
+ assert wav.ndim == 1, wav.ndim
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
+ logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
+ return wav
+
+
+def get_path_iterator(root, tsv, nshard, rank, audio_col_name):
+ with open(tsv) as f:
+ reader = csv.DictReader(
+ f,
+ delimiter="\t",
+ quotechar=None,
+ doublequote=False,
+ lineterminator="\n",
+ quoting=csv.QUOTE_NONE,
+ )
+ subpaths = [op.join(root, e[audio_col_name]) for e in reader]
+ start, end = get_shard_range(len(subpaths), nshard, rank)
+ subpaths = subpaths[start:end]
+
+    def iterate():
+        # subpaths were already joined with root above
+        for subpath in subpaths:
+            yield subpath, None
+
+ return iterate, len(subpaths)
+
+
+def main(
+ root,
+ tsv_path,
+ ckpt_path,
+ layer,
+ nshard,
+ rank,
+ feat_dir,
+ split,
+ max_chunk,
+ audio_col_name,
+):
+ reader = HubertFeatureReaderS2T(ckpt_path, layer, max_chunk)
+ generator, num = get_path_iterator(root, tsv_path, nshard, rank, audio_col_name)
+ dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("root")
+ parser.add_argument("tsv_path")
+ parser.add_argument("ckpt_path")
+ parser.add_argument("layer", type=int)
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("feat_dir")
+ parser.add_argument("split")
+ parser.add_argument("--audio_col_name", type=str, default="audio")
+ parser.add_argument("--max_chunk", type=int, default=1600000)
+ args = parser.parse_args()
+ logger.info(args)
+
+ main(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_km_label.py b/fairseq/examples/hubert/simple_kmeans/dump_km_label.py
new file mode 100644
index 0000000000000000000000000000000000000000..8871307804d3f1e5c7cc49061614c69df26ab1ee
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_km_label.py
@@ -0,0 +1,98 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import numpy as np
+
+import joblib
+import torch
+import tqdm
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_km_label")
+
+
+class ApplyKmeans(object):
+ def __init__(self, km_path):
+ self.km_model = joblib.load(km_path)
+ self.C_np = self.km_model.cluster_centers_.transpose()
+ self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)
+
+ self.C = torch.from_numpy(self.C_np)
+ self.Cnorm = torch.from_numpy(self.Cnorm_np)
+ if torch.cuda.is_available():
+ self.C = self.C.cuda()
+ self.Cnorm = self.Cnorm.cuda()
+
+ def __call__(self, x):
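+        # nearest-centroid assignment via squared distance ||x||^2 - 2 x @ C + ||C||^2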
+ if isinstance(x, torch.Tensor):
+ dist = (
+ x.pow(2).sum(1, keepdim=True)
+ - 2 * torch.matmul(x, self.C)
+ + self.Cnorm
+ )
+ return dist.argmin(dim=1).cpu().numpy()
+ else:
+ dist = (
+ (x ** 2).sum(1, keepdims=True)
+ - 2 * np.matmul(x, self.C_np)
+ + self.Cnorm_np
+ )
+ return np.argmin(dist, axis=1)
+
+
+def get_feat_iterator(feat_dir, split, nshard, rank):
+ feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+ leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+ with open(leng_path, "r") as f:
+ lengs = [int(line.rstrip()) for line in f]
+ offsets = [0] + np.cumsum(lengs[:-1]).tolist()
+
+ def iterate():
+ feat = np.load(feat_path, mmap_mode="r")
+ assert feat.shape[0] == (offsets[-1] + lengs[-1])
+ for offset, leng in zip(offsets, lengs):
+ yield feat[offset: offset + leng]
+
+ return iterate, len(lengs)
+
+
+def dump_label(feat_dir, split, km_path, nshard, rank, lab_dir):
+ apply_kmeans = ApplyKmeans(km_path)
+ generator, num = get_feat_iterator(feat_dir, split, nshard, rank)
+ iterator = generator()
+
+ lab_path = f"{lab_dir}/{split}_{rank}_{nshard}.km"
+ os.makedirs(lab_dir, exist_ok=True)
+ with open(lab_path, "w") as f:
+ for feat in tqdm.tqdm(iterator, total=num):
+ # feat = torch.from_numpy(feat).cuda()
+ lab = apply_kmeans(feat).tolist()
+ f.write(" ".join(map(str, lab)) + "\n")
+ logger.info("finished successfully")
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("feat_dir")
+ parser.add_argument("split")
+ parser.add_argument("km_path")
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("lab_dir")
+ args = parser.parse_args()
+ logging.info(str(args))
+
+ dump_label(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_mfcc_feature.py b/fairseq/examples/hubert/simple_kmeans/dump_mfcc_feature.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3537784d1d390701e96951d6e39f63f2023e32a
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_mfcc_feature.py
@@ -0,0 +1,74 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import soundfile as sf
+import torch
+import torchaudio
+
+from feature_utils import get_path_iterator, dump_feature
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_mfcc_feature")
+
+
+class MfccFeatureReader(object):
+ def __init__(self, sample_rate):
+ self.sample_rate = sample_rate
+
+ def read_audio(self, path, ref_len=None):
+ wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.sample_rate)
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
+ logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
+ return wav
+
+ def get_feats(self, path, ref_len=None):
+ x = self.read_audio(path, ref_len=ref_len)
+ with torch.no_grad():
+ x = torch.from_numpy(x).float()
+ x = x.view(1, -1)
+
+ mfccs = torchaudio.compliance.kaldi.mfcc(
+ waveform=x,
+ sample_frequency=self.sample_rate,
+ use_energy=False,
+ ) # (time, freq)
+ mfccs = mfccs.transpose(0, 1) # (freq, time)
+ deltas = torchaudio.functional.compute_deltas(mfccs)
+ ddeltas = torchaudio.functional.compute_deltas(deltas)
+ concat = torch.cat([mfccs, deltas, ddeltas], dim=0)
+            concat = concat.transpose(0, 1).contiguous()  # (time, freq)
+ return concat
+
+
+def main(tsv_dir, split, nshard, rank, feat_dir, sample_rate):
+ reader = MfccFeatureReader(sample_rate)
+ generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
+ dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("tsv_dir")
+ parser.add_argument("split")
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("feat_dir")
+ parser.add_argument("--sample_rate", type=int, default=16000)
+ args = parser.parse_args()
+ logger.info(args)
+
+ main(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_w2v2_feature.py b/fairseq/examples/hubert/simple_kmeans/dump_w2v2_feature.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1f0d902acf0756580a1f4604feee8fc499a9a63
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_w2v2_feature.py
@@ -0,0 +1,95 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import fairseq
+import soundfile as sf
+import torch
+import torch.nn.functional as F
+
+from feature_utils import get_path_iterator, dump_feature
+
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_w2v2_feature")
+
+
+class Wav2Vec2FeatureReader(object):
+ def __init__(self, ckpt_path, layer, max_chunk=1600000):
+ (
+ model,
+ cfg,
+ task,
+ ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+ self.model = model[0].eval().cuda()
+ self.task = task
+ self.layer = layer # assume this is 1-based like HuBERT
+ self.max_chunk = max_chunk
+ logger.info(f"TASK CONFIG:\n{self.task.cfg}")
+ logger.info(f" max_chunk = {self.max_chunk}")
+ logger.info(f" model:\n{self.model}")
+
+ def read_audio(self, path, ref_len=None):
+ wav, sr = sf.read(path)
+ assert sr == self.task.cfg.sample_rate, sr
+ if wav.ndim == 2:
+ wav = wav.mean(-1)
+ assert wav.ndim == 1, wav.ndim
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
+ logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
+ return wav
+
+ def get_feats(self, path, ref_len=None):
+ x = self.read_audio(path, ref_len)
+ with torch.no_grad():
+ x = torch.from_numpy(x).float().cuda()
+ if self.task.cfg.normalize:
+ x = F.layer_norm(x, x.shape)
+ x = x.view(1, -1)
+
+ feat = []
+ for start in range(0, x.size(1), self.max_chunk):
+ x_chunk = x[:, start: start + self.max_chunk]
+ res = self.model.extract_features(
+ source=x_chunk,
+ padding_mask=None,
+ mask=False,
+ layer=self.layer - 1,
+ )
+ feat_chunk = res["x"]
+ feat.append(feat_chunk)
+ return torch.cat(feat, 1).squeeze(0)
+
+
+def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk):
+ reader = Wav2Vec2FeatureReader(ckpt_path, layer, max_chunk)
+ generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
+ dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("tsv_dir")
+ parser.add_argument("split")
+ parser.add_argument("ckpt_path")
+ parser.add_argument("layer", type=int)
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("feat_dir")
+ parser.add_argument("--max_chunk", type=int, default=1600000)
+ args = parser.parse_args()
+ logger.info(args)
+
+ main(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/feature_utils.py b/fairseq/examples/hubert/simple_kmeans/feature_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f80bc4569768fac181133cdc8f76d1230e03bff6
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/feature_utils.py
@@ -0,0 +1,66 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import tqdm
+from npy_append_array import NpyAppendArray
+
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("feature_utils")
+
+
+def get_shard_range(tot, nshard, rank):
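+    # split [0, tot) into nshard contiguous, nearly equal ranges and return the rank-th one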
+    assert rank < nshard and rank >= 0, f"invalid rank/nshard {rank}/{nshard}"
+ start = round(tot / nshard * rank)
+ end = round(tot / nshard * (rank + 1))
+ assert start < end, f"start={start}, end={end}"
+ logger.info(
+ f"rank {rank} of {nshard}, process {end-start} "
+ f"({start}-{end}) out of {tot}"
+ )
+ return start, end
+
+
+def get_path_iterator(tsv, nshard, rank):
+ with open(tsv, "r") as f:
+ root = f.readline().rstrip()
+ lines = [line.rstrip() for line in f]
+ start, end = get_shard_range(len(lines), nshard, rank)
+ lines = lines[start:end]
+ def iterate():
+ for line in lines:
+ subpath, nsample = line.split("\t")
+ yield f"{root}/{subpath}", int(nsample)
+ return iterate, len(lines)
+
+
+def dump_feature(reader, generator, num, split, nshard, rank, feat_dir):
+ iterator = generator()
+
+ feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+ leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+
+ os.makedirs(feat_dir, exist_ok=True)
+ if os.path.exists(feat_path):
+ os.remove(feat_path)
+
+ feat_f = NpyAppendArray(feat_path)
+ with open(leng_path, "w") as leng_f:
+ for path, nsample in tqdm.tqdm(iterator, total=num):
+ feat = reader.get_feats(path, nsample)
+ feat_f.append(feat.cpu().numpy())
+ leng_f.write(f"{len(feat)}\n")
+ logger.info("finished successfully")
+
+
diff --git a/fairseq/examples/hubert/simple_kmeans/learn_kmeans.py b/fairseq/examples/hubert/simple_kmeans/learn_kmeans.py
new file mode 100644
index 0000000000000000000000000000000000000000..113ac655b8c0a585fe43797e99674e445098edd0
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/learn_kmeans.py
@@ -0,0 +1,146 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+
+import joblib
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("learn_kmeans")
+
+
+def get_km_model(
+ n_clusters,
+ init,
+ max_iter,
+ batch_size,
+ tol,
+ max_no_improvement,
+ n_init,
+ reassignment_ratio,
+):
+ return MiniBatchKMeans(
+ n_clusters=n_clusters,
+ init=init,
+ max_iter=max_iter,
+ batch_size=batch_size,
+ verbose=1,
+ compute_labels=False,
+ tol=tol,
+ max_no_improvement=max_no_improvement,
+ init_size=None,
+ n_init=n_init,
+ reassignment_ratio=reassignment_ratio,
+ )
+
+
+def load_feature_shard(feat_dir, split, nshard, rank, percent):
+ feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+ leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+ with open(leng_path, "r") as f:
+ lengs = [int(line.rstrip()) for line in f]
+ offsets = [0] + np.cumsum(lengs[:-1]).tolist()
+
+ if percent < 0:
+ return np.load(feat_path, mmap_mode="r")
+ else:
+ nsample = int(np.ceil(len(lengs) * percent))
+ indices = np.random.choice(len(lengs), nsample, replace=False)
+ feat = np.load(feat_path, mmap_mode="r")
+ sampled_feat = np.concatenate(
+ [feat[offsets[i]: offsets[i] + lengs[i]] for i in indices], axis=0
+ )
+ logger.info(
+ (
+ f"sampled {nsample} utterances, {len(sampled_feat)} frames "
+ f"from shard {rank}/{nshard}"
+ )
+ )
+ return sampled_feat
+
+
+def load_feature(feat_dir, split, nshard, seed, percent):
+ assert percent <= 1.0
+ feat = np.concatenate(
+ [
+ load_feature_shard(feat_dir, split, nshard, r, percent)
+ for r in range(nshard)
+ ],
+ axis=0,
+ )
+ logging.info(f"loaded feature with dimension {feat.shape}")
+ return feat
+
+
+def learn_kmeans(
+ feat_dir,
+ split,
+ nshard,
+ km_path,
+ n_clusters,
+ seed,
+ percent,
+ init,
+ max_iter,
+ batch_size,
+ tol,
+ n_init,
+ reassignment_ratio,
+ max_no_improvement,
+):
+ np.random.seed(seed)
+ feat = load_feature(feat_dir, split, nshard, seed, percent)
+ km_model = get_km_model(
+ n_clusters,
+ init,
+ max_iter,
+ batch_size,
+ tol,
+ max_no_improvement,
+ n_init,
+ reassignment_ratio,
+ )
+ km_model.fit(feat)
+ joblib.dump(km_model, km_path)
+
+ inertia = -km_model.score(feat) / len(feat)
+ logger.info("total intertia: %.5f", inertia)
+ logger.info("finished successfully")
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("feat_dir", type=str)
+ parser.add_argument("split", type=str)
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("km_path", type=str)
+ parser.add_argument("n_clusters", type=int)
+ parser.add_argument("--seed", default=0, type=int)
+ parser.add_argument(
+ "--percent", default=-1, type=float, help="sample a subset; -1 for all"
+ )
+ parser.add_argument("--init", default="k-means++")
+ parser.add_argument("--max_iter", default=100, type=int)
+ parser.add_argument("--batch_size", default=10000, type=int)
+ parser.add_argument("--tol", default=0.0, type=float)
+ parser.add_argument("--max_no_improvement", default=100, type=int)
+ parser.add_argument("--n_init", default=20, type=int)
+ parser.add_argument("--reassignment_ratio", default=0.0, type=float)
+ args = parser.parse_args()
+ logging.info(str(args))
+
+ learn_kmeans(**vars(args))
diff --git a/fairseq/examples/hubert/tests/sample.base.L9.km500.km b/fairseq/examples/hubert/tests/sample.base.L9.km500.km
new file mode 100644
index 0000000000000000000000000000000000000000..656eef96e588b601a7a8c0f2ab8644d4185045fb
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.base.L9.km500.km
@@ -0,0 +1 @@
+17 17 17 17 296 296 20 20 20 461 461 20 184 20 20 20 184 289 144 445 445 213 213 213 213 252 215 129 401 20 354 180 494 44 416 416 416 192 192 180 180 84 84 84 16 88 88 88 88 319 242 240 348 35 35 117 404 197 226 209 83 55 55 55 322 67 94 199 118 118 118 118 118 118 402 219 219 219 222 222 222 353 59 245 245 251 251 241 241 431 367 367 178 35 35 35 458 192 351 41 324 324 324 252 464 464 139 139 424 424 424 497 497 497 122 90 42 42 147 380 380 499 319 319 319 348 348 33 33 394 90 76 465 74 425 425 386 386 431 319 319 319 319 319 240 203 53 473 34 340 340 340 340 116 64 212 384 377 123 123 123 216 216 216 114 114 57 57 57 203 381 381 117 48 13 47 80 20 80 80 320 7 7 364 345 141 141 141 141 281 281 9 86 221 198 198 22 283 455 236 239 239 107 107 395 286 286 286 468 468 406 406 467 176 176 176 328 200 200 248 464 145 365 365 365 365 330 385 457 77 77 77 54 224 300 334 334 382 304 304 271 186 31 342 342 342 198 22 283 5 38 162 232 232 482 68 26 26 359 359 81 444 213 213 252 143 458 41 324 324 324 422 143 445 445 445 351 180 486 315 315 450 450 450 203 53 473 291 89 116 379 243 478 478 66 482 482 105 105 336 336 354 29 498 498 498 498 396 396 313 37 314 198 22 222 222 222 222 245 129 74 74 437 437 496 496 496 413 94 199 41 41 324 324 318 318 269 342 9 168 106 106 284 426 426 426 426 348 64 76 401 259 108 123 153 153 153 153 372 372 396 313 24 314 90 401 259 445 445 351 351 365 365 365 365 282 282 215 233 233 229 427 20 247 126 126 126 326 326 326 326 326 326 326 101 101 101 149 228 228 20 289 20 7 217 70 65 189 189 151 240 285 300 300 495 406 467 176 135 135 339 248 466 114 222 222 222 313 313 239 384 371 490 490 38 31 54 54 224 494 494 236 129 259 74 190 487 288 288 288 288 374 173 173 280 280 302 302 175 175 69 69 223 130 129 401 75 108 119 295 295 295 295 143 192 192 135 135 135 135 200 200 464 255 255 255 251 251 241 431 235 235 235 348 348 465 192 44 44 236 8 8 354 319 319 383 348 36 310 107 107 395 462 462 8 32 32 32 354 153 153 153 153 153 387 387 387 387 85 207 318 318 318 49 453 9 168 125 125 125 125 125 466 199 44 44 143 129 144 445 351 351 351 486 486 460 285 285 302 302 497 497 122 239 161 161 79 79 499 499 499 265 265 265 85 85 85 299 299 173 352 352 427 229 170 247 15 15 15 15 15 15 193 193 193 17
diff --git a/fairseq/examples/hubert/tests/sample.base.L9.len b/fairseq/examples/hubert/tests/sample.base.L9.len
new file mode 100644
index 0000000000000000000000000000000000000000..7d3028fa244a2121c51f39dcc92dc15be82823a6
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.base.L9.len
@@ -0,0 +1 @@
+596
diff --git a/fairseq/examples/hubert/tests/sample.large.L20.len b/fairseq/examples/hubert/tests/sample.large.L20.len
new file mode 100644
index 0000000000000000000000000000000000000000..7d3028fa244a2121c51f39dcc92dc15be82823a6
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.large.L20.len
@@ -0,0 +1 @@
+596
diff --git a/fairseq/examples/hubert/tests/sample.large.hypo.word b/fairseq/examples/hubert/tests/sample.large.hypo.word
new file mode 100644
index 0000000000000000000000000000000000000000..d77a4cfddcb93c2e08eb55e630c85fe840fd3cc2
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.large.hypo.word
@@ -0,0 +1 @@
+KEEP A GOING AN IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0)
diff --git a/fairseq/examples/hubert/tests/sample.xlarge.L30.len b/fairseq/examples/hubert/tests/sample.xlarge.L30.len
new file mode 100644
index 0000000000000000000000000000000000000000..7d3028fa244a2121c51f39dcc92dc15be82823a6
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.xlarge.L30.len
@@ -0,0 +1 @@
+596
diff --git a/fairseq/examples/hubert/tests/sample.xlarge.hypo.word b/fairseq/examples/hubert/tests/sample.xlarge.hypo.word
new file mode 100644
index 0000000000000000000000000000000000000000..53e402d4550c820220e0964654a600dabaca8b1c
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.xlarge.hypo.word
@@ -0,0 +1 @@
+KEEP A GOIN AND IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0)
diff --git a/fairseq/examples/hubert/update_ckpt.py b/fairseq/examples/hubert/update_ckpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..53c9e74ea613e30aa5c22614e658f2b7272bac0c
--- /dev/null
+++ b/fairseq/examples/hubert/update_ckpt.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+src_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2.pt"
+ref_ckpt = "/checkpoint/wnhsu/w2v/hubert_icassp_oss_v3/iter2_km100-400k-grp-L6/oss.km500_p0_1_s334.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU100k.s1337.ngpu32/checkpoint_last.pt"
+new_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2_updated.pt"
+
+
+def update_state(state):
+ state["model"]["label_embs_concat"] = state["model"].pop("label_embs")
+ state["args"].task = "hubert_pretraining"
+ state["args"].labels = f"['{state['args'].labels}']"
+ return state
+
+
+src_state = torch.load(src_ckpt)
+src_state = update_state(src_state)
+torch.save(src_state, new_ckpt)
diff --git a/fairseq/examples/latent_depth/latent_depth_src/loss/__init__.py b/fairseq/examples/latent_depth/latent_depth_src/loss/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/examples/latent_depth/latent_depth_src/loss/latent_depth.py b/fairseq/examples/latent_depth/latent_depth_src/loss/latent_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9535ecac3ec403868681a8b50c1fbe1c90dfe
--- /dev/null
+++ b/fairseq/examples/latent_depth/latent_depth_src/loss/latent_depth.py
@@ -0,0 +1,99 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+from torch.nn.modules.loss import _Loss
+
+
+class LatentLayersKLLoss(_Loss):
+ def __init__(self, args):
+ super().__init__()
+ self.args = args
+
+ def forward(self, layer_samples, lang_idx, update_num, sample_size):
+ prior = self.args.prior
+ samples = layer_samples[lang_idx]
+ eps = 1e-7
+ if prior == "uniform":
+ # uniform prior
+ kl_loss = (samples * (torch.log(samples + eps) - math.log(0.5))).sum(-1)
+ elif prior == "agged_posterior":
+ # aggregated posterior
+ y_t = torch.stack([x.detach() for x in layer_samples], dim=0)
+ agged_q = torch.sum(y_t, dim=0)
+ row_norm = agged_q.sum(-1)
+ normed_agg_q = agged_q / row_norm
+ kl_loss = (
+ samples * (torch.log(samples + eps) - torch.log(normed_agg_q + eps))
+ ).sum(-1)
+ else:
+ raise NotImplementedError("The specified prior is not implemented.")
+
+ # normalized by number of layers
+ kl_loss /= layer_samples[0].size()[0]
+ kl_weight = min(
+ self.args.sparsity_weight,
+ (update_num - self.args.soft_update)
+ * self.args.sparsity_weight
+ / self.args.anneal_updates,
+ )
+ kl_loss *= kl_weight * sample_size
+ return kl_loss
+
+
+class LatentLayersSparsityLoss(_Loss):
+ def __init__(self, args):
+ super().__init__()
+ self.args = args
+
+ def is_valid(self, update_num):
+ if self.args.target_layers <= 0:
+ return False
+ return update_num > (self.args.soft_update + self.args.anneal_updates)
+
+ def forward(self, layer_samples_list, update_num, sample_size):
+ batch_loss = 0
+ share_loss = 0
+ global_sparsity_loss = 0
+ layer_samples = torch.stack(layer_samples_list, dim=0)
+ if (
+ self.args.target_layers > 0 or self.args.share_weight > 0
+ ) and update_num > (self.args.soft_update + self.args.anneal_updates):
+ # anneal sparsity weight
+ if update_num < (self.args.anneal_updates + self.args.soft_update):
+ weight_anneal = 0
+ elif update_num < (2 * self.args.anneal_updates + self.args.soft_update):
+ weight_anneal = (
+ (update_num - self.args.soft_update - self.args.anneal_updates)
+ * self.args.share_weight
+ / self.args.anneal_updates
+ )
+ else:
+ weight_anneal = 1
+ # compute ratio among languages
+ layer_utilization = torch.sum(layer_samples, dim=0)
+ layer_utilization /= layer_samples.size()[0]
+ if self.args.share_weight > 0:
+ # encouraging sharing across languages
+ share_loss = sum(
+ -1.0 * v * math.log(v) for v in layer_utilization if v > 0
+ )
+ batch_loss += (
+ weight_anneal * self.args.share_weight * sample_size * share_loss
+ )
+ if self.args.target_layers > 0:
+                # compute the expected number of selected layers
+                expected_layers = sum(layer_utilization)
+                # compute l2 loss w.r.t. the target number of layers
+                global_sparsity_loss = (expected_layers - self.args.target_layers) ** 2
+ batch_loss += (
+ weight_anneal
+ * self.args.share_weight
+ * sample_size
+ * global_sparsity_loss
+ )
+ return batch_loss
diff --git a/fairseq/examples/latent_depth/latent_depth_src/models/__init__.py b/fairseq/examples/latent_depth/latent_depth_src/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/examples/latent_depth/latent_depth_src/models/latent_transformer.py b/fairseq/examples/latent_depth/latent_depth_src/models/latent_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a825301a452bd935deafdaf78fa2427ca9a469e
--- /dev/null
+++ b/fairseq/examples/latent_depth/latent_depth_src/models/latent_transformer.py
@@ -0,0 +1,156 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+import torch.nn as nn
+from fairseq.models.fairseq_encoder import EncoderOut
+from fairseq.models.transformer import TransformerDecoder, TransformerEncoder
+from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer
+from torch import Tensor
+
+from ..modules.latent_layers import LayerSelect
+
+
+class LatentTransformerEncoder(TransformerEncoder):
+ """Latent depth (https://arxiv.org/abs/2009.13102) implemented in
+ TransformerEncoder.
+ """
+
+ def __init__(self, args, dictionary, embed_tokens, num_logits=1):
+ self.num_logits = num_logits
+ self.num_layers = args.encoder_layers
+ super().__init__(args, dictionary, embed_tokens)
+ self.layer_select = LayerSelect(
+ num_layers=self.num_layers,
+ num_logits=self.num_logits,
+ soft_select=getattr(args, "soft_select", False),
+ sampling_tau=getattr(args, "sampling_tau", 5.),
+ )
+ self.lang_idx = None
+ self.layers = nn.ModuleList(
+ [self._build_encoder_layer(args, idx) for idx in range(args.encoder_layers)]
+ )
+
+ def set_lang_idx(self, lang_idx):
+ self.lang_idx = lang_idx
+
+ def _build_encoder_layer(self, args, idx=None):
+ return LatentTransformerEncoderLayer(args, idx, layer_select=self.layer_select)
+
+ def forward(self, src_tokens, src_lengths, return_all_hiddens: bool = False):
+ self.layer_select.sample(self.lang_idx)
+ return super().forward(src_tokens, src_lengths, return_all_hiddens)
+
+
+class LatentTransformerEncoderLayer(TransformerEncoderLayer):
+ """Encoder layer with each (non_residual) block weighted by samples of Bernouli
+ or Gumbel Signmoid samples.
+
+ Args:
+ args (argparse.Namespace): parsed command-line arguments from standard
+ TransformerEncoderLayer.
+ idx (int): layer index (used to retrieve samples).
+ layer_select (LayerSelect, optional): instance of LayerSelect module with logits
+ parameters and sampling method.
+ """
+
+ def __init__(self, args, idx, layer_select=None):
+ super().__init__(args)
+ self.idx = idx
+ self.layer_select = layer_select
+
+ def residual_connection(self, x, residual):
+ return residual + x * self.layer_select(self.idx)
+
+
+class LatentTransformerDecoder(TransformerDecoder):
+ """Latent depth (https://arxiv.org/abs/2009.13102) implemented in
+ TransformerDecoder.
+ """
+
+ def __init__(
+ self, args, dictionary, embed_tokens, no_encoder_attn=False, num_logits=1
+ ):
+ self.num_logits = num_logits
+ self.num_layers = args.decoder_layers
+ super().__init__(
+ args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
+ )
+ self.layer_select = LayerSelect(
+ num_layers=self.num_layers,
+ num_logits=self.num_logits,
+ soft_select=getattr(args, "soft_select", False),
+ sampling_tau=getattr(args, "sampling_tau", 5.),
+ )
+ self.lang_idx = None
+ self.layers = nn.ModuleList(
+ [
+ self._build_decoder_layer(args, no_encoder_attn, idx)
+ for idx in range(args.decoder_layers)
+ ]
+ )
+
+ def set_lang_idx(self, lang_idx):
+ self.lang_idx = lang_idx
+
+ def _build_decoder_layer(self, args, no_encoder_attn=False, idx=None):
+ return LatentTransformerDecoderLayer(
+ args, idx, layer_select=self.layer_select, no_encoder_attn=no_encoder_attn
+ )
+
+ def forward(
+ self,
+ prev_output_tokens,
+ encoder_out: Optional[EncoderOut] = None,
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+ features_only: bool = False,
+ alignment_layer: Optional[int] = None,
+ alignment_heads: Optional[int] = None,
+ src_lengths: Optional[Any] = None,
+ return_all_hiddens: bool = False,
+ ):
+ self.layer_select.sample(self.lang_idx)
+ return super().forward(
+ prev_output_tokens=prev_output_tokens,
+ encoder_out=encoder_out,
+ incremental_state=incremental_state,
+ features_only=features_only,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+ src_lengths=src_lengths,
+ return_all_hiddens=return_all_hiddens,
+ )
+
+
+class LatentTransformerDecoderLayer(TransformerDecoderLayer):
+ """Decoder layer with each (non_residual) block weighted by samples of Bernouli
+ or Gumbel Signmoid samples.
+
+ Args:
+ args (argparse.Namespace): parsed command-line arguments from standard
+ TransformerDecoderLayer.
+ idx (int): layer index (used to retrieve samples).
+ layer_select (LayerSelect, optional): instance of LayerSelect module with logits
+ parameters and sampling method.
+ no_encoder_attn (bool, optional): whether to attend to encoder outputs
+ (default: False).
+
+ """
+
+ def __init__(
+ self,
+ args,
+ idx,
+ layer_select=None,
+ no_encoder_attn=False,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ ):
+ super().__init__(args, no_encoder_attn, add_bias_kv, add_zero_attn)
+ self.idx = idx
+ self.layer_select = layer_select
+
+ def residual_connection(self, x, residual):
+ return residual + x * self.layer_select(self.idx)
diff --git a/fairseq/examples/latent_depth/latent_depth_src/multilingual_translation_latent_depth.py b/fairseq/examples/latent_depth/latent_depth_src/multilingual_translation_latent_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cc2a7174b765b7ad8808489196e12082a91a2d7
--- /dev/null
+++ b/fairseq/examples/latent_depth/latent_depth_src/multilingual_translation_latent_depth.py
@@ -0,0 +1,195 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.tasks import register_task
+from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
+from fairseq.utils import safe_hasattr
+
+from .loss.latent_depth import LatentLayersKLLoss, LatentLayersSparsityLoss
+
+
+@register_task("multilingual_translation_latent_depth")
+class MultilingualTranslationTaskLatentDepth(MultilingualTranslationTask):
+ """A task for multiple translation with latent depth.
+
+ See `"Deep Transformer with Latent Depth"
+    (Li et al., 2020) <https://arxiv.org/abs/2009.13102>`_.
+ """
+
+ @staticmethod
+ def add_args(parser):
+ """Add task-specific arguments to the parser."""
+ # fmt: off
+ MultilingualTranslationTask.add_args(parser)
+ parser.add_argument('--encoder-latent-layer', action='store_true', help='latent layer selection in encoder')
+ parser.add_argument('--decoder-latent-layer', action='store_true', help='latent layer selection in decoder')
+ parser.add_argument('--target-layers', default=-1, type=int,
+ help='number of effective layers to learn; -1 means no constraint')
+ parser.add_argument('--sparsity-weight', default=0.0, type=float,
+ help='weight for sparsity loss')
+ parser.add_argument('--share-weight', default=0.0, type=float,
+ help='weight for sharing loss')
+ parser.add_argument('--soft-update', default=1, type=int,
+ help='number of updates with soft sampling')
+ parser.add_argument('--anneal-updates', default=1, type=int,
+ help='number of updates to anneal the KL loss weight')
+ parser.add_argument('--prior', default="uniform", type=str,
+ help='prior used for computing KL loss')
+ # fmt: on
+
+ def __init__(self, args, dicts, training):
+ super().__init__(args, dicts, training)
+ self.src_langs, self.tgt_langs = zip(
+ *[(lang.split("-")[0], lang.split("-")[1]) for lang in args.lang_pairs]
+ )
+ if self.training and self.encoder_latent_layer:
+ assert self.args.share_encoders
+ if self.training and self.decoder_latent_layer:
+ assert self.args.share_decoders
+ if training or self.encoder_latent_layer or self.decoder_latent_layer:
+ self.lang_pairs = args.lang_pairs
+ else:
+ self.lang_pairs = ["{}-{}".format(args.source_lang, args.target_lang)]
+ self.eval_lang_pairs = self.lang_pairs
+ self.model_lang_pairs = self.lang_pairs
+ if self.training and (self.encoder_latent_layer or self.decoder_latent_layer):
+ self.kl_loss = LatentLayersKLLoss(self.args)
+ self.sparsity_loss = LatentLayersSparsityLoss(self.args)
+
+ def _per_lang_pair_train_loss(
+ self, lang_pair, model, update_num, criterion, sample, optimizer, ignore_grad
+ ):
+ src, tgt = lang_pair.split("-")
+ if self.encoder_latent_layer:
+ src_lang_idx = self.src_lang_idx_dict[src]
+ model.models[lang_pair].encoder.set_lang_idx(src_lang_idx)
+ model.models[lang_pair].encoder.layer_select.hard_select = (
+ update_num > self.args.soft_update
+ )
+ if self.decoder_latent_layer:
+ tgt_lang_idx = self.tgt_lang_idx_dict[tgt]
+ model.models[lang_pair].decoder.set_lang_idx(tgt_lang_idx)
+ model.models[lang_pair].decoder.layer_select.hard_select = (
+ update_num > self.args.soft_update
+ )
+
+ loss, sample_size, logging_output = criterion(
+ model.models[lang_pair], sample[lang_pair]
+ )
+ if self.encoder_latent_layer:
+ none_samples = sum(
+ 1 if x is None else 0
+ for x in model.models[lang_pair].encoder.layer_select.layer_samples
+ )
+ if none_samples == 0 or self.args.prior != "agged_posterior":
+ loss += self.kl_loss(
+ model.models[lang_pair].encoder.layer_select.layer_samples,
+ src_lang_idx,
+ update_num,
+ sample_size,
+ )
+ if self.decoder_latent_layer:
+ none_samples = sum(
+ 1 if x is None else 0
+ for x in model.models[lang_pair].decoder.layer_select.layer_samples
+ )
+ if none_samples == 0 or self.args.prior != "agged_posterior":
+ loss += self.kl_loss(
+ model.models[lang_pair].decoder.layer_select.layer_samples,
+ tgt_lang_idx,
+ update_num,
+ sample_size,
+ )
+ if ignore_grad:
+ loss *= 0
+
+ if hasattr(self, "sparsity_loss") and self.sparsity_loss.is_valid(update_num):
+ # need to retain the graph if sparsity loss needs to be added
+ loss.backward(retain_graph=True)
+ else:
+ optimizer.backward(loss)
+
+ return loss, sample_size, logging_output
+
+ def train_step(
+ self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+ ):
+ agg_loss, agg_sample_size, agg_logging_output = super().train_step(
+ sample, model, criterion, optimizer, update_num, ignore_grad
+ )
+        # compute auxiliary loss from layer sparsity, based on all samples from all languages
+ if hasattr(self, "sparsity_loss") and self.sparsity_loss.is_valid(update_num):
+ sparsity_loss = 0
+ if self.encoder_latent_layer:
+ sparsity_loss += self.sparsity_loss(
+ next(
+ iter(model.models.values())
+ ).encoder.layer_select.layer_samples,
+ update_num,
+ agg_sample_size,
+ )
+ if self.decoder_latent_layer:
+ sparsity_loss += self.sparsity_loss(
+ next(
+ iter(model.models.values())
+ ).decoder.layer_select.layer_samples,
+ update_num,
+ agg_sample_size,
+ )
+ if sparsity_loss > 0:
+ optimizer.backward(sparsity_loss)
+ return agg_loss, agg_sample_size, agg_logging_output
+
+ def _per_lang_pair_valid_loss(self, lang_pair, model, criterion, sample):
+ src, tgt = lang_pair.split("-")
+ if self.encoder_latent_layer:
+ src_lang_idx = self.src_lang_idx_dict[src]
+ model.models[lang_pair].encoder.set_lang_idx(src_lang_idx)
+ if self.decoder_latent_layer:
+ tgt_lang_idx = self.tgt_lang_idx_dict[tgt]
+ model.models[lang_pair].decoder.set_lang_idx(tgt_lang_idx)
+ loss, sample_size, logging_output = criterion(
+ model.models[lang_pair], sample[lang_pair]
+ )
+ return loss, sample_size, logging_output
+
+ def inference_step(
+ self, generator, models, sample, prefix_tokens=None, constraints=None
+ ):
+ if self.encoder_latent_layer or self.decoder_latent_layer:
+ for model in models:
+ if self.encoder_latent_layer:
+ assert model.encoder.layer_select is not None
+ src_lang_idx = self.src_lang_idx_dict[self.args.source_lang]
+ model.encoder.set_lang_idx(src_lang_idx)
+ if self.decoder_latent_layer:
+ assert model.decoder.layer_select is not None
+ tgt_lang_idx = self.tgt_lang_idx_dict[self.args.target_lang]
+ model.decoder.set_lang_idx(tgt_lang_idx)
+ return super().inference_step(
+ generator, models, sample, prefix_tokens, constraints
+ )
+
+ @property
+ def encoder_latent_layer(self):
+ return (
+ safe_hasattr(self.args, "encoder_latent_layer")
+ and self.args.encoder_latent_layer
+ )
+
+ @property
+ def decoder_latent_layer(self):
+ return (
+ safe_hasattr(self.args, "decoder_latent_layer")
+ and self.args.decoder_latent_layer
+ )
+
+ @property
+ def src_lang_idx_dict(self):
+ return {lang: lang_idx for lang_idx, lang in enumerate(self.src_langs)}
+
+ @property
+ def tgt_lang_idx_dict(self):
+ return {lang: lang_idx for lang_idx, lang in enumerate(self.tgt_langs)}
diff --git a/fairseq/examples/layerdrop/README.md b/fairseq/examples/layerdrop/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4d48ee9615e1458e1e889635dc9938e427a7f64a
--- /dev/null
+++ b/fairseq/examples/layerdrop/README.md
@@ -0,0 +1,154 @@
+# Reducing Transformer Depth on Demand with Structured Dropout (Fan et al., 2019)
+This page contains information for how to train models with LayerDrop, based on this [paper](https://arxiv.org/abs/1909.11556).
+
+## Citation:
+If you found this technique useful, please cite our paper:
+```bibtex
+@article{fan2019reducing,
+ title={Reducing Transformer Depth on Demand with Structured Dropout},
+ author={Fan, Angela and Grave, Edouard and Joulin, Armand},
+ journal={arXiv preprint arXiv:1909.11556},
+ year={2019}
+}
+```
+
+## Pre-trained models
+
+Model | Description | Download
+---|---|---
+`layerdrop_wmt_en_de_12_6` | Transformer + LayerDrop 0.2 trained on WMT16 en-de with 12 encoder and 6 decoder layers | [layerdrop_wmt_en_de_12_6.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/layerdrop_wmt_en_de_12_6.tar.gz)
+`roberta_layerdrop.base` | RoBERTa Base + LayerDrop 0.2 | [roberta_layerdrop.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.base.qnli.tar.gz)
+`roberta_layerdrop.large` | RoBERTa Large + LayerDrop 0.2 | [roberta_layerdrop.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.large.tar.gz)
+`roberta_layerdrop.large.mnli` | `roberta_layerdrop.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | [roberta_layerdrop.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.large.mnli.tar.gz)
+`roberta_layerdrop.large.qnli` | `roberta_layerdrop.large` finetuned on [QNLI](https://arxiv.org/abs/1804.07461) | [roberta_layerdrop.large.qnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.large.qnli.tar.gz)
+
+
+Evaluate performance of these pre-trained models:
+```bash
+# Example for Machine Translation
+fairseq-generate /path/to/bped/wmt/data --path nmt_checkpoint.pt \
+ --beam 8 --lenpen 0.4 \
+ --batch-size 64 \
+ --remove-bpe \
+ --gen-subset test > wmt16_gen.txt
+bash scripts/compound_split_bleu.sh wmt16_gen.txt
+# prints BLEU4 = 30.17
+```
+
+```python
+# Example for RoBERTa + LayerDrop finetuned on MNLI:
+from fairseq.models.roberta import RobertaModel
+
+roberta_layerdrop = RobertaModel.from_pretrained(
+ '/path/to/MNLI/model',
+ checkpoint_file='mnli_checkpoint.pt',
+ data_name_or_path='/path/to/MNLI/data/MNLI-bin'
+)
+label_map = {0: 'contradiction', 2: 'neutral', 1: 'entailment'}
+ncorrect, nsamples = 0, 0
+roberta_layerdrop.cuda()
+roberta_layerdrop.eval()
+with open('/path/to/MNLI/data/dev_matched.tsv') as fin:
+ fin.readline()
+ for index, line in enumerate(fin):
+ tokens = line.strip().split('\t')
+ sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
+ tokens = roberta_layerdrop.encode(sent1, sent2)
+ prediction = roberta_layerdrop.predict('sentence_classification_head', tokens).argmax().item()
+ prediction_label = label_map[prediction]
+ ncorrect += int(prediction_label == target)
+ nsamples += 1
+print('| Accuracy: ', float(ncorrect)/float(nsamples))
+# prints | Accuracy: 0.9026999490575649
+
+
+# Example for RoBERTa + LayerDrop finetuned on QNLI:
+roberta = RobertaModel.from_pretrained(
+ '/path/to/QNLI/model',
+ checkpoint_file='qnli_checkpoint.pt',
+ data_name_or_path='/path/to/QNLI/data/QNLI-bin'
+)
+
+label_fn = lambda label: roberta.task.label_dictionary.string(
+ [label + roberta.task.target_dictionary.nspecial]
+)
+ncorrect, nsamples = 0, 0
+roberta.cuda()
+roberta.eval()
+with open('/path/to/QNLI/data/dev.tsv') as fin:
+ fin.readline()
+ for index, line in enumerate(fin):
+ tokens = line.strip().split('\t')
+ sent1, sent2, target = tokens[1], tokens[2], tokens[3]
+ tokens = roberta.encode(sent1, sent2)
+ prediction = roberta.predict('sentence_classification_head', tokens).argmax().item()
+ prediction_label = label_fn(prediction)
+ ncorrect += int(prediction_label == target)
+ nsamples += 1
+print('| Accuracy: ', float(ncorrect)/float(nsamples))
+# prints | Accuracy: 0.9480139117700896
+```
+
+
+## Example usage
+
+To train a model with LayerDrop, add the following flags. We recommend 0.2, a value that worked well in our experiments. For decoder-only models such as language models, you only need the decoder flag; for encoder-only models such as RoBERTa, you only need the encoder flag. The encoder and decoder LayerDrop values can be set differently.
+```
+--encoder-layerdrop 0.2 --decoder-layerdrop 0.2
+```
+
+To prune a model that has been trained with LayerDrop, add the following flags, each followed by a comma-separated list of the layers you would like to keep.
+```
+--encoder-layers-to-keep 0,2,4,6,8,10,12,14 --decoder-layers-to-keep 0,2,4,6,8,10,12,14
+```
+Setting these flags should print a message such as:
+```
+| Pruning model to specified layer configuration
+```
+You should also see a smaller number of parameters in the model, for example the 16-Layer Transformer Language Model prints:
+```
+num. model params: 246933504
+```
+while a model pruned to 8 Layers prints:
+```
+num. model params: 146163712
+```
+
+If you would like to resume training with a model that has been pruned, simply adding these flags is sufficient. If you would like to use a script that only does evaluation (no training), you may need to pass an override command. A specific example for language modeling:
+```bash
+fairseq-eval-lm /path/to/wikitext-103 \
+ --path /path/to/model/checkpoint.pt \
+ --model-overrides "{'decoder_layers_to_keep':'0,2,4,6,8,10,12,14'}"
+```
+This model override command overrides the training parameters and updates the model arguments so that the pruned model is run instead of the full model.
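+
+The same overrides can also be applied when loading a checkpoint in Python. A
+minimal sketch using `checkpoint_utils.load_model_ensemble_and_task` with
+`arg_overrides` (the checkpoint path is a placeholder):
+```python
+from fairseq import checkpoint_utils
+
+models, _, task = checkpoint_utils.load_model_ensemble_and_task(
+    ["/path/to/model/checkpoint.pt"],
+    arg_overrides={"decoder_layers_to_keep": "0,2,4,6,8,10,12,14"},
+)
+model = models[0]
+# the parameter count should reflect the pruned layer configuration
+print(sum(p.numel() for p in model.parameters()))
+```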
+
+## Reproduce Paper Results
+
+Looking to reproduce the results in the paper?
+
+1. For Translation on WMT16 en-de, we followed this setting [here](https://github.com/pytorch/fairseq/blob/main/examples/scaling_nmt/README.md)
+2. To train RoBERTa, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/roberta)
+3. To train Language Models on Wikitext-103, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/language_model)
+
+
+## Tips
+
+1. If you would like to train large models with better performance, LayerDrop should be set to a smaller value such as 0.1 or 0.2. Too much LayerDrop will mean the model has too much regularization, so may not reach the best performance. Since LayerDrop adds regularization, you may achieve the best performance by slightly reducing the amount of standard dropout (for example, reduce by 0.1).
+
+2. If you would like to train large models to be pruned and made smaller, LayerDrop should be set to a larger value such as 0.5 if you want to prune very aggressively (such as removing half the network or more). If you would like to prune fewer layers away, LayerDrop can be set to a smaller value such as 0.2. Our experiments were conducted with low values of LayerDrop (such as 0.1 and 0.2), for reference.
+
+3. When pruning layers at inference time, it is best to spread out the layers remaining so they are evenly spaced throughout the network. For example, if you want to remove 50% of the network, keeping every other layer is good.
+
+
+## FAQ
+
+1. How did the sharing layers experiment work? In an appendix (https://openreview.net/pdf?id=SylO2yStDr) we added an experiment on Wikitext-103 language modeling that combined LayerDrop with Weight Sharing. We shared chunks of 2 layers such that every other layer had shared weights. For example, if our network has layers 1 through 6, then layer 1 and 2 are shared, layer 3 and 4 are shared, and layer 5 and 6 are shared.
+
+2. LayerDrop hasn't been helping in my setting? During training time, LayerDrop can help regularize your network. This is most important if your network is already overfitting - if your network is underfitting, it is possible LayerDrop is adding too much regularization. We recommend using smaller values (such as 0.1 or 0.2) and also decreasing the quantity of standard dropout (for example, reduce by 0.1).
+
+3. Can you train a model without LayerDrop and finetune with LayerDrop (e.g. for BERT)? In our experiments, we did not see great performance. Models such as RoBERTa have trained for a long time in the pre-training setting, so only finetuning with LayerDrop for a few epochs on a downstream task such as MNLI does not achieve the robustness required for successful pruning.
+
+
+## Having an issue or have a question?
+
+Please open an issue in this repository with the details of your question. Thanks!
diff --git a/fairseq/examples/linformer/linformer_src/models/__init__.py b/fairseq/examples/linformer/linformer_src/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391