]
+ # so source tokens can be used to predict P(S|T)
+ return torch.stack([reorder_tokens(token, length, eos) for token, length in zip(tokens, lengths)])
+
+
+def normalized_scores_with_batch_vocab(
+ model_decoder, features, target_ids, k, bsz, beam_size,
+ pad_idx, top_k=0, vocab_size_meter=None, start_idx=None,
+ end_idx=None, **kwargs):
+ """
+ Get normalized probabilities (or log probs) from a net's output
+ w.r.t. vocab consisting of target IDs in the batch
+ """
+ if model_decoder.adaptive_softmax is None:
+ weight = model_decoder.output_projection.weight
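+ # Restrict the output projection to the token IDs that appear in the batch
+ # (plus the first `top_k` vocabulary IDs); targets are remapped to indices
+ # into this reduced vocabulary before their log-probabilities are gathered.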
+ vocab_ids = torch.unique(
+ torch.cat(
+ (torch.unique(target_ids), torch.arange(top_k, device=target_ids.device))
+ )
+ )
+ id_map = dict(zip(vocab_ids.tolist(), range(len(vocab_ids))))
+ mapped_target_ids = target_ids.cpu().apply_(
+ lambda x, id_map=id_map: id_map[x]
+ ).to(target_ids.device)
+ expanded_target_ids = mapped_target_ids[:, :].repeat(1, k).view(bsz*beam_size*k, -1)
+ if start_idx is not None and end_idx is not None:
+ expanded_target_ids = expanded_target_ids[start_idx:end_idx, :]
+ logits = F.linear(features, weight[vocab_ids, :])
+ log_softmax = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+ intermed_scores = torch.gather(
+ log_softmax[:, :-1, :],
+ 2,
+ expanded_target_ids[:, 1:].unsqueeze(2),
+ ).squeeze()
+ not_padding = expanded_target_ids[:, 1:] != pad_idx
+ intermed_scores *= not_padding.float()
+ return intermed_scores
+ else:
+ raise ValueError("adaptive softmax doesn't work with " +
+ "`normalized_scores_with_batch_vocab()`")
diff --git a/fairseq/examples/fast_noisy_channel/noisy_channel_translation.py b/fairseq/examples/fast_noisy_channel/noisy_channel_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..b74bdfd456f9b7c546ce528173c77431b4f57ac1
--- /dev/null
+++ b/fairseq/examples/fast_noisy_channel/noisy_channel_translation.py
@@ -0,0 +1,127 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.tasks.translation import TranslationTask
+from fairseq.tasks.language_modeling import LanguageModelingTask
+from fairseq import checkpoint_utils
+import argparse
+from fairseq.tasks import register_task
+import torch
+
+
+@register_task("noisy_channel_translation")
+class NoisyChannelTranslation(TranslationTask):
+ """
+ Rescore the top k candidates from each beam using noisy channel modeling
+ """
+
+ @staticmethod
+ def add_args(parser):
+ """Add task-specific arguments to the parser."""
+ TranslationTask.add_args(parser)
+ # fmt: off
+ parser.add_argument('--channel-model', metavar='FILE',
+ help='path to P(S|T) model. P(S|T) and P(T|S) must share source and target dictionaries.')
+ parser.add_argument('--combine-method', default='lm_only',
+ choices=['lm_only', 'noisy_channel'],
+ help="""method for combining direct and channel model scores.
+ lm_only: decode with P(T|S)P(T)
+ noisy_channel: decode with 1/t P(T|S) + 1/s(P(S|T)P(T))""")
+ parser.add_argument('--normalize-lm-scores-by-tgt-len', action='store_true', default=False,
+ help='normalize lm score by target length instead of source length')
+ parser.add_argument('--channel-scoring-type', default='log_norm', choices=['unnormalized', 'log_norm', 'k2_separate', 'src_vocab', 'src_vocab_batched'],
+ help="Normalize bw scores with log softmax or return bw scores without log softmax")
+ parser.add_argument('--top-k-vocab', default=0, type=int,
+ help='top k vocab IDs to use with `src_vocab` in channel model scoring')
+ parser.add_argument('--k2', default=50, type=int,
+ help='the top k2 candidates to rescore with the noisy channel model for each beam')
+ parser.add_argument('--ch-wt', default=1, type=float,
+ help='weight for the channel model')
+ parser.add_argument('--lm-model', metavar='FILE',
+ help='path to lm model file, to model P(T). P(T) must share the same vocab as the direct model on the target side')
+ parser.add_argument('--lm-data', metavar='FILE',
+ help='path to lm model training data for target language, used to properly load LM with correct dictionary')
+ parser.add_argument('--lm-wt', default=1, type=float,
+ help='the weight of the lm in joint decoding')
+ # fmt: on
+
+ def build_generator(
+ self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None
+ ):
+ if getattr(args, "score_reference", False):
+ raise NotImplementedError()
+ else:
+ from .noisy_channel_sequence_generator import NoisyChannelSequenceGenerator
+ use_cuda = torch.cuda.is_available() and not self.args.cpu
+ assert self.args.lm_model is not None, '--lm-model required for noisy channel generation!'
+ assert self.args.lm_data is not None, '--lm-data required for noisy channel generation to map between LM and bitext vocabs'
+ if self.args.channel_model is not None:
+ import copy
+ ch_args_task = copy.deepcopy(self.args)
+ tmp = ch_args_task.source_lang
+ ch_args_task.source_lang = ch_args_task.target_lang
+ ch_args_task.target_lang = tmp
+ ch_args_task._name = 'translation'
+ channel_task = TranslationTask.setup_task(ch_args_task)
+
+ arg_dict = {}
+ arg_dict['task'] = 'language_modeling'
+ arg_dict['sample_break_mode'] = 'eos'
+ arg_dict['data'] = self.args.lm_data
+ arg_dict['output_dictionary_size'] = -1
+ lm_args = argparse.Namespace(**arg_dict)
+ lm_task = LanguageModelingTask.setup_task(lm_args)
+ lm_dict = lm_task.output_dictionary
+
+ if self.args.channel_model is not None:
+ channel_models, _ = checkpoint_utils.load_model_ensemble(self.args.channel_model.split(':'), task=channel_task)
+
+ for model in channel_models:
+ model.make_generation_fast_(
+ beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
+ need_attn=args.print_alignment,
+ )
+ if self.args.fp16:
+ model.half()
+ if use_cuda:
+ model.cuda()
+ else:
+ channel_models = None
+
+ lm_models, _ = checkpoint_utils.load_model_ensemble(self.args.lm_model.split(':'), task=lm_task)
+
+ for model in lm_models:
+ model.make_generation_fast_(
+ beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
+ need_attn=args.print_alignment,
+ )
+ if self.args.fp16:
+ model.half()
+ if use_cuda:
+ model.cuda()
+ return NoisyChannelSequenceGenerator(
+ combine_method=self.args.combine_method,
+ tgt_dict=self.target_dictionary,
+ src_dict=self.source_dictionary,
+ beam_size=getattr(args, 'beam', 5),
+ max_len_a=getattr(args, 'max_len_a', 0),
+ max_len_b=getattr(args, 'max_len_b', 200),
+ min_len=getattr(args, 'min_len', 1),
+ len_penalty=getattr(args, 'lenpen', 1),
+ unk_penalty=getattr(args, 'unkpen', 0),
+ temperature=getattr(args, 'temperature', 1.),
+ match_source_len=getattr(args, 'match_source_len', False),
+ no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0),
+ normalize_scores=(not getattr(args, 'unnormalized', False)),
+ channel_models=channel_models,
+ k2=getattr(self.args, 'k2', 50),
+ ch_weight=getattr(self.args, 'ch_wt', 1),
+ channel_scoring_type=self.args.channel_scoring_type,
+ top_k_vocab=self.args.top_k_vocab,
+ lm_models=lm_models,
+ lm_dict=lm_dict,
+ lm_weight=getattr(self.args, 'lm_wt', 1),
+ normalize_lm_scores_by_tgt_len=getattr(self.args, 'normalize_lm_scores_by_tgt_len', False),
+ )
diff --git a/fairseq/examples/flores101/README.md b/fairseq/examples/flores101/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..635c13f40bd0ccab704735bc5c26ea0192ea98cd
--- /dev/null
+++ b/fairseq/examples/flores101/README.md
@@ -0,0 +1,223 @@
+# Flores101: Large-Scale Multilingual Machine Translation
+
+## Introduction
+
+Baseline pretrained models for the Small and Large tracks of the WMT 21 Large-Scale Multilingual Machine Translation competition.
+
+Flores Task at WMT 21: http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html
+
+Flores announcement blog post: https://ai.facebook.com/blog/flores-researchers-kick-off-multilingual-translation-challenge-at-wmt-and-call-for-compute-grants/
+
+
+
+## Pretrained models
+
+Model | Num layers | Embed dimension | FFN dimension | Vocab size | #params | Download
+---|---|---|---|---|---|---
+`flores101_mm100_615M` | 12 | 1024 | 4096 | 256,000 | 615M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz
+`flores101_mm100_175M` | 6 | 512 | 2048 | 256,000 | 175M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz
+
+
+These models are trained similarly to [M2M-100](https://arxiv.org/abs/2010.11125), with additional support for the languages that are part of the WMT Large-Scale Multilingual Machine Translation track. The full list of languages can be found at the bottom of this page.
+
+
+## Example Generation code
+
+### Download model, sentencepiece vocab
+
+```bash
+fairseq=/path/to/fairseq
+cd $fairseq
+
+# Download 615M param model.
+wget https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz
+
+# Extract
+tar -xvzf flores101_mm100_615M.tar.gz
+```
+
+### Encode using our SentencePiece Model
+Note: Install SentencePiece from [here](https://github.com/google/sentencepiece)
+
+
+```bash
+fairseq=/path/to/fairseq
+cd $fairseq
+
+# Download an example dataset from German to French (first 20 lines of the WMT19 test set)
+sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de
+sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr
+
+for lang in de fr ; do
+ python scripts/spm_encode.py \
+ --model flores101_mm100_615M/sentencepiece.bpe.model \
+ --output_format=piece \
+ --inputs=raw_input.de-fr.${lang} \
+ --outputs=spm.de-fr.${lang}
+done
+```
+
+### Binarization
+
+```bash
+fairseq-preprocess \
+ --source-lang de --target-lang fr \
+ --testpref spm.de-fr \
+ --thresholdsrc 0 --thresholdtgt 0 \
+ --destdir data_bin \
+ --srcdict flores101_mm100_615M/dict.txt --tgtdict flores101_mm100_615M/dict.txt
+```
+
+### Generation
+
+
+```bash
+fairseq-generate \
+ data_bin \
+ --batch-size 1 \
+ --path flores101_mm100_615M/model.pt \
+ --fixed-dictionary flores101_mm100_615M/dict.txt \
+ -s de -t fr \
+ --remove-bpe 'sentencepiece' \
+ --beam 5 \
+ --task translation_multi_simple_epoch \
+ --lang-pairs flores101_mm100_615M/language_pairs.txt \
+ --decoder-langtok --encoder-langtok src \
+ --gen-subset test \
+ --fp16 \
+ --dataset-impl mmap \
+ --distributed-world-size 1 --distributed-no-spawn
+```
+
+### Supported Languages and lang code
+
+Language | lang code
+---|---
+Afrikaans | af
+Amharic | am
+Arabic | ar
+Assamese | as
+Asturian | ast
+Aymara | ay
+Azerbaijani | az
+Bashkir | ba
+Belarusian | be
+Bulgarian | bg
+Bengali | bn
+Breton | br
+Bosnian | bs
+Catalan | ca
+Cebuano | ceb
+Chokwe | cjk
+Czech | cs
+Welsh | cy
+Danish | da
+German | de
+Dyula | dyu
+Greek | el
+English | en
+Spanish | es
+Estonian | et
+Persian | fa
+Fulah | ff
+Finnish | fi
+French | fr
+Western Frisian | fy
+Irish | ga
+Scottish Gaelic | gd
+Galician | gl
+Gujarati | gu
+Hausa | ha
+Hebrew | he
+Hindi | hi
+Croatian | hr
+Haitian Creole | ht
+Hungarian | hu
+Armenian | hy
+Indonesian | id
+Igbo | ig
+Iloko | ilo
+Icelandic | is
+Italian | it
+Japanese | ja
+Javanese | jv
+Georgian | ka
+Kachin | kac
+Kamba | kam
+Kabuverdianu | kea
+Kongo | kg
+Kazakh | kk
+Central Khmer | km
+Kimbundu | kmb
+Northern Kurdish | kmr
+Kannada | kn
+Korean | ko
+Kurdish | ku
+Kyrgyz | ky
+Luxembourgish | lb
+Ganda | lg
+Lingala | ln
+Lao | lo
+Lithuanian | lt
+Luo | luo
+Latvian | lv
+Malagasy | mg
+Maori | mi
+Macedonian | mk
+Malayalam | ml
+Mongolian | mn
+Marathi | mr
+Malay | ms
+Maltese | mt
+Burmese | my
+Nepali | ne
+Dutch | nl
+Norwegian | no
+Northern Sotho | ns
+Nyanja | ny
+Occitan | oc
+Oromo | om
+Oriya | or
+Punjabi | pa
+Polish | pl
+Pashto | ps
+Portuguese | pt
+Quechua | qu
+Romanian | ro
+Russian | ru
+Sindhi | sd
+Shan | shn
+Sinhala | si
+Slovak | sk
+Slovenian | sl
+Shona | sn
+Somali | so
+Albanian | sq
+Serbian | sr
+Swati | ss
+Sundanese | su
+Swedish | sv
+Swahili | sw
+Tamil | ta
+Telugu | te
+Tajik | tg
+Thai | th
+Tigrinya | ti
+Tagalog | tl
+Tswana | tn
+Turkish | tr
+Ukrainian | uk
+Umbundu | umb
+Urdu | ur
+Uzbek | uz
+Vietnamese | vi
+Wolof | wo
+Xhosa | xh
+Yiddish | yi
+Yoruba | yo
+Chinese | zh
+Zulu | zu
diff --git a/fairseq/examples/flores101/flores_logo.png b/fairseq/examples/flores101/flores_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..d4d1455c6eab608ff5317ce885183cd213564273
Binary files /dev/null and b/fairseq/examples/flores101/flores_logo.png differ
diff --git a/fairseq/examples/fully_sharded_data_parallel/README.md b/fairseq/examples/fully_sharded_data_parallel/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b9e44fef48bee5faeee27b3d1d1b1eb96b6a477f
--- /dev/null
+++ b/fairseq/examples/fully_sharded_data_parallel/README.md
@@ -0,0 +1,177 @@
+# Fully Sharded Data Parallel (FSDP)
+
+## Overview
+Recent work by [Microsoft](https://arxiv.org/abs/1910.02054) and
+[Google](https://arxiv.org/abs/2004.13336) has shown that data parallel
+training can be made significantly more efficient by sharding the model
+parameters and optimizer state across data parallel workers. These ideas are
+encapsulated in the new **`FullyShardedDataParallel` (FSDP)** wrapper provided
+by [fairscale](https://github.com/facebookresearch/fairscale/).
+
+Compared to PyTorch DDP:
+* FSDP produces the same results as PyTorch DDP (it's still synchronous data parallel training)
+* FSDP shards parameters (FP16 + FP32) and optimizer state across data parallel GPUs
+* FSDP is faster than PyTorch DDP because the optimizer step is sharded, and the communication can be overlapped with the forward pass
+* FSDP enables training 13B parameter models on 8 GPUs and 175B parameter models on 128 GPUs
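+
+For intuition, the following is a minimal sketch (not part of fairseq) of wrapping a plain PyTorch module directly with fairscale's `FullyShardedDataParallel`; the module, sizes, and optimizer are illustrative, and it assumes a `torch.distributed` process group has already been initialized on each rank (e.g. via `torchrun`):
+
+```python
+import torch
+import torch.nn as nn
+from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
+
+# assumes torch.distributed.init_process_group(...) has already run on each rank
+model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024)).cuda()
+model = FSDP(model)  # parameters are flattened and sharded across data parallel ranks
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # sees only the local shard
+
+x = torch.randn(8, 1024, device="cuda")
+model(x).sum().backward()  # params are all-gathered for compute, grads reduce-scattered
+optimizer.step()           # the Adam update runs on each rank's shard only
+```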
+
+FSDP is fully supported in fairseq via the following new arguments:
+* `--ddp-backend=fully_sharded`: enables full sharding via FSDP
+* `--cpu-offload`: offloads the optimizer state and FP32 model copy to CPU (combine with `--optimizer=cpu_adam`)
+* `--no-reshard-after-forward`: increases training speed for large models (1B+ params) and is similar to ZeRO stage 2
+* other popular options (`--fp16`, `--update-freq`, `--checkpoint-activations`, `--offload-activations`, etc.) continue to work as normal
+
+### Limitations
+
+FSDP currently has several limitations compared to fairseq's default DDP backend (PyTorch DDP):
+* while FSDP is fully compatible with pointwise optimizers (e.g., Adam, AdamW, Adadelta, Adamax, SGD, etc.), it is not currently compatible with non-pointwise optimizers (e.g., Adagrad, Adafactor, LAMB, etc.)
+* FSDP depends on flattening the parameters, so models that currently require `--fp16-no-flatten-grads` may not be supported
+
+See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed
+explanation of these and other limitations.
+
+
+### How it works
+
+See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed
+explanation of how FSDP works.
+
+
+
+## Example usage
+
+The following examples illustrate how to train a very large language model with
+13 billion parameters on 1 GPU by offloading parameters and optimizer states to
+CPU, or on 8 GPUs by fully sharding the params and optimizer states across GPUs.
+
+These examples use the WikiText-103 dataset for demonstration purposes, but
+in practice a much larger dataset will be needed to achieve good results.
+Follow the [instructions here](https://github.com/pytorch/fairseq/blob/main/examples/roberta/README.pretraining.md#1-preprocess-the-data)
+to preprocess the WikiText-103 dataset using the GPT-2/RoBERTa vocabulary.
+
+### 13B params on 1 V100 GPU (with CPU offloading)
+
+The following command trains a 13B parameter GPT-3 model on a single V100 GPU
+using the `--cpu-offload` feature to offload parameters and optimizer states to
+CPU. In this setting, the optimizer step (Adam) happens on CPU. We also use the
+`--checkpoint-activations` feature (sometimes called [gradient checkpointing](https://pytorch.org/docs/stable/checkpoint.html)),
+which further saves memory in exchange for a small increase in computation.
+
+**Requirements:**
+- Install the latest master version of fairscale: `pip install git+https://github.com/facebookresearch/fairscale.git@master`
+- You'll need 32GB of GPU memory and ~256GB of system memory to train the 13B param model.
+- If you have less system memory, the 6.7B param model can be trained with ~128GB of system memory; just set `--arch transformer_lm_gpt3_6_7`.
+- We use the CPU Adam optimizer from [DeepSpeed](https://github.com/microsoft/DeepSpeed), so you'll need to `pip install deepspeed` before running the command.
+
+**Notes:**
+- The command will take ~5 minutes to start training, during which time it will appear to be hung, since randomly initializing 13B weights can be slow.
+- The `--cpu-offload` feature requires training in mixed precision (`--fp16`).
+- Tune the `OMP_NUM_THREADS` env variable for best performance with CPU offloading.
+- The example command below stops training after 10 steps (`--max-update 10`) and does not save checkpoints (`--no-save`).
+
+```bash
+OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0 \
+ fairseq-train data-bin/wikitext-103-roberta-bpe-bin \
+ --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
+ --cpu-offload --checkpoint-activations \
+ --task language_modeling --tokens-per-sample 2048 --batch-size 8 \
+ --arch transformer_lm_gpt3_13 \
+ --optimizer cpu_adam --adam-betas "(0.9,0.98)" \
+ --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
+ --max-update 10 --no-save --log-format json --log-interval 1
+```
+
+**Example output:**
+
+```
+(...)
+2021-03-08 12:29:51 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920)
+(...)
+2021-03-08 12:29:51 | INFO | fairseq_cli.train | training on 1 devices (GPUs/TPUs)
+2021-03-08 12:29:51 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8
+(...)
+Adam Optimizer #0 is created with AVX2 arithmetic capability.
+Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1
+(...)
+2021-03-08 12:31:36 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.475", "ppl": "91120.8", "wps": "0", "ups": "0", "wpb": "16384", "bsz": "8", "num_updates": "1", "lr": "2e-05", "gnorm": "20.751", "loss_scale": "4", "train_wall": "99", "gb_free": "9.3", "wall": "105"}
+2021-03-08 12:32:33 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.446", "ppl": "89281.6", "wps": "288.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "2", "lr": "4e-05", "gnorm": "19.777", "loss_scale": "4", "train_wall": "57", "gb_free": "9.3", "wall": "161"}
+2021-03-08 12:33:12 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0
+2021-03-08 12:33:51 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0
+2021-03-08 12:34:45 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "25.22", "ppl": "3.90691e+07", "wps": "123.4", "ups": "0.01", "wpb": "16384", "bsz": "8", "num_updates": "3", "lr": "6e-05", "gnorm": "131.281", "loss_scale": "1", "train_wall": "133", "gb_free": "9.3", "wall": "294"}
+2021-03-08 12:35:43 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.079", "ppl": "276809", "wps": "285.5", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "4", "lr": "8e-05", "gnorm": "13.776", "loss_scale": "1", "train_wall": "57", "gb_free": "9.3", "wall": "351"}
+2021-03-08 12:36:35 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "23.729", "ppl": "1.39088e+07", "wps": "316.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "5", "lr": "0.0001", "gnorm": "72.774", "loss_scale": "1", "train_wall": "52", "gb_free": "9.3", "wall": "403"}
+2021-03-08 12:37:28 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "20.429", "ppl": "1.41203e+06", "wps": "307.6", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "6", "lr": "8e-05", "gnorm": "60.846", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "456"}
+2021-03-08 12:38:27 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.965", "ppl": "511684", "wps": "279.4", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "7", "lr": "6e-05", "gnorm": "22.687", "loss_scale": "1", "train_wall": "59", "gb_free": "9.3", "wall": "515"}
+2021-03-08 12:39:18 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.345", "ppl": "332887", "wps": "319.1", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "8", "lr": "4e-05", "gnorm": "8.451", "loss_scale": "1", "train_wall": "51", "gb_free": "9.3", "wall": "566"}
+2021-03-08 12:40:11 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "18.262", "ppl": "314336", "wps": "305.9", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "9", "lr": "2e-05", "gnorm": "6.457", "loss_scale": "1", "train_wall": "54", "gb_free": "9.3", "wall": "620"}
+2021-03-08 12:41:04 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "17.556", "ppl": "192686", "wps": "311.8", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "10", "lr": "0", "gnorm": "5.796", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "673"}
+2021-03-08 12:41:04 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10
+2021-03-08 12:41:04 | INFO | fairseq_cli.train | begin validation on "valid" subset
+2021-03-08 12:43:15 | INFO | valid | {"epoch": 1, "valid_loss": "17.953", "valid_ppl": "253807", "valid_wps": "1868.4", "valid_wpb": "15400.2", "valid_bsz": "7.6", "valid_num_updates": "10"}
+2021-03-08 12:43:15 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below)
+2021-03-08 12:43:15 | INFO | train | {"epoch": 1, "train_loss": "19.351", "train_ppl": "668509", "train_wps": "210.9", "train_ups": "0.01", "train_wpb": "16384", "train_bsz": "8", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "36.26", "train_loss_scale": "1", "train_train_wall": "667", "train_gb_free": "9.3", "train_wall": "804"}
+2021-03-08 12:43:15 | INFO | fairseq_cli.train | done training in 798.6 seconds
+```
+
+
+
+### 13B params on 8 V100 GPUs (with full parameter + optimizer state sharding)
+
+FSDP can also shard the parameters and optimizer states across multiple GPUs,
+reducing memory requirements significantly. On 8 x 32GB GPUs, sharding enables
+training the same 13B parameter model *without offloading the parameters to
+CPU*. However, without CPU offloading we'd only be able to fit a batch size of
+1 per GPU, which would cause training speed to suffer.
+
+We obtain the best performance on 8 GPUs by combining full sharding and CPU
+offloading. The following command trains the same 13B parameter GPT-3 model as
+before on 8 x 32GB V100 GPUs; training speed increases superlinearly from ~310
+words per second to ~3200 words per second.
+
+```bash
+OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+ fairseq-train data-bin/wikitext-103-roberta-bpe-bin \
+ --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
+ --cpu-offload --checkpoint-activations \
+ --task language_modeling --tokens-per-sample 2048 --batch-size 8 \
+ --arch transformer_lm_gpt3_13 \
+ --optimizer cpu_adam --adam-betas "(0.9,0.98)" \
+ --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
+ --max-update 10 --no-save --log-format json --log-interval 1
+```
+
+**Example output:**
+
+```
+(...)
+2021-03-08 18:04:09 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920)
+(...)
+2021-03-08 18:04:09 | INFO | fairseq_cli.train | training on 8 devices (GPUs/TPUs)
+2021-03-08 18:04:09 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8
+(...)
+Adam Optimizer #0 is created with AVX2 arithmetic capability.
+Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1
+(...)
+2021-03-08 18:05:06 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "16.408", "ppl": "86945.6", "wps": "0", "ups": "0", "wpb": "131072", "bsz": "64", "num_updates": "1", "lr": "2e-05", "gnorm": "18.27", "loss_scale": "4", "train_wall": "47", "gb_free": "9.3", "wall": "56"}
+2021-03-08 18:05:45 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "16.352", "ppl": "83644.3", "wps": "3283.4", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "2", "lr": "4e-05", "gnorm": "18.411", "loss_scale": "4", "train_wall": "40", "gb_free": "9.3", "wall": "96"}
+2021-03-08 18:06:21 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0
+2021-03-08 18:06:56 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0
+2021-03-08 18:07:37 | INFO | train_inner | {"epoch": 1, "update": 0.006, "loss": "23.682", "ppl": "1.34537e+07", "wps": "1176.6", "ups": "0.01", "wpb": "131072", "bsz": "64", "num_updates": "3", "lr": "6e-05", "gnorm": "119.682", "loss_scale": "1", "train_wall": "111", "gb_free": "9.3", "wall": "208"}
+2021-03-08 18:08:18 | INFO | train_inner | {"epoch": 1, "update": 0.007, "loss": "18.988", "ppl": "519921", "wps": "3189.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "4", "lr": "8e-05", "gnorm": "14.934", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "249"}
+2021-03-08 18:08:59 | INFO | train_inner | {"epoch": 1, "update": 0.008, "loss": "20.08", "ppl": "1.10798e+06", "wps": "3223.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "5", "lr": "0.0001", "gnorm": "59.92", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "289"}
+2021-03-08 18:09:39 | INFO | train_inner | {"epoch": 1, "update": 0.009, "loss": "18.323", "ppl": "327980", "wps": "3256.6", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "6", "lr": "8e-05", "gnorm": "37.425", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "330"}
+2021-03-08 18:10:20 | INFO | train_inner | {"epoch": 1, "update": 0.01, "loss": "17.264", "ppl": "157354", "wps": "3188.7", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "7", "lr": "6e-05", "gnorm": "10.824", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "371"}
+2021-03-08 18:11:01 | INFO | train_inner | {"epoch": 1, "update": 0.011, "loss": "16.794", "ppl": "113647", "wps": "3230", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "8", "lr": "4e-05", "gnorm": "5.616", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "411"}
+2021-03-08 18:11:39 | INFO | train_inner | {"epoch": 1, "update": 0.012, "loss": "16.706", "ppl": "106938", "wps": "3384", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "9", "lr": "2e-05", "gnorm": "5.318", "loss_scale": "1", "train_wall": "39", "gb_free": "9.3", "wall": "450"}
+2021-03-08 18:12:19 | INFO | train_inner | {"epoch": 1, "update": 0.013, "loss": "16.548", "ppl": "95796.2", "wps": "3274.4", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "10", "lr": "0", "gnorm": "5.22", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "490"}
+2021-03-08 18:12:19 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10
+2021-03-08 18:12:19 | INFO | fairseq_cli.train | begin validation on "valid" subset
+2021-03-08 18:12:45 | INFO | valid | {"epoch": 1, "valid_loss": "16.624", "valid_ppl": "101000", "valid_wps": "10855.9", "valid_wpb": "123202", "valid_bsz": "60.5", "valid_num_updates": "10"}
+2021-03-08 18:12:45 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below)
+2021-03-08 18:12:45 | INFO | train | {"epoch": 1, "train_loss": "18.114", "train_ppl": "283776", "train_wps": "2567.8", "train_ups": "0.02", "train_wpb": "131072", "train_bsz": "64", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "29.562", "train_loss_scale": "1", "train_train_wall": "480", "train_gb_free": "9.3", "train_wall": "516"}
+2021-03-08 18:12:45 | INFO | fairseq_cli.train | done training in 509.9 seconds
+```
+
+
diff --git a/fairseq/examples/gottbert/README.md b/fairseq/examples/gottbert/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1d58feb279a4a50222290546c3bb285d3cea98e6
--- /dev/null
+++ b/fairseq/examples/gottbert/README.md
@@ -0,0 +1,64 @@
+# GottBERT: a pure German language model
+
+## Introduction
+
+[GottBERT](http://arxiv.org/abs/2012.02110) is a pretrained German language model based on RoBERTa, trained on 145GB of German text.
+
+## Example usage
+
+### fairseq
+##### Load GottBERT from torch.hub (PyTorch >= 1.1):
+```python
+import torch
+gottbert = torch.hub.load('pytorch/fairseq', 'gottbert-base')
+gottbert.eval() # disable dropout (or leave in train mode to finetune)
+```
+
+##### Load GottBERT (for PyTorch 1.0 or custom models):
+```python
+# Download gottbert model
+wget https://dl.gottbert.de/fairseq/models/gottbert-base.tar.gz
+tar -xzvf gottbert-base.tar.gz
+
+# Load the model in fairseq
+from fairseq.models.roberta import GottbertModel
+gottbert = GottbertModel.from_pretrained('/path/to/gottbert')
+gottbert.eval() # disable dropout (or leave in train mode to finetune)
+```
+
+##### Filling masks:
+```python
+masked_line = 'Gott ist <mask> ! :)'
+gottbert.fill_mask(masked_line, topk=3)
+# [('Gott ist gut ! :)', 0.3642110526561737, ' gut'),
+# ('Gott ist überall ! :)', 0.06009674072265625, ' überall'),
+# ('Gott ist großartig ! :)', 0.0370681993663311, ' großartig')]
+```
+
+##### Extract features from GottBERT
+
+```python
+# Extract the last layer's features
+line = "Der erste Schluck aus dem Becher der Naturwissenschaft macht atheistisch , aber auf dem Grunde des Bechers wartet Gott !"
+tokens = gottbert.encode(line)
+last_layer_features = gottbert.extract_features(tokens)
+assert last_layer_features.size() == torch.Size([1, 27, 768])
+
+# Extract all layers' features (layer 0 is the embedding layer)
+all_layers = gottbert.extract_features(tokens, return_all_hiddens=True)
+assert len(all_layers) == 13
+assert torch.all(all_layers[-1] == last_layer_features)
+```
+## Citation
+If you use our work, please cite:
+
+```bibtex
+@misc{scheible2020gottbert,
+ title={GottBERT: a pure German Language Model},
+ author={Raphael Scheible and Fabian Thomczyk and Patric Tippmann and Victor Jaravine and Martin Boeker},
+ year={2020},
+ eprint={2012.02110},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
diff --git a/fairseq/examples/hubert/README.md b/fairseq/examples/hubert/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6695d819713b3e2077cf0fab30469b237e1cf1be
--- /dev/null
+++ b/fairseq/examples/hubert/README.md
@@ -0,0 +1,116 @@
+# HuBERT
+
+## Pre-trained and fine-tuned (ASR) models
+Model | Pretraining Data | Finetuning Dataset | Download | Quantizer
+|---|---|---|---|---
+HuBERT Base (~95M params) | [Librispeech](http://www.openslr.org/12) 960 hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | [L9 km500](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960_L9_km500.bin)
+HuBERT Large (~316M params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt)
+HuBERT Extra Large (~1B params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k.pt)
+HuBERT Large | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k_finetune_ls960.pt)
+HuBERT Extra Large | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt)
+
+## Load a model
+```python
+import fairseq.checkpoint_utils
+
+ckpt_path = "/path/to/the/checkpoint.pt"
+models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+model = models[0]
+```
+
+## Train a new model
+
+### Data preparation
+
+Follow the steps in `./simple_kmeans` to create:
+- `{train,valid}.tsv` waveform list files
+- `{train,valid}.km` frame-aligned pseudo label files
+- `dict.km.txt` a dummy dictionary
+
+The `label_rate` is the same as the feature frame rate used for clustering,
+which is 100 Hz for MFCC features and 50 Hz for HuBERT features by default.
+
+### Pre-train a HuBERT model
+
+Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km`
+are saved at `/path/to/labels`, and the label rate is 100Hz.
+
+To train a base model (a 12-layer transformer), run:
+```sh
+$ python fairseq_cli/hydra_train.py \
+ --config-dir /path/to/fairseq-py/examples/hubert/config/pretrain \
+ --config-name hubert_base_librispeech \
+ task.data=/path/to/data task.label_dir=/path/to/labels task.labels='["km"]' model.label_rate=100
+```
+
+### Fine-tune a HuBERT model with a CTC loss
+
+Suppose `{train,valid}.tsv` are saved at `/path/to/data`, and their
+corresponding character transcripts `{train,valid}.ltr` are saved at
+`/path/to/trans`.
+
+To fine-tune a pre-trained HuBERT model at `/path/to/checkpoint`, run
+```sh
+$ python fairseq_cli/hydra_train.py \
+ --config-dir /path/to/fairseq-py/examples/hubert/config/finetune \
+ --config-name base_10h \
+ task.data=/path/to/data task.label_dir=/path/to/trans \
+ model.w2v_path=/path/to/checkpoint
+```
+
+### Decode a HuBERT model
+
+Suppose the `test.tsv` and `test.ltr` are the waveform list and transcripts of
+the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is
+saved at `/path/to/checkpoint`. We support three decoding modes:
+- Viterbi decoding: greedy decoding without a language model
+- KenLM decoding: decoding with an arpa-format KenLM n-gram language model
+- Fairseq-LM decoding: decoding with a Fairseq neural language model
+
+
+#### Viterbi decoding
+
+`task.normalize` needs to be consistent with the value used during fine-tuning.
+Decoding results will be saved at
+`/path/to/experiment/directory/decode/viterbi/test`.
+
+```sh
+$ python examples/speech_recognition/new/infer.py \
+ --config-dir /path/to/fairseq-py/examples/hubert/config/decode \
+ --config-name infer_viterbi \
+ task.data=/path/to/data \
+ task.normalize=[true|false] \
+ decoding.exp_dir=/path/to/experiment/directory \
+ common_eval.path=/path/to/checkpoint \
+ dataset.gen_subset=test
+```
+
+#### KenLM / Fairseq-LM decoding
+
+Suppose the pronunciation lexicon and the n-gram LM are saved at
+`/path/to/lexicon` and `/path/to/arpa`, respectively. Decoding results will be
+saved at `/path/to/experiment/directory/decode/kenlm/test`.
+
+```sh
+$ python examples/speech_recognition/new/infer.py \
+ --config-dir /path/to/fairseq-py/examples/hubert/config/decode \
+ --config-name infer_kenlm \
+ task.data=/path/to/data \
+ task.normalize=[true|false] \
+ decoding.exp_dir=/path/to/experiment/directory \
+ common_eval.path=/path/to/checkpoint \
+ dataset.gen_subset=test \
+ decoding.decoder.lexicon=/path/to/lexicon \
+ decoding.decoder.lmpath=/path/to/arpa
+```
+
+The command above uses the default decoding hyperparameters, which can be found
+in `examples/speech_recognition/hydra/decoder.py`. These parameters can be
+overridden from the command line. For example, to search with a beam size of
+500, append `decoding.decoder.beam=500` to the command above.
+Important parameters include:
+- decoding.decoder.beam
+- decoding.decoder.beamthreshold
+- decoding.decoder.lmweight
+- decoding.decoder.wordscore
+- decoding.decoder.silweight
+
+To decode with a Fairseq LM, use `--config-name infer_fsqlm` instead, and
+change the lexicon and LM paths accordingly.
diff --git a/fairseq/examples/hubert/config/decode/ax_sweep/ngram.yaml b/fairseq/examples/hubert/config/decode/ax_sweep/ngram.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5a02df1f7da7eebfebe4018ef2758a716fbab646
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/ax_sweep/ngram.yaml
@@ -0,0 +1,33 @@
+# @package _global_
+
+common_eval:
+ results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset}
+
+hydra:
+ sweeper:
+ ax_config:
+ max_trials: 60
+ early_stop:
+ minimize: true
+ max_epochs_without_improvement: 10
+ epsilon: 0.025
+ experiment:
+ name: ${dataset.gen_subset}
+ objective_name: wer
+ minimize: true
+ parameter_constraints: null
+ outcome_constraints: null
+ status_quo: null
+ client:
+ verbose_logging: false
+ random_seed: null
+ params:
+ decoding.decoder.lmweight:
+ type: range
+ bounds: [0.0, 8.0]
+ decoding.decoder.wordscore:
+ type: range
+ bounds: [-5.0, 5.0]
+ decoding.decoder.silweight:
+ type: range
+ bounds: [-10.0, 0.0]
diff --git a/fairseq/examples/hubert/config/decode/ax_sweep/transformer.yaml b/fairseq/examples/hubert/config/decode/ax_sweep/transformer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..85ed3bd1a5a44871260f572786044c28f441add6
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/ax_sweep/transformer.yaml
@@ -0,0 +1,33 @@
+# @package _global_
+
+common_eval:
+ results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset}
+
+hydra:
+ sweeper:
+ ax_config:
+ max_trials: 60
+ early_stop:
+ minimize: true
+ max_epochs_without_improvement: 10
+ epsilon: 0.025
+ experiment:
+ name: ${dataset.gen_subset}
+ objective_name: wer
+ minimize: true
+ parameter_constraints: null
+ outcome_constraints: null
+ status_quo: null
+ client:
+ verbose_logging: false
+ random_seed: null
+ params:
+ decoding.decoder.lmweight:
+ type: range
+ bounds: [0.0, 4.0]
+ decoding.decoder.wordscore:
+ type: range
+ bounds: [-5.0, 5.0]
+ decoding.decoder.silweight:
+ type: range
+ bounds: [-8.0, 0.0]
diff --git a/fairseq/examples/hubert/config/decode/infer_fsqlm.yaml b/fairseq/examples/hubert/config/decode/infer_fsqlm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..026ad8db89a0673969a99fed6e1e84fc41fc7a1a
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/infer_fsqlm.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+
+defaults:
+ - model: null
+
+hydra:
+ run:
+ dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+ sweep:
+ dir: ${common_eval.results_path}
+ subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+
+task:
+ _name: hubert_pretraining
+ single_target: true
+ fine_tuning: true
+ data: ???
+ normalize: ???
+
+decoding:
+ type: fairseqlm
+ lexicon: ???
+ lmpath: ???
+ beamthreshold: 25
+ beam: 500
+ lmweight: 2
+ wordscore: -1
+ silweight: 0
+ unique_wer_file: true
+common_eval:
+ results_path: ???
+ path: ???
+ post_process: letter
+dataset:
+ max_tokens: 1100000
+ gen_subset: ???
diff --git a/fairseq/examples/hubert/config/decode/infer_kenlm.yaml b/fairseq/examples/hubert/config/decode/infer_kenlm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..04642aeb6530133ab44e12e11e3d1661e3b9c32c
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/infer_kenlm.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+
+defaults:
+ - model: null
+
+hydra:
+ run:
+ dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+ sweep:
+ dir: ${common_eval.results_path}
+ subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+
+task:
+ _name: hubert_pretraining
+ single_target: true
+ fine_tuning: true
+ data: ???
+ normalize: ???
+
+decoding:
+ type: kenlm
+ lexicon: ???
+ lmpath: ???
+ beamthreshold: 100
+ beam: 500
+ lmweight: 2
+ wordscore: -1
+ silweight: 0
+ unique_wer_file: true
+common_eval:
+ results_path: ???
+ path: ???
+ post_process: letter
+dataset:
+ max_tokens: 1100000
+ gen_subset: ???
diff --git a/fairseq/examples/hubert/config/decode/infer_viterbi.yaml b/fairseq/examples/hubert/config/decode/infer_viterbi.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4afc74c18ca890e1a20c6beabeb9059dd0f480f4
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/infer_viterbi.yaml
@@ -0,0 +1,29 @@
+# @package _group_
+
+defaults:
+ - model: null
+
+hydra:
+ run:
+ dir: ${common_eval.results_path}/viterbi
+ sweep:
+ dir: ${common_eval.results_path}
+ subdir: viterbi
+
+task:
+ _name: hubert_pretraining
+ single_target: true
+ fine_tuning: true
+ data: ???
+ normalize: ???
+
+decoding:
+ type: viterbi
+ unique_wer_file: true
+common_eval:
+ results_path: ???
+ path: ???
+ post_process: letter
+dataset:
+ max_tokens: 1100000
+ gen_subset: ???
diff --git a/fairseq/examples/hubert/config/decode/run/submitit_slurm.yaml b/fairseq/examples/hubert/config/decode/run/submitit_slurm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0b8065832ecacf9dd4fe4e99c87941e00fb3ef7f
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/run/submitit_slurm.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+hydra:
+ launcher:
+ cpus_per_task: ${distributed_training.distributed_world_size}
+ gpus_per_node: ${distributed_training.distributed_world_size}
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
+ nodes: 1
+ mem_gb: 200
+ timeout_min: 4320
+ max_num_timeout: 50
+ name: ${hydra.job.config_name}
+ submitit_folder: ${hydra.sweep.dir}/submitit
+
+distributed_training:
+ distributed_world_size: 1
+ distributed_no_spawn: true
+ distributed_port: 29761
diff --git a/fairseq/examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml b/fairseq/examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2f669f376312dbfe4611cc08f4996a314155fb87
--- /dev/null
+++ b/fairseq/examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+hydra:
+ launcher:
+ cpus_per_task: ${distributed_training.distributed_world_size}
+ gpus_per_node: ${distributed_training.distributed_world_size}
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
+ nodes: 1
+ mem_gb: 200
+ timeout_min: 4320
+ max_num_timeout: 50
+ name: ${hydra.job.config_name}
+ submitit_folder: ${hydra.sweep.dir}/submitit
+
+distributed_training:
+ distributed_world_size: 8
+ distributed_no_spawn: true
+ distributed_port: 29761
diff --git a/fairseq/examples/hubert/config/finetune/base_10h.yaml b/fairseq/examples/hubert/config/finetune/base_10h.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a22c7c0347f792221f209bcfba7ba380a69f90a8
--- /dev/null
+++ b/fairseq/examples/hubert/config/finetune/base_10h.yaml
@@ -0,0 +1,100 @@
+# @package _group_
+
+common:
+ fp16: true
+ log_format: json
+ log_interval: 200
+ tensorboard_logdir: tblog
+ seed: 1337
+
+checkpoint:
+ save_interval: 5
+ keep_interval_updates: 1
+ no_epoch_checkpoints: true
+ best_checkpoint_metric: wer
+
+distributed_training:
+ ddp_backend: c10d
+ find_unused_parameters: true
+ distributed_world_size: 1
+ distributed_port: 29671
+ nprocs_per_node: 8
+
+task:
+ _name: hubert_pretraining
+ data: ???
+ fine_tuning: true
+ label_dir: ???
+ normalize: false # must be consistent with pre-training
+ labels: ["ltr"]
+ single_target: true
+
+dataset:
+ num_workers: 0
+ max_tokens: 3200000
+ validate_after_updates: ${model.freeze_finetune_updates}
+ validate_interval: 5
+ train_subset: train
+ valid_subset: valid
+
+criterion:
+ _name: ctc
+ zero_infinity: true
+
+optimization:
+ max_update: 25000
+ lr: [2e-5]
+ sentence_avg: true
+ update_freq: [1]
+
+optimizer:
+ _name: adam
+ adam_betas: (0.9,0.98)
+ adam_eps: 1e-08
+
+lr_scheduler:
+ _name: tri_stage
+ warmup_steps: 8000
+ hold_steps: 0
+ decay_steps: 72000
+ final_lr_scale: 0.05
+
+model:
+ _name: hubert_ctc
+ w2v_path: ???
+ apply_mask: true
+ mask_selection: static
+ mask_length: 10
+ mask_other: 0
+ mask_prob: 0.75
+ mask_channel_selection: static
+ mask_channel_length: 64
+ mask_channel_other: 0
+ mask_channel_prob: 0.5
+ layerdrop: 0.1
+ dropout: 0.0
+ activation_dropout: 0.1
+ attention_dropout: 0.0
+ feature_grad_mult: 0.0
+ freeze_finetune_updates: 10000
+
+hydra:
+ job:
+ config:
+ override_dirname:
+ kv_sep: '-'
+ item_sep: '__'
+ exclude_keys:
+ - run
+ - task.data
+ - task.label_dir
+ - model.w2v_path
+ - dataset.train_subset
+ - dataset.valid_subset
+ - criterion.wer_kenlm_model
+ - criterion.wer_lexicon
+ run:
+ dir: ???
+ sweep:
+ dir: ???
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
diff --git a/fairseq/examples/hubert/config/finetune/ckpt/it1.yaml b/fairseq/examples/hubert/config/finetune/ckpt/it1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2af96b3f72746f85feb13e7efcbdab6602b293de
--- /dev/null
+++ b/fairseq/examples/hubert/config/finetune/ckpt/it1.yaml
@@ -0,0 +1,7 @@
+# @package _global_
+
+task:
+ normalize: false
+
+model:
+ w2v_path: /checkpoint/wnhsu/w2v/hubert_final/iter1/hubert.km.randcrop.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU400k.s1337.ngpu32/checkpoint_last.pt
diff --git a/fairseq/examples/hubert/config/finetune/lm/ls_4gram.yaml b/fairseq/examples/hubert/config/finetune/lm/ls_4gram.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8c7728ad29965d3cf18605808a893bc442afd56b
--- /dev/null
+++ b/fairseq/examples/hubert/config/finetune/lm/ls_4gram.yaml
@@ -0,0 +1,7 @@
+# @package _global_
+
+criterion:
+ wer_kenlm_model: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/4-gram.bin
+ wer_lexicon: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/10h/raw/lexicon_ltr.lst
+ wer_lm_weight: 2.0
+ wer_word_score: -1.0
diff --git a/fairseq/examples/hubert/config/finetune/run/submitit_reg.yaml b/fairseq/examples/hubert/config/finetune/run/submitit_reg.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..27509503e7b306c07742fbed2fc5726d001bb7df
--- /dev/null
+++ b/fairseq/examples/hubert/config/finetune/run/submitit_reg.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+
+hydra:
+ launcher:
+ cpus_per_task: 8
+ gpus_per_node: 8
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
+ nodes: 1
+ comment: null
+ mem_gb: 384
+ timeout_min: 4320
+ max_num_timeout: 100
+ constraint: volta32gb
+ name: ${hydra.job.config_name}/${hydra.job.override_dirname}
+ submitit_folder: ${hydra.sweep.dir}/submitit/%j
+
+distributed_training:
+ distributed_world_size: 8
+ distributed_port: 29671
+ nprocs_per_node: 8
diff --git a/fairseq/examples/hubert/config/pretrain/data/iter1.yaml b/fairseq/examples/hubert/config/pretrain/data/iter1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0a1b65d802c83128c53f32b21807fa5e51da6cc9
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/data/iter1.yaml
@@ -0,0 +1,8 @@
+# @package _global_
+
+task:
+ label_dir: ???
+ labels: ["km"]
+
+model:
+ label_rate: 100
diff --git a/fairseq/examples/hubert/config/pretrain/data/iter2.yaml b/fairseq/examples/hubert/config/pretrain/data/iter2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2d4bfe61cc638af9de48e92c58994e435fba2abf
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/data/iter2.yaml
@@ -0,0 +1,8 @@
+# @package _global_
+
+task:
+ label_dir: ???
+ labels: ["km"]
+
+model:
+ label_rate: 50
diff --git a/fairseq/examples/hubert/config/pretrain/hubert_base_librispeech.yaml b/fairseq/examples/hubert/config/pretrain/hubert_base_librispeech.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bd84461a163866f622b01bf6d36b4de6215f3d97
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/hubert_base_librispeech.yaml
@@ -0,0 +1,97 @@
+# @package _group_
+
+common:
+ fp16: true
+ log_format: json
+ log_interval: 200
+ seed: 1337
+ tensorboard_logdir: tblog
+
+checkpoint:
+ save_interval_updates: 25000
+ keep_interval_updates: 1
+ no_epoch_checkpoints: true
+
+
+distributed_training:
+ ddp_backend: no_c10d
+ distributed_backend: 'nccl'
+ distributed_world_size: 32
+ distributed_port: 29671
+ nprocs_per_node: 8
+ find_unused_parameters: true
+
+task:
+ _name: hubert_pretraining
+ data: ???
+ label_dir: ???
+ labels: ???
+ label_rate: ${model.label_rate}
+ sample_rate: 16000
+ max_sample_size: 250000
+ min_sample_size: 32000
+ pad_audio: false
+ random_crop: true
+ normalize: false # must be consistent with extractor
+
+dataset:
+ num_workers: 6
+ max_tokens: 1400000
+ skip_invalid_size_inputs_valid_test: true
+ validate_interval: 5
+ validate_interval_updates: 10000
+
+criterion:
+ _name: hubert
+ pred_masked_weight: 1.0
+ pred_nomask_weight: 0.0
+ loss_weights: [10,]
+
+optimization:
+ max_update: 400000
+ lr: [0.0005]
+ clip_norm: 10.0
+
+optimizer:
+ _name: adam
+ adam_betas: (0.9,0.98)
+ adam_eps: 1e-06
+ weight_decay: 0.01
+
+lr_scheduler:
+ _name: polynomial_decay
+ warmup_updates: 32000
+
+model:
+ _name: hubert
+ label_rate: ???
+ skip_masked: false
+ skip_nomask: false
+ mask_prob: 0.80
+ extractor_mode: default
+ conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+ final_dim: 256
+ encoder_layerdrop: 0.05
+ dropout_input: 0.1
+ dropout_features: 0.1
+ dropout: 0.1
+ attention_dropout: 0.1
+ feature_grad_mult: 0.1
+ untie_final_proj: true
+ activation_dropout: 0.0
+
+hydra:
+ job:
+ config:
+ override_dirname:
+ kv_sep: '-'
+ item_sep: '__'
+ exclude_keys:
+ - run
+ - task.data
+ - task.label_dir
+ run:
+ dir: ???
+ sweep:
+ dir: ???
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
diff --git a/fairseq/examples/hubert/config/pretrain/hubert_large_librivox.yaml b/fairseq/examples/hubert/config/pretrain/hubert_large_librivox.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a5192b5f29b53aa8391a0ab67b6238c0d0b4985e
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/hubert_large_librivox.yaml
@@ -0,0 +1,101 @@
+# @package _group_
+
+common:
+ fp16: true
+ log_format: json
+ log_interval: 200
+ seed: 1337
+ tensorboard_logdir: tblog
+
+checkpoint:
+ save_interval_updates: 25000
+ keep_interval_updates: 1
+ no_epoch_checkpoints: true
+
+
+distributed_training:
+ ddp_backend: no_c10d
+ distributed_backend: 'nccl'
+ distributed_world_size: 128
+ distributed_port: 29671
+ nprocs_per_node: 8
+ find_unused_parameters: true
+
+task:
+ _name: hubert_pretraining
+ data: ???
+ label_dir: ???
+ labels: ???
+ label_rate: ${model.label_rate}
+ sample_rate: 16000
+ max_sample_size: 250000
+ min_sample_size: 32000
+ pad_audio: false
+ random_crop: true
+ normalize: true # must be consistent with extractor
+
+dataset:
+ num_workers: 6
+ max_tokens: 900000
+ skip_invalid_size_inputs_valid_test: true
+ validate_interval: 5
+ validate_interval_updates: 10000
+
+criterion:
+ _name: hubert
+ pred_masked_weight: 1.0
+ pred_nomask_weight: 0.0
+ loss_weights: [10,]
+
+optimization:
+ max_update: 400000
+ lr: [0.0015]
+ clip_norm: 1.0
+
+optimizer:
+ _name: adam
+ adam_betas: (0.9,0.98)
+ adam_eps: 1e-06
+ weight_decay: 0.01
+
+lr_scheduler:
+ _name: polynomial_decay
+ warmup_updates: 32000
+
+model:
+ _name: hubert
+ label_rate: ???
+ encoder_layers: 24
+ encoder_embed_dim: 1024
+ encoder_ffn_embed_dim: 4096
+ encoder_attention_heads: 16
+ final_dim: 768
+ skip_masked: false
+ skip_nomask: false
+ mask_prob: 0.80
+ extractor_mode: layer_norm
+ conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+ encoder_layerdrop: 0.0
+ dropout_input: 0.0
+ dropout_features: 0.0
+ dropout: 0.0
+ attention_dropout: 0.0
+ layer_norm_first: true
+ feature_grad_mult: 1.0
+ untie_final_proj: true
+ activation_dropout: 0.0
+
+hydra:
+ job:
+ config:
+ override_dirname:
+ kv_sep: '-'
+ item_sep: '__'
+ exclude_keys:
+ - run
+ - task.data
+ run:
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
+ sweep:
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
diff --git a/fairseq/examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml b/fairseq/examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..34e8f2bfb93863db122f694785b80857713ceb05
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml
@@ -0,0 +1,101 @@
+# @package _group_
+
+common:
+ fp16: true
+ log_format: json
+ log_interval: 200
+ seed: 1337
+ tensorboard_logdir: tblog
+
+checkpoint:
+ save_interval_updates: 25000
+ keep_interval_updates: 1
+ no_epoch_checkpoints: true
+
+
+distributed_training:
+ ddp_backend: no_c10d
+ distributed_backend: 'nccl'
+ distributed_world_size: 256
+ distributed_port: 29671
+ nprocs_per_node: 8
+ find_unused_parameters: true
+
+task:
+ _name: hubert_pretraining
+ data: ???
+ label_dir: ???
+ labels: ???
+ label_rate: ${model.label_rate}
+ sample_rate: 16000
+ max_sample_size: 250000
+ min_sample_size: 32000
+ pad_audio: false
+ random_crop: true
+ normalize: true # must be consistent with extractor
+
+dataset:
+ num_workers: 6
+ max_tokens: 360000
+ skip_invalid_size_inputs_valid_test: true
+ validate_interval: 5
+ validate_interval_updates: 10000
+
+criterion:
+ _name: hubert
+ pred_masked_weight: 1.0
+ pred_nomask_weight: 0.0
+ loss_weights: [10,]
+
+optimization:
+ max_update: 400000
+ lr: [0.003]
+ clip_norm: 1.0
+
+optimizer:
+ _name: adam
+ adam_betas: (0.9,0.98)
+ adam_eps: 1e-06
+ weight_decay: 0.01
+
+lr_scheduler:
+ _name: polynomial_decay
+ warmup_updates: 32000
+
+model:
+ _name: hubert
+ label_rate: ???
+ encoder_layers: 48
+ encoder_embed_dim: 1280
+ encoder_ffn_embed_dim: 5120
+ encoder_attention_heads: 16
+ final_dim: 1024
+ skip_masked: false
+ skip_nomask: false
+ mask_prob: 0.80
+ extractor_mode: layer_norm
+ conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+ encoder_layerdrop: 0.0
+ dropout_input: 0.0
+ dropout_features: 0.0
+ dropout: 0.0
+ attention_dropout: 0.0
+ layer_norm_first: true
+ feature_grad_mult: 1.0
+ untie_final_proj: true
+ activation_dropout: 0.0
+
+hydra:
+ job:
+ config:
+ override_dirname:
+ kv_sep: '-'
+ item_sep: '__'
+ exclude_keys:
+ - run
+ - task.data
+ run:
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
+ sweep:
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
diff --git a/fairseq/examples/hubert/config/pretrain/run/submitit_reg.yaml b/fairseq/examples/hubert/config/pretrain/run/submitit_reg.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..46c979cd2835fe026b0a532a54533904d1001e54
--- /dev/null
+++ b/fairseq/examples/hubert/config/pretrain/run/submitit_reg.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+
+hydra:
+ launcher:
+ cpus_per_task: 8
+ gpus_per_node: 8
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
+ nodes: 4
+ comment: null
+ mem_gb: 384
+ timeout_min: 4320
+ max_num_timeout: 100
+ constraint: volta32gb
+ name: ${hydra.job.config_name}/${hydra.job.override_dirname}
+ submitit_folder: ${hydra.sweep.dir}/submitit/%j
+
+distributed_training:
+ distributed_world_size: 32
+ distributed_port: 29671
+ nprocs_per_node: 8
diff --git a/fairseq/examples/hubert/measure_teacher_quality.py b/fairseq/examples/hubert/measure_teacher_quality.py
new file mode 100644
index 0000000000000000000000000000000000000000..92279b2214bb2ba4a99aea92098907ef4f55821b
--- /dev/null
+++ b/fairseq/examples/hubert/measure_teacher_quality.py
@@ -0,0 +1,241 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import os.path as op
+import re
+from tabulate import tabulate
+from collections import Counter
+
+
+def comp_purity(p_xy, axis):
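+ # For each bin along the non-reduced axis, purity is the fraction of its
+ # probability mass captured by its single best-matching label; aggr_pur
+ # sums the per-bin maxima, i.e. the mass-weighted average purity.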
+ max_p = p_xy.max(axis=axis)
+ marg_p = p_xy.sum(axis=axis)
+ indv_pur = max_p / marg_p
+ aggr_pur = max_p.sum()
+ return indv_pur, aggr_pur
+
+
+def comp_entropy(p):
+ return (-p * np.log(p + 1e-8)).sum()
+
+
+def comp_norm_mutual_info(p_xy):
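+ # Mutual information of the joint (reference, hypothesis) distribution,
+ # together with MI normalized by each marginal entropy and the two
+ # marginal entropies themselves.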
+ p_x = p_xy.sum(axis=1, keepdims=True)
+ p_y = p_xy.sum(axis=0, keepdims=True)
+ pmi = np.log(p_xy / np.matmul(p_x, p_y) + 1e-8)
+ mi = (p_xy * pmi).sum()
+ h_x = comp_entropy(p_x)
+ h_y = comp_entropy(p_y)
+ return mi, mi / h_x, mi / h_y, h_x, h_y
+
+
+def pad(labs, n):
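+ # Replicate the first/last label n times on each side, to account for
+ # labels derived from spliced (context-padded) features.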
+ if n == 0:
+ return np.array(labs)
+ return np.concatenate([[labs[0]] * n, labs, [labs[-1]] * n])
+
+
+def comp_avg_seg_dur(labs_list):
+ n_frms = 0
+ n_segs = 0
+ for labs in labs_list:
+ labs = np.array(labs)
+ edges = np.zeros(len(labs)).astype(bool)
+ edges[0] = True
+ edges[1:] = labs[1:] != labs[:-1]
+ n_frms += len(edges)
+ n_segs += edges.astype(int).sum()
+ return n_frms / n_segs
+
+
+def comp_joint_prob(uid2refs, uid2hyps):
+ """
+ Args:
+ pad: padding for spliced-feature derived labels
+ """
+ cnts = Counter()
+ skipped = []
+ abs_frmdiff = 0
+ for uid in uid2refs:
+ if uid not in uid2hyps:
+ skipped.append(uid)
+ continue
+ refs = uid2refs[uid]
+ hyps = uid2hyps[uid]
+ abs_frmdiff += abs(len(refs) - len(hyps))
+ min_len = min(len(refs), len(hyps))
+ refs = refs[:min_len]
+ hyps = hyps[:min_len]
+ cnts.update(zip(refs, hyps))
+ tot = sum(cnts.values())
+
+ ref_set = sorted({ref for ref, _ in cnts.keys()})
+ hyp_set = sorted({hyp for _, hyp in cnts.keys()})
+ ref2pid = dict(zip(ref_set, range(len(ref_set))))
+ hyp2lid = dict(zip(hyp_set, range(len(hyp_set))))
+ # print(hyp_set)
+ p_xy = np.zeros((len(ref2pid), len(hyp2lid)), dtype=float)
+ for (ref, hyp), cnt in cnts.items():
+ p_xy[ref2pid[ref], hyp2lid[hyp]] = cnt
+ p_xy /= p_xy.sum()
+ return p_xy, ref2pid, hyp2lid, tot, abs_frmdiff, skipped
+
+
+def read_phn(tsv_path, rm_stress=True):
+ uid2phns = {}
+ with open(tsv_path) as f:
+ for line in f:
+ uid, phns = line.rstrip().split("\t")
+ phns = phns.split(",")
+ if rm_stress:
+ phns = [re.sub("[0-9]", "", phn) for phn in phns]
+ uid2phns[uid] = phns
+ return uid2phns
+
+
+def read_lab(tsv_path, lab_path, pad_len=0, upsample=1):
+ """
+ tsv is needed to retrieve the uids for the labels
+ """
+ with open(tsv_path) as f:
+ f.readline()
+ uids = [op.splitext(op.basename(line.rstrip().split()[0]))[0] for line in f]
+ with open(lab_path) as f:
+ labs_list = [pad(line.rstrip().split(), pad_len).repeat(upsample) for line in f]
+ assert len(uids) == len(labs_list)
+ return dict(zip(uids, labs_list))
+
+
+def main_lab_lab(
+ tsv_dir,
+ lab_dir,
+ lab_name,
+ lab_sets,
+ ref_dir,
+ ref_name,
+ pad_len=0,
+ upsample=1,
+ verbose=False,
+):
+ # assume tsv_dir is the same for both the reference and the hypotheses
+ tsv_dir = lab_dir if tsv_dir is None else tsv_dir
+
+ uid2refs = {}
+ for s in lab_sets:
+ uid2refs.update(read_lab(f"{tsv_dir}/{s}.tsv", f"{ref_dir}/{s}.{ref_name}"))
+
+ uid2hyps = {}
+ for s in lab_sets:
+ uid2hyps.update(
+ read_lab(
+ f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample
+ )
+ )
+ _main(uid2refs, uid2hyps, verbose)
+
+
+def main_phn_lab(
+ tsv_dir,
+ lab_dir,
+ lab_name,
+ lab_sets,
+ phn_dir,
+ phn_sets,
+ pad_len=0,
+ upsample=1,
+ verbose=False,
+):
+ uid2refs = {}
+ for s in phn_sets:
+ uid2refs.update(read_phn(f"{phn_dir}/{s}.tsv"))
+
+ uid2hyps = {}
+ tsv_dir = lab_dir if tsv_dir is None else tsv_dir
+ for s in lab_sets:
+ uid2hyps.update(
+ read_lab(
+ f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample
+ )
+ )
+ _main(uid2refs, uid2hyps, verbose)
+
+
+def _main(uid2refs, uid2hyps, verbose):
+ (p_xy, ref2pid, hyp2lid, tot, frmdiff, skipped) = comp_joint_prob(
+ uid2refs, uid2hyps
+ )
+ ref_pur_by_hyp, ref_pur = comp_purity(p_xy, axis=0)
+ hyp_pur_by_ref, hyp_pur = comp_purity(p_xy, axis=1)
+ (mi, mi_norm_by_ref, mi_norm_by_hyp, h_ref, h_hyp) = comp_norm_mutual_info(p_xy)
+ outputs = {
+ "ref pur": ref_pur,
+ "hyp pur": hyp_pur,
+ "H(ref)": h_ref,
+ "H(hyp)": h_hyp,
+ "MI": mi,
+ "MI/H(ref)": mi_norm_by_ref,
+ "ref segL": comp_avg_seg_dur(uid2refs.values()),
+ "hyp segL": comp_avg_seg_dur(uid2hyps.values()),
+ "p_xy shape": p_xy.shape,
+ "frm tot": tot,
+ "frm diff": frmdiff,
+ "utt tot": len(uid2refs),
+ "utt miss": len(skipped),
+ }
+ print(tabulate([outputs.values()], outputs.keys(), floatfmt=".4f"))
+
+
+if __name__ == "__main__":
+ """
+    Compute the quality of the hypothesized labels against frame-level phone
+    labels, or against another label set if --ref_lab_dir/--ref_lab_name are given.
+ """
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("tsv_dir")
+ parser.add_argument("lab_dir")
+ parser.add_argument("lab_name")
+ parser.add_argument("--lab_sets", default=["valid"], type=str, nargs="+")
+ parser.add_argument(
+ "--phn_dir",
+ default="/checkpoint/wnhsu/data/librispeech/960h/fa/raw_phn/phone_frame_align_v1",
+ )
+ parser.add_argument(
+ "--phn_sets", default=["dev-clean", "dev-other"], type=str, nargs="+"
+ )
+ parser.add_argument("--pad_len", default=0, type=int, help="padding for hypotheses")
+ parser.add_argument(
+ "--upsample", default=1, type=int, help="upsample factor for hypotheses"
+ )
+ parser.add_argument("--ref_lab_dir", default="")
+ parser.add_argument("--ref_lab_name", default="")
+ parser.add_argument("--verbose", action="store_true")
+ args = parser.parse_args()
+
+ if args.ref_lab_dir and args.ref_lab_name:
+ main_lab_lab(
+ args.tsv_dir,
+ args.lab_dir,
+ args.lab_name,
+ args.lab_sets,
+ args.ref_lab_dir,
+ args.ref_lab_name,
+ args.pad_len,
+ args.upsample,
+ args.verbose,
+ )
+ else:
+ main_phn_lab(
+ args.tsv_dir,
+ args.lab_dir,
+ args.lab_name,
+ args.lab_sets,
+ args.phn_dir,
+ args.phn_sets,
+ args.pad_len,
+ args.upsample,
+ args.verbose,
+ )
diff --git a/fairseq/examples/hubert/simple_kmeans/README.md b/fairseq/examples/hubert/simple_kmeans/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..847475c23f8a6a47bb25cba83466ddd9eba167b8
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/README.md
@@ -0,0 +1,80 @@
+# Sharded Feature Extraction and K-means Application
+
+This folder contains scripts for preparing HUBERT labels from tsv files. The
+steps are:
+1. feature extraction
+2. k-means clustering
+3. k-means application
+
+
+## Data preparation
+
+`*.tsv` files contain a list of audio files: the first line is the root
+directory, and each following line gives the subpath of an audio file and its
+number of samples, tab-separated:
+```
+<root-dir>
+<audio-subpath-1>  <num-samples-1>
+<audio-subpath-2>  <num-samples-2>
+...
+```
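+
+A minimal sketch (not part of these scripts) for writing such a manifest with
+`soundfile`, assuming a hypothetical `audio_root` directory of 16 kHz wav files:
+```python
+import os
+
+import soundfile as sf
+
+audio_root = "/path/to/audio"  # placeholder root directory
+with open("train.tsv", "w") as f:
+    f.write(audio_root + "\n")
+    for dirpath, _, filenames in os.walk(audio_root):
+        for name in sorted(filenames):
+            if not name.endswith(".wav"):
+                continue
+            path = os.path.join(dirpath, name)
+            # number of samples per file, read by the feature extraction scripts
+            nsample = sf.info(path).frames
+            f.write(f"{os.path.relpath(path, audio_root)}\t{nsample}\n")
+```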
+
+
+## Feature extraction
+
+### MFCC feature
+Suppose the tsv file is at `${tsv_dir}/${split}.tsv`. To extract 39-D
+mfcc+delta+ddelta features for the 1st iteration HUBERT training, run:
+```sh
+python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir}
+```
+This shards the tsv file into `${nshard}` shards and extracts features for the
+`${rank}`-th shard, where `rank` is an integer in `[0, nshard-1]`. Features are
+saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
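+
+Each `.npy` shard stores the frame-level features of all utterances in the
+shard concatenated along the time axis, and the matching `.len` file stores one
+frame count per utterance. A small sketch (mirroring `get_feat_iterator` in
+`dump_km_label.py`; the shard paths are placeholders) for reading one utterance
+back:
+```python
+import numpy as np
+
+feat_path = "feat_dir/train_0_1.npy"  # placeholder shard paths
+leng_path = "feat_dir/train_0_1.len"
+
+with open(leng_path) as f:
+    lengs = [int(line.rstrip()) for line in f]
+offsets = [0] + np.cumsum(lengs[:-1]).tolist()
+
+feat = np.load(feat_path, mmap_mode="r")
+first_utt = feat[offsets[0]: offsets[0] + lengs[0]]  # (n_frames, feat_dim)
+```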
+
+
+### HUBERT feature
+To extract features from the `${layer}`-th transformer layer of a trained
+HUBERT model saved at `${ckpt_path}`, run:
+```sh
+python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir}
+```
+Features are also saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
+
+- if you run out of memory, decrease the chunk size with `--max_chunk`
+
+
+## K-means clustering
+To fit a k-means model with `${n_clusters}` clusters on 10% of the `${split}` data, run
+```sh
+python learn_kmeans.py ${feat_dir} ${split} ${nshard} ${km_path} ${n_clusters} --percent 0.1
+```
+This saves the k-means model to `${km_path}`.
+
+- set `--percent -1` to use all data
+- more k-means options can be found with the `-h` flag
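+
+The saved model is a pickled scikit-learn `MiniBatchKMeans`; a quick sketch for
+loading and inspecting it with `joblib` (as `dump_km_label.py` does before
+assigning labels), with a placeholder path:
+```python
+import joblib
+
+km_model = joblib.load("/path/to/km_model")  # placeholder path
+print(km_model.n_clusters, km_model.cluster_centers_.shape)  # centers: (n_clusters, feat_dim)
+```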
+
+
+## K-means application
+To apply a trained k-means model `${km_path}` to obtain labels for `${split}`, run
+```sh
+python dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}
+```
+This extracts labels for the `${rank}`-th shard out of `${nshard}` shards
+and dumps them to `${lab_dir}/${split}_${rank}_${nshard}.km`.
+
+
+Finally, merge shards for `${split}` by running
+```sh
+for rank in $(seq 0 $((nshard - 1))); do
+ cat $lab_dir/${split}_${rank}_${nshard}.km
+done > $lab_dir/${split}.km
+```
+
+
+## Create a dummy dict
+To create a dummy dictionary, run
+```sh
+for x in $(seq 0 $((n_clusters - 1))); do
+ echo "$x 1"
+done >> $lab_dir/dict.km.txt
+```
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature.py b/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ea4ea0aa93046a133722511311a2735796cefeb
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature.py
@@ -0,0 +1,93 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import fairseq
+import soundfile as sf
+import torch
+import torch.nn.functional as F
+
+from feature_utils import get_path_iterator, dump_feature
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_hubert_feature")
+
+
+class HubertFeatureReader(object):
+ def __init__(self, ckpt_path, layer, max_chunk=1600000):
+ (
+ model,
+ cfg,
+ task,
+ ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+ self.model = model[0].eval().cuda()
+ self.task = task
+ self.layer = layer
+ self.max_chunk = max_chunk
+ logger.info(f"TASK CONFIG:\n{self.task.cfg}")
+ logger.info(f" max_chunk = {self.max_chunk}")
+
+ def read_audio(self, path, ref_len=None):
+ wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
+ if wav.ndim == 2:
+ wav = wav.mean(-1)
+ assert wav.ndim == 1, wav.ndim
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
+ logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
+ return wav
+
+ def get_feats(self, path, ref_len=None):
+ x = self.read_audio(path, ref_len=ref_len)
+ with torch.no_grad():
+ x = torch.from_numpy(x).float().cuda()
+ if self.task.cfg.normalize:
+ x = F.layer_norm(x, x.shape)
+ x = x.view(1, -1)
+
+ feat = []
+ for start in range(0, x.size(1), self.max_chunk):
+ x_chunk = x[:, start : start + self.max_chunk]
+ feat_chunk, _ = self.model.extract_features(
+ source=x_chunk,
+ padding_mask=None,
+ mask=False,
+ output_layer=self.layer,
+ )
+ feat.append(feat_chunk)
+ return torch.cat(feat, 1).squeeze(0)
+
+
+def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk):
+ reader = HubertFeatureReader(ckpt_path, layer, max_chunk)
+ generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
+ dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("tsv_dir")
+ parser.add_argument("split")
+ parser.add_argument("ckpt_path")
+ parser.add_argument("layer", type=int)
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("feat_dir")
+ parser.add_argument("--max_chunk", type=int, default=1600000)
+ args = parser.parse_args()
+ logger.info(args)
+
+ main(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py b/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py
new file mode 100644
index 0000000000000000000000000000000000000000..941bc1b675459b800b7e006f2ff9c2305c0dd8e8
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py
@@ -0,0 +1,95 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import csv
+import io
+import logging
+import os
+import os.path as op
+import sys
+
+from dump_hubert_feature import HubertFeatureReader
+from feature_utils import get_shard_range, dump_feature
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_hubert_feature_s2t")
+
+
+class HubertFeatureReaderS2T(HubertFeatureReader):
+ def read_audio(self, path, ref_len=None):
+ wav = get_features_or_waveform(
+ path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate
+ )
+ if wav.ndim == 2:
+ wav = wav.mean(-1)
+ assert wav.ndim == 1, wav.ndim
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
+ logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
+ return wav
+
+
+def get_path_iterator(root, tsv, nshard, rank, audio_col_name):
+ with open(tsv) as f:
+ reader = csv.DictReader(
+ f,
+ delimiter="\t",
+ quotechar=None,
+ doublequote=False,
+ lineterminator="\n",
+ quoting=csv.QUOTE_NONE,
+ )
+ subpaths = [op.join(root, e[audio_col_name]) for e in reader]
+ start, end = get_shard_range(len(subpaths), nshard, rank)
+ subpaths = subpaths[start:end]
+
+    def iterate():
+        # subpaths were already joined with root above
+        for subpath in subpaths:
+            yield subpath, None
+
+ return iterate, len(subpaths)
+
+
+def main(
+ root,
+ tsv_path,
+ ckpt_path,
+ layer,
+ nshard,
+ rank,
+ feat_dir,
+ split,
+ max_chunk,
+ audio_col_name,
+):
+ reader = HubertFeatureReaderS2T(ckpt_path, layer, max_chunk)
+ generator, num = get_path_iterator(root, tsv_path, nshard, rank, audio_col_name)
+ dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("root")
+ parser.add_argument("tsv_path")
+ parser.add_argument("ckpt_path")
+ parser.add_argument("layer", type=int)
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("feat_dir")
+ parser.add_argument("split")
+ parser.add_argument("--audio_col_name", type=str, default="audio")
+ parser.add_argument("--max_chunk", type=int, default=1600000)
+ args = parser.parse_args()
+ logger.info(args)
+
+ main(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_km_label.py b/fairseq/examples/hubert/simple_kmeans/dump_km_label.py
new file mode 100644
index 0000000000000000000000000000000000000000..8871307804d3f1e5c7cc49061614c69df26ab1ee
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_km_label.py
@@ -0,0 +1,98 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import numpy as np
+
+import joblib
+import torch
+import tqdm
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_km_label")
+
+
+class ApplyKmeans(object):
+ def __init__(self, km_path):
+ self.km_model = joblib.load(km_path)
+ self.C_np = self.km_model.cluster_centers_.transpose()
+ self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)
+
+ self.C = torch.from_numpy(self.C_np)
+ self.Cnorm = torch.from_numpy(self.Cnorm_np)
+ if torch.cuda.is_available():
+ self.C = self.C.cuda()
+ self.Cnorm = self.Cnorm.cuda()
+
+ def __call__(self, x):
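+        # nearest-centroid assignment via squared distance ||x||^2 - 2 x @ C + ||C||^2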
+ if isinstance(x, torch.Tensor):
+ dist = (
+ x.pow(2).sum(1, keepdim=True)
+ - 2 * torch.matmul(x, self.C)
+ + self.Cnorm
+ )
+ return dist.argmin(dim=1).cpu().numpy()
+ else:
+ dist = (
+ (x ** 2).sum(1, keepdims=True)
+ - 2 * np.matmul(x, self.C_np)
+ + self.Cnorm_np
+ )
+ return np.argmin(dist, axis=1)
+
+
+def get_feat_iterator(feat_dir, split, nshard, rank):
+ feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+ leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+ with open(leng_path, "r") as f:
+ lengs = [int(line.rstrip()) for line in f]
+ offsets = [0] + np.cumsum(lengs[:-1]).tolist()
+
+ def iterate():
+ feat = np.load(feat_path, mmap_mode="r")
+ assert feat.shape[0] == (offsets[-1] + lengs[-1])
+ for offset, leng in zip(offsets, lengs):
+ yield feat[offset: offset + leng]
+
+ return iterate, len(lengs)
+
+
+def dump_label(feat_dir, split, km_path, nshard, rank, lab_dir):
+ apply_kmeans = ApplyKmeans(km_path)
+ generator, num = get_feat_iterator(feat_dir, split, nshard, rank)
+ iterator = generator()
+
+ lab_path = f"{lab_dir}/{split}_{rank}_{nshard}.km"
+ os.makedirs(lab_dir, exist_ok=True)
+ with open(lab_path, "w") as f:
+ for feat in tqdm.tqdm(iterator, total=num):
+ # feat = torch.from_numpy(feat).cuda()
+ lab = apply_kmeans(feat).tolist()
+ f.write(" ".join(map(str, lab)) + "\n")
+ logger.info("finished successfully")
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("feat_dir")
+ parser.add_argument("split")
+ parser.add_argument("km_path")
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("lab_dir")
+ args = parser.parse_args()
+ logging.info(str(args))
+
+ dump_label(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_mfcc_feature.py b/fairseq/examples/hubert/simple_kmeans/dump_mfcc_feature.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3537784d1d390701e96951d6e39f63f2023e32a
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_mfcc_feature.py
@@ -0,0 +1,74 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import soundfile as sf
+import torch
+import torchaudio
+
+from feature_utils import get_path_iterator, dump_feature
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_mfcc_feature")
+
+
+class MfccFeatureReader(object):
+ def __init__(self, sample_rate):
+ self.sample_rate = sample_rate
+
+ def read_audio(self, path, ref_len=None):
+ wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.sample_rate)
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
+ logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
+ return wav
+
+ def get_feats(self, path, ref_len=None):
+ x = self.read_audio(path, ref_len=ref_len)
+ with torch.no_grad():
+ x = torch.from_numpy(x).float()
+ x = x.view(1, -1)
+
+ mfccs = torchaudio.compliance.kaldi.mfcc(
+ waveform=x,
+ sample_frequency=self.sample_rate,
+ use_energy=False,
+ ) # (time, freq)
+ mfccs = mfccs.transpose(0, 1) # (freq, time)
+ deltas = torchaudio.functional.compute_deltas(mfccs)
+ ddeltas = torchaudio.functional.compute_deltas(deltas)
+ concat = torch.cat([mfccs, deltas, ddeltas], dim=0)
+            concat = concat.transpose(0, 1).contiguous()  # (time, freq)
+ return concat
+
+
+def main(tsv_dir, split, nshard, rank, feat_dir, sample_rate):
+ reader = MfccFeatureReader(sample_rate)
+ generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
+ dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("tsv_dir")
+ parser.add_argument("split")
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("feat_dir")
+ parser.add_argument("--sample_rate", type=int, default=16000)
+ args = parser.parse_args()
+ logger.info(args)
+
+ main(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/dump_w2v2_feature.py b/fairseq/examples/hubert/simple_kmeans/dump_w2v2_feature.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1f0d902acf0756580a1f4604feee8fc499a9a63
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/dump_w2v2_feature.py
@@ -0,0 +1,95 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import fairseq
+import soundfile as sf
+import torch
+import torch.nn.functional as F
+
+from feature_utils import get_path_iterator, dump_feature
+
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("dump_w2v2_feature")
+
+
+class Wav2Vec2FeatureReader(object):
+ def __init__(self, ckpt_path, layer, max_chunk=1600000):
+ (
+ model,
+ cfg,
+ task,
+ ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+ self.model = model[0].eval().cuda()
+ self.task = task
+ self.layer = layer # assume this is 1-based like HuBERT
+ self.max_chunk = max_chunk
+ logger.info(f"TASK CONFIG:\n{self.task.cfg}")
+ logger.info(f" max_chunk = {self.max_chunk}")
+ logger.info(f" model:\n{self.model}")
+
+ def read_audio(self, path, ref_len=None):
+ wav, sr = sf.read(path)
+ assert sr == self.task.cfg.sample_rate, sr
+ if wav.ndim == 2:
+ wav = wav.mean(-1)
+ assert wav.ndim == 1, wav.ndim
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
+ logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
+ return wav
+
+ def get_feats(self, path, ref_len=None):
+ x = self.read_audio(path, ref_len)
+ with torch.no_grad():
+ x = torch.from_numpy(x).float().cuda()
+ if self.task.cfg.normalize:
+ x = F.layer_norm(x, x.shape)
+ x = x.view(1, -1)
+
+ feat = []
+ for start in range(0, x.size(1), self.max_chunk):
+ x_chunk = x[:, start: start + self.max_chunk]
+ res = self.model.extract_features(
+ source=x_chunk,
+ padding_mask=None,
+ mask=False,
+ layer=self.layer - 1,
+ )
+ feat_chunk = res["x"]
+ feat.append(feat_chunk)
+ return torch.cat(feat, 1).squeeze(0)
+
+
+def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk):
+ reader = Wav2Vec2FeatureReader(ckpt_path, layer, max_chunk)
+ generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
+ dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("tsv_dir")
+ parser.add_argument("split")
+ parser.add_argument("ckpt_path")
+ parser.add_argument("layer", type=int)
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("rank", type=int)
+ parser.add_argument("feat_dir")
+ parser.add_argument("--max_chunk", type=int, default=1600000)
+ args = parser.parse_args()
+ logger.info(args)
+
+ main(**vars(args))
diff --git a/fairseq/examples/hubert/simple_kmeans/feature_utils.py b/fairseq/examples/hubert/simple_kmeans/feature_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f80bc4569768fac181133cdc8f76d1230e03bff6
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/feature_utils.py
@@ -0,0 +1,66 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import tqdm
+from npy_append_array import NpyAppendArray
+
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("feature_utils")
+
+
+def get_shard_range(tot, nshard, rank):
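+    # split [0, tot) into nshard contiguous, nearly equal ranges and return the rank-th one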
+    assert rank < nshard and rank >= 0, f"invalid rank/nshard {rank}/{nshard}"
+ start = round(tot / nshard * rank)
+ end = round(tot / nshard * (rank + 1))
+ assert start < end, f"start={start}, end={end}"
+ logger.info(
+ f"rank {rank} of {nshard}, process {end-start} "
+ f"({start}-{end}) out of {tot}"
+ )
+ return start, end
+
+
+def get_path_iterator(tsv, nshard, rank):
+ with open(tsv, "r") as f:
+ root = f.readline().rstrip()
+ lines = [line.rstrip() for line in f]
+ start, end = get_shard_range(len(lines), nshard, rank)
+ lines = lines[start:end]
+ def iterate():
+ for line in lines:
+ subpath, nsample = line.split("\t")
+ yield f"{root}/{subpath}", int(nsample)
+ return iterate, len(lines)
+
+
+def dump_feature(reader, generator, num, split, nshard, rank, feat_dir):
+ iterator = generator()
+
+ feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+ leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+
+ os.makedirs(feat_dir, exist_ok=True)
+ if os.path.exists(feat_path):
+ os.remove(feat_path)
+
+ feat_f = NpyAppendArray(feat_path)
+ with open(leng_path, "w") as leng_f:
+ for path, nsample in tqdm.tqdm(iterator, total=num):
+ feat = reader.get_feats(path, nsample)
+ feat_f.append(feat.cpu().numpy())
+ leng_f.write(f"{len(feat)}\n")
+ logger.info("finished successfully")
+
+
diff --git a/fairseq/examples/hubert/simple_kmeans/learn_kmeans.py b/fairseq/examples/hubert/simple_kmeans/learn_kmeans.py
new file mode 100644
index 0000000000000000000000000000000000000000..113ac655b8c0a585fe43797e99674e445098edd0
--- /dev/null
+++ b/fairseq/examples/hubert/simple_kmeans/learn_kmeans.py
@@ -0,0 +1,146 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+
+import joblib
+
+logging.basicConfig(
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
+ stream=sys.stdout,
+)
+logger = logging.getLogger("learn_kmeans")
+
+
+def get_km_model(
+ n_clusters,
+ init,
+ max_iter,
+ batch_size,
+ tol,
+ max_no_improvement,
+ n_init,
+ reassignment_ratio,
+):
+ return MiniBatchKMeans(
+ n_clusters=n_clusters,
+ init=init,
+ max_iter=max_iter,
+ batch_size=batch_size,
+ verbose=1,
+ compute_labels=False,
+ tol=tol,
+ max_no_improvement=max_no_improvement,
+ init_size=None,
+ n_init=n_init,
+ reassignment_ratio=reassignment_ratio,
+ )
+
+
+def load_feature_shard(feat_dir, split, nshard, rank, percent):
+ feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+ leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+ with open(leng_path, "r") as f:
+ lengs = [int(line.rstrip()) for line in f]
+ offsets = [0] + np.cumsum(lengs[:-1]).tolist()
+
+ if percent < 0:
+ return np.load(feat_path, mmap_mode="r")
+ else:
+ nsample = int(np.ceil(len(lengs) * percent))
+ indices = np.random.choice(len(lengs), nsample, replace=False)
+ feat = np.load(feat_path, mmap_mode="r")
+ sampled_feat = np.concatenate(
+ [feat[offsets[i]: offsets[i] + lengs[i]] for i in indices], axis=0
+ )
+ logger.info(
+ (
+ f"sampled {nsample} utterances, {len(sampled_feat)} frames "
+ f"from shard {rank}/{nshard}"
+ )
+ )
+ return sampled_feat
+
+
+def load_feature(feat_dir, split, nshard, seed, percent):
+ assert percent <= 1.0
+ feat = np.concatenate(
+ [
+ load_feature_shard(feat_dir, split, nshard, r, percent)
+ for r in range(nshard)
+ ],
+ axis=0,
+ )
+ logging.info(f"loaded feature with dimension {feat.shape}")
+ return feat
+
+
+def learn_kmeans(
+ feat_dir,
+ split,
+ nshard,
+ km_path,
+ n_clusters,
+ seed,
+ percent,
+ init,
+ max_iter,
+ batch_size,
+ tol,
+ n_init,
+ reassignment_ratio,
+ max_no_improvement,
+):
+ np.random.seed(seed)
+ feat = load_feature(feat_dir, split, nshard, seed, percent)
+ km_model = get_km_model(
+ n_clusters,
+ init,
+ max_iter,
+ batch_size,
+ tol,
+ max_no_improvement,
+ n_init,
+ reassignment_ratio,
+ )
+ km_model.fit(feat)
+ joblib.dump(km_model, km_path)
+
+ inertia = -km_model.score(feat) / len(feat)
+ logger.info("total intertia: %.5f", inertia)
+ logger.info("finished successfully")
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("feat_dir", type=str)
+ parser.add_argument("split", type=str)
+ parser.add_argument("nshard", type=int)
+ parser.add_argument("km_path", type=str)
+ parser.add_argument("n_clusters", type=int)
+ parser.add_argument("--seed", default=0, type=int)
+ parser.add_argument(
+ "--percent", default=-1, type=float, help="sample a subset; -1 for all"
+ )
+ parser.add_argument("--init", default="k-means++")
+ parser.add_argument("--max_iter", default=100, type=int)
+ parser.add_argument("--batch_size", default=10000, type=int)
+ parser.add_argument("--tol", default=0.0, type=float)
+ parser.add_argument("--max_no_improvement", default=100, type=int)
+ parser.add_argument("--n_init", default=20, type=int)
+ parser.add_argument("--reassignment_ratio", default=0.0, type=float)
+ args = parser.parse_args()
+ logging.info(str(args))
+
+ learn_kmeans(**vars(args))
diff --git a/fairseq/examples/hubert/tests/sample.base.L9.km500.km b/fairseq/examples/hubert/tests/sample.base.L9.km500.km
new file mode 100644
index 0000000000000000000000000000000000000000..656eef96e588b601a7a8c0f2ab8644d4185045fb
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.base.L9.km500.km
@@ -0,0 +1 @@
+17 17 17 17 296 296 20 20 20 461 461 20 184 20 20 20 184 289 144 445 445 213 213 213 213 252 215 129 401 20 354 180 494 44 416 416 416 192 192 180 180 84 84 84 16 88 88 88 88 319 242 240 348 35 35 117 404 197 226 209 83 55 55 55 322 67 94 199 118 118 118 118 118 118 402 219 219 219 222 222 222 353 59 245 245 251 251 241 241 431 367 367 178 35 35 35 458 192 351 41 324 324 324 252 464 464 139 139 424 424 424 497 497 497 122 90 42 42 147 380 380 499 319 319 319 348 348 33 33 394 90 76 465 74 425 425 386 386 431 319 319 319 319 319 240 203 53 473 34 340 340 340 340 116 64 212 384 377 123 123 123 216 216 216 114 114 57 57 57 203 381 381 117 48 13 47 80 20 80 80 320 7 7 364 345 141 141 141 141 281 281 9 86 221 198 198 22 283 455 236 239 239 107 107 395 286 286 286 468 468 406 406 467 176 176 176 328 200 200 248 464 145 365 365 365 365 330 385 457 77 77 77 54 224 300 334 334 382 304 304 271 186 31 342 342 342 198 22 283 5 38 162 232 232 482 68 26 26 359 359 81 444 213 213 252 143 458 41 324 324 324 422 143 445 445 445 351 180 486 315 315 450 450 450 203 53 473 291 89 116 379 243 478 478 66 482 482 105 105 336 336 354 29 498 498 498 498 396 396 313 37 314 198 22 222 222 222 222 245 129 74 74 437 437 496 496 496 413 94 199 41 41 324 324 318 318 269 342 9 168 106 106 284 426 426 426 426 348 64 76 401 259 108 123 153 153 153 153 372 372 396 313 24 314 90 401 259 445 445 351 351 365 365 365 365 282 282 215 233 233 229 427 20 247 126 126 126 326 326 326 326 326 326 326 101 101 101 149 228 228 20 289 20 7 217 70 65 189 189 151 240 285 300 300 495 406 467 176 135 135 339 248 466 114 222 222 222 313 313 239 384 371 490 490 38 31 54 54 224 494 494 236 129 259 74 190 487 288 288 288 288 374 173 173 280 280 302 302 175 175 69 69 223 130 129 401 75 108 119 295 295 295 295 143 192 192 135 135 135 135 200 200 464 255 255 255 251 251 241 431 235 235 235 348 348 465 192 44 44 236 8 8 354 319 319 383 348 36 310 107 107 395 462 462 8 32 32 32 354 153 153 153 153 153 387 387 387 387 85 207 318 318 318 49 453 9 168 125 125 125 125 125 466 199 44 44 143 129 144 445 351 351 351 486 486 460 285 285 302 302 497 497 122 239 161 161 79 79 499 499 499 265 265 265 85 85 85 299 299 173 352 352 427 229 170 247 15 15 15 15 15 15 193 193 193 17
diff --git a/fairseq/examples/hubert/tests/sample.base.L9.len b/fairseq/examples/hubert/tests/sample.base.L9.len
new file mode 100644
index 0000000000000000000000000000000000000000..7d3028fa244a2121c51f39dcc92dc15be82823a6
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.base.L9.len
@@ -0,0 +1 @@
+596
diff --git a/fairseq/examples/hubert/tests/sample.large.L20.len b/fairseq/examples/hubert/tests/sample.large.L20.len
new file mode 100644
index 0000000000000000000000000000000000000000..7d3028fa244a2121c51f39dcc92dc15be82823a6
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.large.L20.len
@@ -0,0 +1 @@
+596
diff --git a/fairseq/examples/hubert/tests/sample.large.hypo.word b/fairseq/examples/hubert/tests/sample.large.hypo.word
new file mode 100644
index 0000000000000000000000000000000000000000..d77a4cfddcb93c2e08eb55e630c85fe840fd3cc2
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.large.hypo.word
@@ -0,0 +1 @@
+KEEP A GOING AN IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0)
diff --git a/fairseq/examples/hubert/tests/sample.xlarge.L30.len b/fairseq/examples/hubert/tests/sample.xlarge.L30.len
new file mode 100644
index 0000000000000000000000000000000000000000..7d3028fa244a2121c51f39dcc92dc15be82823a6
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.xlarge.L30.len
@@ -0,0 +1 @@
+596
diff --git a/fairseq/examples/hubert/tests/sample.xlarge.hypo.word b/fairseq/examples/hubert/tests/sample.xlarge.hypo.word
new file mode 100644
index 0000000000000000000000000000000000000000..53e402d4550c820220e0964654a600dabaca8b1c
--- /dev/null
+++ b/fairseq/examples/hubert/tests/sample.xlarge.hypo.word
@@ -0,0 +1 @@
+KEEP A GOIN AND IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0)
diff --git a/fairseq/examples/hubert/update_ckpt.py b/fairseq/examples/hubert/update_ckpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..53c9e74ea613e30aa5c22614e658f2b7272bac0c
--- /dev/null
+++ b/fairseq/examples/hubert/update_ckpt.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+src_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2.pt"
+ref_ckpt = "/checkpoint/wnhsu/w2v/hubert_icassp_oss_v3/iter2_km100-400k-grp-L6/oss.km500_p0_1_s334.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU100k.s1337.ngpu32/checkpoint_last.pt"
+new_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2_updated.pt"
+
+
+def update_state(state):
+ state["model"]["label_embs_concat"] = state["model"].pop("label_embs")
+ state["args"].task = "hubert_pretraining"
+ state["args"].labels = f"['{state['args'].labels}']"
+ return state
+
+
+src_state = torch.load(src_ckpt)
+src_state = update_state(src_state)
+torch.save(src_state, new_ckpt)
diff --git a/fairseq/examples/latent_depth/latent_depth_src/loss/__init__.py b/fairseq/examples/latent_depth/latent_depth_src/loss/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/examples/latent_depth/latent_depth_src/loss/latent_depth.py b/fairseq/examples/latent_depth/latent_depth_src/loss/latent_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3b9535ecac3ec403868681a8b50c1fbe1c90dfe
--- /dev/null
+++ b/fairseq/examples/latent_depth/latent_depth_src/loss/latent_depth.py
@@ -0,0 +1,99 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+from torch.nn.modules.loss import _Loss
+
+
+class LatentLayersKLLoss(_Loss):
+ def __init__(self, args):
+ super().__init__()
+ self.args = args
+
+ def forward(self, layer_samples, lang_idx, update_num, sample_size):
+ prior = self.args.prior
+ samples = layer_samples[lang_idx]
+ eps = 1e-7
+ if prior == "uniform":
+ # uniform prior
+ kl_loss = (samples * (torch.log(samples + eps) - math.log(0.5))).sum(-1)
+ elif prior == "agged_posterior":
+ # aggregated posterior
+ y_t = torch.stack([x.detach() for x in layer_samples], dim=0)
+ agged_q = torch.sum(y_t, dim=0)
+ row_norm = agged_q.sum(-1)
+ normed_agg_q = agged_q / row_norm
+ kl_loss = (
+ samples * (torch.log(samples + eps) - torch.log(normed_agg_q + eps))
+ ).sum(-1)
+ else:
+ raise NotImplementedError("The specified prior is not implemented.")
+
+ # normalized by number of layers
+ kl_loss /= layer_samples[0].size()[0]
+ kl_weight = min(
+ self.args.sparsity_weight,
+ (update_num - self.args.soft_update)
+ * self.args.sparsity_weight
+ / self.args.anneal_updates,
+ )
+ kl_loss *= kl_weight * sample_size
+ return kl_loss
+
+
+class LatentLayersSparsityLoss(_Loss):
+ def __init__(self, args):
+ super().__init__()
+ self.args = args
+
+ def is_valid(self, update_num):
+ if self.args.target_layers <= 0:
+ return False
+ return update_num > (self.args.soft_update + self.args.anneal_updates)
+
+ def forward(self, layer_samples_list, update_num, sample_size):
+ batch_loss = 0
+ share_loss = 0
+ global_sparsity_loss = 0
+ layer_samples = torch.stack(layer_samples_list, dim=0)
+ if (
+ self.args.target_layers > 0 or self.args.share_weight > 0
+ ) and update_num > (self.args.soft_update + self.args.anneal_updates):
+ # anneal sparsity weight
+ if update_num < (self.args.anneal_updates + self.args.soft_update):
+ weight_anneal = 0
+ elif update_num < (2 * self.args.anneal_updates + self.args.soft_update):
+ weight_anneal = (
+ (update_num - self.args.soft_update - self.args.anneal_updates)
+ * self.args.share_weight
+ / self.args.anneal_updates
+ )
+ else:
+ weight_anneal = 1
+ # compute ratio among languages
+ layer_utilization = torch.sum(layer_samples, dim=0)
+ layer_utilization /= layer_samples.size()[0]
+ if self.args.share_weight > 0:
+ # encouraging sharing across languages
+ share_loss = sum(
+ -1.0 * v * math.log(v) for v in layer_utilization if v > 0
+ )
+ batch_loss += (
+ weight_anneal * self.args.share_weight * sample_size * share_loss
+ )
+ if self.args.target_layers > 0:
+                # compute the expected number of selected layers
+                expected_layers = sum(layer_utilization)
+                # compute l2 loss w.r.t. the target number of layers
+                global_sparsity_loss = (expected_layers - self.args.target_layers) ** 2
+ batch_loss += (
+ weight_anneal
+ * self.args.share_weight
+ * sample_size
+ * global_sparsity_loss
+ )
+ return batch_loss
diff --git a/fairseq/examples/latent_depth/latent_depth_src/models/__init__.py b/fairseq/examples/latent_depth/latent_depth_src/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/examples/latent_depth/latent_depth_src/models/latent_transformer.py b/fairseq/examples/latent_depth/latent_depth_src/models/latent_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a825301a452bd935deafdaf78fa2427ca9a469e
--- /dev/null
+++ b/fairseq/examples/latent_depth/latent_depth_src/models/latent_transformer.py
@@ -0,0 +1,156 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+import torch.nn as nn
+from fairseq.models.fairseq_encoder import EncoderOut
+from fairseq.models.transformer import TransformerDecoder, TransformerEncoder
+from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer
+from torch import Tensor
+
+from ..modules.latent_layers import LayerSelect
+
+
+class LatentTransformerEncoder(TransformerEncoder):
+ """Latent depth (https://arxiv.org/abs/2009.13102) implemented in
+ TransformerEncoder.
+ """
+
+ def __init__(self, args, dictionary, embed_tokens, num_logits=1):
+ self.num_logits = num_logits
+ self.num_layers = args.encoder_layers
+ super().__init__(args, dictionary, embed_tokens)
+ self.layer_select = LayerSelect(
+ num_layers=self.num_layers,
+ num_logits=self.num_logits,
+ soft_select=getattr(args, "soft_select", False),
+ sampling_tau=getattr(args, "sampling_tau", 5.),
+ )
+ self.lang_idx = None
+ self.layers = nn.ModuleList(
+ [self._build_encoder_layer(args, idx) for idx in range(args.encoder_layers)]
+ )
+
+ def set_lang_idx(self, lang_idx):
+ self.lang_idx = lang_idx
+
+ def _build_encoder_layer(self, args, idx=None):
+ return LatentTransformerEncoderLayer(args, idx, layer_select=self.layer_select)
+
+ def forward(self, src_tokens, src_lengths, return_all_hiddens: bool = False):
+ self.layer_select.sample(self.lang_idx)
+ return super().forward(src_tokens, src_lengths, return_all_hiddens)
+
+
+class LatentTransformerEncoderLayer(TransformerEncoderLayer):
+ """Encoder layer with each (non_residual) block weighted by samples of Bernouli
+ or Gumbel Signmoid samples.
+
+ Args:
+ args (argparse.Namespace): parsed command-line arguments from standard
+ TransformerEncoderLayer.
+ idx (int): layer index (used to retrieve samples).
+ layer_select (LayerSelect, optional): instance of LayerSelect module with logits
+ parameters and sampling method.
+ """
+
+ def __init__(self, args, idx, layer_select=None):
+ super().__init__(args)
+ self.idx = idx
+ self.layer_select = layer_select
+
+ def residual_connection(self, x, residual):
+ return residual + x * self.layer_select(self.idx)
+
+
+class LatentTransformerDecoder(TransformerDecoder):
+ """Latent depth (https://arxiv.org/abs/2009.13102) implemented in
+ TransformerDecoder.
+ """
+
+ def __init__(
+ self, args, dictionary, embed_tokens, no_encoder_attn=False, num_logits=1
+ ):
+ self.num_logits = num_logits
+ self.num_layers = args.decoder_layers
+ super().__init__(
+ args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
+ )
+ self.layer_select = LayerSelect(
+ num_layers=self.num_layers,
+ num_logits=self.num_logits,
+ soft_select=getattr(args, "soft_select", False),
+ sampling_tau=getattr(args, "sampling_tau", 5.),
+ )
+ self.lang_idx = None
+ self.layers = nn.ModuleList(
+ [
+ self._build_decoder_layer(args, no_encoder_attn, idx)
+ for idx in range(args.decoder_layers)
+ ]
+ )
+
+ def set_lang_idx(self, lang_idx):
+ self.lang_idx = lang_idx
+
+ def _build_decoder_layer(self, args, no_encoder_attn=False, idx=None):
+ return LatentTransformerDecoderLayer(
+ args, idx, layer_select=self.layer_select, no_encoder_attn=no_encoder_attn
+ )
+
+ def forward(
+ self,
+ prev_output_tokens,
+ encoder_out: Optional[EncoderOut] = None,
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+ features_only: bool = False,
+ alignment_layer: Optional[int] = None,
+ alignment_heads: Optional[int] = None,
+ src_lengths: Optional[Any] = None,
+ return_all_hiddens: bool = False,
+ ):
+ self.layer_select.sample(self.lang_idx)
+ return super().forward(
+ prev_output_tokens=prev_output_tokens,
+ encoder_out=encoder_out,
+ incremental_state=incremental_state,
+ features_only=features_only,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+ src_lengths=src_lengths,
+ return_all_hiddens=return_all_hiddens,
+ )
+
+
+class LatentTransformerDecoderLayer(TransformerDecoderLayer):
+ """Decoder layer with each (non_residual) block weighted by samples of Bernouli
+ or Gumbel Signmoid samples.
+
+ Args:
+ args (argparse.Namespace): parsed command-line arguments from standard
+ TransformerDecoderLayer.
+ idx (int): layer index (used to retrieve samples).
+ layer_select (LayerSelect, optional): instance of LayerSelect module with logits
+ parameters and sampling method.
+ no_encoder_attn (bool, optional): whether to attend to encoder outputs
+ (default: False).
+
+ """
+
+ def __init__(
+ self,
+ args,
+ idx,
+ layer_select=None,
+ no_encoder_attn=False,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ ):
+ super().__init__(args, no_encoder_attn, add_bias_kv, add_zero_attn)
+ self.idx = idx
+ self.layer_select = layer_select
+
+ def residual_connection(self, x, residual):
+ return residual + x * self.layer_select(self.idx)
diff --git a/fairseq/examples/latent_depth/latent_depth_src/multilingual_translation_latent_depth.py b/fairseq/examples/latent_depth/latent_depth_src/multilingual_translation_latent_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cc2a7174b765b7ad8808489196e12082a91a2d7
--- /dev/null
+++ b/fairseq/examples/latent_depth/latent_depth_src/multilingual_translation_latent_depth.py
@@ -0,0 +1,195 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.tasks import register_task
+from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
+from fairseq.utils import safe_hasattr
+
+from .loss.latent_depth import LatentLayersKLLoss, LatentLayersSparsityLoss
+
+
+@register_task("multilingual_translation_latent_depth")
+class MultilingualTranslationTaskLatentDepth(MultilingualTranslationTask):
+ """A task for multiple translation with latent depth.
+
+ See `"Deep Transformer with Latent Depth"
+    (Li et al., 2020) <https://arxiv.org/abs/2009.13102>`_.
+ """
+
+ @staticmethod
+ def add_args(parser):
+ """Add task-specific arguments to the parser."""
+ # fmt: off
+ MultilingualTranslationTask.add_args(parser)
+ parser.add_argument('--encoder-latent-layer', action='store_true', help='latent layer selection in encoder')
+ parser.add_argument('--decoder-latent-layer', action='store_true', help='latent layer selection in decoder')
+ parser.add_argument('--target-layers', default=-1, type=int,
+ help='number of effective layers to learn; -1 means no constraint')
+ parser.add_argument('--sparsity-weight', default=0.0, type=float,
+ help='weight for sparsity loss')
+ parser.add_argument('--share-weight', default=0.0, type=float,
+ help='weight for sharing loss')
+ parser.add_argument('--soft-update', default=1, type=int,
+ help='number of updates with soft sampling')
+ parser.add_argument('--anneal-updates', default=1, type=int,
+ help='number of updates to anneal the KL loss weight')
+ parser.add_argument('--prior', default="uniform", type=str,
+ help='prior used for computing KL loss')
+ # fmt: on
+
+ def __init__(self, args, dicts, training):
+ super().__init__(args, dicts, training)
+ self.src_langs, self.tgt_langs = zip(
+ *[(lang.split("-")[0], lang.split("-")[1]) for lang in args.lang_pairs]
+ )
+ if self.training and self.encoder_latent_layer:
+ assert self.args.share_encoders
+ if self.training and self.decoder_latent_layer:
+ assert self.args.share_decoders
+ if training or self.encoder_latent_layer or self.decoder_latent_layer:
+ self.lang_pairs = args.lang_pairs
+ else:
+ self.lang_pairs = ["{}-{}".format(args.source_lang, args.target_lang)]
+ self.eval_lang_pairs = self.lang_pairs
+ self.model_lang_pairs = self.lang_pairs
+ if self.training and (self.encoder_latent_layer or self.decoder_latent_layer):
+ self.kl_loss = LatentLayersKLLoss(self.args)
+ self.sparsity_loss = LatentLayersSparsityLoss(self.args)
+
+ def _per_lang_pair_train_loss(
+ self, lang_pair, model, update_num, criterion, sample, optimizer, ignore_grad
+ ):
+ src, tgt = lang_pair.split("-")
+ if self.encoder_latent_layer:
+ src_lang_idx = self.src_lang_idx_dict[src]
+ model.models[lang_pair].encoder.set_lang_idx(src_lang_idx)
+ model.models[lang_pair].encoder.layer_select.hard_select = (
+ update_num > self.args.soft_update
+ )
+ if self.decoder_latent_layer:
+ tgt_lang_idx = self.tgt_lang_idx_dict[tgt]
+ model.models[lang_pair].decoder.set_lang_idx(tgt_lang_idx)
+ model.models[lang_pair].decoder.layer_select.hard_select = (
+ update_num > self.args.soft_update
+ )
+
+ loss, sample_size, logging_output = criterion(
+ model.models[lang_pair], sample[lang_pair]
+ )
+ if self.encoder_latent_layer:
+ none_samples = sum(
+ 1 if x is None else 0
+ for x in model.models[lang_pair].encoder.layer_select.layer_samples
+ )
+ if none_samples == 0 or self.args.prior != "agged_posterior":
+ loss += self.kl_loss(
+ model.models[lang_pair].encoder.layer_select.layer_samples,
+ src_lang_idx,
+ update_num,
+ sample_size,
+ )
+ if self.decoder_latent_layer:
+ none_samples = sum(
+ 1 if x is None else 0
+ for x in model.models[lang_pair].decoder.layer_select.layer_samples
+ )
+ if none_samples == 0 or self.args.prior != "agged_posterior":
+ loss += self.kl_loss(
+ model.models[lang_pair].decoder.layer_select.layer_samples,
+ tgt_lang_idx,
+ update_num,
+ sample_size,
+ )
+ if ignore_grad:
+ loss *= 0
+
+ if hasattr(self, "sparsity_loss") and self.sparsity_loss.is_valid(update_num):
+ # need to retain the graph if sparsity loss needs to be added
+ loss.backward(retain_graph=True)
+ else:
+ optimizer.backward(loss)
+
+ return loss, sample_size, logging_output
+
+ def train_step(
+ self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+ ):
+ agg_loss, agg_sample_size, agg_logging_output = super().train_step(
+ sample, model, criterion, optimizer, update_num, ignore_grad
+ )
+        # compute auxiliary loss from layer sparsity, based on all samples from all languages
+ if hasattr(self, "sparsity_loss") and self.sparsity_loss.is_valid(update_num):
+ sparsity_loss = 0
+ if self.encoder_latent_layer:
+ sparsity_loss += self.sparsity_loss(
+ next(
+ iter(model.models.values())
+ ).encoder.layer_select.layer_samples,
+ update_num,
+ agg_sample_size,
+ )
+ if self.decoder_latent_layer:
+ sparsity_loss += self.sparsity_loss(
+ next(
+ iter(model.models.values())
+ ).decoder.layer_select.layer_samples,
+ update_num,
+ agg_sample_size,
+ )
+ if sparsity_loss > 0:
+ optimizer.backward(sparsity_loss)
+ return agg_loss, agg_sample_size, agg_logging_output
+
+ def _per_lang_pair_valid_loss(self, lang_pair, model, criterion, sample):
+ src, tgt = lang_pair.split("-")
+ if self.encoder_latent_layer:
+ src_lang_idx = self.src_lang_idx_dict[src]
+ model.models[lang_pair].encoder.set_lang_idx(src_lang_idx)
+ if self.decoder_latent_layer:
+ tgt_lang_idx = self.tgt_lang_idx_dict[tgt]
+ model.models[lang_pair].decoder.set_lang_idx(tgt_lang_idx)
+ loss, sample_size, logging_output = criterion(
+ model.models[lang_pair], sample[lang_pair]
+ )
+ return loss, sample_size, logging_output
+
+ def inference_step(
+ self, generator, models, sample, prefix_tokens=None, constraints=None
+ ):
+ if self.encoder_latent_layer or self.decoder_latent_layer:
+ for model in models:
+ if self.encoder_latent_layer:
+ assert model.encoder.layer_select is not None
+ src_lang_idx = self.src_lang_idx_dict[self.args.source_lang]
+ model.encoder.set_lang_idx(src_lang_idx)
+ if self.decoder_latent_layer:
+ assert model.decoder.layer_select is not None
+ tgt_lang_idx = self.tgt_lang_idx_dict[self.args.target_lang]
+ model.decoder.set_lang_idx(tgt_lang_idx)
+ return super().inference_step(
+ generator, models, sample, prefix_tokens, constraints
+ )
+
+ @property
+ def encoder_latent_layer(self):
+ return (
+ safe_hasattr(self.args, "encoder_latent_layer")
+ and self.args.encoder_latent_layer
+ )
+
+ @property
+ def decoder_latent_layer(self):
+ return (
+ safe_hasattr(self.args, "decoder_latent_layer")
+ and self.args.decoder_latent_layer
+ )
+
+ @property
+ def src_lang_idx_dict(self):
+ return {lang: lang_idx for lang_idx, lang in enumerate(self.src_langs)}
+
+ @property
+ def tgt_lang_idx_dict(self):
+ return {lang: lang_idx for lang_idx, lang in enumerate(self.tgt_langs)}
diff --git a/fairseq/examples/layerdrop/README.md b/fairseq/examples/layerdrop/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4d48ee9615e1458e1e889635dc9938e427a7f64a
--- /dev/null
+++ b/fairseq/examples/layerdrop/README.md
@@ -0,0 +1,154 @@
+# Reducing Transformer Depth on Demand with Structured Dropout (Fan et al., 2019)
+This page contains information for how to train models with LayerDrop, based on this [paper](https://arxiv.org/abs/1909.11556).
+
+## Citation:
+If you found this technique useful, please cite our paper:
+```bibtex
+@article{fan2019reducing,
+ title={Reducing Transformer Depth on Demand with Structured Dropout},
+ author={Fan, Angela and Grave, Edouard and Joulin, Armand},
+ journal={arXiv preprint arXiv:1909.11556},
+ year={2019}
+}
+```
+
+## Pre-trained models
+
+Model | Description | Download
+---|---|---
+`layerdrop_wmt_en_de_12_6` | Transformer + LayerDrop 0.2 trained on WMT16 en-de with 12 encoder and 6 decoder layers | [layerdrop_wmt_en_de_12_6.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/layerdrop_wmt_en_de_12_6.tar.gz)
+`roberta_layerdrop.base` | RoBERTa Base + LayerDrop 0.2 | [roberta_layerdrop.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.base.qnli.tar.gz)
+`roberta_layerdrop.large` | RoBERTa Large + LayerDrop 0.2 | [roberta_layerdrop.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.large.tar.gz)
+`roberta_layerdrop.large.mnli` | `roberta_layerdrop.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | [roberta_layerdrop.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.large.mnli.tar.gz)
+`roberta_layerdrop.large.qnli` | `roberta_layerdrop.large` finetuned on [QNLI](https://arxiv.org/abs/1804.07461) | [roberta_layerdrop.large.qnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.large.qnli.tar.gz)
+
+
+Evaluate performance of these pre-trained models:
+```bash
+# Example for Machine Translation
+fairseq-generate /path/to/bped/wmt/data --path nmt_checkpoint.pt \
+ --beam 8 --lenpen 0.4 \
+ --batch-size 64 \
+ --remove-bpe \
+ --gen-subset test > wmt16_gen.txt
+bash scripts/compound_split_bleu.sh wmt16_gen.txt
+# prints BLEU4 = 30.17
+```
+
+```python
+# Example for RoBERTa + LayerDrop finetuned on MNLI:
+from fairseq.models.roberta import RobertaModel
+
+roberta_layerdrop = RobertaModel.from_pretrained(
+ '/path/to/MNLI/model',
+ checkpoint_file='mnli_checkpoint.pt',
+ data_name_or_path='/path/to/MNLI/data/MNLI-bin'
+)
+label_map = {0: 'contradiction', 2: 'neutral', 1: 'entailment'}
+ncorrect, nsamples = 0, 0
+roberta_layerdrop.cuda()
+roberta_layerdrop.eval()
+with open('/path/to/MNLI/data/dev_matched.tsv') as fin:
+ fin.readline()
+ for index, line in enumerate(fin):
+ tokens = line.strip().split('\t')
+ sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
+ tokens = roberta_layerdrop.encode(sent1, sent2)
+ prediction = roberta_layerdrop.predict('sentence_classification_head', tokens).argmax().item()
+ prediction_label = label_map[prediction]
+ ncorrect += int(prediction_label == target)
+ nsamples += 1
+print('| Accuracy: ', float(ncorrect)/float(nsamples))
+# prints | Accuracy: 0.9026999490575649
+
+
+# Example for RoBERTa + LayerDrop finetuned on QNLI:
+roberta = RobertaModel.from_pretrained(
+ '/path/to/QNLI/model',
+ checkpoint_file='qnli_checkpoint.pt',
+ data_name_or_path='/path/to/QNLI/data/QNLI-bin'
+)
+
+label_fn = lambda label: roberta.task.label_dictionary.string(
+ [label + roberta.task.target_dictionary.nspecial]
+)
+ncorrect, nsamples = 0, 0
+roberta.cuda()
+roberta.eval()
+with open('/path/to/QNLI/data/dev.tsv') as fin:
+ fin.readline()
+ for index, line in enumerate(fin):
+ tokens = line.strip().split('\t')
+ sent1, sent2, target = tokens[1], tokens[2], tokens[3]
+ tokens = roberta.encode(sent1, sent2)
+ prediction = roberta.predict('sentence_classification_head', tokens).argmax().item()
+ prediction_label = label_fn(prediction)
+ ncorrect += int(prediction_label == target)
+ nsamples += 1
+print('| Accuracy: ', float(ncorrect)/float(nsamples))
+# prints | Accuracy: 0.9480139117700896
+```
+
+
+## Example usage
+
+To train a model with LayerDrop, add the following flags. We recommend 0.2, a value that worked well in our experiments. For decoder-only models such as language models, you only need the decoder flag; for encoder-only models such as RoBERTa, you only need the encoder flag. The encoder and decoder LayerDrop values can be set differently.
+```
+--encoder-layerdrop 0.2 --decoder-layerdrop 0.2
+```
+
+To prune a model that has been trained with LayerDrop, add the following flags, each followed by a comma-separated list of the layers you would like to keep.
+```
+--encoder-layers-to-keep 0,2,4,6,8,10,12,14 --decoder-layers-to-keep 0,2,4,6,8,10,12,14
+```
+Setting these flags should print a message such as:
+```
+| Pruning model to specified layer configuration
+```
+You should also see a smaller number of parameters in the model, for example the 16-Layer Transformer Language Model prints:
+```
+num. model params: 246933504
+```
+while a model pruned to 8 Layers prints:
+```
+num. model params: 146163712
+```
+
+If you would like to resume training with a model that has been pruned, simply adding these flags is sufficient. If you would like to use a script that only does evaluation (no training), you may need to pass an override command. A specific example for language modeling:
+```bash
+fairseq-eval-lm /path/to/wikitext-103 \
+ --path /path/to/model/checkpoint.pt \
+ --model-overrides "{'decoder_layers_to_keep':'0,2,4,6,8,10,12,14'}"
+```
+This model override command overrides the training parameters and updates the model arguments so that the pruned model is run instead of the full model.
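+
+The same overrides can also be applied when loading a checkpoint in Python. A
+minimal sketch using `checkpoint_utils.load_model_ensemble_and_task` with
+`arg_overrides` (the checkpoint path is a placeholder):
+```python
+from fairseq import checkpoint_utils
+
+models, _, task = checkpoint_utils.load_model_ensemble_and_task(
+    ["/path/to/model/checkpoint.pt"],
+    arg_overrides={"decoder_layers_to_keep": "0,2,4,6,8,10,12,14"},
+)
+model = models[0]
+# the parameter count should reflect the pruned layer configuration
+print(sum(p.numel() for p in model.parameters()))
+```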
+
+## Reproduce Paper Results
+
+Looking to reproduce the results in the paper?
+
+1. For Translation on WMT16 en-de, we followed this setting [here](https://github.com/pytorch/fairseq/blob/main/examples/scaling_nmt/README.md)
+2. To train RoBERTa, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/roberta)
+3. To train Language Models on Wikitext-103, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/language_model)
+
+
+## Tips
+
+1. If you would like to train large models with better performance, LayerDrop should be set to a smaller value such as 0.1 or 0.2. Too much LayerDrop will mean the model has too much regularization, so may not reach the best performance. Since LayerDrop adds regularization, you may achieve the best performance by slightly reducing the amount of standard dropout (for example, reduce by 0.1).
+
+2. If you would like to train large models to be pruned and made smaller, LayerDrop should be set to a larger value such as 0.5 if you want to prune very aggressively (such as removing half the network or more). If you would like to prune fewer layers away, LayerDrop can be set to a smaller value such as 0.2. Our experiments were conducted with low values of LayerDrop (such as 0.1 and 0.2), for reference.
+
+3. When pruning layers at inference time, it is best to spread out the layers remaining so they are evenly spaced throughout the network. For example, if you want to remove 50% of the network, keeping every other layer is good.
+
+
+## FAQ
+
+1. How did the sharing layers experiment work? In an appendix (https://openreview.net/pdf?id=SylO2yStDr) we added an experiment on Wikitext-103 language modeling that combined LayerDrop with Weight Sharing. We shared chunks of 2 layers such that every other layer had shared weights. For example, if our network has layers 1 through 6, then layer 1 and 2 are shared, layer 3 and 4 are shared, and layer 5 and 6 are shared.
+
+2. LayerDrop hasn't been helping in my setting? During training time, LayerDrop can help regularize your network. This is most important if your network is already overfitting - if your network is underfitting, it is possible LayerDrop is adding too much regularization. We recommend using smaller values (such as 0.1 or 0.2) and also decreasing the quantity of standard dropout (for example, reduce by 0.1).
+
+3. Can you train a model without LayerDrop and finetune with LayerDrop (e.g. for BERT)? In our experiments, we did not see great performance. Models such as RoBERTa have trained for a long time in the pre-training setting, so only finetuning with LayerDrop for a few epochs on a downstream task such as MNLI does not achieve the robustness required for successful pruning.
+
+
+## Having an issue or have a question?
+
+Please open an issue in this repository with the details of your question. Thanks!
diff --git a/fairseq/examples/linformer/linformer_src/models/__init__.py b/fairseq/examples/linformer/linformer_src/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391