diff --git a/fairseq/fairseq.egg-info/not-zip-safe b/fairseq/fairseq.egg-info/not-zip-safe
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/fairseq/fairseq.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/fairseq/tests/distributed/__init__.py b/fairseq/tests/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/tests/distributed/test_bmuf.py b/fairseq/tests/distributed/test_bmuf.py
new file mode 100644
index 0000000000000000000000000000000000000000..995d0db18080f9b1a2fca65206c5f00dfa1ff90e
--- /dev/null
+++ b/fairseq/tests/distributed/test_bmuf.py
@@ -0,0 +1,204 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import functools
+import random
+import unittest
+from multiprocessing import Manager
+
+import torch
+import torch.nn as nn
+from omegaconf import OmegaConf
+
+from fairseq import optim
+from fairseq.distributed import utils as distributed_utils
+
+
+class Model(nn.Module):
+    """Single linear layer mapping ``input_size`` features to ``output_size``."""
+    def __init__(self, input_size, output_size):
+        super().__init__()
+        self.fc = nn.Linear(input_size, output_size)
+
+    def forward(self, input):
+        return self.fc(input)
+
+
+def setup_model_loss_criterion(cfg, args, rank, is_cuda):
+    """Build the model, loss and BMUF-wrapped SGD optimizer for one worker.
+
+    Stores ``rank`` on both ``args`` and ``cfg``, initialises distributed
+    state when the configured world size is above one, and seeds torch with 1
+    before the model is created.
+    """
+    args.distributed_rank = cfg.distributed_training.distributed_rank = rank
+    if cfg.distributed_training.distributed_world_size > 1:
+        distributed_utils.distributed_init(cfg)
+
+    torch.manual_seed(1)
+    net = Model(args.input_size, args.nb_classes)
+    criterion = nn.CrossEntropyLoss()
+    if is_cuda:
+        net, criterion = net.cuda(), criterion.cuda()
+
+    sgd = optim.sgd.SGD(args, net.parameters())
+    return net, criterion, optim.FairseqBMUF(cfg=cfg.bmuf, optimizer=sgd)
+
+
+def train_step(input, target, model, loss_fn, optimizer, **unused):
+    """Run one update: forward pass, loss, backward pass, optimizer step."""
+    model.train()
+    prediction = model(input)
+    # Backward runs through the optimizer's ``backward``, not ``loss.backward()``.
+    optimizer.backward(loss_fn(prediction, target))
+    optimizer.step()
+
+
+def single_gpu_training(cfg, args, rank, iterations, shared_results):
+    """Train one worker on random data and publish its final parameters.
+
+    Selects GPU ``rank`` when CUDA is available. After ``iterations``
+    updates, all model parameters are flattened and concatenated into one
+    CPU tensor stored at ``shared_results[rank]``.
+    """
+    is_cuda = torch.cuda.is_available()
+    if is_cuda:
+        torch.cuda.set_device(rank)
+
+    model, loss_fn, optimizer = setup_model_loss_criterion(cfg, args, rank, is_cuda)
+
+    for _ in range(iterations):
+        # Size the input batch like the target batch so the loss always sees
+        # matching leading dimensions, whatever ``args.batch_size`` is.
+        input = torch.randn(args.batch_size, args.input_size)
+        target = torch.empty(args.batch_size, dtype=torch.long).random_(args.nb_classes)
+        if is_cuda:
+            input, target = input.cuda(), target.cuda()
+        train_step(input, target, model, loss_fn, optimizer)
+
+    # One flat vector per worker makes the cross-worker comparison a single diff.
+    flat_params = [param.flatten().cpu().data for param in model.parameters()]
+    shared_results[rank] = torch.cat(flat_params, 0)
+
+
+def setup_args():
+    """Return ``(cfg, args)`` for a two-worker gloo BMUF run on localhost."""
+    args = argparse.Namespace()
+    args.block_momentum = 0.875
+    args.block_lr = 0.5
+    args.input_size = 5
+    args.nb_classes = 2
+    args.batch_size = 1
+    args.lr = [1e-3]
+    args.momentum = 0
+    args.weight_decay = 0
+    args.warmup_iterations = 0
+    args.use_nbm = True
+    args.average_sync = True
+    args.global_sync_iter = 1
+    args.model_parallel_size = 1
+    args.distributed_backend = "gloo"
+    # Rendezvous endpoint; the port is drawn at random on every call.
+    args.distributed_world_size = 2
+    port = random.randint(10000, 20000)
+    args.distributed_init_method = "tcp://localhost:{port}".format(port=port)
+    args.distributed_init_host = "localhost"
+    args.distributed_port = port + 1
+    args.local_world_size = args.distributed_world_size
+
+    cfg = OmegaConf.create()
+    cfg.optimization = OmegaConf.create()
+    cfg.common = OmegaConf.create()
+    cfg.distributed_training = OmegaConf.create()
+    cfg.dataset = OmegaConf.create()
+    cfg.bmuf = OmegaConf.create()
+    cfg.optimizer = OmegaConf.create()
+    # Mirror the flat ``args`` values into their config groups.
+    cfg.bmuf.global_sync_iter = args.global_sync_iter
+    cfg.bmuf.block_momentum = args.block_momentum
+    cfg.bmuf.block_lr = args.block_lr
+    cfg.dataset.batch_size = args.batch_size
+    cfg.optimization.lr = args.lr
+    cfg.optimizer.momentum = args.momentum
+    cfg.optimizer.weight_decay = args.weight_decay
+    cfg.bmuf.warmup_iterations = args.warmup_iterations
+    cfg.bmuf.use_nbm = args.use_nbm
+    cfg.bmuf.average_sync = args.average_sync
+    cfg.common.model_parallel_size = args.model_parallel_size
+    cfg.distributed_training.distributed_backend = args.distributed_backend
+    cfg.distributed_training.distributed_world_size = args.distributed_world_size
+    cfg.bmuf.distributed_world_size = args.distributed_world_size
+    cfg.distributed_training.distributed_init_method = args.distributed_init_method
+    cfg.distributed_training.distributed_port = args.distributed_port
+
+    return cfg, args
+
+
+@unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs")
+class TestBMUF(unittest.TestCase):
+    """Check that BMUF training leaves all workers with matching parameters."""
+
+    def bmuf_process(self, cfg, args, iterations):
+        """Spawn one training process per worker and return their results."""
+        results = Manager().dict()
+        torch.multiprocessing.spawn(
+            fn=functools.partial(single_gpu_training, cfg, args),
+            args=(iterations, results),
+            nprocs=args.distributed_world_size,
+            join=True,
+        )
+        return results
+
+    def test_bmuf_sync(self):
+        """Train for 1 iteration and do a BMUF sync without any warmup."""
+        cfg, args = setup_args()
+        results = self.bmuf_process(cfg, args, 1)
+        # Make sure params in both machines are same
+        self.assertEqual(len(results), 2)
+        self.assertAlmostEqual(results[0], results[1])
+
+    def test_warmup_sync(self):
+        """Train for 20 iterations and do a warmup sync without a BMUF sync."""
+        cfg, args = setup_args()
+        args.warmup_iterations = 20
+        cfg.bmuf.warmup_iterations = args.warmup_iterations
+        results = self.bmuf_process(cfg, args, 20)
+        # Make sure params in both machines are same
+        self.assertEqual(len(results), 2)
+        self.assertAlmostEqual(results[0], results[1])
+
+    def test_warmup_sync_bmuf_sync(self):
+        """Train for 25 iterations: warmup sync after 20, BMUF sync after 25."""
+        cfg, args = setup_args()
+        args.warmup_iterations = 20
+        args.global_sync_iter = 5
+        cfg.bmuf.warmup_iterations = args.warmup_iterations
+        cfg.bmuf.global_sync_iter = args.global_sync_iter
+        results = self.bmuf_process(cfg, args, 25)
+        # Make sure params in both machines are same
+        self.assertEqual(len(results), 2)
+        self.assertAlmostEqual(results[0], results[1])
+
+    def test_single_gpu_bmuf(self):
+        """Train a single worker for 20 iterations with warmup_iterations=5."""
+        cfg, args = setup_args()
+        args.distributed_world_size = 1
+        args.warmup_iterations = 5
+        cfg.distributed_training.distributed_world_size = args.distributed_world_size
+        cfg.bmuf.distributed_world_size = args.distributed_world_size
+        cfg.bmuf.warmup_iterations = args.warmup_iterations
+        results = self.bmuf_process(cfg, args, 20)
+        self.assertEqual(len(results), 1)
+
+    def assertAlmostEqual(self, t1, t2):
+        """Assert two tensors have the same size and differ by less than 1e-4."""
+        # Shadows ``TestCase.assertAlmostEqual`` with a tensor-specific check.
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertLess((t1 - t2).abs().max(), 1e-4)
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/fairseq/tests/distributed/test_distributed_timeout_wrapper.py b/fairseq/tests/distributed/test_distributed_timeout_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..996093cb2d73fb3d5a41e65fbac4bd61bf122134
--- /dev/null
+++ b/fairseq/tests/distributed/test_distributed_timeout_wrapper.py
@@ -0,0 +1,52 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import signal
+import time
+import unittest
+
+import torch
+from torch import nn
+
+from fairseq.distributed import DistributedTimeoutWrapper
+
+
+class ModuleWithDelay(nn.Module):
+    def __init__(self, delay):
+        super().__init__()
+        self.delay = delay  # seconds to sleep on every forward() call
+
+    def forward(self, x):
+        time.sleep(self.delay)  # block first, then act as the identity
+        return x
+
+
+class TestDistributedTimeoutWrapper(unittest.TestCase):
+    def setUp(self):
+        logging.disable(logging.CRITICAL)  # silence log output during each test
+
+    def tearDown(self):
+        logging.disable(logging.NOTSET)  # re-enable logging afterwards
+
+    def test_no_timeout(self):
+        module = DistributedTimeoutWrapper(ModuleWithDelay(1), 0, signal.SIGINT)
+        module(torch.rand(5))  # 1 s forward; timeout argument 0 must not interrupt it
+        module.stop_timeout()
+
+    def test_timeout_safe(self):
+        module = DistributedTimeoutWrapper(ModuleWithDelay(1), 10, signal.SIGINT)
+        module(torch.rand(5))  # 1 s forward under timeout argument 10 (presumably seconds)
+        module.stop_timeout()
+
+    def test_timeout_killed(self):
+        with self.assertRaises(KeyboardInterrupt):
+            module = DistributedTimeoutWrapper(ModuleWithDelay(5), 1, signal.SIGINT)
+            module(torch.rand(5))  # 5 s forward under timeout argument 1: SIGINT expected
+            module.stop_timeout()  # not reached when the interrupt fires above
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/fairseq/tests/distributed/test_module_proxy_wrapper.py b/fairseq/tests/distributed/test_module_proxy_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ac1a877c3dc137cf32d01e080c61462711432b3
--- /dev/null
+++ b/fairseq/tests/distributed/test_module_proxy_wrapper.py
@@ -0,0 +1,74 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from torch import nn
+
+from fairseq.distributed import ModuleProxyWrapper
+
+from .utils import objects_are_equal
+
+
+class MockDDPWrapper(nn.Module):
+    """A simple wrapper with an interface similar to DistributedDataParallel."""
+
+    def __init__(self, module):
+        super().__init__()
+        self.module = module  # wrapped model, exposed as ``.module``
+
+    def forward(self, x):
+        return self.module(x)  # delegate straight to the wrapped model
+
+
+class Model(nn.Module):  # toy model: one Linear(5, 10) plus a string attribute
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(5, 10)
+        self.xyz = "hello"  # plain attribute read and written by the getattr test
+
+    def forward(self, x):
+        return self.linear(x)
+
+    def get_xyz(self):
+        return self.xyz  # reads the model's own attribute, bypassing any wrapper
+
+
+class TestModuleProxyWrapper(unittest.TestCase):
+    """Check that ModuleProxyWrapper forwards to the innermost wrapped model."""
+    def _get_module(self):
+        """Return ``(proxy, model)`` with the model wrapped twice."""
+        module = Model()
+        return ModuleProxyWrapper(MockDDPWrapper(module)), module
+
+    def test_getattr_forwarding(self):
+        wrapped_module, module = self._get_module()
+        self.assertEqual(module.xyz, "hello")
+        self.assertEqual(module.get_xyz(), "hello")
+        self.assertEqual(wrapped_module.xyz, "hello")
+        # Assigning through the proxy must leave the wrapped model untouched.
+        wrapped_module.xyz = "world"
+        self.assertEqual(wrapped_module.xyz, "world")
+        self.assertEqual(module.get_xyz(), "hello")
+
+    def test_state_dict(self):
+        proxy, module = self._get_module()
+        self.assertTrue(objects_are_equal(proxy.state_dict(), module.state_dict()))
+
+    def test_load_state_dict(self):
+        wrapped_module, module = self._get_module()
+        wrapped_module.load_state_dict(module.state_dict())
+        input = torch.rand(4, 5)
+        torch.testing.assert_allclose(wrapped_module(input), module(input))
+
+    def test_forward(self):
+        wrapped_module, module = self._get_module()
+        input = torch.rand(4, 5)
+        torch.testing.assert_allclose(wrapped_module(input), module(input))
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/fairseq/tests/distributed/test_utils.py b/fairseq/tests/distributed/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..30f995b67acd39af5816d2eb412d6b4df7f44f8c
--- /dev/null
+++ b/fairseq/tests/distributed/test_utils.py
@@ -0,0 +1,124 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import functools
+import sys
+import unittest
+
+import torch
+
+from fairseq.distributed import utils as dist_utils
+
+from .utils import objects_are_equal, spawn_and_init
+
+
+class DistributedTest(unittest.TestCase):  # base class: gates tests on hardware
+    def setUp(self):
+        if not torch.cuda.is_available():
+            self.skipTest("CUDA not available, skipping test")
+        if sys.platform == "win32":
+            self.skipTest("NCCL doesn't support Windows, skipping test")
+        if torch.cuda.device_count() < 2:
+            self.skipTest("distributed tests require 2+ GPUs, skipping")
+
+
+class TestBroadcastObject(DistributedTest):
+    """Exercise ``broadcast_object`` across two spawned ranks.
+
+    Rank 0 supplies the object; every rank must end up with an equal copy.
+    """
+
+    def _broadcast_on_two_ranks(self, ref_obj):
+        """Run ``_test_broadcast_object`` on ``ref_obj`` with two ranks."""
+        worker = functools.partial(
+            TestBroadcastObject._test_broadcast_object, ref_obj
+        )
+        spawn_and_init(worker, world_size=2)
+
+    def test_str(self):
+        """Broadcast a plain string."""
+        self._broadcast_on_two_ranks("hello world")
+
+    def test_tensor(self):
+        """Broadcast a CPU tensor."""
+        self._broadcast_on_two_ranks(torch.rand(5))
+
+    def test_complex(self):
+        """Broadcast a dict nesting a list, tuple, set and CPU/GPU tensors."""
+        payload = {
+            "a": "1",
+            "b": [2, torch.rand(2, 3), 3],
+            "c": (torch.rand(2, 3), 4),
+            "d": {5, torch.rand(5)},
+            "e": torch.rand(5),
+            "f": torch.rand(5).int().cuda(),
+        }
+        self._broadcast_on_two_ranks(payload)
+
+    @staticmethod
+    def _test_broadcast_object(ref_obj, rank, group):
+        """Worker body: receive rank 0's object and compare it with ``ref_obj``."""
+        received = dist_utils.broadcast_object(
+            ref_obj if rank == 0 else None, src_rank=0, group=group
+        )
+        assert objects_are_equal(ref_obj, received)
+
+
+class TestAllGatherList(DistributedTest):
+    """Exercise ``all_gather_list`` across two spawned ranks.
+
+    Covers the case where every rank contributes an equal object and the
+    case where each rank contributes a tensor holding its own rank.
+    """
+
+    def _gather_on_two_ranks(self, ref_obj):
+        """Run ``_test_all_gather_list_equality`` on ``ref_obj`` with two ranks."""
+        worker = functools.partial(
+            TestAllGatherList._test_all_gather_list_equality, ref_obj
+        )
+        spawn_and_init(worker, world_size=2)
+
+    def test_str_equality(self):
+        """A plain string is gathered unchanged."""
+        self._gather_on_two_ranks("hello world")
+
+    def test_tensor_equality(self):
+        """A CPU tensor is gathered unchanged."""
+        self._gather_on_two_ranks(torch.rand(5))
+
+    def test_complex_equality(self):
+        """A dict nesting a list, tuple, set and tensors is gathered unchanged."""
+        payload = {
+            "a": "1",
+            "b": [2, torch.rand(2, 3), 3],
+            "c": (torch.rand(2, 3), 4),
+            "d": {5, torch.rand(5)},
+            "e": torch.rand(5),
+            "f": torch.rand(5).int(),
+        }
+        self._gather_on_two_ranks(payload)
+
+    @staticmethod
+    def _test_all_gather_list_equality(ref_obj, rank, group):
+        """Worker body: every gathered entry must equal ``ref_obj``."""
+        for gathered in dist_utils.all_gather_list(ref_obj, group):
+            assert objects_are_equal(ref_obj, gathered)
+
+    def test_rank_tensor(self):
+        """Each rank contributes a tensor holding its own rank."""
+        spawn_and_init(
+            TestAllGatherList._test_all_gather_list_rank_tensor, world_size=2
+        )
+
+    @staticmethod
+    def _test_all_gather_list_rank_tensor(rank, group):
+        """Worker body: entry ``i`` of the gathered list must hold ``i``."""
+        gathered = dist_utils.all_gather_list(torch.tensor([rank]), group)
+        for index, entry in enumerate(gathered):
+            assert entry.item() == index
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/fairseq/tests/distributed/utils.py b/fairseq/tests/distributed/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..be4e19cd1e2182903e9dad6b0e52bf33b9968c9d
--- /dev/null
+++ b/fairseq/tests/distributed/utils.py
@@ -0,0 +1,65 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import functools
+import tempfile
+
+import torch
+
+
+def spawn_and_init(fn, world_size, args=None):
+    """Run ``fn(rank, group, *args)`` in ``world_size`` spawned processes.
+
+    Each process goes through ``init_and_run``, which receives the name of a
+    temporary file created here.
+    """
+    extra_args = () if args is None else args
+    # delete=False: the file is not removed when the ``with`` block closes it.
+    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+        worker = functools.partial(init_and_run, fn, extra_args)
+        torch.multiprocessing.spawn(
+            fn=worker, args=(world_size, tmp_file.name), nprocs=world_size, join=True
+        )
+
+
+def distributed_init(rank, world_size, tmp_file):
+    """Join the NCCL process group via ``tmp_file``, then select GPU ``rank``."""
+    # File-based rendezvous: all ranks are given the same path.
+    rendezvous = "file://{}".format(tmp_file)
+    torch.distributed.init_process_group(
+        backend="nccl", init_method=rendezvous, world_size=world_size, rank=rank
+    )
+    torch.cuda.set_device(rank)
+
+
+def init_and_run(fn, args, rank, world_size, tmp_file):
+    """Initialise distributed state, then call ``fn(rank, group, *args)``."""
+    distributed_init(rank, world_size, tmp_file)
+    fn(rank, torch.distributed.new_group(), *args)
+
+
+def objects_are_equal(a, b) -> bool:
+    """Deep-compare ``a`` and ``b``: same type, containers element by element
+    (sets in any order), tensors equal in size, dtype, device and values.
+    """
+    if type(a) is not type(b):
+        return False
+    if isinstance(a, dict):
+        return a.keys() == b.keys() and all(objects_are_equal(a[k], b[k]) for k in a)
+    if isinstance(a, (list, tuple)):
+        return len(a) == len(b) and all(map(objects_are_equal, a, b))
+    if isinstance(a, set):
+        # Equal sets may iterate in different orders: pair elements up greedily.
+        unmatched = list(b)
+        for x in a:
+            hits = [i for i, y in enumerate(unmatched) if objects_are_equal(x, y)]
+            if not hits:
+                return False
+            del unmatched[hits[0]]
+        return not unmatched
+    if torch.is_tensor(a):
+        same_meta = a.size() == b.size() and a.dtype == b.dtype and a.device == b.device
+        return same_meta and bool(torch.all(a == b))
+    return a == b
diff --git a/fairseq/tests/gpu/__init__.py b/fairseq/tests/gpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/tests/gpu/test_binaries_gpu.py b/fairseq/tests/gpu/test_binaries_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5caf94cde7fb0e63a9c2c4fe85133e036a57ba01
--- /dev/null
+++ b/fairseq/tests/gpu/test_binaries_gpu.py
@@ -0,0 +1,590 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import json
+import logging
+import os
+import tempfile
+import unittest
+from io import StringIO
+
+import torch
+
+from fairseq import options
+from fairseq_cli import train
+from tests.utils import (
+    create_dummy_data,
+    generate_main,
+    preprocess_lm_data,
+    preprocess_translation_data,
+    train_language_model,
+    train_translation_model,
+)
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestMultiGPU(unittest.TestCase):
+    """Helpers to check that resumed multi-GPU training matches a full run."""
+
+    @staticmethod
+    def parse_logs(logfile):
+        """Return the parsed JSON lines of ``logfile``, skipping all other lines."""
+        logs = []
+        with open(logfile, "r") as f:
+            for ln in f:
+                with contextlib.suppress(json.JSONDecodeError):
+                    logs.append(json.loads(ln))
+        return logs
+
+    @property
+    def world_size(self):
+        """Number of visible CUDA devices."""
+        return torch.cuda.device_count()
+
+    def train_flags(self, mu):
+        """Return the shared training CLI flags, capped at ``mu`` updates."""
+        return [
+            "--memory-efficient-fp16",
+            "--update-freq",
+            "1",
+            "--seed",
+            "1",
+            "--log-format",
+            "json",
+            "--max-update",
+            str(mu),
+            "--tokens-per-sample",
+            "20",
+            "--batch-size",
+            "2",
+            "--share-decoder-input-output-embed",
+            "--optimizer",
+            "adam",
+            "--max-valid-steps",
+            "1",
+            "--pad-to-fixed-length",
+            "--sample-break-mode",
+            "none",
+        ]
+
+    def _test_resume_multilingual_training(
+        self, extra_clargs, arch="transformer_lm_gpt2_tiny"
+    ):
+        """Train for 10 updates, then resume from the update-5 checkpoint.
+
+        The resumed run must start at update 6 and match the final metrics.
+        """
+        languages = ["en_XX", "fr_XX", "zh_CN"]
+        save_interval = 5
+        mu = 10
+        flags = (
+            self.train_flags(mu)
+            + ["--save-interval-updates", str(save_interval), "--log-interval", "1"]
+            + extra_clargs
+        )
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_fp16") as data_dir:
+                log = os.path.join(data_dir, "train.log")
+                create_dummy_data(
+                    data_dir,
+                    # make sure enough data for max updates
+                    num_examples=int(mu * 20 * self.world_size * 1.5),
+                    languages=languages,
+                )
+                preprocess_lm_data(data_dir, languages)
+                train_language_model(
+                    data_dir,
+                    arch,
+                    flags + ["--log-file", log],
+                    task="multilingual_language_modeling",
+                    world_size=self.world_size,
+                )
+                log2 = os.path.join(data_dir, "resume.log")
+                ckpt_name = f"checkpoint_1_{save_interval}.pt"
+                restore_file = os.path.join(data_dir, ckpt_name)
+                train_language_model(
+                    data_dir,
+                    arch,
+                    flags
+                    + ["--log-file", log2, "--restore-file", restore_file, "--no-save"],
+                    task="multilingual_language_modeling",
+                    world_size=self.world_size,
+                )
+
+                l1 = self.parse_logs(log)
+                assert (
+                    int(l1[-1]["train_num_updates"]) == mu
+                ), f"The first run did not complete {mu} updates. Add more data"
+                l2 = self.parse_logs(log2)
+                if int(l2[0]["num_updates"]) != save_interval + 1:
+                    all_ckpt_files = [
+                        x for x in os.listdir(data_dir) if x.endswith(".pt")
+                    ]
+                    import shutil
+                    shutil.move(data_dir, "last_failed_resume")
+                    raise AssertionError(
+                        f"Likely failed to load {ckpt_name}. {all_ckpt_files} \n LOGS: {l1} \n\n {l2}. "
+                    )
+                for k in ("train_loss", "train_num_updates", "train_ppl", "train_gnorm"):
+                    from_scratch, resumed = float(l1[-1][k]), float(l2[-1][k])
+                    # This fails without rounding!
+                    assert (
+                        from_scratch == resumed
+                    ), f"difference at {k} {from_scratch} != {resumed}"
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestTranslationGPU(unittest.TestCase):
+    def setUp(self):
+        logging.disable(logging.CRITICAL)
+
+    def tearDown(self):
+        logging.disable(logging.NOTSET)
+
+    def test_fp16_multigpu(self):
+        self._test_multigpu("test_fp16", ["--fp16"])
+
+    def test_slowmo_multigpu(self):
+        self._test_multigpu(
+            "test_slowmo", ["--ddp-backend", "slowmo", "--nprocs-per-node", "1"]
+        )
+
+    def test_slowmo_single_node_multigpu(self):
+        self._test_multigpu(
+            "test_slowmo_single_node",
+            ["--ddp-backend", "slowmo", "--nprocs-per-node", "2"],
+        )
+
+    def _test_multigpu(self, test_name, test_args):
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory(test_name) as data_dir:
+                log = os.path.join(data_dir, "train.log")
+                create_dummy_data(data_dir)
+                preprocess_translation_data(data_dir)
+                train_translation_model(
+                    data_dir,
+                    "fconv_iwslt_de_en",
+                    test_args + ["--log-file", log],
+                    world_size=min(torch.cuda.device_count(), 2),
+                )
+                generate_main(data_dir)
+                assert os.path.exists(log)
+
+    @staticmethod
+    def parse_logs(logfile):
+        logs = []
+        for ln in open(logfile, "r").readlines():
+            try:
+                logs.append(json.loads(ln))
+            except json.JSONDecodeError:
+                continue
+        return logs
+
+    def test_resume_training_fsdp(self):
+        self._test_resume_training(["--ddp-backend", "fully_sharded"])
+
+    def test_resume_training_fsdp_sharded_state(self):
+        self._test_resume_training(
+            ["--ddp-backend", "fully_sharded", "--use-sharded-state"]
+        )
+
+    def test_resume_training_noc10d(self):
+        self._test_resume_training([])
+
+    def _test_resume_training(self, extra_clargs, arch="fconv_iwslt_de_en"):
+        flags = [
+            "--fp16",
+            "--log-format",
+            "json",
+            "--max-update",
+            "10",
+            "--save-interval-updates",
+            "2",
+            "--log-interval",
+            "1",
+        ] + extra_clargs
+        world_size = min(torch.cuda.device_count(), 2)
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_fp16") as data_dir:
+                log = os.path.join(data_dir, "train.log")
+                create_dummy_data(data_dir)
+                preprocess_translation_data(data_dir)
+                train_translation_model(
+                    data_dir,
+                    arch,
+                    flags + ["--log-file", log],
+                    world_size=world_size,
+                )
+                log2 = os.path.join(data_dir, "resume.log")
+                restore_file = os.path.join(data_dir, "checkpoint_1_2.pt")
+                train_translation_model(
+                    data_dir,
+                    arch,
+                    flags + ["--log-file", log2, "--restore-file", restore_file],
+                    world_size=world_size,
+                )
+
+                l1 = self.parse_logs(log)
+                l2 = self.parse_logs(log2)
+                assert int(l2[0]["num_updates"]) == 3, f"{l1}\n\n {l2}"
+                for k in [
+                    "train_loss",
+                    "train_num_updates",
+                    "train_ppl",
+                    "train_gnorm",
+                ]:
+                    from_scratch, resumed = l1[-1][k], l2[-1][k]
+                    assert (
+                        from_scratch == resumed
+                    ), f"difference at {k} {from_scratch} != {resumed}"
+
+    def test_memory_efficient_fp16(self):
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_memory_efficient_fp16") as data_dir:
+                create_dummy_data(data_dir)
+                preprocess_translation_data(data_dir)
+                train_translation_model(
+                    data_dir, "fconv_iwslt_de_en", ["--memory-efficient-fp16"]
+                )
+                generate_main(data_dir)
+
+    def test_transformer_fp16(self):
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_transformer") as data_dir:
+                create_dummy_data(data_dir)
+                preprocess_translation_data(data_dir)
+                train_translation_model(
+                    data_dir,
+                    "transformer_iwslt_de_en",
+                    [
+                        "--encoder-layers",
+                        "2",
+                        "--decoder-layers",
+                        "2",
+                        "--encoder-embed-dim",
+                        "64",
+                        "--decoder-embed-dim",
+                        "64",
+                        "--fp16",
+                    ],
+                    run_validation=True,
+                )
+                generate_main(data_dir)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+    def test_amp(self):
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_amp") as data_dir:
+                create_dummy_data(data_dir)
+                preprocess_translation_data(data_dir)
+                train_translation_model(data_dir, "fconv_iwslt_de_en", ["--amp"])
+                generate_main(data_dir)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+    def test_transformer_amp(self):
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_transformer") as data_dir:
+                create_dummy_data(data_dir)
+                preprocess_translation_data(data_dir)
+                train_translation_model(
+                    data_dir,
+                    "transformer_iwslt_de_en",
+                    [
+                        "--encoder-layers",
+                        "2",
+                        "--decoder-layers",
+                        "2",
+                        "--encoder-embed-dim",
+                        "64",
+                        "--decoder-embed-dim",
+                        "64",
+                        "--amp",
+                    ],
+                    run_validation=True,
+                )
+                generate_main(data_dir)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+    def test_levenshtein_transformer(self):
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory(
+                "test_levenshtein_transformer"
+            ) as data_dir:
+                create_dummy_data(data_dir)
+                preprocess_translation_data(data_dir, ["--joined-dictionary"])
+                train_translation_model(
+                    data_dir,
+                    "levenshtein_transformer",
+                    [
+                        "--apply-bert-init",
+                        "--early-exit",
+                        "6,6,6",
+                        "--criterion",
+                        "nat_loss",
+                    ],
+                    task="translation_lev",
+                )
+                gen_config = [
+                    "--task",
+                    "translation_lev",
+                    "--iter-decode-max-iter",
+                    "9",
+                    "--iter-decode-eos-penalty",
+                    "0",
+                    "--print-step",
+                ]
+                # non-ensemble generation
+                generate_main(data_dir, gen_config)
+                # ensemble generation
+                generate_main(
+                    data_dir,
+                    gen_config,
+                    path=os.pathsep.join(
+                        [
+                            os.path.join(data_dir, "checkpoint_last.pt"),
+                            os.path.join(data_dir, "checkpoint_last.pt"),
+                        ]
+                    ),
+                )
+
+    def test_fsdp_checkpoint_generate(self):  # train with --ddp-backend fully_sharded, then run generate_main on the same directory
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir:
+                log = os.path.join(data_dir, "train.log")
+                create_dummy_data(data_dir)
+                preprocess_translation_data(data_dir)
+                world_size = min(torch.cuda.device_count(), 2)  # NOTE(review): 0 when no CUDA device is visible; no skipIf guard is visible on this method — confirm
+                train_translation_model(
+                    data_dir,
+                    "fconv_iwslt_de_en",
+                    ["--log-file", log, "--ddp-backend", "fully_sharded"],
+                    world_size=world_size,
+                )
+                generate_main(data_dir)
+                assert os.path.exists(log)  # the file passed via --log-file must exist after training
+
+    def test_fsdp_sharded_checkpoint_generate(self):  # same as the unsharded FSDP test, but trains with --use-sharded-state
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir:
+                log = os.path.join(data_dir, "train.log")
+                create_dummy_data(data_dir)
+                preprocess_translation_data(data_dir)
+                world_size = min(torch.cuda.device_count(), 2)  # NOTE(review): 0 when no CUDA device is visible; no skipIf guard is visible on this method — confirm
+                train_translation_model(
+                    data_dir,
+                    "fconv_iwslt_de_en",
+                    [
+                        "--log-file",
+                        log,
+                        "--ddp-backend",
+                        "fully_sharded",
+                        "--use-sharded-state",
+                    ],
+                    world_size=world_size,
+                )
+                generate_main(data_dir, ["--checkpoint-shard-count", str(world_size)])  # shard count is set to the training world size
+                assert os.path.exists(log)  # the file passed via --log-file must exist after training
+
+
+def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False):
+    train_parser = options.get_training_parser()
+    train_args = options.parse_args_and_arch(
+        train_parser,
+        [
+            "--task",
+            "language_modeling",
+            data_dir,
+            "--arch",
+            arch,
+            "--optimizer",
+            "adam",
+            "--lr",
+            "0.0001",
+            "--criterion",
+            "adaptive_loss",
+            "--adaptive-softmax-cutoff",
+            "5,10,15",
+            "--max-tokens",
+            "500",
+            "--tokens-per-sample",
+            "500",
+            "--save-dir",
+            data_dir,
+            "--max-epoch",
+            "1",
+            "--no-progress-bar",
+            "--distributed-world-size",
+            "1",
+            "--ddp-backend",
+            "no_c10d",
+            "--num-workers",
+            "0",
+        ]
+        + (extra_flags or []),
+    )
+    train.main(train_args)
+
+    # try scalar quantization
+    scalar_quant_train_parser = options.get_training_parser()
+    scalar_quant_train_args = options.parse_args_and_arch(
+        scalar_quant_train_parser,
+        [
+            "--task",
+            "language_modeling",
+            data_dir,
+            "--arch",
+            arch,
+            "--optimizer",
+            "adam",
+            "--lr",
+            "0.0001",
+            "--criterion",
+            "adaptive_loss",
+            "--adaptive-softmax-cutoff",
+            "5,10,15",
+            "--max-tokens",
+            "500",
+            "--tokens-per-sample",
+            "500",
+            "--save-dir",
+            data_dir,
+            "--max-update",
+            "3",
+            "--no-progress-bar",
+            "--distributed-world-size",
+            "1",
+            "--ddp-backend",
+            "no_c10d",
+            "--num-workers",
+            "0",
+            "--quant-noise-scalar",
+            "0.5",
+        ]
+        + (extra_flags or []),
+    )
+    train.main(scalar_quant_train_args)
+
+    # try iterative PQ quantization
+    quantize_parser = options.get_training_parser()
+    quantize_args = options.parse_args_and_arch(
+        quantize_parser,
+        [
+            "--task",
+            "language_modeling",
+            data_dir,
+            "--arch",
+            arch,
+            "--optimizer",
+            "adam",
+            "--lr",
+            "0.0001",
+            "--criterion",
+            "adaptive_loss",
+            "--adaptive-softmax-cutoff",
+            "5,10,15",
+            "--max-tokens",
+            "50",
+            "--tokens-per-sample",
+            "50",
+            "--max-update",
+            "6",
+            "--no-progress-bar",
+            "--distributed-world-size",
+            "1",
+            "--ddp-backend",
+            "no_c10d",
+            "--num-workers",
+            "0",
+            "--restore-file",
+            os.path.join(data_dir, "checkpoint_last.pt"),
+            "--reset-optimizer",
+            "--quantization-config-path",
+            os.path.join(
+                os.path.dirname(__file__), "transformer_quantization_config.yaml"
+            ),
+        ]
+        + (extra_flags or []),
+    )
+    train.main(quantize_args)
+
+
+@unittest.skipIf(
+    int(torch.__version__[2]) < 10, reason="quantized kernels are only supported on CPU"
+)  # NOTE(review): __version__[2] is a single character, so this is < 10 for any digit and the class is always skipped — confirm intent
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestQuantization(unittest.TestCase):  # runs _quantize_language_model end to end on dummy LM data
+    def setUp(self):
+        logging.disable(logging.CRITICAL)  # silence logging for the duration of each test
+
+    def tearDown(self):
+        logging.disable(logging.NOTSET)  # re-enable logging
+
+    def test_quantization(self):
+        with contextlib.redirect_stdout(StringIO()):  # discard anything the helpers print to stdout
+            with tempfile.TemporaryDirectory("test_quantization") as data_dir:
+                create_dummy_data(data_dir)
+                preprocess_lm_data(data_dir)
+                # tests both scalar and iterative PQ quantization
+                _quantize_language_model(data_dir, "transformer_lm")
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestOptimizersGPU(unittest.TestCase):
+    def setUp(self):
+        logging.disable(logging.CRITICAL)
+
+    def tearDown(self):
+        logging.disable(logging.NOTSET)
+
+    def test_flat_grads(self):
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_flat_grads") as data_dir:
+                # Use just a bit of data and tiny model to keep this test runtime reasonable
+                create_dummy_data(data_dir, num_examples=10, maxlen=5)
+                preprocess_translation_data(data_dir)
+                with self.assertRaises(RuntimeError):
+                    # adafactor isn't compatible with flat grads, which
+                    # are used by default with --fp16
+                    train_translation_model(
+                        data_dir,
+                        "lstm",
+                        [
+                            "--required-batch-size-multiple",
+                            "1",
+                            "--encoder-layers",
+                            "1",
+                            "--encoder-hidden-size",
+                            "32",
+                            "--decoder-layers",
+                            "1",
+                            "--optimizer",
+                            "adafactor",
+                            "--fp16",
+                        ],
+                    )
+                # but it should pass once we set --fp16-no-flatten-grads
+                train_translation_model(
+                    data_dir,
+                    "lstm",
+                    [
+                        "--required-batch-size-multiple",
+                        "1",
+                        "--encoder-layers",
+                        "1",
+                        "--encoder-hidden-size",
+                        "32",
+                        "--decoder-layers",
+                        "1",
+                        "--optimizer",
+                        "adafactor",
+                        "--fp16",
+                        "--fp16-no-flatten-grads",
+                    ],
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/gpu/test_ema_gpu.py b/fairseq/tests/gpu/test_ema_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..33fb5607b41261191c55cdaf9268bc6aed2de7c3
--- /dev/null
+++ b/fairseq/tests/gpu/test_ema_gpu.py
@@ -0,0 +1,215 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+from fairseq.models.ema import EMA
+
+
+class DummyModule(torch.nn.Module):
+    def __init__(self) -> None:
+        """Build a minimal two-layer model used by the EMA tests in this file.
+
+        Takes no arguments. The submodules are:
+            layer: torch.nn.Linear(in_features=32, out_features=2)
+            another_layer: torch.nn.Linear(in_features=2, out_features=2), applied to the output of ``layer``.
+        """
+        super().__init__()
+        self.layer = torch.nn.Linear(in_features=32, out_features=2)
+        self.another_layer = torch.nn.Linear(in_features=2, out_features=2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.layer(x)
+        return self.another_layer(x)
+
+
+@dataclass
+class EMAConfig(object):  # stand-in config object handed to EMA(model, config) by the tests below
+    ema_decay: float = 0.99  # the tests expect ema = ema_decay * previous + (1 - ema_decay) * current
+    ema_start_update: int = 0  # presumably the update count at which EMA starts — TODO confirm against fairseq.models.ema
+    ema_fp32: bool = False  # the tests expect ema.fp32_params to be empty when False and populated when True
+    ema_seed_model: Optional[str] = None  # presumably a checkpoint path used to seed the EMA model — TODO confirm
+    ema_update_freq: int = 1  # presumably the number of updates between EMA steps — TODO confirm
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestEMAGPU(unittest.TestCase):
+    def assertTorchAllClose(self, x, y, atol=1e-8, rtol=1e-5, msg=None):
+        diff = x.float() - y.float()
+        diff_norm = torch.norm(diff)
+        other_norm = torch.norm(y.float())
+
+        if msg is None:
+            msg = "|input - other| > {} + {} * |other|".format(atol, rtol)
+
+        self.assertLessEqual(
+            diff_norm,
+            atol + rtol * other_norm,
+            msg=msg,
+        )
+
+    def test_ema(self):
+        model = DummyModule().cuda()
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+        state = deepcopy(model.state_dict())
+        config = EMAConfig()
+        ema = EMA(model, config)
+
+        # set decay
+        ema._set_decay(config.ema_decay)
+        self.assertEqual(ema.get_decay(), config.ema_decay)
+
+        # get model
+        self.assertEqual(ema.get_model(), ema.model)
+
+        # Since fp32 params is not used, it should be of size 0
+        self.assertEqual(len(ema.fp32_params), 0)
+
+        # EMA step
+        x = torch.randn(32).cuda()
+        y = model(x)
+        loss = y.sum()
+        loss.backward()
+        optimizer.step()
+
+        ema.step(model)
+
+        ema_state_dict = ema.get_model().state_dict()
+
+        for key, param in model.state_dict().items():
+            prev_param = state[key]
+            ema_param = ema_state_dict[key]
+
+            if "version" in key:
+                # Do not decay a model.version pytorch param
+                continue
+            self.assertTorchAllClose(
+                ema_param,
+                config.ema_decay * prev_param + (1 - config.ema_decay) * param,
+            )
+
+        # Since fp32 params is not used, it should be of size 0
+        self.assertEqual(len(ema.fp32_params), 0)
+
+        # Load EMA into model
+        model2 = DummyModule().cuda()
+        ema.reverse(model2)
+
+        for key, param in model2.state_dict().items():
+            ema_param = ema_state_dict[key]
+            self.assertTrue(torch.allclose(ema_param, param))
+
+    def test_ema_fp32(self):
+        model = DummyModule().cuda().half()
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+        state = deepcopy(model.state_dict())
+        config = EMAConfig(ema_fp32=True)
+        ema = EMA(model, config)
+
+        x = torch.randn(32).cuda()
+        y = model(x.half())
+        loss = y.sum()
+        loss.backward()
+        optimizer.step()
+
+        ema.step(model)
+
+        for key, param in model.state_dict().items():
+            prev_param = state[key]
+            ema_param = ema.get_model().state_dict()[key]
+
+            if "version" in key:
+                # Do not decay a model.version pytorch param
+                continue
+            self.assertIn(key, ema.fp32_params)
+
+            # EMA update is done in fp32, and hence the EMA param must be
+            # closer to the EMA update done in fp32 than in fp16.
+            self.assertLessEqual(
+                torch.norm(
+                    ema_param.float()
+                    - (
+                        config.ema_decay * prev_param.float()
+                        + (1 - config.ema_decay) * param.float()
+                    )
+                    .half()
+                    .float()
+                ),
+                torch.norm(
+                    ema_param.float()
+                    - (
+                        config.ema_decay * prev_param + (1 - config.ema_decay) * param
+                    ).float()
+                ),
+            )
+            self.assertTorchAllClose(
+                ema_param,
+                (
+                    config.ema_decay * prev_param.float()
+                    + (1 - config.ema_decay) * param.float()
+                ).half(),
+            )
+
+    def test_ema_fp16(self):
+        model = DummyModule().cuda().half()
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+        state = deepcopy(model.state_dict())
+        config = EMAConfig(ema_fp32=False)
+        ema = EMA(model, config)
+
+        # Since fp32 params is not used, it should be of size 0
+        self.assertEqual(len(ema.fp32_params), 0)
+
+        x = torch.randn(32).cuda()
+        y = model(x.half())
+        loss = y.sum()
+        loss.backward()
+        optimizer.step()
+
+        ema.step(model)
+
+        for key, param in model.state_dict().items():
+            prev_param = state[key]
+            ema_param = ema.get_model().state_dict()[key]
+
+            if "version" in key:
+                # Do not decay a model.version pytorch param
+                continue
+
+            # EMA update is done in fp16, and hence the EMA param must be
+            # closer to the EMA update done in fp16 than in fp32.
+            self.assertLessEqual(
+                torch.norm(
+                    ema_param.float()
+                    - (
+                        config.ema_decay * prev_param + (1 - config.ema_decay) * param
+                    ).float()
+                ),
+                torch.norm(
+                    ema_param.float()
+                    - (
+                        config.ema_decay * prev_param.float()
+                        + (1 - config.ema_decay) * param.float()
+                    )
+                    .half()
+                    .float()
+                ),
+            )
+            self.assertTorchAllClose(
+                ema_param,
+                config.ema_decay * prev_param + (1 - config.ema_decay) * param,
+            )
+
+        # Since fp32 params is not used, it should be of size 0
+        self.assertEqual(len(ema.fp32_params), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/gpu/transformer_quantization_config.yaml b/fairseq/tests/gpu/transformer_quantization_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..de31d8116ced675b81eb74119642217d768e7736
--- /dev/null
+++ b/fairseq/tests/gpu/transformer_quantization_config.yaml
@@ -0,0 +1,28 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This file defines example configuration arguments for quantizing
+# a transformer model with product quantization
+
+n_centroids:
+    Linear:
+        key: in_features
+        value: {"*": 8}
+    Embedding:
+        key: embedding_dim
+        value: {"*": 8}
+
+block_sizes:
+  Linear:
+      key: fuzzy_name
+      value: {fc: 8, attn: 4, emb: 4}
+  Embedding:
+      key: fuzzy_name
+      value: {emb: 8}
+
+layers_to_quantize:
+    - decoder\\.layers\\.\d+\\.fc[12]
+    - decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01]
+    - decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj)
diff --git a/fairseq/tests/speech/__init__.py b/fairseq/tests/speech/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dba99e4d933c6b04792bc42f7ab6b873ec17608c
--- /dev/null
+++ b/fairseq/tests/speech/__init__.py
@@ -0,0 +1,210 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from argparse import Namespace
+import os
+import re
+import unittest
+from pathlib import Path
+from tqdm import tqdm
+from typing import List, Dict, Optional
+import torch
+from fairseq.checkpoint_utils import load_model_ensemble_and_task
+from fairseq.scoring.wer import WerScorer
+from fairseq.scoring.bleu import SacrebleuScorer
+from fairseq import utils
+import zipfile
+
+S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq"
+
+
+class TestFairseqSpeech(unittest.TestCase):
+    @classmethod
+    def download(cls, base_url: str, out_root: Path, filename: str):
+        url = f"{base_url}/{filename}"
+        path = out_root / filename
+        if not path.exists():
+            torch.hub.download_url_to_file(url, path.as_posix(), progress=True)
+        return path
+
+    def _set_up(self, dataset_id: str, s3_dir: str, data_filenames: List[str]):
+        self.use_cuda = torch.cuda.is_available()
+        self.root = Path.home() / ".cache" / "fairseq" / dataset_id
+        self.root.mkdir(exist_ok=True, parents=True)
+        os.chdir(self.root)
+        self.base_url = (
+            s3_dir if re.search("^https:", s3_dir) else f"{S3_BASE_URL}/{s3_dir}"
+        )
+        for filename in data_filenames:
+            self.download(self.base_url, self.root, filename)
+
+    def set_up_librispeech(self):
+        self._set_up(
+            "librispeech",
+            "s2t/librispeech",
+            [
+                "cfg_librispeech.yaml",
+                "spm_librispeech_unigram10000.model",
+                "spm_librispeech_unigram10000.txt",
+                "librispeech_test-other.tsv",
+                "librispeech_test-other.zip",
+            ],
+        )
+
+    def set_up_ljspeech(self):
+        self._set_up(
+            "ljspeech",
+            "s2/ljspeech",
+            [
+                "cfg_ljspeech_g2p.yaml",
+                "ljspeech_g2p_gcmvn_stats.npz",
+                "ljspeech_g2p.txt",
+                "ljspeech_test.tsv",
+                "ljspeech_test.zip",
+            ],
+        )
+
+    def set_up_sotasty_es_en(self):
+        self._set_up(
+            "sotasty_es_en",
+            "s2t/big/es-en",
+            [
+                "cfg_es_en.yaml",
+                "spm_bpe32768_es_en.model",
+                "spm_bpe32768_es_en.txt",
+                "sotasty_es_en_test_ted.tsv",
+                "sotasty_es_en_test_ted.zip",
+            ],
+        )
+
+    def set_up_mustc_de_fbank(self):
+        self._set_up(
+            "mustc_de_fbank",
+            "https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de",
+            [
+                "config.yaml",
+                "spm.model",
+                "dict.txt",
+                "src_dict.txt",
+                "tgt_dict.txt",
+                "tst-COMMON.tsv",
+                "tst-COMMON.zip",
+            ],
+        )
+
+    def download_and_load_checkpoint(
+        self,
+        checkpoint_filename: str,
+        arg_overrides: Optional[Dict[str, str]] = None,
+        strict: bool = True,
+    ):
+        """Download a checkpoint into self.root, load it, and return (models, cfg, task, generator)."""
+        path = self.download(self.base_url, self.root, checkpoint_filename)
+        # Build a fresh dict so the caller's arg_overrides is never mutated; "data" is always set to self.root.
+        _arg_overrides = {**(arg_overrides or {}), "data": self.root.as_posix()}
+        models, cfg, task = load_model_ensemble_and_task(
+            [path.as_posix()], arg_overrides=_arg_overrides, strict=strict
+        )
+        if self.use_cuda:
+            for model in models:
+                model.cuda()
+        return models, cfg, task, self.build_generator(task, models, cfg)
+
+    def build_generator(
+        self,
+        task,
+        models,
+        cfg,
+    ):
+        return task.build_generator(models, cfg)
+
+    @classmethod
+    def get_batch_iterator(cls, task, test_split, max_tokens, max_positions):
+        task.load_dataset(test_split)
+        return task.get_batch_iterator(
+            dataset=task.dataset(test_split),
+            max_tokens=max_tokens,
+            max_positions=max_positions,
+            num_workers=1,
+        ).next_epoch_itr(shuffle=False)
+
+    @classmethod
+    def get_wer_scorer(
+        cls, tokenizer="none", lowercase=False, remove_punct=False, char_level=False
+    ):
+        scorer_args = {
+            "wer_tokenizer": tokenizer,
+            "wer_lowercase": lowercase,
+            "wer_remove_punct": remove_punct,
+            "wer_char_level": char_level,
+        }
+        return WerScorer(Namespace(**scorer_args))
+
+    @classmethod
+    def get_bleu_scorer(cls, tokenizer="13a", lowercase=False, char_level=False):
+        scorer_args = {
+            "sacrebleu_tokenizer": tokenizer,
+            "sacrebleu_lowercase": lowercase,
+            "sacrebleu_char_level": char_level,
+        }
+        return SacrebleuScorer(Namespace(**scorer_args))
+
+    @torch.no_grad()
+    def base_test(
+        self,
+        ckpt_name,
+        reference_score,
+        score_delta=0.3,
+        dataset="librispeech_test-other",
+        max_tokens=65_536,
+        max_positions=(4_096, 1_024),
+        arg_overrides=None,
+        strict=True,
+        score_type="wer",
+    ):
+        models, _, task, generator = self.download_and_load_checkpoint(
+            ckpt_name, arg_overrides=arg_overrides, strict=strict
+        )
+        if not self.use_cuda:
+            return
+
+        batch_iterator = self.get_batch_iterator(
+            task, dataset, max_tokens, max_positions
+        )
+        if score_type == "bleu":
+            scorer = self.get_bleu_scorer()
+        elif score_type == "wer":
+            scorer = self.get_wer_scorer()
+        else:
+            raise Exception(f"Unsupported score type {score_type}")
+
+        progress = tqdm(enumerate(batch_iterator), total=len(batch_iterator))
+        for batch_idx, sample in progress:
+            sample = utils.move_to_cuda(sample) if self.use_cuda else sample
+            hypo = task.inference_step(generator, models, sample)
+            for i, sample_id in enumerate(sample["id"].tolist()):
+                tgt_str, hypo_str = self.postprocess_tokens(
+                    task,
+                    sample["target"][i, :],
+                    hypo[i][0]["tokens"].int().cpu(),
+                )
+                if batch_idx == 0 and i < 3:
+                    print(f"T-{sample_id} {tgt_str}")
+                    print(f"H-{sample_id} {hypo_str}")
+                scorer.add_string(tgt_str, hypo_str)
+
+        print(scorer.result_string() + f" (reference: {reference_score})")
+        self.assertAlmostEqual(scorer.score(), reference_score, delta=score_delta)
+
+    def postprocess_tokens(self, task, target, hypo_tokens):
+        tgt_tokens = utils.strip_pad(target, task.tgt_dict.pad()).int().cpu()
+        tgt_str = task.tgt_dict.string(tgt_tokens, "sentencepiece")
+        hypo_str = task.tgt_dict.string(hypo_tokens, "sentencepiece")
+        return tgt_str, hypo_str
+
+    def unzip_files(self, zip_file_name):
+        """Extract self.root/<zip_file_name> into the directory self.root/<zip_file_name without a trailing ".zip">."""
+        with zipfile.ZipFile(self.root / zip_file_name, "r") as zip_ref:
+            zip_ref.extractall(self.root / re.sub(r"\.zip\Z", "", zip_file_name))  # not str.strip(".zip"): that strips a character set from both ends
diff --git a/fairseq/tests/speech/test_convtransformer_simul_trans.py b/fairseq/tests/speech/test_convtransformer_simul_trans.py
new file mode 100644
index 0000000000000000000000000000000000000000..0562404f52ab2dd26386c28e4f8b673557924737
--- /dev/null
+++ b/fairseq/tests/speech/test_convtransformer_simul_trans.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from tests.speech import TestFairseqSpeech
+
+S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq/"
+
+
+class TestConvtransformerSimulTrans(TestFairseqSpeech):
+    def setUp(self):
+        self._set_up(
+            "simul",
+            "speech_tests/simul",
+            ["config_gcmvn_specaug.yaml", "dict.txt", "dev.tsv"],
+        )
+
+    def test_waitk_checkpoint(self):
+        """Only test model loading since fairseq currently doesn't support inference of simultaneous models"""
+        _, _, _, _ = self.download_and_load_checkpoint(
+            "checkpoint_best.pt",
+            arg_overrides={
+                "config_yaml": "config_gcmvn_specaug.yaml",
+                "load_pretrained_encoder_from": None,
+            },
+        )
+        return
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech/test_dual_input_wav_transformer.py b/fairseq/tests/speech/test_dual_input_wav_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3581bc1991485d08269891873aa4c8ec375ae034
--- /dev/null
+++ b/fairseq/tests/speech/test_dual_input_wav_transformer.py
@@ -0,0 +1,76 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from collections import namedtuple
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+import fairseq
+from fairseq import utils
+from fairseq.checkpoint_utils import load_model_ensemble_and_task
+from fairseq.scoring.bleu import SacrebleuScorer
+from fairseq.tasks import import_tasks
+from tests.speech import S3_BASE_URL, TestFairseqSpeech
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestLibrispeechDualInputWavTransformer(TestFairseqSpeech):
+    def setUp(self):
+        dataset_id = "librispeech_wvtrasnformer"
+        base_url = "https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned"
+        data_filenames = [
+            "checkpoint_ave_10.pt",
+            "spm.model",
+            "src_dict.txt",
+            "tgt_dict.txt",
+            "config.yaml",
+        ]
+        self._set_up(
+            dataset_id,
+            "s2t",
+            [
+                "librispeech_flac_test-other.tsv",
+                "librispeech_flac_test-other.zip",
+            ],
+        )
+        for filename in data_filenames:
+            self.download(base_url, self.root, filename)
+
+    def import_user_module(self):
+        user_dir = (
+            Path(fairseq.__file__).parent.parent / "examples/speech_text_joint_to_text"
+        )
+        Arg = namedtuple("Arg", ["user_dir"])
+        arg = Arg(user_dir.__str__())
+        utils.import_user_module(arg)
+
+    @torch.no_grad()
+    def test_librispeech_dualinput_wav_transformer_checkpoint(self):
+        self.import_user_module()
+        checkpoint_filename = "checkpoint_ave_10.pt"
+        arg_overrides = {
+            "config_yaml": "config.yaml",
+            "load_pretrained_speech_text_encoder": "",
+            "load_pretrained_speech_text_decoder": "",
+            "beam": 10,
+            "nbest": 1,
+            "lenpen": 1.0,
+            "load_speech_only": True,
+        }
+        self.base_test(
+            checkpoint_filename,
+            4.6,
+            dataset="librispeech_flac_test-other",
+            max_tokens=800000,
+            max_positions=(800000, 1024),
+            arg_overrides=arg_overrides,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech/test_dualinput_s2t_transformer.py b/fairseq/tests/speech/test_dualinput_s2t_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..76675b98238fe738627eb88c5d43591bb7957db4
--- /dev/null
+++ b/fairseq/tests/speech/test_dualinput_s2t_transformer.py
@@ -0,0 +1,110 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from argparse import Namespace
+from collections import namedtuple
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+import fairseq
+from fairseq import utils
+from fairseq.checkpoint_utils import load_model_ensemble_and_task
+from fairseq.scoring.bleu import SacrebleuScorer
+from fairseq.tasks import import_tasks
+from tests.speech import TestFairseqSpeech
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestDualInputS2TTransformer(TestFairseqSpeech):
+    def setUp(self):
+        self.set_up_mustc_de_fbank()
+
+    def import_user_module(self):
+        user_dir = (
+            Path(fairseq.__file__).parent.parent / "examples/speech_text_joint_to_text"
+        )
+        Arg = namedtuple("Arg", ["user_dir"])
+        arg = Arg(user_dir.__str__())
+        utils.import_user_module(arg)
+
+    @torch.no_grad()
+    def test_mustc_de_fbank_dualinput_s2t_transformer_checkpoint(self):
+        self.import_user_module()
+        checkpoint_filename = "checkpoint_ave_10.pt"
+        path = self.download(self.base_url, self.root, checkpoint_filename)
+        models, cfg, task = load_model_ensemble_and_task(
+            [path.as_posix()],
+            arg_overrides={
+                "data": self.root.as_posix(),
+                "config_yaml": "config.yaml",
+                "load_pretrain_speech_encoder": "",
+                "load_pretrain_text_encoder_last": "",
+                "load_pretrain_decoder": "",
+                "beam": 10,
+                "nbest": 1,
+                "lenpen": 1.0,
+                "load_speech_only": True,
+            },
+        )
+        if self.use_cuda:
+            for model in models:
+                model.cuda()
+        generator = task.build_generator(models, cfg)
+        test_split = "tst-COMMON"
+        task.load_dataset(test_split)
+        batch_iterator = task.get_batch_iterator(
+            dataset=task.dataset(test_split),
+            max_tokens=250_000,
+            max_positions=(10_000, 1_024),
+            num_workers=1,
+        ).next_epoch_itr(shuffle=False)
+
+        tokenizer = task.build_tokenizer(cfg.tokenizer)
+        bpe = task.build_bpe(cfg.bpe)
+
+        def decode_fn(x):
+            if bpe is not None:
+                x = bpe.decode(x)
+            if tokenizer is not None:
+                x = tokenizer.decode(x)
+            return x
+
+        scorer_args = {
+            "sacrebleu_tokenizer": "13a",
+            "sacrebleu_lowercase": False,
+            "sacrebleu_char_level": False,
+        }
+        scorer = SacrebleuScorer(Namespace(**scorer_args))
+        progress = tqdm(enumerate(batch_iterator), total=len(batch_iterator))
+        for batch_idx, sample in progress:
+            sample = utils.move_to_cuda(sample) if self.use_cuda else sample
+            hypo = task.inference_step(generator, models, sample)
+            for i, sample_id in enumerate(sample["id"].tolist()):
+                tgt_tokens = (
+                    utils.strip_pad(sample["target"][i, :], task.tgt_dict.pad())
+                    .int()
+                    .cpu()
+                )
+
+                tgt_str = task.tgt_dict.string(tgt_tokens, "sentencepiece")
+                hypo_str = task.tgt_dict.string(
+                    hypo[i][0]["tokens"].int().cpu(), "sentencepiece"
+                )
+                if batch_idx == 0 and i < 3:
+                    print(f"T-{sample_id} {tgt_str}")
+                    print(f"D-{sample_id} {hypo_str}")
+                scorer.add_string(tgt_str, hypo_str)
+        reference_bleu = 27.3
+        result = scorer.result_string()
+        print(result + f" (reference: {reference_bleu})")
+        res_bleu = float(result.split()[2])
+        self.assertAlmostEqual(res_bleu, reference_bleu, delta=0.3)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech/test_fastspeech2.py b/fairseq/tests/speech/test_fastspeech2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7150a3bda25b3da2f0b4326de2cd074013198cde
--- /dev/null
+++ b/fairseq/tests/speech/test_fastspeech2.py
@@ -0,0 +1,53 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from tqdm import tqdm
+
+from fairseq import utils
+from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion
+from tests.speech import TestFairseqSpeech
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestFastSpeech2(TestFairseqSpeech):
+    def setUp(self):
+        self.set_up_ljspeech()
+
+    @torch.no_grad()
+    def test_ljspeech_fastspeech2_checkpoint(self):  # mean-MCD regression check
+        models, cfg, task, generator = self.download_and_load_checkpoint(
+            "ljspeech_fastspeech2_g2p.pt",
+            arg_overrides={
+                "config_yaml": "cfg_ljspeech_g2p.yaml",
+                "vocoder": "griffin_lim",
+                "fp16": False,
+            },
+        )
+
+        batch_iterator = self.get_batch_iterator(task, "ljspeech_test", 65_536, 4_096)
+        progress = tqdm(batch_iterator, total=len(batch_iterator))
+        mcd, n_samples = 0.0, 0  # running distortion sum and number of samples seen
+        for sample in progress:
+            sample = utils.move_to_cuda(sample) if self.use_cuda else sample
+            hypos = generator.generate(models[0], sample, has_targ=True)  # first model only
+            rets = batch_mel_cepstral_distortion(
+                [hypo["targ_waveform"] for hypo in hypos],
+                [hypo["waveform"] for hypo in hypos],
+                sr=task.sr,
+            )
+            mcd += sum(d.item() for d, _ in rets)  # each ret is a pair; its first item is summed
+            n_samples += len(sample["id"].tolist())  # number of ids in this batch
+
+        mcd = round(mcd / n_samples, 1)  # mean, rounded to one decimal before comparing
+        reference_mcd = 3.2
+        print(f"MCD: {mcd} (reference: {reference_mcd})")
+        self.assertAlmostEqual(mcd, reference_mcd, delta=0.1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech/test_s2s_transformer.py b/fairseq/tests/speech/test_s2s_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..180f46307cd61d7fa932c5eff84d74c04a5fd0aa
--- /dev/null
+++ b/fairseq/tests/speech/test_s2s_transformer.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from tests.speech import TestFairseqSpeech
+from fairseq import utils
+
+S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq/"
+
+
+class TestS2STransformer(TestFairseqSpeech):
+    def setUp(self):
+        self._set_up(
+            "s2s",
+            "speech_tests/s2s",
+            [  # file names handed to _set_up (presumably fixtures to fetch; confirm)
+                "dev_shuf200.tsv",
+                "src_feat.zip",
+                "config_specaug_lb.yaml",
+                "vocoder",
+                "vocoder_config.json",
+            ],
+        )
+
+    def test_s2s_transformer_checkpoint(self):  # score_type "bleu" vs. reference_score
+        self.base_test(
+            ckpt_name="s2u_transformer_reduced_fisher.pt",
+            reference_score=38.3,
+            dataset="dev_shuf200",
+            arg_overrides={
+                "config_yaml": "config_specaug_lb.yaml",
+                "multitask_config_yaml": None,
+                "target_is_code": True,
+                "target_code_size": 100,
+                "eval_inference": False,
+            },
+            score_type="bleu",
+            strict=False,
+        )
+
+    def postprocess_tokens(self, task, target, hypo_tokens):  # returns (tgt_str, hypo_str)
+        tgt_tokens = utils.strip_pad(target, task.tgt_dict.pad()).int().cpu()  # drop padding
+        tgt_str = task.tgt_dict.string(tgt_tokens)
+        hypo_str = task.tgt_dict.string(hypo_tokens)
+        return tgt_str, hypo_str
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech/test_s2t_conformer.py b/fairseq/tests/speech/test_s2t_conformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aaa4a0ed68ae33cd9f5c8cba75025da17c78d0f
--- /dev/null
+++ b/fairseq/tests/speech/test_s2t_conformer.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from tests.speech import TestFairseqSpeech
+
+
+class TestS2TConformer(TestFairseqSpeech):
+    def setUp(self):
+        self.set_up_librispeech()
+
+    def test_librispeech_s2t_conformer_s_checkpoint(self):
+        self.base_test(  # metric and tolerance come from base_test defaults (not visible here)
+            ckpt_name="librispeech_conformer_rel_pos_s.pt",
+            reference_score=12,
+            arg_overrides={"config_yaml": "cfg_librispeech.yaml"},
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech/test_s2t_transformer.py b/fairseq/tests/speech/test_s2t_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..172f5484a0a930535702a1be5db7b9d7c490e902
--- /dev/null
+++ b/fairseq/tests/speech/test_s2t_transformer.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from tests.speech import TestFairseqSpeech
+
+
+class TestS2TTransformer(TestFairseqSpeech):
+    def setUp(self):
+        self.set_up_librispeech()
+
+    def test_librispeech_s2t_transformer_s_checkpoint(self):
+        self.base_test(  # metric and tolerance come from base_test defaults (not visible here)
+            ckpt_name="librispeech_transformer_s.pt",
+            reference_score=9,
+            arg_overrides={"config_yaml": "cfg_librispeech.yaml"},
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech/test_tts_transformer.py b/fairseq/tests/speech/test_tts_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6330c60772fa89747c98fc61143be9097e1ea18
--- /dev/null
+++ b/fairseq/tests/speech/test_tts_transformer.py
@@ -0,0 +1,53 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from tqdm import tqdm
+
+from fairseq import utils
+from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion
+from tests.speech import TestFairseqSpeech
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestTTSTransformer(TestFairseqSpeech):
+    def setUp(self):
+        self.set_up_ljspeech()
+
+    @torch.no_grad()
+    def test_ljspeech_tts_transformer_checkpoint(self):  # mean-MCD regression check
+        models, cfg, task, generator = self.download_and_load_checkpoint(
+            "ljspeech_transformer_g2p.pt",
+            arg_overrides={
+                "config_yaml": "cfg_ljspeech_g2p.yaml",
+                "vocoder": "griffin_lim",
+                "fp16": False,
+            },
+        )
+
+        batch_iterator = self.get_batch_iterator(task, "ljspeech_test", 65_536, 1024)
+        progress = tqdm(batch_iterator, total=len(batch_iterator))
+        mcd, n_samples = 0.0, 0  # running distortion sum and number of samples seen
+        for sample in progress:
+            sample = utils.move_to_cuda(sample) if self.use_cuda else sample
+            hypos = generator.generate(models[0], sample, has_targ=True)  # first model only
+            rets = batch_mel_cepstral_distortion(
+                [hypo["targ_waveform"] for hypo in hypos],
+                [hypo["waveform"] for hypo in hypos],
+                sr=task.sr,
+            )
+            mcd += sum(d.item() for d, _ in rets)  # each ret is a pair; its first item is summed
+            n_samples += len(sample["id"].tolist())  # number of ids in this batch
+
+        mcd = round(mcd / n_samples, 1)  # mean, rounded to one decimal before comparing
+        reference_mcd = 3.3
+        print(f"MCD: {mcd} (reference: {reference_mcd})")
+        self.assertAlmostEqual(mcd, reference_mcd, delta=0.1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech/test_wav2vec2.py b/fairseq/tests/speech/test_wav2vec2.py
new file mode 100644
index 0000000000000000000000000000000000000000..eff6114c8e11be99800ae3d435d2ddcc2a7e65d3
--- /dev/null
+++ b/fairseq/tests/speech/test_wav2vec2.py
@@ -0,0 +1,90 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+import torch
+from tests.speech import TestFairseqSpeech
+from fairseq.data.data_utils import post_process
+from fairseq import utils
+from omegaconf import open_dict
+
+S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq"
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestWav2Vec2(TestFairseqSpeech):
+    def setUp(self):
+        self._set_up(
+            "librispeech_w2v2",
+            "conformer/wav2vec2/librispeech",
+            [  # file names handed to _set_up (presumably fixtures to fetch; confirm)
+                "test_librispeech-other.ltr",
+                "test_librispeech-other.tsv",
+                "test_librispeech-other_small.ltr_100",
+                "test_librispeech-other_small.tsv",
+                "test-other.zip",
+                "dict.ltr.txt",
+                "dict.ltr_100.txt",
+            ],
+        )
+        self.unzip_files(
+            "test-other.zip",
+        )
+
+    def test_transformer_w2v2(self):
+        self.base_test(
+            ckpt_name="transformer_oss_small_100h.pt",
+            reference_score=38,
+            score_delta=1,
+            dataset="test_librispeech-other",
+            max_tokens=1000000,
+            max_positions=(700000, 1000),
+            arg_overrides={
+                "task": "audio_finetuning",
+                "labels": "ltr",
+                "nbest": 1,
+                "tpu": False,
+            },
+            strict=False,
+        )
+
+    def test_conformer_w2v2(self):
+        self.base_test(
+            ckpt_name="conformer_LS_PT_LS_FT_rope.pt",
+            reference_score=4.5,
+            score_delta=1,
+            dataset="test_librispeech-other_small",
+            max_tokens=1000000,
+            max_positions=(700000, 1000),
+            arg_overrides={
+                "task": "audio_finetuning",
+                "labels": "ltr_100",
+                "nbest": 1,
+                "tpu": False,
+            },
+            strict=True,
+        )
+
+    def build_generator(self, task, models, cfg):  # `models` is not used here
+        try:
+            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder
+        except Exception:  # NOTE(review): any import-time failure lands here, not only ImportError
+            raise Exception("Cannot run this test without flashlight dependency")
+        with open_dict(cfg):  # cfg.nbest is assigned inside open_dict
+            cfg.nbest = 1
+        return W2lViterbiDecoder(cfg, task.target_dictionary)
+
+    def postprocess_tokens(self, task, target, hypo_tokens):  # returns (tgt_str, hypo_str)
+        tgt_tokens = utils.strip_pad(target, task.target_dictionary.pad()).int().cpu()
+        tgt_str = task.target_dictionary.string(tgt_tokens)
+        tgt_str = post_process(tgt_str, "letter")
+
+        hypo_pieces = task.target_dictionary.string(hypo_tokens)
+        hypo_str = post_process(hypo_pieces, "letter")  # same "letter" post-processing as the target
+        return tgt_str, hypo_str
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech/test_xm_transformer.py b/fairseq/tests/speech/test_xm_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a5509415110c5da9b29456f5e47e45fc7cd0677
--- /dev/null
+++ b/fairseq/tests/speech/test_xm_transformer.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from tests.speech import TestFairseqSpeech
+
+
+class TestXMTransformer(TestFairseqSpeech):
+    def setUp(self):
+        self.set_up_sotasty_es_en()
+
+    # TODO: investigate the increased BLEU score (30.42 -> 31.74)
+    def test_sotasty_es_en_600m_checkpoint(self):
+        self.base_test(
+            ckpt_name="xm_transformer_600m_es_en_md.pt",
+            reference_score=31.74,
+            score_delta=0.2,
+            max_tokens=3_000_000,
+            max_positions=(1_000_000, 1_024),
+            dataset="sotasty_es_en_test_ted",
+            arg_overrides={"config_yaml": "cfg_es_en.yaml"},
+            score_type="bleu",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/speech_recognition/__init__.py b/fairseq/tests/speech_recognition/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/tests/speech_recognition/asr_test_base.py b/fairseq/tests/speech_recognition/asr_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c5d414e7bf17ee02f280d024fa5d07e28b79d6b
--- /dev/null
+++ b/fairseq/tests/speech_recognition/asr_test_base.py
@@ -0,0 +1,557 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import unittest
+from inspect import currentframe, getframeinfo
+
+import numpy as np
+import torch
+from examples.speech_recognition.data.data_utils import lengths_to_encoder_padding_mask
+from fairseq.data import data_utils as fairseq_data_utils
+from fairseq.data.dictionary import Dictionary
+from fairseq.models import (
+    BaseFairseqModel,
+    FairseqDecoder,
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqEncoderModel,
+    FairseqModel,
+)
+from fairseq.tasks.fairseq_task import LegacyFairseqTask
+
+
+DEFAULT_TEST_VOCAB_SIZE = 100
+
+
+# ///////////////////////////////////////////////////////////////////////////
+# utility functions to set up dummy dict/task/input
+# ///////////////////////////////////////////////////////////////////////////
+
+
+def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE):
+    """Return a Dictionary to which `vocab_size` dummy symbols "0", "1", ... were added."""
+    dummy_dict = Dictionary()
+    for symbol_idx in range(vocab_size):  # plain range(); no longer shadows builtin `id`
+        dummy_dict.add_symbol(str(symbol_idx), 1000)
+    return dummy_dict
+
+
+class DummyTask(LegacyFairseqTask):
+    def __init__(self, args):
+        super().__init__(args)
+        self.dictionary = get_dummy_dictionary()
+        if getattr(self.args, "ctc", False):  # a missing "ctc" attribute counts as False
+            self.dictionary.add_symbol("<ctc_blank>")
+        self.tgt_dict = self.dictionary  # same object, exposed under a second name
+
+    @property
+    def target_dictionary(self):  # returns the single shared dummy dictionary
+        return self.dictionary
+
+
+def get_dummy_task_and_parser():
+    """
+    To build a fairseq model, we need a dummy parser and task. This function
+    creates a dummy task and parser to facilitate model/criterion tests.
+
+    Note: DummyTask is used as the dummy task. To use another task, provide
+    another function like this one. Returns the tuple (task, parser).
+    """
+    parser = argparse.ArgumentParser(
+        description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS
+    )
+    DummyTask.add_args(parser)
+    args = parser.parse_args([])  # parse an empty command line
+    task = DummyTask.setup_task(args)
+    return task, parser
+
+
+def get_dummy_input(T=100, D=80, B=5, K=100):
+    forward_input = {}
+    # T: max sequence length
+    # D: feature vector dimension
+    # B: batch size
+    # K: target dimension size (exclusive upper bound of the random token ids)
+    feature = torch.randn(B, T, D)
+    # this (B, T, D) layout is just a convention; you can override it by
+    # writing your own _prepare_forward_input function
+    src_lengths = torch.from_numpy(
+        np.random.randint(low=1, high=T, size=B, dtype=np.int64)
+    )
+    src_lengths[0] = T  # make sure the maximum length matches
+    prev_output_tokens = []
+    for b in range(B):
+        token_length = np.random.randint(low=1, high=src_lengths[b].item() + 1)
+        tokens = np.random.randint(low=0, high=K, size=token_length, dtype=np.int64)
+        prev_output_tokens.append(torch.from_numpy(tokens))
+
+    prev_output_tokens = fairseq_data_utils.collate_tokens(
+        prev_output_tokens,
+        pad_idx=1,
+        eos_idx=2,
+        left_pad=False,
+        move_eos_to_beginning=False,
+    )
+    src_lengths, sorted_order = src_lengths.sort(descending=True)
+    forward_input["src_tokens"] = feature.index_select(0, sorted_order)
+    forward_input["src_lengths"] = src_lengths
+    forward_input["prev_output_tokens"] = prev_output_tokens  # NOTE(review): not reordered by sorted_order
+
+    return forward_input
+
+
+def get_dummy_encoder_output(encoder_out_shape=(100, 80, 5)):
+    """
+    Example generator for a dummy encoder output dict; the shape is read as (T, B, D)
+    """
+    (T, B, D) = encoder_out_shape  # NOTE(review): default gives B=80, D=5; get_dummy_input defaults to B=5, D=80
+    encoder_out = {}
+
+    encoder_out["encoder_out"] = torch.from_numpy(
+        np.random.randn(*encoder_out_shape).astype(np.float32)
+    )
+    seq_lengths = torch.from_numpy(np.random.randint(low=1, high=T, size=B))
+    # some dummy mask: True at positions t >= seq_lengths[b]
+    encoder_out["encoder_padding_mask"] = torch.arange(T).view(1, T).expand(
+        B, -1
+    ) >= seq_lengths.view(B, 1).expand(-1, T)
+    encoder_out["encoder_padding_mask"].t_()  # in-place transpose: (B, T) -> (T, B)
+
+    # encoder_padding_mask is a (T, B) tensor whose (t, b)-th element indicates
+    # whether encoder_out[t, b] is valid (=0) or not (=1)
+    return encoder_out
+
+
+def _current_postion_info():
+    """Return " (at <file>:<line>)": this function's file basename and the caller's line."""
+    frame = currentframe()
+    file_name = os.path.basename(getframeinfo(frame).filename)
+    caller_line = frame.f_back.f_lineno
+    return " (at {}:{})".format(file_name, caller_line)
+
+
+def check_encoder_output(encoder_output, batch_size=None):
+    """Validate an encoder output; returns (True, None) or (False, <error message>).
+    The output must be a dict with the following key/value pairs:
+    - encoder_out: a float32 torch.Tensor
+    - encoder_padding_mask: None, or a 2-d uint8/bool torch.Tensor of shape (T, B)
+    """
+    if not isinstance(encoder_output, dict):
+        msg = (
+            "FairseqEncoderModel.forward(...) must be a dict" + _current_postion_info()
+        )
+        return False, msg
+
+    if "encoder_out" not in encoder_output:
+        msg = (
+            "FairseqEncoderModel.forward(...) must contain encoder_out"
+            + _current_postion_info()
+        )
+        return False, msg
+
+    if "encoder_padding_mask" not in encoder_output:
+        msg = (
+            "FairseqEncoderModel.forward(...) must contain encoder_padding_mask"
+            + _current_postion_info()
+        )
+        return False, msg
+
+    if not isinstance(encoder_output["encoder_out"], torch.Tensor):
+        msg = "encoder_out must be a torch.Tensor" + _current_postion_info()
+        return False, msg
+
+    if encoder_output["encoder_out"].dtype != torch.float32:
+        msg = "encoder_out must have float32 dtype" + _current_postion_info()
+        return False, msg
+
+    mask = encoder_output["encoder_padding_mask"]
+    if mask is not None:  # the key must exist, but its value may be None
+        if not isinstance(mask, torch.Tensor):
+            msg = (
+                "encoder_padding_mask must be a torch.Tensor" + _current_postion_info()
+            )
+            return False, msg
+        if mask.dtype != torch.uint8 and (
+            not hasattr(torch, "bool") or mask.dtype != torch.bool
+        ):  # bool is accepted only when this torch build defines torch.bool
+            msg = (
+                "encoder_padding_mask must have dtype of uint8 or bool"
+                + _current_postion_info()
+            )
+            return False, msg
+
+        if mask.dim() != 2:
+            msg = (
+                "we expect encoder_padding_mask to be a 2-d tensor, in shape (T, B)"
+                + _current_postion_info()
+            )
+            return False, msg
+
+        if batch_size is not None and mask.size(1) != batch_size:
+            msg = (
+                "we expect encoder_padding_mask to be a 2-d tensor, with size(1)"
+                + " being the batch size"
+                + _current_postion_info()
+            )
+            return False, msg
+    return True, None
+
+
+def check_decoder_output(decoder_output):
+    """Validate a decoder output; returns (True, None) or (False, <error message>).
+    The output must be a 2-element tuple where:
+    - the first element is a torch.Tensor
+    - the second element can be anything (reserved for future use)"""
+    if not isinstance(decoder_output, tuple):
+        msg = "FairseqDecoder output must be a tuple" + _current_postion_info()
+        return False, msg
+
+    if len(decoder_output) != 2:
+        msg = "FairseqDecoder output must be 2-elem tuple" + _current_postion_info()
+        return False, msg
+
+    if not isinstance(decoder_output[0], torch.Tensor):
+        msg = (
+            "FairseqDecoder output[0] must be a torch.Tensor" + _current_postion_info()
+        )
+        return False, msg
+
+    return True, None
+
+
+# ///////////////////////////////////////////////////////////////////////////
+# Base Test class
+# ///////////////////////////////////////////////////////////////////////////
+
+
+class TestBaseFairseqModelBase(unittest.TestCase):
+    """
+    This class is used to facilitate writing unit tests for any class derived from
+    `BaseFairseqModel`. The base class itself is skipped; only subclasses run.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        if cls is TestBaseFairseqModelBase:
+            raise unittest.SkipTest("Skipping test case in base")
+        super().setUpClass()
+
+    def setUpModel(self, model):  # takes a model instance and stores it on self.model
+        self.assertTrue(isinstance(model, BaseFairseqModel))
+        self.model = model
+
+    def setupInput(self):  # NOTE(review): subclasses in this file define `setUpInput`; confirm the spelling
+        pass
+
+    def setUp(self):
+        self.model = None
+        self.forward_input = None
+        pass
+
+
+class TestFairseqEncoderDecoderModelBase(TestBaseFairseqModelBase):
+    """
+    Tests for a FairseqEncoderDecoderModel (formerly known as
+    `FairseqModel`) must be derived from this base class
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        if cls is TestFairseqEncoderDecoderModelBase:
+            raise unittest.SkipTest("Skipping test case in base")
+        super().setUpClass()
+
+    def setUpModel(self, model_cls, extra_args_setters=None):
+        self.assertTrue(
+            issubclass(model_cls, (FairseqEncoderDecoderModel, FairseqModel)),
+            msg="This class only tests for FairseqModel subclasses",
+        )
+
+        task, parser = get_dummy_task_and_parser()
+        model_cls.add_args(parser)
+
+        args = parser.parse_args([])  # parse an empty command line
+
+        if extra_args_setters is not None:
+            for args_setter in extra_args_setters:  # each setter is called with args
+                args_setter(args)
+        model = model_cls.build_model(args, task)
+        self.model = model
+
+    def setUpInput(self, input=None):
+        self.forward_input = get_dummy_input() if input is None else input
+
+    def setUp(self):
+        super().setUp()
+
+    def test_forward(self):
+        if self.model and self.forward_input:  # body is skipped unless both were set up
+            forward_output = self.model.forward(**self.forward_input)
+            # for FairseqEncoderDecoderModel, forward returns a tuple of two
+            # elements, the first of which is a torch.Tensor
+            succ, msg = check_decoder_output(forward_output)
+            if not succ:
+                self.assertTrue(succ, msg=msg)
+            self.forward_output = forward_output
+
+    def test_get_normalized_probs(self):
+        if self.model and self.forward_input:  # body is skipped unless both were set up
+            forward_output = self.model.forward(**self.forward_input)
+            logprob = self.model.get_normalized_probs(forward_output, log_probs=True)
+            prob = self.model.get_normalized_probs(forward_output, log_probs=False)
+
+            # in order for different models/criterions to work with each other
+            # we need to know whether the logprob or prob output is batch_first
+            # or not. We assume an additional attribute is attached to logprob
+            # or prob. If your code fails here, simply override
+            # FairseqModel.get_normalized_probs, see example at
+            # https://fburl.com/batch_first_example
+            self.assertTrue(hasattr(logprob, "batch_first"))
+            self.assertTrue(hasattr(prob, "batch_first"))
+
+            self.assertTrue(torch.is_tensor(logprob))
+            self.assertTrue(torch.is_tensor(prob))
+
+
+class TestFairseqEncoderModelBase(TestBaseFairseqModelBase):
+    """
+    Base class for tests of a FairseqEncoderModel; only subclasses run
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        if cls is TestFairseqEncoderModelBase:
+            raise unittest.SkipTest("Skipping test case in base")
+        super().setUpClass()
+
+    def setUpModel(self, model_cls, extra_args_setters=None):
+        self.assertTrue(
+            issubclass(model_cls, FairseqEncoderModel),
+            msg="This class is only used for testing FairseqEncoderModel",
+        )
+        task, parser = get_dummy_task_and_parser()
+        model_cls.add_args(parser)
+        args = parser.parse_args([])  # parse an empty command line
+        if extra_args_setters is not None:
+            for args_setter in extra_args_setters:  # each setter is called with args
+                args_setter(args)
+
+        model = model_cls.build_model(args, task)
+        self.model = model
+
+    def setUpInput(self, input=None):
+        self.forward_input = get_dummy_input() if input is None else input
+        # get_dummy_input() was originally written for s2s; the extra dict item is
+        # removed here (in place, also on a caller-supplied dict) for encoder-only use
+        self.forward_input.pop("prev_output_tokens", None)
+
+    def setUp(self):
+        super().setUp()
+
+    def test_forward(self):
+        if self.forward_input and self.model:  # body is skipped unless both were set up
+            bsz = self.forward_input["src_tokens"].size(0)
+            forward_output = self.model.forward(**self.forward_input)
+
+            # we expect forward_output to be a dict with the following
+            # key/value pairs:
+            # - encoder_out: a torch.Tensor
+            # - encoder_padding_mask: a binary torch.Tensor
+            succ, msg = check_encoder_output(forward_output, batch_size=bsz)
+            if not succ:
+                self.assertTrue(succ, msg=msg)
+            self.forward_output = forward_output
+
+    def test_get_normalized_probs(self):
+        if self.model and self.forward_input:  # body is skipped unless both were set up
+            forward_output = self.model.forward(**self.forward_input)
+            logprob = self.model.get_normalized_probs(forward_output, log_probs=True)
+            prob = self.model.get_normalized_probs(forward_output, log_probs=False)
+
+            # in order for different models/criterions to work with each other
+            # we need to know whether the logprob or prob output is batch_first
+            # or not. We assume an additional attribute is attached to logprob
+            # or prob. If your code fails here, simply override
+            # FairseqModel.get_normalized_probs, see example at
+            # https://fburl.com/batch_first_example
+            self.assertTrue(hasattr(logprob, "batch_first"))
+            self.assertTrue(hasattr(prob, "batch_first"))
+
+            self.assertTrue(torch.is_tensor(logprob))
+            self.assertTrue(torch.is_tensor(prob))
+
+
+class TestFairseqEncoderBase(unittest.TestCase):
+    """
+    Base class for tests of a FairseqEncoder; only subclasses run
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        if cls is TestFairseqEncoderBase:
+            raise unittest.SkipTest("Skipping test case in base")
+        super().setUpClass()
+
+    def setUpEncoder(self, encoder):  # takes an encoder instance, not a class
+        self.assertTrue(
+            isinstance(encoder, FairseqEncoder),
+            msg="This class is only used for test FairseqEncoder",
+        )
+        self.encoder = encoder
+
+    def setUpInput(self, input=None):
+        self.forward_input = get_dummy_input() if input is None else input
+        # get_dummy_input() was originally written for s2s; the extra dict item is
+        # removed here (in place, also on a caller-supplied dict) for encoder-only use
+        self.forward_input.pop("prev_output_tokens", None)
+
+    def setUp(self):
+        self.encoder = None
+        self.forward_input = None
+
+    def test_forward(self):
+        if self.encoder and self.forward_input:  # body is skipped unless both were set up
+            bsz = self.forward_input["src_tokens"].size(0)
+
+            forward_output = self.encoder.forward(**self.forward_input)
+            succ, msg = check_encoder_output(forward_output, batch_size=bsz)
+            if not succ:
+                self.assertTrue(succ, msg=msg)
+            self.forward_output = forward_output
+
+
+class TestFairseqDecoderBase(unittest.TestCase):
+    """
+    Base class for tests of a FairseqDecoder; only subclasses run
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        if cls is TestFairseqDecoderBase:
+            raise unittest.SkipTest("Skipping test case in base")
+        super().setUpClass()
+
+    def setUpDecoder(self, decoder):  # takes a decoder instance, not a class
+        self.assertTrue(
+            isinstance(decoder, FairseqDecoder),
+            msg="This class is only used for test FairseqDecoder",
+        )
+        self.decoder = decoder
+
+    def setUpInput(self, input=None):  # here the "input" is an encoder output dict
+        self.forward_input = get_dummy_encoder_output() if input is None else input
+
+    def setUpPrevOutputTokens(self, tokens=None):
+        if tokens is None:
+            self.encoder_input = get_dummy_input()
+            self.prev_output_tokens = self.encoder_input["prev_output_tokens"]
+        else:
+            self.prev_output_tokens = tokens
+
+    def setUp(self):
+        self.decoder = None
+        self.forward_input = None
+        self.prev_output_tokens = None
+
+    def test_forward(self):
+        if (  # body is skipped unless decoder, input and tokens were all set up
+            self.decoder is not None
+            and self.forward_input is not None
+            and self.prev_output_tokens is not None
+        ):
+            forward_output = self.decoder.forward(
+                prev_output_tokens=self.prev_output_tokens,
+                encoder_out=self.forward_input,
+            )
+            succ, msg = check_decoder_output(forward_output)
+            if not succ:
+                self.assertTrue(succ, msg=msg)
+            self.forward_output = forward_output  # keep forward_input intact, like the sibling bases
+
+
+class DummyEncoderModel(FairseqEncoderModel):
+    def __init__(self, encoder):
+        super().__init__(encoder)
+
+    @classmethod
+    def build_model(cls, args, task):
+        return cls(DummyEncoder())  # args and task are not used
+
+    def get_logits(self, net_output):
+        # Apply the inverse of sigmoid, log(p / (1 - p)), to encoder_out for use
+        # with BinaryCrossEntropyWithLogitsCriterion, because
+        # F.binary_cross_entropy_with_logits combines sigmoid and CE
+        encoder_out = net_output["encoder_out"]
+        return torch.log(torch.div(encoder_out, 1 - encoder_out))
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        probs = super().get_normalized_probs(net_output, log_probs, sample=sample)
+        probs.batch_first = True  # attribute attached to the returned tensor
+        return probs
+
+
+class DummyEncoder(FairseqEncoder):
+    def __init__(self):
+        super().__init__(None)
+
+    def forward(self, src_tokens, src_lengths):  # echoes src_tokens as encoder_out
+        padding_mask, _ = lengths_to_encoder_padding_mask(src_lengths)
+        return dict(encoder_out=src_tokens, encoder_padding_mask=padding_mask)
+
+
+class CrossEntropyCriterionTestBase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        # only subclasses run; the base class itself is skipped
+        if cls is CrossEntropyCriterionTestBase:
+            raise unittest.SkipTest("Skipping base class test case")
+        super().setUpClass()
+
+    def setUpArgs(self):
+        # threshold is there to use with BinaryCrossEntropyWithLogitsCriterion
+        return argparse.Namespace(sentence_avg=False, threshold=0.1)
+
+    def setUp(self):
+        # self.criterion_cls is not defined on this class; it must be set before this runs
+        args = self.setUpArgs()
+        self.model = DummyEncoderModel(encoder=DummyEncoder())
+        self.criterion = self.criterion_cls.build_criterion(args, task=DummyTask(args))
+
+    def get_src_tokens(self, correct_prediction, aggregate):
+        """
+        Build the fake net_output (src_tokens): one-hot over 2 classes.
+
+        correct_prediction: True puts the 1.0 at class index 0, False at index 1
+        aggregate: True returns shape (2, 2) (aggregated across the time axis),
+        False returns shape (2, 10, 2)
+        """
+        predicted_idx = 0 if correct_prediction else 1
+        shape = (2, 2) if aggregate else (2, 10, 2)
+        src_tokens = torch.zeros(shape, dtype=torch.float)
+        # set the chosen class to 1.0 for every batch entry (and time step)
+        src_tokens[..., predicted_idx] = 1.0
+        return src_tokens
+
+    def get_target(self, soft_target):
+        """
+        soft_target: True returns a (2, 2) float tensor with column 0 set to 1.0,
+        False returns a (2, 10) long tensor of zeros
+        """
+        if not soft_target:
+            return torch.zeros((2, 10), dtype=torch.long)
+        target = torch.zeros((2, 2), dtype=torch.float)
+        target[:, 0] = 1.0
+        return target
+
+    def get_test_sample(self, correct, soft_target, aggregate):
+        """Bundle src_tokens and target into a sample dict for the criterion."""
+        src_tokens = self.get_src_tokens(correct, aggregate)
+        bsz, dim1 = src_tokens.size(0), src_tokens.size(1)
+        # NOTE(review): src_lengths holds a single entry although src_tokens has 2 rows
+        net_input = {"src_tokens": src_tokens, "src_lengths": torch.tensor([dim1])}
+        return {
+            "net_input": net_input,
+            "target": self.get_target(soft_target),
+            "ntokens": bsz * dim1,
+        }
diff --git a/fairseq/tests/speech_recognition/test_cross_entropy.py b/fairseq/tests/speech_recognition/test_cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..b05400ed95e22762c3e3e5e8fd3ebfa6caf1e325
--- /dev/null
+++ b/fairseq/tests/speech_recognition/test_cross_entropy.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from examples.speech_recognition.criterions.cross_entropy_acc import (
+    CrossEntropyWithAccCriterion,
+)
+
+from .asr_test_base import CrossEntropyCriterionTestBase
+
+
+class CrossEntropyWithAccCriterionTest(CrossEntropyCriterionTestBase):
+    """Checks the counters logged by CrossEntropyWithAccCriterion."""
+
+    def setUp(self):
+        # must be assigned before the base setUp, which builds the criterion
+        self.criterion_cls = CrossEntropyWithAccCriterion
+        super().setUp()
+
+    def _check_counters(self, correct, expected_correct):
+        """Score a hard-target, non-aggregated sample and verify the log."""
+        sample = self.get_test_sample(correct, soft_target=False, aggregate=False)
+        _, _, logged = self.criterion(self.model, sample, "sum", log_probs=True)
+        assert logged["correct"] == expected_correct
+        # the sample is 2 x 10 wide, so 20 tokens are scored in total
+        assert logged["total"] == 20
+        assert logged["sample_size"] == 20
+        assert logged["ntokens"] == 20
+
+    def test_cross_entropy_all_correct(self):
+        self._check_counters(correct=True, expected_correct=20)
+
+    def test_cross_entropy_all_wrong(self):
+        # the expected totals are the same; only the hit count drops to zero
+        self._check_counters(correct=False, expected_correct=0)
diff --git a/fairseq/tests/speech_recognition/test_vggtransformer.py b/fairseq/tests/speech_recognition/test_vggtransformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4dc73b8c7379970dc0bcc16fcb088a64a1bd7e3b
--- /dev/null
+++ b/fairseq/tests/speech_recognition/test_vggtransformer.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+
+# import models/encoder/decoder to be tested
+from examples.speech_recognition.models.vggtransformer import (
+    TransformerDecoder,
+    VGGTransformerEncoder,
+    VGGTransformerModel,
+    vggtransformer_1,
+    vggtransformer_2,
+    vggtransformer_base,
+)
+
+# import base test class
+from .asr_test_base import (
+    DEFAULT_TEST_VOCAB_SIZE,
+    TestFairseqDecoderBase,
+    TestFairseqEncoderBase,
+    TestFairseqEncoderDecoderModelBase,
+    get_dummy_dictionary,
+    get_dummy_encoder_output,
+    get_dummy_input,
+)
+
+
+class VGGTransformerModelTest_mid(TestFairseqEncoderDecoderModelBase):
+    """Model test for the vggtransformer_1 architecture with a shortened encoder."""
+
+    def setUp(self):
+        super().setUp()
+
+        def use_three_layers(args):
+            # vggtransformer_1 uses 14 transformer layers, which is too
+            # expensive for a test; cut the encoder down to 3 layers for a
+            # fast turn-around
+            args.transformer_enc_config = (
+                "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3"
+            )
+
+        # argument setters: the architecture defaults plus the override above
+        self.setUpModel(VGGTransformerModel, [vggtransformer_1, use_three_layers])
+        self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE))
+
+
+class VGGTransformerModelTest_big(TestFairseqEncoderDecoderModelBase):
+    """Model test for the vggtransformer_2 architecture with a shortened encoder."""
+
+    def setUp(self):
+        super().setUp()
+
+        def use_three_layers(args):
+            # vggtransformer_2 uses 16 transformer layers, which is too
+            # expensive for a test; cut the encoder down to 3 layers for a
+            # fast turn-around
+            args.transformer_enc_config = (
+                "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3"
+            )
+
+        # argument setters: the architecture defaults plus the override above
+        self.setUpModel(VGGTransformerModel, [vggtransformer_2, use_three_layers])
+        self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE))
+
+
+class VGGTransformerModelTest_base(TestFairseqEncoderDecoderModelBase):
+    """Model test for the vggtransformer_base architecture, encoder shortened."""
+
+    def setUp(self):
+        super().setUp()
+
+        def use_three_layers(args):
+            # vggtransformer_base uses 12 transformer layers, which is too
+            # expensive for a test; cut the encoder down to 3 layers for a
+            # fast turn-around
+            args.transformer_enc_config = (
+                "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 3"
+            )
+
+        # argument setters: the architecture defaults plus the override above
+        self.setUpModel(VGGTransformerModel, [vggtransformer_base, use_three_layers])
+        self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE))
+
+
+class VGGTransformerEncoderTest(TestFairseqEncoderBase):
+    """Forward-pass test of VGGTransformerEncoder under several context setups."""
+
+    def setUp(self):
+        super().setUp()
+
+        self.setUpInput(get_dummy_input(T=50, D=80, B=5))
+
+    def test_forward(self):
+        """
+        Run the base-class forward check once per encoder configuration.
+
+        Every configuration uses 80 input features per channel; only the
+        transformer context / sampling options differ. The base check is
+        invoked after each ``setUpEncoder`` call, including the last one,
+        so the windowed-context encoder is exercised and not merely built.
+        """
+        scenarios = [
+            ("1. test standard vggtransformer", {}),
+            (
+                "2. test vggtransformer with limited right context",
+                {"transformer_context": (-1, 5)},
+            ),
+            (
+                "3. test vggtransformer with limited left context",
+                {"transformer_context": (5, -1)},
+            ),
+            (
+                "4. test vggtransformer with limited right context and sampling",
+                {"transformer_context": (-1, 12), "transformer_sampling": (2, 2)},
+            ),
+            (
+                "5. test vggtransformer with windowed context and sampling",
+                {"transformer_context": (12, 12), "transformer_sampling": (2, 2)},
+            ),
+        ]
+        for banner, extra_kwargs in scenarios:
+            print(banner)
+            encoder = VGGTransformerEncoder(input_feat_per_channel=80, **extra_kwargs)
+            self.setUpEncoder(encoder)
+            super().test_forward()
+
+
+class TransformerDecoderTest(TestFairseqDecoderBase):
+    """Decoder test: TransformerDecoder fed a dummy (50, 5, 256) encoder output."""
+
+    def setUp(self):
+        super().setUp()
+
+        # decoder over a dummy dictionary of the default test vocabulary size
+        vocab = get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE)
+        self.setUpDecoder(TransformerDecoder(vocab))
+        self.setUpInput(get_dummy_encoder_output(encoder_out_shape=(50, 5, 256)))
+        self.setUpPrevOutputTokens()
diff --git a/fairseq/tests/tasks/test_multilingual_denoising.py b/fairseq/tests/tasks/test_multilingual_denoising.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0227f69b5747d461f2e52d586eab3eb4a9e8357
--- /dev/null
+++ b/fairseq/tests/tasks/test_multilingual_denoising.py
@@ -0,0 +1,98 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import unittest
+from tempfile import TemporaryDirectory
+
+from fairseq import options
+from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.tasks.multilingual_denoising import MultilingualDenoisingTask
+from tests.utils import build_vocab, make_data
+
+
+class TestMultilingualDenoising(unittest.TestCase):
+    def test_multilingual_denoising(self):
+        """
+        Binarize a small synthetic corpus, load it through the task, and
+        check that the tokens hidden behind ``<mask>`` in the noised source
+        are the same in the batch target and in the un-noised dataset.
+        """
+        with TemporaryDirectory() as dirname:
+            # prep input file
+            lang_dir = os.path.join(dirname, "en")
+            os.mkdir(lang_dir)
+            raw_file = os.path.join(lang_dir, "raw")
+            data = make_data(out_file=raw_file)
+            vocab = build_vocab(data)
+
+            # binarize
+            binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
+            split = "train"
+            bin_file = os.path.join(lang_dir, split)
+            dataset_impl = "mmap"
+            FileBinarizer.multiprocess_dataset(
+                input_file=raw_file,
+                binarizer=binarizer,
+                dataset_impl=dataset_impl,
+                vocab_size=len(vocab),
+                output_prefix=bin_file,
+            )
+
+            # setup task
+            # fmt: off
+            argv = [
+                "--task", "multilingual_denoising",
+                "--arch", "bart_base",
+                "--seed", "42",
+                "--mask-length", "word",
+                "--permute-sentences", "1",
+                "--rotate", "0",
+                "--replace-length", "-1",
+                "--mask", "0.2",
+                dirname,
+            ]
+            # fmt: on
+            args = options.parse_args_and_arch(options.get_training_parser(), argv)
+            cfg = convert_namespace_to_omegaconf(args)
+            task = MultilingualDenoisingTask(cfg.task, binarizer.dict)
+
+            # load datasets
+            original_dataset = task._load_dataset_split(bin_file, 1, False)
+            task.load_dataset(split)
+            masked_dataset = task.dataset(split)
+
+            iterator = task.get_batch_iterator(
+                dataset=masked_dataset,
+                max_tokens=65_536,
+                max_positions=4_096,
+            ).next_epoch_itr(shuffle=False)
+            mask_index = task.source_dictionary.index("<mask>")
+            for batch in iterator:
+                net_input = batch["net_input"]
+                # ``batch`` is a dict, so ``len(batch)`` would count its keys;
+                # the number of sentences is the length of the "id" entry
+                for sample in range(len(batch["id"])):
+                    masked_src_tokens = net_input["src_tokens"][sample]
+                    masked_src_length = net_input["src_lengths"][sample]
+                    masked_tgt_tokens = batch["target"][sample]
+
+                    sample_id = batch["id"][sample]
+                    # un-noised tokens at the positions masked in the source
+                    original_tokens = original_dataset[sample_id]
+                    original_tokens = original_tokens.masked_select(
+                        masked_src_tokens[:masked_src_length] == mask_index
+                    )
+                    # target tokens at the same masked positions
+                    masked_tokens = masked_tgt_tokens.masked_select(
+                        masked_src_tokens == mask_index
+                    )
+
+                    assert masked_tokens.equal(original_tokens)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/test_label_smoothing.py b/fairseq/tests/test_label_smoothing.py
new file mode 100644
index 0000000000000000000000000000000000000000..04c0f974ac80f7606327f868e948712c3c18f1d0
--- /dev/null
+++ b/fairseq/tests/test_label_smoothing.py
@@ -0,0 +1,123 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import copy
+import unittest
+
+import tests.utils as test_utils
+import torch
+from fairseq.criterions.cross_entropy import CrossEntropyCriterion
+from fairseq.criterions.label_smoothed_cross_entropy import (
+    LabelSmoothedCrossEntropyCriterion,
+)
+
+
+class TestLabelSmoothing(unittest.TestCase):
+    """
+    Sanity checks for LabelSmoothedCrossEntropyCriterion.
+
+    A fixed two-sentence batch is scored and the result is compared with
+    CrossEntropyCriterion, with per-item runs and with the unreduced loss.
+    """
+
+    def setUp(self):
+        # build dictionary: 4 special symbols followed by 3 dummy words
+        self.d = test_utils.dummy_dictionary(3)
+        self.assertEqual(len(self.d), 4 + 3)
+        self.assertEqual(self.d.pad(), 1)
+        self.assertEqual(self.d.eos(), 2)
+        self.assertEqual(self.d.unk(), 3)
+        eos, w1 = 2, 4  # the only two symbol ids the samples below need
+
+        # build dataset; the first item's target is shorter, so it is padded
+        short_pair = {
+            "source": torch.LongTensor([w1, eos]),
+            "target": torch.LongTensor([w1, eos]),
+        }
+        long_pair = {
+            "source": torch.LongTensor([w1, eos]),
+            "target": torch.LongTensor([w1, w1, eos]),
+        }
+        self.data = [short_pair, long_pair]
+        self.sample = next(test_utils.dummy_dataloader(self.data))
+
+        # build model; probs has one column per dictionary symbol and the
+        # same three rows are shared by both batch items via expand()
+        prob_rows = torch.FloatTensor(
+            [
+                # idx0  pad   eos  unk   w1   w2   w3
+                [0.05, 0.05, 0.1, 0.05, 0.3, 0.4, 0.05],
+                [0.05, 0.10, 0.2, 0.05, 0.2, 0.3, 0.10],
+                [0.05, 0.15, 0.3, 0.05, 0.1, 0.2, 0.15],
+            ]
+        )
+        self.args = argparse.Namespace(sentence_avg=False, report_accuracy=False)
+        self.args.probs = prob_rows.unsqueeze(0).expand(2, 3, 7)  # add batch dim
+        self.task = test_utils.TestTranslationTask.setup_task(self.args, self.d, self.d)
+        self.model = self.task.build_model(self.args)
+
+    def _nll_and_smoothed(self, eps):
+        """
+        Run both criterions with label_smoothing=eps; return the plain
+        cross-entropy output triple followed by the smoothed one.
+        """
+        self.args.label_smoothing = eps
+        # both criterions are built before either of them is run
+        nll_crit = CrossEntropyCriterion.build_criterion(self.args, self.task)
+        smooth_crit = LabelSmoothedCrossEntropyCriterion.build_criterion(
+            self.args, self.task
+        )
+        return nll_crit(self.model, self.sample), smooth_crit(self.model, self.sample)
+
+    def test_nll_loss(self):
+        """The smoothed criterion logs the plain NLL as ``nll_loss``."""
+        (nll_loss, _, nll_log), (_, _, smooth_log) = self._nll_and_smoothed(0.1)
+        self.assertLess(abs(nll_loss - nll_log["loss"]), 1e-6)
+        self.assertLess(abs(nll_loss - smooth_log["nll_loss"]), 1e-6)
+
+    def test_padding(self):
+        """The padded batch costs as much as its two items scored alone."""
+        self.args.label_smoothing = 0.1
+        crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task)
+        batch_loss, _, _ = crit(self.model, self.sample)
+
+        def loss_without_padding(idx):
+            # re-run on a one-item batch (nothing to pad) with that item's probs
+            single = next(test_utils.dummy_dataloader([self.data[idx]]))
+            single_args = copy.copy(self.args)
+            single_args.probs = single_args.probs[idx, :, :].unsqueeze(0)
+            single_model = self.task.build_model(single_args)
+            return crit(single_model, single)[0]
+
+        first, second = loss_without_padding(0), loss_without_padding(1)
+        self.assertAlmostEqual(batch_loss, first + second)
+
+    def test_reduction(self):
+        """Summing the unreduced loss reproduces the reduced loss."""
+        self.args.label_smoothing = 0.1
+        crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task)
+        reduced, _, _ = crit(self.model, self.sample, reduce=True)
+        unreduced, _, _ = crit(self.model, self.sample, reduce=False)
+        self.assertAlmostEqual(reduced, unreduced.sum())
+
+    def test_zero_eps(self):
+        """With eps == 0 the smoothed loss equals the plain cross entropy."""
+        (nll_loss, _, _), (smooth_loss, _, _) = self._nll_and_smoothed(0.0)
+        self.assertAlmostEqual(nll_loss, smooth_loss)
+
+    def assertAlmostEqual(self, t1, t2):
+        """
+        Tensor-aware replacement for ``TestCase.assertAlmostEqual``.
+
+        Passes when both tensors have the same size and their largest
+        element-wise absolute difference is below 1e-6.
+        """
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertLess((t1 - t2).abs().max(), 1e-6)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/test_memory_efficient_fp16.py b/fairseq/tests/test_memory_efficient_fp16.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bf2f29888d6027896128930626b1aafe7f18475
--- /dev/null
+++ b/fairseq/tests/test_memory_efficient_fp16.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import unittest
+
+import torch
+from fairseq.optim.adam import FairseqAdam
+from fairseq.optim.fp16_optimizer import MemoryEfficientFP16Optimizer
+from omegaconf import OmegaConf
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+class TestMemoryEfficientFP16(unittest.TestCase):
+    """State-dict round trip of MemoryEfficientFP16Optimizer (CUDA only)."""
+
+    def setUp(self):
+        logging.disable(logging.CRITICAL)  # mute all logging during the test
+
+    def tearDown(self):
+        logging.disable(logging.NOTSET)  # switch logging back on
+
+    def test_load_state_dict(self):
+        """
+        After ``load_state_dict`` the optimizer state must still be keyed by
+        FP16 tensors while every tensor stored in the state is FP32.
+        """
+        # define simple FP16 model
+        model = torch.nn.Linear(5, 5).cuda().half()
+        params = list(model.parameters())
+
+        # pseudo DictConfigs for the inner Adam and for the FP16 wrapper
+        adam_cfg = OmegaConf.create(
+            {
+                "adam_betas": "(0.9, 0.999)",
+                "adam_eps": 1e-8,
+                "weight_decay": 0.0,
+                "lr": [0.00001],
+            }
+        )
+        fp16_cfg = OmegaConf.create(
+            {
+                "common": {
+                    "fp16_init_scale": 1,
+                    "fp16_scale_window": 1,
+                    "fp16_scale_tolerance": 1,
+                    "threshold_loss_scale": 1,
+                    "min_loss_scale": 1e-4,
+                }
+            }
+        )
+
+        # initialize memory efficient FP16 optimizer around a plain Adam
+        optimizer = FairseqAdam(cfg=adam_cfg, params=params)
+        me_optimizer = MemoryEfficientFP16Optimizer(
+            cfg=fp16_cfg, params=params, optimizer=optimizer
+        )
+
+        # optimizer state is created in the first step
+        loss = model(torch.rand(5).cuda().half()).sum()
+        me_optimizer.backward(loss)
+        me_optimizer.step()
+
+        # reload state
+        me_optimizer.load_state_dict(me_optimizer.state_dict())
+        # dtype checks on the reloaded state
+        for param, param_state in me_optimizer.optimizer.state.items():
+            self.assertTrue(param.dtype == torch.float16)
+            stored = [entry for entry in param_state.values() if torch.is_tensor(entry)]
+            for tensor in stored:
+                self.assertTrue(tensor.dtype == torch.float32)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/test_metrics.py b/fairseq/tests/test_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc93b48088e3833914b142156e6de1002eda093b
--- /dev/null
+++ b/fairseq/tests/test_metrics.py
@@ -0,0 +1,77 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+import uuid
+
+from fairseq.logging import metrics
+
+
+class TestMetrics(unittest.TestCase):
+    """Nested, re-rooted and named ``metrics.aggregate`` contexts."""
+
+    def test_nesting(self):
+        """A value logged in a nested context also reaches the enclosing one."""
+        with metrics.aggregate() as outer:
+            metrics.log_scalar("loss", 1)
+            with metrics.aggregate() as inner:
+                metrics.log_scalar("loss", 2)
+        self.assertEqual(outer.get_smoothed_values()["loss"], 1.5)
+        self.assertEqual(inner.get_smoothed_values()["loss"], 2)
+
+    def test_new_root(self):
+        """``new_root=True`` hides inner values from the enclosing context."""
+        with metrics.aggregate() as outer:
+            metrics.log_scalar("loss", 1)
+            with metrics.aggregate(new_root=True) as inner:
+                metrics.log_scalar("loss", 2)
+        self.assertEqual(outer.get_smoothed_values()["loss"], 1)
+        self.assertEqual(inner.get_smoothed_values()["loss"], 2)
+
+    def test_nested_new_root(self):
+        """Each ``new_root`` level is isolated from the levels enclosing it."""
+        with metrics.aggregate() as layer1:
+            metrics.log_scalar("loss", 1)
+            with metrics.aggregate(new_root=True) as layer2:
+                metrics.log_scalar("loss", 2)
+                with metrics.aggregate() as layer3:
+                    metrics.log_scalar("loss", 3)
+                    with metrics.aggregate(new_root=True) as layer4:
+                        metrics.log_scalar("loss", 4)
+            metrics.log_scalar("loss", 1.5)
+        for agg, mean in ((layer4, 4), (layer3, 3), (layer2, 2.5), (layer1, 1.25)):
+            self.assertEqual(agg.get_smoothed_values()["loss"], mean)
+
+    def test_named(self):
+        """Re-entering a named context keeps accumulating into the same meters."""
+        agg_name = str(uuid.uuid4())
+        metrics.reset_meters(agg_name)
+
+        with metrics.aggregate(agg_name):
+            metrics.log_scalar("loss", 1)
+        metrics.log_scalar("loss", 3)  # logged outside the context: not counted
+        with metrics.aggregate(agg_name):
+            metrics.log_scalar("loss", 2)
+
+        self.assertEqual(metrics.get_smoothed_values(agg_name)["loss"], 1.5)
+
+    def test_nested_duplicate_names(self):
+        """A name that is already active is not counted twice when re-entered."""
+        agg_name = str(uuid.uuid4())
+        metrics.reset_meters(agg_name)
+
+        with metrics.aggregate(agg_name):
+            metrics.log_scalar("loss", 1)
+            with metrics.aggregate() as other, metrics.aggregate(agg_name):
+                metrics.log_scalar("loss", 2)
+            metrics.log_scalar("loss", 6)
+
+        # named meters saw 1, 2 and 6 once each; ``other`` only saw the 2
+        self.assertEqual(metrics.get_smoothed_values(agg_name)["loss"], 3)
+        self.assertEqual(other.get_smoothed_values()["loss"], 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/test_multi_corpus_dataset.py b/fairseq/tests/test_multi_corpus_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..79900abf615f37e3513710352022d547304ccdba
--- /dev/null
+++ b/fairseq/tests/test_multi_corpus_dataset.py
@@ -0,0 +1,82 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from collections import OrderedDict
+
+import torch
+
+from fairseq.data import LanguagePairDataset, TokenBlockDataset
+from fairseq.data.multi_corpus_dataset import MultiCorpusDataset
+from tests.test_train import mock_dict
+
+
+class TestMultiCorpusDataset(unittest.TestCase):
+    """Sampling proportions of MultiCorpusDataset over two disjoint corpora."""
+
+    @staticmethod
+    def _one_token_corpus(token_ids, dictionary):
+        """Wrap ``token_ids`` as a pair dataset split into blocks of size 1."""
+        tokens = torch.LongTensor(token_ids).view(1, -1)
+        blocks = TokenBlockDataset(
+            tokens,
+            sizes=[tokens.size(-1)],
+            block_size=1,
+            pad=0,
+            eos=1,
+            include_targets=False,
+        )
+        return LanguagePairDataset(blocks, blocks.sizes, dictionary, shuffle=False)
+
+    def setUp(self):
+        d = mock_dict()
+        # odd ids live only in dataset_1 and even ids only in dataset_2, so the
+        # parity of a sampled token tells which corpus it was drawn from
+        self.dataset_1 = self._one_token_corpus(list(range(1, 5000, 2)), d)
+        self.dataset_2 = self._one_token_corpus(list(range(0, 5000, 2)), d)
+
+    def _test_sample_helper(
+        self,
+        distribution,
+    ):
+        """
+        Sample one epoch with ``distribution`` and check the resulting mix.
+
+        Two properties are asserted: the share of samples coming from
+        dataset_1 is within 0.01 of its weight, and the number of distinct
+        sampled items matches the expected per-corpus contributions.
+
+        distribution: sampling weights of dataset_1 and dataset_2, in that order
+        """
+        m = MultiCorpusDataset(
+            OrderedDict({0: self.dataset_1, 1: self.dataset_2}),
+            distribution=distribution,
+            seed=0,
+            sort_indices=True,
+        )
+        m.set_epoch(1)
+        indices = m.ordered_indices()
+        # fetch every sampled item once, in index order
+        sampled = [m[i]["source"].item() for i in indices]
+        from_first = sum(1 for token in sampled if token % 2 == 1)
+
+        first_share = 1.0 * from_first / len(indices)
+        self.assertLess(abs(first_share - distribution[0]), 0.01)
+
+        # each corpus contributes at most its own size, so the second bound
+        # must use dataset_2 (both corpora are built from 2500 token ids)
+        self.assertEqual(
+            len(set(sampled)),
+            int(
+                min(len(self.dataset_1), len(indices) * distribution[0])
+                + min(len(self.dataset_2), len(indices) * distribution[1])
+            ),
+        )
+        print(distribution)
+
+    def test_multi_corpus_dataset(self):
+        """Run the check for balanced, skewed and one-sided weightings."""
+        for distribution in [[0.5, 0.5], [0.1, 0.9], [0.9, 0.1], [0.0, 1.0]]:
+            self._test_sample_helper(distribution=distribution)
diff --git a/fairseq/tests/test_multi_corpus_sampled_dataset.py b/fairseq/tests/test_multi_corpus_sampled_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..88f0817a54541a42b4837141a83ab4a0cb870133
--- /dev/null
+++ b/fairseq/tests/test_multi_corpus_sampled_dataset.py
@@ -0,0 +1,95 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import torch
+from fairseq.data import LanguagePairDataset, TokenBlockDataset
+from fairseq.data.multi_corpus_sampled_dataset import MultiCorpusSampledDataset
+from tests.test_train import mock_dict
+
+
+class TestMultiCorpusSampledDataset(unittest.TestCase):
+    """Checks how often MultiCorpusSampledDataset draws from each sub-dataset."""
+
+    @staticmethod
+    def _single_token_dataset(token, dictionary):
+        """Build a pair dataset from the single token id ``token``."""
+        tokens = torch.LongTensor([token]).view(1, -1)
+        blocks = TokenBlockDataset(
+            tokens,
+            sizes=[tokens.size(-1)],
+            block_size=1,
+            pad=0,
+            eos=1,
+            include_targets=False,
+        )
+        return LanguagePairDataset(blocks, blocks.sizes, dictionary, shuffle=False)
+
+    def setUp(self):
+        d = mock_dict()
+        # the token value (1 or 2) identifies the dataset a batch came from
+        self.dataset_1 = self._single_token_dataset(1, d)
+        self.dataset_2 = self._single_token_dataset(2, d)
+
+    def _test_sample_helper(
+        self,
+        expected_sample_from_first_ds_percentage,
+        num_samples=1000,
+        sampling_func=None,
+    ):
+        """
+        Collate ``num_samples`` batches and check how many come from dataset_1.
+
+        expected_sample_from_first_ds_percentage: target fraction (0..1) of
+        batches whose first source token is 1; a deviation below 0.01 passes
+        num_samples: number of batches to collate
+        sampling_func: optional sampling function handed to the dataset; when
+        None the argument is omitted so the dataset falls back to its default
+        """
+        # To make sure test is not flaky
+        np.random.seed(0)
+        extra = {} if sampling_func is None else {"sampling_func": sampling_func}
+        m = MultiCorpusSampledDataset(
+            OrderedDict({0: self.dataset_1, 1: self.dataset_2}), **extra
+        )
+        m.ordered_indices()
+
+        hits = 0
+        for _ in range(num_samples):
+            batch = m.collater([m[0], m[1]])
+            if batch["net_input"]["src_tokens"][0] == 1:
+                hits += 1
+        self.assertLess(
+            abs(1.0 * hits / num_samples - expected_sample_from_first_ds_percentage),
+            0.01,
+        )
+
+    def test_multi_corpus_sampled_dataset_uniform_sample(self):
+        """Without a sampling function both datasets are drawn equally often."""
+        self._test_sample_helper(expected_sample_from_first_ds_percentage=0.5)
+
+    def test_multi_corpus_sampled_dataset_weighted_sample(self):
+        """A 0.9 / 0.1 weighted sampler yields about 90% dataset_1 batches."""
+
+        def naive_weighted_sample(weights):
+            # build a sampler that returns index i with probability weights[i]
+            # by walking the running sum of the weights
+            def f(input):
+                draw = np.random.random()
+                running = 0
+                for idx, weight in enumerate(weights):
+                    running += weight
+                    if running > draw:
+                        return idx
+
+            return f
+
+        self._test_sample_helper(
+            expected_sample_from_first_ds_percentage=0.9,
+            sampling_func=naive_weighted_sample(weights=[0.9, 0.1]),
+        )
diff --git a/fairseq/tests/test_multihead_attention.py b/fairseq/tests/test_multihead_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a0b430b6f0e98ad83afa21e7b004392313c6c26
--- /dev/null
+++ b/fairseq/tests/test_multihead_attention.py
@@ -0,0 +1,488 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import random
+import unittest
+
+import pytest
+import torch
+
+from fairseq.modules.multihead_attention import MultiheadAttention, _mask_for_xformers
+
+BATCH = [20, 41, 97]  # batch sizes swept by test_xformers_single_forward_parity
+SEQ = [64]  # sequence lengths swept by the same test
+EMB = [48]  # embedding sizes swept by the same test
+HEADS = [4]  # head counts swept by the same test
+DROP = 0.1  # NOTE(review): not referenced anywhere in this test module
+DEVICE = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+ATTN_MASK_DTYPE = [None, torch.uint8, torch.bool, torch.float]  # None: no attn_mask
+KEY_PADDING_MASK_DTYPE = [None, torch.uint8, torch.bool]  # None: no padding mask
+
+
+# FIXME: some tests fail when decimal=2, fix this and set decimal to 2
+def assert_almost_equal(x, y, decimal=1, err_msg=""):
+    """Assert x and y (tensors or array-likes) agree to `decimal` places."""
+    import numpy.testing as npt
+
+    x, y = (
+        t.cpu().detach().numpy() if isinstance(t, torch.Tensor) else t for t in (x, y)
+    )
+    npt.assert_array_almost_equal(x, y, err_msg=err_msg, decimal=decimal)
+
+
+def _reset_seeds():
+    """Seed torch's and Python's random generators (and CUDA's) with 0."""
+    for seed_fn in (torch.manual_seed, torch.random.manual_seed, random.seed):
+        seed_fn(0)
+    torch.cuda.manual_seed_all(0)
+
+
+def _get_mask(to_dtype: torch.dtype, dim0: int, dim1: int):
+    """Random (dim0, dim1) 0/1 mask; for torch.float, set entries become -inf."""
+    out = torch.randint(0, 2, (dim0, dim1)).to(dtype=to_dtype)
+    is_float = to_dtype == torch.float
+    return out.masked_fill(out.bool(), -float("inf")) if is_float else out
+
+
+def test_mask_for_xformers():
+    # Additive Mask
+    m_float_add = torch.tensor([float("-inf"), 0]).to(torch.float)
+    m_float_add_flipped = torch.tensor([0, float("-inf")]).to(torch.float)
+    m_float16_add = torch.tensor([float("-inf"), 0]).to(torch.float16)
+    m_float16_add_flipped = torch.tensor([0, float("-inf")]).to(torch.float16)
+    m_uint = torch.tensor([1, 0]).to(torch.uint8)
+    m_uint_flipped = torch.tensor([0, 1]).to(torch.uint8)
+    m_bool = torch.tensor([False, True])
+    # No to_dtype: float masks are expected back unchanged, uint8/bool inverted.
+    assert torch.equal(_mask_for_xformers(m_float_add), m_float_add)
+    assert torch.equal(_mask_for_xformers(m_float16_add), m_float16_add)
+    assert torch.equal(_mask_for_xformers(m_uint), m_uint_flipped)
+    assert torch.equal(_mask_for_xformers(m_bool), ~m_bool)
+    # float32 additive input converted to each target dtype.
+    assert torch.equal(
+        _mask_for_xformers(m_float_add, to_dtype=torch.float16), m_float16_add
+    )
+    assert torch.equal(
+        _mask_for_xformers(m_float_add, to_dtype=torch.float), m_float_add
+    )
+    assert torch.equal(_mask_for_xformers(m_float_add, to_dtype=torch.bool), m_bool)
+    assert torch.equal(
+        _mask_for_xformers(m_float_add, to_dtype=torch.uint8), m_uint_flipped
+    )
+    # float16 additive input converted to each target dtype.
+    assert torch.equal(
+        _mask_for_xformers(m_float16_add, to_dtype=torch.float16), m_float16_add
+    )
+    assert torch.equal(
+        _mask_for_xformers(m_float16_add, to_dtype=torch.float), m_float_add
+    )
+    assert torch.equal(_mask_for_xformers(m_float16_add, to_dtype=torch.bool), m_bool)
+    assert torch.equal(
+        _mask_for_xformers(m_float16_add, to_dtype=torch.uint8), m_uint_flipped
+    )
+    # bool input converted to each target dtype.
+    assert torch.equal(
+        _mask_for_xformers(m_bool, to_dtype=torch.float16), m_float16_add_flipped
+    )
+    assert torch.equal(
+        _mask_for_xformers(m_bool, to_dtype=torch.float), m_float_add_flipped
+    )
+    assert torch.equal(_mask_for_xformers(m_bool, to_dtype=torch.bool), ~m_bool)
+    assert torch.equal(_mask_for_xformers(m_bool, to_dtype=torch.uint8), m_uint)
+    # uint8 input converted to each target dtype.
+    assert torch.equal(
+        _mask_for_xformers(m_uint, to_dtype=torch.float16), m_float16_add
+    )
+    assert torch.equal(_mask_for_xformers(m_uint, to_dtype=torch.float), m_float_add)
+    assert torch.equal(_mask_for_xformers(m_uint, to_dtype=torch.bool), m_bool)
+    assert torch.equal(_mask_for_xformers(m_uint, to_dtype=torch.uint8), m_uint_flipped)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="blocksparse requires gpu")
+@pytest.mark.skip(reason="not part of latest xformers")
+@pytest.mark.parametrize("device", ["cuda"])
+@pytest.mark.parametrize("add_zero_attn", [False])
+@pytest.mark.parametrize("batch_size", [20])
+@pytest.mark.parametrize("embedding", [64])
+@pytest.mark.parametrize("seq_len", [64])
+@pytest.mark.parametrize("num_heads", [4])
+def test_xformers_blocksparse_parity(
+    device,
+    add_zero_attn,
+    batch_size,
+    embedding,
+    seq_len,
+    num_heads,
+):
+    """xformers attention with and without a blocksparse layout must agree."""
+    xformers_att_config = '{"name": "scaled_dot_product"}'
+    xformers_blocksparse_blocksize = 16
+    xformers_blocksparse_layout = torch.ones(
+        seq_len // xformers_blocksparse_blocksize,
+        seq_len // xformers_blocksparse_blocksize,
+        dtype=torch.int32,
+    )
+
+    q = torch.rand(seq_len, batch_size, embedding).to(device).half()
+    q.requires_grad = True
+    k = torch.rand(seq_len, batch_size, embedding).to(device).half()
+    k.requires_grad = True
+    v = torch.rand(seq_len, batch_size, embedding).to(device).half()
+    v.requires_grad = True
+
+    q_ = q.detach().clone().half()
+    q_.requires_grad = True
+    k_ = k.detach().clone().half()
+    k_.requires_grad = True
+    v_ = v.detach().clone().half()
+    v_.requires_grad = True
+
+    _reset_seeds()
+    xf_blocksparse_mha = (
+        MultiheadAttention(
+            embedding,
+            num_heads,
+            dropout=0.0,
+            add_zero_attn=add_zero_attn,
+            xformers_att_config=xformers_att_config,
+            xformers_blocksparse_layout=xformers_blocksparse_layout,
+            xformers_blocksparse_blocksize=xformers_blocksparse_blocksize,
+        )
+        .to(device)
+        .half()
+    )
+
+    xf_blocksparse_output, _ = xf_blocksparse_mha(
+        q,
+        k,
+        v,
+    )
+
+    _reset_seeds()
+    xformers_mha = (
+        MultiheadAttention(
+            embedding,
+            num_heads,
+            dropout=0.0,
+            add_zero_attn=add_zero_attn,
+            xformers_att_config=xformers_att_config,
+            xformers_blocksparse_layout=None,
+        )
+        .to(device)
+        .half()
+    )
+
+    xformers_output, _ = xformers_mha(
+        q_,
+        k_,
+        v_,
+    )
+
+    # account for when nan != nan
+    rand = random.uniform(0, 1)
+    xformers_output = xformers_output.masked_fill(xformers_output.isnan(), rand)
+    xf_blocksparse_output = xf_blocksparse_output.masked_fill(
+        xf_blocksparse_output.isnan(), rand
+    )
+
+    assert_almost_equal(xformers_output, xf_blocksparse_output)
+
+    loss_xformers = torch.norm(xformers_output)
+    loss_blocksparse = torch.norm(xf_blocksparse_output)
+    loss_xformers.backward()
+    loss_blocksparse.backward()
+
+    # Tensor.masked_fill is out-of-place, so calling it on q/k/v and dropping
+    # the result had no effect. Fill NaNs (nan != nan) in the gradients that
+    # are actually compared below, using the same filler value on both sides.
+    q_grad, q_grad_, k_grad, k_grad_, v_grad, v_grad_ = (
+        t.grad.masked_fill(t.grad.isnan(), rand) for t in (q, q_, k, k_, v, v_)
+    )
+
+    assert_almost_equal(q_grad, q_grad_)
+    assert_almost_equal(k_grad, k_grad_)
+    assert_almost_equal(v_grad, v_grad_)
+
+
+@pytest.mark.parametrize("device", DEVICE)
+@pytest.mark.parametrize("attn_dtype", ATTN_MASK_DTYPE)
+@pytest.mark.parametrize("key_padding_dtype", KEY_PADDING_MASK_DTYPE)
+@pytest.mark.parametrize("add_bias_kv", [True, False])
+@pytest.mark.parametrize("add_zero_attn", [True, False])
+# TODO: test with static_kv True
+@pytest.mark.parametrize("static_kv", [False])
+@pytest.mark.parametrize("batch_size", BATCH)
+@pytest.mark.parametrize("embedding", EMB)
+@pytest.mark.parametrize("seq_len", SEQ)
+@pytest.mark.parametrize("num_heads", HEADS)
+def test_xformers_single_forward_parity(
+    device,
+    attn_dtype,
+    key_padding_dtype,
+    add_bias_kv,
+    add_zero_attn,
+    static_kv,
+    batch_size,
+    embedding,
+    seq_len,
+    num_heads,
+):
+    """MultiheadAttention with/without xformers must agree on outputs and grads."""
+    xformers_att_config = '{"name": "scaled_dot_product"}'
+
+    attn_mask = (
+        None
+        if attn_dtype is None
+        else _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len).to(device)
+    )
+    key_padding_mask = (
+        None
+        if key_padding_dtype is None
+        else _get_mask(to_dtype=key_padding_dtype, dim0=batch_size, dim1=seq_len).to(
+            device
+        )
+    )
+
+    q = torch.rand(seq_len, batch_size, embedding).to(device)
+    q.requires_grad = True
+    k = torch.rand(seq_len, batch_size, embedding).to(device)
+    k.requires_grad = True
+    v = torch.rand(seq_len, batch_size, embedding).to(device)
+    v.requires_grad = True
+
+    q_ = q.detach().clone()
+    q_.requires_grad = True
+    k_ = k.detach().clone()
+    k_.requires_grad = True
+    v_ = v.detach().clone()
+    v_.requires_grad = True
+
+    # TODO: dropouts in the two implementations lead to different entries dropped.
+    _reset_seeds()
+    xformers_mha = MultiheadAttention(
+        embedding,
+        num_heads,
+        dropout=0.0,
+        xformers_att_config=xformers_att_config,
+        add_bias_kv=add_bias_kv,
+        add_zero_attn=add_zero_attn,
+    ).to(device)
+    xformers_output, _ = xformers_mha(
+        q,
+        k,
+        v,
+        key_padding_mask=key_padding_mask,
+        attn_mask=attn_mask,
+        static_kv=static_kv,
+    )
+
+    _reset_seeds()
+    original_mha = MultiheadAttention(
+        embedding,
+        num_heads,
+        dropout=0.0,
+        xformers_att_config=None,
+        add_bias_kv=add_bias_kv,
+        add_zero_attn=add_zero_attn,
+    ).to(device)
+    original_output, _ = original_mha(
+        q_,
+        k_,
+        v_,
+        key_padding_mask=key_padding_mask,
+        attn_mask=attn_mask,
+        static_kv=static_kv,
+    )
+
+    # account for when nan != nan
+    if xformers_output.isnan().any() or original_output.isnan().any():
+        rand = random.uniform(0, 1)
+        xformers_output = xformers_output.masked_fill(xformers_output.isnan(), rand)
+        original_output = original_output.masked_fill(original_output.isnan(), rand)
+
+    # torch.equal works for cpu, on cuda allclose is needed.
+    assert torch.allclose(
+        xformers_output, original_output, atol=1e-06
+    ), f"max diff is {torch.max(torch.abs(xformers_output - original_output))}"
+
+    loss_xformers = torch.norm(xformers_output)
+    loss_original = torch.norm(original_output)
+    loss_xformers.backward()
+    loss_original.backward()
+
+    # torch.equal works for cpu, on cuda allclose is needed.
+    assert torch.allclose(
+        q.grad, q_.grad
+    ), f"max diff is {torch.max(torch.abs(q.grad - q_.grad))}"
+    assert torch.allclose(
+        k.grad, k_.grad
+    ), f"max diff is {torch.max(torch.abs(k.grad - k_.grad))}"
+    assert torch.allclose(
+        v.grad, v_.grad
+    ), f"max diff is {torch.max(torch.abs(v.grad - v_.grad))}"
+
+
+def test_mask_padding_parity():
+    """_pad_masks must match the legacy code that appends one zero column."""
+
+    def old_padding_code(key_padding_mask, attn_mask):
+        # Legacy reference: concatenate a single zero column onto each mask.
+        if attn_mask is not None:
+            zero_col = attn_mask.new_zeros(attn_mask.size(0), 1)
+            attn_mask = torch.cat([attn_mask, zero_col], dim=1)
+        if key_padding_mask is not None:
+            zero_col = torch.zeros(key_padding_mask.size(0), 1)
+            key_padding_mask = torch.cat(
+                [key_padding_mask, zero_col.type_as(key_padding_mask)], dim=1
+            )
+        return key_padding_mask, attn_mask
+
+    # values don't matter for this test.
+    mha = MultiheadAttention(
+        embed_dim=8,
+        num_heads=2,
+        dropout=0.0,
+        add_bias_kv=True,
+        add_zero_attn=True,
+    )
+
+    kp_mask_in = torch.rand((8, 64))
+    attn_mask_in = torch.rand((64, 64))
+
+    expected_kp, expected_attn = old_padding_code(kp_mask_in, attn_mask_in)
+    actual_kp, actual_attn = mha._pad_masks(kp_mask_in, attn_mask_in)
+
+    # Shapes are compared first, then the values.
+    assert expected_kp.size() == actual_kp.size()
+    assert expected_attn.size() == actual_attn.size()
+    assert torch.equal(expected_kp, actual_kp)
+    assert torch.equal(expected_attn, actual_attn)
+
+
+def test_add_bias_parity():
+    """Check that _add_bias matches the legacy inline implementation.
+
+    The legacy code appends the batch-repeated bias_k/bias_v to k and v and a
+    zero column to each mask that is present.
+    """
+    # values don't matter for this test.
+    mha = MultiheadAttention(
+        embed_dim=8,
+        num_heads=2,
+        dropout=0.0,
+        add_bias_kv=True,
+        add_zero_attn=True,
+    )
+
+    def old_bias_code(k, v, key_padding_mask, attn_mask, bsz):
+        # Keys/values: concatenate the module's bias, repeated across the batch.
+        k = torch.cat([k, mha.bias_k.repeat(1, bsz, 1)])
+        v = torch.cat([v, mha.bias_v.repeat(1, bsz, 1)])
+        # Masks: concatenate one zero column along dim 1 when a mask is given.
+        if attn_mask is not None:
+            zero_col = attn_mask.new_zeros(attn_mask.size(0), 1)
+            attn_mask = torch.cat([attn_mask, zero_col], dim=1)
+        if key_padding_mask is not None:
+            zero_col = key_padding_mask.new_zeros(key_padding_mask.size(0), 1)
+            key_padding_mask = torch.cat([key_padding_mask, zero_col], dim=1)
+        return k, v, key_padding_mask, attn_mask
+
+    seq_len = 64
+    bsz = 8
+    embedding = 8
+    key_padding_mask = torch.rand((bsz, seq_len))
+    attn_mask = torch.rand((seq_len, seq_len))
+    k = torch.rand((seq_len, bsz, embedding))
+    v = torch.rand((seq_len, bsz, embedding))
+    # Feed the exact same arguments to both implementations.
+    inputs = (k, v, key_padding_mask, attn_mask, bsz)
+
+    k_ref, v_ref, kp_ref, attn_ref = old_bias_code(*inputs)
+    k_out, v_out, kp_out, attn_out = mha._add_bias(*inputs)
+
+    assert torch.equal(k_ref, k_out)
+    assert torch.equal(v_ref, v_out)
+    assert torch.equal(kp_ref, kp_out)
+    assert torch.equal(attn_ref, attn_out)
+
+
+class TestMultiheadAttention(unittest.TestCase):
+    def test_append_prev_key_padding_mask(self):
+        """Check how current and previous key padding masks are combined."""
+        bsz, src_len = 1, 4
+
+        # Each case is (current mask, previous mask, expected combined mask).
+        cases = [
+            # no padding mask
+            (None, None, None),
+            # current padding mask only
+            (
+                torch.tensor([[1]]).bool(),
+                None,
+                torch.tensor([[0, 0, 0, 1]]).bool(),
+            ),
+            # previous padding mask only
+            (
+                None,
+                torch.tensor([[0, 1, 0]]).bool(),
+                torch.tensor([[0, 1, 0, 0]]).bool(),
+            ),
+            # both padding masks
+            (
+                torch.tensor([[1]]).bool(),
+                torch.tensor([[0, 1, 0]]).bool(),
+                torch.tensor([[0, 1, 0, 1]]).bool(),
+            ),
+            # current padding mask already full
+            (
+                torch.tensor([[0, 1, 0, 1]]).bool(),
+                None,
+                torch.tensor([[0, 1, 0, 1]]).bool(),
+            ),
+            # previous padding mask already full
+            (
+                None,
+                torch.tensor([[0, 1, 0, 1]]).bool(),
+                torch.tensor([[0, 1, 0, 1]]).bool(),
+            ),
+        ]
+        for current, previous, expected in cases:
+            merged = MultiheadAttention._append_prev_key_padding_mask(
+                current, previous, batch_size=bsz, src_len=src_len, static_kv=False
+            )
+            if merged is None:
+                self.assertIsNone(expected)
+                continue
+            self.assertTrue(
+                torch.all(torch.eq(merged, expected)),
+                f"Unexpected resultant key padding mask: {merged}"
+                f" given current: {current} and previous: {previous}",
+            )
+            self.assertEqual(merged.size(0), bsz)
+            self.assertEqual(merged.size(1), src_len)
+
+    def test_pruning_heads(self):
+        """Prune 12 heads down to 8, run a forward pass, check head bookkeeping."""
+        embed_dim = 768
+        num_heads = 12
+        num_heads_to_keep = 8
+        dummy_input = torch.randn(32, 2, embed_dim)
+        mha = MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads)
+
+        kept_heads = mha._get_reserve_head_index(
+            num_heads_to_keep=num_heads_to_keep
+        )
+        mha._adaptive_prune_heads(reserve_head_index=kept_heads)
+        mha._set_skip_embed_dim_check()
+        # Forward pass on the pruned module; its output is not inspected.
+        mha(query=dummy_input, key=dummy_input, value=dummy_input)
+
+        self.assertEqual(mha.head_dim, embed_dim / num_heads)
+        self.assertEqual(mha.num_heads, num_heads_to_keep)
+
+
+if __name__ == "__main__":  # run via unittest when executed as a script
+    unittest.main()
diff --git a/fairseq/tests/test_noising.py b/fairseq/tests/test_noising.py
new file mode 100644
index 0000000000000000000000000000000000000000..1956f6ad1d0ffd9340a1b028d298b2cf78ae460f
--- /dev/null
+++ b/fairseq/tests/test_noising.py
@@ -0,0 +1,531 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from typing import Dict, List
+
+import torch
+
+import tests.utils as test_utils
+from fairseq import utils
+from fairseq.data import (
+    Dictionary,
+    LanguagePairDataset,
+    TransformEosDataset,
+    data_utils,
+    noising,
+)
+
+
+class TestDataNoising(unittest.TestCase):
+    def _get_test_data_with_bpe_cont_marker(self, append_eos=True):
+        """Build a two-sentence BPE fixture using "@@" continuation markers.
+
+        Args:
+            append_eos: if True, each input sentence in the source tokens tensor
+                will have an EOS appended to the end.
+
+        Returns:
+            vocab: BPE vocab with continuation markers as suffixes to denote
+                non-end of word tokens. This is the standard BPE format used in
+                fairseq's preprocessing.
+            x: numberized source tokens, with EOS at the end if append_eos is true
+            src_lengths: source lengths (EOS included when appended).
+        """
+        vocab = Dictionary()
+        vocab.add_symbol("he@@")
+        vocab.add_symbol("llo")
+        vocab.add_symbol("how")
+        vocab.add_symbol("are")
+        vocab.add_symbol("y@@")
+        vocab.add_symbol("ou")
+        vocab.add_symbol("n@@")
+        vocab.add_symbol("ew")
+        vocab.add_symbol("or@@")
+        vocab.add_symbol("k")
+
+        src_tokens = [
+            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
+            ["how", "are", "y@@", "ou"],
+        ]
+        x, src_lengths = self._convert_src_tokens_to_tensor(
+            vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
+        )
+        return vocab, x, src_lengths
+
+    def _get_test_data_with_bpe_end_marker(self, append_eos=True):
+        """Build a two-sentence BPE fixture using "_EOW" end-of-word markers.
+
+        Args:
+            append_eos: if True, each input sentence in the source tokens tensor
+                will have an EOS appended to the end.
+
+        Returns:
+            vocab: BPE vocab with end-of-word markers as suffixes to denote
+                tokens at the end of a word. This is an alternative to fairseq's
+                standard preprocessing framework and is not generally supported
+                within fairseq.
+            x: numberized source tokens, with EOS at the end if append_eos is true
+            src_lengths: source lengths (EOS included when appended).
+        """
+        vocab = Dictionary()
+        vocab.add_symbol("he")
+        vocab.add_symbol("llo_EOW")
+        vocab.add_symbol("how_EOW")
+        vocab.add_symbol("are_EOW")
+        vocab.add_symbol("y")
+        vocab.add_symbol("ou_EOW")
+        vocab.add_symbol("n")
+        vocab.add_symbol("ew_EOW")
+        vocab.add_symbol("or")
+        vocab.add_symbol("k_EOW")
+
+        src_tokens = [
+            ["he", "llo_EOW", "n", "ew_EOW", "y", "or", "k_EOW"],
+            ["how_EOW", "are_EOW", "y", "ou_EOW"],
+        ]
+        x, src_lengths = self._convert_src_tokens_to_tensor(
+            vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
+        )
+        return vocab, x, src_lengths
+
+    def _get_test_data_with_word_vocab(self, append_eos=True):
+        """Build a two-sentence fixture over a plain word-level vocabulary.
+
+        Args:
+            append_eos: if True, each input sentence in the source tokens tensor
+                will have an EOS appended to the end.
+
+        Returns:
+            vocab: word vocab
+            x: numberized source tokens, with EOS at the end if append_eos is true
+            src_lengths: source lengths (EOS included when appended).
+        """
+        vocab = Dictionary()
+
+        # Symbols are registered in this fixed order.
+        words = ["hello", "how", "are", "you", "new", "york"]
+        for word in words:
+            vocab.add_symbol(word)
+
+        # Two sentences of different lengths (4 and 5 words).
+        sentences = [
+            "hello new york you".split(),
+            "how are you new york".split(),
+        ]
+        x, src_lengths = self._convert_src_tokens_to_tensor(
+            vocab=vocab, src_tokens=sentences, append_eos=append_eos
+        )
+        return vocab, x, src_lengths
+
+    def _convert_src_tokens_to_tensor(
+        self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool
+    ):
+        """Numberize sentences into a pad-filled (max_len x batch) LongTensor.
+
+        Also returns per-sentence lengths, counting EOS when append_eos is True.
+        """
+        # If we have to append EOS, we include EOS in counting src length
+        src_len = [len(sentence) + int(append_eos) for sentence in src_tokens]
+        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
+        for i, sentence in enumerate(src_tokens):
+            for j, token in enumerate(sentence):
+                x[i][j] = vocab.index(token)
+            if append_eos:
+                # Index by length, not the leaked loop variable: safe for empty input.
+                x[i][len(sentence)] = vocab.eos()
+        return x.transpose(1, 0), torch.LongTensor(src_len)
+
+    def assert_eos_at_end(self, x, x_len, eos):
+        """Asserts last token of every sentence in x is EOS"""
+        for i in range(len(x_len)):
+            # x is indexed [position][sentence]; report the token actually checked.
+            last_token = x[x_len[i] - 1][i]
+            self.assertEqual(
+                last_token,
+                eos,
+                "Expected eos (token id {eos}) at the end of sentence {i} "
+                "but got {other} instead".format(i=i, eos=eos, other=last_token),
+            )
+
+    def assert_word_dropout_correct(self, x, x_noised, x_len, l_noised):
+        """Check example 0 lost exactly its first word (2 leading bpe tokens)."""
+        dropped = 2
+        self.assertEqual(x_len[0] - dropped, l_noised[0])
+        for pos in range(l_noised[0]):
+            self.assertEqual(x_noised[pos][0], x[pos + dropped][0])
+
+    def test_word_dropout_with_eos(self):
+        """Seeded word dropout removes the first word and leaves EOS at the end."""
+        vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)
+        with data_utils.numpy_seed(1234):
+            dropout = noising.WordDropout(vocab)
+            x_noised, l_noised = dropout.noising(x, x_len, 0.2)
+            self.assert_word_dropout_correct(
+                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
+            )
+            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
+
+    def assert_word_blanking_correct(self, x, x_noised, x_len, l_noised, unk):
+        """Check example 0 kept its length and only its first word became unk."""
+        # The first word spans 2 bpe tokens; every later position of example 0
+        # must still hold the original token.
+        blanked = 2
+        self.assertEqual(x_len[0], l_noised[0])
+        for pos in range(l_noised[0]):
+            expected = unk if pos < blanked else x[pos][0]
+            self.assertEqual(x_noised[pos][0], expected)
+
+    def test_word_blank_with_eos(self):
+        """Seeded word blanking turns the first word into unk and keeps EOS last."""
+        vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)
+        with data_utils.numpy_seed(1234):
+            blanker = noising.WordDropout(vocab)
+            x_noised, l_noised = blanker.noising(x, x_len, 0.2, vocab.unk())
+            self.assert_word_blanking_correct(
+                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk()
+            )
+            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
+
+    def generate_unchanged_shuffle_map(self, length):
+        return dict(zip(range(length), range(length)))  # identity map: i -> i
+
+    def assert_word_shuffle_matches_expected(
+        self,
+        x,
+        x_len,
+        max_shuffle_distance: int,
+        vocab: Dictionary,
+        expected_shufle_maps: List[Dict[int, int]],
+        expect_eos_at_end: bool,
+        bpe_end_marker=None,
+    ):
+        """
+        This verifies that with a given x, x_len, max_shuffle_distance, and
+        vocab, we get the expected shuffle result.
+
+        Args:
+            x: Tensor of shape (T x B) = (sequence_length, batch_size)
+            x_len: Tensor of length B = batch_size
+            max_shuffle_distance: arg to pass to noising
+            vocab: Dictionary handed to noising.WordShuffle; also supplies eos()
+            expected_shufle_maps: List[mapping] where mapping is a
+                Dict[old_index, new_index], mapping x's elements from their
+                old positions in x to their new positions in x.
+            expect_eos_at_end: if True, check the output to make sure there is
+                an EOS at the end.
+            bpe_end_marker: str denoting the BPE end token. If this is not None, we
+                set the BPE cont token to None in the noising classes.
+        """
+        # Only one marker style is passed on: the "@@" continuation marker is
+        # used exactly when no end-of-word marker was given.
+        bpe_cont_marker = "@@" if bpe_end_marker is None else None
+
+        with data_utils.numpy_seed(1234):
+            shuffler = noising.WordShuffle(
+                vocab, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker
+            )
+            x_noised, l_noised = shuffler.noising(
+                x, x_len, max_shuffle_distance=max_shuffle_distance
+            )
+
+        # For every example, we have a different expected shuffle map. The
+        # token at old position `src` of example `col` must be found at
+        # position `dst` of the same example after shuffling.
+        for col, shuffle_map in enumerate(expected_shufle_maps):
+            for src, dst in shuffle_map.items():
+                self.assertEqual(x[src][col], x_noised[dst][col])
+
+        # Shuffling should not affect the length of each example
+        for before, after in zip(x_len, l_noised):
+            self.assertEqual(before, after)
+        if expect_eos_at_end:
+            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
+
+    def test_word_shuffle_with_eos(self):
+        """Shuffle distances 0 and 3 on the "@@" BPE fixture with EOS appended."""
+        vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)
+
+        # Assert word shuffle with max shuffle distance 0 causes input to be
+        # unchanged
+        identity_maps = [self.generate_unchanged_shuffle_map(n) for n in x_len]
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            max_shuffle_distance=0,
+            vocab=vocab,
+            expected_shufle_maps=identity_maps,
+            expect_eos_at_end=True,
+        )
+
+        # Assert word shuffle with max shuffle distance 3 matches our expected
+        # shuffle order
+        shuffled_maps = [
+            self.generate_unchanged_shuffle_map(x_len[0]),
+            {0: 0, 1: 3, 2: 1, 3: 2},
+        ]
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            vocab=vocab,
+            max_shuffle_distance=3,
+            expected_shufle_maps=shuffled_maps,
+            expect_eos_at_end=True,
+        )
+
+    def test_word_shuffle_with_eos_nonbpe(self):
+        """The purpose of this is to test shuffling logic with word vocabs"""
+        vocab, x, x_len = self._get_test_data_with_word_vocab(append_eos=True)
+
+        # Assert word shuffle with max shuffle distance 0 causes input to be
+        # unchanged
+        identity_maps = [self.generate_unchanged_shuffle_map(n) for n in x_len]
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            max_shuffle_distance=0,
+            vocab=vocab,
+            expected_shufle_maps=identity_maps,
+            expect_eos_at_end=True,
+        )
+
+        # Assert word shuffle with max shuffle distance 3 matches our expected
+        # shuffle order; each dict maps old position -> new position for one
+        # example.
+        shuffled_maps = [
+            {0: 0, 1: 1, 2: 3, 3: 2},
+            {0: 0, 1: 2, 2: 1, 3: 3, 4: 4},
+        ]
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            vocab=vocab,
+            max_shuffle_distance=3,
+            expected_shufle_maps=shuffled_maps,
+            expect_eos_at_end=True,
+        )
+
+    def test_word_shuffle_without_eos(self):
+        """Same result as word shuffle with eos except no EOS at end"""
+        vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)
+
+        # Assert word shuffle with max shuffle distance 0 causes input to be
+        # unchanged
+        identity_maps = [self.generate_unchanged_shuffle_map(n) for n in x_len]
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            max_shuffle_distance=0,
+            vocab=vocab,
+            expected_shufle_maps=identity_maps,
+            expect_eos_at_end=False,
+        )
+
+        # Assert word shuffle with max shuffle distance 3 matches our expected
+        # shuffle order; the first example is expected to stay in place and
+        # the second to follow the explicit old -> new position map.
+        shuffled_maps = [
+            self.generate_unchanged_shuffle_map(x_len[0]),
+            {0: 0, 1: 3, 2: 1, 3: 2},
+        ]
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            vocab=vocab,
+            max_shuffle_distance=3,
+            expected_shufle_maps=shuffled_maps,
+            expect_eos_at_end=False,
+        )
+
+    def test_word_shuffle_without_eos_with_bpe_end_marker(self):
+        """Same result as word shuffle without eos except using BPE end token"""
+        vocab, x, x_len = self._get_test_data_with_bpe_end_marker(append_eos=False)
+
+        # Assert word shuffle with max shuffle distance 0 causes input to be
+        # unchanged
+        identity_maps = [self.generate_unchanged_shuffle_map(n) for n in x_len]
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            max_shuffle_distance=0,
+            vocab=vocab,
+            expected_shufle_maps=identity_maps,
+            expect_eos_at_end=False,
+            bpe_end_marker="_EOW",
+        )
+
+        # Assert word shuffle with max shuffle distance 3 matches our expected
+        # shuffle order; the first example is expected to stay in place and
+        # the second to follow the explicit old -> new position map.
+        shuffled_maps = [
+            self.generate_unchanged_shuffle_map(x_len[0]),
+            {0: 0, 1: 3, 2: 1, 3: 2},
+        ]
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            vocab=vocab,
+            max_shuffle_distance=3,
+            expected_shufle_maps=shuffled_maps,
+            expect_eos_at_end=False,
+            bpe_end_marker="_EOW",
+        )
+
+    def assert_no_eos_at_end(self, x, x_len, eos):
+        """Asserts that the last token of each sentence in x is not EOS"""
+        # x is indexed [position][sentence]; length - 1 is the final position.
+        for i, length in enumerate(x_len):
+            last_token = x[length - 1][i]
+            self.assertNotEqual(
+                last_token,
+                eos,
+                f"Expected no eos (token id {eos}) at the end of sentence {i}.",
+            )
+
+    def test_word_dropout_without_eos(self):
+        """Same result as word dropout with eos except no EOS at end"""
+        vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)
+        # Noising and both checks run under data_utils.numpy_seed(1234).
+        with data_utils.numpy_seed(1234):
+            dropout = noising.WordDropout(vocab)
+            x_noised, l_noised = dropout.noising(x, x_len, 0.2)
+            self.assert_word_dropout_correct(
+                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
+            )
+            self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
+
+    def test_word_blank_without_eos(self):
+        """Same result as word blank with eos except no EOS at end"""
+        vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)
+        # Noising and both checks run under data_utils.numpy_seed(1234).
+        with data_utils.numpy_seed(1234):
+            blanker = noising.WordDropout(vocab)
+            x_noised, l_noised = blanker.noising(x, x_len, 0.2, vocab.unk())
+            self.assert_word_blanking_correct(
+                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk()
+            )
+            self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
+
+    def _get_noising_dataset_batch(
+        self,
+        src_tokens_no_pad,
+        src_dict,
+        append_eos_to_tgt=False,
+    ):
+        """
+        Build the first collated batch of a denoising language-pair dataset.
+
+        A :class:`NoisingDataset` over *src_tokens_no_pad* is the source and
+        the clean data is the target. The resulting
+        ``LanguagePairDataset`` is wrapped in :class:`TransformEosDataset`,
+        which receives *append_eos_to_tgt*; the first batch (batch size 2)
+        is returned.
+        """
+        clean_dataset = test_utils.TestDataset(data=src_tokens_no_pad)
+        noisy_dataset = noising.NoisingDataset(
+            src_dataset=clean_dataset,
+            src_dict=src_dict,
+            seed=1234,
+            max_word_shuffle_distance=3,
+            word_dropout_prob=0.2,
+            word_blanking_prob=0.2,
+            noising_class=noising.UnsupervisedMTNoising,
+        )
+
+        # Noisy source paired with the clean source as target.
+        pair_dataset = TransformEosDataset(
+            LanguagePairDataset(
+                src=noisy_dataset, tgt=clean_dataset, src_sizes=None, src_dict=src_dict
+            ),
+            src_dict.eos(),
+            append_eos_to_tgt=append_eos_to_tgt,
+        )
+
+        loader = torch.utils.data.DataLoader(
+            dataset=pair_dataset,
+            batch_size=2,
+            collate_fn=pair_dataset.collater,
+        )
+        return next(iter(loader))
+
+    def test_noising_dataset_with_eos(self):
+        """
+        Check the denoising batch built from sentences that end with EOS.
+
+        The batch source must equal the expected noised sentences and the
+        batch target the clean sentences.
+        """
+        src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
+            append_eos=True
+        )
+        eos, pad = src_dict.eos(), src_dict.pad()
+
+        # Transpose so that each row is one sentence, then strip its padding.
+        sentences = [
+            utils.strip_pad(tensor=row, pad=pad) for row in torch.t(src_tokens)
+        ]
+        batch = self._get_noising_dataset_batch(
+            src_tokens_no_pad=sentences, src_dict=src_dict
+        )
+
+        # Noised sentences (left-padded) are the source of the batch.
+        expected_src = torch.LongTensor(
+            [[4, 5, 10, 11, 8, 12, 13, eos], [pad, pad, pad, 6, 8, 9, 7, eos]]
+        )
+        # Clean sentences (right-padded) are the target of the batch.
+        expected_tgt = torch.LongTensor(
+            [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]]
+        )
+        actual_src, actual_tgt = batch["net_input"]["src_tokens"], batch["target"]
+        self.assertTensorEqual(expected_src, actual_src)
+        self.assertTensorEqual(expected_tgt, actual_tgt)
+
+    def test_noising_dataset_without_eos(self):
+        """
+        Check the denoising batch built from sentences that lack EOS.
+
+        Mirrors the with-EOS test, except that *append_eos_to_tgt* is set to
+        ``True``: the expected target ends each sentence with EOS while the
+        expected source does not.
+        """
+
+        src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
+            append_eos=False
+        )
+        eos, pad = src_dict.eos(), src_dict.pad()
+
+        # Transpose so that each row is one sentence, then strip its padding.
+        sentences = [
+            utils.strip_pad(tensor=row, pad=pad) for row in torch.t(src_tokens)
+        ]
+        # Ask the helper to append EOS on the target side.
+        batch = self._get_noising_dataset_batch(
+            src_tokens_no_pad=sentences,
+            src_dict=src_dict,
+            append_eos_to_tgt=True,
+        )
+
+        # Noised sentences (left-padded, no EOS) are the source of the batch.
+        expected_src = torch.LongTensor(
+            [[4, 5, 10, 11, 8, 12, 13], [pad, pad, pad, 6, 8, 9, 7]]
+        )
+        # Clean sentences with EOS appended (right-padded) are the target.
+        expected_tgt = torch.LongTensor(
+            [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]]
+        )
+
+        actual_src = batch["net_input"]["src_tokens"]
+        actual_tgt = batch["target"]
+
+        self.assertTensorEqual(expected_src, actual_src)
+        self.assertTensorEqual(expected_tgt, actual_tgt)
+
+    def assertTensorEqual(self, t1, t2):
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")  # same shape first
+        self.assertEqual((t1 != t2).long().sum(), 0)  # then no differing element
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/fairseq/tests/test_online_backtranslation.py b/fairseq/tests/test_online_backtranslation.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ae7e773da0ff838b3c8151bc14b84a6a9238a72
--- /dev/null
+++ b/fairseq/tests/test_online_backtranslation.py
@@ -0,0 +1,206 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import tempfile
+import unittest
+from pathlib import Path
+from typing import Any, Dict, Sequence
+
+import fairseq.data.indexed_dataset as indexed_dataset
+import fairseq.options
+import fairseq.tasks.online_backtranslation as obt
+import torch
+from tests import utils
+
+
+def mk_sample(tokens: Sequence[int], batch_size: int = 2) -> Dict[str, Any]:
+    """Build a fake batch: ``batch_size`` identical rows of ``tokens``.
+
+    The target is the batch without its first column.
+    """
+    row = torch.tensor(tokens, dtype=torch.long)
+    batch = torch.stack([row for _ in range(batch_size)])
+    src_lengths = torch.tensor([len(tokens)] * batch_size, dtype=torch.long)
+    net_input = {"src_tokens": batch, "prev_output_tokens": batch}
+    net_input["src_lengths"] = src_lengths
+    return {"net_input": net_input, "target": batch[:, 1:]}
+
+
+def mk_dataset(num_samples: int, max_len: int, output: Path):
+    """Write ``num_samples`` random sequences to an indexed dataset at ``output``."""
+    output.parent.mkdir(exist_ok=True)
+    builder = indexed_dataset.IndexedDatasetBuilder(str(output))
+    data = torch.randint(5, 100, (num_samples, max_len))
+    lengths = torch.randint(3, max_len, (num_samples,))
+    for row, length in zip(data, lengths):
+        row[0] = 0  # every sequence starts with token 0
+        builder.add_item(row[:length])
+    builder.finalize(output.with_suffix(".idx"))
+    assert output.exists() and output.with_suffix(".idx").exists()
+
+
+class OnlineBacktranslationTest(unittest.TestCase):
+    """Tests of the online backtranslation task, using a tiny transformer model."""
+    tmp_dir = Path(tempfile.mkdtemp(suffix="OnlineBacktranslationTest"))
+
+    @classmethod
+    def obt_task(
+        cls, languages: Sequence[str], data: Path = None, language_mapping: str = None
+    ):  # returns (task, model) set up for the given monolingual languages
+        dict_path = cls.tmp_dir / "dict.txt"
+        if not dict_path.exists():
+            dictionary = utils.dummy_dictionary(100)
+            dictionary.save(str(dict_path))
+        # Copy the shared dict.txt into *data*; without *data*, use tmp_dir itself.
+        if data is not None:
+            (data / "dict.txt").write_text(dict_path.read_text())
+        else:
+            data = cls.tmp_dir
+        assert len(languages) >= 2
+
+        kwargs = {
+            "arch": "transformer",
+            # --max-sentences=1 for better predictability of batches
+            "max_sentences": 1,
+            # Use small, distinctive dimensions
+            "encoder_layers": 3,
+            "encoder_embed_dim": 12,
+            "encoder_ffn_embed_dim": 14,
+            "encoder_attention_heads": 4,
+            "decoder_layers": 3,
+            "decoder_embed_dim": 12,
+            "decoder_output_dim": 12,
+            "decoder_ffn_embed_dim": 14,
+            "decoder_attention_heads": 4,
+            # Disable dropout so we have comparable tests.
+            "dropout": 0,
+            "attention_dropout": 0,
+            "activation_dropout": 0,
+            "encoder_layerdrop": 0,
+        }
+
+        args = fairseq.options.get_args(
+            data,
+            task="online_backtranslation",
+            mono_langs=",".join(languages),
+            valid_lang_pairs=f"{languages[0]}-{languages[1]}",
+            tokens_per_sample=256,
+            language_mapping=language_mapping,
+            **kwargs,
+        )
+        task = obt.OnlineBackTranslationTask.setup_task(args)
+        # we need to build the model to have the correct dictionary
+        model = task.build_model(task.args)
+        return task, model
+
+    def tmp_path(self, test_case: str) -> Path:  # fresh dir inside tmp_dir
+        return Path(tempfile.mkdtemp(test_case, dir=self.tmp_dir))
+
+    def test_lang_tokens(self):
+        task, model = self.obt_task(["en", "ro", "zh"])
+        assert obt._lang_token("en") in task.dictionary
+        assert obt._lang_token("ro") in task.dictionary
+        assert obt._lang_token("zh") in task.dictionary
+
+        en_bos = obt._lang_token_index(task.common_dict, "en")
+        assert "en" == task.common_dict[en_bos].strip("_")
+        zh_bos = obt._lang_token_index(task.common_dict, "zh")
+        assert "zh" == task.common_dict[zh_bos].strip("_")
+        zh_sample = mk_sample([zh_bos, 16, 14, 12, 10])
+
+        # for a sample starting with the zh token, the bos token for
+        # translation is expected to be the en one
+        assert task.get_bos_token_from_sample(zh_sample) == en_bos
+
+    def test_backtranslate_sample(self):
+        task, model = self.obt_task(["en", "ro", "zh"])
+
+        en_bos = obt._lang_token_index(task.common_dict, "en")
+        zh_bos = obt._lang_token_index(task.common_dict, "zh")
+        sample = mk_sample([zh_bos, 16, 14, 12, 10])
+        # The checks below read the same sample object after the call.
+        task.backtranslate_sample(sample, "zh", "en")
+        target_zh = list(sample["target"][0])
+        assert target_zh == [16, 14, 12, 10]  # original zh sentence
+        generated_en = sample["net_input"]["src_tokens"][0]
+        assert generated_en[0] == en_bos
+
+    def test_train_dataset(self):
+        data = self.tmp_path("test_train_dataset")
+        mk_dataset(20, 10, data / "en" / "train.bin")
+        mk_dataset(10, 10, data / "zh" / "train.bin")
+        task, model = self.obt_task(["en", "zh"], data)
+        task.load_dataset("train")
+
+        en_bos = obt._lang_token_index(task.common_dict, "en")
+        zh_bos = obt._lang_token_index(task.common_dict, "zh")
+
+        train = task.datasets["train"]
+        train.ordered_indices()
+        train.prefetch([0, 19])
+        sample_0 = train[0]
+        sample_19 = train[19]
+        self.assertEqual(
+            set(sample_0.keys()), {"en-BT", "en-DENOISE", "zh-BT", "zh-DENOISE"}
+        )
+        for sample in (sample_0, sample_19):
+            self.assertEqual(sample["en-BT"]["source"][0], en_bos)
+            # bt target isn't ready to look at.
+            self.assertEqual(sample["en-DENOISE"]["source"][0], en_bos)
+            # TODO: what could we check on the target side?
+
+        for i in range(10):
+            # Zh dataset is shorter, and is wrapped around En dataset.
+            train.prefetch([i, i + 10])
+            self.assertEqual(
+                list(train[i]["zh-DENOISE"]["source"]),
+                list(train[i + 10]["zh-DENOISE"]["source"]),
+            )
+            self.assertEqual(train[i]["zh-DENOISE"]["source"][0].item(), zh_bos)
+
+        # Sorted by increasing len
+        self.assertLess(
+            len(sample_0["en-BT"]["source"]), len(sample_19["en-BT"]["source"])
+        )
+
+    def test_valid_dataset(self):
+        data = self.tmp_path("test_valid_dataset")
+        mk_dataset(10, 21, data / "valid.en-zh.en.bin")
+        mk_dataset(10, 21, data / "valid.en-zh.zh.bin")
+
+        task, model = self.obt_task(["en", "zh"], data)
+        valid = task.load_dataset("valid")
+        en_bos = obt._lang_token_index(task.common_dict, "en")
+
+        assert valid is not None
+        valid.prefetch(range(10))
+        sample_0 = valid[0]
+        sample_9 = valid[9]
+        self.assertEqual(sample_0["id"], 0)
+        self.assertEqual(sample_9["id"], 9)
+        self.assertEqual(sample_0["source"][0], en_bos)
+        self.assertEqual(sample_9["source"][0], en_bos)
+        # TODO: could we test the target side?
+
+    def assertFnMatch(self, fn, values):  # values maps input x -> expected fn(x)
+        for x, y in values.items():
+            fn_x = fn(x)
+            self.assertEqual(fn_x, y, f"Fn has wrong value: fn({x}) = {fn_x} != {y}")
+
+    def test_piecewise_linear_fn(self):
+        self.assertFnMatch(
+            obt.PiecewiseLinearFn.from_string("1.0"), {0: 1, 100: 1, 500: 1, 1000: 1}
+        )
+        self.assertFnMatch(
+            obt.PiecewiseLinearFn.from_string("0:1,1000:0"),
+            {0: 1, 500: 0.5, 1000: 0, 2000: 0},
+        )
+        self.assertFnMatch(
+            obt.PiecewiseLinearFn.from_string("0:0,1000:1"),
+            {0: 0, 500: 0.5, 1000: 1, 2000: 1},
+        )
+        self.assertFnMatch(
+            obt.PiecewiseLinearFn.from_string("0:0,1000:1,2000:0"),
+            {0: 0, 500: 0.5, 1000: 1, 1500: 0.5, 2000: 0, 3000: 0},
+        )
+        )
diff --git a/fairseq/tests/test_plasma_utils.py b/fairseq/tests/test_plasma_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7286c6cd3aa6f50e498bbd530f02b6ce538cf546
--- /dev/null
+++ b/fairseq/tests/test_plasma_utils.py
@@ -0,0 +1,127 @@
+import contextlib
+import tempfile
+import unittest
+from io import StringIO
+
+import numpy as np
+
+from tests.utils import create_dummy_data, preprocess_lm_data, train_language_model
+
+try:
+    from pyarrow import plasma
+
+    from fairseq.data.plasma_utils import PlasmaStore, PlasmaView
+
+    PYARROW_AVAILABLE = True
+except ImportError:
+    PYARROW_AVAILABLE = False
+
+dummy_path = "dummy"  # placeholder passed as PlasmaView's second argument in the tests
+
+
+@unittest.skipUnless(PYARROW_AVAILABLE, "pyarrow (plasma) is not available")
+class TestPlasmaView(unittest.TestCase):
+    """Tests for PlasmaView against plasma stores started per test."""
+
+    def setUp(self) -> None:
+        self.tmp_file = tempfile.NamedTemporaryFile()  # noqa: P201
+        self.path = self.tmp_file.name
+        self.server = PlasmaStore.start(path=self.path, nbytes=10000)
+        self.client = plasma.connect(self.path, num_retries=10)
+
+    def tearDown(self) -> None:
+        self.client.disconnect()
+        self.tmp_file.close()
+        self.server.kill()
+
+    def test_two_servers_do_not_share_object_id_space(self):
+        data_server_1 = np.array([0, 1])
+        data_server_2 = np.array([2, 3])
+        with tempfile.NamedTemporaryFile() as server_1_path:
+            server = PlasmaStore.start(path=server_1_path.name, nbytes=10000)
+            self.addCleanup(server.kill)  # kill the store even if a check fails
+            arr1 = PlasmaView(
+                data_server_1, dummy_path, 1, plasma_path=server_1_path.name
+            )
+            assert len(arr1.client.list()) == 1
+            assert (arr1.array == data_server_1).all()
+            # Same key on the setUp store must not clash with the first store.
+            arr2 = PlasmaView(data_server_2, dummy_path, 1, plasma_path=self.path)
+            assert (arr2.array == data_server_2).all()
+            assert (arr1.array == data_server_1).all()
+
+    def test_hash_collision(self):
+        data_server_1 = np.array([0, 1])
+        data_server_2 = np.array([2, 3])
+        arr1 = PlasmaView(data_server_1, dummy_path, 1, plasma_path=self.path)
+        assert len(arr1.client.list()) == 1
+        arr2 = PlasmaView(data_server_2, dummy_path, 1, plasma_path=self.path)
+        assert len(arr1.client.list()) == 1
+        assert len(arr2.client.list()) == 1
+        # Same key as arr1, so arr2 exposes arr1's data rather than its own.
+        assert (arr2.array == data_server_1).all()
+        # New hash key based on tuples
+        arr3 = PlasmaView(
+            data_server_2, dummy_path, (1, 12312312312, None), plasma_path=self.path
+        )
+        assert (
+            len(arr2.client.list()) == 2
+        ), "No new object was created by using a novel hash key"
+        assert (
+            arr3.object_id in arr2.client.list()
+        ), "No new object was created by using a novel hash key"
+        assert (
+            arr3.object_id in arr3.client.list()
+        ), "No new object was created by using a novel hash key"
+        del arr3, arr2, arr1
+
+    @staticmethod
+    def _assert_view_equal(pv1, pv2):
+        np.testing.assert_array_equal(pv1.array, pv2.array)
+
+    def test_putting_same_array_twice(self):
+        """Re-using a hash key must not add objects; a new key must add one."""
+        data = np.array([4, 4, 4])
+        arr1 = PlasmaView(data, dummy_path, 1, plasma_path=self.path)
+        assert len(self.client.list()) == 1
+        # Same key again, with or without data: should not change contents of store.
+        arr1b = PlasmaView(data, dummy_path, 1, plasma_path=self.path)
+        arr1c = PlasmaView(None, dummy_path, 1, plasma_path=self.path)
+
+        assert len(self.client.list()) == 1
+        self._assert_view_equal(arr1, arr1b)
+        self._assert_view_equal(arr1, arr1c)
+        # A different key yields a new object id and adds a new entry.
+        PlasmaView(data, dummy_path, 2, plasma_path=self.path)
+        assert len(self.client.list()) == 2
+
+        new_client = plasma.connect(self.path)
+        assert len(new_client.list()) == 2  # new client can access same objects
+        new_client.disconnect()  # do not leave the extra connection open
+        assert isinstance(arr1.object_id, plasma.ObjectID)
+        del arr1b, arr1c
+
+    def test_plasma_store_full_raises(self):
+        with tempfile.NamedTemporaryFile() as new_path:
+            server = PlasmaStore.start(path=new_path.name, nbytes=10000)
+            self.addCleanup(server.kill)  # kill the store even if the check fails
+            with self.assertRaises(plasma.PlasmaStoreFull):
+                # 10000 float64 values (80000 bytes) exceed the 10000-byte store
+                PlasmaView(
+                    np.random.rand(10000, 1), dummy_path, 1, plasma_path=new_path.name
+                )
+
+    def test_object_id_overflow(self):
+        PlasmaView.get_object_id("", 2**21)  # only checks that this does not raise
+
+    def test_training_lm_plasma(self):
+        with contextlib.redirect_stdout(StringIO()):
+            with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir:
+                create_dummy_data(data_dir)
+                preprocess_lm_data(data_dir)
+                train_language_model(
+                    data_dir,
+                    "transformer_lm",
+                    ["--use-plasma-view", "--plasma-path", self.path],
+                    run_validation=True,
+                )
diff --git a/fairseq/tests/test_positional_encoding.py b/fairseq/tests/test_positional_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e38c4397dc17fdc9f2bb1fe4f80ebf9c82a0166
--- /dev/null
+++ b/fairseq/tests/test_positional_encoding.py
@@ -0,0 +1,63 @@
+import unittest
+
+import torch
+from fairseq.modules import RelPositionalEncoding
+import numpy as np
+
+
+class TestRelPositionalEncoding(unittest.TestCase):
+    """Compare RelPositionalEncoding results with hard-coded expected values."""
+
+    def setUp(self) -> None:
+        # Seed before sampling so the random input is the same on every run.
+        torch.manual_seed(0)
+        self.T, self.B, self.C = 3, 1, 2
+        self.sample = torch.randn(self.T, self.B, self.C)  # TBC
+        self.rel_pos_enc = RelPositionalEncoding(max_len=4, d_model=self.C)
+
+    def test_extend_pe(self):
+        """After ``extend_pe`` the ``pe`` attribute must hold the expected table."""
+        transposed = self.sample.transpose(0, 1)
+        self.rel_pos_enc.extend_pe(transposed)
+        expected = torch.tensor(
+            [
+                [
+                    [0.1411, -0.9900],
+                    [0.9093, -0.4161],
+                    [0.8415, 0.5403],
+                    [0.0000, 1.0000],
+                    [-0.8415, 0.5403],
+                    [-0.9093, -0.4161],
+                    [-0.1411, -0.9900],
+                ]
+            ]
+        )
+        actual = self.rel_pos_enc.pe
+
+        matches = np.allclose(
+            expected.cpu().detach().numpy(), actual.cpu().detach().numpy(), atol=1e-4
+        )
+        self.assertTrue(matches)
+
+    def test_forward(self):
+        """Calling the module on the sample must return the expected values."""
+        actual = self.rel_pos_enc(self.sample)
+        # Five values, identical to rows 1-5 of the table in test_extend_pe.
+        expected = torch.tensor(
+            [
+                [[0.9093, -0.4161]],
+                [[0.8415, 0.5403]],
+                [[0.0000, 1.0000]],
+                [[-0.8415, 0.5403]],
+                [[-0.9093, -0.4161]],
+            ]
+        )
+
+        matches = np.allclose(
+            actual.cpu().detach().numpy(), expected.cpu().detach().numpy(), atol=1e-4
+        )
+        self.assertTrue(matches)
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/fairseq/tests/test_reproducibility.py b/fairseq/tests/test_reproducibility.py
new file mode 100644
index 0000000000000000000000000000000000000000..b285593272151ed95d99313d371e2f12628face6
--- /dev/null
+++ b/fairseq/tests/test_reproducibility.py
@@ -0,0 +1,148 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import os
+import tempfile
+import unittest
+
+import torch
+
+from . import test_binaries
+
+
+class TestReproducibility(unittest.TestCase):
+    """Check that resumed training reproduces the stats of uninterrupted training."""
+
+    def _test_reproducibility(
+        self,
+        name,
+        extra_flags=None,
+        delta=0.0001,
+        resume_checkpoint="checkpoint1.pt",
+        max_epoch=3,
+    ):
+        """
+        Train once without interruption, then resume from *resume_checkpoint*
+        and check that the last logged train/valid stats agree within *delta*.
+
+        *extra_flags* are appended to the training flags of both runs and
+        *max_epoch* is passed as ``--max-epoch`` to both.
+        """
+        def get_last_log_stats_containing_string(log_records, search_string):
+            """Parse the last record whose message contains *search_string*."""
+            for log_record in log_records[::-1]:
+                if isinstance(log_record.msg, str) and search_string in log_record.msg:
+                    return json.loads(log_record.msg)
+
+        if extra_flags is None:
+            extra_flags = []
+
+        with tempfile.TemporaryDirectory(name) as data_dir:
+            with self.assertLogs() as logs:
+                test_binaries.create_dummy_data(data_dir)
+                test_binaries.preprocess_translation_data(data_dir)
+
+            # first run: train up to max_epoch without interruption
+            with self.assertLogs() as logs:
+                test_binaries.train_translation_model(
+                    data_dir,
+                    "fconv_iwslt_de_en",
+                    [
+                        "--dropout",
+                        "0.0",
+                        "--log-format",
+                        "json",
+                        "--log-interval",
+                        "1",
+                        "--max-epoch",
+                        str(max_epoch),
+                    ]
+                    + extra_flags,
+                )
+            train_log = get_last_log_stats_containing_string(logs.records, "train_loss")
+            valid_log = get_last_log_stats_containing_string(logs.records, "valid_loss")
+
+            # second run: resume from resume_checkpoint, renamed to checkpoint_last.pt
+            os.rename(
+                os.path.join(data_dir, resume_checkpoint),
+                os.path.join(data_dir, "checkpoint_last.pt"),
+            )
+            with self.assertLogs() as logs:
+                test_binaries.train_translation_model(
+                    data_dir,
+                    "fconv_iwslt_de_en",
+                    [
+                        "--dropout",
+                        "0.0",
+                        "--log-format",
+                        "json",
+                        "--log-interval",
+                        "1",
+                        "--max-epoch",
+                        str(max_epoch),
+                    ]
+                    + extra_flags,
+                )
+            train_res_log = get_last_log_stats_containing_string(
+                logs.records, "train_loss"
+            )
+            valid_res_log = get_last_log_stats_containing_string(
+                logs.records, "valid_loss"
+            )
+
+            # Both runs must end with the same train and valid statistics.
+            for k in ["train_loss", "train_ppl", "train_num_updates", "train_gnorm"]:
+                self.assertAlmostEqual(
+                    float(train_log[k]), float(train_res_log[k]), delta=delta
+                )
+            for k in [
+                "valid_loss",
+                "valid_ppl",
+                "valid_num_updates",
+                "valid_best_loss",
+            ]:
+                self.assertAlmostEqual(
+                    float(valid_log[k]), float(valid_res_log[k]), delta=delta
+                )
+
+    def test_reproducibility(self):
+        self._test_reproducibility("test_reproducibility")
+
+    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+    def test_reproducibility_fp16(self):
+        self._test_reproducibility(
+            "test_reproducibility_fp16",
+            ["--fp16", "--fp16-init-scale", "4096"],
+            delta=0.011,
+        )
+
+    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+    def test_reproducibility_memory_efficient_fp16(self):
+        self._test_reproducibility(
+            "test_reproducibility_memory_efficient_fp16",
+            ["--memory-efficient-fp16", "--fp16-init-scale", "4096"],
+        )
+
+    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
+    def test_reproducibility_amp(self):
+        self._test_reproducibility(
+            "test_reproducibility_amp",
+            ["--amp", "--fp16-init-scale", "4096"],
+            delta=0.011,
+        )
+
+    def test_mid_epoch_reproducibility(self):
+        """Resume from a mid-epoch checkpoint, saving every 3 updates."""
+        self._test_reproducibility(
+            "test_mid_epoch_reproducibility",
+            ["--save-interval-updates", "3"],
+            resume_checkpoint="checkpoint_1_3.pt",
+            max_epoch=1,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/fairseq/tests/test_resampling_dataset.py b/fairseq/tests/test_resampling_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccb53a253ce6ca0d8e972adfa708144b4299b3cb
--- /dev/null
+++ b/fairseq/tests/test_resampling_dataset.py
@@ -0,0 +1,103 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import collections
+import unittest
+
+import numpy as np
+from fairseq.data import ListDataset, ResamplingDataset
+
+
+class TestResamplingDataset(unittest.TestCase):
+    """Check ordering and sampling frequencies of ResamplingDataset."""
+
+    def setUp(self):
+        self.strings = ["ab", "c", "def", "ghij"]
+        self.weights = [4.0, 2.0, 7.0, 1.5]
+        self.size_ratio = 2
+        # Sizes are the string lengths.
+        sizes = np.array([len(s) for s in self.strings])
+        self.dataset = ListDataset(self.strings, sizes)
+
+    def _test_common(self, resampling_dataset, iters):
+        """Run ``iters`` epochs and summarise ordering and sampling frequencies.
+
+        Returns a dict with ``ordered_by_size`` (False as soon as an epoch
+        yields a size smaller than the previous one) and
+        ``max_distribution_diff`` (largest gap between the observed and
+        weight-implied frequency of any string).
+        """
+        assert len(self.dataset) == len(self.strings) == len(self.weights)
+        assert len(resampling_dataset) == self.size_ratio * len(self.strings)
+
+        ordered_by_size = True
+        counts = collections.defaultdict(int)
+        total = 0
+
+        for epoch in range(iters):
+            resampling_dataset.set_epoch(epoch)
+            order = resampling_dataset.ordered_indices()
+            assert len(order) == len(resampling_dataset)
+
+            last_size = -1
+            for index in order:
+                size = resampling_dataset.size(index)
+                # Within an epoch an index must always map to the same sequence.
+                assert resampling_dataset[index] == resampling_dataset[index]
+                # The reported size must be the length of that sequence.
+                assert size == len(resampling_dataset[index])
+
+                counts[resampling_dataset[index]] += 1
+                total += 1
+                if last_size > size:
+                    ordered_by_size = False
+                last_size = size
+
+        assert set(counts.keys()) == set(self.strings)
+        weight_sum = sum(self.weights)
+        max_diff = 0.0
+        for string, weight in zip(self.strings, self.weights):
+            observed = counts[string] / total
+            max_diff = max(max_diff, abs(weight / weight_sum - observed))
+
+        return {"ordered_by_size": ordered_by_size, "max_distribution_diff": max_diff}
+
+    def test_resampling_dataset_batch_by_size_false(self):
+        """Without batch_by_size the indices are not expected to be size-sorted."""
+        resampled = ResamplingDataset(
+            self.dataset,
+            self.weights,
+            size_ratio=self.size_ratio,
+            batch_by_size=False,
+            seed=0,
+        )
+        summary = self._test_common(resampled, iters=1000)
+
+        # For batch_by_size = False, the batches should be returned in
+        # arbitrary order of size.
+        assert not summary["ordered_by_size"]
+        # Allow tolerance in distribution error of 2%.
+        assert summary["max_distribution_diff"] < 0.02
+
+    def test_resampling_dataset_batch_by_size_true(self):
+        """With batch_by_size the indices are expected in increasing size order."""
+        resampled = ResamplingDataset(
+            self.dataset,
+            self.weights,
+            size_ratio=self.size_ratio,
+            batch_by_size=True,
+            seed=0,
+        )
+        summary = self._test_common(resampled, iters=1000)
+
+        # For batch_by_size = True, the batches should be returned in
+        # increasing order of size.
+        assert summary["ordered_by_size"]
+        # Allow tolerance in distribution error of 2%.
+        assert summary["max_distribution_diff"] < 0.02
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/fairseq/tests/test_roberta.py b/fairseq/tests/test_roberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..14f01f9cb7fe252511037ef9d8165faeeaee44f6
--- /dev/null
+++ b/fairseq/tests/test_roberta.py
@@ -0,0 +1,344 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import functools
+import unittest
+from typing import Any, Dict, Sequence
+
+import fairseq
+import fairseq.options
+import fairseq.tasks
+import torch
+from tests.utils import dummy_dictionary
+
+VOCAB_SIZE = 100
+
+
+@fairseq.tasks.register_task("fake_task")
+class FakeTask(fairseq.tasks.LegacyFairseqTask):
+    """Task stub ("fake_task") whose source and target dictionary are one object."""
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.dictionary = dummy_dictionary(VOCAB_SIZE - 4)
+        assert len(self.dictionary) == VOCAB_SIZE
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    target_dictionary = source_dictionary  # same property: both sides share the dictionary
+
+
+@functools.lru_cache()
+def get_toy_model(
+    device: str,
+    architecture: str = "roberta_enc_dec",
+    **extra_args: Any,
+):
+    """Return a cached ``(task, model)`` pair for a tiny test model.
+
+    ``extra_args`` override the defaults below; the model is moved to CUDA
+    when ``device`` is "gpu". The torch RNG is seeded with 0 before building.
+    """
+    assert device in ("gpu", "cpu")
+    sizes = {
+        "encoder_layers": 3,
+        "encoder_embed_dim": 12,
+        "encoder_ffn_embed_dim": 14,
+        "encoder_attention_heads": 4,
+        "decoder_layers": 3,
+        "decoder_embed_dim": 12,
+        "decoder_ffn_embed_dim": 14,
+        "decoder_attention_heads": 4,
+    }
+    # Disable dropout so we have comparable tests.
+    dropouts = ("dropout", "attention_dropout", "activation_dropout")
+    no_dropout = dict.fromkeys(dropouts + ("encoder_layerdrop",), 0)
+    required = {"tokens_per_sample": 256, "data": "/tmp/test_roberta"}
+    kwargs = {"arch": architecture, **sizes, **no_dropout, **required}
+    kwargs.update(extra_args)
+    fake_task = FakeTask(kwargs)
+    args = fairseq.options.get_args(
+        task="online_backtranslation",
+        mono_langs="en,ro",
+        valid_lang_pairs="en-ro",
+        **kwargs,
+    )
+    torch.manual_seed(0)
+    model = fake_task.build_model(args)
+    if device == "gpu":
+        model.cuda()
+    return fake_task, model
+
+
+def mk_sample(
+    lang: str, device: str, tok: Sequence[int] = None, batch_size: int = 2
+) -> Dict[str, Any]:
+    """Build a sample dict holding ``batch_size`` copies of one token sequence.
+
+    An empty or missing ``tok`` selects a built-in sentence ("en" or any other lang).
+    """
+    assert device in ("gpu", "cpu")
+    if not tok:
+        default_en = [10, 11, 12, 13, 14, 15, 2]
+        default_other = [20, 21, 22, 23, 24, 25, 26, 27, 2]
+        tok = default_en if lang == "en" else default_other
+    # Repeat the same row batch_size times.
+    row = torch.tensor(tok, dtype=torch.long)
+    batch = torch.stack([row] * batch_size)
+    if device == "gpu":
+        batch = batch.cuda()
+    lengths = torch.full((batch_size,), len(tok), dtype=torch.long, device=batch.device)
+    net_input = {
+        "src_tokens": batch,
+        "prev_output_tokens": batch,
+        "src_lengths": lengths,
+    }
+    return {"net_input": net_input, "target": batch[:, 1:]}
+
+
+def cpu_gpu(fn):
+    """Decorator: call ``fn(self, "cpu")``, then ``fn(self, "gpu")`` if CUDA is available."""
+    def run_on_devices(self):
+        fn(self, "cpu")
+        if torch.cuda.is_available():
+            fn(self, "gpu")
+    return run_on_devices
+
+
+def architectures(fn):
+    """Decorator: call ``fn(self, arch)`` once per architecture under test."""
+    def run_for_each_arch(self):
+        for name in ("roberta_enc_dec", "transformer"):
+            fn(self, name)
+    return run_for_each_arch
+
+
+class RobertaTest(unittest.TestCase):
+    def assertTensorEqual(self, t1, t2, delta: float = 1e-6):
+        """Assert equal sizes and element-wise equality (exact when delta == 0)."""
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        # Boolean mask of mismatching elements; none may be set.
+        differs = t1.ne(t2) if delta == 0.0 else (t2 - t1).abs() > delta
+        self.assertEqual(differs.long().sum(), 0)
+
+    def assertSharing(self, model, link_groups: Sequence[Sequence[str]]):
+        ids = {}  # id() of each group's parameter -> the group it belongs to
+        for group in link_groups:
+            group_ids = {name: id(params(model, name)) for name in group}
+            shared_id = group_ids[group[0]]
+            self.assertEqual(group_ids, {name: shared_id for name in group})  # one object per group
+            self.assertNotIn(shared_id, ids)  # and not shared with an earlier group
+            ids[shared_id] = group
+
+    def test_roberta_shared_params(self):
+        """Check weight tying between the token embedding and the LM head.
+
+        By default both names must resolve to the same parameter object; with
+        ``untie_weights_roberta=True`` they must be two distinct objects.
+        """
+        embedding = "encoder.sentence_encoder.embed_tokens.weight"
+        lm_head = "encoder.lm_head.weight"
+        # Default configuration: one shared parameter.
+        _, tied = get_toy_model("cpu", architecture="roberta")
+        tied_groups = [[embedding, lm_head]]
+        self.assertSharing(tied, tied_groups)
+
+        # Untied configuration: two separate parameters.
+        _, untied = get_toy_model(
+            "cpu", architecture="roberta", untie_weights_roberta=True
+        )
+        untied_groups = [
+            [embedding],
+            [lm_head],
+        ]
+        self.assertSharing(untied, untied_groups)
+
+    def test_roberta_enc_dec_shared_params(self):
+        # Default: 3 distinct weights (encoder embedding, decoder embedding, output projection)
+        _, enc_dec = get_toy_model("cpu", architecture="roberta_enc_dec")
+        self.assertSharing(
+            enc_dec,
+            [
+                ["encoder.embed_tokens.weight"],
+                ["decoder.embed_tokens.weight"],
+                ["decoder.output_projection.weight"],
+            ],
+        )
+
+        # share_decoder_input_output_embed: 2 distinct weights, the decoder embedding is tied to its output projection
+        _, enc_dec = get_toy_model(
+            "cpu", architecture="roberta_enc_dec", share_decoder_input_output_embed=True
+        )
+        self.assertSharing(
+            enc_dec,
+            [
+                ["encoder.embed_tokens.weight"],
+                [
+                    "decoder.embed_tokens.weight",
+                    "decoder.output_projection.weight",
+                ],
+            ],
+        )
+
+        # share_all_embeddings: a single weight shared by all three names
+        _, enc_dec = get_toy_model(
+            "cpu", architecture="roberta_enc_dec", share_all_embeddings=True
+        )
+        self.assertSharing(
+            enc_dec,
+            [
+                [
+                    "encoder.embed_tokens.weight",
+                    "decoder.embed_tokens.weight",
+                    "decoder.output_projection.weight",
+                ]
+            ],
+        )
+
+    def test_roberta_max_positions_is_correctly_set(self):
+        """All max-position values equal 256 and a 256-token input is accepted."""
+        device = "cpu"
+        _, model = get_toy_model(device)
+        limit = model.max_decoder_positions()
+        self.assertEqual(limit, 256)
+        self.assertEqual(limit, model.decoder.max_positions())
+        self.assertEqual(limit, model.encoder.max_positions())
+        self.assertEqual(limit, model.encoder.embed_positions.max_positions)
+        # A sentence of exactly `limit` tokens must pass through the model.
+        net_input = mk_sample("en", device, [31] * limit, batch_size=1)["net_input"]
+        self.assertEqual(list(net_input["src_lengths"]), [limit])
+        self.assertEqual(len(net_input["src_tokens"][0]), limit)
+        logits, _ = model.forward(**net_input)
+        self.assertEqual(logits.shape, (1, limit, VOCAB_SIZE))
+
+    @cpu_gpu
+    def test_roberta_forward_backward(self, device: str):
+        """A batched forward pass yields (batch, length, vocab) logits and backprop runs."""
+        _, model = get_toy_model(device)
+        net_input = mk_sample("en", device)["net_input"]
+        batch_size, seq_len = net_input["src_tokens"].shape
+
+        # Forward pass: one logit vector per input position.
+        logits, _ = model(**net_input)
+        self.assertEqual(logits.shape, (batch_size, seq_len, VOCAB_SIZE))
+
+        # Backward pass on the summed logits must complete.
+        logits.sum().backward()
+
+    @cpu_gpu
+    def test_roberta_forward_backward_bs1(self, device: str):
+        """Two batch-size-1 forward passes (en, then ro) feed a single backward pass."""
+        _, model = get_toy_model(device)
+        en_out, _ = model.forward(**mk_sample("en", device, batch_size=1)["net_input"])
+        loss = en_out.sum()
+        ro_out, _ = model.forward(**mk_sample("ro", device, batch_size=1)["net_input"])
+        # Accumulate in place, then backpropagate through both passes.
+        loss += ro_out.sum()
+        loss.backward()
+
+    @cpu_gpu
+    def test_roberta_batching(self, device: str):
+        """
+        Check that a batch of size 2 gives, for each of its rows, the same result
+        as a batch of size 1.
+        """
+        _, model = get_toy_model(device)
+        single = mk_sample("en", device, batch_size=1)["net_input"]
+        pair = mk_sample("en", device, batch_size=2)["net_input"]
+        slen = single["src_lengths"][0]
+        with torch.no_grad():
+            z = model.encoder.forward(single["src_tokens"], single["src_lengths"])
+            z = z["encoder_out"][-1]
+            logits, _ = model.forward(**single)
+
+            # Encode the pair with its own lengths (one entry per row), not the single's.
+            z2 = model.encoder.forward(pair["src_tokens"], pair["src_lengths"])
+            z2 = z2["encoder_out"][-1]
+            logits2, _ = model.forward(**pair)
+
+        # Last encoder output: (slen, batch, 12); 12 matches encoder_embed_dim in get_toy_model.
+        self.assertEqual(z.shape, (slen, 1, 12))
+        self.assertEqual(z2.shape, (slen, 2, 12))
+        # Both rows of the pair are identical and match the single-row result.
+        self.assertTensorEqual(logits2[0], logits2[1])
+        self.assertTensorEqual(logits[0], logits2[0])
+
+    @cpu_gpu
+    def test_roberta_incremental_decoder(self, device: str):
+        """
+        Check that incremental decoding yields the same result as non-incremental decoding.
+        """
+        task, model = get_toy_model(device)
+
+        en_sample = mk_sample("en", device)
+        en_tokens = en_sample["net_input"]["src_tokens"]
+        ro_sample = mk_sample("ro", device)
+        ro_tokens = ro_sample["net_input"]["src_tokens"]
+
+        en_enc = model.encoder.forward(
+            en_tokens, src_lengths=en_sample["net_input"]["src_lengths"]
+        )
+        (bs, tgt_len) = ro_tokens.shape
+
+        # Decode the whole target in one call (no incremental state)
+        ro_dec, _ = model.decoder.forward(ro_tokens, encoder_out=en_enc)
+        self.assertEqual(ro_dec.shape, (bs, tgt_len, VOCAB_SIZE))
+        self.assertTensorEqual(ro_dec[0], ro_dec[1])
+
+        # Decode step by step, passing the growing prefix and one shared state dict
+        inc_state = {}
+        ro_dec_inc = []
+        for i in range(tgt_len):
+            ro, _ = model.decoder.forward(
+                ro_tokens[:, : i + 1], encoder_out=en_enc, incremental_state=inc_state
+            )
+            self.assertEqual(ro.shape, (bs, 1, VOCAB_SIZE))
+            ro_dec_inc.append(ro)
+
+        for i in range(tgt_len):
+            # Intra-batch: both rows hold the same sentence, so their outputs must match
+            self.assertTensorEqual(ro_dec_inc[i][0], ro_dec_inc[i][1])
+            # Incremental vs non-incremental: step i must equal position i of the full decode
+            self.assertTensorEqual(ro_dec_inc[i][:, 0], ro_dec[:, i])
+
+    @cpu_gpu
+    def test_regularize_for_adaprune_in_roberta(self, device: str):
+        """Backprop runs through the summed output plus the head and FFN loss terms."""
+        _, model = get_toy_model(
+            device=device,
+            architecture="roberta_base",
+            mha_reg_scale_factor=0.000375,
+            ffn_reg_scale_factor=0.000375,
+        )
+        net_input = mk_sample("en", device, batch_size=1)["net_input"]
+        task_out, _ = model.forward(**net_input)
+        head_penalty = model._get_adaptive_head_loss()
+        ffn_penalty = model._get_adaptive_ffn_loss()
+        (task_out.sum() + head_penalty + ffn_penalty).backward()
+
+    @cpu_gpu
+    def test_ffn_prune_for_adaprune_in_roberta(self, device: str):
+        """Pruning two fc1 units per layer shrinks fc1; the model must still run forward."""
+        _, model = get_toy_model(
+            device=device,
+            architecture="roberta_base",
+        )
+        sample = mk_sample("en", device, batch_size=1)
+        for layer in model.encoder.sentence_encoder.layers:
+            width_before = layer.fc1.out_features
+            layer._prune_fc_layer(remove_index=layer._get_fc_rank(remove_num=2))
+            self.assertEqual(layer.fc1.out_features, width_before - 2)
+        # Smoke check: the pruned model must still complete a forward pass.
+        model.forward(**sample["net_input"])
+
+
+def params(model, name):
+    """Resolve a dotted attribute path such as ``"encoder.fc.weight"`` on ``model``."""
+    target = model
+    for attr in name.split("."):
+        target = getattr(target, attr)
+    return target
diff --git a/fairseq/tests/test_rotary_positional_embedding.py b/fairseq/tests/test_rotary_positional_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c44e86d5dcc2f4bb36b73a598700d44bbca791f
--- /dev/null
+++ b/fairseq/tests/test_rotary_positional_embedding.py
@@ -0,0 +1,85 @@
+import torch
+import numpy as np
+import unittest
+from fairseq.modules.rotary_positional_embedding import apply_rotary_pos_emb
+from fairseq.modules import RotaryPositionalEmbedding
+
+
+class TestRotaryPositionalEmbedding(unittest.TestCase):
+    """Tests RotaryPositionalEmbedding and apply_rotary_pos_emb on a tiny TBC input."""
+
+    def setUp(self) -> None:
+        self.T = 3
+        self.B = 1
+        self.C = 2
+        torch.manual_seed(0)
+        self.sample = torch.randn(self.T, self.B, self.C)  # TBC
+        self.rope_pos_emd = RotaryPositionalEmbedding(dim=self.C)
+
+    def _assert_np_close(self, expected, actual):
+        """Assert two tensors agree (as numpy arrays) within atol=1e-4."""
+        self.assertTrue(
+            np.allclose(
+                expected.cpu().detach().numpy(),
+                actual.cpu().detach().numpy(),
+                atol=1e-4,
+            )
+        )
+
+    def test_forward(self):
+        """The module's cos/sin outputs for T=3 match the stored reference values."""
+        expected_cos = torch.tensor(
+            [[[[1.0000, 1.0000]]], [[[0.5403, 0.5403]]], [[[-0.4161, -0.4161]]]]
+        )
+        expected_sin = torch.tensor(
+            [[[[0.0000, 0.0000]]], [[[0.8415, 0.8415]]], [[[0.9093, 0.9093]]]]
+        )
+        cos, sin = self.rope_pos_emd(self.sample, self.T)
+        self._assert_np_close(expected_cos, cos)
+        self._assert_np_close(expected_sin, sin)
+
+    def test_apply_rotary_pos_emb(self):
+        """Passing the sample as both query and key gives the reference values twice."""
+        cos, sin = self.rope_pos_emd(self.sample, self.T)
+        query = self.sample.view(self.T, self.B, 1, self.C)
+        expected_query = torch.tensor(
+            [[[[1.5410, -0.2934]]], [[[-1.6555, -1.5263]]], [[[1.7231, -0.4041]]]]
+        )
+        new_query, new_key = apply_rotary_pos_emb(query, query, cos, sin)
+        self._assert_np_close(expected_query, new_query)
+        self._assert_np_close(expected_query, new_key)
+
+    def test_jit_compile_rope_module(self):
+        """TorchScript versions of the module and function match the eager results."""
+        module_scripted = torch.jit.script(self.rope_pos_emd)
+        apply_rotary_scripted = torch.jit.script(apply_rotary_pos_emb)
+        # Test several different lengths
+        for T in [3, 5, 10]:
+            sample = torch.randn(T, self.B, self.C)
+            query = sample.view(T, self.B, 1, self.C)
+
+            # Eager reference.
+            cos_original, sin_original = self.rope_pos_emd(sample, T)
+            new_query, new_key = apply_rotary_pos_emb(
+                query, query, cos_original, sin_original
+            )
+
+            # Same computation through the scripted module and function.
+            cos_scripted, sin_scripted = module_scripted(sample, T)
+            new_query_scripted, new_key_scripted = apply_rotary_scripted(
+                query, query, cos_scripted, sin_scripted
+            )
+
+            # Ensure the outputs are the same
+            outputs = [
+                (cos_original, cos_scripted),
+                (sin_original, sin_scripted),
+                (new_query, new_query_scripted),
+                (new_key, new_key_scripted),
+            ]
+            for eager, scripted in outputs:
+                self.assertTrue(torch.allclose(eager, scripted))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/test_sequence_generator.py b/fairseq/tests/test_sequence_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e42df0e56ff4075e908b255192750cf2f025c00
--- /dev/null
+++ b/fairseq/tests/test_sequence_generator.py
@@ -0,0 +1,744 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import math
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+import tests.utils as test_utils
+from fairseq import search
+from fairseq.data.dictionary import Dictionary
+from fairseq.models.transformer import TransformerModel
+from fairseq.ngram_repeat_block import NGramRepeatBlock
+from fairseq.sequence_generator import EnsembleModel, SequenceGenerator
+from fairseq.tasks.fairseq_task import LegacyFairseqTask
+
+DEFAULT_TEST_VOCAB_SIZE = 100
+
+
+class DummyTask(LegacyFairseqTask):
+    """Minimal task whose source and target dictionaries are one dummy dictionary."""
+    def __init__(self, args):
+        super().__init__(args)
+        self.dictionary = get_dummy_dictionary()
+        if getattr(self.args, "ctc", False):
+            self.dictionary.add_symbol("<ctc_blank>")
+        self.src_dict = self.tgt_dict = self.dictionary
+
+    @property
+    def source_dictionary(self):
+        return self.src_dict
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
+
+
+def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE):
+    """Return a Dictionary with the symbols "0" .. str(vocab_size - 1) added, each with n=1000."""
+    dummy_dict = Dictionary()
+    for index in range(vocab_size):  # plain index; avoids shadowing the builtin `id`
+        dummy_dict.add_symbol(str(index), n=1000)
+    return dummy_dict
+
+
+def get_dummy_task_and_parser():
+    """
+    To build a fairseq model we need a dummy parser and task. This function
+    creates both to facilitate model/criterion tests.
+
+    Note: DummyTask (defined above) is used as the dummy task. You may want
+    to use another task by providing another function.
+    """
+    parser = argparse.ArgumentParser(
+        description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS
+    )
+    DummyTask.add_args(parser)
+    args = parser.parse_args([])
+    task = DummyTask.setup_task(args)
+    return task, parser
+
+
+class TestJitSequenceGeneratorBase(unittest.TestCase):
+    def setUp(self):
+        # Fixture: dummy task, a random two-sentence batch and a small Transformer.
+        self.task, self.parser = get_dummy_task_and_parser()
+        eos = self.task.tgt_dict.eos()
+        src_tokens = torch.randint(3, 50, (2, 10)).long()
+        src_tokens = torch.cat((src_tokens, torch.LongTensor([[eos], [eos]])), -1)
+        src_lengths = torch.LongTensor([2, 10])
+        net_input = {"src_tokens": src_tokens, "src_lengths": src_lengths}
+        self.sample = {"net_input": net_input}
+        TransformerModel.add_args(self.parser)
+        args = self.parser.parse_args([])
+        args.encoder_layers, args.decoder_layers = 2, 1
+        self.transformer_model = TransformerModel.build_model(args, self.task)
+
+    def assertOutputEqual(self, hypo, pos_probs):
+        """Size check only: one positional score per probability and per token (numel() is an int)."""
+        pos_scores = torch.FloatTensor(pos_probs).log()
+        self.assertTensorSizeEqual(hypo["positional_scores"], pos_scores)
+        self.assertEqual(pos_scores.numel(), hypo["tokens"].numel())
+
+    def assertTensorSizeEqual(self, t1, t2):  # both arguments must be tensors
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+
+    def assertAlmostEqual(self, t1, t2):  # tensor version: same size, max abs diff < 1e-4
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertLess((t1 - t2).abs().max(), 1e-4)
+
+    def assertTensorEqual(self, t1, t2):  # same size and no differing element
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertEqual(t1.ne(t2).long().sum(), 0)
+
+    def assertHypoEqual(self, h1, h2):
+        "Check two hypos are equal"
+        self.assertTensorEqual(h1["tokens"], h2["tokens"])
+        self.assertAlmostEqual(h1["positional_scores"], h2["positional_scores"])
+        self.assertLess(abs(h1["score"] - h2["score"]), 1e-6)
+        self.assertAlmostEqual(h1["attention"], h2["attention"])
+
+    def _test_save_and_load(self, scripted_module):
+        with tempfile.NamedTemporaryFile() as f:  # save to a temp file, then load it back
+            scripted_module.save(f.name)
+            torch.jit.load(f.name)
+
+
+JIT_MSG = "Targeting OSS scriptability for the 1.6 release"
+
+
+# Compare (major, minor) numerically: as strings, "1.10.0" sorts before "1.6.0".
+@unittest.skipIf(
+    tuple(int(part) for part in torch.__version__.split(".")[:2]) < (1, 6), JIT_MSG
+)
+class TestJitSequenceGenerator(TestJitSequenceGeneratorBase):
+    """TorchScript export of the model, the generator and the ensemble wrapper."""
+
+    def test_export_transformer(self):
+        torch.jit.script(self.transformer_model)
+
+    def test_ensemble_sequence_generator(self):
+        generator = SequenceGenerator(
+            [self.transformer_model],
+            self.task.tgt_dict,
+            beam_size=2,
+            no_repeat_ngram_size=2,
+            max_len_b=10,
+        )
+        self._test_save_and_load(torch.jit.script(generator))
+
+    def test_export_ensemble_model(self):
+        torch.jit.script(EnsembleModel([self.transformer_model]))
+
+
+class TestExportSearch(unittest.TestCase):
+    """Each search strategy must compile with torch.jit.script."""
+
+    def setUp(self):
+        task, _parser = get_dummy_task_and_parser()
+        self.tgt_dict = task.tgt_dict
+        self.min_top1_prob = 0.4
+
+    def test_export_diverse_bs(self):
+        """DiverseBeamSearch with two groups."""
+        strategy = search.DiverseBeamSearch(
+            self.tgt_dict, num_groups=2, diversity_strength=0.0
+        )
+        torch.jit.script(strategy)
+
+    def test_export_sampling(self):
+        """Sampling with top-p set to half of min_top1_prob."""
+        top_p = self.min_top1_prob / 2.0
+        torch.jit.script(search.Sampling(self.tgt_dict, sampling_topp=top_p))
+
+    def test_export_diverse_siblings_search(self):
+        """DiverseSiblingsSearch with diversity_rate=0.5."""
+        strategy = search.DiverseSiblingsSearch(self.tgt_dict, diversity_rate=0.5)
+        torch.jit.script(strategy)
+
+
+class TestSequenceGeneratorBase(unittest.TestCase):
+    """Assertion helpers for hypothesis dicts ("tokens", "positional_scores", "score")."""
+    def assertHypoTokens(self, hypo, tokens):
+        self.assertTensorEqual(hypo["tokens"], torch.LongTensor(tokens))
+
+    def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0):
+        """Compare log(pos_probs), and their sum (optionally / len ** lenpen), to hypo."""
+        pos_scores = torch.FloatTensor(pos_probs).log()
+        self.assertAlmostEqual(hypo["positional_scores"], pos_scores)
+        self.assertEqual(pos_scores.numel(), hypo["tokens"].numel())
+        target = pos_scores.sum() / (pos_scores.numel() ** lenpen if normalized else 1)
+        self.assertLess(abs(target - hypo["score"]), 1e-6)
+
+    def assertAlmostEqual(self, t1, t2):
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertLess(torch.abs(t1 - t2).max(), 1e-4)
+
+    def assertTensorEqual(self, t1, t2):
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertEqual((t1 != t2).long().sum(), 0)
+
+
+class TestSequenceGenerator(TestSequenceGeneratorBase):
+    def setUp(self):
+        (
+            self.tgt_dict,
+            self.w1,  # w1 / w2: the two word tokens the expected hypotheses are built from
+            self.w2,
+            src_tokens,
+            src_lengths,
+            self.model,
+        ) = test_utils.sequence_generator_setup()
+        self.sample = {  # generator input shared by the tests of this class
+            "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths}
+        }
+
+    def test_with_normalization(self):
+        """Default settings: check tokens and length-normalized scores of every beam."""
+        generator = SequenceGenerator([self.model], self.tgt_dict, beam_size=2)
+        hypos = generator.forward(self.sample)
+        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
+        # (sentence, beam, expected tokens, expected per-position probabilities)
+        expected = [
+            (0, 0, [w1, eos], [0.9, 1.0]),
+            (0, 1, [w2, w1, w2, eos], [0.1, 0.9, 0.9, 1.0]),
+            (1, 0, [w1, w2, w1, eos], [0.7, 0.4, 0.4, 1.0]),
+            (1, 1, [w1, w2, eos], [0.7, 0.4, 0.6]),
+        ]
+        for sent, beam, tokens, probs in expected:
+            hypo = hypos[sent][beam]
+            self.assertHypoTokens(hypo, tokens)
+            self.assertHypoScore(hypo, probs)
+
+    def test_without_normalization(self):
+        """With normalize_scores=False the un-normalized summed scores are checked."""
+        # Sentence 1: unchanged from the normalized case
+        # Sentence 2: beams swap order
+        generator = SequenceGenerator(
+            [self.model], self.tgt_dict, beam_size=2, normalize_scores=False
+        )
+        hypos = generator.forward(self.sample)
+        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
+        # (sentence, beam, expected tokens, expected per-position probabilities)
+        expected = [
+            (0, 0, [w1, eos], [0.9, 1.0]),
+            (0, 1, [w2, w1, w2, eos], [0.1, 0.9, 0.9, 1.0]),
+            (1, 0, [w1, w2, eos], [0.7, 0.4, 0.6]),
+            (1, 1, [w1, w2, w1, eos], [0.7, 0.4, 0.4, 1.0]),
+        ]
+        for sent, beam, tokens, probs in expected:
+            hypo = hypos[sent][beam]
+            self.assertHypoTokens(hypo, tokens)
+            self.assertHypoScore(hypo, probs, normalized=False)
+
+    def test_with_lenpen_favoring_short_hypos(self):
+        """len_penalty=0.6: the shorter hypothesis is ranked first for both sentences."""
+        lenpen = 0.6
+        generator = SequenceGenerator(
+            [self.model], self.tgt_dict, beam_size=2, len_penalty=lenpen
+        )
+        hypos = generator.forward(self.sample)
+        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
+        # (sentence, beam, expected tokens, expected per-position probabilities)
+        expected = [
+            (0, 0, [w1, eos], [0.9, 1.0]),
+            (0, 1, [w2, w1, w2, eos], [0.1, 0.9, 0.9, 1.0]),
+            (1, 0, [w1, w2, eos], [0.7, 0.4, 0.6]),
+            (1, 1, [w1, w2, w1, eos], [0.7, 0.4, 0.4, 1.0]),
+        ]
+        for sent, beam, tokens, probs in expected:
+            hypo = hypos[sent][beam]
+            self.assertHypoTokens(hypo, tokens)
+            self.assertHypoScore(hypo, probs, lenpen=lenpen)
+
+    def test_with_lenpen_favoring_long_hypos(self):
+        """len_penalty=5.0: the longer hypothesis is ranked first for both sentences."""
+        lenpen = 5.0
+        generator = SequenceGenerator(
+            [self.model], self.tgt_dict, beam_size=2, len_penalty=lenpen
+        )
+        hypos = generator.forward(self.sample)
+        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
+        # (sentence, beam, expected tokens, expected per-position probabilities)
+        expected = [
+            (0, 0, [w2, w1, w2, eos], [0.1, 0.9, 0.9, 1.0]),
+            (0, 1, [w1, eos], [0.9, 1.0]),
+            (1, 0, [w1, w2, w1, eos], [0.7, 0.4, 0.4, 1.0]),
+            (1, 1, [w1, w2, eos], [0.7, 0.4, 0.6]),
+        ]
+        for sent, beam, tokens, probs in expected:
+            hypo = hypos[sent][beam]
+            self.assertHypoTokens(hypo, tokens)
+            self.assertHypoScore(hypo, probs, lenpen=lenpen)
+
+    def test_maxlen(self):
+        """max_len_b=2: every expected hypothesis has at most three tokens, eos included."""
+        generator = SequenceGenerator(
+            [self.model], self.tgt_dict, beam_size=2, max_len_b=2
+        )
+        hypos = generator.forward(self.sample)
+        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
+        # (sentence, beam, expected tokens, expected per-position probabilities)
+        expected = [
+            (0, 0, [w1, eos], [0.9, 1.0]),
+            (0, 1, [w2, w2, eos], [0.1, 0.1, 0.6]),
+            (1, 0, [w1, w2, eos], [0.7, 0.4, 0.6]),
+            (1, 1, [w2, w2, eos], [0.3, 0.9, 0.01]),
+        ]
+        for sent, beam, tokens, probs in expected:
+            hypo = hypos[sent][beam]
+            self.assertHypoTokens(hypo, tokens)
+            self.assertHypoScore(hypo, probs)
+
+    def test_encoder_with_different_output_len(self):
+        """Every hypothesis keeps an attention entry when TestReshapingModel is used."""
+        args = self.model.encoder.args
+        task = test_utils.TestTranslationTask.setup_task(
+            args, self.tgt_dict, self.tgt_dict
+        )
+        reshaping_model = test_utils.TestReshapingModel.build_model(args, task)
+        generator = SequenceGenerator(
+            [reshaping_model], self.tgt_dict, beam_size=2, max_len_b=2
+        )
+        hypos = generator.forward(self.sample)
+        for hypo in (hypos[sent][beam] for sent in (0, 1) for beam in (0, 1)):
+            assert hypo["attention"] is not None
+
+    def test_generation_with_additional_input(self):
+        """Supply an extra ``net_input`` entry and check sentence 1's best beam."""
+        args = self.model.encoder.args
+        task = test_utils.TestTranslationTask.setup_task(
+            args, self.tgt_dict, self.tgt_dict
+        )
+        add_input_model = test_utils.TestAdditionalInputModel.build_model(args, task)
+        generator = SequenceGenerator([add_input_model], self.tgt_dict, beam_size=2)
+        # Copy net_input itself: a shallow sample.copy() would still mutate self.sample.
+        net_input = dict(self.sample["net_input"])
+        net_input["fancy_other_input"] = net_input["src_tokens"]
+        hypos = generator.forward({"net_input": net_input})
+        self.assertHypoTokens(hypos[0][0], [self.w1, self.tgt_dict.eos()])
+        self.assertHypoScore(hypos[0][0], [0.9, 1.0])
+
+
+@unittest.skipUnless(torch.cuda.is_available(), "")
+class TestRepeatNgramBlocking(TestSequenceGeneratorBase):
+    @classmethod
+    def setUpClass(cls):
+        (
+            cls.tgt_dict,
+            cls.w1,
+            cls.w2,
+            src_tokens,  # unpacked but not stored; the tests below build their own tensors
+            src_lengths,  # unpacked but not stored
+            cls.model,
+        ) = test_utils.sequence_generator_setup()
+        return cls
+
+    def test_finds_repetitive_tokens(self):
+        bsz, vocab_size, beam_size, step = 2, 4, 1, 3
+        generated_tok = torch.tensor(
+            [[2, 2, 2, 2], [3, 3, 3, 3]], dtype=torch.long, device="cuda"  # row 0 repeats token 2, row 1 token 3
+        )
+        lprobs = torch.zeros((beam_size * bsz, vocab_size), device="cuda")
+        desired_result = lprobs.new_tensor(
+            [[0.0, 0.0, -math.inf, 0.0], [0.0, 0.0, 0.0, -math.inf]]  # each row's repeated token is expected at -inf
+        )
+
+        cuda_ext_result, baseline_result = self._compare_cuda_ext_to_default_implem(
+            bsz, beam_size, generated_tok, lprobs, step, 2  # last argument is block_param
+        )
+        self.assertTensorEqual(cuda_ext_result, desired_result)
+        self.assertTensorEqual(baseline_result, desired_result)
+
+    @unittest.skipIf(
+        tuple(int(part) for part in torch.__version__.split(".")[:2]) < (1, 6), JIT_MSG
+    )
+    def test_jit_no_extension(self):
+        """The scripted non-extension blocker must reproduce the eager result."""
+        # The skip above compares (major, minor) as ints: "1.10.0" < "1.6.0" as strings.
+        bsz, vocab_size, beam_size, step = 2, 4, 1, 3
+        tokens = torch.tensor([[2] * 4, [3] * 4], dtype=torch.long, device="cuda")
+        lprobs = torch.zeros((beam_size * bsz, vocab_size), device="cuda")
+        blocker = NGramRepeatBlock(2, use_extension=False)
+        base_result = blocker(tokens, lprobs.clone(), bsz, beam_size, step)
+        scripted_blocker = torch.jit.script(blocker)
+        jit_result = scripted_blocker(tokens, lprobs.clone(), bsz, beam_size, step)
+        self.assertTensorEqual(base_result, jit_result)
+
+    def test_ngram_blocking_same_as_default_implem(self):
+        """Test that cuda extension returns same things as default impl in many settings."""
+        vocab_size = 4
+        step = 6
+        for _ in range(2):  # two randomly drawn configurations; the NumPy RNG is not seeded here
+            block_param = np.random.choice([1, 2, 3, 4])
+            batch_size = np.random.randint(1, 8)
+            beam_size = np.random.choice([1, 2, 4, 8])
+            lprobs = torch.zeros((beam_size * batch_size, vocab_size), device="cuda")
+
+            generated_tok = torch.tensor(
+                np.random.randint(
+                    0, vocab_size, size=(batch_size * beam_size, step + 1)  # random ids in [0, vocab_size)
+                ),
+                device="cuda",
+                dtype=torch.long,
+            )
+            self._compare_cuda_ext_to_default_implem(  # asserts internally; return value unused
+                batch_size,
+                beam_size,
+                generated_tok,
+                lprobs,
+                step,
+                block_param,
+            )
+
+    def _compare_cuda_ext_to_default_implem(
+        self, bsz, beam_size, generated_tok, lprobs, step, block_param
+    ):
+        """Assert that cuda extension and default implem return the same thing."""
+        blocker = NGramRepeatBlock(block_param)
+        assert blocker.use_extension, "Extension not compiled"
+        cuda_ext_result = blocker(
+            generated_tok,
+            lprobs.clone(),  # each call gets its own copy of lprobs
+            bsz,
+            beam_size,
+            step,
+        )
+        blocker.use_extension = False  # rerun the same blocker without the extension
+        baseline_result = blocker(
+            generated_tok,
+            lprobs.clone(),
+            bsz,
+            beam_size,
+            step,
+        )
+        self.assertTensorEqual(cuda_ext_result, baseline_result)
+        blocker.use_extension = True  # restore the flag before returning
+        return cuda_ext_result, baseline_result  # (extension result, default result)
+
+
+class TestDiverseBeamSearch(TestSequenceGeneratorBase):  # checks search.DiverseBeamSearch on a two-sentence batch
+    def setUp(self):  # builds the dictionary, source batch and model used by the test (and by subclasses)
+        # construct dummy dictionary and pin the special-symbol indices relied on below
+        d = test_utils.dummy_dictionary(vocab_size=2)
+        self.assertEqual(d.pad(), 1)
+        self.assertEqual(d.eos(), 2)
+        self.assertEqual(d.unk(), 3)
+        self.eos = d.eos()
+        self.w1 = 4  # presumably the two regular words directly follow the special symbols -- verify
+        self.w2 = 5
+
+        # construct source data: two identical rows [w1, w2, eos]
+        self.src_tokens = torch.LongTensor(
+            [
+                [self.w1, self.w2, self.eos],
+                [self.w1, self.w2, self.eos],
+            ]
+        )
+        self.src_lengths = torch.LongTensor([2, 2])
+
+        args = argparse.Namespace()
+        unk = 0.0  # value placed in the unk column of every table row
+        args.beam_probs = [  # one table per step; presumably replayed by the test model -- confirm in tests.utils
+            # step 0:
+            torch.FloatTensor(
+                [
+                    # eos  unk  w1   w2
+                    # sentence 1:
+                    [0.0, unk, 0.9, 0.1],  # beam 1
+                    [0.0, unk, 0.9, 0.1],  # beam 2
+                    # sentence 2:
+                    [0.0, unk, 0.7, 0.3],
+                    [0.0, unk, 0.7, 0.3],
+                ]
+            ),
+            # step 1:
+            torch.FloatTensor(
+                [
+                    # eos  unk  w1   w2
+                    # sentence 1:
+                    [0.0, unk, 0.6, 0.4],
+                    [0.0, unk, 0.6, 0.4],
+                    # sentence 2:
+                    [0.25, unk, 0.35, 0.4],
+                    [0.25, unk, 0.35, 0.4],
+                ]
+            ),
+            # step 2:
+            torch.FloatTensor(
+                [
+                    # eos  unk  w1   w2
+                    # sentence 1:
+                    [1.0, unk, 0.0, 0.0],
+                    [1.0, unk, 0.0, 0.0],
+                    # sentence 2:
+                    [0.9, unk, 0.1, 0.0],
+                    [0.9, unk, 0.1, 0.0],
+                ]
+            ),
+        ]
+
+        task = test_utils.TestTranslationTask.setup_task(args, d, d)  # d is passed twice (source and target)
+        self.model = task.build_model(args)
+        self.tgt_dict = task.target_dictionary
+
+    def test_diverse_beam_search(self):  # two groups, diversity_strength=0.0
+        search_strategy = search.DiverseBeamSearch(
+            self.tgt_dict, num_groups=2, diversity_strength=0.0
+        )
+        generator = SequenceGenerator(
+            [self.model],
+            self.tgt_dict,
+            beam_size=2,
+            search_strategy=search_strategy,
+        )
+        sample = {
+            "net_input": {
+                "src_tokens": self.src_tokens,
+                "src_lengths": self.src_lengths,
+            }
+        }
+        hypos = generator.forward(sample)  # indexed below as hypos[sentence][beam]
+        eos, w1, w2 = self.eos, self.w1, self.w2
+        # sentence 1, beam 1
+        self.assertHypoTokens(hypos[0][0], [w1, w1, eos])
+        self.assertHypoScore(hypos[0][0], [0.9, 0.6, 1.0])
+        # sentence 1, beam 2 (expected to equal beam 1)
+        self.assertHypoTokens(hypos[0][1], [w1, w1, eos])
+        self.assertHypoScore(hypos[0][1], [0.9, 0.6, 1.0])
+        # sentence 2, beam 1
+        self.assertHypoTokens(hypos[1][0], [w1, w2, eos])
+        self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.9])
+        # sentence 2, beam 2 (expected to equal beam 1)
+        self.assertHypoTokens(hypos[1][1], [w1, w2, eos])
+        self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.9])
+
+
+class TestDiverseSiblingsSearch(TestDiverseBeamSearch):
+    """Re-run the inherited toy setup with ``search.DiverseSiblingsSearch``."""
+
+    def assertHypoScore(
+        self, hypo, pos_probs, sibling_rank, diversity_rate, normalized=True, lenpen=1.0
+    ):
+        """Check scores after subtracting ``sibling_rank * diversity_rate`` per step."""
+        penalty = torch.Tensor(sibling_rank) * diversity_rate
+        expected = torch.FloatTensor(pos_probs).log() - penalty
+        self.assertAlmostEqual(hypo["positional_scores"], expected)
+        self.assertEqual(expected.numel(), hypo["tokens"].numel())
+        # Hypothesis score: summed step scores, optionally length-normalised.
+        total = expected.sum()
+        if normalized:
+            total = total / expected.numel() ** lenpen
+        self.assertLess(abs(total - hypo["score"]), 1e-6)
+
+    def test_diverse_beam_search(self):
+        """Compare all four hypotheses with hand-computed tokens and scores."""
+        rate = 0.5
+        strategy = search.DiverseSiblingsSearch(self.tgt_dict, diversity_rate=rate)
+        # Two beams per sentence, as in the inherited source batch.
+        generator = SequenceGenerator(
+            [self.model], self.tgt_dict, beam_size=2, search_strategy=strategy
+        )
+        net_input = {"src_tokens": self.src_tokens, "src_lengths": self.src_lengths}
+        hypos = generator.forward({"net_input": net_input})
+        eos, w1, w2 = self.eos, self.w1, self.w2
+        # (sentence, beam) -> (tokens, per-step probabilities, sibling ranks)
+        expectations = {
+            (0, 0): ([w1, w1, eos], [0.9, 0.6, 1.0], [0, 1, 1]),
+            (0, 1): ([w1, w2, eos], [0.9, 0.4, 1.0], [0, 2, 1]),
+            (1, 0): ([w1, w2, eos], [0.7, 0.4, 0.9], [0, 1, 1]),
+            (1, 1): ([w1, w1, eos], [0.7, 0.35, 0.9], [0, 2, 1]),
+        }
+        # Tokens are checked before scores for each hypothesis.
+        for (sent, beam), (tokens, probs, ranks) in expectations.items():
+            hypo = hypos[sent][beam]
+            self.assertHypoTokens(hypo, tokens)
+            self.assertHypoScore(hypo, probs, ranks, rate)
+
+
+class TestTopPSamplingSearch(TestSequenceGeneratorBase):  # checks search.Sampling with sampling_topp
+    def setUp(self):  # builds the dictionary, source batch and model shared by both tests
+        # construct dummy dictionary and pin the special-symbol indices relied on below
+        d = test_utils.dummy_dictionary(vocab_size=2)
+        self.assertEqual(d.pad(), 1)
+        self.assertEqual(d.eos(), 2)
+        self.assertEqual(d.unk(), 3)
+        self.eos = d.eos()
+        self.w1 = 4
+        self.w2 = 5
+
+        # construct source data: two identical rows [w1, w2, eos]
+        self.src_tokens = torch.LongTensor(
+            [
+                [self.w1, self.w2, self.eos],
+                [self.w1, self.w2, self.eos],
+            ]
+        )
+        self.src_lengths = torch.LongTensor([2, 2])
+
+        args = argparse.Namespace()
+        unk = 0.0  # value placed in the unk column of every table row
+        # Combined probability of the two most likely tokens (w1 and w2) at step 1.
+        self.min_top2_prob = 0.75
+        # Probability of the single most likely token (w1) at step 1.
+        self.min_top1_prob = 0.4
+
+        w1_prob = self.min_top1_prob
+        w2_prob = self.min_top2_prob - self.min_top1_prob  # 0.35
+        eos_prob = 1 - self.min_top2_prob  # 0.25, the remaining mass
+
+        args.beam_probs = [  # one table per step; presumably replayed by the test model -- confirm in tests.utils
+            # step 0: w1 has probability 1
+            torch.FloatTensor(
+                [
+                    # eos  unk  w1   w2
+                    [0.0, unk, 1.0, 0.0],
+                    [0.0, unk, 1.0, 0.0],
+                    [0.0, unk, 1.0, 0.0],
+                    [0.0, unk, 1.0, 0.0],
+                ]
+            ),
+            # step 1: the only step with more than one non-zero entry
+            torch.FloatTensor(
+                [
+                    # eos      unk  w1       w2
+                    [eos_prob, unk, w1_prob, w2_prob],
+                    [eos_prob, unk, w1_prob, w2_prob],
+                    [eos_prob, unk, w1_prob, w2_prob],
+                    [eos_prob, unk, w1_prob, w2_prob],
+                ]
+            ),
+            # step 2: eos has probability 1
+            torch.FloatTensor(
+                [
+                    # eos  unk  w1   w2
+                    [1.0, unk, 0.0, 0.0],
+                    [1.0, unk, 0.0, 0.0],
+                    [1.0, unk, 0.0, 0.0],
+                    [1.0, unk, 0.0, 0.0],
+                ]
+            ),
+        ]
+
+        task = test_utils.TestTranslationTask.setup_task(args, d, d)  # d is passed twice (source and target)
+        self.model = task.build_model(args)
+        self.tgt_dict = task.target_dictionary
+
+    def test_topp_sampling_search_low_prob(self):
+        # With sampling_topp below the top-1 probability we expect only the top
+        # 1 token to be sampled, which always results in the same output.
+        low_sampling_topp = self.min_top1_prob / 2.0
+        search_strategy = search.Sampling(
+            self.tgt_dict, sampling_topp=low_sampling_topp
+        )
+        generator = SequenceGenerator(
+            [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy
+        )
+        sample = {
+            "net_input": {
+                "src_tokens": self.src_tokens,
+                "src_lengths": self.src_lengths,
+            }
+        }
+        hypos = generator.forward(sample)  # indexed below as hypos[sentence][beam]
+        eos, w1 = self.eos, self.w1
+        # sentence 1, beam 1
+        self.assertHypoTokens(hypos[0][0], [w1, w1, eos])
+        self.assertHypoScore(hypos[0][0], [1.0, 0.4, 1.0])
+        # sentence 1, beam 2
+        self.assertHypoTokens(hypos[0][1], [w1, w1, eos])
+        self.assertHypoScore(hypos[0][1], [1.0, 0.4, 1.0])
+        # sentence 2, beam 1
+        self.assertHypoTokens(hypos[1][0], [w1, w1, eos])
+        self.assertHypoScore(hypos[1][0], [1.0, 0.4, 1.0])
+        # sentence 2, beam 2
+        self.assertHypoTokens(hypos[1][1], [w1, w1, eos])
+        self.assertHypoScore(hypos[1][1], [1.0, 0.4, 1.0])
+
+    def test_topp_sampling_search_high_prob(self):
+        # With sampling_topp between the top-1 and top-2 mass, either of the
+        # top 2 tokens could be sampled. This can cause different outputs.
+        high_sampling_topp = (self.min_top1_prob + self.min_top2_prob) / 2.0
+        search_strategy = search.Sampling(
+            self.tgt_dict, sampling_topp=high_sampling_topp
+        )
+        generator = SequenceGenerator(
+            [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy
+        )
+        sample = {
+            "net_input": {
+                "src_tokens": self.src_tokens,
+                "src_lengths": self.src_lengths,
+            }
+        }
+        hypos = generator.forward(sample)  # indexed below as hypos[sentence][beam]
+        eos, w1, w2 = self.eos, self.w1, self.w2
+        # sentence 1, beam 1: tokens and scores are each checked against both outcomes
+        self.assertTrue(
+            self.hypoTokens(hypos[0][0], [w1, w1, eos])
+            or self.hypoTokens(hypos[0][0], [w1, w2, eos])
+        )
+        self.assertTrue(
+            self.hypoScore(hypos[0][0], [1.0, 0.4, 1.0])
+            or self.hypoScore(hypos[0][0], [1.0, 0.35, 1.0])
+        )
+
+        # sentence 1, beam 2
+        self.assertTrue(
+            self.hypoTokens(hypos[0][1], [w1, w1, eos])
+            or self.hypoTokens(hypos[0][1], [w1, w2, eos])
+        )
+        self.assertTrue(
+            self.hypoScore(hypos[0][1], [1.0, 0.4, 1.0])
+            or self.hypoScore(hypos[0][1], [1.0, 0.35, 1.0])
+        )
+
+        # sentence 2, beam 1
+        self.assertTrue(
+            self.hypoTokens(hypos[1][0], [w1, w1, eos])
+            or self.hypoTokens(hypos[1][0], [w1, w2, eos])
+        )
+        self.assertTrue(
+            self.hypoScore(hypos[1][0], [1.0, 0.4, 1.0])
+            or self.hypoScore(hypos[1][0], [1.0, 0.35, 1.0])
+        )
+
+        # sentence 2, beam 2
+        self.assertTrue(
+            self.hypoTokens(hypos[1][1], [w1, w1, eos])
+            or self.hypoTokens(hypos[1][1], [w1, w2, eos])
+        )
+        self.assertTrue(
+            self.hypoScore(hypos[1][1], [1.0, 0.4, 1.0])
+            or self.hypoScore(hypos[1][1], [1.0, 0.35, 1.0])
+        )
+
+    def hypoTokens(self, hypo, tokens):  # truthy when hypo["tokens"] equals tokens; does not assert
+        return self.tensorEqual(hypo["tokens"], torch.LongTensor(tokens))
+
+    def hypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0):  # non-asserting score check
+        pos_scores = torch.FloatTensor(pos_probs).log()  # expected per-position log-probabilities
+        if not self.almostEqual(hypo["positional_scores"], pos_scores):
+            return False
+        if pos_scores.numel() != hypo["tokens"].numel():
+            return False
+        score = pos_scores.sum()
+        if normalized:
+            score /= pos_scores.numel() ** lenpen  # length normalisation
+        return abs(score - hypo["score"]) < 1e-6
+
+    def almostEqual(self, t1, t2):  # same size and max absolute difference below 1e-4
+        return t1.size() == t2.size() and (t1 - t2).abs().max() < 1e-4
+
+    def tensorEqual(self, t1, t2):  # same size and no differing element
+        return t1.size() == t2.size() and t1.ne(t2).long().sum() == 0
+
+
+if __name__ == "__main__":  # run this module's tests when the file is executed directly
+    unittest.main()
diff --git a/fairseq/tests/test_sequence_scorer.py b/fairseq/tests/test_sequence_scorer.py
new file mode 100644
index 0000000000000000000000000000000000000000..42f9447b599bcd7a9913aec37d94ea5078ff43a3
--- /dev/null
+++ b/fairseq/tests/test_sequence_scorer.py
@@ -0,0 +1,120 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import unittest
+
+import tests.utils as test_utils
+import torch
+from fairseq.sequence_scorer import SequenceScorer
+
+
+class TestSequenceScorer(unittest.TestCase):  # checks SequenceScorer against hand-written probability tables
+    def test_sequence_scorer(self):  # scores three reference targets and compares per-token scores
+        # construct dummy dictionary and pin the special-symbol indices relied on below
+        d = test_utils.dummy_dictionary(vocab_size=2)
+        self.assertEqual(d.pad(), 1)
+        self.assertEqual(d.eos(), 2)
+        self.assertEqual(d.unk(), 3)
+        eos = d.eos()
+        w1 = 4
+        w2 = 5
+
+        # construct dataloader over three source/target pairs with different target lengths
+        data = [
+            {
+                "source": torch.LongTensor([w1, w2, eos]),
+                "target": torch.LongTensor([w1, w2, w1, eos]),
+            },
+            {
+                "source": torch.LongTensor([w2, eos]),
+                "target": torch.LongTensor([w2, w1, eos]),
+            },
+            {
+                "source": torch.LongTensor([w2, eos]),
+                "target": torch.LongTensor([w2, eos]),
+            },
+        ]
+        data_itr = test_utils.dummy_dataloader(data)
+
+        # specify expected output probabilities (one table per target position)
+        args = argparse.Namespace()
+        unk = 0.0  # value placed in the unk column of every table row
+        args.beam_probs = [
+            # step 0:
+            torch.FloatTensor(
+                [
+                    # eos  unk  w1   w2
+                    [0.0, unk, 0.6, 0.4],  # sentence 1
+                    [0.0, unk, 0.4, 0.6],  # sentence 2
+                    [0.0, unk, 0.7, 0.3],  # sentence 3
+                ]
+            ),
+            # step 1:
+            torch.FloatTensor(
+                [
+                    # eos  unk  w1   w2
+                    [0.0, unk, 0.2, 0.7],  # sentence 1
+                    [0.0, unk, 0.8, 0.2],  # sentence 2
+                    [0.7, unk, 0.1, 0.2],  # sentence 3
+                ]
+            ),
+            # step 2:
+            torch.FloatTensor(
+                [
+                    # eos   unk  w1    w2
+                    [0.10, unk, 0.50, 0.4],  # sentence 1
+                    [0.15, unk, 0.15, 0.7],  # sentence 2
+                    [0.00, unk, 0.00, 0.0],  # sentence 3
+                ]
+            ),
+            # step 3:
+            torch.FloatTensor(
+                [
+                    # eos  unk  w1    w2
+                    [0.9, unk, 0.05, 0.05],  # sentence 1
+                    [0.0, unk, 0.00, 0.0],  # sentence 2
+                    [0.0, unk, 0.00, 0.0],  # sentence 3
+                ]
+            ),
+        ]
+        expected_scores = [  # table entry of each target token, step by step
+            [0.6, 0.7, 0.5, 0.9],  # sentence 1
+            [0.6, 0.8, 0.15],  # sentence 2
+            [0.3, 0.7],  # sentence 3
+        ]
+
+        task = test_utils.TestTranslationTask.setup_task(args, d, d)
+        model = task.build_model(args)
+        scorer = SequenceScorer(task.target_dictionary)
+        for sample in data_itr:
+            hypos = task.inference_step(scorer, [model], sample)
+            for sample_id, hypos_id in zip(sample["id"].tolist(), hypos):  # sample_id indexes data/expected_scores
+                self.assertHypoTokens(hypos_id[0], data[sample_id]["target"])
+                self.assertHypoScore(hypos_id[0], expected_scores[sample_id])
+
+    def assertHypoTokens(self, hypo, tokens):  # hypo["tokens"] must equal tokens exactly
+        self.assertTensorEqual(hypo["tokens"], torch.LongTensor(tokens))
+
+    def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0):  # pos_probs are probabilities, not logs
+        pos_scores = torch.FloatTensor(pos_probs).log()  # expected per-position log-probabilities
+        self.assertAlmostEqual(hypo["positional_scores"], pos_scores)
+        self.assertEqual(pos_scores.numel(), hypo["tokens"].numel())
+        score = pos_scores.sum()
+        if normalized:
+            score /= pos_scores.numel() ** lenpen  # length normalisation
+        self.assertLess(abs(score - hypo["score"]), 1e-6)
+
+    def assertAlmostEqual(self, t1, t2):  # tensor version; overrides unittest.TestCase.assertAlmostEqual
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertLess((t1 - t2).abs().max(), 1e-4)
+
+    def assertTensorEqual(self, t1, t2):  # same size and no differing element
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertEqual(t1.ne(t2).long().sum(), 0)
+
+
+if __name__ == "__main__":  # run this module's tests when the file is executed directly
+    unittest.main()
diff --git a/fairseq/tests/test_sparse_multihead_attention.py b/fairseq/tests/test_sparse_multihead_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e32b25a7fb1e12295b84d0c65064f8e42b7bdd3
--- /dev/null
+++ b/fairseq/tests/test_sparse_multihead_attention.py
@@ -0,0 +1,114 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention
+
+
+class TestSparseMultiheadAttention(unittest.TestCase):  # builds sparse masks for stride=4, expressivity=1
+    def test_sparse_multihead_attention(self):  # covers the bidirectional and the unidirectional mask
+        attn_weights = torch.randn(1, 8, 8)  # only passed through to buffered_sparse_mask
+        bidirectional_sparse_mask = torch.tensor(  # expected 8x8 mask; presumably 0 = may attend, -inf = masked -- confirm
+            [
+                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0],
+                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0],
+                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0],
+                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0],
+                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
+                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
+                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
+                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
+            ]
+        )
+
+        bidirectional_attention = SparseMultiheadAttention(
+            16, 1, stride=4, expressivity=1, is_bidirectional=True
+        )
+        bidirectional_attention_sparse_mask = (
+            bidirectional_attention.buffered_sparse_mask(attn_weights, 8, 8)
+        )
+        torch.all(  # NOTE(review): result is discarded, so a mismatch cannot fail the test
+            torch.eq(bidirectional_attention_sparse_mask, bidirectional_sparse_mask)
+        )  # TODO: wrap in self.assertTrue(...) once the expected mask above is confirmed
+
+        sparse_mask = torch.tensor(  # expected 8x8 mask for is_bidirectional=False
+            [
+                [
+                    0,
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                ],
+                [
+                    0,
+                    0,
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                ],
+                [
+                    0,
+                    0,
+                    0,
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                ],
+                [
+                    0,
+                    0,
+                    0,
+                    0,
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                ],
+                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), float("-inf")],
+                [
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    0,
+                    0,
+                    0,
+                    float("-inf"),
+                    float("-inf"),
+                ],
+                [
+                    float("-inf"),
+                    float("-inf"),
+                    float("-inf"),
+                    0,
+                    0,
+                    0,
+                    0,
+                    float("-inf"),
+                ],
+                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
+            ]
+        )
+
+        attention = SparseMultiheadAttention(
+            16, 1, stride=4, expressivity=1, is_bidirectional=False
+        )
+        attention_sparse_mask = attention.buffered_sparse_mask(attn_weights, 8, 8)
+
+        torch.all(torch.eq(attention_sparse_mask, sparse_mask))  # NOTE(review): result discarded here as well
+
+
+if __name__ == "__main__":  # run this module's tests when the file is executed directly
+    unittest.main()
diff --git a/fairseq/tests/test_token_block_dataset.py b/fairseq/tests/test_token_block_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4d7b76dcd55fe7869dbb1fa188f7b36fb639bda
--- /dev/null
+++ b/fairseq/tests/test_token_block_dataset.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import tests.utils as test_utils
+import torch
+from fairseq.data import TokenBlockDataset
+
+
+class TestTokenBlockDataset(unittest.TestCase):
+    """Check the blocks ``TokenBlockDataset`` produces for each ``break_mode``."""
+
+    def _build_dataset(self, data, **kwargs):
+        """Wrap ``data`` in a ``TestDataset`` and chunk it with ``TokenBlockDataset``."""
+        item_lengths = list(map(len, data))
+        return TokenBlockDataset(test_utils.TestDataset(data), item_lengths, **kwargs)
+
+    def _check_blocks(self, ds, expected_blocks):
+        """Assert that the leading items of ``ds`` equal ``expected_blocks``, in order."""
+        for position, expected in enumerate(expected_blocks):
+            self.assertEqual(ds[position].tolist(), expected)
+
+    def test_eos_break_mode(self):
+        """With ``break_mode="eos"`` every input sentence is returned as one block."""
+        # The same three sentences in two different orders.
+        orderings = (
+            [[5, 4, 3, 2, 1], [1], [8, 7, 6, 1]],
+            [[5, 4, 3, 2, 1], [8, 7, 6, 1], [1]],
+        )
+        for sentences in orderings:
+            data = [torch.tensor(s, dtype=torch.long) for s in sentences]
+            ds = self._build_dataset(
+                data, block_size=None, pad=0, eos=1, break_mode="eos"
+            )
+            # Each sentence must come back unchanged as its own block.
+            self._check_blocks(ds, sentences)
+
+    def test_block_break_mode(self):
+        """With ``break_mode="none"`` blocks hold up to three tokens across sentences."""
+        sentences = ([5, 4, 3, 2, 1], [8, 7, 6, 1], [9, 1])
+        data = [torch.tensor(s, dtype=torch.long) for s in sentences]
+        ds = self._build_dataset(
+            data, block_size=3, pad=0, eos=1, break_mode="none"
+        )
+        self._check_blocks(
+            ds,
+            [[5, 4, 3], [2, 1, 8], [7, 6, 1], [9, 1]],
+        )
+
+    def test_complete_break_mode(self):
+        """With ``break_mode="complete"`` sentences are never split between blocks."""
+        # (block_size, input sentences, expected blocks)
+        cases = [
+            (
+                6,
+                [[5, 4, 3, 2, 1], [8, 7, 6, 1], [9, 1]],
+                [[5, 4, 3, 2, 1], [8, 7, 6, 1, 9, 1]],
+            ),
+            (
+                3,
+                [[4, 3, 2, 1], [5, 1], [1], [6, 1]],
+                [[4, 3, 2, 1], [5, 1, 1], [6, 1]],
+            ),
+        ]
+        for block_size, sentences, expected_blocks in cases:
+            data = [torch.tensor(s, dtype=torch.long) for s in sentences]
+            ds = self._build_dataset(
+                data, block_size=block_size, pad=0, eos=1, break_mode="complete"
+            )
+            self._check_blocks(ds, expected_blocks)
+
+    def test_4billion_tokens(self):
+        """Regression test for numpy type promotion issue https://github.com/numpy/numpy/issues/5745"""
+        # One 10000-token tensor referenced 430000 times, i.e. 4.3e9 tokens overall.
+        sentence = torch.tensor(list(range(10000)), dtype=torch.long)
+        ds = self._build_dataset(
+            [sentence] * 430000, block_size=6, pad=0, eos=1, break_mode="complete"
+        )
+        # Only the last block and its slice indices are inspected.
+        ds[-1]  # __getitem__ works
+        start, end = ds.slice_indices[-1]
+        # data must be sufficiently large to overflow uint32
+        assert end > 4294967295
+        # this would also raise, since np.uint64(1) + 1 => 2.0
+        assert not isinstance(end + 1, float)
+
+
+if __name__ == "__main__":  # run this module's tests when the file is executed directly
+    unittest.main()
diff --git a/fairseq/tests/test_train.py b/fairseq/tests/test_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..02ef94cc5b80c05485144db67501b2acedbaf291
--- /dev/null
+++ b/fairseq/tests/test_train.py
@@ -0,0 +1,247 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import logging
+import unittest
+from io import StringIO
+from unittest.mock import MagicMock, patch
+
+import torch
+from fairseq import checkpoint_utils, data
+from omegaconf import OmegaConf
+
+
+def mock_trainer(epoch, num_updates, iterations_in_epoch):
+    """Return a ``MagicMock`` trainer reporting the given checkpoint position."""
+    iterator_state = {
+        "epoch": epoch,
+        "iterations_in_epoch": iterations_in_epoch,
+        "shuffle": False,
+    }
+    fake = MagicMock()
+    fake.get_num_updates.return_value = num_updates
+    fake.load_checkpoint.return_value = {"train_iterator": iterator_state}
+    return fake
+
+
+def mock_dict():
+    """Return a ``MagicMock`` dictionary whose pad/eos/unk indices are 1/2/3."""
+    fake_dict = MagicMock()
+    for symbol, index in (("pad", 1), ("eos", 2), ("unk", 3)):
+        getattr(fake_dict, symbol).return_value = index
+    return fake_dict
+
+
+def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
+    """Return a mock trainer and an epoch iterator over ``epoch_size`` single-item batches."""
+    # A single row holding the token ids 0 .. epoch_size - 1.
+    token_row = torch.LongTensor(list(range(epoch_size))).view(1, -1)
+    block_options = dict(block_size=1, pad=0, eos=1, include_targets=False)
+    blocks = data.TokenBlockDataset(
+        token_row, sizes=[token_row.size(-1)], **block_options
+    )
+    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
+    pair_ds = data.LanguagePairDataset(
+        blocks, blocks.sizes, mock_dict(), shuffle=False
+    )
+    # Every batch contains exactly one example, in index order.
+    singleton_batches = [[index] for index in range(epoch_size)]
+    epoch_itr = data.EpochBatchIterator(
+        dataset=pair_ds,
+        collate_fn=pair_ds.collater,
+        batch_sampler=singleton_batches,
+    )
+    return trainer, epoch_itr
+
+
+def get_mock_cfg(finetune_from_model):
+    """Return an OmegaConf config with the checkpoint options used by these tests.
+
+    ``finetune_from_model`` is stored under ``checkpoint.finetune_from_model``.
+    """
+    checkpoint_options = {
+        "save_dir": None,
+        "optimizer_overrides": "{}",
+        "reset_dataloader": False,
+        "reset_meters": False,
+        "reset_optimizer": False,
+        "reset_lr_scheduler": False,
+        "finetune_from_model": finetune_from_model,
+        "model_parallel_size": 1,
+        "restore_file": "checkpoint_last.pt",
+    }
+    common_options = {"model_parallel_size": 1}
+    return OmegaConf.create(
+        {"checkpoint": checkpoint_options, "common": common_options}
+    )
+
+
+class TestLoadCheckpoint(unittest.TestCase):  # checkpoint_utils.load_checkpoint against a mocked trainer
+    def setUp(self):
+        self.cfg_mock = get_mock_cfg(None)
+        # Filesystem calls are replaced by mocks; tests adjust them through self.patches.
+        self.patches = {
+            "os.makedirs": MagicMock(),
+            "os.path.join": MagicMock(),
+            "os.path.isfile": MagicMock(return_value=True),
+            "os.path.isabs": MagicMock(return_value=False),
+            "fairseq.file_io.PathManager.exists": MagicMock(return_value=False),
+        }
+        self.applied_patches = [patch(p, d) for p, d in self.patches.items()]
+        for applied_patch in self.applied_patches:
+            applied_patch.start()
+        logging.disable(logging.CRITICAL)  # silence logging for the duration of a test
+
+    def tearDown(self):
+        patch.stopall()  # undo every patch started in setUp
+        logging.disable(logging.NOTSET)
+
+    def test_load_partial_checkpoint(self):  # checkpoint taken 50 iterations into epoch 2 of 150
+        with contextlib.redirect_stdout(StringIO()):
+            trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 200, 50)
+            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
+
+            _, epoch_itr = checkpoint_utils.load_checkpoint(
+                self.cfg_mock.checkpoint, trainer
+            )
+
+            self.assertEqual(epoch_itr.epoch, 2)
+            self.assertEqual(epoch_itr.iterations_in_epoch, 50)
+
+            itr = epoch_itr.next_epoch_itr(shuffle=False)
+            self.assertEqual(epoch_itr.epoch, 2)
+            self.assertEqual(epoch_itr.iterations_in_epoch, 50)
+
+            self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 50)
+            self.assertEqual(epoch_itr.iterations_in_epoch, 51)
+
+            for _ in range(150 - 52):  # consume all but the last remaining batch
+                next(itr)
+            self.assertEqual(epoch_itr.iterations_in_epoch, 149)
+            self.assertTrue(itr.has_next())
+            next(itr)
+            self.assertFalse(itr.has_next())
+
+            itr = epoch_itr.next_epoch_itr(shuffle=False)  # the following epoch starts from zero
+            self.assertTrue(itr.has_next())
+            self.assertEqual(epoch_itr.epoch, 3)
+            self.assertEqual(epoch_itr.iterations_in_epoch, 0)
+
+    def test_load_full_checkpoint(self):  # checkpoint taken at the very end of epoch 2
+        with contextlib.redirect_stdout(StringIO()):
+            trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 300, 150)
+            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
+
+            _, epoch_itr = checkpoint_utils.load_checkpoint(
+                self.cfg_mock.checkpoint, trainer
+            )
+            itr = epoch_itr.next_epoch_itr(shuffle=False)
+
+            self.assertEqual(epoch_itr.epoch, 3)
+            self.assertEqual(epoch_itr.iterations_in_epoch, 0)
+            self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 0)
+
+    def test_load_no_checkpoint(self):  # os.path.isfile reports that no checkpoint file exists
+        with contextlib.redirect_stdout(StringIO()):
+            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
+            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
+            self.patches["os.path.isfile"].return_value = False
+
+            _, epoch_itr = checkpoint_utils.load_checkpoint(
+                self.cfg_mock.checkpoint, trainer
+            )
+            itr = epoch_itr.next_epoch_itr(shuffle=False)
+
+            self.assertEqual(epoch_itr.epoch, 1)
+            self.assertEqual(epoch_itr.iterations_in_epoch, 0)
+            self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 0)
+
+    def test_finetune_from_model_args_conflict(self):  # each reset_* flag must be rejected with finetune_from_model
+        with contextlib.redirect_stdout(StringIO()):
+            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
+            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
+
+            for arg in [
+                "reset_optimizer",
+                "reset_lr_scheduler",
+                "reset_meters",
+                "reset_dataloader",
+            ]:
+                with self.subTest(arg=arg):
+                    cfg_mock = get_mock_cfg("/temp/checkpoint_pretrained.pt")
+                    cfg_mock["checkpoint"][arg] = True
+                    with self.assertRaises(Exception) as context:
+                        _, _ = checkpoint_utils.load_checkpoint(
+                            cfg_mock.checkpoint, trainer
+                        )
+
+                    self.assertIn(
+                        "--finetune-from-model can not be set together with either --reset-optimizer"
+                        " or reset_lr_scheduler or reset_meters or reset_dataloader",
+                        str(context.exception),
+                    )
+
+    def test_finetune_from_model(self):  # first launch: only the pretrained model exists
+        with contextlib.redirect_stdout(StringIO()):
+            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
+            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
+            from_model_path = "/temp/checkpoint_pretrained.pt"
+
+            def mock_finetune_exist(path):
+                # Only the pretrained model exists; any other path reports missing.
+                return path == from_model_path
+
+            self.patches[
+                "fairseq.file_io.PathManager.exists"
+            ].side_effect = mock_finetune_exist
+            cfg_mock = get_mock_cfg(from_model_path)
+            cfg_mock.checkpoint.restore_file = "checkpoint_last.pt"
+            _, _ = checkpoint_utils.load_checkpoint(cfg_mock.checkpoint, trainer)
+            (
+                checkpoint_path,
+                reset_optimizer,
+                reset_lr_scheduler,
+                optimizer_overrides,
+            ) = trainer.load_checkpoint.call_args[0]  # positional arguments of the call
+            reset_meters = trainer.load_checkpoint.call_args[1]["reset_meters"]  # keyword argument
+            self.assertTrue(reset_optimizer)
+            self.assertTrue(reset_lr_scheduler)
+            self.assertTrue(reset_meters)
+
+    def test_finetune_from_model_resume(self):  # later launch: checkpoint_last.pt exists, nothing is reset
+        with contextlib.redirect_stdout(StringIO()):
+            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
+            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
+            from_model_path = "/temp/checkpoint_pretrained.pt"
+
+            # launch second time
+            # both restore_file=checkpoint_last.pt and finetune_from_model are set
+            def mock_finetune_exist(path):
+                if path == from_model_path or path.endswith("checkpoint_last.pt"):
+                    return True
+                else:
+                    return False
+
+            self.patches[
+                "fairseq.file_io.PathManager.exists"
+            ].side_effect = mock_finetune_exist
+            cfg_mock = get_mock_cfg(from_model_path)
+            cfg_mock.checkpoint.restore_file = "checkpoint_last.pt"
+            _, _ = checkpoint_utils.load_checkpoint(cfg_mock.checkpoint, trainer)
+            (
+                checkpoint_path,
+                reset_optimizer,
+                reset_lr_scheduler,
+                optimizer_overrides,
+            ) = trainer.load_checkpoint.call_args[0]  # positional arguments of the call
+            reset_meters = trainer.load_checkpoint.call_args[1]["reset_meters"]  # keyword argument
+            self.assertFalse(reset_optimizer)
+            self.assertFalse(reset_lr_scheduler)
+            self.assertFalse(reset_meters)
+
+
+if __name__ == "__main__":  # run this module's tests when the file is executed directly
+    unittest.main()
diff --git a/fairseq/tests/test_transformer.py b/fairseq/tests/test_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..de5c5bdbd49692e63fb1cb50108a791304425dc1
--- /dev/null
+++ b/fairseq/tests/test_transformer.py
@@ -0,0 +1,65 @@
+import argparse
+import unittest
+from typing import Any, Dict, Sequence
+
+import torch
+from fairseq.models import transformer
+
+from tests.test_roberta import FakeTask
+
+
+def mk_sample(tok: Sequence[int] = None, batch_size: int = 2) -> Dict[str, Any]:
+    """Build a toy batch in which every row is the same token sequence."""
+    if not tok:
+        tok = [10, 11, 12, 13, 14, 15, 2]
+
+    row = torch.tensor(tok, dtype=torch.long)
+    batch = torch.stack([row for _ in range(batch_size)])
+    lengths = torch.tensor(
+        [len(tok) for _ in range(batch_size)], dtype=torch.long, device=batch.device
+    )
+    net_input = {
+        "src_tokens": batch,
+        "prev_output_tokens": batch,
+        "src_lengths": lengths,
+    }
+    return {"net_input": net_input, "target": batch[:, 1:]}
+
+
+def mk_transformer(**extra_args: Any):
+    overrides = {
+        # Use characteristic dimensions
+        "encoder_embed_dim": 12,
+        "encoder_ffn_embed_dim": 14,
+        "decoder_embed_dim": 12,
+        "decoder_ffn_embed_dim": 14,
+        # Disable dropout so we have comparable tests.
+        "dropout": 0,
+        "attention_dropout": 0,
+        "activation_dropout": 0,
+        "encoder_layerdrop": 0,
+    }
+    overrides.update(extra_args)  # caller-supplied values win over the ones above
+    # Overrides the defaults from the parser
+    args = argparse.Namespace(**overrides)
+    transformer.tiny_architecture(args)
+
+    torch.manual_seed(0)  # fixed seed set before the model is built
+    task = FakeTask(args)
+    return transformer.TransformerModel.build_model(args, task)
+
+
+class TransformerTestCase(unittest.TestCase):
+    """Smoke tests: one forward and backward pass through mk_transformer models."""
+
+    def test_forward_backward(self):
+        # Encoder and decoder use the same embedding width.
+        net = mk_transformer(encoder_embed_dim=12, decoder_embed_dim=12)
+        out, _ = net.forward(**mk_sample()["net_input"])
+        out.sum().backward()
+
+    def test_different_encoder_decoder_embed_dim(self):
+        # The decoder embedding width differs from the encoder's.
+        net = mk_transformer(encoder_embed_dim=12, decoder_embed_dim=16)
+        out, _ = net.forward(**mk_sample()["net_input"])
+        out.sum().backward()
diff --git a/fairseq/tests/test_utils.py b/fairseq/tests/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..79195903e0f34372a24fa50312a6e00170c14471
--- /dev/null
+++ b/fairseq/tests/test_utils.py
@@ -0,0 +1,114 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from fairseq import utils
+
+
+class TestUtils(unittest.TestCase):
+    def test_convert_padding_direction(self):
+        pad = 1  # padding value used in the tensors below
+        left_pad = torch.LongTensor(
+            [
+                [2, 3, 4, 5, 6],
+                [1, 7, 8, 9, 10],
+                [1, 1, 1, 11, 12],
+            ]
+        )
+        right_pad = torch.LongTensor(
+            [
+                [2, 3, 4, 5, 6],
+                [7, 8, 9, 10, 1],
+                [11, 12, 1, 1, 1],
+            ]
+        )
+
+        self.assertAlmostEqual(
+            right_pad,
+            utils.convert_padding_direction(
+                left_pad,
+                pad,
+                left_to_right=True,
+            ),
+        )
+        self.assertAlmostEqual(
+            left_pad,
+            utils.convert_padding_direction(
+                right_pad,
+                pad,
+                right_to_left=True,
+            ),
+        )
+
+    def test_make_positions(self):
+        pad = 1  # padding value; padded slots keep this value in the expected output
+        left_pad_input = torch.LongTensor(
+            [
+                [9, 9, 9, 9, 9],
+                [1, 9, 9, 9, 9],
+                [1, 1, 1, 9, 9],
+            ]
+        )
+        left_pad_output = torch.LongTensor(
+            [
+                [2, 3, 4, 5, 6],
+                [1, 2, 3, 4, 5],
+                [1, 1, 1, 2, 3],
+            ]
+        )
+        right_pad_input = torch.LongTensor(
+            [
+                [9, 9, 9, 9, 9],
+                [9, 9, 9, 9, 1],
+                [9, 9, 1, 1, 1],
+            ]
+        )
+        right_pad_output = torch.LongTensor(
+            [
+                [2, 3, 4, 5, 6],
+                [2, 3, 4, 5, 1],
+                [2, 3, 1, 1, 1],
+            ]
+        )
+
+        self.assertAlmostEqual(
+            left_pad_output,
+            utils.make_positions(left_pad_input, pad),
+        )
+        self.assertAlmostEqual(
+            right_pad_output,
+            utils.make_positions(right_pad_input, pad),
+        )
+
+    def test_clip_grad_norm_(self):
+        params = torch.nn.Parameter(torch.zeros(5)).requires_grad_(False)
+        grad_norm = utils.clip_grad_norm_(params, 1.0)  # parameter has no gradient
+        self.assertTrue(torch.is_tensor(grad_norm))
+        self.assertEqual(grad_norm, 0.0)
+
+        params = [torch.nn.Parameter(torch.zeros(5)) for i in range(3)]
+        for p in params:
+            p.grad = torch.full((5,), fill_value=2.0)
+        grad_norm = utils.clip_grad_norm_(params, 1.0)
+        exp_grad_norm = torch.full((15,), fill_value=2.0).norm()  # norm over all 15 grads
+        self.assertTrue(torch.is_tensor(grad_norm))
+        self.assertEqual(grad_norm, exp_grad_norm)
+
+        grad_norm = utils.clip_grad_norm_(params, 1.0)  # second call on the same params
+        self.assertAlmostEqual(grad_norm, torch.tensor(1.0))
+
+    def test_resolve_max_positions_with_tuple(self):
+        resolved = utils.resolve_max_positions(None, (2000, 100, 2000), 12000)
+        self.assertEqual(resolved, (2000, 100, 2000))
+
+    def assertAlmostEqual(self, t1, t2):  # NOTE: shadows unittest's assertAlmostEqual
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertLess(utils.item((t1 - t2).abs().max()), 1e-4)  # max abs difference
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq/tests/test_valid_subset_checks.py b/fairseq/tests/test_valid_subset_checks.py
new file mode 100644
index 0000000000000000000000000000000000000000..c39fb89823305dd9a84220d366da9150cedd659e
--- /dev/null
+++ b/fairseq/tests/test_valid_subset_checks.py
@@ -0,0 +1,143 @@
+import os
+import shutil
+import tempfile
+import unittest
+
+from fairseq import options
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.data.data_utils import raise_if_valid_subsets_unintentionally_ignored
+from .utils import create_dummy_data, preprocess_lm_data, train_language_model
+
+
+def make_lm_config(
+    data_dir=None,
+    extra_flags=None,
+    task="language_modeling",
+    arch="transformer_lm_gpt2_tiny",
+):
+    """Parse a small LM training command line into an omegaconf config.
+
+    ``data_dir`` is both the positional data argument and the ``--save-dir``.
+    """
+    argv = ["--task", task]
+    if data_dir is not None:
+        argv.append(data_dir)
+    argv.extend(
+        [
+            "--arch",
+            arch,
+            "--optimizer",
+            "adam",
+            "--lr",
+            "0.0001",
+            "--max-tokens",
+            "500",
+            "--tokens-per-sample",
+            "500",
+            "--save-dir",
+            data_dir,
+            "--max-epoch",
+            "1",
+        ]
+    )
+    argv.extend(extra_flags or [])
+    parser = options.get_training_parser()
+    return convert_namespace_to_omegaconf(options.parse_args_and_arch(parser, argv))
+
+
+def write_empty_file(path):
+    """Create (or truncate to empty) the file at ``path`` and check it exists."""
+    open(path, "w").close()
+    assert os.path.exists(path)
+
+
+class TestValidSubsetsErrors(unittest.TestCase):
+    """Test filesystem / command-line flag combinations and check errors are raised as expected."""
+
+    def _test_case(self, paths, extra_flags):
+        with tempfile.TemporaryDirectory() as data_dir:
+            for p in paths + ["train"]:
+                write_empty_file(os.path.join(data_dir, f"{p}.bin"))
+            cfg = make_lm_config(data_dir, extra_flags=extra_flags)
+            raise_if_valid_subsets_unintentionally_ignored(cfg)
+
+    def test_default_raises(self):
+        with self.assertRaises(ValueError):
+            self._test_case(["valid", "valid1"], [])
+        with self.assertRaises(ValueError):
+            self._test_case(
+                ["valid", "valid1", "valid2"], ["--valid-subset", "valid,valid1"]
+            )
+
+    def test_partially_specified_valid_subsets(self):
+        with self.assertRaises(ValueError):
+            self._test_case(
+                ["valid", "valid1", "valid2"], ["--valid-subset", "valid,valid1"]
+            )
+        # Fix with ignore unused
+        self._test_case(
+            ["valid", "valid1", "valid2"],
+            ["--valid-subset", "valid,valid1", "--ignore-unused-valid-subsets"],
+        )
+
+    # Alias for the old name, which unittest never collected (no ``test`` prefix).
+    partially_specified_valid_subsets = test_partially_specified_valid_subsets
+
+    def test_legal_configs(self):
+        self._test_case(["valid"], [])
+        self._test_case(["valid", "valid1"], ["--ignore-unused-valid-subsets"])
+        self._test_case(["valid", "valid1"], ["--combine-val"])
+        self._test_case(["valid", "valid1"], ["--valid-subset", "valid,valid1"])
+        self._test_case(["valid", "valid1"], ["--valid-subset", "valid1"])
+        self._test_case(
+            ["valid", "valid1"], ["--combine-val", "--ignore-unused-valid-subsets"]
+        )
+        # valid.bin doesn't need to be ignored.
+        self._test_case(["valid1"], ["--valid-subset", "valid1"])
+
+    def test_disable_validation(self):
+        self._test_case([], ["--disable-validation"])
+        self._test_case(["valid", "valid1"], ["--disable-validation"])
+
+    def test_dummy_task(self):
+        cfg = make_lm_config(task="dummy_lm")
+        raise_if_valid_subsets_unintentionally_ignored(cfg)
+
+    def test_masked_dummy_task(self):
+        cfg = make_lm_config(task="dummy_masked_lm")
+        raise_if_valid_subsets_unintentionally_ignored(cfg)
+
+
+class TestCombineValidSubsets(unittest.TestCase):
+    def _train(self, extra_flags):
+        with self.assertLogs() as logs:
+            with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir:
+                create_dummy_data(data_dir, num_examples=20)
+                preprocess_lm_data(data_dir)
+
+                # duplicate the valid split so a second subset, valid1, exists on disk
+                shutil.copyfile(f"{data_dir}/valid.bin", f"{data_dir}/valid1.bin")
+                shutil.copyfile(f"{data_dir}/valid.idx", f"{data_dir}/valid1.idx")
+                train_language_model(
+                    data_dir,
+                    "transformer_lm",
+                    ["--max-update", "0", "--log-format", "json"] + extra_flags,
+                    run_validation=False,
+                )
+        return [x.message for x in logs.records]
+
+    def test_combined(self):
+        flags = ["--combine-valid-subsets", "--required-batch-size-multiple", "1"]
+        logs = self._train(flags)
+        self.assertTrue(any("valid1" in x for x in logs))  # valid1 shows up in the logs
+        self.assertFalse(any("valid1_ppl" in x for x in logs))  # metrics are combined
+
+    def test_subsets(self):
+        flags = [
+            "--valid-subset",
+            "valid,valid1",
+            "--required-batch-size-multiple",
+            "1",
+        ]
+        logs = self._train(flags)
+        self.assertTrue(any("valid_ppl" in x for x in logs))  # valid is reported
+        self.assertTrue(any("valid1_ppl" in x for x in logs))  # and so is valid1
diff --git a/fairseq/tests/utils.py b/fairseq/tests/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..af3f714ed13ce06faf4426cc83f0b69368a8ac6f
--- /dev/null
+++ b/fairseq/tests/utils.py
@@ -0,0 +1,797 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import json
+import os
+import random
+import shutil
+import string
+import sys
+import typing as tp
+from io import StringIO
+
+import torch
+import torch.nn.functional as F
+
+import fairseq.distributed.utils as distributed_utils
+from fairseq import options, utils
+from fairseq.data import Dictionary
+from fairseq.data.language_pair_dataset import collate
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+)
+from fairseq.models.fairseq_encoder import EncoderOut
+from fairseq.tasks import LegacyFairseqTask
+from fairseq_cli import generate, interactive, preprocess, train, validate
+
+
+def dummy_dictionary(vocab_size, prefix="token_"):
+    """Return a finalized Dictionary holding ``prefix + str(i)`` for each index i."""
+    vocab = Dictionary()
+    for idx in range(vocab_size):
+        vocab.add_symbol(prefix + str(idx))
+    vocab.finalize(padding_factor=1)  # don't add extra padding symbols
+    return vocab
+
+
+def dummy_dataloader(
+    samples,
+    padding_idx=1,
+    eos_idx=2,
+    batch_size=None,
+):
+    """Return an iterator over collated batches of ``samples`` (ids filled in place)."""
+    if batch_size is None:
+        batch_size = len(samples)
+
+    # give every sample an "id" (its list position) unless it already has one
+    for position, entry in enumerate(samples):
+        if "id" not in entry:
+            entry["id"] = position
+
+    # wrap the samples in a dataset and let the loader collate them
+    loader = torch.utils.data.DataLoader(
+        TestDataset(samples),
+        batch_size=batch_size,
+        collate_fn=(lambda batch: collate(batch, padding_idx, eos_idx)),
+    )
+    return iter(loader)
+
+
+def sequence_generator_setup():
+    # construct dummy dictionary
+    d = dummy_dictionary(vocab_size=2)
+
+    eos = d.eos()
+    w1 = 4  # hard-coded token ids; presumably the two dummy symbols — TODO confirm
+    w2 = 5
+
+    # construct source data
+    src_tokens = torch.LongTensor([[w1, w2, eos], [w1, w2, eos]])
+    src_lengths = torch.LongTensor([2, 2])
+
+    args = argparse.Namespace()
+    unk = 0.0  # value placed in the unk column of every row below
+    args.beam_probs = [
+        # step 0:
+        torch.FloatTensor(
+            [
+                # eos  unk  w1   w2
+                # sentence 1:
+                [0.0, unk, 0.9, 0.1],  # beam 1
+                [0.0, unk, 0.9, 0.1],  # beam 2
+                # sentence 2:
+                [0.0, unk, 0.7, 0.3],
+                [0.0, unk, 0.7, 0.3],
+            ]
+        ),
+        # step 1:
+        torch.FloatTensor(
+            [
+                # eos  unk  w1   w2       prefix
+                # sentence 1:
+                [1.0, unk, 0.0, 0.0],  # w1: 0.9  (emit: w1 <eos>: 0.9*1.0)
+                [0.0, unk, 0.9, 0.1],  # w2: 0.1
+                # sentence 2:
+                [0.25, unk, 0.35, 0.4],  # w1: 0.7  (don't emit: w1 <eos>: 0.7*0.25)
+                [0.00, unk, 0.10, 0.9],  # w2: 0.3
+            ]
+        ),
+        # step 2:
+        torch.FloatTensor(
+            [
+                # eos  unk  w1   w2       prefix
+                # sentence 1:
+                [0.0, unk, 0.1, 0.9],  # w2 w1: 0.1*0.9
+                [
+                    0.6,
+                    unk,
+                    0.2,
+                    0.2,
+                ],  # w2 w2: 0.1*0.1  (emit: w2 w2 <eos>: 0.1*0.1*0.6)
+                # sentence 2:
+                [
+                    0.60,
+                    unk,
+                    0.4,
+                    0.00,
+                ],  # w1 w2: 0.7*0.4  (emit: w1 w2 <eos>: 0.7*0.4*0.6)
+                [0.01, unk, 0.0, 0.99],  # w2 w2: 0.3*0.9
+            ]
+        ),
+        # step 3:
+        torch.FloatTensor(
+            [
+                # eos  unk  w1   w2       prefix
+                # sentence 1:
+                [
+                    1.0,
+                    unk,
+                    0.0,
+                    0.0,
+                ],  # w2 w1 w2: 0.1*0.9*0.9  (emit: w2 w1 w2 <eos>: 0.1*0.9*0.9*1.0)
+                [
+                    1.0,
+                    unk,
+                    0.0,
+                    0.0,
+                ],  # w2 w1 w1: 0.1*0.9*0.1  (emit: w2 w1 w1 <eos>: 0.1*0.9*0.1*1.0)
+                # sentence 2:
+                [
+                    0.1,
+                    unk,
+                    0.5,
+                    0.4,
+                ],  # w2 w2 w2: 0.3*0.9*0.99  (emit: w2 w2 w2 <eos>: 0.3*0.9*0.99*0.1)
+                [
+                    1.0,
+                    unk,
+                    0.0,
+                    0.0,
+                ],  # w1 w2 w1: 0.7*0.4*0.4  (emit: w1 w2 w1 <eos>: 0.7*0.4*0.4*1.0)
+            ]
+        ),
+    ]
+
+    task = TestTranslationTask.setup_task(args, d, d)  # same dictionary on both sides
+    model = task.build_model(args)
+    tgt_dict = task.target_dictionary
+
+    return tgt_dict, w1, w2, src_tokens, src_lengths, model
+
+
+def create_dummy_data(
+    data_dir, num_examples=100, maxlen=20, alignment=False, languages=None
+):
+    """Write random train/valid/test ``.in``/``.out`` text files under ``data_dir``.
+
+    Each file gets ``num_examples`` lines of 1..``maxlen`` space-separated
+    lowercase letters. With ``languages`` the files go into one sub-directory
+    per language; with ``alignment`` random ``.align`` files are written too.
+    """
+    splits = ("train", "valid", "test")
+
+    # one file of random single-letter tokens, one example per line
+    def _create_dummy_data(out_dir, filename):
+        letters = 97 + torch.floor(26 * torch.rand(num_examples * maxlen)).int()
+        start = 0
+        with open(os.path.join(out_dir, filename), "w") as h:
+            for _ in range(num_examples):
+                ex_len = random.randint(1, maxlen)
+                print(" ".join(map(chr, letters[start : start + ex_len])), file=h)
+                start += ex_len
+
+    # one line of random "i-j" index pairs per pair of source/target lines
+    def _create_dummy_alignment_data(filename_src, filename_tgt, filename):
+        src_path = os.path.join(data_dir, filename_src)
+        tgt_path = os.path.join(data_dir, filename_tgt)
+        out_path = os.path.join(data_dir, filename)
+        with open(src_path, "r") as src_f, open(tgt_path, "r") as tgt_f, open(
+            out_path, "w"
+        ) as h:
+            for src, tgt in zip(src_f, tgt_f):
+                src_len, tgt_len = len(src.split()), len(tgt.split())
+                avg_len = (src_len + tgt_len) // 2
+                num_alignments = random.randint(avg_len // 2, 2 * avg_len)
+                src_indices = torch.floor(torch.rand(num_alignments) * src_len).int()
+                tgt_indices = torch.floor(torch.rand(num_alignments) * tgt_len).int()
+                pairs = zip(src_indices, tgt_indices)
+                print(" ".join("{}-{}".format(s, t) for s, t in pairs), file=h)
+
+    files_to_write = [f"{split}.{side}" for split in splits for side in ("in", "out")]
+    if languages is None:  # En only dummy dataset
+        for name in files_to_write:
+            _create_dummy_data(data_dir, name)
+    else:
+        for lang in languages:
+            lang_dir = os.path.join(data_dir, lang)
+            os.makedirs(lang_dir, exist_ok=True)
+            for name in files_to_write:
+                _create_dummy_data(lang_dir, name)
+
+    if alignment:
+        for split in splits:
+            _create_dummy_alignment_data(
+                f"{split}.in", f"{split}.out", f"{split}.align"
+            )
+
+
+def preprocess_lm_data(data_dir, languages=None):
+    """Run fairseq's preprocess entry point on the ``*.out`` splits as source-only data.
+
+    With ``languages``, each ``data_dir/<lang>`` is preprocessed in place and the
+    first language's ``dict.txt`` is then copied to ``data_dir``.
+    """
+    preprocess_parser = options.get_preprocessing_parser()
+
+    # preprocess the train/valid/test ``.out`` files of one directory, in place
+    def _preprocess(directory):
+        preprocess_args = preprocess_parser.parse_args(
+            [
+                "--only-source",
+                "--trainpref",
+                os.path.join(directory, "train.out"),
+                "--validpref",
+                os.path.join(directory, "valid.out"),
+                "--testpref",
+                os.path.join(directory, "test.out"),
+                "--destdir",
+                directory,
+            ]
+        )
+        preprocess.main(preprocess_args)
+
+    if languages is None:
+        _preprocess(data_dir)
+        return
+
+    # one preprocessing run per language sub-directory, which must already exist
+    for lang in languages:
+        lang_dir = os.path.join(data_dir, lang)
+        assert os.path.exists(lang_dir)
+        _preprocess(lang_dir)
+    # make the first language's dictionary available at the top level
+    shutil.copyfile(
+        os.path.join(data_dir, languages[0], "dict.txt"),
+        os.path.join(data_dir, "dict.txt"),
+    )
+
+
+def preprocess_translation_data(data_dir, extra_flags=None):
+    """Run fairseq's preprocess entry point on the ``in``/``out`` splits in ``data_dir``."""
+    argv = [
+        "--source-lang",
+        "in",
+        "--target-lang",
+        "out",
+        "--trainpref",
+        os.path.join(data_dir, "train"),
+        "--validpref",
+        os.path.join(data_dir, "valid"),
+        "--testpref",
+        os.path.join(data_dir, "test"),
+        "--thresholdtgt",
+        "0",
+        "--thresholdsrc",
+        "0",
+        "--destdir",
+        data_dir,
+    ]
+    argv = argv + (extra_flags or [])
+    parser = options.get_preprocessing_parser()
+    parsed = parser.parse_args(argv)
+    preprocess.main(parsed)
+
+
+def preprocess_summarization_data(data_dir, extra_flags=None):
+    """Run fairseq's preprocess entry point with ``--joined-dictionary`` on ``data_dir``."""
+    argv = [
+        "--source-lang",
+        "in",
+        "--target-lang",
+        "out",
+        "--trainpref",
+        os.path.join(data_dir, "train"),
+        "--validpref",
+        os.path.join(data_dir, "valid"),
+        "--testpref",
+        os.path.join(data_dir, "test"),
+        "--thresholdtgt",
+        "0",
+        "--thresholdsrc",
+        "0",
+        "--joined-dictionary",
+        "--destdir",
+        data_dir,
+    ]
+    argv = argv + (extra_flags or [])
+    parser = options.get_preprocessing_parser()
+    parsed = parser.parse_args(argv)
+    preprocess.main(parsed)
+
+
+def create_laser_data_and_config_json(data_dir):
+    """Create dummy data for every language pair plus a ``laserconfig.json``.
+
+    Returns the (already closed) file object the config was written to.
+    """
+    tgt_ids = {"en": 0, "es": 1}
+    train_entries = []
+    src_vocab = tgt_vocab = None
+
+    for src_lang in ("de", "fr", "ru", "tr", "zh"):
+        for tgt_lang, tgt_id in tgt_ids.items():
+            langpair_path = os.path.join(data_dir, f"{src_lang}-{tgt_lang}")
+            os.mkdir(langpair_path)
+            create_dummy_data(langpair_path)
+            preprocess_translation_data(langpair_path, ["--dataset-impl", "cached"])
+
+            src_vocab = os.path.join(langpair_path, "dict.in.txt")
+            tgt_vocab = os.path.join(langpair_path, "dict.out.txt")
+            train_entries.append(
+                {
+                    "id": tgt_id,
+                    "src": os.path.join(langpair_path, "train.in-out.in"),
+                    "tgt": os.path.join(langpair_path, "train.in-out.out"),
+                }
+            )
+
+    config_json = {
+        "src_vocab": src_vocab,
+        "tgt_vocab": tgt_vocab,
+        "train": train_entries,
+    }
+    with open(os.path.join(data_dir, "laserconfig.json"), "w") as config_file:
+        json.dump(config_json, config_file)
+
+    return config_file
+
+
+def train_translation_model(
+    data_dir,
+    arch,
+    extra_flags=None,
+    task="translation",
+    run_validation=False,
+    lang_flags=None,
+    extra_valid_flags=None,
+    world_size=1,
+):
+    """Train with fairseq's train entry point, then optionally run validation.
+
+    Checkpoints are saved into ``data_dir``; validation loads
+    ``checkpoint_last.pt`` from there and evaluates the ``valid`` subset.
+    """
+    if lang_flags is None:
+        lang_flags = ["--source-lang", "in", "--target-lang", "out"]
+
+    # train for a single epoch
+    train_argv = [
+        "--task",
+        task,
+        data_dir,
+        "--save-dir",
+        data_dir,
+        "--arch",
+        arch,
+        "--optimizer",
+        "nag",
+        "--lr",
+        "0.05",
+        "--max-tokens",
+        "500",
+        "--max-epoch",
+        "1",
+        "--no-progress-bar",
+        "--distributed-world-size",
+        str(world_size),
+        "--num-workers",
+        "0",
+    ]
+    train_args = options.parse_args_and_arch(
+        options.get_training_parser(),
+        train_argv + lang_flags + (extra_flags or []),
+    )
+
+    cfg = convert_namespace_to_omegaconf(train_args)
+    distributed_utils.call_main(cfg, train.main)
+
+    if not run_validation:
+        return
+
+    # test validation
+    valid_argv = [
+        "--task",
+        task,
+        data_dir,
+        "--path",
+        os.path.join(data_dir, "checkpoint_last.pt"),
+        "--valid-subset",
+        "valid",
+        "--max-tokens",
+        "500",
+        "--no-progress-bar",
+        "--num-workers",
+        "0",
+    ]
+    validate_args = options.parse_args_and_arch(
+        options.get_validation_parser(),
+        valid_argv + lang_flags + (extra_valid_flags or []),
+    )
+    validate.main(validate_args)
+
+
+def generate_main(data_dir, extra_flags=None, path=None):
+    """Run batch generation, then interactive generation fed "h e l l o" on stdin."""
+    if extra_flags is None:
+        extra_flags = ["--print-alignment"]
+    if path is None:
+        path = os.path.join(data_dir, "checkpoint_last.pt")
+    generate_args = options.parse_args_and_arch(
+        options.get_generation_parser(),
+        [
+            data_dir,
+            "--path",
+            path,
+            "--beam",
+            "3",
+            "--batch-size",
+            "64",
+            "--max-len-b",
+            "5",
+            "--gen-subset",
+            "valid",
+            "--no-progress-bar",
+            "--num-workers",
+            "0",
+        ]
+        + (extra_flags or []),
+    )
+
+    # evaluate model in batch mode
+    generate.main(generate_args)
+
+    # evaluate model interactively
+    generate_args.buffer_size = 0
+    generate_args.input = "-"
+    generate_args.batch_size = None
+    orig_stdin = sys.stdin
+    sys.stdin = StringIO("h e l l o\n")
+    try:
+        interactive.main(generate_args)
+    finally:  # always restore stdin, even if interactive generation raises
+        sys.stdin = orig_stdin
+
+
+class TestDataset(torch.utils.data.Dataset):
+    """Dataset wrapper around an indexable ``data`` object; ``sizes`` starts as None."""
+    def __init__(self, data):
+        super().__init__()
+        self.data, self.sizes = data, None
+
+    def __getitem__(self, index):
+        return self.data[index]
+
+    def __len__(self):
+        return len(self.data)
+
+
+class TestTranslationTask(LegacyFairseqTask):
+    def __init__(self, args, src_dict, tgt_dict, model):
+        super().__init__(args)
+        self.src_dict = src_dict
+        self.tgt_dict = tgt_dict
+        self.model = model  # stored only; build_model below does not read it
+
+    @classmethod
+    def setup_task(cls, args, src_dict=None, tgt_dict=None, model=None):
+        return cls(args, src_dict, tgt_dict, model)  # just forwards its arguments
+
+    def build_model(self, args, from_checkpoint=False):  # from_checkpoint is unused
+        return TestModel.build_model(args, self)
+
+    @property
+    def source_dictionary(self):
+        return self.src_dict
+
+    @property
+    def target_dictionary(self):
+        return self.tgt_dict
+
+
+class TestModel(FairseqEncoderDecoderModel):
+    def __init__(self, encoder, decoder):  # pass-through; adds nothing to the base
+        super().__init__(encoder, decoder)
+
+    @classmethod
+    def build_model(cls, args, task):
+        encoder = TestEncoder(args, task.source_dictionary)
+        decoder = TestIncrementalDecoder(args, task.target_dictionary)
+        return cls(encoder, decoder)  # both halves share the same args namespace
+
+
+class TestEncoder(FairseqEncoder):
+    def __init__(self, args, dictionary):
+        super().__init__(dictionary)
+        self.args = args
+
+    def forward(self, src_tokens, src_lengths=None, **kwargs):  # only src_tokens is used
+        return EncoderOut(
+            encoder_out=src_tokens,  # the source tokens are returned as the encoder output
+            encoder_padding_mask=None,
+            encoder_embedding=None,
+            encoder_states=None,
+            src_tokens=None,
+            src_lengths=None,
+        )
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        return EncoderOut(
+            encoder_out=encoder_out.encoder_out.index_select(0, new_order),  # along dim 0
+            encoder_padding_mask=None,
+            encoder_embedding=None,
+            encoder_states=None,
+            src_tokens=None,
+            src_lengths=None,
+        )
+
+
+class TestIncrementalDecoder(FairseqIncrementalDecoder):
+    def __init__(self, args, dictionary):
+        super().__init__(dictionary)
+        assert hasattr(args, "beam_probs") or hasattr(args, "probs")
+        args.max_decoder_positions = getattr(args, "max_decoder_positions", 100)
+        self.args = args
+
+    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:]  # keep only the last token
+        bbsz = prev_output_tokens.size(0)
+        vocab = len(self.dictionary)
+        src_len = encoder_out.encoder_out.size(1)
+        tgt_len = prev_output_tokens.size(1)
+
+        # determine which decoding step(s) this call covers
+        if incremental_state is not None:
+            # read the cached step number (0 on the first call) and store step + 1
+            step = utils.get_incremental_state(self, incremental_state, "step")
+            if step is None:
+                step = 0
+            utils.set_incremental_state(self, incremental_state, "step", step + 1)
+            steps = [step]
+        else:
+            steps = list(range(tgt_len))
+
+        # define output in terms of raw probs
+        if hasattr(self.args, "probs"):
+            assert (
+                self.args.probs.dim() == 3
+            ), "expected probs to have size bsz*steps*vocab"
+            probs = self.args.probs.index_select(1, torch.LongTensor(steps))
+        else:
+            probs = torch.FloatTensor(bbsz, len(steps), vocab).zero_()
+            for i, step in enumerate(steps):
+                # args.beam_probs gives the probability for every vocab element,
+                # starting with eos, then unknown, and then the rest of the vocab
+                if step < len(self.args.beam_probs):
+                    probs[:, i, self.dictionary.eos() :] = self.args.beam_probs[step]
+                else:
+                    probs[:, i, self.dictionary.eos()] = 1.0  # past the table: all mass on eos
+
+        # random attention
+        attn = torch.rand(bbsz, tgt_len, src_len)
+
+        dev = prev_output_tokens.device  # outputs are moved to the input's device
+        return probs.to(dev), {"attn": [attn.to(dev)]}
+
+    def get_normalized_probs(self, net_output, log_probs, _):
+        # the decoder returns probabilities directly
+        probs = net_output[0]
+        if log_probs:
+            return probs.log()
+        else:
+            return probs
+
+    def max_positions(self):
+        return self.args.max_decoder_positions
+
+
+class TestReshapingEncoder(FairseqEncoder):
+    """Encoder that reshapes the (bsz, len) source tokens to (bsz, ceil(len/2), 2)."""
+
+    def __init__(self, args, dictionary):
+        super().__init__(dictionary)
+        self.args = args
+
+    def forward(self, src_tokens, src_lengths=None, **kwargs):
+        b_sz, t_sz = src_tokens.shape
+        # right-pad odd lengths by one element so they split evenly into pairs
+        padded = F.pad(src_tokens, (0, 1)) if t_sz % 2 > 0 else src_tokens
+
+        return EncoderOut(
+            encoder_out=padded.view(b_sz, -1, 2),
+            encoder_padding_mask=None,
+            encoder_embedding=None,
+            encoder_states=None,
+            src_tokens=None,
+            src_lengths=None,
+        )
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        # select entries along dim 0 in the order given by new_order
+        return EncoderOut(
+            encoder_out=encoder_out.encoder_out.index_select(0, new_order),
+            encoder_padding_mask=None,
+            encoder_embedding=None,
+            encoder_states=None,
+            src_tokens=None,
+            src_lengths=None,
+        )
+
+
+class TestReshapingModel(FairseqEncoderDecoderModel):
+    def __init__(self, encoder, decoder):  # pass-through; adds nothing to the base
+        super().__init__(encoder, decoder)
+
+    @classmethod
+    def build_model(cls, args, task):
+        encoder = TestReshapingEncoder(args, task.source_dictionary)
+        decoder = TestIncrementalDecoder(args, task.target_dictionary)
+        return cls(encoder, decoder)  # both halves share the same args namespace
+
+
+class TestAdditionalInputEncoder(FairseqEncoder):
+    def __init__(self, args, dictionary):
+        super().__init__(dictionary)
+        self.args = args
+
+    def forward(self, src_tokens, src_lengths=None, **kwargs):
+        assert "fancy_other_input" in kwargs  # the extra keyword input must be passed
+        assert kwargs["fancy_other_input"] is not None
+        return EncoderOut(
+            encoder_out=src_tokens,  # the source tokens are returned as the encoder output
+            encoder_padding_mask=None,
+            encoder_embedding=None,
+            encoder_states=None,
+            src_tokens=None,
+            src_lengths=None,
+        )
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        return EncoderOut(
+            encoder_out=encoder_out.encoder_out.index_select(0, new_order),  # along dim 0
+            encoder_padding_mask=None,
+            encoder_embedding=None,
+            encoder_states=None,
+            src_tokens=None,
+            src_lengths=None,
+        )
+
+
+class TestAdditionalInputModel(FairseqEncoderDecoderModel):
+    """Test model whose forward() relays extra kwargs to encoder and decoder."""
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build both sub-modules from the task's source/target dictionaries."""
+        encoder = TestAdditionalInputEncoder(args, task.source_dictionary)
+        return cls(encoder, TestIncrementalDecoder(args, task.target_dictionary))
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        """Encode, then decode with the encoder output; kwargs go to both."""
+        enc = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+        return self.decoder(prev_output_tokens, encoder_out=enc, **kwargs)
+
+
+def train_language_model(
+    data_dir,
+    arch,
+    extra_flags=None,
+    run_validation=False,
+    extra_valid_flags=None,
+    task="language_modeling",
+    world_size=1,
+):
+    """Train a language model for one epoch, optionally validating afterwards.
+
+    Command lines are assembled in-process and handed to the fairseq option
+    parsers; training is launched through ``distributed_utils.call_main``.
+
+    Args:
+        data_dir: dataset directory (the positional argument); the same path
+            is also passed as ``--save-dir``.
+        arch: value for ``--arch``.
+        extra_flags: optional list of additional training arguments.
+        run_validation: when true, validate ``checkpoint_last.pt`` from
+            ``data_dir`` on the ``valid`` subset after training.
+        extra_valid_flags: optional list of additional validation arguments.
+        task: value for ``--task`` (used for training and validation).
+        world_size: value for ``--distributed-world-size``.
+    """
+    train_parser = options.get_training_parser()
+    # flags are laid out one option per line; data_dir is the positional argument
+    train_argv = [
+        "--task", task, data_dir,
+        "--arch", arch,
+        "--optimizer", "adam",
+        "--lr", "0.0001",
+        "--max-tokens", "500",
+        "--tokens-per-sample", "500",
+        "--save-dir", data_dir,
+        "--max-epoch", "1",
+        "--no-progress-bar",
+        "--distributed-world-size", str(world_size),
+        "--ddp-backend", "no_c10d",
+        "--num-workers", "0",
+    ]
+    # `or []` keeps the default of None from breaking the list concatenation
+    train_argv = train_argv + (extra_flags or [])
+    train_args = options.parse_args_and_arch(train_parser, train_argv)
+    cfg = convert_namespace_to_omegaconf(train_args)
+    distributed_utils.call_main(cfg, train.main)
+
+    # stop here unless validation was requested
+    if not run_validation:
+        return
+
+    # validation reads the checkpoint expected under data_dir (the --save-dir above)
+    validate_parser = options.get_validation_parser()
+    valid_argv = [
+        "--task", task, data_dir,
+        "--path", os.path.join(data_dir, "checkpoint_last.pt"),
+        "--valid-subset", "valid",
+        "--max-tokens", "500",
+        "--no-progress-bar",
+        "--num-workers", "0",
+    ]
+    valid_argv = valid_argv + (extra_valid_flags or [])
+    validate_args = options.parse_args_and_arch(validate_parser, valid_argv)
+    # the parsed namespace is passed to validate.main without omegaconf conversion
+    validate.main(validate_args)
+
+
+def sizes(data):  # length of each sentence, in input order
+    return list(map(len, data))
+
+
+POPULATION = string.ascii_letters + string.digits  # symbols make_sentence() samples from
+
+
+def make_sentence() -> tp.List[str]:
+    """Draw 10-50 symbols from POPULATION, favouring those later in it."""
+    n_tokens = random.randint(10, 50)
+    weights = range(1, len(POPULATION) + 1)
+    return random.choices(POPULATION, weights=weights, k=n_tokens)
+
+
+def make_data(length=1000, out_file=None) -> tp.List[tp.List[str]]:
+    """Generate `length` random sentences plus two that cover every symbol.
+
+    When `out_file` is given, also write one space-joined sentence per line.
+    """
+    sentences = [make_sentence() for _ in range(length)]
+    # guarantee that each letter and each digit occurs at least once
+    sentences += [list(string.ascii_letters), list(string.digits)]
+    if out_file is not None:
+        with open(out_file, "w", encoding="utf-8") as handle:
+            handle.writelines(" ".join(tokens) + "\n" for tokens in sentences)
+    return sentences
+
+
+def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary:
+    """Collect every token of `data` into a Dictionary and finalize it."""
+    vocab = Dictionary()
+    for token in (tok for sentence in data for tok in sentence):
+        vocab.add_symbol(token)
+    vocab.finalize()
+    return vocab
diff --git a/pytorch_model.bin b/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c79aa2ce6fb7175ddef605d568ba046b305cc301
--- /dev/null
+++ b/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5d83f0f88e9e68f6d25a2a8ff7d48deb39361a2f2eda0e89e154dd80530ce46
+size 1271613458