# diff --git a/fairseq/fairseq.egg-info/not-zip-safe b/fairseq/fairseq.egg-info/not-zip-safe
# new file mode 100644
# index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
# --- /dev/null
# +++ b/fairseq/fairseq.egg-info/not-zip-safe
# @@ -0,0 +1 @@
# +
# diff --git a/fairseq/tests/distributed/__init__.py b/fairseq/tests/distributed/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# diff --git a/fairseq/tests/distributed/test_bmuf.py b/fairseq/tests/distributed/test_bmuf.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..995d0db18080f9b1a2fca65206c5f00dfa1ff90e
# --- /dev/null
# +++ b/fairseq/tests/distributed/test_bmuf.py
# @@ -0,0 +1,204 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import functools
import random
import unittest
from multiprocessing import Manager

import torch
import torch.nn as nn
from omegaconf import OmegaConf

from fairseq import optim
from fairseq.distributed import utils as distributed_utils


class Model(nn.Module):
    """Toy model: a single linear layer from ``input_size`` to ``output_size``."""

    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        return output


def setup_model_loss_criterion(cfg, args, rank, is_cuda):
    """
    setup model, criterion and optimizer based on input args

    Records ``rank`` on both ``args`` and ``cfg.distributed_training``,
    initializes the distributed backend when the configured world size is
    greater than one, and seeds torch with a fixed seed before the model is
    built.

    Returns:
        tuple: ``(model, loss_fn, optimizer)`` where ``optimizer`` is an SGD
        optimizer wrapped in ``optim.FairseqBMUF``.
    """
    args.distributed_rank = rank
    cfg.distributed_training.distributed_rank = args.distributed_rank
    if cfg.distributed_training.distributed_world_size > 1:
        distributed_utils.distributed_init(cfg)
    torch.manual_seed(1)
    model = Model(args.input_size, args.nb_classes)
    loss_fn = nn.CrossEntropyLoss()
    if is_cuda:
        model = model.cuda()
        loss_fn = loss_fn.cuda()

    optimizer = optim.sgd.SGD(args, model.parameters())
    optimizer = optim.FairseqBMUF(cfg=cfg.bmuf, optimizer=optimizer)

    return model, loss_fn, optimizer


def train_step(input, target, model, loss_fn, optimizer, **unused):
    """Do forward, backward and parameter update."""
    model.train()
    output = model(input)
    loss = loss_fn(output, target)
    optimizer.backward(loss)
    optimizer.step()


def single_gpu_training(cfg, args, rank, iterations, shared_results):
    """Worker entry point: train for ``iterations`` steps on random data.

    When CUDA is available the worker binds itself to device ``rank``.  After
    training, all model parameters are flattened, moved to the CPU and stored
    as one 1-D tensor in ``shared_results[rank]``.
    """
    is_cuda = torch.cuda.is_available()
    if is_cuda:
        torch.cuda.set_device(rank)

    model, loss_fn, optimizer = setup_model_loss_criterion(cfg, args, rank, is_cuda)

    for _ in range(iterations):
        # The batch dimension follows args.batch_size so that input and
        # target always agree (it was previously hard-coded to 1).
        input = torch.randn(args.batch_size, args.input_size)
        target = torch.empty(args.batch_size, dtype=torch.long).random_(args.nb_classes)

        if is_cuda:
            input = input.cuda()
            target = target.cuda()
        train_step(input, target, model, loss_fn, optimizer)

    # One concatenation instead of growing the result tensor parameter by
    # parameter.
    shared_results[rank] = torch.cat(
        [param.flatten().cpu().detach() for param in model.parameters()], 0
    )


def setup_args():
    """Build the default ``(cfg, args)`` pair shared by all BMUF tests.

    ``args`` is a plain namespace; ``cfg`` is an OmegaConf config populated
    from the same values.  The init method uses a randomly chosen localhost
    port, and ``distributed_port`` is that port plus one.
    """
    args = argparse.Namespace()
    args.block_momentum = 0.875
    args.block_lr = 0.5
    args.input_size = 5
    args.nb_classes = 2
    args.batch_size = 1
    args.lr = [1e-3]
    args.momentum = 0
    args.weight_decay = 0
    args.warmup_iterations = 0
    args.use_nbm = True
    args.average_sync = True
    # An earlier assignment of 20 was immediately overwritten by this one and
    # has been dropped; the effective value has always been 1.
    args.global_sync_iter = 1
    args.model_parallel_size = 1
    args.distributed_backend = "gloo"

    args.distributed_world_size = 2
    port = random.randint(10000, 20000)
    args.distributed_init_method = "tcp://localhost:{port}".format(port=port)
    args.distributed_init_host = "localhost"
    args.distributed_port = port + 1
    args.local_world_size = args.distributed_world_size

    cfg = OmegaConf.create()
    cfg.optimization = OmegaConf.create()
    cfg.common = OmegaConf.create()
    cfg.distributed_training = OmegaConf.create()
    cfg.dataset = OmegaConf.create()
    cfg.bmuf = OmegaConf.create()
    cfg.optimizer = OmegaConf.create()

    cfg.bmuf.global_sync_iter = args.global_sync_iter
    cfg.bmuf.block_momentum = args.block_momentum
    cfg.bmuf.block_lr = args.block_lr
    cfg.dataset.batch_size = args.batch_size
    cfg.optimization.lr = args.lr
    cfg.optimizer.momentum = args.momentum
    cfg.optimizer.weight_decay = args.weight_decay
    cfg.bmuf.warmup_iterations = args.warmup_iterations
    cfg.bmuf.use_nbm = args.use_nbm
    cfg.bmuf.average_sync = args.average_sync
    cfg.common.model_parallel_size = args.model_parallel_size
    cfg.distributed_training.distributed_backend = args.distributed_backend
    cfg.distributed_training.distributed_world_size = args.distributed_world_size
    cfg.bmuf.distributed_world_size = args.distributed_world_size
    cfg.distributed_training.distributed_init_method = args.distributed_init_method
    cfg.distributed_training.distributed_port = args.distributed_port

    return cfg, args


@unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs")
class TestBMUF(unittest.TestCase):
    """Checks that workers end up with matching parameters after training."""

    def bmuf_process(self, cfg, args, iterations):
        """Spawn one worker per rank and return the shared results mapping."""
        results = Manager().dict()
        torch.multiprocessing.spawn(
            fn=functools.partial(single_gpu_training, cfg, args),
            args=(iterations, results),
            nprocs=args.distributed_world_size,
            join=True,
        )
        return results

    def test_bmuf_sync(self):
        # Train model for 1 iteration and do bmuf sync without doing warmup
        cfg, args = setup_args()
        iterations = 1
        results = self.bmuf_process(cfg, args, iterations)
        # Make sure params in both machines are same
        self.assertEqual(len(results), 2)
        self.assertAlmostEqual(results[0], results[1])

    def test_warmup_sync(self):
        # Train model for 20 iteration and do warmup sync without doing bmuf sync
        cfg, args = setup_args()
        args.warmup_iterations = 20
        cfg.bmuf.warmup_iterations = args.warmup_iterations
        iterations = 20
        results = self.bmuf_process(cfg, args, iterations)
        # Make sure params in both machines are same
        self.assertEqual(len(results), 2)
        self.assertAlmostEqual(results[0], results[1])

    def test_warmup_sync_bmuf_sync(self):
        # Train model for 25 iteration and do warmup sync after 20 iteration
        # and bmuf sync after 25 iteration
        cfg, args = setup_args()
        args.warmup_iterations = 20
        args.global_sync_iter = 5
        cfg.bmuf.warmup_iterations = args.warmup_iterations
        cfg.bmuf.global_sync_iter = args.global_sync_iter
        iterations = 25
        results = self.bmuf_process(cfg, args, iterations)
        # Make sure params in both machines are same
        self.assertEqual(len(results), 2)
        self.assertAlmostEqual(results[0], results[1])

    def test_single_gpu_bmuf(self):
        # Train a single worker for 20 iterations with a 5-iteration warmup
        cfg, args = setup_args()
        args.distributed_world_size = 1
        args.warmup_iterations = 5
        cfg.distributed_training.distributed_world_size = args.distributed_world_size
        cfg.bmuf.distributed_world_size = args.distributed_world_size
        cfg.bmuf.warmup_iterations = args.warmup_iterations
        iterations = 20
        results = self.bmuf_process(cfg, args, iterations)
        self.assertEqual(len(results), 1)

    def assertAlmostEqual(self, t1, t2):
        """Tensor comparison: same size and max absolute difference < 1e-4.

        NOTE: this intentionally replaces ``unittest.TestCase.assertAlmostEqual``
        for this class and only accepts two tensors.
        """
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        self.assertLess((t1 - t2).abs().max(), 1e-4)


if __name__ == "__main__":
    unittest.main()

# diff --git a/fairseq/tests/distributed/test_distributed_timeout_wrapper.py b/fairseq/tests/distributed/test_distributed_timeout_wrapper.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..996093cb2d73fb3d5a41e65fbac4bd61bf122134
# --- /dev/null
# +++ b/fairseq/tests/distributed/test_distributed_timeout_wrapper.py
# @@ -0,0 +1,52 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
import signal
import time
import unittest

import torch
from torch import nn

from fairseq.distributed import DistributedTimeoutWrapper


class ModuleWithDelay(nn.Module):
    """Identity module whose forward pass sleeps for ``delay`` seconds."""

    def __init__(self, delay):
        super().__init__()
        self.delay = delay

    def forward(self, x):
        time.sleep(self.delay)
        return x


class TestDistributedTimeoutWrapper(unittest.TestCase):
    """Exercises DistributedTimeoutWrapper with and without a timeout firing."""

    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def _run_forward(self, delay, timeout):
        """Wrap a delayed module, run one forward pass, always stop the timeout.

        ``stop_timeout()`` is called in a ``finally`` block so that it also
        runs when the forward pass raises (previously it was skipped in that
        case, including the expected KeyboardInterrupt path).
        """
        module = DistributedTimeoutWrapper(
            ModuleWithDelay(delay), timeout, signal.SIGINT
        )
        try:
            module(torch.rand(5))
        finally:
            module.stop_timeout()

    def test_no_timeout(self):
        # timeout of 0 with a 1 second forward pass: must not be interrupted
        self._run_forward(delay=1, timeout=0)

    def test_timeout_safe(self):
        # forward pass (1s) finishes well inside the timeout (10s)
        self._run_forward(delay=1, timeout=10)

    def test_timeout_killed(self):
        # forward pass (5s) outlasts the timeout (1s); SIGINT is expected to
        # surface as KeyboardInterrupt
        with self.assertRaises(KeyboardInterrupt):
            self._run_forward(delay=5, timeout=1)


if __name__ == "__main__":
    unittest.main()

# diff --git a/fairseq/tests/distributed/test_module_proxy_wrapper.py b/fairseq/tests/distributed/test_module_proxy_wrapper.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..2ac1a877c3dc137cf32d01e080c61462711432b3
# --- /dev/null
# +++ b/fairseq/tests/distributed/test_module_proxy_wrapper.py
# @@ -0,0 +1,74 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import torch
from torch import nn

from fairseq.distributed import ModuleProxyWrapper

from .utils import objects_are_equal


class MockDDPWrapper(nn.Module):
    """A simple wrapper with an interface similar to DistributedDataParallel."""

    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        return self.module(x)


class Model(nn.Module):
    """Linear(5, 10) model carrying an extra plain attribute ``xyz``."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(5, 10)
        self.xyz = "hello"

    def forward(self, x):
        return self.linear(x)

    def get_xyz(self):
        return self.xyz


class TestModuleProxyWrapper(unittest.TestCase):
    """Checks attribute, state-dict and forward proxying of ModuleProxyWrapper.

    unittest assertion methods are used instead of bare ``assert`` so the
    checks still run under ``python -O`` and report both operands on failure.
    """

    def _get_module(self):
        """Return ``(proxy, inner_model)`` with the proxy wrapping a mock DDP."""
        module = Model()
        wrapped_module = MockDDPWrapper(module)
        wrapped_module = ModuleProxyWrapper(wrapped_module)
        return wrapped_module, module

    def test_getattr_forwarding(self):
        wrapped_module, module = self._get_module()
        self.assertEqual(module.xyz, "hello")
        self.assertEqual(module.get_xyz(), "hello")
        self.assertEqual(wrapped_module.xyz, "hello")

        # Setting the attribute on the proxy is expected to leave the inner
        # model's value untouched.
        wrapped_module.xyz = "world"
        self.assertEqual(wrapped_module.xyz, "world")
        self.assertEqual(module.get_xyz(), "hello")

    def test_state_dict(self):
        wrapped_module, module = self._get_module()
        self.assertTrue(
            objects_are_equal(wrapped_module.state_dict(), module.state_dict())
        )

    def test_load_state_dict(self):
        wrapped_module, module = self._get_module()
        wrapped_module.load_state_dict(module.state_dict())
        input = torch.rand(4, 5)
        torch.testing.assert_allclose(wrapped_module(input), module(input))

    def test_forward(self):
        wrapped_module, module = self._get_module()
        input = torch.rand(4, 5)
        torch.testing.assert_allclose(wrapped_module(input), module(input))


if __name__ == "__main__":
    unittest.main()

# diff --git a/fairseq/tests/distributed/test_utils.py b/fairseq/tests/distributed/test_utils.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..30f995b67acd39af5816d2eb412d6b4df7f44f8c
# --- /dev/null
# +++
# b/fairseq/tests/distributed/test_utils.py @@ -0,0 +1,124 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import functools
import sys
import unittest

import torch

from fairseq.distributed import utils as dist_utils

from .utils import objects_are_equal, spawn_and_init


class DistributedTest(unittest.TestCase):
    """Base class that skips unless CUDA, a non-Windows OS and 2+ GPUs exist."""

    def setUp(self):
        if not torch.cuda.is_available():
            raise unittest.SkipTest("CUDA not available, skipping test")
        if sys.platform == "win32":
            raise unittest.SkipTest("NCCL doesn't support Windows, skipping test")
        if torch.cuda.device_count() < 2:
            raise unittest.SkipTest("distributed tests require 2+ GPUs, skipping")


class TestBroadcastObject(DistributedTest):
    """Broadcasts several kinds of objects from rank 0 across two workers."""

    def test_str(self):
        spawn_and_init(
            functools.partial(
                TestBroadcastObject._test_broadcast_object, "hello world"
            ),
            world_size=2,
        )

    def test_tensor(self):
        spawn_and_init(
            functools.partial(
                TestBroadcastObject._test_broadcast_object,
                torch.rand(5),
            ),
            world_size=2,
        )

    def test_complex(self):
        spawn_and_init(
            functools.partial(
                TestBroadcastObject._test_broadcast_object,
                {
                    "a": "1",
                    "b": [2, torch.rand(2, 3), 3],
                    "c": (torch.rand(2, 3), 4),
                    "d": {5, torch.rand(5)},
                    "e": torch.rand(5),
                    "f": torch.rand(5).int().cuda(),
                },
            ),
            world_size=2,
        )

    @staticmethod
    def _test_broadcast_object(ref_obj, rank, group):
        """Worker body: only rank 0 supplies the object; all ranks verify it."""
        obj = dist_utils.broadcast_object(
            ref_obj if rank == 0 else None, src_rank=0, group=group
        )
        # Explicit raise (not a bare assert): this runs in a spawned process,
        # so the message identifies the failing rank and survives ``-O``.
        if not objects_are_equal(ref_obj, obj):
            raise AssertionError(
                f"rank {rank}: broadcast object differs from the reference"
            )


class TestAllGatherList(DistributedTest):
    """Gathers objects from two workers and checks what every rank receives."""

    def test_str_equality(self):
        spawn_and_init(
            functools.partial(
                TestAllGatherList._test_all_gather_list_equality,
                "hello world",
            ),
            world_size=2,
        )

    def test_tensor_equality(self):
        spawn_and_init(
            functools.partial(
                TestAllGatherList._test_all_gather_list_equality,
                torch.rand(5),
            ),
            world_size=2,
        )

    def test_complex_equality(self):
        spawn_and_init(
            functools.partial(
                TestAllGatherList._test_all_gather_list_equality,
                {
                    "a": "1",
                    "b": [2, torch.rand(2, 3), 3],
                    "c": (torch.rand(2, 3), 4),
                    "d": {5, torch.rand(5)},
                    "e": torch.rand(5),
                    "f": torch.rand(5).int(),
                },
            ),
            world_size=2,
        )

    @staticmethod
    def _test_all_gather_list_equality(ref_obj, rank, group):
        """Worker body: every gathered entry must equal the shared reference."""
        gathered = dist_utils.all_gather_list(ref_obj, group)
        for src, obj in enumerate(gathered):
            if not objects_are_equal(ref_obj, obj):
                raise AssertionError(
                    f"rank {rank}: gathered entry {src} differs from the reference"
                )

    def test_rank_tensor(self):
        spawn_and_init(
            TestAllGatherList._test_all_gather_list_rank_tensor, world_size=2
        )

    @staticmethod
    def _test_all_gather_list_rank_tensor(rank, group):
        """Worker body: each rank contributes ``[rank]``; entry i must hold i."""
        local = torch.tensor([rank])
        gathered = dist_utils.all_gather_list(local, group)
        # Distinct loop name: the original reused ``obj`` for both the local
        # tensor and the loop variable.
        for i, item in enumerate(gathered):
            if item.item() != i:
                raise AssertionError(
                    f"rank {rank}: gathered entry {i} holds {item.item()}"
                )


if __name__ == "__main__":
    unittest.main()

# diff --git a/fairseq/tests/distributed/utils.py b/fairseq/tests/distributed/utils.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..be4e19cd1e2182903e9dad6b0e52bf33b9968c9d
# --- /dev/null
# +++ b/fairseq/tests/distributed/utils.py
# @@ -0,0 +1,65 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import contextlib
import functools
import os
import tempfile

import torch


def spawn_and_init(fn, world_size, args=None):
    """Run ``fn(rank, group, *args)`` in ``world_size`` spawned processes.

    A named temporary file is created and its path passed to every worker as
    the ``file://`` rendezvous location for ``distributed_init``.

    Args:
        fn: callable invoked in each worker as ``fn(rank, group, *args)``.
        world_size: number of processes to spawn.
        args: optional tuple of extra positional arguments for ``fn``.
    """
    if args is None:
        args = ()
    # delete=False is kept from the original; the file is now removed
    # explicitly afterwards instead of being left behind on disk.
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        with tmp_file:
            torch.multiprocessing.spawn(
                fn=functools.partial(init_and_run, fn, args),
                args=(
                    world_size,
                    tmp_file.name,
                ),
                nprocs=world_size,
                join=True,
            )
    finally:
        # The file may already be gone (presumably the rendezvous backend can
        # remove it on shutdown), so a missing file is not an error.
        with contextlib.suppress(FileNotFoundError):
            os.unlink(tmp_file.name)


def distributed_init(rank, world_size, tmp_file):
    """Join the NCCL process group via ``file://tmp_file`` and select GPU ``rank``."""
    torch.distributed.init_process_group(
        backend="nccl",
        init_method="file://{}".format(tmp_file),
        world_size=world_size,
        rank=rank,
    )
    torch.cuda.set_device(rank)


def init_and_run(fn, args, rank, world_size, tmp_file):
    """Worker entry point: initialize distributed state, then call ``fn``."""
    distributed_init(rank, world_size, tmp_file)
    group = torch.distributed.new_group()
    fn(rank, group, *args)


def objects_are_equal(a, b) -> bool:
    """Recursively compare two (possibly nested) objects for equality.

    Objects of different concrete types are never equal.  Dicts are compared
    key by key, lists and tuples element by element, and sets independently
    of iteration order.  Tensors must agree in size, dtype, device and every
    element.  Anything else falls back to ``==``.

    Returns:
        bool: always a plain Python ``bool`` (the tensor branch previously
        returned a 0-dim tensor).
    """
    if type(a) is not type(b):
        return False
    if isinstance(a, dict):
        if set(a.keys()) != set(b.keys()):
            return False
        return all(objects_are_equal(a[k], b[k]) for k in a)
    if isinstance(a, set):
        if len(a) != len(b):
            return False
        # Sets have no defined order, and tensors hash by identity, so pair
        # each element of ``a`` with a not-yet-used equal element of ``b``.
        unmatched = list(b)
        for x in a:
            for i, y in enumerate(unmatched):
                if objects_are_equal(x, y):
                    del unmatched[i]
                    break
            else:
                return False
        return True
    if isinstance(a, (list, tuple)):
        if len(a) != len(b):
            return False
        return all(objects_are_equal(x, y) for x, y in zip(a, b))
    if torch.is_tensor(a):
        return bool(
            a.size() == b.size()
            and a.dtype == b.dtype
            and a.device == b.device
            and torch.all(a == b)
        )
    return bool(a == b)

# diff --git a/fairseq/tests/gpu/__init__.py b/fairseq/tests/gpu/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# diff --git a/fairseq/tests/gpu/test_binaries_gpu.py b/fairseq/tests/gpu/test_binaries_gpu.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..5caf94cde7fb0e63a9c2c4fe85133e036a57ba01
# --- /dev/null
# +++ b/fairseq/tests/gpu/test_binaries_gpu.py
# @@ -0,0 +1,590 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import contextlib
import json
import logging
import os
import shutil
import tempfile
import unittest
from io import StringIO

import torch

from fairseq import options
from fairseq_cli import train
from tests.utils import (
    create_dummy_data,
    generate_main,
    preprocess_lm_data,
    preprocess_translation_data,
    train_language_model,
    train_translation_model,
)


def _parse_json_logs(logfile):
    """Return the JSON records found in ``logfile``, one per parseable line.

    Lines that are not valid JSON are skipped.  The file is opened with a
    context manager so the handle is closed deterministically.
    """
    logs = []
    with open(logfile, "r") as f:
        for ln in f:
            try:
                logs.append(json.loads(ln))
            except json.JSONDecodeError:
                continue
    return logs


@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestMultiGPU(unittest.TestCase):
    """Resume-from-checkpoint helper for multilingual language modeling."""

    @staticmethod
    def parse_logs(logfile):
        """Parse a JSON-lines training log; see ``_parse_json_logs``."""
        return _parse_json_logs(logfile)

    @property
    def world_size(self):
        return torch.cuda.device_count()

    def train_flags(self, mu):
        """Return the common training flags with ``--max-update`` set to ``mu``."""
        return [
            "--memory-efficient-fp16",
            "--update-freq", "1",
            "--seed", "1",
            "--log-format", "json",
            "--max-update", str(mu),
            "--tokens-per-sample", "20",
            "--batch-size", "2",
            "--share-decoder-input-output-embed",
            "--optimizer", "adam",
            "--max-valid-steps", "1",
            "--pad-to-fixed-length",
            "--sample-break-mode", "none",
        ]

    def _test_resume_multilingual_training(
        self, extra_clargs, arch="transformer_lm_gpt2_tiny"
    ):
        """Train, resume from the mid-run checkpoint, and compare final stats.

        The first run trains for ``mu`` updates, saving every
        ``save_interval`` updates.  The second run restores the checkpoint
        written at ``save_interval`` and must start at update
        ``save_interval + 1`` and end with the same final training stats.
        """
        languages = ["en_XX", "fr_XX", "zh_CN"]
        save_interval = 5
        mu = 10
        flags = (
            self.train_flags(mu)
            + ["--save-interval-updates", str(save_interval), "--log-interval", "1"]
            + extra_clargs
        )
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fp16") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(
                    data_dir,
                    num_examples=int(
                        mu * 20 * self.world_size * 1.5
                    ),  # make sure enough data for max updates
                    languages=languages,
                )
                preprocess_lm_data(data_dir, languages)
                train_language_model(
                    data_dir,
                    arch,
                    flags + ["--log-file", log],
                    task="multilingual_language_modeling",
                    world_size=self.world_size,
                )
                log2 = os.path.join(data_dir, "resume.log")
                ckpt_name = f"checkpoint_1_{save_interval}.pt"
                restore_file = os.path.join(data_dir, ckpt_name)
                train_language_model(
                    data_dir,
                    arch,
                    flags
                    + ["--log-file", log2, "--restore-file", restore_file, "--no-save"],
                    task="multilingual_language_modeling",
                    world_size=self.world_size,
                )

                l1 = self.parse_logs(log)
                self.assertEqual(
                    int(l1[-1]["train_num_updates"]),
                    mu,
                    f"The first run did not complete {mu} updates. Add more data",
                )
                l2 = self.parse_logs(log2)

                if int(l2[0]["num_updates"]) != save_interval + 1:
                    all_ckpt_files = [
                        x for x in os.listdir(data_dir) if x.endswith(".pt")
                    ]
                    # Keep the run directory around for post-mortem inspection.
                    shutil.move(data_dir, "last_failed_resume")
                    raise AssertionError(
                        f"Likely failed to load {ckpt_name}. {all_ckpt_files} \n LOGS: {l1} \n\n {l2}. "
                    )
                for k in [
                    "train_loss",
                    "train_num_updates",
                    "train_ppl",
                    "train_gnorm",
                ]:
                    from_scratch, resumed = float(l1[-1][k]), float(l2[-1][k])
                    # This fails without rounding!
                    self.assertEqual(
                        from_scratch,
                        resumed,
                        f"difference at {k} {from_scratch} != {resumed}",
                    )


@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestTranslationGPU(unittest.TestCase):
    """End-to-end translation train/generate runs that need a GPU.

    The class-level skip already covers every test, so the per-method
    ``skipIf`` decorators that repeated the same condition were dropped.
    """

    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_fp16_multigpu(self):
        self._test_multigpu("test_fp16", ["--fp16"])

    def test_slowmo_multigpu(self):
        self._test_multigpu(
            "test_slowmo", ["--ddp-backend", "slowmo", "--nprocs-per-node", "1"]
        )

    def test_slowmo_single_node_multigpu(self):
        self._test_multigpu(
            "test_slowmo_single_node",
            ["--ddp-backend", "slowmo", "--nprocs-per-node", "2"],
        )

    def _test_multigpu(self, test_name, test_args):
        """Train fconv on up to two GPUs, generate, and check the log exists."""
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(test_name) as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    test_args + ["--log-file", log],
                    world_size=min(torch.cuda.device_count(), 2),
                )
                generate_main(data_dir)
                self.assertTrue(os.path.exists(log))

    @staticmethod
    def parse_logs(logfile):
        """Parse a JSON-lines training log; see ``_parse_json_logs``."""
        return _parse_json_logs(logfile)

    def test_resume_training_fsdp(self):
        self._test_resume_training(["--ddp-backend", "fully_sharded"])

    def test_resume_training_fsdp_sharded_state(self):
        self._test_resume_training(
            ["--ddp-backend", "fully_sharded", "--use-sharded-state"]
        )

    def test_resume_training_noc10d(self):
        self._test_resume_training([])

    def _test_resume_training(self, extra_clargs, arch="fconv_iwslt_de_en"):
        """Train, resume from ``checkpoint_1_2.pt``, and compare final stats."""
        flags = [
            "--fp16",
            "--log-format", "json",
            "--max-update", "10",
            "--save-interval-updates", "2",
            "--log-interval", "1",
        ] + extra_clargs
        world_size = min(torch.cuda.device_count(), 2)
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fp16") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    arch,
                    flags + ["--log-file", log],
                    world_size=world_size,
                )
                log2 = os.path.join(data_dir, "resume.log")
                restore_file = os.path.join(data_dir, "checkpoint_1_2.pt")
                train_translation_model(
                    data_dir,
                    arch,
                    flags + ["--log-file", log2, "--restore-file", restore_file],
                    world_size=world_size,
                )

                l1 = self.parse_logs(log)
                l2 = self.parse_logs(log2)
                self.assertEqual(int(l2[0]["num_updates"]), 3, f"{l1}\n\n {l2}")
                for k in [
                    "train_loss",
                    "train_num_updates",
                    "train_ppl",
                    "train_gnorm",
                ]:
                    from_scratch, resumed = l1[-1][k], l2[-1][k]
                    self.assertEqual(
                        from_scratch,
                        resumed,
                        f"difference at {k} {from_scratch} != {resumed}",
                    )

    def test_memory_efficient_fp16(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_memory_efficient_fp16") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir, "fconv_iwslt_de_en", ["--memory-efficient-fp16"]
                )
                generate_main(data_dir)

    def _test_small_transformer(self, precision_flag):
        """Train and generate with a 2-layer, 64-dim transformer.

        ``precision_flag`` is appended to the training flags (``--fp16`` or
        ``--amp`` in the callers below).
        """
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_transformer") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "transformer_iwslt_de_en",
                    [
                        "--encoder-layers", "2",
                        "--decoder-layers", "2",
                        "--encoder-embed-dim", "64",
                        "--decoder-embed-dim", "64",
                        precision_flag,
                    ],
                    run_validation=True,
                )
                generate_main(data_dir)

    def test_transformer_fp16(self):
        self._test_small_transformer("--fp16")

    def test_amp(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_amp") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(data_dir, "fconv_iwslt_de_en", ["--amp"])
                generate_main(data_dir)

    def test_transformer_amp(self):
        self._test_small_transformer("--amp")

    def test_levenshtein_transformer(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_levenshtein_transformer"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, ["--joined-dictionary"])
                train_translation_model(
                    data_dir,
                    "levenshtein_transformer",
                    [
                        "--apply-bert-init",
                        "--early-exit", "6,6,6",
                        "--criterion", "nat_loss",
                    ],
                    task="translation_lev",
                )
                gen_config = [
                    "--task", "translation_lev",
                    "--iter-decode-max-iter", "9",
                    "--iter-decode-eos-penalty", "0",
                    "--print-step",
                ]
                # non-ensemble generation
                generate_main(data_dir, gen_config)
                # ensemble generation
                generate_main(
                    data_dir,
                    gen_config,
                    path=os.pathsep.join(
                        [
                            os.path.join(data_dir, "checkpoint_last.pt"),
                            os.path.join(data_dir, "checkpoint_last.pt"),
                        ]
                    ),
                )

    def test_fsdp_checkpoint_generate(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                world_size = min(torch.cuda.device_count(), 2)
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    ["--log-file", log, "--ddp-backend", "fully_sharded"],
                    world_size=world_size,
                )
                generate_main(data_dir)
                self.assertTrue(os.path.exists(log))

    def test_fsdp_sharded_checkpoint_generate(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                world_size = min(torch.cuda.device_count(), 2)
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    [
                        "--log-file", log,
                        "--ddp-backend", "fully_sharded",
                        "--use-sharded-state",
                    ],
                    world_size=world_size,
                )
                generate_main(data_dir, ["--checkpoint-shard-count", str(world_size)])
                self.assertTrue(os.path.exists(log))


def _lm_quantization_flags(data_dir, arch, tokens):
    """Flags shared by all three training runs of ``_quantize_language_model``.

    ``tokens`` is used for both ``--max-tokens`` and ``--tokens-per-sample``.
    """
    return [
        "--task", "language_modeling",
        data_dir,
        "--arch", arch,
        "--optimizer", "adam",
        "--lr", "0.0001",
        "--criterion", "adaptive_loss",
        "--adaptive-softmax-cutoff", "5,10,15",
        "--max-tokens", tokens,
        "--tokens-per-sample", tokens,
        "--no-progress-bar",
        "--distributed-world-size", "1",
        "--ddp-backend", "no_c10d",
        "--num-workers", "0",
    ]


def _train_with_flags(flags):
    """Parse ``flags`` with a fresh training parser and run ``train.main``."""
    parser = options.get_training_parser()
    train.main(options.parse_args_and_arch(parser, flags))


def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False):
    """Train a language model, then exercise scalar and iterative PQ quantization.

    Three training runs are launched in sequence against ``data_dir``; the
    last one restores ``checkpoint_last.pt`` from that directory.
    ``extra_flags`` are appended to every run.  ``run_validation`` is accepted
    for interface compatibility but is not used.
    """
    extra = extra_flags or []

    # plain training run that produces the checkpoint
    _train_with_flags(
        _lm_quantization_flags(data_dir, arch, "500")
        + ["--save-dir", data_dir, "--max-epoch", "1"]
        + extra
    )

    # try scalar quantization
    _train_with_flags(
        _lm_quantization_flags(data_dir, arch, "500")
        + [
            "--save-dir", data_dir,
            "--max-update", "3",
            "--quant-noise-scalar", "0.5",
        ]
        + extra
    )

    # try iterative PQ quantization
    _train_with_flags(
        _lm_quantization_flags(data_dir, arch, "50")
        + [
            "--max-update", "6",
            "--restore-file", os.path.join(data_dir, "checkpoint_last.pt"),
            "--reset-optimizer",
            "--quantization-config-path",
            os.path.join(
                os.path.dirname(__file__), "transformer_quantization_config.yaml"
            ),
        ]
        + extra
    )


# NOTE(review): ``torch.__version__[2]`` is a single character, so this
# condition is true for every version string and the class is always skipped.
# The reason text also does not describe a version check.  Left unchanged on
# purpose: fixing it would start running a test that has presumably never run
# here -- confirm the intended condition before changing it.
@unittest.skipIf(
    int(torch.__version__[2]) < 10, reason="quantized kernels are only supported on CPU"
)
@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestQuantization(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_quantization(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_quantization") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                # tests both scalar and iterative PQ quantization
                _quantize_language_model(data_dir, "transformer_lm")


@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestOptimizersGPU(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_flat_grads(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_flat_grads") as data_dir:
                # Use just a bit of data and tiny model to keep this test runtime reasonable
                create_dummy_data(data_dir, num_examples=10, maxlen=5)
                preprocess_translation_data(data_dir)
                adafactor_fp16 = [
                    "--required-batch-size-multiple", "1",
                    "--encoder-layers", "1",
                    "--encoder-hidden-size", "32",
                    "--decoder-layers", "1",
                    "--optimizer", "adafactor",
                    "--fp16",
                ]
                with self.assertRaises(RuntimeError):
                    # adafactor isn't compatible with flat grads, which
                    # are used by default with --fp16
                    train_translation_model(data_dir, "lstm", adafactor_fp16)
                # but it should pass once we set --fp16-no-flatten-grads
                train_translation_model(
                    data_dir,
                    "lstm",
                    adafactor_fp16 + ["--fp16-no-flatten-grads"],
                )


if __name__ == "__main__":
    unittest.main()

# diff
--git a/fairseq/tests/gpu/test_ema_gpu.py b/fairseq/tests/gpu/test_ema_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..33fb5607b41261191c55cdaf9268bc6aed2de7c3 --- /dev/null +++ b/fairseq/tests/gpu/test_ema_gpu.py @@ -0,0 +1,215 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from copy import deepcopy +from dataclasses import dataclass +from typing import Optional + +import torch + +from fairseq.models.ema import EMA + + +class DummyModule(torch.nn.Module): + def __init__(self) -> None: + """LightningModule for testing purposes + + Args: + epoch_min_loss_override (int, optional): Pass in an epoch that will be set to the minimum + validation loss for testing purposes (zero based). If None this is ignored. Defaults to None. + """ + super().__init__() + self.layer = torch.nn.Linear(in_features=32, out_features=2) + self.another_layer = torch.nn.Linear(in_features=2, out_features=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.layer(x) + return self.another_layer(x) + + +@dataclass +class EMAConfig(object): + ema_decay: float = 0.99 + ema_start_update: int = 0 + ema_fp32: bool = False + ema_seed_model: Optional[str] = None + ema_update_freq: int = 1 + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestEMAGPU(unittest.TestCase): + def assertTorchAllClose(self, x, y, atol=1e-8, rtol=1e-5, msg=None): + diff = x.float() - y.float() + diff_norm = torch.norm(diff) + other_norm = torch.norm(y.float()) + + if msg is None: + msg = "|input - other| > {} + {} * |other|".format(atol, rtol) + + self.assertLessEqual( + diff_norm, + atol + rtol * other_norm, + msg=msg, + ) + + def test_ema(self): + model = DummyModule().cuda() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + state = deepcopy(model.state_dict()) + config = EMAConfig() + 
def test_ema_fp32(self):
    """Check the EMA of a half-precision model when ``ema_fp32=True``.

    After one SGD step and one ``ema.step``, every non-"version" entry must
    be tracked in ``ema.fp32_params``, and the EMA value must be at least as
    close to the update computed in fp32 (then cast to half) as it is to the
    update computed directly in fp16.
    """
    model = DummyModule().cuda().half()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    initial_state = deepcopy(model.state_dict())
    config = EMAConfig(ema_fp32=True)
    ema = EMA(model, config)

    # One optimisation step so the live parameters differ from the snapshot.
    inputs = torch.randn(32).cuda()
    model(inputs.half()).sum().backward()
    optimizer.step()

    ema.step(model)

    decay = config.ema_decay
    for key, param in model.state_dict().items():
        prev_param = initial_state[key]
        ema_param = ema.get_model().state_dict()[key]

        if "version" in key:
            # Do not decay a model.version pytorch param
            continue
        self.assertIn(key, ema.fp32_params)

        # Reference update accumulated in fp32 and cast to half at the end.
        expected_fp32 = (
            decay * prev_param.float() + (1 - decay) * param.float()
        ).half()
        # Reference update carried out entirely in the parameters' own dtype.
        expected_fp16 = decay * prev_param + (1 - decay) * param

        error_vs_fp32 = torch.norm(ema_param.float() - expected_fp32.float())
        error_vs_fp16 = torch.norm(ema_param.float() - expected_fp16.float())
        self.assertLessEqual(error_vs_fp32, error_vs_fp16)
        self.assertTorchAllClose(ema_param, expected_fp32)
+ self.assertLessEqual( + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param + (1 - config.ema_decay) * param + ).float() + ), + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param.float() + + (1 - config.ema_decay) * param.float() + ) + .half() + .float() + ), + ) + self.assertTorchAllClose( + ema_param, + config.ema_decay * prev_param + (1 - config.ema_decay) * param, + ) + + # Since fp32 params is not used, it should be of size 0 + self.assertEqual(len(ema.fp32_params), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/gpu/transformer_quantization_config.yaml b/fairseq/tests/gpu/transformer_quantization_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de31d8116ced675b81eb74119642217d768e7736 --- /dev/null +++ b/fairseq/tests/gpu/transformer_quantization_config.yaml @@ -0,0 +1,28 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# This file defines example configuration arguments for quantizing +# a transformer model with product quantization + +n_centroids: + Linear: + key: in_features + value: {"*": 8} + Embedding: + key: embedding_dim + value: {"*": 8} + +block_sizes: + Linear: + key: fuzzy_name + value: {fc: 8, attn: 4, emb: 4} + Embedding: + key: fuzzy_name + value: {emb: 8} + +layers_to_quantize: + - decoder\\.layers\\.\d+\\.fc[12] + - decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01] + - decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj) diff --git a/fairseq/tests/speech/__init__.py b/fairseq/tests/speech/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dba99e4d933c6b04792bc42f7ab6b873ec17608c --- /dev/null +++ b/fairseq/tests/speech/__init__.py @@ -0,0 +1,210 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
@classmethod
def download(cls, base_url: str, out_root: Path, filename: str):
    """Return the local path of ``filename``, fetching it first if absent.

    Args:
        base_url: URL prefix; the file is fetched from ``{base_url}/{filename}``.
        out_root: directory the file is stored in.
        filename: name of the file, used for both the URL and the local path.

    Returns:
        ``out_root / filename``.
    """
    destination = out_root / filename
    if destination.exists():
        # Already present locally: nothing to fetch.
        return destination
    torch.hub.download_url_to_file(
        f"{base_url}/{filename}", destination.as_posix(), progress=True
    )
    return destination
"sotasty_es_en_test_ted.zip", + ], + ) + + def set_up_mustc_de_fbank(self): + self._set_up( + "mustc_de_fbank", + "https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de", + [ + "config.yaml", + "spm.model", + "dict.txt", + "src_dict.txt", + "tgt_dict.txt", + "tst-COMMON.tsv", + "tst-COMMON.zip", + ], + ) + + def download_and_load_checkpoint( + self, + checkpoint_filename: str, + arg_overrides: Optional[Dict[str, str]] = None, + strict: bool = True, + ): + path = self.download(self.base_url, self.root, checkpoint_filename) + _arg_overrides = arg_overrides or {} + _arg_overrides["data"] = self.root.as_posix() + models, cfg, task = load_model_ensemble_and_task( + [path.as_posix()], arg_overrides=_arg_overrides, strict=strict + ) + if self.use_cuda: + for model in models: + model.cuda() + + return models, cfg, task, self.build_generator(task, models, cfg) + + def build_generator( + self, + task, + models, + cfg, + ): + return task.build_generator(models, cfg) + + @classmethod + def get_batch_iterator(cls, task, test_split, max_tokens, max_positions): + task.load_dataset(test_split) + return task.get_batch_iterator( + dataset=task.dataset(test_split), + max_tokens=max_tokens, + max_positions=max_positions, + num_workers=1, + ).next_epoch_itr(shuffle=False) + + @classmethod + def get_wer_scorer( + cls, tokenizer="none", lowercase=False, remove_punct=False, char_level=False + ): + scorer_args = { + "wer_tokenizer": tokenizer, + "wer_lowercase": lowercase, + "wer_remove_punct": remove_punct, + "wer_char_level": char_level, + } + return WerScorer(Namespace(**scorer_args)) + + @classmethod + def get_bleu_scorer(cls, tokenizer="13a", lowercase=False, char_level=False): + scorer_args = { + "sacrebleu_tokenizer": tokenizer, + "sacrebleu_lowercase": lowercase, + "sacrebleu_char_level": char_level, + } + return SacrebleuScorer(Namespace(**scorer_args)) + + @torch.no_grad() + def base_test( + self, + ckpt_name, + reference_score, + score_delta=0.3, + 
dataset="librispeech_test-other", + max_tokens=65_536, + max_positions=(4_096, 1_024), + arg_overrides=None, + strict=True, + score_type="wer", + ): + models, _, task, generator = self.download_and_load_checkpoint( + ckpt_name, arg_overrides=arg_overrides, strict=strict + ) + if not self.use_cuda: + return + + batch_iterator = self.get_batch_iterator( + task, dataset, max_tokens, max_positions + ) + if score_type == "bleu": + scorer = self.get_bleu_scorer() + elif score_type == "wer": + scorer = self.get_wer_scorer() + else: + raise Exception(f"Unsupported score type {score_type}") + + progress = tqdm(enumerate(batch_iterator), total=len(batch_iterator)) + for batch_idx, sample in progress: + sample = utils.move_to_cuda(sample) if self.use_cuda else sample + hypo = task.inference_step(generator, models, sample) + for i, sample_id in enumerate(sample["id"].tolist()): + tgt_str, hypo_str = self.postprocess_tokens( + task, + sample["target"][i, :], + hypo[i][0]["tokens"].int().cpu(), + ) + if batch_idx == 0 and i < 3: + print(f"T-{sample_id} {tgt_str}") + print(f"H-{sample_id} {hypo_str}") + scorer.add_string(tgt_str, hypo_str) + + print(scorer.result_string() + f" (reference: {reference_score})") + self.assertAlmostEqual(scorer.score(), reference_score, delta=score_delta) + + def postprocess_tokens(self, task, target, hypo_tokens): + tgt_tokens = utils.strip_pad(target, task.tgt_dict.pad()).int().cpu() + tgt_str = task.tgt_dict.string(tgt_tokens, "sentencepiece") + hypo_str = task.tgt_dict.string(hypo_tokens, "sentencepiece") + return tgt_str, hypo_str + + def unzip_files(self, zip_file_name): + zip_file_path = self.root / zip_file_name + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(self.root / zip_file_name.strip(".zip")) diff --git a/fairseq/tests/speech/test_convtransformer_simul_trans.py b/fairseq/tests/speech/test_convtransformer_simul_trans.py new file mode 100644 index 
class TestConvtransformerSimulTrans(TestFairseqSpeech):
    """Loading test for the wait-k convtransformer checkpoint."""

    def setUp(self):
        """Prepare the "simul" data directory with its config, dict and dev files."""
        data_files = ["config_gcmvn_specaug.yaml", "dict.txt", "dev.tsv"]
        self._set_up("simul", "speech_tests/simul", data_files)

    def test_waitk_checkpoint(self):
        """Only test model loading since fairseq currently doesn't support inference of simultaneous models"""
        overrides = {
            "config_yaml": "config_gcmvn_specaug.yaml",
            "load_pretrained_encoder_from": None,
        }
        # Unpacking into four names keeps the check that exactly four values
        # are returned; none of them is used afterwards.
        _models, _cfg, _task, _generator = self.download_and_load_checkpoint(
            "checkpoint_best.pt", arg_overrides=overrides
        )
def setUp(self):
    """Fetch the test split and the fine-tuned model files into ``self.root``.

    ``_set_up`` is called with the "s2t" location for the two dataset files;
    the checkpoint, sentencepiece model, dictionaries and config are then
    downloaded from a separate URL.
    """
    self._set_up(
        "librispeech_wvtrasnformer",
        "s2t",
        [
            "librispeech_flac_test-other.tsv",
            "librispeech_flac_test-other.zip",
        ],
    )
    model_url = "https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned"
    model_files = (
        "checkpoint_ave_10.pt",
        "spm.model",
        "src_dict.txt",
        "tgt_dict.txt",
        "config.yaml",
    )
    for filename in model_files:
        self.download(model_url, self.root, filename)
def import_user_module(self):
    """Import ``examples/speech_text_joint_to_text`` as a fairseq user module.

    The directory is located two levels above ``fairseq.__file__`` and handed
    to ``utils.import_user_module`` through a one-field ``user_dir`` tuple.
    """
    package_parent = Path(fairseq.__file__).parent.parent
    user_dir = package_parent / "examples/speech_text_joint_to_text"
    Arg = namedtuple("Arg", ["user_dir"])
    utils.import_user_module(Arg(str(user_dir)))
dataset=task.dataset(test_split), + max_tokens=250_000, + max_positions=(10_000, 1_024), + num_workers=1, + ).next_epoch_itr(shuffle=False) + + tokenizer = task.build_tokenizer(cfg.tokenizer) + bpe = task.build_bpe(cfg.bpe) + + def decode_fn(x): + if bpe is not None: + x = bpe.decode(x) + if tokenizer is not None: + x = tokenizer.decode(x) + return x + + scorer_args = { + "sacrebleu_tokenizer": "13a", + "sacrebleu_lowercase": False, + "sacrebleu_char_level": False, + } + scorer = SacrebleuScorer(Namespace(**scorer_args)) + progress = tqdm(enumerate(batch_iterator), total=len(batch_iterator)) + for batch_idx, sample in progress: + sample = utils.move_to_cuda(sample) if self.use_cuda else sample + hypo = task.inference_step(generator, models, sample) + for i, sample_id in enumerate(sample["id"].tolist()): + tgt_tokens = ( + utils.strip_pad(sample["target"][i, :], task.tgt_dict.pad()) + .int() + .cpu() + ) + + tgt_str = task.tgt_dict.string(tgt_tokens, "sentencepiece") + hypo_str = task.tgt_dict.string( + hypo[i][0]["tokens"].int().cpu(), "sentencepiece" + ) + if batch_idx == 0 and i < 3: + print(f"T-{sample_id} {tgt_str}") + print(f"D-{sample_id} {hypo_str}") + scorer.add_string(tgt_str, hypo_str) + reference_bleu = 27.3 + result = scorer.result_string() + print(result + f" (reference: {reference_bleu})") + res_bleu = float(result.split()[2]) + self.assertAlmostEqual(res_bleu, reference_bleu, delta=0.3) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/speech/test_fastspeech2.py b/fairseq/tests/speech/test_fastspeech2.py new file mode 100644 index 0000000000000000000000000000000000000000..7150a3bda25b3da2f0b4326de2cd074013198cde --- /dev/null +++ b/fairseq/tests/speech/test_fastspeech2.py @@ -0,0 +1,53 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
@torch.no_grad()
def test_ljspeech_fastspeech2_checkpoint(self):
    """Compare the FastSpeech2 checkpoint's average distortion with a reference.

    Sums the per-sample values returned by ``batch_mel_cepstral_distortion``
    over the "ljspeech_test" split, averages them, rounds to one decimal and
    asserts the result is within 0.1 of the reference value 3.2.
    """
    overrides = {
        "config_yaml": "cfg_ljspeech_g2p.yaml",
        "vocoder": "griffin_lim",
        "fp16": False,
    }
    models, cfg, task, generator = self.download_and_load_checkpoint(
        "ljspeech_fastspeech2_g2p.pt", arg_overrides=overrides
    )

    batches = self.get_batch_iterator(task, "ljspeech_test", 65_536, 4_096)
    total_mcd = 0.0
    n_samples = 0
    for sample in tqdm(batches, total=len(batches)):
        if self.use_cuda:
            sample = utils.move_to_cuda(sample)
        hypos = generator.generate(models[0], sample, has_targ=True)
        targets = [hypo["targ_waveform"] for hypo in hypos]
        predictions = [hypo["waveform"] for hypo in hypos]
        distortions = batch_mel_cepstral_distortion(
            targets, predictions, sr=task.sr
        )
        total_mcd += sum(d.item() for d, _ in distortions)
        n_samples += len(sample["id"].tolist())

    reference_mcd = 3.2
    mcd = round(total_mcd / n_samples, 1)
    print(f"MCD: {mcd} (reference: {reference_mcd})")
    self.assertAlmostEqual(mcd, reference_mcd, delta=0.1)
def postprocess_tokens(self, task, target, hypo_tokens):
    """Turn target and hypothesis token tensors into strings.

    Padding is stripped from ``target`` using the target dictionary's pad
    index; both sequences are then rendered with ``task.tgt_dict.string``
    without any post-processing symbol.

    Returns:
        ``(tgt_str, hypo_str)``.
    """
    dictionary = task.tgt_dict
    unpadded = utils.strip_pad(target, dictionary.pad())
    reference = dictionary.string(unpadded.int().cpu())
    hypothesis = dictionary.string(hypo_tokens)
    return reference, hypothesis
class TestS2TConformer(TestFairseqSpeech):
    """Score check for the small relative-position conformer checkpoint."""

    def setUp(self):
        """Prepare the LibriSpeech test data."""
        self.set_up_librispeech()

    def test_librispeech_s2t_conformer_s_checkpoint(self):
        """Run ``base_test`` with its default settings against a reference score of 12."""
        overrides = {"config_yaml": "cfg_librispeech.yaml"}
        self.base_test(
            ckpt_name="librispeech_conformer_rel_pos_s.pt",
            reference_score=12,
            arg_overrides=overrides,
        )
@torch.no_grad()
def test_ljspeech_tts_transformer_checkpoint(self):
    """Compare the TTS transformer checkpoint's average distortion with a reference.

    Accumulates the values returned by ``batch_mel_cepstral_distortion`` for
    every batch of the "ljspeech_test" split, divides by the number of
    samples, rounds to one decimal and asserts the result is within 0.1 of
    the reference value 3.3.
    """
    checkpoint_overrides = {
        "config_yaml": "cfg_ljspeech_g2p.yaml",
        "vocoder": "griffin_lim",
        "fp16": False,
    }
    models, cfg, task, generator = self.download_and_load_checkpoint(
        "ljspeech_transformer_g2p.pt", arg_overrides=checkpoint_overrides
    )

    batch_iterator = self.get_batch_iterator(task, "ljspeech_test", 65_536, 1024)
    distortion_sum = 0.0
    sample_count = 0
    for batch in tqdm(batch_iterator, total=len(batch_iterator)):
        if self.use_cuda:
            batch = utils.move_to_cuda(batch)
        hypos = generator.generate(models[0], batch, has_targ=True)
        reference_waves = [hypo["targ_waveform"] for hypo in hypos]
        generated_waves = [hypo["waveform"] for hypo in hypos]
        results = batch_mel_cepstral_distortion(
            reference_waves, generated_waves, sr=task.sr
        )
        distortion_sum += sum(d.item() for d, _ in results)
        sample_count += len(batch["id"].tolist())

    reference_mcd = 3.3
    mcd = round(distortion_sum / sample_count, 1)
    print(f"MCD: {mcd} (reference: {reference_mcd})")
    self.assertAlmostEqual(mcd, reference_mcd, delta=0.1)
def build_generator(self, task, models, cfg):
    """Build a ``W2lViterbiDecoder`` for the task's target dictionary.

    Args:
        task: object providing ``target_dictionary``.
        models: unused here; kept so the signature matches the base class hook.
        cfg: config object; ``cfg.nbest`` is set to 1 before the decoder is built.

    Raises:
        Exception: if the decoder module cannot be imported. The original
            import error is attached as ``__cause__``.
    """
    try:
        from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder
    except Exception as err:
        # Chain explicitly so the underlying import failure stays visible as
        # the direct cause instead of an incidental "during handling" context.
        raise Exception(
            "Cannot run this test without flashlight dependency"
        ) from err
    with open_dict(cfg):
        cfg.nbest = 1
    return W2lViterbiDecoder(cfg, task.target_dictionary)
class TestXMTransformer(TestFairseqSpeech):
    """BLEU check for the 600M es-en XM transformer checkpoint."""

    def setUp(self):
        """Prepare the sotasty es-en test data."""
        self.set_up_sotasty_es_en()

    # TODO: investigate increases BLEU score (30.42 -> 31.74)
    def test_sotasty_es_en_600m_checkpoint(self):
        """Run ``base_test`` in BLEU mode against a reference score of 31.74."""
        settings = dict(
            ckpt_name="xm_transformer_600m_es_en_md.pt",
            reference_score=31.74,
            score_delta=0.2,
            max_tokens=3_000_000,
            max_positions=(1_000_000, 1_024),
            dataset="sotasty_es_en_test_ted",
            arg_overrides={"config_yaml": "cfg_es_en.yaml"},
            score_type="bleu",
        )
        self.base_test(**settings)
def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE):
    """Build a ``Dictionary`` padded with ``vocab_size`` placeholder symbols.

    The symbols are the decimal strings "0" .. str(vocab_size - 1).

    Args:
        vocab_size: number of placeholder symbols to add.

    Returns:
        The populated ``Dictionary``.
    """
    dummy_dict = Dictionary()
    # add dummy symbol to satisfy vocab size
    for index in range(vocab_size):
        # NOTE(review): the second argument (1000) is presumably the symbol's
        # count -- confirm against Dictionary.add_symbol.
        dummy_dict.add_symbol(str(index), 1000)
    return dummy_dict
def get_dummy_encoder_output(encoder_out_shape=(100, 80, 5)):
    """Generate an example dummy encoder output.

    Args:
        encoder_out_shape: 3-tuple read as ``(T, B, D)``.

    Returns:
        A dict with
        - "encoder_out": float32 tensor of shape ``encoder_out_shape`` filled
          with standard-normal samples;
        - "encoder_padding_mask": bool tensor of shape ``(T, B)`` whose
          ``(t, b)`` entry is True when ``t`` is at or beyond the random length
          drawn for batch element ``b`` (i.e. the position is padding).
    """
    max_len, batch_size, _feature_dim = encoder_out_shape

    # Draw order (features first, then lengths) is kept so that seeded runs
    # reproduce the same values.
    features = np.random.randn(*encoder_out_shape).astype(np.float32)
    seq_lengths = torch.from_numpy(
        np.random.randint(low=1, high=max_len, size=batch_size)
    )

    # some dummy mask: built as (B, T), then transposed in place to (T, B)
    positions = torch.arange(max_len).view(1, max_len).expand(batch_size, -1)
    limits = seq_lengths.view(batch_size, 1).expand(-1, max_len)
    padding_mask = positions >= limits
    padding_mask.t_()

    return {
        "encoder_out": torch.from_numpy(features),
        "encoder_padding_mask": padding_mask,
    }
def check_decoder_output(decoder_output):
    """Validate the structure of a decoder's output.

    The output is expected to be a tuple with the following constraints:
    - it has exactly two elements;
    - the first element is a torch.Tensor;
    - the second element can be anything (reserved for future use).

    Returns:
        ``(True, None)`` when the output is valid, otherwise ``(False, msg)``
        where ``msg`` describes the violated constraint and ends with the
        position suffix produced by ``_current_postion_info()``.
    """
    if not isinstance(decoder_output, tuple):
        # Message previously misspelled the class name as "FariseqDecoder".
        msg = "FairseqDecoder output must be a tuple" + _current_postion_info()
        return False, msg

    if len(decoder_output) != 2:
        msg = "FairseqDecoder output must be 2-elem tuple" + _current_postion_info()
        return False, msg

    if not isinstance(decoder_output[0], torch.Tensor):
        msg = (
            "FairseqDecoder output[0] must be a torch.Tensor" + _current_postion_info()
        )
        return False, msg

    return True, None
/////////////////////////////////////////////////////////////////////////// +# Base Test class +# /////////////////////////////////////////////////////////////////////////// + + +class TestBaseFairseqModelBase(unittest.TestCase): + """ + This class is used to facilitate writing unittest for any class derived from + `BaseFairseqModel`. + """ + + @classmethod + def setUpClass(cls): + if cls is TestBaseFairseqModelBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpModel(self, model): + self.assertTrue(isinstance(model, BaseFairseqModel)) + self.model = model + + def setupInput(self): + pass + + def setUp(self): + self.model = None + self.forward_input = None + pass + + +class TestFairseqEncoderDecoderModelBase(TestBaseFairseqModelBase): + """ + base code to test FairseqEncoderDecoderModel (formally known as + `FairseqModel`) must be derived from this base class + """ + + @classmethod + def setUpClass(cls): + if cls is TestFairseqEncoderDecoderModelBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpModel(self, model_cls, extra_args_setters=None): + self.assertTrue( + issubclass(model_cls, (FairseqEncoderDecoderModel, FairseqModel)), + msg="This class only tests for FairseqModel subclasses", + ) + + task, parser = get_dummy_task_and_parser() + model_cls.add_args(parser) + + args = parser.parse_args([]) + + if extra_args_setters is not None: + for args_setter in extra_args_setters: + args_setter(args) + model = model_cls.build_model(args, task) + self.model = model + + def setUpInput(self, input=None): + self.forward_input = get_dummy_input() if input is None else input + + def setUp(self): + super().setUp() + + def test_forward(self): + if self.model and self.forward_input: + forward_output = self.model.forward(**self.forward_input) + # for FairseqEncoderDecoderModel, forward returns a tuple of two + # elements, the first one is a Torch.Tensor + succ, msg = 
check_decoder_output(forward_output) + if not succ: + self.assertTrue(succ, msg=msg) + self.forward_output = forward_output + + def test_get_normalized_probs(self): + if self.model and self.forward_input: + forward_output = self.model.forward(**self.forward_input) + logprob = self.model.get_normalized_probs(forward_output, log_probs=True) + prob = self.model.get_normalized_probs(forward_output, log_probs=False) + + # in order for different models/criterion to play with each other + # we need to know whether the logprob or prob output is batch_first + # or not. We assume an additional attribute will be attached to logprob + # or prob. If you find your code failed here, simply override + # FairseqModel.get_normalized_probs, see example at + # https://fburl.com/batch_first_example + self.assertTrue(hasattr(logprob, "batch_first")) + self.assertTrue(hasattr(prob, "batch_first")) + + self.assertTrue(torch.is_tensor(logprob)) + self.assertTrue(torch.is_tensor(prob)) + + +class TestFairseqEncoderModelBase(TestBaseFairseqModelBase): + """ + base class to test FairseqEncoderModel + """ + + @classmethod + def setUpClass(cls): + if cls is TestFairseqEncoderModelBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpModel(self, model_cls, extra_args_setters=None): + self.assertTrue( + issubclass(model_cls, FairseqEncoderModel), + msg="This class is only used for testing FairseqEncoderModel", + ) + task, parser = get_dummy_task_and_parser() + model_cls.add_args(parser) + args = parser.parse_args([]) + if extra_args_setters is not None: + for args_setter in extra_args_setters: + args_setter(args) + + model = model_cls.build_model(args, task) + self.model = model + + def setUpInput(self, input=None): + self.forward_input = get_dummy_input() if input is None else input + # get_dummy_input() is originally for s2s, here we delete extra dict + # items, so it can be used for EncoderModel / Encoder as well + 
self.forward_input.pop("prev_output_tokens", None) + + def setUp(self): + super().setUp() + + def test_forward(self): + if self.forward_input and self.model: + bsz = self.forward_input["src_tokens"].size(0) + forward_output = self.model.forward(**self.forward_input) + + # we expect forward_output to be a dict with the following + # key/value pairs: + # - encoder_out: a Torch.Tensor + # - encoder_padding_mask: a binary Torch.Tensor + succ, msg = check_encoder_output(forward_output, batch_size=bsz) + if not succ: + self.assertTrue(succ, msg=msg) + self.forward_output = forward_output + + def test_get_normalized_probs(self): + if self.model and self.forward_input: + forward_output = self.model.forward(**self.forward_input) + logprob = self.model.get_normalized_probs(forward_output, log_probs=True) + prob = self.model.get_normalized_probs(forward_output, log_probs=False) + + # in order for different models/criterion to play with each other + # we need to know whether the logprob or prob output is batch_first + # or not. We assume an additional attribute will be attached to logprob + # or prob. 
If you find your code failed here, simply override + # FairseqModel.get_normalized_probs, see example at + # https://fburl.com/batch_first_example + self.assertTrue(hasattr(logprob, "batch_first")) + self.assertTrue(hasattr(prob, "batch_first")) + + self.assertTrue(torch.is_tensor(logprob)) + self.assertTrue(torch.is_tensor(prob)) + + +class TestFairseqEncoderBase(unittest.TestCase): + """ + base class to test FairseqEncoder + """ + + @classmethod + def setUpClass(cls): + if cls is TestFairseqEncoderBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpEncoder(self, encoder): + self.assertTrue( + isinstance(encoder, FairseqEncoder), + msg="This class is only used for test FairseqEncoder", + ) + self.encoder = encoder + + def setUpInput(self, input=None): + self.forward_input = get_dummy_input() if input is None else input + # get_dummy_input() is originally for s2s, here we delete extra dict + # items, so it can be used for EncoderModel / Encoder as well + self.forward_input.pop("prev_output_tokens", None) + + def setUp(self): + self.encoder = None + self.forward_input = None + + def test_forward(self): + if self.encoder and self.forward_input: + bsz = self.forward_input["src_tokens"].size(0) + + forward_output = self.encoder.forward(**self.forward_input) + succ, msg = check_encoder_output(forward_output, batch_size=bsz) + if not succ: + self.assertTrue(succ, msg=msg) + self.forward_output = forward_output + + +class TestFairseqDecoderBase(unittest.TestCase): + """ + base class to test FairseqDecoder + """ + + @classmethod + def setUpClass(cls): + if cls is TestFairseqDecoderBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpDecoder(self, decoder): + self.assertTrue( + isinstance(decoder, FairseqDecoder), + msg="This class is only used for test FairseqDecoder", + ) + self.decoder = decoder + + def setUpInput(self, input=None): + self.forward_input = get_dummy_encoder_output() 
if input is None else input + + def setUpPrevOutputTokens(self, tokens=None): + if tokens is None: + self.encoder_input = get_dummy_input() + self.prev_output_tokens = self.encoder_input["prev_output_tokens"] + else: + self.prev_output_tokens = tokens + + def setUp(self): + self.decoder = None + self.forward_input = None + self.prev_output_tokens = None + + def test_forward(self): + if ( + self.decoder is not None + and self.forward_input is not None + and self.prev_output_tokens is not None + ): + forward_output = self.decoder.forward( + prev_output_tokens=self.prev_output_tokens, + encoder_out=self.forward_input, + ) + succ, msg = check_decoder_output(forward_output) + if not succ: + self.assertTrue(succ, msg=msg) + self.forward_input = forward_output + + +class DummyEncoderModel(FairseqEncoderModel): + def __init__(self, encoder): + super().__init__(encoder) + + @classmethod + def build_model(cls, args, task): + return cls(DummyEncoder()) + + def get_logits(self, net_output): + # Inverse of sigmoid to use with BinaryCrossEntropyWithLogitsCriterion as + # F.binary_cross_entropy_with_logits combines sigmoid and CE + return torch.log( + torch.div(net_output["encoder_out"], 1 - net_output["encoder_out"]) + ) + + def get_normalized_probs(self, net_output, log_probs, sample=None): + lprobs = super().get_normalized_probs(net_output, log_probs, sample=sample) + lprobs.batch_first = True + return lprobs + + +class DummyEncoder(FairseqEncoder): + def __init__(self): + super().__init__(None) + + def forward(self, src_tokens, src_lengths): + mask, max_len = lengths_to_encoder_padding_mask(src_lengths) + return {"encoder_out": src_tokens, "encoder_padding_mask": mask} + + +class CrossEntropyCriterionTestBase(unittest.TestCase): + @classmethod + def setUpClass(cls): + if cls is CrossEntropyCriterionTestBase: + raise unittest.SkipTest("Skipping base class test case") + super().setUpClass() + + def setUpArgs(self): + args = argparse.Namespace() + args.sentence_avg = False + 
args.threshold = 0.1 # to use with BinaryCrossEntropyWithLogitsCriterion + return args + + def setUp(self): + args = self.setUpArgs() + self.model = DummyEncoderModel(encoder=DummyEncoder()) + self.criterion = self.criterion_cls.build_criterion(args, task=DummyTask(args)) + + def get_src_tokens(self, correct_prediction, aggregate): + """ + correct_prediction: True if the net_output (src_tokens) should + predict the correct target + aggregate: True if the criterion expects net_output (src_tokens) + aggregated across time axis + """ + predicted_idx = 0 if correct_prediction else 1 + if aggregate: + src_tokens = torch.zeros((2, 2), dtype=torch.float) + for b in range(2): + src_tokens[b][predicted_idx] = 1.0 + else: + src_tokens = torch.zeros((2, 10, 2), dtype=torch.float) + for b in range(2): + for t in range(10): + src_tokens[b][t][predicted_idx] = 1.0 + return src_tokens + + def get_target(self, soft_target): + if soft_target: + target = torch.zeros((2, 2), dtype=torch.float) + for b in range(2): + target[b][0] = 1.0 + else: + target = torch.zeros((2, 10), dtype=torch.long) + return target + + def get_test_sample(self, correct, soft_target, aggregate): + src_tokens = self.get_src_tokens(correct, aggregate) + target = self.get_target(soft_target) + L = src_tokens.size(1) + return { + "net_input": {"src_tokens": src_tokens, "src_lengths": torch.tensor([L])}, + "target": target, + "ntokens": src_tokens.size(0) * src_tokens.size(1), + } diff --git a/fairseq/tests/speech_recognition/test_cross_entropy.py b/fairseq/tests/speech_recognition/test_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..b05400ed95e22762c3e3e5e8fd3ebfa6caf1e325 --- /dev/null +++ b/fairseq/tests/speech_recognition/test_cross_entropy.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from examples.speech_recognition.criterions.cross_entropy_acc import ( + CrossEntropyWithAccCriterion, +) + +from .asr_test_base import CrossEntropyCriterionTestBase + + +class CrossEntropyWithAccCriterionTest(CrossEntropyCriterionTestBase): + def setUp(self): + self.criterion_cls = CrossEntropyWithAccCriterion + super().setUp() + + def test_cross_entropy_all_correct(self): + sample = self.get_test_sample(correct=True, soft_target=False, aggregate=False) + loss, sample_size, logging_output = self.criterion( + self.model, sample, "sum", log_probs=True + ) + assert logging_output["correct"] == 20 + assert logging_output["total"] == 20 + assert logging_output["sample_size"] == 20 + assert logging_output["ntokens"] == 20 + + def test_cross_entropy_all_wrong(self): + sample = self.get_test_sample(correct=False, soft_target=False, aggregate=False) + loss, sample_size, logging_output = self.criterion( + self.model, sample, "sum", log_probs=True + ) + assert logging_output["correct"] == 0 + assert logging_output["total"] == 20 + assert logging_output["sample_size"] == 20 + assert logging_output["ntokens"] == 20 diff --git a/fairseq/tests/speech_recognition/test_vggtransformer.py b/fairseq/tests/speech_recognition/test_vggtransformer.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc73b8c7379970dc0bcc16fcb088a64a1bd7e3b --- /dev/null +++ b/fairseq/tests/speech_recognition/test_vggtransformer.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +# import models/encoder/decoder to be tested +from examples.speech_recognition.models.vggtransformer import ( + TransformerDecoder, + VGGTransformerEncoder, + VGGTransformerModel, + vggtransformer_1, + vggtransformer_2, + vggtransformer_base, +) + +# import base test class +from .asr_test_base import ( + DEFAULT_TEST_VOCAB_SIZE, + TestFairseqDecoderBase, + TestFairseqEncoderBase, + TestFairseqEncoderDecoderModelBase, + get_dummy_dictionary, + get_dummy_encoder_output, + get_dummy_input, +) + + +class 
VGGTransformerModelTest_mid(TestFairseqEncoderDecoderModelBase): + def setUp(self): + def override_config(args): + """ + vggtrasformer_1 use 14 layers of transformer, + for testing purpose, it is too expensive. For fast turn-around + test, reduce the number of layers to 3. + """ + args.transformer_enc_config = ( + "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3" + ) + + super().setUp() + extra_args_setter = [vggtransformer_1, override_config] + + self.setUpModel(VGGTransformerModel, extra_args_setter) + self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) + + +class VGGTransformerModelTest_big(TestFairseqEncoderDecoderModelBase): + def setUp(self): + def override_config(args): + """ + vggtrasformer_2 use 16 layers of transformer, + for testing purpose, it is too expensive. For fast turn-around + test, reduce the number of layers to 3. + """ + args.transformer_enc_config = ( + "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3" + ) + + super().setUp() + extra_args_setter = [vggtransformer_2, override_config] + + self.setUpModel(VGGTransformerModel, extra_args_setter) + self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) + + +class VGGTransformerModelTest_base(TestFairseqEncoderDecoderModelBase): + def setUp(self): + def override_config(args): + """ + vggtrasformer_base use 12 layers of transformer, + for testing purpose, it is too expensive. For fast turn-around + test, reduce the number of layers to 3. + """ + args.transformer_enc_config = ( + "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 3" + ) + + super().setUp() + extra_args_setter = [vggtransformer_base, override_config] + + self.setUpModel(VGGTransformerModel, extra_args_setter) + self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) + + +class VGGTransformerEncoderTest(TestFairseqEncoderBase): + def setUp(self): + super().setUp() + + self.setUpInput(get_dummy_input(T=50, D=80, B=5)) + + def test_forward(self): + print("1. 
test standard vggtransformer") + self.setUpEncoder(VGGTransformerEncoder(input_feat_per_channel=80)) + super().test_forward() + print("2. test vggtransformer with limited right context") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, transformer_context=(-1, 5) + ) + ) + super().test_forward() + print("3. test vggtransformer with limited left context") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, transformer_context=(5, -1) + ) + ) + super().test_forward() + print("4. test vggtransformer with limited right context and sampling") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, + transformer_context=(-1, 12), + transformer_sampling=(2, 2), + ) + ) + super().test_forward() + print("5. test vggtransformer with windowed context and sampling") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, + transformer_context=(12, 12), + transformer_sampling=(2, 2), + ) + ) + + +class TransformerDecoderTest(TestFairseqDecoderBase): + def setUp(self): + super().setUp() + + dict = get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE) + decoder = TransformerDecoder(dict) + dummy_encoder_output = get_dummy_encoder_output(encoder_out_shape=(50, 5, 256)) + + self.setUpDecoder(decoder) + self.setUpInput(dummy_encoder_output) + self.setUpPrevOutputTokens() diff --git a/fairseq/tests/tasks/test_multilingual_denoising.py b/fairseq/tests/tasks/test_multilingual_denoising.py new file mode 100644 index 0000000000000000000000000000000000000000..a0227f69b5747d461f2e52d586eab3eb4a9e8357 --- /dev/null +++ b/fairseq/tests/tasks/test_multilingual_denoising.py @@ -0,0 +1,98 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +import unittest +from tempfile import TemporaryDirectory + +from fairseq import options +from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.tasks.multilingual_denoising import MultilingualDenoisingTask +from tests.utils import build_vocab, make_data + + +class TestMultilingualDenoising(unittest.TestCase): + def test_multilingual_denoising(self): + with TemporaryDirectory() as dirname: + + # prep input file + lang_dir = os.path.join(dirname, "en") + os.mkdir(lang_dir) + raw_file = os.path.join(lang_dir, "raw") + data = make_data(out_file=raw_file) + vocab = build_vocab(data) + + # binarize + binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False) + split = "train" + bin_file = os.path.join(lang_dir, split) + dataset_impl = "mmap" + FileBinarizer.multiprocess_dataset( + input_file=raw_file, + binarizer=binarizer, + dataset_impl=dataset_impl, + vocab_size=len(vocab), + output_prefix=bin_file, + ) + + # setup task + train_args = options.parse_args_and_arch( + options.get_training_parser(), + [ + "--task", + "multilingual_denoising", + "--arch", + "bart_base", + "--seed", + "42", + "--mask-length", + "word", + "--permute-sentences", + "1", + "--rotate", + "0", + "--replace-length", + "-1", + "--mask", + "0.2", + dirname, + ], + ) + cfg = convert_namespace_to_omegaconf(train_args) + task = MultilingualDenoisingTask(cfg.task, binarizer.dict) + + # load datasets + original_dataset = task._load_dataset_split(bin_file, 1, False) + task.load_dataset(split) + masked_dataset = task.dataset(split) + + iterator = task.get_batch_iterator( + dataset=masked_dataset, + max_tokens=65_536, + max_positions=4_096, + ).next_epoch_itr(shuffle=False) + mask_index = task.source_dictionary.index("") + for batch in iterator: + for sample in range(len(batch)): + net_input = batch["net_input"] + masked_src_tokens = net_input["src_tokens"][sample] + masked_src_length = 
net_input["src_lengths"][sample] + masked_tgt_tokens = batch["target"][sample] + + sample_id = batch["id"][sample] + original_tokens = original_dataset[sample_id] + original_tokens = original_tokens.masked_select( + masked_src_tokens[:masked_src_length] == mask_index + ) + masked_tokens = masked_tgt_tokens.masked_select( + masked_src_tokens == mask_index + ) + + assert masked_tokens.equal(original_tokens) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/test_label_smoothing.py b/fairseq/tests/test_label_smoothing.py new file mode 100644 index 0000000000000000000000000000000000000000..04c0f974ac80f7606327f868e948712c3c18f1d0 --- /dev/null +++ b/fairseq/tests/test_label_smoothing.py @@ -0,0 +1,123 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import copy +import unittest + +import tests.utils as test_utils +import torch +from fairseq.criterions.cross_entropy import CrossEntropyCriterion +from fairseq.criterions.label_smoothed_cross_entropy import ( + LabelSmoothedCrossEntropyCriterion, +) + + +class TestLabelSmoothing(unittest.TestCase): + def setUp(self): + # build dictionary + self.d = test_utils.dummy_dictionary(3) + vocab = len(self.d) + self.assertEqual(vocab, 4 + 3) # 4 special + 3 tokens + self.assertEqual(self.d.pad(), 1) + self.assertEqual(self.d.eos(), 2) + self.assertEqual(self.d.unk(), 3) + pad, eos, unk, w1, w2, w3 = 1, 2, 3, 4, 5, 6 # noqa: F841 + + # build dataset + self.data = [ + # the first batch item has padding + { + "source": torch.LongTensor([w1, eos]), + "target": torch.LongTensor([w1, eos]), + }, + { + "source": torch.LongTensor([w1, eos]), + "target": torch.LongTensor([w1, w1, eos]), + }, + ] + self.sample = next(test_utils.dummy_dataloader(self.data)) + + # build model + self.args = argparse.Namespace() + self.args.sentence_avg = False + 
self.args.report_accuracy = False + self.args.probs = ( + torch.FloatTensor( + [ + # pad eos unk w1 w2 w3 + [0.05, 0.05, 0.1, 0.05, 0.3, 0.4, 0.05], + [0.05, 0.10, 0.2, 0.05, 0.2, 0.3, 0.10], + [0.05, 0.15, 0.3, 0.05, 0.1, 0.2, 0.15], + ] + ) + .unsqueeze(0) + .expand(2, 3, 7) + ) # add batch dimension + self.task = test_utils.TestTranslationTask.setup_task(self.args, self.d, self.d) + self.model = self.task.build_model(self.args) + + def test_nll_loss(self): + self.args.label_smoothing = 0.1 + nll_crit = CrossEntropyCriterion.build_criterion(self.args, self.task) + smooth_crit = LabelSmoothedCrossEntropyCriterion.build_criterion( + self.args, self.task + ) + nll_loss, nll_sample_size, nll_logging_output = nll_crit( + self.model, self.sample + ) + smooth_loss, smooth_sample_size, smooth_logging_output = smooth_crit( + self.model, self.sample + ) + self.assertLess(abs(nll_loss - nll_logging_output["loss"]), 1e-6) + self.assertLess(abs(nll_loss - smooth_logging_output["nll_loss"]), 1e-6) + + def test_padding(self): + self.args.label_smoothing = 0.1 + crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task) + loss, _, logging_output = crit(self.model, self.sample) + + def get_one_no_padding(idx): + # create a new sample with just a single batch item so that there's + # no padding + sample1 = next(test_utils.dummy_dataloader([self.data[idx]])) + args1 = copy.copy(self.args) + args1.probs = args1.probs[idx, :, :].unsqueeze(0) + model1 = self.task.build_model(args1) + loss1, _, _ = crit(model1, sample1) + return loss1 + + loss1 = get_one_no_padding(0) + loss2 = get_one_no_padding(1) + self.assertAlmostEqual(loss, loss1 + loss2) + + def test_reduction(self): + self.args.label_smoothing = 0.1 + crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task) + loss, _, logging_output = crit(self.model, self.sample, reduce=True) + unreduced_loss, _, _ = crit(self.model, self.sample, reduce=False) + self.assertAlmostEqual(loss, 
unreduced_loss.sum()) + + def test_zero_eps(self): + self.args.label_smoothing = 0.0 + nll_crit = CrossEntropyCriterion.build_criterion(self.args, self.task) + smooth_crit = LabelSmoothedCrossEntropyCriterion.build_criterion( + self.args, self.task + ) + nll_loss, nll_sample_size, nll_logging_output = nll_crit( + self.model, self.sample + ) + smooth_loss, smooth_sample_size, smooth_logging_output = smooth_crit( + self.model, self.sample + ) + self.assertAlmostEqual(nll_loss, smooth_loss) + + def assertAlmostEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertLess((t1 - t2).abs().max(), 1e-6) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/test_memory_efficient_fp16.py b/fairseq/tests/test_memory_efficient_fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..2bf2f29888d6027896128930626b1aafe7f18475 --- /dev/null +++ b/fairseq/tests/test_memory_efficient_fp16.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import logging +import unittest + +import torch +from fairseq.optim.adam import FairseqAdam +from fairseq.optim.fp16_optimizer import MemoryEfficientFP16Optimizer +from omegaconf import OmegaConf + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestMemoryEfficientFP16(unittest.TestCase): + def setUp(self): + logging.disable(logging.CRITICAL) + + def tearDown(self): + logging.disable(logging.NOTSET) + + def test_load_state_dict(self): + # define simple FP16 model + model = torch.nn.Linear(5, 5).cuda().half() + params = list(model.parameters()) + + # initialize memory efficient FP16 optimizer + # with pseudo DictConfigs + optimizer = FairseqAdam( + cfg=OmegaConf.create( + vars( + argparse.Namespace( + adam_betas="(0.9, 0.999)", + adam_eps=1e-8, + weight_decay=0.0, + lr=[0.00001], + ) + ) + ), + params=params, + ) + me_optimizer = MemoryEfficientFP16Optimizer( + cfg=OmegaConf.create( + { + "common": vars( + argparse.Namespace( + fp16_init_scale=1, + fp16_scale_window=1, + fp16_scale_tolerance=1, + threshold_loss_scale=1, + min_loss_scale=1e-4, + ) + ) + } + ), + params=params, + optimizer=optimizer, + ) + + # optimizer state is created in the first step + loss = model(torch.rand(5).cuda().half()).sum() + me_optimizer.backward(loss) + me_optimizer.step() + + # reload state + state = me_optimizer.state_dict() + me_optimizer.load_state_dict(state) + for k, v in me_optimizer.optimizer.state.items(): + self.assertTrue(k.dtype == torch.float16) + for v_i in v.values(): + if torch.is_tensor(v_i): + self.assertTrue(v_i.dtype == torch.float32) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/test_metrics.py b/fairseq/tests/test_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..fc93b48088e3833914b142156e6de1002eda093b --- /dev/null +++ b/fairseq/tests/test_metrics.py @@ -0,0 +1,77 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +import uuid + +from fairseq.logging import metrics + + +class TestMetrics(unittest.TestCase): + def test_nesting(self): + with metrics.aggregate() as a: + metrics.log_scalar("loss", 1) + with metrics.aggregate() as b: + metrics.log_scalar("loss", 2) + + self.assertEqual(a.get_smoothed_values()["loss"], 1.5) + self.assertEqual(b.get_smoothed_values()["loss"], 2) + + def test_new_root(self): + with metrics.aggregate() as a: + metrics.log_scalar("loss", 1) + with metrics.aggregate(new_root=True) as b: + metrics.log_scalar("loss", 2) + + self.assertEqual(a.get_smoothed_values()["loss"], 1) + self.assertEqual(b.get_smoothed_values()["loss"], 2) + + def test_nested_new_root(self): + with metrics.aggregate() as layer1: + metrics.log_scalar("loss", 1) + with metrics.aggregate(new_root=True) as layer2: + metrics.log_scalar("loss", 2) + with metrics.aggregate() as layer3: + metrics.log_scalar("loss", 3) + with metrics.aggregate(new_root=True) as layer4: + metrics.log_scalar("loss", 4) + metrics.log_scalar("loss", 1.5) + + self.assertEqual(layer4.get_smoothed_values()["loss"], 4) + self.assertEqual(layer3.get_smoothed_values()["loss"], 3) + self.assertEqual(layer2.get_smoothed_values()["loss"], 2.5) + self.assertEqual(layer1.get_smoothed_values()["loss"], 1.25) + + def test_named(self): + name = str(uuid.uuid4()) + metrics.reset_meters(name) + + with metrics.aggregate(name): + metrics.log_scalar("loss", 1) + + metrics.log_scalar("loss", 3) + + with metrics.aggregate(name): + metrics.log_scalar("loss", 2) + + self.assertEqual(metrics.get_smoothed_values(name)["loss"], 1.5) + + def test_nested_duplicate_names(self): + name = str(uuid.uuid4()) + metrics.reset_meters(name) + + with metrics.aggregate(name): + metrics.log_scalar("loss", 1) + with metrics.aggregate() as other: + with metrics.aggregate(name): + metrics.log_scalar("loss", 
2) + metrics.log_scalar("loss", 6) + + self.assertEqual(metrics.get_smoothed_values(name)["loss"], 3) + self.assertEqual(other.get_smoothed_values()["loss"], 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/test_multi_corpus_dataset.py b/fairseq/tests/test_multi_corpus_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..79900abf615f37e3513710352022d547304ccdba --- /dev/null +++ b/fairseq/tests/test_multi_corpus_dataset.py @@ -0,0 +1,82 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from collections import OrderedDict + +import torch + +from fairseq.data import LanguagePairDataset, TokenBlockDataset +from fairseq.data.multi_corpus_dataset import MultiCorpusDataset +from tests.test_train import mock_dict + + +class TestMultiCorpusDataset(unittest.TestCase): + def setUp(self): + d = mock_dict() + tokens_1 = torch.LongTensor([i for i in range(1, 5000, 2)]).view(1, -1) + tokens_ds1 = TokenBlockDataset( + tokens_1, + sizes=[tokens_1.size(-1)], + block_size=1, + pad=0, + eos=1, + include_targets=False, + ) + self.dataset_1 = LanguagePairDataset( + tokens_ds1, tokens_ds1.sizes, d, shuffle=False + ) + tokens_2 = torch.LongTensor([i for i in range(0, 5000, 2)]).view(1, -1) + tokens_ds2 = TokenBlockDataset( + tokens_2, + sizes=[tokens_2.size(-1)], + block_size=1, + pad=0, + eos=1, + include_targets=False, + ) + self.dataset_2 = LanguagePairDataset( + tokens_ds2, tokens_ds2.sizes, d, shuffle=False + ) + + def _test_sample_helper( + self, + distribution, + ): + m = MultiCorpusDataset( + OrderedDict({0: self.dataset_1, 1: self.dataset_2}), + distribution=distribution, + seed=0, + sort_indices=True, + ) + m.set_epoch(1) + indices = m.ordered_indices() + count_sample_from_first_dataset = 0 + items = set() + for i in indices: + item = m[i]["source"].item() + if item % 2 
== 1: + count_sample_from_first_dataset += 1 + + items.add(item) + sample_from_first_ds_percentage = ( + 1.0 * count_sample_from_first_dataset / len(indices) + ) + self.assertLess( + abs(sample_from_first_ds_percentage - distribution[0]), + 0.01, + ) + self.assertEqual( + len(items), + int( + min(len(self.dataset_1), len(indices) * distribution[0]) + + min(len(self.dataset_1), len(indices) * distribution[1]) + ), + ) + print(distribution) + + def test_multi_corpus_dataset(self): + for distribution in [[0.5, 0.5], [0.1, 0.9], [0.9, 0.1], [0.0, 1.0]]: + self._test_sample_helper(distribution=distribution) diff --git a/fairseq/tests/test_multi_corpus_sampled_dataset.py b/fairseq/tests/test_multi_corpus_sampled_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..88f0817a54541a42b4837141a83ab4a0cb870133 --- /dev/null +++ b/fairseq/tests/test_multi_corpus_sampled_dataset.py @@ -0,0 +1,95 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +from collections import OrderedDict + +import numpy as np +import torch +from fairseq.data import LanguagePairDataset, TokenBlockDataset +from fairseq.data.multi_corpus_sampled_dataset import MultiCorpusSampledDataset +from tests.test_train import mock_dict + + +class TestMultiCorpusSampledDataset(unittest.TestCase): + def setUp(self): + d = mock_dict() + tokens_1 = torch.LongTensor([1]).view(1, -1) + tokens_ds1 = TokenBlockDataset( + tokens_1, + sizes=[tokens_1.size(-1)], + block_size=1, + pad=0, + eos=1, + include_targets=False, + ) + self.dataset_1 = LanguagePairDataset( + tokens_ds1, tokens_ds1.sizes, d, shuffle=False + ) + tokens_2 = torch.LongTensor([2]).view(1, -1) + tokens_ds2 = TokenBlockDataset( + tokens_2, + sizes=[tokens_2.size(-1)], + block_size=1, + pad=0, + eos=1, + include_targets=False, + ) + self.dataset_2 = LanguagePairDataset( + tokens_ds2, tokens_ds2.sizes, d, shuffle=False + ) + + def _test_sample_helper( + self, + expected_sample_from_first_ds_percentage, + num_samples=1000, + sampling_func=None, + ): + # To make sure test is not flaky + np.random.seed(0) + if sampling_func is None: + m = MultiCorpusSampledDataset( + OrderedDict({0: self.dataset_1, 1: self.dataset_2}), + ) + else: + m = MultiCorpusSampledDataset( + OrderedDict({0: self.dataset_1, 1: self.dataset_2}), + sampling_func=sampling_func, + ) + m.ordered_indices() + count_sample_from_first_dataset = 0 + for _ in range(num_samples): + if m.collater([m[0], m[1]])["net_input"]["src_tokens"][0] == 1: + count_sample_from_first_dataset += 1 + sample_from_first_ds_percentage = ( + 1.0 * count_sample_from_first_dataset / num_samples + ) + self.assertLess( + abs( + sample_from_first_ds_percentage + - expected_sample_from_first_ds_percentage + ), + 0.01, + ) + + def test_multi_corpus_sampled_dataset_uniform_sample(self): + self._test_sample_helper(expected_sample_from_first_ds_percentage=0.5) + + def test_multi_corpus_sampled_dataset_weighted_sample(self): + def 
naive_weighted_sample(weights): + def f(input): + v = np.random.random() + agg = 0 + for i, weight in enumerate(weights): + agg += weight + if agg > v: + return i + + return f + + self._test_sample_helper( + expected_sample_from_first_ds_percentage=0.9, + sampling_func=naive_weighted_sample(weights=[0.9, 0.1]), + ) diff --git a/fairseq/tests/test_multihead_attention.py b/fairseq/tests/test_multihead_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..4a0b430b6f0e98ad83afa21e7b004392313c6c26 --- /dev/null +++ b/fairseq/tests/test_multihead_attention.py @@ -0,0 +1,488 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import random +import unittest + +import pytest +import torch + +from fairseq.modules.multihead_attention import MultiheadAttention, _mask_for_xformers + +BATCH = [20, 41, 97] +SEQ = [64] +EMB = [48] +HEADS = [4] +DROP = 0.1 +DEVICE = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"] +ATTN_MASK_DTYPE = [None, torch.uint8, torch.bool, torch.float] +KEY_PADDING_MASK_DTYPE = [None, torch.uint8, torch.bool] + + +# FIXME: some tests fail when decimal=2, fix this and set decimal to 2 +def assert_almost_equal(x, y, decimal=1, err_msg=""): + import numpy.testing as npt + + if isinstance(x, torch.Tensor): + x = x.cpu().detach().numpy() + if isinstance(y, torch.Tensor): + y = y.cpu().detach().numpy() + npt.assert_array_almost_equal(x, y, err_msg=err_msg, decimal=decimal) + + +def _reset_seeds(): + torch.manual_seed(0) + torch.random.manual_seed(0) + random.seed(0) + torch.cuda.manual_seed_all(0) + + +def _get_mask(to_dtype: torch.dtype, dim0: int, dim1: int): + if to_dtype == torch.float: + mask = torch.randint(0, 2, (dim0, dim1)).to(dtype=torch.bool) + return mask.to(dtype=to_dtype).masked_fill(mask, -float("inf")) + return torch.randint(0, 2, (dim0, dim1)).to(dtype=to_dtype) + + +def 
test_mask_for_xformers(): + # Additive Mask + m_float_add = torch.tensor([float("-inf"), 0]).to(torch.float) + m_float_add_flipped = torch.tensor([0, float("-inf")]).to(torch.float) + m_float16_add = torch.tensor([float("-inf"), 0]).to(torch.float16) + m_float16_add_flipped = torch.tensor([0, float("-inf")]).to(torch.float16) + m_uint = torch.tensor([1, 0]).to(torch.uint8) + m_uint_flipped = torch.tensor([0, 1]).to(torch.uint8) + m_bool = torch.tensor([False, True]) + + assert torch.equal(_mask_for_xformers(m_float_add), m_float_add) + assert torch.equal(_mask_for_xformers(m_float16_add), m_float16_add) + assert torch.equal(_mask_for_xformers(m_uint), m_uint_flipped) + assert torch.equal(_mask_for_xformers(m_bool), ~m_bool) + + assert torch.equal( + _mask_for_xformers(m_float_add, to_dtype=torch.float16), m_float16_add + ) + assert torch.equal( + _mask_for_xformers(m_float_add, to_dtype=torch.float), m_float_add + ) + assert torch.equal(_mask_for_xformers(m_float_add, to_dtype=torch.bool), m_bool) + assert torch.equal( + _mask_for_xformers(m_float_add, to_dtype=torch.uint8), m_uint_flipped + ) + + assert torch.equal( + _mask_for_xformers(m_float16_add, to_dtype=torch.float16), m_float16_add + ) + assert torch.equal( + _mask_for_xformers(m_float16_add, to_dtype=torch.float), m_float_add + ) + assert torch.equal(_mask_for_xformers(m_float16_add, to_dtype=torch.bool), m_bool) + assert torch.equal( + _mask_for_xformers(m_float16_add, to_dtype=torch.uint8), m_uint_flipped + ) + + assert torch.equal( + _mask_for_xformers(m_bool, to_dtype=torch.float16), m_float16_add_flipped + ) + assert torch.equal( + _mask_for_xformers(m_bool, to_dtype=torch.float), m_float_add_flipped + ) + assert torch.equal(_mask_for_xformers(m_bool, to_dtype=torch.bool), ~m_bool) + assert torch.equal(_mask_for_xformers(m_bool, to_dtype=torch.uint8), m_uint) + + assert torch.equal( + _mask_for_xformers(m_uint, to_dtype=torch.float16), m_float16_add + ) + assert torch.equal(_mask_for_xformers(m_uint, 
to_dtype=torch.float), m_float_add) + assert torch.equal(_mask_for_xformers(m_uint, to_dtype=torch.bool), m_bool) + assert torch.equal(_mask_for_xformers(m_uint, to_dtype=torch.uint8), m_uint_flipped) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="blocksparse requires gpu") +@pytest.mark.skip(reason="not part of latest xformers") +@pytest.mark.parametrize("device", ["cuda"]) +@pytest.mark.parametrize("add_zero_attn", [False]) +@pytest.mark.parametrize("batch_size", [20]) +@pytest.mark.parametrize("embedding", [64]) +@pytest.mark.parametrize("seq_len", [64]) +@pytest.mark.parametrize("num_heads", [4]) +def test_xformers_blocksparse_parity( + device, + add_zero_attn, + batch_size, + embedding, + seq_len, + num_heads, +): + + xformers_att_config = '{"name": "scaled_dot_product"}' + xformers_blocksparse_blocksize = 16 + xformers_blocksparse_layout = torch.ones( + seq_len // xformers_blocksparse_blocksize, + seq_len // xformers_blocksparse_blocksize, + dtype=torch.int32, + ) + + q = torch.rand(seq_len, batch_size, embedding).to(device).half() + q.requires_grad = True + k = torch.rand(seq_len, batch_size, embedding).to(device).half() + k.requires_grad = True + v = torch.rand(seq_len, batch_size, embedding).to(device).half() + v.requires_grad = True + + q_ = q.detach().clone().half() + q_.requires_grad = True + k_ = k.detach().clone().half() + k_.requires_grad = True + v_ = v.detach().clone().half() + v_.requires_grad = True + + _reset_seeds() + xf_blocksparse_mha = ( + MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + add_zero_attn=add_zero_attn, + xformers_att_config=xformers_att_config, + xformers_blocksparse_layout=xformers_blocksparse_layout, + xformers_blocksparse_blocksize=xformers_blocksparse_blocksize, + ) + .to(device) + .half() + ) + + xf_blocksparse_output, _ = xf_blocksparse_mha( + q, + k, + v, + ) + + _reset_seeds() + xformers_mha = ( + MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + add_zero_attn=add_zero_attn, + 
xformers_att_config=xformers_att_config, + xformers_blocksparse_layout=None, + ) + .to(device) + .half() + ) + + xformers_output, _ = xformers_mha( + q_, + k_, + v_, + ) + + # # account for when nan != nan + rand = random.uniform(0, 1) + xformers_output = xformers_output.masked_fill(xformers_output.isnan(), rand) + xf_blocksparse_output = xf_blocksparse_output.masked_fill( + xf_blocksparse_output.isnan(), rand + ) + + assert_almost_equal(xformers_output, xf_blocksparse_output) + + loss_blocksparse = torch.norm(xformers_output) + loss_original = torch.norm(xf_blocksparse_output) + loss_blocksparse.backward() + loss_original.backward() + + q.masked_fill(q.isnan(), rand) + q_.masked_fill(q_.isnan(), rand) + k.masked_fill(k.isnan(), rand) + k_.masked_fill(k_.isnan(), rand) + v.masked_fill(v.isnan(), rand) + v_.masked_fill(v_.isnan(), rand) + + assert_almost_equal(q.grad, q_.grad) + assert_almost_equal(k.grad, k_.grad) + assert_almost_equal(v.grad, v_.grad) + + +@pytest.mark.parametrize("device", DEVICE) +@pytest.mark.parametrize("attn_dtype", ATTN_MASK_DTYPE) +@pytest.mark.parametrize("key_padding_dtype", KEY_PADDING_MASK_DTYPE) +@pytest.mark.parametrize("add_bias_kv", [True, False]) +@pytest.mark.parametrize("add_zero_attn", [True, False]) +# TODO: test with static_kv True +@pytest.mark.parametrize("static_kv", [False]) +@pytest.mark.parametrize("batch_size", BATCH) +@pytest.mark.parametrize("embedding", EMB) +@pytest.mark.parametrize("seq_len", SEQ) +@pytest.mark.parametrize("num_heads", HEADS) +def test_xformers_single_forward_parity( + device, + attn_dtype, + key_padding_dtype, + add_bias_kv, + add_zero_attn, + static_kv, + batch_size, + embedding, + seq_len, + num_heads, +): + + xformers_att_config = '{"name": "scaled_dot_product"}' + + attn_mask = ( + None + if attn_dtype is None + else _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len).to(device) + ) + key_padding_mask = ( + None + if key_padding_dtype is None + else _get_mask(to_dtype=key_padding_dtype, 
dim0=batch_size, dim1=seq_len).to( + device + ) + ) + + q = torch.rand(seq_len, batch_size, embedding).to(device) + q.requires_grad = True + k = torch.rand(seq_len, batch_size, embedding).to(device) + k.requires_grad = True + v = torch.rand(seq_len, batch_size, embedding).to(device) + v.requires_grad = True + + q_ = q.detach().clone() + q_.requires_grad = True + k_ = k.detach().clone() + k_.requires_grad = True + v_ = v.detach().clone() + v_.requires_grad = True + + # TODO: dropouts in the two implementations lead to different entries dropped. + _reset_seeds() + xformers_mha = MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + xformers_att_config=xformers_att_config, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ).to(device) + xformers_output, _ = xformers_mha( + q, + k, + v, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + static_kv=static_kv, + ) + + _reset_seeds() + original_mha = MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + xformers_att_config=None, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ).to(device) + original_output, _ = original_mha( + q_, + k_, + v_, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + static_kv=static_kv, + ) + + # account for when nan != nan + if xformers_output.isnan().any() or original_output.isnan().any(): + rand = random.uniform(0, 1) + xformers_output = xformers_output.masked_fill(xformers_output.isnan(), rand) + original_output = original_output.masked_fill(original_output.isnan(), rand) + + # torch.equal works for cpu, on cuda allclose is needed. + assert torch.allclose( + xformers_output, original_output, atol=1e-06 + ), f"max diff is {torch.max(torch.abs(xformers_output - original_output))}" + + loss_xformers = torch.norm(xformers_output) + loss_original = torch.norm(original_output) + loss_xformers.backward() + loss_original.backward() + + # torch.equal works for cpu, on cuda allclose is needed. 
+ assert torch.allclose( + q.grad, q_.grad + ), f"max diff is {torch.max(torch.abs(q.grad - q_.grad))}" + assert torch.allclose( + k.grad, k_.grad + ), f"max diff is {torch.max(torch.abs(k.grad - k_.grad))}" + assert torch.allclose( + v.grad, v_.grad + ), f"max diff is {torch.max(torch.abs(v.grad - v_.grad))}" + + +def test_mask_padding_parity(): + def old_padding_code(key_padding_mask, attn_mask): + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask), + ], + dim=1, + ) + return key_padding_mask, attn_mask + + # values don't matter for this test. + mha = MultiheadAttention( + embed_dim=8, + num_heads=2, + dropout=0.0, + add_bias_kv=True, + add_zero_attn=True, + ) + + key_padding_mask = torch.rand((8, 64)) + attn_mask = torch.rand((64, 64)) + + kp_mask_orig, a_mask_orig = old_padding_code(key_padding_mask, attn_mask) + kp_mask_new, a_mask_new = mha._pad_masks(key_padding_mask, attn_mask) + + assert kp_mask_orig.size() == kp_mask_new.size() + assert a_mask_orig.size() == a_mask_new.size() + assert torch.equal(kp_mask_orig, kp_mask_new) + assert torch.equal(a_mask_orig, a_mask_new) + + +def test_add_bias_parity(): + # values don't matter for this test. 
+ mha = MultiheadAttention( + embed_dim=8, + num_heads=2, + dropout=0.0, + add_bias_kv=True, + add_zero_attn=True, + ) + + def old_bias_code(k, v, key_padding_mask, attn_mask, bsz): + k = torch.cat([k, mha.bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, mha.bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + key_padding_mask.new_zeros(key_padding_mask.size(0), 1), + ], + dim=1, + ) + return k, v, key_padding_mask, attn_mask + + seq_len = 64 + bsz = 8 + embedding = 8 + key_padding_mask = torch.rand((bsz, seq_len)) + attn_mask = torch.rand((seq_len, seq_len)) + k = torch.rand((seq_len, bsz, embedding)) + v = torch.rand((seq_len, bsz, embedding)) + + k_orig, v_orig, kp_mask_orig, a_mask_orig = old_bias_code( + k, v, key_padding_mask, attn_mask, bsz + ) + k_new, v_new, kp_mask_new, a_mask_new = mha._add_bias( + k, v, key_padding_mask, attn_mask, bsz + ) + + assert torch.equal(k_orig, k_new) + assert torch.equal(v_orig, v_new) + assert torch.equal(kp_mask_orig, kp_mask_new) + assert torch.equal(a_mask_orig, a_mask_new) + + +class TestMultiheadAttention(unittest.TestCase): + def test_append_prev_key_padding_mask(self): + bsz = 1 + src_len = 4 + + cases = [ + # no padding mask + (None, None, None), + # current padding mask only + ( + torch.tensor([[1]]).bool(), + None, + torch.tensor([[0, 0, 0, 1]]).bool(), + ), + # previous padding mask only + ( + None, + torch.tensor([[0, 1, 0]]).bool(), + torch.tensor([[0, 1, 0, 0]]).bool(), + ), + # both padding masks + ( + torch.tensor([[1]]).bool(), + torch.tensor([[0, 1, 0]]).bool(), + torch.tensor([[0, 1, 0, 1]]).bool(), + ), + # prev_key_padding_mask already full + ( + torch.tensor([[0, 1, 0, 1]]).bool(), + None, + torch.tensor([[0, 1, 0, 1]]).bool(), + ), + # key_padding_mask already full + ( + None, + torch.tensor([[0, 1, 0, 1]]).bool(), + 
torch.tensor([[0, 1, 0, 1]]).bool(), + ), + ] + for c in cases: + key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( + c[0], + c[1], + batch_size=bsz, + src_len=src_len, + static_kv=False, + ) + + if key_padding_mask is not None: + self.assertTrue( + torch.all(torch.eq(key_padding_mask, c[2])), + f"Unexpected resultant key padding mask: {key_padding_mask}" + f" given current: {c[0]} and previous: {c[1]}", + ) + self.assertEqual(key_padding_mask.size(0), bsz) + self.assertEqual(key_padding_mask.size(1), src_len) + else: + self.assertIsNone(c[2]) + + def test_pruning_heads(self): + embed_dim = 768 + num_heads = 12 + num_heads_to_keep = 8 + dummy_input = torch.randn(32, 2, embed_dim) + mha = MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads) + reserve_head_index = mha._get_reserve_head_index( + num_heads_to_keep=num_heads_to_keep + ) + mha._adaptive_prune_heads(reserve_head_index=reserve_head_index) + mha._set_skip_embed_dim_check() + mha(query=dummy_input, key=dummy_input, value=dummy_input) + self.assertEqual(mha.head_dim, embed_dim / num_heads) + self.assertEqual(mha.num_heads, num_heads_to_keep) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/test_noising.py b/fairseq/tests/test_noising.py new file mode 100644 index 0000000000000000000000000000000000000000..1956f6ad1d0ffd9340a1b028d298b2cf78ae460f --- /dev/null +++ b/fairseq/tests/test_noising.py @@ -0,0 +1,531 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +from typing import Dict, List + +import torch + +import tests.utils as test_utils +from fairseq import utils +from fairseq.data import ( + Dictionary, + LanguagePairDataset, + TransformEosDataset, + data_utils, + noising, +) + + +class TestDataNoising(unittest.TestCase): + def _get_test_data_with_bpe_cont_marker(self, append_eos=True): + """ + Args: + append_eos: if True, each input sentence in the source tokens tensor + will have an EOS appended to the end. + + Returns: + vocabs: BPE vocab with continuation markers as suffixes to denote + non-end of word tokens. This is the standard BPE format used in + fairseq's preprocessing. + x: input tensor containing numberized source tokens, with EOS at the + end if append_eos is true + src_lengths: and source lengths. + """ + vocab = Dictionary() + vocab.add_symbol("he@@") + vocab.add_symbol("llo") + vocab.add_symbol("how") + vocab.add_symbol("are") + vocab.add_symbol("y@@") + vocab.add_symbol("ou") + vocab.add_symbol("n@@") + vocab.add_symbol("ew") + vocab.add_symbol("or@@") + vocab.add_symbol("k") + + src_tokens = [ + ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"], + ["how", "are", "y@@", "ou"], + ] + x, src_lengths = x, src_lengths = self._convert_src_tokens_to_tensor( + vocab=vocab, src_tokens=src_tokens, append_eos=append_eos + ) + return vocab, x, src_lengths + + def _get_test_data_with_bpe_end_marker(self, append_eos=True): + """ + Args: + append_eos: if True, each input sentence in the source tokens tensor + will have an EOS appended to the end. + + Returns: + vocabs: BPE vocab with end-of-word markers as suffixes to denote + tokens at the end of a word. This is an alternative to fairseq's + standard preprocessing framework and is not generally supported + within fairseq. + x: input tensor containing numberized source tokens, with EOS at the + end if append_eos is true + src_lengths: and source lengths. 
+ """ + vocab = Dictionary() + vocab.add_symbol("he") + vocab.add_symbol("llo_EOW") + vocab.add_symbol("how_EOW") + vocab.add_symbol("are_EOW") + vocab.add_symbol("y") + vocab.add_symbol("ou_EOW") + vocab.add_symbol("n") + vocab.add_symbol("ew_EOW") + vocab.add_symbol("or") + vocab.add_symbol("k_EOW") + + src_tokens = [ + ["he", "llo_EOW", "n", "ew_EOW", "y", "or", "k_EOW"], + ["how_EOW", "are_EOW", "y", "ou_EOW"], + ] + x, src_lengths = x, src_lengths = self._convert_src_tokens_to_tensor( + vocab=vocab, src_tokens=src_tokens, append_eos=append_eos + ) + return vocab, x, src_lengths + + def _get_test_data_with_word_vocab(self, append_eos=True): + """ + Args: + append_eos: if True, each input sentence in the source tokens tensor + will have an EOS appended to the end. + + Returns: + vocabs: word vocab + x: input tensor containing numberized source tokens, with EOS at the + end if append_eos is true + src_lengths: and source lengths. + """ + vocab = Dictionary() + + vocab.add_symbol("hello") + vocab.add_symbol("how") + vocab.add_symbol("are") + vocab.add_symbol("you") + vocab.add_symbol("new") + vocab.add_symbol("york") + src_tokens = [ + ["hello", "new", "york", "you"], + ["how", "are", "you", "new", "york"], + ] + x, src_lengths = self._convert_src_tokens_to_tensor( + vocab=vocab, src_tokens=src_tokens, append_eos=append_eos + ) + return vocab, x, src_lengths + + def _convert_src_tokens_to_tensor( + self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool + ): + src_len = [len(x) for x in src_tokens] + # If we have to append EOS, we include EOS in counting src length + if append_eos: + src_len = [length + 1 for length in src_len] + + x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad()) + for i in range(len(src_tokens)): + for j in range(len(src_tokens[i])): + x[i][j] = vocab.index(src_tokens[i][j]) + if append_eos: + x[i][j + 1] = vocab.eos() + + x = x.transpose(1, 0) + return x, torch.LongTensor(src_len) + + def 
assert_eos_at_end(self, x, x_len, eos): + """Asserts last token of every sentence in x is EOS""" + for i in range(len(x_len)): + self.assertEqual( + x[x_len[i] - 1][i], + eos, + ( + "Expected eos (token id {eos}) at the end of sentence {i} " + "but got {other} instead" + ).format(i=i, eos=eos, other=x[i][-1]), + ) + + def assert_word_dropout_correct(self, x, x_noised, x_len, l_noised): + # Expect only the first word (2 bpe tokens) of the first example + # was dropped out + self.assertEqual(x_len[0] - 2, l_noised[0]) + for i in range(l_noised[0]): + self.assertEqual(x_noised[i][0], x[i + 2][0]) + + def test_word_dropout_with_eos(self): + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True) + + with data_utils.numpy_seed(1234): + noising_gen = noising.WordDropout(vocab) + x_noised, l_noised = noising_gen.noising(x, x_len, 0.2) + self.assert_word_dropout_correct( + x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised + ) + self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def assert_word_blanking_correct(self, x, x_noised, x_len, l_noised, unk): + # Expect only the first word (2 bpe tokens) of the first example + # was blanked out + self.assertEqual(x_len[0], l_noised[0]) + for i in range(l_noised[0]): + if i < 2: + self.assertEqual(x_noised[i][0], unk) + else: + self.assertEqual(x_noised[i][0], x[i][0]) + + def test_word_blank_with_eos(self): + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True) + + with data_utils.numpy_seed(1234): + noising_gen = noising.WordDropout(vocab) + x_noised, l_noised = noising_gen.noising(x, x_len, 0.2, vocab.unk()) + self.assert_word_blanking_correct( + x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk() + ) + self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def generate_unchanged_shuffle_map(self, length): + return {i: i for i in range(length)} + + def assert_word_shuffle_matches_expected( + self, + x, + x_len, + 
max_shuffle_distance: int, + vocab: Dictionary, + expected_shufle_maps: List[Dict[int, int]], + expect_eos_at_end: bool, + bpe_end_marker=None, + ): + """ + This verifies that with a given x, x_len, max_shuffle_distance, and + vocab, we get the expected shuffle result. + + Args: + x: Tensor of shape (T x B) = (sequence_length, batch_size) + x_len: Tensor of length B = batch_size + max_shuffle_distance: arg to pass to noising + expected_shuffle_maps: List[mapping] where mapping is a + Dict[old_index, new_index], mapping x's elements from their + old positions in x to their new positions in x. + expect_eos_at_end: if True, check the output to make sure there is + an EOS at the end. + bpe_end_marker: str denoting the BPE end token. If this is not None, we + set the BPE cont token to None in the noising classes. + """ + bpe_cont_marker = None + if bpe_end_marker is None: + bpe_cont_marker = "@@" + + with data_utils.numpy_seed(1234): + word_shuffle = noising.WordShuffle( + vocab, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker + ) + x_noised, l_noised = word_shuffle.noising( + x, x_len, max_shuffle_distance=max_shuffle_distance + ) + + # For every example, we have a different expected shuffle map. We check + # that each example is shuffled as expected according to each + # corresponding shuffle map. 
+ for i in range(len(expected_shufle_maps)): + shuffle_map = expected_shufle_maps[i] + for k, v in shuffle_map.items(): + self.assertEqual(x[k][i], x_noised[v][i]) + + # Shuffling should not affect the length of each example + for pre_shuffle_length, post_shuffle_length in zip(x_len, l_noised): + self.assertEqual(pre_shuffle_length, post_shuffle_length) + if expect_eos_at_end: + self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def test_word_shuffle_with_eos(self): + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True) + + # Assert word shuffle with max shuffle distance 0 causes input to be + # unchanged + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + max_shuffle_distance=0, + vocab=vocab, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(example_len) + for example_len in x_len + ], + expect_eos_at_end=True, + ) + + # Assert word shuffle with max shuffle distance 3 matches our expected + # shuffle order + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + vocab=vocab, + max_shuffle_distance=3, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(x_len[0]), + {0: 0, 1: 3, 2: 1, 3: 2}, + ], + expect_eos_at_end=True, + ) + + def test_word_shuffle_with_eos_nonbpe(self): + """The purpose of this is to test shuffling logic with word vocabs""" + vocab, x, x_len = self._get_test_data_with_word_vocab(append_eos=True) + + # Assert word shuffle with max shuffle distance 0 causes input to be + # unchanged + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + max_shuffle_distance=0, + vocab=vocab, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(example_len) + for example_len in x_len + ], + expect_eos_at_end=True, + ) + + # Assert word shuffle with max shuffle distance 3 matches our expected + # shuffle order + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + vocab=vocab, + max_shuffle_distance=3, + expected_shufle_maps=[ + {0: 
0, 1: 1, 2: 3, 3: 2}, + {0: 0, 1: 2, 2: 1, 3: 3, 4: 4}, + ], + expect_eos_at_end=True, + ) + + def test_word_shuffle_without_eos(self): + """Same result as word shuffle with eos except no EOS at end""" + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False) + + # Assert word shuffle with max shuffle distance 0 causes input to be + # unchanged + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + max_shuffle_distance=0, + vocab=vocab, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(example_len) + for example_len in x_len + ], + expect_eos_at_end=False, + ) + + # Assert word shuffle with max shuffle distance 3 matches our expected + # shuffle order + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + vocab=vocab, + max_shuffle_distance=3, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(x_len[0]), + {0: 0, 1: 3, 2: 1, 3: 2}, + ], + expect_eos_at_end=False, + ) + + def test_word_shuffle_without_eos_with_bpe_end_marker(self): + """Same result as word shuffle without eos except using BPE end token""" + vocab, x, x_len = self._get_test_data_with_bpe_end_marker(append_eos=False) + + # Assert word shuffle with max shuffle distance 0 causes input to be + # unchanged + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + max_shuffle_distance=0, + vocab=vocab, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(example_len) + for example_len in x_len + ], + expect_eos_at_end=False, + bpe_end_marker="_EOW", + ) + + # Assert word shuffle with max shuffle distance 3 matches our expected + # shuffle order + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + vocab=vocab, + max_shuffle_distance=3, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(x_len[0]), + {0: 0, 1: 3, 2: 1, 3: 2}, + ], + expect_eos_at_end=False, + bpe_end_marker="_EOW", + ) + + def assert_no_eos_at_end(self, x, x_len, eos): + """Asserts that the last token of each sentence in 
x is not EOS""" + for i in range(len(x_len)): + self.assertNotEqual( + x[x_len[i] - 1][i], + eos, + "Expected no eos (token id {eos}) at the end of sentence {i}.".format( + eos=eos, i=i + ), + ) + + def test_word_dropout_without_eos(self): + """Same result as word dropout with eos except no EOS at end""" + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False) + + with data_utils.numpy_seed(1234): + noising_gen = noising.WordDropout(vocab) + x_noised, l_noised = noising_gen.noising(x, x_len, 0.2) + self.assert_word_dropout_correct( + x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised + ) + self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def test_word_blank_without_eos(self): + """Same result as word blank with eos except no EOS at end""" + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False) + + with data_utils.numpy_seed(1234): + noising_gen = noising.WordDropout(vocab) + x_noised, l_noised = noising_gen.noising(x, x_len, 0.2, vocab.unk()) + self.assert_word_blanking_correct( + x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk() + ) + self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def _get_noising_dataset_batch( + self, + src_tokens_no_pad, + src_dict, + append_eos_to_tgt=False, + ): + """ + Constructs a NoisingDataset and the corresponding + ``LanguagePairDataset(NoisingDataset(src), src)``. If + *append_eos_to_tgt* is True, wrap the source dataset in + :class:`TransformEosDataset` to append EOS to the clean source when + using it as the target. 
+ """ + src_dataset = test_utils.TestDataset(data=src_tokens_no_pad) + + noising_dataset = noising.NoisingDataset( + src_dataset=src_dataset, + src_dict=src_dict, + seed=1234, + max_word_shuffle_distance=3, + word_dropout_prob=0.2, + word_blanking_prob=0.2, + noising_class=noising.UnsupervisedMTNoising, + ) + tgt = src_dataset + language_pair_dataset = LanguagePairDataset( + src=noising_dataset, tgt=tgt, src_sizes=None, src_dict=src_dict + ) + language_pair_dataset = TransformEosDataset( + language_pair_dataset, + src_dict.eos(), + append_eos_to_tgt=append_eos_to_tgt, + ) + + dataloader = torch.utils.data.DataLoader( + dataset=language_pair_dataset, + batch_size=2, + collate_fn=language_pair_dataset.collater, + ) + denoising_batch_result = next(iter(dataloader)) + return denoising_batch_result + + def test_noising_dataset_with_eos(self): + src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker( + append_eos=True + ) + + # Format data for src_dataset + src_tokens = torch.t(src_tokens) + src_tokens_no_pad = [] + for src_sentence in src_tokens: + src_tokens_no_pad.append( + utils.strip_pad(tensor=src_sentence, pad=src_dict.pad()) + ) + denoising_batch_result = self._get_noising_dataset_batch( + src_tokens_no_pad=src_tokens_no_pad, src_dict=src_dict + ) + + eos, pad = src_dict.eos(), src_dict.pad() + + # Generated noisy source as source + expected_src = torch.LongTensor( + [[4, 5, 10, 11, 8, 12, 13, eos], [pad, pad, pad, 6, 8, 9, 7, eos]] + ) + # Original clean source as target (right-padded) + expected_tgt = torch.LongTensor( + [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]] + ) + generated_src = denoising_batch_result["net_input"]["src_tokens"] + tgt_tokens = denoising_batch_result["target"] + + self.assertTensorEqual(expected_src, generated_src) + self.assertTensorEqual(expected_tgt, tgt_tokens) + + def test_noising_dataset_without_eos(self): + """ + Similar to test noising dataset with eos except that we have to set + 
*append_eos_to_tgt* to ``True``. + """ + + src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker( + append_eos=False + ) + + # Format data for src_dataset + src_tokens = torch.t(src_tokens) + src_tokens_no_pad = [] + for src_sentence in src_tokens: + src_tokens_no_pad.append( + utils.strip_pad(tensor=src_sentence, pad=src_dict.pad()) + ) + denoising_batch_result = self._get_noising_dataset_batch( + src_tokens_no_pad=src_tokens_no_pad, + src_dict=src_dict, + append_eos_to_tgt=True, + ) + + eos, pad = src_dict.eos(), src_dict.pad() + + # Generated noisy source as source + expected_src = torch.LongTensor( + [[4, 5, 10, 11, 8, 12, 13], [pad, pad, pad, 6, 8, 9, 7]] + ) + # Original clean source as target (right-padded) + expected_tgt = torch.LongTensor( + [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]] + ) + + generated_src = denoising_batch_result["net_input"]["src_tokens"] + tgt_tokens = denoising_batch_result["target"] + + self.assertTensorEqual(expected_src, generated_src) + self.assertTensorEqual(expected_tgt, tgt_tokens) + + def assertTensorEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertEqual(t1.ne(t2).long().sum(), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/test_online_backtranslation.py b/fairseq/tests/test_online_backtranslation.py new file mode 100644 index 0000000000000000000000000000000000000000..0ae7e773da0ff838b3c8151bc14b84a6a9238a72 --- /dev/null +++ b/fairseq/tests/test_online_backtranslation.py @@ -0,0 +1,206 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import tempfile +import unittest +from pathlib import Path +from typing import Any, Dict, Sequence + +import fairseq.data.indexed_dataset as indexed_dataset +import fairseq.options +import fairseq.tasks.online_backtranslation as obt +import torch +from tests import utils + + +def mk_sample(tokens: Sequence[int], batch_size: int = 2) -> Dict[str, Any]: + batch = torch.stack([torch.tensor(tokens, dtype=torch.long)] * batch_size) + sample = { + "net_input": { + "src_tokens": batch, + "prev_output_tokens": batch, + "src_lengths": torch.tensor([len(tokens)] * batch_size, dtype=torch.long), + }, + "target": batch[:, 1:], + } + return sample + + +def mk_dataset(num_samples: int, max_len: int, output: Path): + output.parent.mkdir(exist_ok=True) + idx = indexed_dataset.IndexedDatasetBuilder(str(output)) + data = torch.randint(5, 100, (num_samples, max_len)) + lengths = torch.randint(3, max_len, (num_samples,)) + for d, l in zip(data, lengths): + d[0] = 0 + idx.add_item(d[:l]) + idx.finalize(output.with_suffix(".idx")) + assert output.exists() + assert output.with_suffix(".idx").exists() + + +class OnlineBacktranslationTest(unittest.TestCase): + + tmp_dir = Path(tempfile.mkdtemp(suffix="OnlineBacktranslationTest")) + + @classmethod + def obt_task( + cls, languages: Sequence[str], data: Path = None, language_mapping: str = None + ): + dict_path = cls.tmp_dir / "dict.txt" + if not dict_path.exists(): + dictionary = utils.dummy_dictionary(100) + dictionary.save(str(dict_path)) + + if data is not None: + (data / "dict.txt").write_text(dict_path.read_text()) + else: + data = cls.tmp_dir + assert len(languages) >= 2 + + kwargs = { + "arch": "transformer", + # --max-sentences=1 for better predictability of batches + "max_sentences": 1, + # Use characteristics dimensions + "encoder_layers": 3, + "encoder_embed_dim": 12, + "encoder_ffn_embed_dim": 14, + "encoder_attention_heads": 4, + "decoder_layers": 3, + "decoder_embed_dim": 12, + "decoder_output_dim": 12, + 
def test_lang_tokens(self):
    """Check language tokens are in the dictionary and that a zh sample yields the en BOS."""
    task, model = self.obt_task(["en", "ro", "zh"])
    # unittest assertions (rather than bare ``assert``) are not stripped by
    # ``python -O`` and report the offending values on failure.
    for lang in ("en", "ro", "zh"):
        self.assertIn(obt._lang_token(lang), task.dictionary)

    en_bos = obt._lang_token_index(task.common_dict, "en")
    self.assertEqual("en", task.common_dict[en_bos].strip("_"))
    zh_bos = obt._lang_token_index(task.common_dict, "zh")
    self.assertEqual("zh", task.common_dict[zh_bos].strip("_"))
    zh_sample = mk_sample([zh_bos, 16, 14, 12, 10])

    # we expect to receive the bos token for translation
    self.assertEqual(task.get_bos_token_from_sample(zh_sample), en_bos)
def test_valid_dataset(self):
    """Load a generated en-zh validation split and check ids and source BOS tokens."""
    data = self.tmp_path("test_valid_dataset")
    for side in ("en", "zh"):
        mk_dataset(10, 21, data / f"valid.en-zh.{side}.bin")

    task, model = self.obt_task(["en", "zh"], data)
    valid = task.load_dataset("valid")
    en_bos = obt._lang_token_index(task.common_dict, "en")

    assert valid is not None
    valid.prefetch(range(10))
    first, last = valid[0], valid[9]
    # Ids follow the dataset order.
    self.assertEqual(first["id"], 0)
    self.assertEqual(last["id"], 9)
    # Every source sentence starts with the "en" language token.
    self.assertEqual(first["source"][0], en_bos)
    self.assertEqual(last["source"][0], en_bos)
    # TODO: could we test the target side ?
def assertFnMatch(self, fn, values):
    """Assert that ``fn`` maps every key of ``values`` to its associated value."""
    for arg, expected in values.items():
        actual = fn(arg)
        message = f"Fn has wrong value: fn({arg}) = {actual} != {expected}"
        self.assertEqual(actual, expected, message)
def test_hash_collision(self):
    """Views built with the same key share one store object; a new key adds one.

    ``arr1`` and ``arr2`` are created with the same ``(dummy_path, 1)``
    arguments but different arrays; the assertions below expect the store to
    still hold a single object and ``arr2`` to expose the first array.
    """
    data_server_1 = np.array([0, 1])
    data_server_2 = np.array([2, 3])
    arr1 = PlasmaView(data_server_1, dummy_path, 1, plasma_path=self.path)
    assert len(arr1.client.list()) == 1
    # Same path and key as arr1, different data: no new object is expected.
    arr2 = PlasmaView(data_server_2, dummy_path, 1, plasma_path=self.path)
    assert len(arr1.client.list()) == 1
    assert len(arr2.client.list()) == 1
    # arr2 resolves to the data stored by arr1, not to data_server_2.
    assert (arr2.array == data_server_1).all()
    # New hash key based on tuples
    arr3 = PlasmaView(
        data_server_2, dummy_path, (1, 12312312312, None), plasma_path=self.path
    )
    assert (
        len(arr2.client.list()) == 2
    ), "No new object was created by using a novel hash key"
    assert (
        arr3.object_id in arr2.client.list()
    ), "No new object was created by using a novel hash key"
    assert (
        arr3.object_id in arr3.client.list()
    ), "No new object was created by using a novel hash key"
    # Drop the views explicitly before the test returns.
    del arr3, arr2, arr1
def test_plasma_store_full_raises(self):
    """Putting an array larger than the store must raise ``PlasmaStoreFull``."""
    with tempfile.NamedTemporaryFile() as new_path:
        server = PlasmaStore.start(path=new_path.name, nbytes=10000)
        try:
            with self.assertRaises(plasma.PlasmaStoreFull):
                # 10000 float64 values need 80000 bytes, far more than the
                # 10000-byte store started above.
                PlasmaView(
                    np.random.rand(10000, 1), dummy_path, 1, plasma_path=new_path.name
                )
        finally:
            # Stop the store even when the assertion fails, so a failing
            # test does not leave the server process running.
            server.kill()
def test_forward(self):
    """Compare the module's output for the fixed sample with reference values."""
    produced = self.rel_pos_enc(self.sample).cpu().detach().numpy()
    reference_rows = [
        [[0.9093, -0.4161]],
        [[0.8415, 0.5403]],
        [[0.0000, 1.0000]],
        [[-0.8415, 0.5403]],
        [[-0.9093, -0.4161]],
    ]
    reference = torch.tensor(reference_rows).cpu().detach().numpy()
    # Reference values are rounded to 4 decimals, hence the tolerance.
    matches = np.allclose(produced, reference, atol=1e-4)
    self.assertTrue(matches)
class TestReproducibility(unittest.TestCase):
    """Check that resuming from a checkpoint reproduces an uninterrupted run's stats."""

    def _test_reproducibility(
        self,
        name,
        extra_flags=None,
        delta=0.0001,
        resume_checkpoint="checkpoint1.pt",
        max_epoch=3,
    ):
        """Train once from scratch, once resumed, and compare the final logs.

        Args:
            name: suffix for the temporary data directory.
            extra_flags: additional command-line flags for both training runs.
            delta: tolerance passed to ``assertAlmostEqual`` for every stat.
            resume_checkpoint: checkpoint file (inside the data directory)
                renamed to ``checkpoint_last.pt`` before the second run.
            max_epoch: value passed as ``--max-epoch`` to both runs.
        """

        def get_last_log_stats_containing_string(log_records, search_string):
            """Return the parsed JSON of the newest record containing ``search_string``.

            Returns ``None`` when no record matches.
            """
            # Search the records that were passed in, newest first.
            for log_record in reversed(log_records):
                if isinstance(log_record.msg, str) and search_string in log_record.msg:
                    return json.loads(log_record.msg)
            return None

        if extra_flags is None:
            extra_flags = []

        # Both runs use exactly the same flags.
        train_flags = [
            "--dropout",
            "0.0",
            "--log-format",
            "json",
            "--log-interval",
            "1",
            "--max-epoch",
            str(max_epoch),
        ] + extra_flags

        with tempfile.TemporaryDirectory(name) as data_dir:
            with self.assertLogs():
                test_binaries.create_dummy_data(data_dir)
                test_binaries.preprocess_translation_data(data_dir)

            # first run: train from scratch up to max_epoch
            with self.assertLogs() as logs:
                test_binaries.train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    train_flags,
                )
            train_log = get_last_log_stats_containing_string(logs.records, "train_loss")
            valid_log = get_last_log_stats_containing_string(logs.records, "valid_loss")
            self.assertIsNotNone(train_log, "no train_loss stats logged by first run")
            self.assertIsNotNone(valid_log, "no valid_loss stats logged by first run")

            # second run: resume from resume_checkpoint and train up to max_epoch
            os.rename(
                os.path.join(data_dir, resume_checkpoint),
                os.path.join(data_dir, "checkpoint_last.pt"),
            )
            with self.assertLogs() as logs:
                test_binaries.train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    train_flags,
                )
            train_res_log = get_last_log_stats_containing_string(
                logs.records, "train_loss"
            )
            valid_res_log = get_last_log_stats_containing_string(
                logs.records, "valid_loss"
            )
            self.assertIsNotNone(
                train_res_log, "no train_loss stats logged by resumed run"
            )
            self.assertIsNotNone(
                valid_res_log, "no valid_loss stats logged by resumed run"
            )

            for k in ["train_loss", "train_ppl", "train_num_updates", "train_gnorm"]:
                self.assertAlmostEqual(
                    float(train_log[k]), float(train_res_log[k]), delta=delta
                )
            for k in [
                "valid_loss",
                "valid_ppl",
                "valid_num_updates",
                "valid_best_loss",
            ]:
                self.assertAlmostEqual(
                    float(valid_log[k]), float(valid_res_log[k]), delta=delta
                )

    def test_reproducibility(self):
        self._test_reproducibility("test_reproducibility")

    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
    def test_reproducibility_fp16(self):
        self._test_reproducibility(
            "test_reproducibility_fp16",
            [
                "--fp16",
                "--fp16-init-scale",
                "4096",
            ],
            delta=0.011,
        )

    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
    def test_reproducibility_memory_efficient_fp16(self):
        self._test_reproducibility(
            "test_reproducibility_memory_efficient_fp16",
            [
                "--memory-efficient-fp16",
                "--fp16-init-scale",
                "4096",
            ],
        )

    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
    def test_reproducibility_amp(self):
        self._test_reproducibility(
            "test_reproducibility_amp",
            [
                "--amp",
                "--fp16-init-scale",
                "4096",
            ],
            delta=0.011,
        )

    def test_mid_epoch_reproducibility(self):
        self._test_reproducibility(
            "test_mid_epoch_reproducibility",
            ["--save-interval-updates", "3"],
            resume_checkpoint="checkpoint_1_3.pt",
            max_epoch=1,
        )
class TestResamplingDataset(unittest.TestCase):
    """Exercise ``ResamplingDataset`` on a four-string weighted ``ListDataset``."""

    def setUp(self):
        self.strings = ["ab", "c", "def", "ghij"]
        self.weights = [4.0, 2.0, 7.0, 1.5]
        self.size_ratio = 2
        sizes = np.array([len(text) for text in self.strings])
        self.dataset = ListDataset(self.strings, sizes)

    def _test_common(self, resampling_dataset, iters):
        """Run ``iters`` epochs and report ordering and sampling-frequency stats.

        Returns a dict with ``ordered_by_size`` (whether every epoch's indices
        had non-decreasing sizes) and ``max_distribution_diff`` (largest gap
        between observed and weight-implied frequency).
        """
        assert len(self.dataset) == len(self.strings) == len(self.weights)
        assert len(resampling_dataset) == self.size_ratio * len(self.strings)

        sorted_by_size = True
        counts = collections.defaultdict(int)
        seen = 0

        for epoch in range(iters):
            resampling_dataset.set_epoch(epoch)

            order = resampling_dataset.ordered_indices()
            assert len(order) == len(resampling_dataset)

            last_size = -1
            for idx in order:
                size_now = resampling_dataset.size(idx)
                # Make sure indices map to same sequences within an epoch
                assert resampling_dataset[idx] == resampling_dataset[idx]

                # Make sure length of sequence is correct
                assert size_now == len(resampling_dataset[idx])

                counts[resampling_dataset[idx]] += 1
                seen += 1

                if last_size > size_now:
                    sorted_by_size = False
                last_size = size_now

        assert set(counts.keys()) == set(self.strings)

        worst_gap = 0.0
        for text, weight in zip(self.strings, self.weights):
            observed = counts[text] / seen
            expected = weight / sum(self.weights)
            worst_gap = max(worst_gap, abs(expected - observed))

        return {"ordered_by_size": sorted_by_size, "max_distribution_diff": worst_gap}

    def test_resampling_dataset_batch_by_size_false(self):
        resampler = ResamplingDataset(
            self.dataset,
            self.weights,
            size_ratio=self.size_ratio,
            batch_by_size=False,
            seed=0,
        )
        stats = self._test_common(resampler, iters=1000)

        # For batch_by_size = False, the batches should be returned in
        # arbitrary order of size.
        assert not stats["ordered_by_size"]

        # Allow tolerance in distribution error of 2%.
        assert stats["max_distribution_diff"] < 0.02

    def test_resampling_dataset_batch_by_size_true(self):
        resampler = ResamplingDataset(
            self.dataset,
            self.weights,
            size_ratio=self.size_ratio,
            batch_by_size=True,
            seed=0,
        )
        stats = self._test_common(resampler, iters=1000)

        # For batch_by_size = True, the batches should be returned in
        # increasing order of size.
        assert stats["ordered_by_size"]

        # Allow tolerance in distribution error of 2%.
        assert stats["max_distribution_diff"] < 0.02
@fairseq.tasks.register_task("fake_task")
class FakeTask(fairseq.tasks.LegacyFairseqTask):
    """Minimal task exposing one dummy dictionary as both source and target."""

    def __init__(self, args):
        super().__init__(args)
        vocab = dummy_dictionary(VOCAB_SIZE - 4)
        # The resulting dictionary must hold exactly VOCAB_SIZE entries.
        assert len(vocab) == VOCAB_SIZE
        self.dictionary = vocab

    @property
    def source_dictionary(self):
        """Dictionary used for source tokens."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """Dictionary used for target tokens (same object as the source one)."""
        return self.dictionary
def mk_sample(
    lang: str, device: str, tok: Sequence[int] = None, batch_size: int = 2
) -> Dict[str, Any]:
    """Build a toy sample dict repeating one sentence ``batch_size`` times.

    When ``tok`` is empty or ``None`` a fixed sentence is used: one for
    ``lang == "en"`` and another for any other language. With
    ``device == "gpu"`` the batch is moved to CUDA.
    """
    assert device in ("gpu", "cpu")
    if not tok:
        default_en = [10, 11, 12, 13, 14, 15, 2]
        default_other = [20, 21, 22, 23, 24, 25, 26, 27, 2]
        tok = default_en if lang == "en" else default_other

    sentence = torch.tensor(tok, dtype=torch.long)
    batch = torch.stack([sentence] * batch_size)
    if device == "gpu":
        batch = batch.cuda()
    # Lengths are created on the same device as the batch.
    lengths = torch.tensor(
        [len(tok)] * batch_size, dtype=torch.long, device=batch.device
    )
    net_input = {
        "src_tokens": batch,
        "prev_output_tokens": batch,
        "src_lengths": lengths,
    }
    return {"net_input": net_input, "target": batch[:, 1:]}
def test_roberta_shared_params(self):
    """Embedding and LM head share weights by default, not when untied."""
    embed_name = "encoder.sentence_encoder.embed_tokens.weight"
    head_name = "encoder.lm_head.weight"

    # Default model: both names belong to one sharing group.
    _, tied = get_toy_model("cpu", architecture="roberta")
    self.assertSharing(tied, [[embed_name, head_name]])

    # Untied model: each name forms its own group.
    _, untied = get_toy_model(
        "cpu", architecture="roberta", untie_weights_roberta=True
    )
    self.assertSharing(untied, [[embed_name], [head_name]])
@cpu_gpu
def test_roberta_batching(self, device: str):
    """
    Checks that a batch of size 2 gives the same result for both rows, and
    that this result matches the one obtained with a batch of size 1.
    """
    _, model = get_toy_model(device)
    sample = mk_sample("en", device, batch_size=1)
    slen = sample["net_input"]["src_lengths"][0]
    sample2 = mk_sample("en", device, batch_size=2)
    with torch.no_grad():
        z = model.encoder.forward(
            sample["net_input"]["src_tokens"], sample["net_input"]["src_lengths"]
        )
        z = z["encoder_out"][-1]
        logits, _ = model.forward(**sample["net_input"])

        # Pass the 2-row batch together with its own lengths (one entry per
        # row), not the single-entry lengths of the 1-row batch.
        z2 = model.encoder.forward(
            sample2["net_input"]["src_tokens"], sample2["net_input"]["src_lengths"]
        )
        z2 = z2["encoder_out"][-1]
        logits2, _ = model.forward(**sample2["net_input"])

    self.assertEqual(z.shape, (slen, 1, 12))
    self.assertEqual(z2.shape, (slen, 2, 12))
    # Both rows of the size-2 batch hold the same sentence.
    self.assertTensorEqual(logits2[0], logits2[1])
    # Batch size must not change the per-sentence output.
    self.assertTensorEqual(logits[0], logits2[0])
@cpu_gpu
def test_roberta_incremental_decoder(self, device: str):
    """
    Checks that step-by-step decoding with an incremental state produces the
    same outputs as decoding the whole target at once.
    """
    _, model = get_toy_model(device)

    en_input = mk_sample("en", device)["net_input"]
    ro_tokens = mk_sample("ro", device)["net_input"]["src_tokens"]

    en_enc = model.encoder.forward(
        en_input["src_tokens"], src_lengths=en_input["src_lengths"]
    )
    bs, tgt_len = ro_tokens.shape

    # Decode without incremental state
    full_out, _ = model.decoder.forward(ro_tokens, encoder_out=en_enc)
    self.assertEqual(full_out.shape, (bs, tgt_len, VOCAB_SIZE))
    self.assertTensorEqual(full_out[0], full_out[1])

    # Decode with incremental state
    inc_state = {}
    step_outs = []
    for end in range(1, tgt_len + 1):
        step_out, _ = model.decoder.forward(
            ro_tokens[:, :end], encoder_out=en_enc, incremental_state=inc_state
        )
        self.assertEqual(step_out.shape, (bs, 1, VOCAB_SIZE))
        step_outs.append(step_out)

    for pos, step_out in enumerate(step_outs):
        # Intra-batch
        self.assertTensorEqual(step_out[0], step_out[1])
        # Incremental vs non-incremental
        self.assertTensorEqual(step_out[:, 0], full_out[:, pos])
def params(model, name):
    """Resolve a dotted attribute path such as ``"a.b.c"`` starting at ``model``."""
    target = model
    # Walk one attribute per path component, left to right.
    for attr in name.split("."):
        target = getattr(target, attr)
    return target
def test_jit_compile_rope_module(self):
    """Scripted module and scripted rotary function must match the eager versions."""
    scripted_module = torch.jit.script(self.rope_pos_emd)
    scripted_apply = torch.jit.script(apply_rotary_pos_emb)
    # Test several different lengths
    for seq_len in (3, 5, 10):
        sample = torch.randn(seq_len, self.B, self.C)
        query = sample.view(seq_len, self.B, 1, self.C)

        # Eager reference: cos/sin from the module, then rotated query/key.
        eager_cos, eager_sin = self.rope_pos_emd(sample, seq_len)
        eager_q, eager_k = apply_rotary_pos_emb(query, query, eager_cos, eager_sin)

        # Same computation through the scripted versions.
        jit_cos, jit_sin = scripted_module(sample, seq_len)
        jit_q, jit_k = scripted_apply(query, query, jit_cos, jit_sin)

        # Ensure the outputs are the same
        pairs = (
            (eager_cos, jit_cos),
            (eager_sin, jit_sin),
            (eager_q, jit_q),
            (eager_k, jit_k),
        )
        for eager_value, jit_value in pairs:
            self.assertTrue(torch.allclose(eager_value, jit_value))
def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE):
    """Return a ``Dictionary`` with ``vocab_size`` numeric-string symbols added.

    The symbols are ``"0"``, ``"1"``, ... and each is added with ``n=1000``.
    """
    dummy_dict = Dictionary()
    # add dummy symbols to satisfy vocab size
    for index in range(vocab_size):
        dummy_dict.add_symbol(str(index), n=1000)
    return dummy_dict
class TestJitSequenceGeneratorBase(unittest.TestCase):
    """Shared fixture and assertion helpers for the TorchScript export tests."""

    def setUp(self):
        """Build a dummy task, a random two-sentence sample and a small transformer."""
        self.task, self.parser = get_dummy_task_and_parser()
        eos = self.task.tgt_dict.eos()
        src_tokens = torch.randint(3, 50, (2, 10)).long()
        # Append EOS to each of the two source sentences.
        src_tokens = torch.cat((src_tokens, torch.LongTensor([[eos], [eos]])), -1)
        src_lengths = torch.LongTensor([2, 10])
        self.sample = {
            "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths}
        }
        TransformerModel.add_args(self.parser)
        args = self.parser.parse_args([])
        args.encoder_layers = 2
        args.decoder_layers = 1
        self.transformer_model = TransformerModel.build_model(args, self.task)

    def assertOutputEqual(self, hypo, pos_probs):
        """Check that a hypothesis has as many scores and tokens as ``pos_probs``."""
        pos_scores = torch.FloatTensor(pos_probs).log()
        self.assertTensorSizeEqual(hypo["positional_scores"], pos_scores)
        # numel() returns plain ints, so compare them directly;
        # assertTensorSizeEqual would call .size() on them and fail.
        self.assertEqual(pos_scores.numel(), hypo["tokens"].numel())

    def assertTensorSizeEqual(self, t1, t2):
        """Assert that two tensors have the same size."""
        self.assertEqual(t1.size(), t2.size(), "size mismatch")

    def assertAlmostEqual(self, t1, t2):
        """Assert two same-sized tensors differ by less than 1e-4 everywhere.

        NOTE: this replaces ``unittest.TestCase.assertAlmostEqual`` with a
        tensor-only, two-argument version.
        """
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        self.assertLess((t1 - t2).abs().max(), 1e-4)

    def assertTensorEqual(self, t1, t2):
        """Assert that two tensors have the same size and identical elements."""
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        self.assertEqual(t1.ne(t2).long().sum(), 0)

    def assertHypoEqual(self, h1, h2):
        "Check two hypos are equal"
        self.assertTensorEqual(h1["tokens"], h2["tokens"])
        self.assertAlmostEqual(h1["positional_scores"], h2["positional_scores"])
        self.assertLess(abs(h1["score"] - h2["score"]), 1e-6)
        self.assertAlmostEqual(h1["attention"], h2["attention"])

    def _test_save_and_load(self, scripted_module):
        """Save a scripted module to a temporary file and load it back."""
        with tempfile.NamedTemporaryFile() as f:
            scripted_module.save(f.name)
            torch.jit.load(f.name)
class TestSequenceGenerator(TestSequenceGeneratorBase):
    """Beam-search tests run against the fixture from ``test_utils.sequence_generator_setup()``."""

    def setUp(self):
        """Load the shared dictionary, word ids and model, and build the input sample."""
        (
            self.tgt_dict,
            self.w1,
            self.w2,
            src_tokens,
            src_lengths,
            self.model,
        ) = test_utils.sequence_generator_setup()
        self.sample = {
            "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths}
        }

    def test_with_normalization(self):
        generator = SequenceGenerator([self.model], self.tgt_dict, beam_size=2)
        hypos = generator.forward(self.sample)
        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
        # sentence 1, beam 1
        self.assertHypoTokens(hypos[0][0], [w1, eos])
        self.assertHypoScore(hypos[0][0], [0.9, 1.0])
        # sentence 1, beam 2
        self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
        self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0])
        # sentence 2, beam 1
        self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos])
        self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0])
        # sentence 2, beam 2
        self.assertHypoTokens(hypos[1][1], [w1, w2, eos])
        self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6])

    def test_without_normalization(self):
        # Sentence 1: unchanged from the normalized case
        # Sentence 2: beams swap order
        generator = SequenceGenerator(
            [self.model], self.tgt_dict, beam_size=2, normalize_scores=False
        )
        hypos = generator.forward(self.sample)
        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
        # sentence 1, beam 1
        self.assertHypoTokens(hypos[0][0], [w1, eos])
        self.assertHypoScore(hypos[0][0], [0.9, 1.0], normalized=False)
        # sentence 1, beam 2
        self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
        self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0], normalized=False)
        # sentence 2, beam 1
        self.assertHypoTokens(hypos[1][0], [w1, w2, eos])
        self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6], normalized=False)
        # sentence 2, beam 2
        self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos])
        self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0], normalized=False)

    def test_with_lenpen_favoring_short_hypos(self):
        lenpen = 0.6
        generator = SequenceGenerator(
            [self.model], self.tgt_dict, beam_size=2, len_penalty=lenpen
        )
        hypos = generator.forward(self.sample)
        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
        # sentence 1, beam 1
        self.assertHypoTokens(hypos[0][0], [w1, eos])
        self.assertHypoScore(hypos[0][0], [0.9, 1.0], lenpen=lenpen)
        # sentence 1, beam 2
        self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
        self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0], lenpen=lenpen)
        # sentence 2, beam 1
        self.assertHypoTokens(hypos[1][0], [w1, w2, eos])
        self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6], lenpen=lenpen)
        # sentence 2, beam 2
        self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos])
        self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0], lenpen=lenpen)

    def test_with_lenpen_favoring_long_hypos(self):
        lenpen = 5.0
        generator = SequenceGenerator(
            [self.model], self.tgt_dict, beam_size=2, len_penalty=lenpen
        )
        hypos = generator.forward(self.sample)
        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
        # sentence 1, beam 1
        self.assertHypoTokens(hypos[0][0], [w2, w1, w2, eos])
        self.assertHypoScore(hypos[0][0], [0.1, 0.9, 0.9, 1.0], lenpen=lenpen)
        # sentence 1, beam 2
        self.assertHypoTokens(hypos[0][1], [w1, eos])
        self.assertHypoScore(hypos[0][1], [0.9, 1.0], lenpen=lenpen)
        # sentence 2, beam 1
        self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos])
        self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0], lenpen=lenpen)
        # sentence 2, beam 2
        self.assertHypoTokens(hypos[1][1], [w1, w2, eos])
        self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6], lenpen=lenpen)

    def test_maxlen(self):
        generator = SequenceGenerator(
            [self.model], self.tgt_dict, beam_size=2, max_len_b=2
        )
        hypos = generator.forward(self.sample)
        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
        # sentence 1, beam 1
        self.assertHypoTokens(hypos[0][0], [w1, eos])
        self.assertHypoScore(hypos[0][0], [0.9, 1.0])
        # sentence 1, beam 2
        self.assertHypoTokens(hypos[0][1], [w2, w2, eos])
        self.assertHypoScore(hypos[0][1], [0.1, 0.1, 0.6])
        # sentence 2, beam 1
        self.assertHypoTokens(hypos[1][0], [w1, w2, eos])
        self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6])
        # sentence 2, beam 2
        self.assertHypoTokens(hypos[1][1], [w2, w2, eos])
        self.assertHypoScore(hypos[1][1], [0.3, 0.9, 0.01])

    def test_encoder_with_different_output_len(self):
        args = self.model.encoder.args
        task = test_utils.TestTranslationTask.setup_task(
            args, self.tgt_dict, self.tgt_dict
        )
        reshaping_model = test_utils.TestReshapingModel.build_model(args, task)
        generator = SequenceGenerator(
            [reshaping_model], self.tgt_dict, beam_size=2, max_len_b=2
        )
        hypos = generator.forward(self.sample)
        for sent in [0, 1]:
            for beam in [0, 1]:
                # unittest assertion rather than a bare assert, which is
                # stripped when Python runs with -O
                self.assertIsNotNone(hypos[sent][beam]["attention"])

    def test_generation_with_additional_input(self):
        args = self.model.encoder.args
        task = test_utils.TestTranslationTask.setup_task(
            args, self.tgt_dict, self.tgt_dict
        )
        add_input_model = test_utils.TestAdditionalInputModel.build_model(args, task)
        generator = SequenceGenerator([add_input_model], self.tgt_dict, beam_size=2)
        # dict.copy() is shallow, so give the copy its own "net_input" dict
        # before adding the extra key; the shared fixture stays untouched and
        # the extended sample is the one actually passed to the generator.
        sample = self.sample.copy()
        sample["net_input"] = dict(
            sample["net_input"],
            fancy_other_input=sample["net_input"]["src_tokens"],
        )
        hypos = generator.forward(sample)
        eos, w1 = self.tgt_dict.eos(), self.w1
        # sentence 1, beam 1
        self.assertHypoTokens(hypos[0][0], [w1, eos])
        self.assertHypoScore(hypos[0][0], [0.9, 1.0])
@classmethod + def setUpClass(cls): + ( + cls.tgt_dict, + cls.w1, + cls.w2, + src_tokens, + src_lengths, + cls.model, + ) = test_utils.sequence_generator_setup() + return cls + + def test_finds_repetitive_tokens(self): + bsz, vocab_size, beam_size, step = 2, 4, 1, 3 + generated_tok = torch.tensor( + [[2, 2, 2, 2], [3, 3, 3, 3]], dtype=torch.long, device="cuda" + ) + lprobs = torch.zeros((beam_size * bsz, vocab_size), device="cuda") + desired_result = lprobs.new_tensor( + [[0.0, 0.0, -math.inf, 0.0], [0.0, 0.0, 0.0, -math.inf]] + ) + + cuda_ext_result, baseline_result = self._compare_cuda_ext_to_default_implem( + bsz, beam_size, generated_tok, lprobs, step, 2 + ) + self.assertTensorEqual(cuda_ext_result, desired_result) + self.assertTensorEqual(baseline_result, desired_result) + + @unittest.skipIf(torch.__version__ < "1.6.0", JIT_MSG) + def test_jit_no_extension(self): + bsz, vocab_size, beam_size, step = 2, 4, 1, 3 + generated_tok = torch.tensor( + [[2, 2, 2, 2], [3, 3, 3, 3]], dtype=torch.long, device="cuda" + ) + lprobs = torch.zeros((beam_size * bsz, vocab_size), device="cuda") + blocker = NGramRepeatBlock(2, use_extension=False) + base_result = blocker(generated_tok, lprobs.clone(), bsz, beam_size, step) + scripted_blocker = torch.jit.script(blocker) + jit_result = scripted_blocker( + generated_tok, lprobs.clone(), bsz, beam_size, step + ) + self.assertTensorEqual(base_result, jit_result) + + def test_ngram_blocking_same_as_default_implem(self): + """Test that cuda extension returns same things as default impl in many settings.""" + vocab_size = 4 + step = 6 + for _ in range(2): + block_param = np.random.choice([1, 2, 3, 4]) + batch_size = np.random.randint(1, 8) + beam_size = np.random.choice([1, 2, 4, 8]) + lprobs = torch.zeros((beam_size * batch_size, vocab_size), device="cuda") + + generated_tok = torch.tensor( + np.random.randint( + 0, vocab_size, size=(batch_size * beam_size, step + 1) + ), + device="cuda", + dtype=torch.long, + ) + 
self._compare_cuda_ext_to_default_implem( + batch_size, + beam_size, + generated_tok, + lprobs, + step, + block_param, + ) + + def _compare_cuda_ext_to_default_implem( + self, bsz, beam_size, generated_tok, lprobs, step, block_param + ): + """Assert that cuda extension and default implem return the same thing.""" + blocker = NGramRepeatBlock(block_param) + assert blocker.use_extension, "Extension not compiled" + cuda_ext_result = blocker( + generated_tok, + lprobs.clone(), + bsz, + beam_size, + step, + ) + blocker.use_extension = False + baseline_result = blocker( + generated_tok, + lprobs.clone(), + bsz, + beam_size, + step, + ) + self.assertTensorEqual(cuda_ext_result, baseline_result) + blocker.use_extension = True + return cuda_ext_result, baseline_result + + +class TestDiverseBeamSearch(TestSequenceGeneratorBase): + def setUp(self): + # construct dummy dictionary + d = test_utils.dummy_dictionary(vocab_size=2) + self.assertEqual(d.pad(), 1) + self.assertEqual(d.eos(), 2) + self.assertEqual(d.unk(), 3) + self.eos = d.eos() + self.w1 = 4 + self.w2 = 5 + + # construct source data + self.src_tokens = torch.LongTensor( + [ + [self.w1, self.w2, self.eos], + [self.w1, self.w2, self.eos], + ] + ) + self.src_lengths = torch.LongTensor([2, 2]) + + args = argparse.Namespace() + unk = 0.0 + args.beam_probs = [ + # step 0: + torch.FloatTensor( + [ + # eos w1 w2 + # sentence 1: + [0.0, unk, 0.9, 0.1], # beam 1 + [0.0, unk, 0.9, 0.1], # beam 2 + # sentence 2: + [0.0, unk, 0.7, 0.3], + [0.0, unk, 0.7, 0.3], + ] + ), + # step 1: + torch.FloatTensor( + [ + # eos w1 w2 + # sentence 1: + [0.0, unk, 0.6, 0.4], + [0.0, unk, 0.6, 0.4], + # sentence 2: + [0.25, unk, 0.35, 0.4], + [0.25, unk, 0.35, 0.4], + ] + ), + # step 2: + torch.FloatTensor( + [ + # eos w1 w2 + # sentence 1: + [1.0, unk, 0.0, 0.0], + [1.0, unk, 0.0, 0.0], + # sentence 2: + [0.9, unk, 0.1, 0.0], + [0.9, unk, 0.1, 0.0], + ] + ), + ] + + task = test_utils.TestTranslationTask.setup_task(args, d, d) + self.model = 
class TestDiverseSiblingsSearch(TestDiverseBeamSearch):
    """Runs the inherited fixture through ``search.DiverseSiblingsSearch``."""

    def assertHypoScore(
        self, hypo, pos_probs, sibling_rank, diversity_rate, normalized=True, lenpen=1.0
    ):
        """Check ``hypo`` scores against log-probs reduced by ``sibling_rank * diversity_rate``."""
        expected = torch.FloatTensor(pos_probs).log()
        penalty = torch.Tensor(sibling_rank) * diversity_rate
        expected.sub_(penalty)
        self.assertAlmostEqual(hypo["positional_scores"], expected)
        self.assertEqual(expected.numel(), hypo["tokens"].numel())
        total = expected.sum()
        if normalized:
            total /= expected.numel() ** lenpen
        self.assertLess(abs(total - hypo["score"]), 1e-6)

    def test_diverse_beam_search(self):
        strategy = search.DiverseSiblingsSearch(self.tgt_dict, diversity_rate=0.5)
        generator = SequenceGenerator(
            [self.model], self.tgt_dict, beam_size=2, search_strategy=strategy
        )
        net_input = {
            "src_tokens": self.src_tokens,
            "src_lengths": self.src_lengths,
        }
        hypos = generator.forward({"net_input": net_input})
        eos, w1, w2 = self.eos, self.w1, self.w2
        # (sentence, beam, expected tokens, positional probs, sibling ranks)
        expectations = [
            (0, 0, [w1, w1, eos], [0.9, 0.6, 1.0], [0, 1, 1]),
            (0, 1, [w1, w2, eos], [0.9, 0.4, 1.0], [0, 2, 1]),
            (1, 0, [w1, w2, eos], [0.7, 0.4, 0.9], [0, 1, 1]),
            (1, 1, [w1, w1, eos], [0.7, 0.35, 0.9], [0, 2, 1]),
        ]
        for sent, beam, tokens, probs, ranks in expectations:
            hypo = hypos[sent][beam]
            self.assertHypoTokens(hypo, tokens)
            self.assertHypoScore(hypo, probs, ranks, 0.5)
+ self.min_top1_prob = 0.4 + + w1_prob = self.min_top1_prob + w2_prob = self.min_top2_prob - self.min_top1_prob + eos_prob = 1 - self.min_top2_prob + + args.beam_probs = [ + # step 0: + torch.FloatTensor( + [ + # eos w1 w2 + [0.0, unk, 1.0, 0.0], + [0.0, unk, 1.0, 0.0], + [0.0, unk, 1.0, 0.0], + [0.0, unk, 1.0, 0.0], + ] + ), + # step 1: + torch.FloatTensor( + [ + # eos w1 w2 + [eos_prob, unk, w1_prob, w2_prob], + [eos_prob, unk, w1_prob, w2_prob], + [eos_prob, unk, w1_prob, w2_prob], + [eos_prob, unk, w1_prob, w2_prob], + ] + ), + # step 2: + torch.FloatTensor( + [ + # eos w1 w2 + [1.0, unk, 0.0, 0.0], + [1.0, unk, 0.0, 0.0], + [1.0, unk, 0.0, 0.0], + [1.0, unk, 0.0, 0.0], + ] + ), + ] + + task = test_utils.TestTranslationTask.setup_task(args, d, d) + self.model = task.build_model(args) + self.tgt_dict = task.target_dictionary + + def test_topp_sampling_search_low_prob(self): + # Given a prob low enough to top-P sampling, we expect only the top + # 1 token to be sampled, which always results in the same output. 
+ low_sampling_topp = self.min_top1_prob / 2.0 + search_strategy = search.Sampling( + self.tgt_dict, sampling_topp=low_sampling_topp + ) + generator = SequenceGenerator( + [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy + ) + sample = { + "net_input": { + "src_tokens": self.src_tokens, + "src_lengths": self.src_lengths, + } + } + hypos = generator.forward(sample) + eos, w1 = self.eos, self.w1 + # sentence 1, beam 1 + self.assertHypoTokens(hypos[0][0], [w1, w1, eos]) + self.assertHypoScore(hypos[0][0], [1.0, 0.4, 1.0]) + # sentence 1, beam 2 + self.assertHypoTokens(hypos[0][1], [w1, w1, eos]) + self.assertHypoScore(hypos[0][1], [1.0, 0.4, 1.0]) + # sentence 2, beam 1 + self.assertHypoTokens(hypos[1][0], [w1, w1, eos]) + self.assertHypoScore(hypos[1][0], [1.0, 0.4, 1.0]) + # sentence 2, beam 2 + self.assertHypoTokens(hypos[1][1], [w1, w1, eos]) + self.assertHypoScore(hypos[1][1], [1.0, 0.4, 1.0]) + + def test_topp_sampling_search_high_prob(self): + # Given a prob high enough to top-P sampling, any of the top 2 + # tokens could be sampled. This can cause different outputs. 
+ high_sampling_topp = (self.min_top1_prob + self.min_top2_prob) / 2.0 + search_strategy = search.Sampling( + self.tgt_dict, sampling_topp=high_sampling_topp + ) + generator = SequenceGenerator( + [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy + ) + sample = { + "net_input": { + "src_tokens": self.src_tokens, + "src_lengths": self.src_lengths, + } + } + hypos = generator.forward(sample) + eos, w1, w2 = self.eos, self.w1, self.w2 + # sentence 1, beam 1 + self.assertTrue( + self.hypoTokens(hypos[0][0], [w1, w1, eos]) + or self.hypoTokens(hypos[0][0], [w1, w2, eos]) + ) + self.assertTrue( + self.hypoScore(hypos[0][0], [1.0, 0.4, 1.0]) + or self.hypoScore(hypos[0][0], [1.0, 0.35, 1.0]) + ) + + # sentence 1, beam 2 + self.assertTrue( + self.hypoTokens(hypos[0][1], [w1, w1, eos]) + or self.hypoTokens(hypos[0][1], [w1, w2, eos]) + ) + self.assertTrue( + self.hypoScore(hypos[0][1], [1.0, 0.4, 1.0]) + or self.hypoScore(hypos[0][1], [1.0, 0.35, 1.0]) + ) + + # sentence 2, beam 1 + self.assertTrue( + self.hypoTokens(hypos[1][0], [w1, w1, eos]) + or self.hypoTokens(hypos[1][0], [w1, w2, eos]) + ) + self.assertTrue( + self.hypoScore(hypos[1][0], [1.0, 0.4, 1.0]) + or self.hypoScore(hypos[1][0], [1.0, 0.35, 1.0]) + ) + + # sentence 2, beam 2 + self.assertTrue( + self.hypoTokens(hypos[1][1], [w1, w1, eos]) + or self.hypoTokens(hypos[1][1], [w1, w2, eos]) + ) + self.assertTrue( + self.hypoScore(hypos[1][1], [1.0, 0.4, 1.0]) + or self.hypoScore(hypos[1][1], [1.0, 0.35, 1.0]) + ) + + def hypoTokens(self, hypo, tokens): + return self.tensorEqual(hypo["tokens"], torch.LongTensor(tokens)) + + def hypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0): + pos_scores = torch.FloatTensor(pos_probs).log() + if not self.almostEqual(hypo["positional_scores"], pos_scores): + return False + if pos_scores.numel() != hypo["tokens"].numel(): + return False + score = pos_scores.sum() + if normalized: + score /= pos_scores.numel() ** lenpen + return abs(score - 
hypo["score"]) < 1e-6 + + def almostEqual(self, t1, t2): + return t1.size() == t2.size() and (t1 - t2).abs().max() < 1e-4 + + def tensorEqual(self, t1, t2): + return t1.size() == t2.size() and t1.ne(t2).long().sum() == 0 + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/test_sequence_scorer.py b/fairseq/tests/test_sequence_scorer.py new file mode 100644 index 0000000000000000000000000000000000000000..42f9447b599bcd7a9913aec37d94ea5078ff43a3 --- /dev/null +++ b/fairseq/tests/test_sequence_scorer.py @@ -0,0 +1,120 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import unittest + +import tests.utils as test_utils +import torch +from fairseq.sequence_scorer import SequenceScorer + + +class TestSequenceScorer(unittest.TestCase): + def test_sequence_scorer(self): + # construct dummy dictionary + d = test_utils.dummy_dictionary(vocab_size=2) + self.assertEqual(d.pad(), 1) + self.assertEqual(d.eos(), 2) + self.assertEqual(d.unk(), 3) + eos = d.eos() + w1 = 4 + w2 = 5 + + # construct dataloader + data = [ + { + "source": torch.LongTensor([w1, w2, eos]), + "target": torch.LongTensor([w1, w2, w1, eos]), + }, + { + "source": torch.LongTensor([w2, eos]), + "target": torch.LongTensor([w2, w1, eos]), + }, + { + "source": torch.LongTensor([w2, eos]), + "target": torch.LongTensor([w2, eos]), + }, + ] + data_itr = test_utils.dummy_dataloader(data) + + # specify expected output probabilities + args = argparse.Namespace() + unk = 0.0 + args.beam_probs = [ + # step 0: + torch.FloatTensor( + [ + # eos w1 w2 + [0.0, unk, 0.6, 0.4], # sentence 1 + [0.0, unk, 0.4, 0.6], # sentence 2 + [0.0, unk, 0.7, 0.3], # sentence 3 + ] + ), + # step 1: + torch.FloatTensor( + [ + # eos w1 w2 + [0.0, unk, 0.2, 0.7], # sentence 1 + [0.0, unk, 0.8, 0.2], # sentence 2 + [0.7, unk, 0.1, 0.2], # sentence 3 + ] + ), + # step 2: + 
torch.FloatTensor( + [ + # eos w1 w2 + [0.10, unk, 0.50, 0.4], # sentence 1 + [0.15, unk, 0.15, 0.7], # sentence 2 + [0.00, unk, 0.00, 0.0], # sentence 3 + ] + ), + # step 3: + torch.FloatTensor( + [ + # eos w1 w2 + [0.9, unk, 0.05, 0.05], # sentence 1 + [0.0, unk, 0.00, 0.0], # sentence 2 + [0.0, unk, 0.00, 0.0], # sentence 3 + ] + ), + ] + expected_scores = [ + [0.6, 0.7, 0.5, 0.9], # sentence 1 + [0.6, 0.8, 0.15], # sentence 2 + [0.3, 0.7], # sentence 3 + ] + + task = test_utils.TestTranslationTask.setup_task(args, d, d) + model = task.build_model(args) + scorer = SequenceScorer(task.target_dictionary) + for sample in data_itr: + hypos = task.inference_step(scorer, [model], sample) + for id, hypos_id in zip(sample["id"].tolist(), hypos): + self.assertHypoTokens(hypos_id[0], data[id]["target"]) + self.assertHypoScore(hypos_id[0], expected_scores[id]) + + def assertHypoTokens(self, hypo, tokens): + self.assertTensorEqual(hypo["tokens"], torch.LongTensor(tokens)) + + def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0): + pos_scores = torch.FloatTensor(pos_probs).log() + self.assertAlmostEqual(hypo["positional_scores"], pos_scores) + self.assertEqual(pos_scores.numel(), hypo["tokens"].numel()) + score = pos_scores.sum() + if normalized: + score /= pos_scores.numel() ** lenpen + self.assertLess(abs(score - hypo["score"]), 1e-6) + + def assertAlmostEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertLess((t1 - t2).abs().max(), 1e-4) + + def assertTensorEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertEqual(t1.ne(t2).long().sum(), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/test_sparse_multihead_attention.py b/fairseq/tests/test_sparse_multihead_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..3e32b25a7fb1e12295b84d0c65064f8e42b7bdd3 --- /dev/null +++ 
class TestSparseMultiheadAttention(unittest.TestCase):
    """Builds the expected sparse masks for stride=4, expressivity=1 over 8 positions."""

    def test_sparse_multihead_attention(self):
        """Compare ``buffered_sparse_mask`` output with hand-written 8x8 masks.

        NOTE(review): both comparisons below call ``torch.all(torch.eq(...))``
        and discard the result, so this test currently asserts nothing and
        cannot fail on a mask mismatch. Confirm the expected masks against
        ``SparseMultiheadAttention`` before wrapping them in ``assertTrue``.
        """
        attn_weights = torch.randn(1, 8, 8)
        # expected mask for the bidirectional case: 0 = attend, -inf = masked
        bidirectional_sparse_mask = torch.tensor(
            [
                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0],
                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0],
                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0],
                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0],
                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
            ]
        )

        bidirectional_attention = SparseMultiheadAttention(
            16, 1, stride=4, expressivity=1, is_bidirectional=True
        )
        bidirectional_attention_sparse_mask = (
            bidirectional_attention.buffered_sparse_mask(attn_weights, 8, 8)
        )
        # NOTE(review): result is discarded; nothing is asserted here
        torch.all(
            torch.eq(bidirectional_attention_sparse_mask, bidirectional_sparse_mask)
        )

        # expected mask for the non-bidirectional case
        sparse_mask = torch.tensor(
            [
                [
                    0,
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                ],
                [
                    0,
                    0,
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                ],
                [
                    0,
                    0,
                    0,
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                ],
                [
                    0,
                    0,
                    0,
                    0,
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                ],
                [0, 0, 0, 0, 0, float("-inf"), float("-inf"), float("-inf")],
                [
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    0,
                    0,
                    0,
                    float("-inf"),
                    float("-inf"),
                ],
                [
                    float("-inf"),
                    float("-inf"),
                    float("-inf"),
                    0,
                    0,
                    0,
                    0,
                    float("-inf"),
                ],
                [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0],
            ]
        )

        attention = SparseMultiheadAttention(
            16, 1, stride=4, expressivity=1, is_bidirectional=False
        )
        attention_sparse_mask = attention.buffered_sparse_mask(attn_weights, 8, 8)

        # NOTE(review): result is discarded; nothing is asserted here
        torch.all(torch.eq(attention_sparse_mask, sparse_mask))
self.assertEqual(ds[1].tolist(), [8, 7, 6, 1]) + self.assertEqual(ds[2].tolist(), [1]) + + def test_block_break_mode(self): + data = [ + torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), + torch.tensor([8, 7, 6, 1], dtype=torch.long), + torch.tensor([9, 1], dtype=torch.long), + ] + ds = self._build_dataset(data, block_size=3, pad=0, eos=1, break_mode="none") + self.assertEqual(ds[0].tolist(), [5, 4, 3]) + self.assertEqual(ds[1].tolist(), [2, 1, 8]) + self.assertEqual(ds[2].tolist(), [7, 6, 1]) + self.assertEqual(ds[3].tolist(), [9, 1]) + + def test_complete_break_mode(self): + data = [ + torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), + torch.tensor([8, 7, 6, 1], dtype=torch.long), + torch.tensor([9, 1], dtype=torch.long), + ] + ds = self._build_dataset( + data, block_size=6, pad=0, eos=1, break_mode="complete" + ) + self.assertEqual(ds[0].tolist(), [5, 4, 3, 2, 1]) + self.assertEqual(ds[1].tolist(), [8, 7, 6, 1, 9, 1]) + + data = [ + torch.tensor([4, 3, 2, 1], dtype=torch.long), + torch.tensor([5, 1], dtype=torch.long), + torch.tensor([1], dtype=torch.long), + torch.tensor([6, 1], dtype=torch.long), + ] + ds = self._build_dataset( + data, block_size=3, pad=0, eos=1, break_mode="complete" + ) + self.assertEqual(ds[0].tolist(), [4, 3, 2, 1]) + self.assertEqual(ds[1].tolist(), [5, 1, 1]) + self.assertEqual(ds[2].tolist(), [6, 1]) + + def test_4billion_tokens(self): + """Regression test for numpy type promotion issue https://github.com/numpy/numpy/issues/5745""" + data = [torch.tensor(list(range(10000)), dtype=torch.long)] * 430000 + ds = self._build_dataset( + data, block_size=6, pad=0, eos=1, break_mode="complete" + ) + ds[-1] # __getitem__ works + start, end = ds.slice_indices[-1] + assert end > 4294967295 # data must be sufficiently large to overflow uint32 + assert not isinstance( + end + 1, float + ) # this would also raise, since np.uint64(1) + 1 => 2.0 + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/tests/test_train.py 
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    """Return a mocked trainer and an epoch iterator over ``epoch_size`` single-token items.

    The token ids are ``0 .. epoch_size - 1`` and every batch holds exactly
    one dataset index.
    """
    token_ids = torch.LongTensor(list(range(epoch_size))).view(1, -1)
    block_ds = data.TokenBlockDataset(
        token_ids,
        sizes=[token_ids.size(-1)],
        block_size=1,
        pad=0,
        eos=1,
        include_targets=False,
    )
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    pair_ds = data.LanguagePairDataset(
        block_ds, block_ds.sizes, mock_dict(), shuffle=False
    )
    one_item_batches = [[index] for index in range(epoch_size)]
    epoch_itr = data.EpochBatchIterator(
        dataset=pair_ds,
        collate_fn=pair_ds.collater,
        batch_sampler=one_item_batches,
    )
    return trainer, epoch_itr
class TestLoadCheckpoint(unittest.TestCase):
    """Exercise ``checkpoint_utils.load_checkpoint`` against a mocked trainer.

    ``setUp`` replaces several ``os`` / ``PathManager`` helpers with
    ``MagicMock`` objects, so these tests never touch real checkpoint files.
    """

    def setUp(self):
        """Build the default config, start the patches and silence logging."""
        self.cfg_mock = get_mock_cfg(None)
        self.patches = {
            "os.makedirs": MagicMock(),
            "os.path.join": MagicMock(),
            "os.path.isfile": MagicMock(return_value=True),
            "os.path.isabs": MagicMock(return_value=False),
            "fairseq.file_io.PathManager.exists": MagicMock(return_value=False),
        }
        self.applied_patches = [patch(p, d) for p, d in self.patches.items()]
        # Plain loop: the patches are started for their side effect only.
        for applied_patch in self.applied_patches:
            applied_patch.start()
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        """Undo every started patch and re-enable logging."""
        patch.stopall()
        logging.disable(logging.NOTSET)

    def test_load_partial_checkpoint(self):
        """Resuming mid-epoch continues from the saved iteration."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 200, 50)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)

            _, epoch_itr = checkpoint_utils.load_checkpoint(
                self.cfg_mock.checkpoint, trainer
            )

            self.assertEqual(epoch_itr.epoch, 2)
            self.assertEqual(epoch_itr.iterations_in_epoch, 50)

            itr = epoch_itr.next_epoch_itr(shuffle=False)
            self.assertEqual(epoch_itr.epoch, 2)
            self.assertEqual(epoch_itr.iterations_in_epoch, 50)

            self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 50)
            self.assertEqual(epoch_itr.iterations_in_epoch, 51)

            # 51 consumed so far; 98 more leaves exactly one item in the epoch.
            for _ in range(150 - 52):
                next(itr)
            self.assertEqual(epoch_itr.iterations_in_epoch, 149)
            self.assertTrue(itr.has_next())
            next(itr)
            self.assertFalse(itr.has_next())

            itr = epoch_itr.next_epoch_itr(shuffle=False)
            self.assertTrue(itr.has_next())
            self.assertEqual(epoch_itr.epoch, 3)
            self.assertEqual(epoch_itr.iterations_in_epoch, 0)

    def test_load_full_checkpoint(self):
        """A checkpoint saved at the end of an epoch starts the next epoch."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 300, 150)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)

            _, epoch_itr = checkpoint_utils.load_checkpoint(
                self.cfg_mock.checkpoint, trainer
            )
            itr = epoch_itr.next_epoch_itr(shuffle=False)

            self.assertEqual(epoch_itr.epoch, 3)
            self.assertEqual(epoch_itr.iterations_in_epoch, 0)
            self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 0)

    def test_load_no_checkpoint(self):
        """Without a checkpoint file the iterator starts at epoch 1, item 0."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
            self.patches["os.path.isfile"].return_value = False

            _, epoch_itr = checkpoint_utils.load_checkpoint(
                self.cfg_mock.checkpoint, trainer
            )
            itr = epoch_itr.next_epoch_itr(shuffle=False)

            self.assertEqual(epoch_itr.epoch, 1)
            self.assertEqual(epoch_itr.iterations_in_epoch, 0)
            self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 0)

    def test_finetune_from_model_args_conflict(self):
        """``finetune_from_model`` plus any ``reset_*`` flag must raise."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)

            for arg in [
                "reset_optimizer",
                "reset_lr_scheduler",
                "reset_meters",
                "reset_dataloader",
            ]:
                with self.subTest(arg=arg):
                    cfg_mock = get_mock_cfg("/temp/checkpoint_pretrained.pt")
                    cfg_mock["checkpoint"][arg] = True
                    with self.assertRaises(Exception) as context:
                        _, _ = checkpoint_utils.load_checkpoint(
                            cfg_mock.checkpoint, trainer
                        )

                    # assertIn reports both operands on failure.
                    self.assertIn(
                        "--finetune-from-model can not be set together with either --reset-optimizer"
                        " or reset_lr_scheduler or reset_meters or reset_dataloader",
                        str(context.exception),
                    )

    def test_finetune_from_model(self):
        """First launch with ``finetune_from_model`` resets optimizer state."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
            from_model_path = "/temp/checkpoint_pretrained.pt"

            def mock_finetune_exist(path):
                # Only the pretrained model "exists".
                return path == from_model_path

            self.patches[
                "fairseq.file_io.PathManager.exists"
            ].side_effect = mock_finetune_exist
            cfg_mock = get_mock_cfg(from_model_path)
            cfg_mock.checkpoint.restore_file = "checkpoint_last.pt"
            _, _ = checkpoint_utils.load_checkpoint(cfg_mock.checkpoint, trainer)
            # Unpacking four names also checks the positional-argument count.
            (
                _checkpoint_path,
                reset_optimizer,
                reset_lr_scheduler,
                _optimizer_overrides,
            ) = trainer.load_checkpoint.call_args[0]
            reset_meters = trainer.load_checkpoint.call_args[1]["reset_meters"]
            self.assertTrue(reset_optimizer)
            self.assertTrue(reset_lr_scheduler)
            self.assertTrue(reset_meters)

    def test_finetune_from_model_resume(self):
        """A relaunch that finds ``checkpoint_last.pt`` must not reset state."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
            from_model_path = "/temp/checkpoint_pretrained.pt"

            # launch second time
            # both restore_file=checkpoint_last.pt and finetune_from_model are set
            def mock_finetune_exist(path):
                # ``str`` has ``endswith``; the former ``endsWith`` would raise
                # AttributeError for a real string path.
                # NOTE(review): os.path.join is patched in setUp, so ``path``
                # may be a MagicMock here; its ``endswith`` result is truthy.
                return bool(
                    path == from_model_path or path.endswith("checkpoint_last.pt")
                )

            self.patches[
                "fairseq.file_io.PathManager.exists"
            ].side_effect = mock_finetune_exist
            cfg_mock = get_mock_cfg(from_model_path)
            cfg_mock.checkpoint.restore_file = "checkpoint_last.pt"
            _, _ = checkpoint_utils.load_checkpoint(cfg_mock.checkpoint, trainer)
            (
                _checkpoint_path,
                reset_optimizer,
                reset_lr_scheduler,
                _optimizer_overrides,
            ) = trainer.load_checkpoint.call_args[0]
            reset_meters = trainer.load_checkpoint.call_args[1]["reset_meters"]
            self.assertFalse(reset_optimizer)
            self.assertFalse(reset_lr_scheduler)
            self.assertFalse(reset_meters)
def mk_sample(tok: Sequence[int] = None, batch_size: int = 2) -> Dict[str, Any]:
    """Build a tiny batch in which every row is the same token sequence.

    Args:
        tok: Token ids for one row; a built-in seven-token sequence is used
            when this is ``None`` or empty.
        batch_size: Number of identical rows to stack.

    Returns:
        A dict with ``"net_input"`` (``src_tokens``, ``prev_output_tokens``
        and ``src_lengths``) and ``"target"``, which is the batch with its
        first column dropped.
    """
    tokens = tok if tok else [10, 11, 12, 13, 14, 15, 2]

    row = torch.tensor(tokens, dtype=torch.long)
    batch = torch.stack([row for _ in range(batch_size)])
    lengths = torch.tensor(
        [len(tokens)] * batch_size, dtype=torch.long, device=batch.device
    )
    net_input = {
        "src_tokens": batch,
        "prev_output_tokens": batch,
        "src_lengths": lengths,
    }
    return {"net_input": net_input, "target": batch[:, 1:]}


def mk_transformer(**extra_args: Any):
    """Build a small transformer model; keyword arguments override defaults."""
    settings = {
        # Use characteristic dimensions
        "encoder_embed_dim": 12,
        "encoder_ffn_embed_dim": 14,
        "decoder_embed_dim": 12,
        "decoder_ffn_embed_dim": 14,
        # Disable dropout so we have comparable tests.
        "dropout": 0,
        "attention_dropout": 0,
        "activation_dropout": 0,
        "encoder_layerdrop": 0,
        # Caller-supplied values win over the defaults above.
        **extra_args,
    }
    # Overrides the defaults from the parser
    args = argparse.Namespace(**settings)
    transformer.tiny_architecture(args)

    # Seed before building so the model is constructed from a fixed RNG state.
    torch.manual_seed(0)
    return transformer.TransformerModel.build_model(args, FakeTask(args))
Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from fairseq import utils + + +class TestUtils(unittest.TestCase): + def test_convert_padding_direction(self): + pad = 1 + left_pad = torch.LongTensor( + [ + [2, 3, 4, 5, 6], + [1, 7, 8, 9, 10], + [1, 1, 1, 11, 12], + ] + ) + right_pad = torch.LongTensor( + [ + [2, 3, 4, 5, 6], + [7, 8, 9, 10, 1], + [11, 12, 1, 1, 1], + ] + ) + + self.assertAlmostEqual( + right_pad, + utils.convert_padding_direction( + left_pad, + pad, + left_to_right=True, + ), + ) + self.assertAlmostEqual( + left_pad, + utils.convert_padding_direction( + right_pad, + pad, + right_to_left=True, + ), + ) + + def test_make_positions(self): + pad = 1 + left_pad_input = torch.LongTensor( + [ + [9, 9, 9, 9, 9], + [1, 9, 9, 9, 9], + [1, 1, 1, 9, 9], + ] + ) + left_pad_output = torch.LongTensor( + [ + [2, 3, 4, 5, 6], + [1, 2, 3, 4, 5], + [1, 1, 1, 2, 3], + ] + ) + right_pad_input = torch.LongTensor( + [ + [9, 9, 9, 9, 9], + [9, 9, 9, 9, 1], + [9, 9, 1, 1, 1], + ] + ) + right_pad_output = torch.LongTensor( + [ + [2, 3, 4, 5, 6], + [2, 3, 4, 5, 1], + [2, 3, 1, 1, 1], + ] + ) + + self.assertAlmostEqual( + left_pad_output, + utils.make_positions(left_pad_input, pad), + ) + self.assertAlmostEqual( + right_pad_output, + utils.make_positions(right_pad_input, pad), + ) + + def test_clip_grad_norm_(self): + params = torch.nn.Parameter(torch.zeros(5)).requires_grad_(False) + grad_norm = utils.clip_grad_norm_(params, 1.0) + self.assertTrue(torch.is_tensor(grad_norm)) + self.assertEqual(grad_norm, 0.0) + + params = [torch.nn.Parameter(torch.zeros(5)) for i in range(3)] + for p in params: + p.grad = torch.full((5,), fill_value=2.0) + grad_norm = utils.clip_grad_norm_(params, 1.0) + exp_grad_norm = torch.full((15,), fill_value=2.0).norm() + self.assertTrue(torch.is_tensor(grad_norm)) + self.assertEqual(grad_norm, 
class TestValidSubsetsErrors(unittest.TestCase):
    """Test various filesystem and command-line argument combinations and
    ensure that error raising happens as expected."""

    def _test_case(self, paths, extra_flags):
        """Create empty ``<name>.bin`` files and run the valid-subset check.

        Args:
            paths: Subset names to create (``train`` is always added).
            extra_flags: Extra command-line flags for the training config.
        """
        with tempfile.TemporaryDirectory() as data_dir:
            # Plain loop: the files are written for their side effect only.
            for p in paths + ["train"]:
                write_empty_file(os.path.join(data_dir, f"{p}.bin"))
            cfg = make_lm_config(data_dir, extra_flags=extra_flags)
            raise_if_valid_subsets_unintentionally_ignored(cfg)

    def test_default_raises(self):
        with self.assertRaises(ValueError):
            self._test_case(["valid", "valid1"], [])
        with self.assertRaises(ValueError):
            self._test_case(
                ["valid", "valid1", "valid2"], ["--valid-subset", "valid,valid1"]
            )

    def test_partially_specified_valid_subsets(self):
        """Naming only some of the subsets raises unless explicitly ignored."""
        with self.assertRaises(ValueError):
            self._test_case(
                ["valid", "valid1", "valid2"], ["--valid-subset", "valid,valid1"]
            )
        # Fix with ignore unused
        self._test_case(
            ["valid", "valid1", "valid2"],
            ["--valid-subset", "valid,valid1", "--ignore-unused-valid-subsets"],
        )

    # The method was previously defined without the ``test`` prefix, so
    # unittest never collected it. The old name is kept as an alias.
    partially_specified_valid_subsets = test_partially_specified_valid_subsets

    def test_legal_configs(self):
        self._test_case(["valid"], [])
        self._test_case(["valid", "valid1"], ["--ignore-unused-valid-subsets"])
        self._test_case(["valid", "valid1"], ["--combine-val"])
        self._test_case(["valid", "valid1"], ["--valid-subset", "valid,valid1"])
        self._test_case(["valid", "valid1"], ["--valid-subset", "valid1"])
        self._test_case(
            ["valid", "valid1"], ["--combine-val", "--ignore-unused-valid-subsets"]
        )
        self._test_case(
            ["valid1"], ["--valid-subset", "valid1"]
        )  # valid.bin doesn't need to be ignored.

    def test_disable_validation(self):
        self._test_case([], ["--disable-validation"])
        self._test_case(["valid", "valid1"], ["--disable-validation"])

    def test_dummy_task(self):
        # No data directory is passed for the dummy tasks.
        cfg = make_lm_config(task="dummy_lm")
        raise_if_valid_subsets_unintentionally_ignored(cfg)

    def test_masked_dummy_task(self):
        cfg = make_lm_config(task="dummy_masked_lm")
        raise_if_valid_subsets_unintentionally_ignored(cfg)
def dummy_dictionary(vocab_size, prefix="token_"):
    """Return a finalized ``Dictionary`` holding ``vocab_size`` symbols.

    Symbols are named ``prefix + str(i)`` for ``i`` in ``range(vocab_size)``.
    """
    dictionary = Dictionary()
    for index in range(vocab_size):
        dictionary.add_symbol(prefix + str(index))
    dictionary.finalize(padding_factor=1)  # don't add extra padding symbols
    return dictionary


def dummy_dataloader(
    samples,
    padding_idx=1,
    eos_idx=2,
    batch_size=None,
):
    """Return an iterator over collated batches built from *samples*.

    Each sample that has no ``"id"`` key is given its list position as id
    (the sample dicts are modified in place). When *batch_size* is ``None``
    all samples go into a single batch.
    """
    effective_batch_size = len(samples) if batch_size is None else batch_size

    # add any missing data to samples
    for position, sample in enumerate(samples):
        if "id" not in sample:
            sample["id"] = position

    def _collate(batch):
        return collate(batch, padding_idx, eos_idx)

    # create dataloader
    loader = torch.utils.data.DataLoader(
        TestDataset(samples),
        batch_size=effective_batch_size,
        collate_fn=_collate,
    )
    return iter(loader)
def create_dummy_data(
    data_dir, num_examples=100, maxlen=20, alignment=False, languages=None
):
    """Write random lowercase-letter corpora (and optional alignments).

    Six files (``{train,valid,test}.{in,out}``) are written to *data_dir*, or
    to one sub-directory per entry of *languages* when that is given. Every
    file has ``num_examples`` lines of between 1 and ``maxlen`` tokens.

    When *alignment* is true, ``{train,valid,test}.align`` files are written
    to *data_dir* from the ``.in`` / ``.out`` files located in *data_dir*.
    """

    def _create_dummy_data(dir, filename):
        # Character codes 97..122, i.e. 'a'..'z'.
        codes = 97 + torch.floor(26 * torch.rand(num_examples * maxlen)).int()
        start = 0
        with open(os.path.join(dir, filename), "w") as out:
            for _ in range(num_examples):
                length = random.randint(1, maxlen)
                letters = [chr(code) for code in codes[start : start + length].tolist()]
                print(" ".join(letters), file=out)
                start += length

    def _create_dummy_alignment_data(filename_src, filename_tgt, filename):
        src_path = os.path.join(data_dir, filename_src)
        tgt_path = os.path.join(data_dir, filename_tgt)
        out_path = os.path.join(data_dir, filename)
        with open(src_path, "r") as src_f:
            with open(tgt_path, "r") as tgt_f:
                with open(out_path, "w") as out:
                    for src_line, tgt_line in zip(src_f, tgt_f):
                        src_len = len(src_line.split())
                        tgt_len = len(tgt_line.split())
                        avg_len = (src_len + tgt_len) // 2
                        num_alignments = random.randint(avg_len // 2, 2 * avg_len)
                        # Random token positions on each side, paired in order.
                        src_indices = torch.floor(
                            torch.rand(num_alignments) * src_len
                        ).int()
                        tgt_indices = torch.floor(
                            torch.rand(num_alignments) * tgt_len
                        ).int()
                        pairs = zip(src_indices.tolist(), tgt_indices.tolist())
                        print(
                            " ".join("{}-{}".format(s, t) for s, t in pairs),
                            file=out,
                        )

    files_to_write = [
        "train.in",
        "train.out",
        "valid.in",
        "valid.out",
        "test.in",
        "test.out",
    ]
    if languages is None:  # En only dummy dataset
        target_dirs = [data_dir]
    else:
        target_dirs = []
        for lang in languages:
            lang_dir = os.path.join(data_dir, lang)
            os.makedirs(lang_dir, exist_ok=True)
            target_dirs.append(lang_dir)

    for target_dir in target_dirs:
        for name in files_to_write:
            _create_dummy_data(target_dir, name)

    if alignment:
        for split in ("train", "valid", "test"):
            _create_dummy_alignment_data(
                split + ".in", split + ".out", split + ".align"
            )
def _in_out_preprocess_flags(data_dir):
    """Return the flags shared by the ``in``/``out`` preprocessing helpers."""
    return [
        "--source-lang",
        "in",
        "--target-lang",
        "out",
        "--trainpref",
        os.path.join(data_dir, "train"),
        "--validpref",
        os.path.join(data_dir, "valid"),
        "--testpref",
        os.path.join(data_dir, "test"),
        "--thresholdtgt",
        "0",
        "--thresholdsrc",
        "0",
    ]


def preprocess_translation_data(data_dir, extra_flags=None):
    """Run ``preprocess.main`` on the ``in``/``out`` data found in *data_dir*.

    Output is written back to *data_dir*; *extra_flags* are appended to the
    argument list.
    """
    parser = options.get_preprocessing_parser()
    cli_args = (
        _in_out_preprocess_flags(data_dir)
        + ["--destdir", data_dir]
        + (extra_flags or [])
    )
    preprocess.main(parser.parse_args(cli_args))


def preprocess_summarization_data(data_dir, extra_flags=None):
    """Same as the translation variant, but with ``--joined-dictionary``."""
    parser = options.get_preprocessing_parser()
    cli_args = (
        _in_out_preprocess_flags(data_dir)
        + ["--joined-dictionary", "--destdir", data_dir]
        + (extra_flags or [])
    )
    preprocess.main(parser.parse_args(cli_args))
def generate_main(data_dir, extra_flags=None, path=None):
    """Run batch generation, then interactive generation, on *data_dir*.

    Args:
        data_dir: Data directory passed to the generation parser.
        extra_flags: Extra command-line flags; defaults to
            ``["--print-alignment"]`` when ``None``.
        path: Checkpoint path; defaults to ``checkpoint_last.pt`` inside
            *data_dir*.
    """
    if extra_flags is None:
        extra_flags = [
            "--print-alignment",
        ]
    if path is None:
        path = os.path.join(data_dir, "checkpoint_last.pt")
    generate_parser = options.get_generation_parser()
    generate_args = options.parse_args_and_arch(
        generate_parser,
        [
            data_dir,
            "--path",
            path,
            "--beam",
            "3",
            "--batch-size",
            "64",
            "--max-len-b",
            "5",
            "--gen-subset",
            "valid",
            "--no-progress-bar",
            "--num-workers",
            "0",
        ]
        + (extra_flags or []),
    )

    # evaluate model in batch mode
    generate.main(generate_args)

    # evaluate model interactively
    generate_args.buffer_size = 0
    generate_args.input = "-"
    generate_args.batch_size = None
    orig_stdin = sys.stdin
    sys.stdin = StringIO("h e l l o\n")
    try:
        interactive.main(generate_args)
    finally:
        # Restore stdin even when interactive generation raises, so one
        # failing test cannot leave a fake stdin in place for later ones.
        sys.stdin = orig_stdin
class TestIncrementalDecoder(FairseqIncrementalDecoder):
    """Decoder stub that emits probabilities taken from ``args``.

    ``args`` must carry either ``probs`` (a 3-D tensor, selected along
    dimension 1 by step) or ``beam_probs`` (a sequence indexed by step).
    """

    def __init__(self, args, dictionary):
        super().__init__(dictionary)
        assert hasattr(args, "beam_probs") or hasattr(args, "probs")
        # Keep an existing limit; otherwise fall back to 100 positions.
        args.max_decoder_positions = getattr(args, "max_decoder_positions", 100)
        self.args = args

    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
        """Return ``(probs, {"attn": [attn]})`` for the requested steps.

        With an *incremental_state* only the last input token is used and a
        step counter stored in that state is advanced by one; without it,
        every target position is one step.
        """
        incremental = incremental_state is not None
        if incremental:
            prev_output_tokens = prev_output_tokens[:, -1:]
        bbsz = prev_output_tokens.size(0)
        vocab = len(self.dictionary)
        src_len = encoder_out.encoder_out.size(1)
        tgt_len = prev_output_tokens.size(1)

        # determine number of steps
        if incremental:
            # cache step number
            current = utils.get_incremental_state(self, incremental_state, "step")
            if current is None:
                current = 0
            utils.set_incremental_state(self, incremental_state, "step", current + 1)
            steps = [current]
        else:
            steps = list(range(tgt_len))

        # define output in terms of raw probs
        if hasattr(self.args, "probs"):
            assert (
                self.args.probs.dim() == 3
            ), "expected probs to have size bsz*steps*vocab"
            probs = self.args.probs.index_select(1, torch.LongTensor(steps))
        else:
            beam_probs = self.args.beam_probs
            eos = self.dictionary.eos()
            probs = torch.FloatTensor(bbsz, len(steps), vocab).zero_()
            for slot, step in enumerate(steps):
                # args.beam_probs gives the probability for every vocab element,
                # starting with eos, then unknown, and then the rest of the vocab
                if step >= len(beam_probs):
                    # Past the scripted steps: put all mass on eos.
                    probs[:, slot, eos] = 1.0
                else:
                    probs[:, slot, eos:] = beam_probs[step]

        # random attention
        attn = torch.rand(bbsz, tgt_len, src_len)

        device = prev_output_tokens.device
        return probs.to(device), {"attn": [attn.to(device)]}

    def get_normalized_probs(self, net_output, log_probs, _):
        """Return the decoder output, log-transformed when *log_probs* is set."""
        # the decoder returns probabilities directly
        raw = net_output[0]
        return raw.log() if log_probs else raw

    def max_positions(self):
        """Return ``args.max_decoder_positions``."""
        return self.args.max_decoder_positions
class TestAdditionalInputModel(FairseqEncoderDecoderModel):
    """Encoder-decoder stub that forwards extra keyword inputs to both halves."""

    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)

    @classmethod
    def build_model(cls, args, task):
        """Pair a ``TestAdditionalInputEncoder`` with a ``TestIncrementalDecoder``."""
        return cls(
            TestAdditionalInputEncoder(args, task.source_dictionary),
            TestIncrementalDecoder(args, task.target_dictionary),
        )

    def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
        """Encode, then decode, passing ``**kwargs`` to both calls."""
        encoded = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
        return self.decoder(prev_output_tokens, encoder_out=encoded, **kwargs)
def sizes(data):
    """Return the length of every sentence in *data*."""
    return list(map(len, data))


POPULATION = string.ascii_letters + string.digits


def make_sentence() -> tp.List[str]:
    """Draw a random sentence of 10 to 50 symbols from ``POPULATION``.

    Symbol weights grow linearly with position in ``POPULATION``.
    """
    num_symbols = random.randint(10, 50)
    linear_weights = range(1, len(POPULATION) + 1)
    return random.choices(
        population=POPULATION, k=num_symbols, weights=linear_weights
    )


def make_data(length=1000, out_file=None) -> tp.List[tp.List[str]]:
    """Generate *length* random sentences plus two fixed ones.

    The fixed sentences (all ASCII letters, then all digits) come last.
    When *out_file* is given, each sentence is also written to it as one
    space-separated line.
    """
    sentences = []
    for _ in range(0, length):
        sentences.append(make_sentence())
    # add all the symbols at least once
    sentences.append(list(string.ascii_letters))
    sentences.append(list(string.digits))

    if out_file is not None:
        with open(out_file, "w", encoding="utf-8") as out:
            out.writelines(" ".join(sentence) + "\n" for sentence in sentences)

    return sentences