Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- fairseq/fairseq/optim/__pycache__/__init__.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/adadelta.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/adafactor.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/adagrad.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/adam.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/adamax.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/amp_optimizer.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/bmuf.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/composite.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/cpu_adam.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/dynamic_loss_scaler.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/fairseq_optimizer.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/fp16_optimizer.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/fused_adam.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/fused_lamb.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/nag.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/sgd.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/__pycache__/shard.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/bmuf.py +200 -0
- fairseq/fairseq/optim/composite.py +273 -0
- fairseq/fairseq/optim/fairseq_optimizer.py +187 -0
- fairseq/fairseq/optim/fp16_optimizer.py +558 -0
- fairseq/fairseq/optim/fused_lamb.py +51 -0
- fairseq/fairseq/optim/lr_scheduler/__init__.py +36 -0
- fairseq/fairseq/optim/lr_scheduler/__pycache__/cosine_lr_scheduler.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/lr_scheduler/__pycache__/inverse_square_root_schedule.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/lr_scheduler/__pycache__/polynomial_decay_schedule.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/lr_scheduler/__pycache__/reduce_lr_on_plateau.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/lr_scheduler/__pycache__/step_lr_scheduler.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/lr_scheduler/__pycache__/tri_stage_lr_scheduler.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/lr_scheduler/__pycache__/triangular_lr_scheduler.cpython-310.pyc +0 -0
- fairseq/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py +146 -0
- fairseq/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py +59 -0
- fairseq/fairseq/optim/lr_scheduler/fixed_schedule.py +76 -0
- fairseq/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py +85 -0
- fairseq/fairseq/optim/lr_scheduler/manual_lr_scheduler.py +121 -0
- fairseq/fairseq/optim/lr_scheduler/pass_through.py +39 -0
- fairseq/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py +89 -0
- fairseq/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py +143 -0
- fairseq/fairseq/optim/lr_scheduler/step_lr_scheduler.py +85 -0
- fairseq/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py +175 -0
- fairseq/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py +83 -0
- fairseq/fairseq/optim/nag.py +111 -0
- fairseq/fairseq/optim/sgd.py +43 -0
- fairseq/fairseq/optim/shard.py +58 -0
- fairseq/fairseq/scoring/__init__.py +55 -0
- fairseq/fairseq/scoring/__pycache__/__init__.cpython-310.pyc +0 -0
- fairseq/fairseq/scoring/__pycache__/bertscore.cpython-310.pyc +0 -0
- fairseq/fairseq/scoring/__pycache__/bleu.cpython-310.pyc +0 -0
- fairseq/fairseq/scoring/__pycache__/chrf.cpython-310.pyc +0 -0
fairseq/fairseq/optim/__pycache__/__init__.cpython-310.pyc  ADDED  Binary file (1.9 kB)
fairseq/fairseq/optim/__pycache__/adadelta.cpython-310.pyc  ADDED  Binary file (2.09 kB)
fairseq/fairseq/optim/__pycache__/adafactor.cpython-310.pyc  ADDED  Binary file (8.16 kB)
fairseq/fairseq/optim/__pycache__/adagrad.cpython-310.pyc  ADDED  Binary file (1.68 kB)
fairseq/fairseq/optim/__pycache__/adam.cpython-310.pyc  ADDED  Binary file (7.19 kB)
fairseq/fairseq/optim/__pycache__/adamax.cpython-310.pyc  ADDED  Binary file (5.25 kB)
fairseq/fairseq/optim/__pycache__/amp_optimizer.cpython-310.pyc  ADDED  Binary file (4.16 kB)
fairseq/fairseq/optim/__pycache__/bmuf.cpython-310.pyc  ADDED  Binary file (6.74 kB)
fairseq/fairseq/optim/__pycache__/composite.cpython-310.pyc  ADDED  Binary file (9.91 kB)
fairseq/fairseq/optim/__pycache__/cpu_adam.cpython-310.pyc  ADDED  Binary file (5.46 kB)
fairseq/fairseq/optim/__pycache__/dynamic_loss_scaler.cpython-310.pyc  ADDED  Binary file (2.16 kB)
fairseq/fairseq/optim/__pycache__/fairseq_optimizer.cpython-310.pyc  ADDED  Binary file (7.34 kB)
fairseq/fairseq/optim/__pycache__/fp16_optimizer.cpython-310.pyc  ADDED  Binary file (16.4 kB)
fairseq/fairseq/optim/__pycache__/fused_adam.cpython-310.pyc  ADDED  Binary file (9.2 kB)
fairseq/fairseq/optim/__pycache__/fused_lamb.cpython-310.pyc  ADDED  Binary file (2.1 kB)
fairseq/fairseq/optim/__pycache__/nag.cpython-310.pyc  ADDED  Binary file (3.66 kB)
fairseq/fairseq/optim/__pycache__/sgd.cpython-310.pyc  ADDED  Binary file (1.73 kB)
fairseq/fairseq/optim/__pycache__/shard.cpython-310.pyc  ADDED  Binary file (1.94 kB)
fairseq/fairseq/optim/bmuf.py
ADDED
@@ -0,0 +1,200 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass, field

import torch
import torch.distributed as dist
from fairseq.dataclass.configs import FairseqBMUFConfig
from fairseq.dataclass.utils import gen_parser_from_dataclass
from fairseq.optim.fairseq_optimizer import FairseqOptimizer


class FairseqBMUF(FairseqOptimizer):
    """
    Implements incremental block distributed data parallelism similar to
    https://ieeexplore.ieee.org/document/7472805

    Paper title: Scalable training of deep learning machines by incremental
    block training with intra-block parallel optimization and blockwise
    model-update filtering
    """

    def __init__(self, cfg: FairseqBMUFConfig, optimizer):
        super().__init__(cfg)
        self._optimizer = optimizer
        self._num_updates = 0
        self.sync_iter = cfg.global_sync_iter
        self.block_momentum = cfg.block_momentum
        self.block_lr = cfg.block_lr
        self._reset_local_data()
        self.warmup_iteration = cfg.warmup_iterations
        self.use_nbm = cfg.use_nbm
        self.initial_state = self._optimizer.state_dict()
        self.average_sync = self.cfg.average_sync
        self.world_size = self.cfg.distributed_world_size

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        gen_parser_from_dataclass(parser, FairseqBMUFConfig())

    @property
    def optimizer(self):
        return self._optimizer.optimizer

    @property
    def optimizer_config(self):
        return self._optimizer.optimizer_config

    def get_lr(self):
        return self._optimizer.get_lr()

    def set_lr(self, lr):
        self._optimizer.set_lr(lr)

    def state_dict(self):
        return self._optimizer.state_dict()

    def load_state_dict(self, state_dict, optimizer_overrides=None):
        self._optimizer.load_state_dict(state_dict, optimizer_overrides)
        self.initial_state = self._optimizer.state_dict()

    def multiply_grads(self, c):
        """Multiplies grads by a constant *c*."""
        self._optimizer.multiply_grads(c)

    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
        """Clips gradient norm."""
        return self._optimizer.clip_grad_norm(max_norm, aggregate_norm_fn)

    def average_params(self):
        self._optimizer.average_params()

    def _block_sync(self):
        if self.world_size <= 1:
            return
        # Update the global model using local models from all GPUs
        # (Step-1) Calculate grad between previously synced model and
        # current local model
        if self.block_momentum != 0:
            self._calc_grad()

        # (Step-2) Average gradient from all GPUs
        self._avg_grad_from_all_gpus()

        # (Step-3) Calculate global momentum and update the global model
        if self.block_momentum != 0:
            self._update_global_model()

        # (Step-4) Average local optimizer params
        if self.average_sync:
            self.average_params()

    def _is_warmup_end(self):
        # Check whether train iterations is equal to warmup iter
        if self.get_num_updates() == self.warmup_iteration:
            return True
        return False

    def _is_bmuf_iter(self):
        # Check whether train iterations is equal to bmuf sync iter
        if (self.get_num_updates() > self.warmup_iteration) and (
            self.get_num_updates() % self.sync_iter == 0
        ):
            return True
        return False

    def _warmup_sync(self, root_rank=0):
        if self.world_size <= 1:
            return
        # Broadcast the local model to all gpus
        for param in self.params:
            dist.broadcast(param.data, src=root_rank)

        # Update local optimizer state
        if self.average_sync:
            self._optimizer.average_params()
        else:
            self._optimizer.load_state_dict(self.initial_state)

        self._reset_local_data()

    def step(self, closure=None):
        """Performs a single optimization step."""
        self._optimizer.step(closure)
        self.set_num_updates(self.get_num_updates() + 1)
        if self._is_warmup_end():
            self._warmup_sync()
        elif self._is_bmuf_iter():
            self._block_sync()

    def zero_grad(self):
        """Clears the gradients of all optimized parameters."""
        self._optimizer.zero_grad()

    def get_num_updates(self):
        """Get the number of parameters updates."""
        return self._num_updates

    def set_num_updates(self, num_updates):
        """Set the number of parameters updates."""
        self._num_updates = num_updates

    @torch.no_grad()
    def _reset_local_data(self):
        # (Step-0) Initialize global momentum parameters and store global copy on each gpu
        self.global_params = [torch.zeros_like(p.data) for p in self.params]
        self.smoothed_grads = [p.data.new_zeros(p.data.size()) for p in self.params]
        self.grads = [p.data.new_zeros(p.data.size()) for p in self.params]

        # saving the global model locally for calculating gradient during bmuf sync
        for param, global_param in zip(self.params, self.global_params):
            global_param.copy_(param.data)

    @torch.no_grad()
    def _calc_grad(self):
        # global_params is basically the global copy from the previously finished
        # synchronisation. param.data is local parameter after block_sync_freq
        # for the local gpu. so grad is difference between previously synced
        # model and current local model.
        for index, (param, global_param) in enumerate(
            zip(self.params, self.global_params)
        ):
            self.grads[index] = global_param - param.data

    def _avg_grad_from_all_gpus(self):
        for index, param in enumerate(self.params):
            sync_para = param.data if self.block_momentum == 0 else self.grads[index]
            sync_para /= float(dist.get_world_size())
            dist.all_reduce(sync_para, op=dist.ReduceOp.SUM)

    @torch.no_grad()
    def _update_global_model(self):
        for index, (param, global_param, smoothed_grad, grad) in enumerate(
            zip(
                self.params,
                self.global_params,
                self.smoothed_grads,
                # all gpus would share the same value of smoothed_grad, since it is
                # always computed on synchronized gradients.
                self.grads,
            )
        ):
            # global_param is basically last synchronized parameter. though
            # smoothed_grad is local, all processes will have same value of
            # smoothed_grad and hence param is globally synchronized copy.
            # smoothed_grad(t) = BM * smoothed_grad(t-1) + BM_lr * grad(t)
            smoothed_grad = self.block_momentum * smoothed_grad + self.block_lr * grad
            param.data.copy_(global_param - smoothed_grad)

            # A Nesterov momentum here is to do a partial weight update before
            # calculating the gradient
            if self.use_nbm:
                param.data.copy_(param.data - self.block_momentum * smoothed_grad)

            # backup for the next synchronization.
            self.smoothed_grads[index] = smoothed_grad
            global_param.copy_(param.data)
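Usage sketch (not part of the diff): FairseqBMUF wraps an already-built FairseqOptimizer; step() runs the inner optimizer on every update, broadcasts the model from rank 0 once warmup_iterations is reached, and block-syncs every global_sync_iter updates after that. The helper below and its hyperparameter values are illustrative only, assuming torch.distributed has been initialized and inner_optimizer is any FairseqOptimizer built over the model parameters.

import torch.distributed as dist

from fairseq.dataclass.configs import FairseqBMUFConfig
from fairseq.optim.bmuf import FairseqBMUF


def wrap_with_bmuf(inner_optimizer):
    # Field names below are the ones read in FairseqBMUF.__init__ above;
    # the numeric values are illustrative, not recommendations.
    cfg = FairseqBMUFConfig(
        block_lr=1.0,              # BM_lr applied to the averaged block "gradient"
        block_momentum=0.875,      # BM in smoothed_grad(t) = BM * smoothed_grad(t-1) + BM_lr * grad(t)
        global_sync_iter=50,       # block sync every 50 updates after warmup
        warmup_iterations=500,     # single broadcast from rank 0 at update 500
        use_nbm=True,              # Nesterov-style partial update after each sync
        average_sync=False,
        distributed_world_size=dist.get_world_size(),
    )
    return FairseqBMUF(cfg, inner_optimizer)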
fairseq/fairseq/optim/composite.py
ADDED
@@ -0,0 +1,273 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, Any, List, Optional

import torch.optim
from fairseq.dataclass import FairseqDataclass
from fairseq.optim import FairseqOptimizer, register_optimizer, _build_optimizer
from fairseq.optim.lr_scheduler import FairseqLRScheduler, build_lr_scheduler
from omegaconf import II, open_dict
import copy


logger = logging.getLogger(__name__)


@dataclass
class OptimizerAndSchedulerConfig(FairseqDataclass):
    optimizer: Any = None
    lr_scheduler: Optional[Any] = None
    lr: List = II("optimization.lr")
    lr_float: Optional[
        float
    ] = None  # this makes it easier to sweep on learning rate with auto sweepers


@dataclass
class CompositeOptimizerConfig(FairseqDataclass):
    groups: Dict[str, Any] = field(
        default_factory=lambda: {},
        metadata={
            "help": "optimizer name -> optimizer OptimizerAndSchedulerConfig. "
            "Configures a different optimizer and (optionally) lr scheduler for each parameter group"
        },
    )
    dynamic_groups: bool = field(
        default=False,
        metadata={
            "help": "create groups dynamically based on parameters, if set to False, all parameters needs to have group_names"
        },
    )


@register_optimizer("composite", dataclass=CompositeOptimizerConfig)
class FairseqCompositeOptimizer(FairseqOptimizer):

    optimizers: Dict[str, FairseqOptimizer] = {}
    lr_schedulers: Dict[str, FairseqLRScheduler] = {}
    lr_scheduler: FairseqLRScheduler = None
    _optimizer: torch.optim.Optimizer

    def __init__(self, cfg: CompositeOptimizerConfig, params):
        super().__init__(cfg)

        assert (
            len(params) > 1
        ), "Composite optimizer only works when there are multiple parameter groups (try fp16_no_flatten_grads: true)"

        def dict_hash(dictionary: Dict[str, Any]) -> str:
            import hashlib
            import json

            dhash = hashlib.md5()
            encoded = json.dumps(dictionary, sort_keys=True).encode()
            dhash.update(encoded)
            return dhash.hexdigest()

        groupped_params = defaultdict(list)
        overrides = defaultdict(dict)
        if not cfg.dynamic_groups:
            for p in params:
                group = getattr(p, "param_group", "default")
                override_config = getattr(p, "optim_overrides", None)
                if override_config is not None and bool(override_config):
                    overrides[group] = override_config
                else:
                    assert (
                        override_config == None or override_config == overrides[group]
                    ), f"For group {group}, different overrides found {override_config} v/s {overrides[group]}"
                groupped_params[group].append(p)

            for p, params in groupped_params.items():
                override_config = getattr(params[0], "optim_overrides", None)
                if override_config is not None:
                    for pp in params[1:]:
                        assert override_config == getattr(
                            pp, "optim_overrides", None
                        ), f" {str(override_config)} != {str(getattr(pp, 'optim_overrides', None))}"
        else:
            for p in params:
                group = getattr(p, "param_group", "default")
                override_config = getattr(p, "optim_overrides", None)
                if override_config is not None:
                    override_config["group_name"] = group
                    group_name = dict_hash(override_config)
                    overrides[group_name] = override_config
                else:
                    group_name = group
                groupped_params[group_name].append(p)

        self.optimizers_config = {}
        for group, group_params in groupped_params.items():
            p_group = group
            if group in overrides and "group_name" in overrides[group]:
                p_group = overrides[group]["group_name"]
            if group in cfg.groups:
                group_cfg = cfg.groups[group]
                optimizer_config = copy.deepcopy(group_cfg.optimizer)
                scheduler_config = copy.deepcopy(group_cfg.lr_scheduler)
                explicit_group_present = True
            else:
                group_cfg = cfg.groups[p_group]
                optimizer_config = copy.deepcopy(group_cfg.optimizer)
                scheduler_config = copy.deepcopy(group_cfg.lr_scheduler)
                explicit_group_present = False

            if getattr(group_cfg, "lr_float", None) is not None:
                with open_dict(optimizer_config):
                    optimizer_config.lr = [group_cfg.lr_float]

            if group in overrides and "optimizer" in overrides[group]:
                with open_dict(optimizer_config):
                    if "lr_scale" in overrides[group]["optimizer"]:
                        lr_scale = overrides[group]["optimizer"]["lr_scale"]
                        optimizer_config.lr = [
                            lr * lr_scale for lr in optimizer_config.lr
                        ]

                        if explicit_group_present:
                            logger.info(
                                f"For group:{group}, config as well as override present for lr"
                            )

                    if (
                        "weight_decay_scale" in overrides[group]["optimizer"]
                        and "optimizer_config" in optimizer_config
                    ):
                        weight_decay_scale = overrides[group]["optimizer"][
                            "weight_decay_scale"
                        ]
                        optimizer_config.weight_decay = (
                            optimizer_config.weight_decay * weight_decay_scale
                        )
                        if explicit_group_present:
                            logger.info(
                                f"For group:{group}, config as well as override present for weight_decay"
                            )

            with open_dict(scheduler_config):
                scheduler_config.lr = optimizer_config.lr
            self.optimizers[group] = _build_optimizer(optimizer_config, group_params)
            self.optimizers_config[group] = optimizer_config
            if scheduler_config is not None:
                self.lr_schedulers[group] = build_lr_scheduler(
                    scheduler_config, self.optimizers[group]
                )
        logger.info("Optimizers for different groups are as below")
        for group in self.optimizers_config.keys():
            logger.info(f"Group : {group}:{self.optimizers_config[group]}")
        if len(self.lr_schedulers) > 0:
            assert len(self.lr_schedulers) == len(self.optimizers), (
                f"Please provide an lr scheduler for each optimizer to use pass_through scheduler. "
                f"Optimizers: {self.optimizers}; Lr scheds: {self.lr_schedulers}"
            )
            self.lr_scheduler = CompositeLRScheduler(self.lr_schedulers)

        self._optimizer = CompositeOptimizer(self.optimizers)

    @property
    def supports_groups(self):
        return True

    @property
    def param_groups(self):
        for opt in self.optimizers.values():
            for group in opt.param_groups:
                yield group

    def get_lr(self):
        """Return the current learning rate."""
        k = (
            "default"
            if "default" in self.optimizers
            else next(iter(self.optimizers.keys()))
        )
        return self.optimizers[k].param_groups[0]["lr"]

    def state_dict(self):
        """Return the LR scheduler state dict."""
        return {k: s.state_dict() for k, s in self.optimizers.items()}

    def load_state_dict(self, state_dict, optimizer_overrides=None):
        """Load an LR scheduler state dict."""
        for k, state in state_dict.items():
            if k not in self.optimizers:
                # skip extra keys like "loss_scale" added by fp16 optimizer
                continue

            overrides = (
                optimizer_overrides[k]
                if isinstance(optimizer_overrides, dict) and k in optimizer_overrides
                else None
            )
            self.optimizers[k].load_state_dict(state, optimizer_overrides=overrides)


class CompositeOptimizer(torch.optim.Optimizer):
    def __init__(self, optimizers: Dict[str, FairseqOptimizer]):
        self.optimizers = optimizers

    @property
    def supports_memory_efficient_fp16(self):
        return all(o.supports_memory_efficient_fp16 for o in self.optimizers.values())

    @property
    def supports_flat_params(self):
        return all(o.supports_flat_params for o in self.optimizers.values())

    def step(self, closure=None, groups=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for k, opt in self.optimizers.items():
            if groups is None or k in groups:
                opt.step()

        return loss

    def zero_grad(self):
        for opt in self.optimizers.values():
            opt.zero_grad()


class CompositeLRScheduler(FairseqLRScheduler):
    def __init__(self, lr_schedulers):
        super().__init__(None, None)

        self.lr_schedulers = lr_schedulers

    def state_dict(self):
        """Return the LR scheduler state dict."""
        return {k: s.state_dict() for k, s in self.lr_schedulers.items()}

    def load_state_dict(self, state_dict):
        """Load an LR scheduler state dict."""
        for k, state in state_dict.items():
            self.lr_schedulers[k].load_state_dict(state)

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        for s in self.lr_schedulers.values():
            s.step_begin_epoch(epoch)

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        for s in self.lr_schedulers.values():
            s.step(epoch)

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        return {k: s.step_update(num_updates) for k, s in self.lr_schedulers.items()}
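Illustrative sketch (not part of the diff): FairseqCompositeOptimizer groups parameters by the param_group and optim_overrides attributes it reads with getattr(), then builds one optimizer (and optionally one lr scheduler) per group from cfg.groups. Tagging a toy model's parameters might look like the following; the group names and the lr_scale value are made up.

import torch.nn as nn

model = nn.Sequential(nn.Embedding(1000, 64), nn.Linear(64, 1000))

for p in model[0].parameters():
    p.param_group = "embed"  # read via getattr(p, "param_group", "default") above
    p.optim_overrides = {"optimizer": {"lr_scale": 0.1}}  # scales this group's lr as handled above
for p in model[1].parameters():
    p.param_group = "default"

With dynamic_groups left at False, cfg.groups would then be expected to contain an "embed" and a "default" entry, each an OptimizerAndSchedulerConfig.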
fairseq/fairseq/optim/fairseq_optimizer.py
ADDED
@@ -0,0 +1,187 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
from fairseq import utils
from fairseq.dataclass.utils import gen_parser_from_dataclass
from collections import defaultdict


class FairseqOptimizer(object):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

    @classmethod
    def add_args(cls, parser):
        """Add optimizer-specific arguments to the parser."""
        dc = getattr(cls, "__dataclass", None)
        if dc is not None:
            gen_parser_from_dataclass(parser, dc())

    @property
    def optimizer(self):
        """Return a torch.optim.optimizer.Optimizer instance."""
        if not hasattr(self, "_optimizer"):
            raise NotImplementedError
        if not isinstance(self._optimizer, torch.optim.Optimizer):
            raise ValueError("_optimizer must be an instance of torch.optim.Optimizer")
        return self._optimizer

    @optimizer.setter
    def optimizer(self, optimizer):
        """Reset optimizer instance."""
        if not hasattr(self, "_optimizer"):
            raise NotImplementedError
        if not isinstance(self._optimizer, torch.optim.Optimizer):
            raise ValueError("_optimizer must be an instance of torch.optim.Optimizer")
        self._optimizer = optimizer

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        raise NotImplementedError

    @property
    def params(self):
        """Return an iterable of the parameters held by the optimizer."""
        for param_group in self.param_groups:
            for p in param_group["params"]:
                yield p

    @property
    def param_groups(self):
        return self.optimizer.param_groups

    def __getstate__(self):
        return self._optimizer.__getstate__()

    def get_lr(self):
        """Return the current learning rate."""
        return self.param_groups[0]["lr"]

    def set_lr(self, lr):
        """Set the learning rate."""
        for param_group in self.param_groups:
            param_group["lr"] = lr

    def state_dict(self):
        """Return the optimizer's state dict."""
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict, optimizer_overrides=None):
        """Load an optimizer state dict.

        In general we should prefer the configuration of the existing optimizer
        instance (e.g., learning rate) over that found in the state_dict. This
        allows us to resume training from a checkpoint using a new set of
        optimizer args.
        """
        self.optimizer.load_state_dict(state_dict)

        if optimizer_overrides is not None and len(optimizer_overrides) > 0:
            # override learning rate, momentum, etc. with latest values
            for group in self.param_groups:
                group.update(optimizer_overrides)

    def backward(self, loss):
        """Computes the sum of gradients of the given tensor w.r.t. graph leaves."""
        loss.backward()

    def all_reduce_grads(self, module):
        """Manually all-reduce gradients (if required)."""
        if hasattr(module, "all_reduce_grads"):
            module.all_reduce_grads()

    def multiply_grads(self, c):
        """Multiplies grads by a constant *c*."""
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        for p in self.params:
            if p.grad is not None:
                if p.grad.is_sparse:
                    p.grad.data.mul_(c.to(p.grad.device) if torch.is_tensor(c) else c)
                else:
                    per_device_and_dtype_grads[p.grad.device][p.grad.dtype].append(
                        p.grad.data
                    )
        for device, per_dtype_grads in per_device_and_dtype_grads.items():
            for grads in per_dtype_grads.values():
                torch._foreach_mul_(grads, c.to(device) if torch.is_tensor(c) else c)

    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
        """Clips gradient norm."""
        return utils.clip_grad_norm_(self.params, max_norm, aggregate_norm_fn)

    def step(self, closure=None, scale=1.0, groups=None):
        """Performs a single optimization step."""
        if self.supports_step_with_scale:
            if self.supports_groups:
                self.optimizer.step(closure, scale=scale, groups=groups)
            else:
                self.optimizer.step(closure, scale=scale)
        else:
            if scale != 1.0:
                self.multiply_grads(1.0 / scale)
            if self.supports_groups:
                self.optimizer.step(closure, groups=groups)
            else:
                self.optimizer.step(closure)

    def zero_grad(self):
        """Clears the gradients of all optimized parameters."""
        for p in self.params:
            p.grad = None
        self.optimizer.zero_grad()

    @property
    def supports_memory_efficient_fp16(self):
        if hasattr(self.optimizer, "supports_memory_efficient_fp16"):
            return self.optimizer.supports_memory_efficient_fp16
        return False

    @property
    def supports_step_with_scale(self):
        if hasattr(self.optimizer, "supports_step_with_scale"):
            return self.optimizer.supports_step_with_scale
        return False

    @property
    def supports_groups(self):
        if hasattr(self.optimizer, "supports_groups"):
            return self.optimizer.supports_groups
        return False

    @property
    def supports_flat_params(self):
        """
        Whether the optimizer supports collapsing of the model
        parameters/gradients into a single contiguous Tensor.
        """
        if hasattr(self.optimizer, "supports_flat_params"):
            return self.optimizer.supports_flat_params
        return False

    def average_params(self):
        pass

    def broadcast_global_state_dict(self, state_dict):
        """
        Broadcasts a global state dict to all ranks.
        Useful for optimizers that shard state between ranks.
        """
        if hasattr(self.optimizer, "broadcast_global_state_dict"):
            return self.optimizer.broadcast_global_state_dict(state_dict)
        else:
            return state_dict


class LegacyFairseqOptimizer(FairseqOptimizer):
    def __init__(self, args):
        self.args = args
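Sketch of the contract the base class expects from a concrete optimizer (illustrative, mirroring the pattern used by the sgd.py file listed above): a subclass sets self._optimizer to a real torch.optim.Optimizer and exposes optimizer_config. The class name and the cfg attribute names below are hypothetical.

import torch

from fairseq.optim.fairseq_optimizer import FairseqOptimizer


class MinimalSGD(FairseqOptimizer):
    def __init__(self, cfg, params):
        super().__init__(cfg)
        # the base class validates that _optimizer is a torch.optim.Optimizer
        self._optimizer = torch.optim.SGD(params, **self.optimizer_config)

    @property
    def optimizer_config(self):
        # kwargs used to build the wrapped torch.optim.SGD; fairseq also uses
        # this dict to override optimizer args restored from checkpoints
        return {
            "lr": self.cfg.lr[0],
            "momentum": getattr(self.cfg, "momentum", 0.0),
            "weight_decay": getattr(self.cfg, "weight_decay", 0.0),
        }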
fairseq/fairseq/optim/fp16_optimizer.py
ADDED
@@ -0,0 +1,558 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
from collections import defaultdict
|
7 |
+
from itertools import chain
|
8 |
+
|
9 |
+
import torch
|
10 |
+
from omegaconf import DictConfig
|
11 |
+
|
12 |
+
from fairseq import optim
|
13 |
+
|
14 |
+
from .dynamic_loss_scaler import DynamicLossScaler
|
15 |
+
|
16 |
+
|
17 |
+
class _FP16OptimizerMixin(object):
|
18 |
+
def __init__(self, *args, **kwargs):
|
19 |
+
# forward __init__ call to the next class in mro(method resolution order)
|
20 |
+
super().__init__(*args, **kwargs)
|
21 |
+
self._multiply_factor = 1.0
|
22 |
+
|
23 |
+
@property
|
24 |
+
def has_flat_params(self):
|
25 |
+
return torch.is_tensor(self.fp32_params) or (
|
26 |
+
isinstance(self.fp32_params, dict)
|
27 |
+
and all(torch.is_tensor(t) for t in self.fp32_params.values())
|
28 |
+
)
|
29 |
+
|
30 |
+
@classmethod
|
31 |
+
def build_fp32_params(cls, args, params, flatten=True):
|
32 |
+
# create FP32 copy of parameters and grads
|
33 |
+
if flatten:
|
34 |
+
is_pipeline_parallel = getattr(
|
35 |
+
args, "pipeline_model_parallel", False
|
36 |
+
) and getattr(args, "distributed_no_spawn", False)
|
37 |
+
total_param_size = sum(p.data.numel() for p in params)
|
38 |
+
devices = [torch.cuda.current_device()]
|
39 |
+
if is_pipeline_parallel:
|
40 |
+
devices = list(set(args.pipeline_devices))
|
41 |
+
fp32_params = {}
|
42 |
+
for device in devices:
|
43 |
+
if is_pipeline_parallel:
|
44 |
+
device_param_size = sum(
|
45 |
+
p.data.numel() for p in params if p.device.index == device
|
46 |
+
)
|
47 |
+
device_params = [p for p in params if p.device.index == device]
|
48 |
+
else:
|
49 |
+
device_param_size = total_param_size
|
50 |
+
device_params = params
|
51 |
+
fp32_params[device] = (
|
52 |
+
device_params[0].new(0).float().new(device_param_size)
|
53 |
+
)
|
54 |
+
offset = 0
|
55 |
+
for p in device_params:
|
56 |
+
numel = p.data.numel()
|
57 |
+
fp32_params[device][offset : offset + numel].copy_(p.data.view(-1))
|
58 |
+
offset += numel
|
59 |
+
fp32_params[device] = torch.nn.Parameter(fp32_params[device])
|
60 |
+
fp32_params[device].grad = fp32_params[device].data.new(
|
61 |
+
device_param_size
|
62 |
+
)
|
63 |
+
return fp32_params
|
64 |
+
else:
|
65 |
+
fp32_params = []
|
66 |
+
for p in params:
|
67 |
+
p32 = torch.nn.Parameter(p.data.float())
|
68 |
+
if hasattr(p, "expert"):
|
69 |
+
p32.expert = True
|
70 |
+
elif hasattr(p, "base_expert"):
|
71 |
+
p32.base_expert = True
|
72 |
+
p32.grad = torch.zeros_like(p32.data)
|
73 |
+
if hasattr(p, "param_group"):
|
74 |
+
p32.param_group = p.param_group
|
75 |
+
if hasattr(p, "optim_overrides"):
|
76 |
+
p32.optim_overrides = p.optim_overrides
|
77 |
+
fp32_params.append(p32)
|
78 |
+
return fp32_params
|
79 |
+
|
80 |
+
def state_dict(self):
|
81 |
+
"""Return the optimizer's state dict."""
|
82 |
+
state_dict = self.fp32_optimizer.state_dict()
|
83 |
+
if self.scaler is not None:
|
84 |
+
state_dict["loss_scale"] = self.scaler.loss_scale
|
85 |
+
return state_dict
|
86 |
+
|
87 |
+
def load_state_dict(self, state_dict, optimizer_overrides=None):
|
88 |
+
"""Load an optimizer state dict.
|
89 |
+
|
90 |
+
In general we should prefer the configuration of the existing optimizer
|
91 |
+
instance (e.g., learning rate) over that found in the state_dict. This
|
92 |
+
allows us to resume training from a checkpoint using a new set of
|
93 |
+
optimizer args.
|
94 |
+
"""
|
95 |
+
if "loss_scale" in state_dict and self.scaler is not None:
|
96 |
+
self.scaler.loss_scale = state_dict["loss_scale"]
|
97 |
+
self.fp32_optimizer.load_state_dict(state_dict, optimizer_overrides)
|
98 |
+
|
99 |
+
def backward(self, loss):
|
100 |
+
"""Computes the sum of gradients of the given tensor w.r.t. graph leaves.
|
101 |
+
|
102 |
+
Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this
|
103 |
+
function additionally dynamically scales the loss to avoid gradient
|
104 |
+
underflow.
|
105 |
+
"""
|
106 |
+
if self.scaler is not None:
|
107 |
+
loss = self.scaler.scale(loss)
|
108 |
+
loss.backward()
|
109 |
+
self._needs_sync = True
|
110 |
+
|
111 |
+
def _sync_fp16_grads_to_fp32(self):
|
112 |
+
if self._needs_sync:
|
113 |
+
# copy FP16 grads to FP32
|
114 |
+
if self.has_flat_params:
|
115 |
+
devices = list(self.fp32_params.keys())
|
116 |
+
device_params_dict = defaultdict(list)
|
117 |
+
for p in self.fp16_params:
|
118 |
+
if p.requires_grad:
|
119 |
+
device_params_dict[p.device.index].append(p)
|
120 |
+
for device in devices:
|
121 |
+
device_params = device_params_dict[device]
|
122 |
+
offset = 0
|
123 |
+
for p in device_params:
|
124 |
+
grad_data = (
|
125 |
+
p.grad.data
|
126 |
+
if p.grad is not None
|
127 |
+
else p.data.new_zeros(p.data.shape)
|
128 |
+
)
|
129 |
+
numel = grad_data.numel()
|
130 |
+
self.fp32_params[device].grad.data[
|
131 |
+
offset : offset + numel
|
132 |
+
].copy_(grad_data.view(-1))
|
133 |
+
offset += numel
|
134 |
+
else:
|
135 |
+
for p, p32 in zip(self.fp16_params, self.fp32_params):
|
136 |
+
if not p.requires_grad:
|
137 |
+
continue
|
138 |
+
if p.grad is not None:
|
139 |
+
if p32.grad is None:
|
140 |
+
p32.grad = p.grad.data.float()
|
141 |
+
else:
|
142 |
+
p32.grad.data.copy_(p.grad.data)
|
143 |
+
else:
|
144 |
+
p32.grad = torch.zeros_like(p.data, dtype=torch.float)
|
145 |
+
|
146 |
+
self._needs_sync = False
|
147 |
+
|
148 |
+
def _sync_fp32_params_to_fp16(self):
|
149 |
+
# copy FP32 params back into FP16 model
|
150 |
+
if self.has_flat_params:
|
151 |
+
devices = list(self.fp32_params.keys())
|
152 |
+
device_params_dict = defaultdict(list)
|
153 |
+
for p in self.fp16_params:
|
154 |
+
device_params_dict[p.device.index].append(p)
|
155 |
+
for device in devices:
|
156 |
+
device_params = device_params_dict[device]
|
157 |
+
offset = 0
|
158 |
+
for p in device_params:
|
159 |
+
numel = p.data.numel()
|
160 |
+
p.data.copy_(
|
161 |
+
self.fp32_params[device]
|
162 |
+
.data[offset : offset + numel]
|
163 |
+
.view_as(p.data)
|
164 |
+
)
|
165 |
+
offset += numel
|
166 |
+
else:
|
167 |
+
for p, p32 in zip(self.fp16_params, self.fp32_params):
|
168 |
+
if not p.requires_grad:
|
169 |
+
continue
|
170 |
+
p.data.copy_(p32.data)
|
171 |
+
|
172 |
+
def _unscale_grads(self):
|
173 |
+
self._sync_fp16_grads_to_fp32()
|
174 |
+
if (
|
175 |
+
# Skip the multiplication if it's a no-op (i.e., if _multiply_factor
|
176 |
+
# is 1.0). At the same time, we want to avoid the device-to-host
|
177 |
+
# transfer by comparing it to 1.0. Since _multiply_factor starts as
|
178 |
+
# a Python float, we roughly assume that if it's a tensor then it's
|
179 |
+
# probably not =1.0 anymore and we do the multiplication. Otherwise
|
180 |
+
# we can safely check the value without a D2H transfer.
|
181 |
+
torch.is_tensor(self._multiply_factor)
|
182 |
+
or self._multiply_factor != 1.0
|
183 |
+
):
|
184 |
+
self.fp32_optimizer.multiply_grads(self._multiply_factor)
|
185 |
+
self._multiply_factor = 1.0
|
186 |
+
|
187 |
+
def multiply_grads(self, c):
|
188 |
+
"""Multiplies grads by a constant ``c``."""
|
189 |
+
self._multiply_factor *= c
|
190 |
+
|
191 |
+
def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
|
192 |
+
"""Clips gradient norm and updates dynamic loss scaler."""
|
193 |
+
self._sync_fp16_grads_to_fp32()
|
194 |
+
|
195 |
+
grad_norm = self._multiply_factor * self.fp32_optimizer.clip_grad_norm(
|
196 |
+
0, aggregate_norm_fn
|
197 |
+
)
|
198 |
+
|
199 |
+
if torch.is_tensor(self._multiply_factor):
|
200 |
+
self._multiply_factor = self._multiply_factor.to(grad_norm.device)
|
201 |
+
|
202 |
+
if self.scaler is not None:
|
203 |
+
if grad_norm > max_norm > 0.0:
|
204 |
+
self._multiply_factor *= max_norm / grad_norm
|
205 |
+
|
206 |
+
self.scaler.check_overflow(grad_norm)
|
207 |
+
elif max_norm > 0.0:
|
208 |
+
clip_coef = (max_norm / (grad_norm + 1e-6)).clamp_(max=1)
|
209 |
+
self._multiply_factor *= clip_coef
|
210 |
+
|
211 |
+
return grad_norm
|
212 |
+
|
213 |
+
def step(self, closure=None, groups=None):
|
214 |
+
"""Performs a single optimization step."""
|
215 |
+
self._sync_fp16_grads_to_fp32()
|
216 |
+
|
217 |
+
if getattr(self, "supports_step_with_scale", False):
|
218 |
+
self.fp32_optimizer.step(
|
219 |
+
closure, scale=(1.0 / self._multiply_factor), groups=groups
|
220 |
+
)
|
221 |
+
else:
|
222 |
+
self._unscale_grads()
|
223 |
+
self.fp32_optimizer.step(closure, groups=groups)
|
224 |
+
|
225 |
+
if self.scaler is not None:
|
226 |
+
self.scaler.update()
|
227 |
+
|
228 |
+
self._sync_fp32_params_to_fp16()
|
229 |
+
|
230 |
+
def zero_grad(self):
|
231 |
+
"""Clears the gradients of all optimized parameters."""
|
232 |
+
for p in self.fp16_params:
|
233 |
+
p.grad = None
|
234 |
+
if self.has_flat_params:
|
235 |
+
if torch.is_tensor(self.fp32_params):
|
236 |
+
self.fp32_params.grad.zero_()
|
237 |
+
elif isinstance(self.fp32_params, dict):
|
238 |
+
for fp32_params in self.fp32_params.values():
|
239 |
+
fp32_params.grad.zero_()
|
240 |
+
else:
|
241 |
+
raise RuntimeError("self.fp32_params must be a tensor or dict")
|
242 |
+
else:
|
243 |
+
for p32 in self.fp32_params:
|
244 |
+
if p32.grad is not None:
|
245 |
+
p32.grad.zero_()
|
246 |
+
self._needs_sync = False
|
247 |
+
|
248 |
+
if self.scaler is not None:
|
249 |
+
self._multiply_factor = 1.0 / float(self.scaler.loss_scale)
|
250 |
+
|
251 |
+
|
252 |
+
class FP16Optimizer(_FP16OptimizerMixin, optim.FairseqOptimizer):
|
253 |
+
"""
|
254 |
+
Wrap an *optimizer* to support FP16 (mixed precision) training.
|
255 |
+
"""
|
256 |
+
|
257 |
+
def __init__(self, cfg: DictConfig, params, fp32_optimizer, fp32_params, **kwargs):
|
258 |
+
super().__init__(cfg.optimizer)
|
259 |
+
self.fp16_params = params
|
260 |
+
self.fp32_optimizer = fp32_optimizer
|
261 |
+
self.fp32_params = fp32_params
|
262 |
+
|
263 |
+
if getattr(cfg.common, "fp16_scale_window", None) is None:
|
264 |
+
if len(cfg.optimization.update_freq) > 1:
|
265 |
+
raise ValueError(
|
266 |
+
"--fp16-scale-window must be given explicitly when using a "
|
267 |
+
"custom --update-freq schedule"
|
268 |
+
)
|
269 |
+
data_parallel_size = int(
|
270 |
+
cfg.distributed_training.distributed_world_size
|
271 |
+
/ cfg.common.model_parallel_size
|
272 |
+
)
|
273 |
+
scale_window = int(
|
274 |
+
2**14 / data_parallel_size / cfg.optimization.update_freq[0]
|
275 |
+
)
|
276 |
+
else:
|
277 |
+
scale_window = cfg.common.fp16_scale_window
|
278 |
+
|
279 |
+
if not getattr(cfg.common, "bf16", False):
|
280 |
+
self.scaler = DynamicLossScaler(
|
281 |
+
init_scale=cfg.common.fp16_init_scale,
|
282 |
+
scale_window=scale_window,
|
283 |
+
tolerance=cfg.common.fp16_scale_tolerance,
|
284 |
+
threshold=cfg.common.threshold_loss_scale,
|
285 |
+
min_loss_scale=cfg.common.min_loss_scale,
|
286 |
+
)
|
287 |
+
else:
|
288 |
+
# disable loss scaling for bfloat16
|
289 |
+
self.scaler = None
|
290 |
+
|
291 |
+
@classmethod
|
292 |
+
def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
|
293 |
+
"""
|
294 |
+
Args:
|
295 |
+
cfg (omegaconf.DictConfig): fairseq args
|
296 |
+
params (iterable): iterable of parameters to optimize
|
297 |
+
"""
|
298 |
+
flatten = not getattr(cfg.common, "fp16_no_flatten_grads", False)
|
299 |
+
if getattr(cfg.common, "bf16", False):
|
300 |
+
flatten = False # mixed precision is faster on TPUs without flat grads
|
301 |
+
fp32_params = cls.build_fp32_params(cfg.optimizer, params, flatten=flatten)
|
302 |
+
if flatten:
|
303 |
+
fp32_optimizer = optim.build_optimizer(cfg.optimizer, [fp32_params])
|
304 |
+
else:
|
305 |
+
fp32_optimizer = optim.build_optimizer(cfg.optimizer, fp32_params)
|
306 |
+
if flatten and not fp32_optimizer.supports_flat_params:
|
307 |
+
raise RuntimeError(
|
308 |
+
f"chosen optimizer {fp32_optimizer.__class__.__name__} does not support flat params, please set --fp16-no-flatten-grads"
|
309 |
+
)
|
310 |
+
return cls(cfg, params, fp32_optimizer, fp32_params, **kwargs)
|
311 |
+
|
312 |
+
@property
|
313 |
+
def optimizer(self):
|
314 |
+
return self.fp32_optimizer.optimizer
|
315 |
+
|
316 |
+
@optimizer.setter
|
317 |
+
def optimizer(self, optimizer):
|
318 |
+
self.fp32_optimizer.optimizer = optimizer
|
319 |
+
|
320 |
+
@property
|
321 |
+
def lr_scheduler(self):
|
322 |
+
return getattr(self.fp32_optimizer, "lr_scheduler", None)
|
323 |
+
|
324 |
+
@property
|
325 |
+
def optimizer_config(self):
|
326 |
+
return self.fp32_optimizer.optimizer_config
|
327 |
+
|
328 |
+
def get_lr(self):
|
329 |
+
return self.fp32_optimizer.get_lr()
|
330 |
+
|
331 |
+
def set_lr(self, lr):
|
332 |
+
self.fp32_optimizer.set_lr(lr)
|
333 |
+
|
334 |
+
def all_reduce_grads(self, module):
|
335 |
+
self.fp32_optimizer.all_reduce_grads(module)
|
336 |
+
|
337 |
+
@property
|
338 |
+
def supports_flat_params(self):
|
339 |
+
return self.fp32_optimizer.supports_flat_params
|
340 |
+
|
341 |
+
|
342 |
+
class _MemoryEfficientFP16OptimizerMixin(object):
|
343 |
+
def __init__(self, *args, **kwargs):
|
344 |
+
# forward __init__ call to the next class in MRO (method resolution order)
|
345 |
+
super().__init__(*args, **kwargs)
|
346 |
+
self._multiply_factor = 1.0
|
347 |
+
|
348 |
+
@property
|
349 |
+
def has_flat_params(self):
|
350 |
+
return False
|
351 |
+
|
352 |
+
def state_dict(self):
|
353 |
+
"""Return the optimizer's state dict."""
|
354 |
+
state_dict = self.wrapped_optimizer.state_dict()
|
355 |
+
if self.scaler is not None:
|
356 |
+
state_dict["loss_scale"] = self.scaler.loss_scale
|
357 |
+
return state_dict
|
358 |
+
|
359 |
+
def load_state_dict(self, state_dict, optimizer_overrides=None):
|
360 |
+
"""Load an optimizer state dict.
|
361 |
+
|
362 |
+
In general we should prefer the configuration of the existing optimizer
|
363 |
+
instance (e.g., learning rate) over that found in the state_dict. This
|
364 |
+
allows us to resume training from a checkpoint using a new set of
|
365 |
+
optimizer args.
|
366 |
+
"""
|
367 |
+
if "loss_scale" in state_dict and self.scaler is not None:
|
368 |
+
self.scaler.loss_scale = state_dict["loss_scale"]
|
369 |
+
|
370 |
+
self.wrapped_optimizer.load_state_dict(state_dict, optimizer_overrides)
|
371 |
+
|
372 |
+
# Hack: PyTorch automatically casts the optimizer state to match the
|
373 |
+
# type of the current parameters. But with --memory-efficient-fp16 the
|
374 |
+
# params are FP16 while the optimizer state is FP32 and we don't want
|
375 |
+
# to cast. A workaround is to manually copy back the original state
|
376 |
+
# after the optimizer has been loaded.
|
377 |
+
if not getattr(self.optimizer, "disable_mem_eff_fp16_loading_hack", False):
|
378 |
+
groups = self.optimizer.param_groups
|
379 |
+
saved_groups = state_dict["param_groups"]
|
380 |
+
id_map = {
|
381 |
+
old_id: p
|
382 |
+
for old_id, p in zip(
|
383 |
+
chain(*(g["params"] for g in saved_groups)),
|
384 |
+
chain(*(g["params"] for g in groups)),
|
385 |
+
)
|
386 |
+
}
|
387 |
+
for k, v in state_dict["state"].items():
|
388 |
+
if k in id_map:
|
389 |
+
param = id_map[k]
|
390 |
+
self.optimizer.state[param] = v
|
391 |
+
|
392 |
+
def backward(self, loss):
|
393 |
+
"""Computes the sum of gradients of the given tensor w.r.t. graph leaves.
|
394 |
+
|
395 |
+
Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this
|
396 |
+
function additionally dynamically scales the loss to avoid gradient
|
397 |
+
underflow.
|
398 |
+
"""
|
399 |
+
if self.scaler is not None:
|
400 |
+
loss = self.scaler.scale(loss)
|
401 |
+
loss.backward()
|
402 |
+
|
403 |
+
def _unscale_grads(self):
|
404 |
+
if (
|
405 |
+
# Skip the multiplication if it's a no-op (i.e., if _multiply_factor
|
406 |
+
# is 1.0). At the same time, we want to avoid the device-to-host
|
407 |
+
# transfer by comparing it to 1.0. Since _multiply_factor starts as
|
408 |
+
# a Python float, we roughly assume that if it's a tensor then it's
|
409 |
+
# probably not =1.0 anymore and we do the multiplication. Otherwise
|
410 |
+
# we can safely check the value without a D2H transfer.
|
411 |
+
torch.is_tensor(self._multiply_factor)
|
412 |
+
or self._multiply_factor != 1.0
|
413 |
+
):
|
414 |
+
self.wrapped_optimizer.multiply_grads(self._multiply_factor)
|
415 |
+
self._multiply_factor = 1.0
|
416 |
+
|
417 |
+
def multiply_grads(self, c):
|
418 |
+
"""Multiplies grads by a constant *c*."""
|
419 |
+
self._multiply_factor *= c
|
420 |
+
|
421 |
+
def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
|
422 |
+
"""Clips gradient norm and updates dynamic loss scaler."""
|
423 |
+
max_norm = float(max_norm)
|
424 |
+
grad_norm = self._multiply_factor * self.wrapped_optimizer.clip_grad_norm(
|
425 |
+
0, aggregate_norm_fn
|
426 |
+
)
|
427 |
+
|
428 |
+
if self.scaler is not None:
|
429 |
+
grad_norm_cpu = float(grad_norm)
|
430 |
+
if grad_norm_cpu > max_norm > 0.0:
|
431 |
+
                self._multiply_factor *= max_norm / grad_norm_cpu

            # detect overflow and adjust loss scale
            self.scaler.check_overflow(grad_norm_cpu)
        elif max_norm > 0.0:
            clip_coef = (max_norm / (grad_norm + 1e-6)).clamp_(max=1)
            self._multiply_factor *= clip_coef

        return grad_norm

    def step(self, closure=None, groups=None):
        """Performs a single optimization step."""
        if getattr(self, "supports_step_with_scale", False):
            # NOTE(msb) optimizer divides by scale factor
            self.wrapped_optimizer.step(
                closure, scale=(1.0 / self._multiply_factor), groups=groups
            )
        else:
            self._unscale_grads()
            self.wrapped_optimizer.step(closure, groups=groups)

        if self.scaler is not None:
            self.scaler.update()

    def zero_grad(self):
        """Clears the gradients of all optimized parameters."""
        self.wrapped_optimizer.zero_grad()
        if self.scaler is not None:
            self._multiply_factor = 1.0 / float(self.scaler.loss_scale)
        else:
            self._multiply_factor = 1.0

    @property
    def supports_flat_params(self):
        return self.wrapped_optimizer.supports_flat_params


class MemoryEfficientFP16Optimizer(
    _MemoryEfficientFP16OptimizerMixin, optim.FairseqOptimizer
):
    """
    Wrap an *optimizer* to support FP16 (mixed precision) training.

    Compared to :class:`fairseq.optim.FP16Optimizer`, this version does not
    maintain an FP32 copy of the model. We instead expect the optimizer to
    convert the gradients to FP32 internally and sync the results back to the
    FP16 model params. This significantly reduces memory usage but slightly
    increases the time spent in the optimizer.

    Since this wrapper depends on specific functionality in the wrapped
    optimizer (i.e., on-the-fly conversion of grads to FP32), only certain
    optimizers can be wrapped. This is determined by the
    *supports_memory_efficient_fp16* property.
    """

    def __init__(
        self, cfg: DictConfig, params, optimizer, allow_unsupported=False, **kwargs
    ):
        if not allow_unsupported and not optimizer.supports_memory_efficient_fp16:
            raise ValueError(
                "Unsupported optimizer: {}".format(optimizer.__class__.__name__)
            )

        super().__init__(getattr(cfg, "optimizer", None))
        self.wrapped_optimizer = optimizer

        if getattr(cfg.common, "fp16_scale_window", None) is None:
            if len(cfg.optimization.update_freq) > 1:
                raise ValueError(
                    "--fp16-scale-window must be given explicitly when using a "
                    "custom --update-freq schedule"
                )
            data_parallel_size = int(
                cfg.distributed_training.distributed_world_size
                / cfg.common.model_parallel_size
            )
            scale_window = int(
                2**14 / data_parallel_size / cfg.optimization.update_freq[0]
            )
        else:
            scale_window = cfg.common.fp16_scale_window

        if not getattr(cfg.common, "bf16", False):
            self.scaler = DynamicLossScaler(
                init_scale=cfg.common.fp16_init_scale,
                scale_window=scale_window,
                tolerance=cfg.common.fp16_scale_tolerance,
                threshold=cfg.common.threshold_loss_scale,
                min_loss_scale=cfg.common.min_loss_scale,
            )
        else:
            # disable loss scaling for bfloat16
            self.scaler = None

    @classmethod
    def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
        """
        Args:
            cfg (omegaconf.DictConfig): fairseq config
            params (iterable): iterable of parameters to optimize
        """
        fp16_optimizer = optim.build_optimizer(cfg.optimizer, params)
        return cls(cfg, params, fp16_optimizer, **kwargs)

    @property
    def optimizer(self):
        return self.wrapped_optimizer.optimizer

    @optimizer.setter
    def optimizer(self, optimizer):
        self.wrapped_optimizer.optimizer = optimizer

    @property
    def optimizer_config(self):
        return self.wrapped_optimizer.optimizer_config

    @property
    def lr_scheduler(self):
        return getattr(self.wrapped_optimizer, "lr_scheduler", None)

    def get_lr(self):
        return self.wrapped_optimizer.get_lr()

    def set_lr(self, lr):
        self.wrapped_optimizer.set_lr(lr)

    def all_reduce_grads(self, module):
        self.wrapped_optimizer.all_reduce_grads(module)
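As a quick illustration of the bookkeeping above, a minimal stand-alone sketch of the same idea (loss scaling plus gradient clipping folded into a single multiply factor, no FP32 parameter copy). This is not from the fairseq sources; the function name, constants, and single-tensor setup are assumptions for illustration only.

# Illustrative sketch only (not part of the uploaded files): a simplified model of the
# multiply-factor bookkeeping above, assuming one parameter tensor and a fixed loss scale.
import torch

def scaled_backward_and_step(loss, param, opt, loss_scale=128.0, max_norm=1.0):
    # scale the loss so FP16 gradients do not underflow
    (loss * loss_scale).backward()
    multiply_factor = 1.0 / loss_scale          # undo the scale lazily, as a factor
    grad_norm = multiply_factor * param.grad.norm()
    if max_norm > 0.0 and float(grad_norm) > max_norm:
        multiply_factor *= max_norm / float(grad_norm)  # fold clipping into the factor
    param.grad.mul_(multiply_factor)            # single in-place unscale + clip
    opt.step()
    opt.zero_grad()
    return grad_norm

p = torch.nn.Parameter(torch.randn(4))
opt = torch.optim.SGD([p], lr=0.1)
scaled_backward_and_step((p ** 2).sum(), p, opt)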
fairseq/fairseq/optim/fused_lamb.py
ADDED
@@ -0,0 +1,51 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from fairseq.optim import LegacyFairseqOptimizer, register_optimizer


@register_optimizer("lamb")
class FairseqLAMB(LegacyFairseqOptimizer):
    """LAMB optimizer."""

    def __init__(self, args, params):
        super().__init__(args)
        try:
            from apex.optimizers import FusedLAMB

            self._optimizer = FusedLAMB(params, **self.optimizer_config)
        except ImportError:
            raise ImportError("Please install apex to use LAMB optimizer")

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--lamb-betas', default='(0.9, 0.999)', metavar='B',
                            help='betas for LAMB optimizer')
        parser.add_argument('--lamb-eps', type=float, default=1e-8, metavar='D',
                            help='epsilon for LAMB optimizer')
        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                            help='weight decay')
        # fmt: on

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        return {
            "lr": self.args.lr[0],
            "betas": eval(self.args.lamb_betas),
            "eps": self.args.lamb_eps,
            "weight_decay": self.args.weight_decay,
        }

    @property
    def supports_flat_params(self):
        return False
fairseq/fairseq/optim/lr_scheduler/__init__.py
ADDED
@@ -0,0 +1,36 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""isort:skip_file"""

import importlib
import os

from fairseq import registry
from fairseq.optim.lr_scheduler.fairseq_lr_scheduler import (  # noqa
    FairseqLRScheduler,
    LegacyFairseqLRScheduler,
)
from omegaconf import DictConfig


(
    build_lr_scheduler_,
    register_lr_scheduler,
    LR_SCHEDULER_REGISTRY,
    LR_SCHEDULER_DATACLASS_REGISTRY,
) = registry.setup_registry(
    "--lr-scheduler", base_class=FairseqLRScheduler, default="fixed"
)


def build_lr_scheduler(cfg: DictConfig, optimizer):
    return build_lr_scheduler_(cfg, optimizer)


# automatically import any Python files in the optim/lr_scheduler/ directory
for file in sorted(os.listdir(os.path.dirname(__file__))):
    if file.endswith(".py") and not file.startswith("_"):
        file_name = file[: file.find(".py")]
        importlib.import_module("fairseq.optim.lr_scheduler." + file_name)
fairseq/fairseq/optim/lr_scheduler/__pycache__/cosine_lr_scheduler.cpython-310.pyc
ADDED
Binary file (4.24 kB). View file
fairseq/fairseq/optim/lr_scheduler/__pycache__/inverse_square_root_schedule.cpython-310.pyc
ADDED
Binary file (3.16 kB). View file
fairseq/fairseq/optim/lr_scheduler/__pycache__/polynomial_decay_schedule.cpython-310.pyc
ADDED
Binary file (3.07 kB). View file
fairseq/fairseq/optim/lr_scheduler/__pycache__/reduce_lr_on_plateau.cpython-310.pyc
ADDED
Binary file (4.28 kB). View file
fairseq/fairseq/optim/lr_scheduler/__pycache__/step_lr_scheduler.cpython-310.pyc
ADDED
Binary file (2.79 kB). View file
fairseq/fairseq/optim/lr_scheduler/__pycache__/tri_stage_lr_scheduler.cpython-310.pyc
ADDED
Binary file (4.88 kB). View file
fairseq/fairseq/optim/lr_scheduler/__pycache__/triangular_lr_scheduler.cpython-310.pyc
ADDED
Binary file (2.8 kB). View file
fairseq/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
ADDED
@@ -0,0 +1,146 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
from collections.abc import Collection
from dataclasses import dataclass, field
from typing import List

from omegaconf import II

from fairseq.dataclass import FairseqDataclass
from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler


@dataclass
class CosineLRScheduleConfig(FairseqDataclass):
    warmup_updates: int = field(
        default=0,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    warmup_init_lr: float = field(
        default=-1,
        metadata={
            "help": "initial learning rate during warmup phase; default is cfg.lr"
        },
    )
    lr: List[float] = field(
        default=II("optimization.lr"),
        metadata={"help": "max learning rate, must be more than cfg.min_lr"},
    )
    min_lr: float = field(default=0.0, metadata={"help": "min learning rate"})
    t_mult: float = field(
        default=1.0, metadata={"help": "factor to grow the length of each period"}
    )
    lr_period_updates: float = field(
        default=-1, metadata={"help": "initial number of updates per period"}
    )
    lr_shrink: float = field(
        default=0.1, metadata={"help": "shrink factor for annealing"}
    )
    # This is not required, but is for convenience in inferring lr_period_updates
    max_update: int = II("optimization.max_update")


@register_lr_scheduler("cosine", dataclass=CosineLRScheduleConfig)
class CosineLRSchedule(FairseqLRScheduler):
    """Assign LR based on a cyclical schedule that follows the cosine function.

    See https://arxiv.org/pdf/1608.03983.pdf for details.

    We also support a warmup phase where we linearly increase the learning rate
    from some initial learning rate (``--warmup-init-lr``) until the configured
    max learning rate (``--lr``).

    During warmup::

      lrs = torch.linspace(cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates)
      lr = lrs[update_num]

    After warmup::

      lr = cfg.min_lr + 0.5*(cfg.lr - cfg.min_lr)*(1 + cos(pi * t_curr / t_i))

    where ``t_curr`` is current percentage of updates within the current period
    range and ``t_i`` is the current period range, which is scaled by ``t_mult``
    after every iteration.
    """

    def __init__(self, cfg: CosineLRScheduleConfig, fairseq_optimizer):
        super().__init__(cfg, fairseq_optimizer)
        if isinstance(cfg.lr, Collection) and len(cfg.lr) > 1:
            raise ValueError(
                "Cannot use a fixed learning rate schedule with cosine."
                f" Consider --lr-scheduler=fixed instead. ({cfg.lr})"
            )

        self.max_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr
        if self.max_lr < cfg.min_lr:
            cfg.min_lr = self.max_lr

        warmup_end_lr = self.max_lr
        if cfg.warmup_init_lr < 0:
            cfg.warmup_init_lr = cfg.min_lr

        self.t_mult = cfg.t_mult
        self.period = cfg.lr_period_updates

        if self.period <= 0:
            assert (
                cfg.max_update > 0
            ), "Either --max_update or --lr-period-updates must be set"
            self.period = cfg.max_update - cfg.warmup_updates

        if cfg.warmup_updates > 0:
            # linearly warmup for the first cfg.warmup_updates
            self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates
        else:
            self.lr_step = 1

        self.warmup_updates = cfg.warmup_updates
        self.lr_shrink = cfg.lr_shrink

        # initial learning rate
        self.lr = cfg.warmup_init_lr
        self.optimizer.set_lr(self.lr)

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        super().step(epoch, val_loss)
        # we don't change the learning rate at epoch boundaries
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        if num_updates < self.cfg.warmup_updates:
            self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step
        else:
            curr_updates = num_updates - self.cfg.warmup_updates
            if self.t_mult != 1:
                i = math.floor(
                    math.log(
                        1 - curr_updates / self.period * (1 - self.t_mult), self.t_mult
                    )
                )
                t_i = self.t_mult**i * self.period
                t_curr = (
                    curr_updates
                    - (1 - self.t_mult**i) / (1 - self.t_mult) * self.period
                )
            else:
                i = math.floor(curr_updates / self.period)
                t_i = self.period
                t_curr = curr_updates - (self.period * i)

            lr_shrink = self.lr_shrink**i
            min_lr = self.cfg.min_lr * lr_shrink
            max_lr = self.max_lr * lr_shrink

            self.lr = min_lr + 0.5 * (max_lr - min_lr) * (
                1 + math.cos(math.pi * t_curr / t_i)
            )

        self.optimizer.set_lr(self.lr)
        return self.lr
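For reference, the post-warmup cosine value for the simple ``t_mult == 1`` case can be computed in isolation; this is an illustrative sketch mirroring the docstring formula above, not code from the fairseq sources, and the argument names and defaults are assumptions.

# Illustrative sketch only (not part of the uploaded files).
import math

def cosine_lr(num_updates, warmup_updates, period, max_lr, min_lr=0.0, lr_shrink=1.0):
    curr = num_updates - warmup_updates
    i = curr // period                     # index of the current restart period
    t_curr = curr - i * period             # position inside the period
    shrink = lr_shrink ** i
    lo, hi = min_lr * shrink, max_lr * shrink
    return lo + 0.5 * (hi - lo) * (1 + math.cos(math.pi * t_curr / period))

print(cosine_lr(num_updates=1000, warmup_updates=0, period=4000, max_lr=5e-4))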
fairseq/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
ADDED
@@ -0,0 +1,59 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from argparse import Namespace

from fairseq.dataclass.utils import gen_parser_from_dataclass
from fairseq.optim import FairseqOptimizer


class FairseqLRScheduler(object):
    def __init__(self, cfg, optimizer):
        super().__init__()
        if optimizer is not None and not isinstance(optimizer, FairseqOptimizer):
            raise ValueError("optimizer must be an instance of FairseqOptimizer")
        self.cfg = cfg
        self.optimizer = optimizer
        self.best = None

    @classmethod
    def add_args(cls, parser):
        """Add arguments to the parser for this LR scheduler."""
        dc = getattr(cls, "__dataclass", None)
        if dc is not None:
            gen_parser_from_dataclass(parser, dc())

    def state_dict(self):
        """Return the LR scheduler state dict."""
        return {"best": self.best}

    def load_state_dict(self, state_dict):
        """Load an LR scheduler state dict."""
        self.best = state_dict["best"]

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        pass

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        if val_loss is not None:
            if self.best is None:
                self.best = val_loss
            else:
                self.best = min(self.best, val_loss)

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        return self.optimizer.get_lr()


class LegacyFairseqLRScheduler(FairseqLRScheduler):
    def __init__(self, args: Namespace, optimizer):
        if not isinstance(optimizer, FairseqOptimizer):
            raise ValueError("optimizer must be an instance of FairseqOptimizer")
        self.args = args
        self.optimizer = optimizer
        self.best = None
fairseq/fairseq/optim/lr_scheduler/fixed_schedule.py
ADDED
@@ -0,0 +1,76 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass, field
from typing import Optional, List
from omegaconf import II

from fairseq.dataclass import FairseqDataclass
from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler


@dataclass
class FixedLRScheduleConfig(FairseqDataclass):
    force_anneal: Optional[int] = field(
        default=None,
        metadata={"help": "force annealing at specified epoch"},
    )
    lr_shrink: float = field(
        default=0.1,
        metadata={"help": "shrink factor for annealing, lr_new = (lr * lr_shrink)"},
    )
    warmup_updates: int = field(
        default=0,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    lr: List[float] = II("optimization.lr")


@register_lr_scheduler("fixed", dataclass=FixedLRScheduleConfig)
class FixedLRSchedule(FairseqLRScheduler):
    """Decay the LR on a fixed schedule."""

    def __init__(self, cfg: FixedLRScheduleConfig, optimizer):
        super().__init__(cfg, optimizer)

        self.lr = cfg.lr[0]
        if cfg.warmup_updates > 0:
            self.warmup_factor = 1.0 / cfg.warmup_updates
        else:
            self.warmup_factor = 1

    def state_dict(self):
        return {"lr": self.lr}

    def load_state_dict(self, state_dict):
        if "lr" in state_dict:
            self.lr = state_dict["lr"]

    def get_next_lr(self, epoch):
        lrs = self.cfg.lr
        if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal:
            # use fixed LR schedule
            next_lr = lrs[min(epoch - 1, len(lrs) - 1)]
        else:
            # anneal based on lr_shrink
            next_lr = lrs[-1] * self.cfg.lr_shrink ** (
                epoch + 1 - self.cfg.force_anneal
            )
        return next_lr

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        self.lr = self.get_next_lr(epoch)
        self.optimizer.set_lr(self.warmup_factor * self.lr)
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        if self.cfg.warmup_updates > 0 and num_updates < self.cfg.warmup_updates:
            self.warmup_factor = (num_updates + 1) / float(self.cfg.warmup_updates)
            self.optimizer.set_lr(self.warmup_factor * self.lr)
        else:
            self.optimizer.set_lr(self.lr)
        return self.optimizer.get_lr()
fairseq/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
ADDED
@@ -0,0 +1,85 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from collections.abc import Collection
from dataclasses import dataclass, field
from typing import List

from omegaconf import II

from fairseq.dataclass import FairseqDataclass
from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler


@dataclass
class InverseSquareRootLRScheduleConfig(FairseqDataclass):
    warmup_updates: int = field(
        default=4000,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    warmup_init_lr: float = field(
        default=-1,
        metadata={
            "help": "initial learning rate during warmup phase; default is cfg.lr"
        },
    )
    lr: List[float] = II("optimization.lr")


@register_lr_scheduler("inverse_sqrt", dataclass=InverseSquareRootLRScheduleConfig)
class InverseSquareRootSchedule(FairseqLRScheduler):
    """Decay the LR based on the inverse square root of the update number.

    We also support a warmup phase where we linearly increase the learning rate
    from some initial learning rate (``--warmup-init-lr``) until the configured
    learning rate (``--lr``). Thereafter we decay proportional to the number of
    updates, with a decay factor set to align with the configured learning rate.

    During warmup::

      lrs = torch.linspace(cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates)
      lr = lrs[update_num]

    After warmup::

      decay_factor = cfg.lr * sqrt(cfg.warmup_updates)
      lr = decay_factor / sqrt(update_num)
    """

    def __init__(self, cfg: InverseSquareRootLRScheduleConfig, optimizer):
        super().__init__(cfg, optimizer)
        if isinstance(cfg.lr, Collection) and len(cfg.lr) > 1:
            raise ValueError(
                "Cannot use a fixed learning rate schedule with inverse_sqrt."
                " Consider --lr-scheduler=fixed instead."
            )
        warmup_end_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr
        if cfg.warmup_init_lr < 0:
            cfg.warmup_init_lr = 0 if cfg.warmup_updates > 0 else warmup_end_lr

        # linearly warmup for the first cfg.warmup_updates
        self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates

        # then, decay prop. to the inverse square root of the update number
        self.decay_factor = warmup_end_lr * cfg.warmup_updates**0.5

        # initial learning rate
        self.lr = cfg.warmup_init_lr
        self.optimizer.set_lr(self.lr)

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        super().step(epoch, val_loss)
        # we don't change the learning rate at epoch boundaries
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        if num_updates < self.cfg.warmup_updates:
            self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step
        else:
            self.lr = self.decay_factor * num_updates**-0.5
        self.optimizer.set_lr(self.lr)
        return self.lr
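The inverse-sqrt rule in the docstring above can also be written as a single function; this is an illustrative sketch, not fairseq code, and the defaults shown are assumptions.

# Illustrative sketch only (not part of the uploaded files).
def inverse_sqrt_lr(num_updates, lr=5e-4, warmup_updates=4000, warmup_init_lr=0.0):
    if num_updates < warmup_updates:
        return warmup_init_lr + num_updates * (lr - warmup_init_lr) / warmup_updates
    return lr * warmup_updates ** 0.5 * num_updates ** -0.5  # equals lr at the warmup boundary

print(inverse_sqrt_lr(4000), inverse_sqrt_lr(16000))  # peak lr, then lr / 2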
fairseq/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
ADDED
@@ -0,0 +1,121 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from . import LegacyFairseqLRScheduler, register_lr_scheduler
import logging
import ast

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


@register_lr_scheduler("manual")
class ManualSchedule(LegacyFairseqLRScheduler):
    """Decay the LR on a manual schedule."""

    def __init__(self, args, optimizer):
        super().__init__(args, optimizer)

        self.epoch2lr = self.parse_manuallr_args(args.epoch2lr)
        self.update2lr = self.parse_manuallr_args(args.update2lr)
        logger.info("@@@ ManualSchedule epoch2lr={}".format(self.epoch2lr))
        logger.info("@@@ ManualSchedule update2lr={}".format(self.update2lr))

        if 1 in self.epoch2lr:
            self.lr = self.epoch2lr[1]
        elif 1 in self.update2lr:
            self.lr = self.update2lr[1]
        else:
            self.lr = args.lr[0]
        self.optimizer.set_lr(self.lr)  # Set the beginning of the epoch.

    def parse_manuallr_args(self, lr_args_str):
        lr_dict = ast.literal_eval(lr_args_str.replace(" ", ""))
        if not isinstance(lr_dict, dict):
            raise ValueError("epoch2lr/update2lr must be able to be evaluated to a dict")

        lr_args = {}
        logger.info("@@@ after parsing input dictionary lr_dict = {}".format(lr_dict))
        for key, val in lr_dict.items():
            if "," in key:
                for k in key.split(","):
                    lr_args[int(k)] = float(val)
            elif "-" in key:
                s = int(key.split("-")[0])
                e = int(key.split("-")[1])
                for k in range(s, e + 1, 1):
                    lr_args[k] = float(val)
            else:
                lr_args[int(key)] = float(val)

        return lr_args

    @staticmethod
    def add_args(parser):
        """Add arguments to the parser for this LR scheduler."""
        # fmt: off
        parser.add_argument(
            "--epoch2lr",
            type=str,
            metavar="DICT",
            default="{}",
            help="a dictionary used to set lr for each epoch manually",
        )
        parser.add_argument(
            "--update2lr",
            type=str,
            metavar="DICT",
            default="{}",
            help="a dictionary used to set lr for each update manually",
        )
        # fmt: on

    def state_dict(self):
        return {"lr": self.lr}

    def load_state_dict(self, state_dict):
        if "lr" in state_dict:
            self.lr = state_dict["lr"]

    def get_next_lr(self, epoch):
        manual_keys = [k for k in self.epoch2lr if k <= epoch]
        if manual_keys:
            manual_lr = self.epoch2lr[max(manual_keys)]
        else:
            logger.warning(
                "@@@ epoch={} does not exist in manual lr input. epoch2lr={}...".format(
                    epoch,
                    list(self.epoch2lr.items())[
                        : min(10, len(self.epoch2lr.keys()) - 1)
                    ],
                )
            )
            manual_lr = self.optimizer.get_lr()
        return manual_lr

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        self.lr = self.get_next_lr(epoch)
        self.optimizer.set_lr(self.lr)
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        manual_keys = [k for k in self.update2lr if k <= num_updates]
        if manual_keys:
            manual_lr = self.update2lr[max(manual_keys)]
        else:
            logger.warning(
                "update={} does not exist in manual lr input update2lr={}...".format(
                    num_updates,
                    list(self.update2lr.items())[
                        : min(10, len(self.update2lr.keys()) - 1)
                    ],
                )
            )
            manual_lr = self.optimizer.get_lr()

        self.optimizer.set_lr(manual_lr)
        return self.optimizer.get_lr()
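To make the manual-schedule string format concrete, here is an illustrative sketch (not fairseq code) of how parse_manuallr_args expands "a-b" range keys and "a,b" list keys into a per-epoch dictionary; the helper name is an assumption.

# Illustrative sketch only (not part of the uploaded files).
import ast

def expand_manual_lr(spec):
    out = {}
    for key, val in ast.literal_eval(spec.replace(" ", "")).items():
        if "," in key:
            out.update({int(k): float(val) for k in key.split(",")})
        elif "-" in key:
            s, e = (int(x) for x in key.split("-"))
            out.update({k: float(val) for k in range(s, e + 1)})
        else:
            out[int(key)] = float(val)
    return out

print(expand_manual_lr("{'1-3': 5e-4, '4,5': 1e-4, '6': 5e-5}"))
# {1: 0.0005, 2: 0.0005, 3: 0.0005, 4: 0.0001, 5: 0.0001, 6: 5e-05}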
fairseq/fairseq/optim/lr_scheduler/pass_through.py
ADDED
@@ -0,0 +1,39 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass

from fairseq.dataclass import FairseqDataclass
from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler


@dataclass
class PassThroughScheduleConfig(FairseqDataclass):
    pass


@register_lr_scheduler("pass_through", dataclass=PassThroughScheduleConfig)
class PassThroughScheduleSchedule(FairseqLRScheduler):
    """Delegate lr scheduling to the optimizer."""

    def __init__(self, cfg: PassThroughScheduleConfig, optimizer):
        super().__init__(cfg, optimizer)
        assert (
            hasattr(optimizer, "lr_scheduler") and optimizer.lr_scheduler is not None
        ), "Pass-through schedule can only be used with optimizers with their own schedulers"

    def state_dict(self):
        return self.optimizer.lr_scheduler.state_dict()

    def load_state_dict(self, state_dict):
        self.optimizer.lr_scheduler.load_state_dict(state_dict)

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        return self.optimizer.lr_scheduler.step_begin_epoch(epoch)

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        return self.optimizer.lr_scheduler.step_update(num_updates)
fairseq/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass, field
from typing import Optional, List
from omegaconf import II

from fairseq.dataclass import FairseqDataclass
from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler


@dataclass
class PolynomialDecayLRScheduleConfig(FairseqDataclass):
    warmup_updates: int = field(
        default=0,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    force_anneal: Optional[int] = field(
        default=None,
        metadata={"help": "force annealing at specified epoch"},
    )
    end_learning_rate: float = field(
        default=0.0,
        metadata={"help": "learning rate to decay to"},
    )
    power: float = field(
        default=1.0,
        metadata={"help": "decay exponent"},
    )
    total_num_update: float = field(
        default=II("optimization.max_update"),
        metadata={"help": "total number of updates over which to decay learning rate"},
    )
    lr: List[float] = II("optimization.lr")


@register_lr_scheduler("polynomial_decay", dataclass=PolynomialDecayLRScheduleConfig)
class PolynomialDecayLRSchedule(FairseqLRScheduler):
    """Decay the LR on a fixed schedule."""

    def __init__(self, cfg: PolynomialDecayLRScheduleConfig, optimizer):
        super().__init__(cfg, optimizer)

        assert cfg.total_num_update > 0

        self.lr = cfg.lr[0]
        if cfg.warmup_updates > 0:
            self.warmup_factor = 1.0 / cfg.warmup_updates
        else:
            self.warmup_factor = 1
        self.end_learning_rate = cfg.end_learning_rate
        self.total_num_update = cfg.total_num_update
        self.power = cfg.power
        self.optimizer.set_lr(self.warmup_factor * self.lr)

    def get_next_lr(self, epoch):
        lrs = self.cfg.lr
        if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal:
            # use fixed LR schedule
            next_lr = lrs[min(epoch, len(lrs) - 1)]
        else:
            # anneal based on lr_shrink
            next_lr = self.optimizer.get_lr()
        return next_lr

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        self.lr = self.get_next_lr(epoch)
        self.optimizer.set_lr(self.warmup_factor * self.lr)
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        if self.cfg.warmup_updates > 0 and num_updates <= self.cfg.warmup_updates:
            self.warmup_factor = num_updates / float(self.cfg.warmup_updates)
            lr = self.warmup_factor * self.lr
        elif num_updates >= self.total_num_update:
            lr = self.end_learning_rate
        else:
            warmup = self.cfg.warmup_updates
            lr_range = self.lr - self.end_learning_rate
            pct_remaining = 1 - (num_updates - warmup) / (
                self.total_num_update - warmup
            )
            lr = lr_range * pct_remaining ** (self.power) + self.end_learning_rate
        self.optimizer.set_lr(lr)
        return self.optimizer.get_lr()
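The polynomial-decay rule in step_update above reduces to a single closed-form expression; the following is an illustrative sketch under assumed argument names and defaults, not fairseq code.

# Illustrative sketch only (not part of the uploaded files).
def polynomial_decay_lr(num_updates, lr, total_num_update, warmup_updates=0,
                        end_learning_rate=0.0, power=1.0):
    if warmup_updates > 0 and num_updates <= warmup_updates:
        return lr * num_updates / warmup_updates
    if num_updates >= total_num_update:
        return end_learning_rate
    pct_remaining = 1 - (num_updates - warmup_updates) / (total_num_update - warmup_updates)
    return (lr - end_learning_rate) * pct_remaining ** power + end_learning_rate

print(polynomial_decay_lr(50000, lr=5e-4, total_num_update=100000, warmup_updates=10000))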
fairseq/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass, field
from typing import List

import torch.optim.lr_scheduler
from omegaconf import II

from fairseq.dataclass import FairseqDataclass
from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler


@dataclass
class ReduceLROnPlateauLRScheduleConfig(FairseqDataclass):
    lr_shrink: float = field(
        default=0.1, metadata={"help": "shrink factor for annealing"}
    )
    lr_threshold: float = field(
        default=1e-4,
        metadata={
            "help": (
                "threshold for measuring the new optimum, to only focus on "
                "significant changes"
            )
        },
    )
    lr_patience: int = field(
        default=0,
        metadata={
            "help": (
                "number of epochs with no improvement after which learning rate will "
                "be reduced"
            )
        },
    )
    warmup_updates: int = field(
        default=0,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    warmup_init_lr: float = field(
        default=-1,
        metadata={
            "help": "initial learning rate during warmup phase; default is cfg.lr"
        },
    )
    lr: List[float] = II("optimization.lr")
    maximize_best_checkpoint_metric: bool = II(
        "checkpoint.maximize_best_checkpoint_metric"
    )


@register_lr_scheduler(
    "reduce_lr_on_plateau", dataclass=ReduceLROnPlateauLRScheduleConfig
)
class ReduceLROnPlateauLRSchedule(FairseqLRScheduler):
    """
    Decay the LR by a factor every time the validation loss plateaus.
    Also comes with optional warmup phase, where we linearly increase
    the learning rate from some initial learning rate
    (``--warmup-init-lr``) until the configured learning rate
    (``--lr``). Thereafter the lr is adjusted according to original
    reduce_on_plateau scheme.

    During warmup::

      lrs = torch.linspace(
          cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates
      )
      lr = lrs[update_num]
    """

    def __init__(self, cfg: ReduceLROnPlateauLRScheduleConfig, optimizer):
        super().__init__(cfg, optimizer)
        if len(cfg.lr) > 1:
            raise ValueError(
                "Cannot use a fixed learning rate schedule with reduce_lr_on_plateau."
                " Consider --lr-scheduler=fixed instead."
            )
        self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer.optimizer,
            patience=cfg.lr_patience,
            factor=cfg.lr_shrink,
            mode="max" if cfg.maximize_best_checkpoint_metric else "min",
            threshold=cfg.lr_threshold,
        )
        warmup_end_lr = cfg.lr[0]
        # if no warm up, sets initial lr to be cfg.lr[0]
        if cfg.warmup_init_lr < 0:
            cfg.warmup_init_lr = 0 if cfg.warmup_updates > 0 else warmup_end_lr

        # linearly warmup for the first cfg.warmup_updates
        if cfg.warmup_updates > 0:
            self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates

        # this flag is either set from arg when no warm up, or set by
        # step_update() when warmup finishes
        self.warmup_end = True if cfg.warmup_updates <= 0 else False

        # initial learning rate
        # this self.lr is used only during init and/or warm up period
        self.lr = warmup_end_lr if self.warmup_end else cfg.warmup_init_lr
        self.optimizer.set_lr(self.lr)

    def state_dict(self):
        """Return the LR scheduler state dict."""
        return {
            "best": self.lr_scheduler.best,
            "last_epoch": self.lr_scheduler.last_epoch,
        }

    def load_state_dict(self, state_dict):
        """Load an LR scheduler state dict."""
        self.lr_scheduler.best = state_dict["best"]
        if "last_epoch" in state_dict:
            self.lr_scheduler.last_epoch = state_dict["last_epoch"]

    def step(self, epoch, val_loss=None):
        """
        Update the learning rate at the end of the given epoch if warmup
        finishes otherwise no update of lr on epoch boundaries
        """
        if val_loss is not None and self.warmup_end is True:
            self.lr_scheduler.step(val_loss)
        else:
            self.lr_scheduler.last_epoch = epoch
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        # if there is warmup
        if self.cfg.warmup_updates > 0:
            if num_updates <= self.cfg.warmup_updates:
                self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step
                self.optimizer.set_lr(self.lr)
            else:
                if self.warmup_end is False:
                    self.warmup_end = True
        # else do nothing
        return self.optimizer.get_lr()
fairseq/fairseq/optim/lr_scheduler/step_lr_scheduler.py
ADDED
@@ -0,0 +1,85 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from collections.abc import Collection
from dataclasses import dataclass, field
from typing import List

from omegaconf import II

from fairseq.dataclass import FairseqDataclass
from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler


@dataclass
class StepLRScheduleConfig(FairseqDataclass):
    warmup_updates: int = field(
        default=0,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    warmup_init_lr: float = field(
        default=-1,
        metadata={
            "help": "initial learning rate during warmup phase; default is cfg.lr"
        },
    )
    lr: List[float] = field(
        default=II("optimization.lr"),
        metadata={"help": "max learning rate, must be more than cfg.min_lr"},
    )
    min_lr: float = field(default=0.0, metadata={"help": "min learning rate"})
    lr_deacy_period: int = field(default=25000, metadata={"help": "decay period"})
    lr_decay: float = field(default=0.5, metadata={"help": "decay factor"})


@register_lr_scheduler("step", dataclass=StepLRScheduleConfig)
class StepLRSchedule(FairseqLRScheduler):
    """Decay learning rate every k updates by a fixed factor"""

    def __init__(self, cfg: StepLRScheduleConfig, fairseq_optimizer):
        super().__init__(cfg, fairseq_optimizer)
        self.max_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr
        self.min_lr = cfg.min_lr
        self.lr_deacy_period = cfg.lr_deacy_period
        self.lr_decay = cfg.lr_decay
        self.warmup_updates = cfg.warmup_updates
        self.warmup_init_lr = (
            cfg.warmup_init_lr if cfg.warmup_init_lr >= 0 else self.min_lr
        )

        assert self.lr_deacy_period > 0
        assert self.lr_decay <= 1
        assert self.min_lr >= 0
        assert self.max_lr > self.min_lr

        if cfg.warmup_updates > 0:
            # linearly warmup for the first cfg.warmup_updates
            self.warmup_lr_step = (
                self.max_lr - self.warmup_init_lr
            ) / self.warmup_updates
        else:
            self.warmup_lr_step = 1

        # initial learning rate
        self.lr = self.warmup_init_lr
        self.optimizer.set_lr(self.lr)

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        super().step(epoch, val_loss)
        # we don't change the learning rate at epoch boundaries
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        if num_updates < self.cfg.warmup_updates:
            self.lr = self.warmup_init_lr + num_updates * self.warmup_lr_step
        else:
            curr_updates = num_updates - self.cfg.warmup_updates
            lr_mult = self.lr_decay ** (curr_updates // self.lr_deacy_period)
            self.lr = max(self.max_lr * lr_mult, self.min_lr)

        self.optimizer.set_lr(self.lr)
        return self.lr
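The step decay above (note that the config field is spelled lr_deacy_period in the source) can be summarized as a single rule; the following is an illustrative sketch with assumed argument names and defaults, not fairseq code.

# Illustrative sketch only (not part of the uploaded files).
def step_lr(num_updates, max_lr, lr_deacy_period=25000, lr_decay=0.5,
            min_lr=0.0, warmup_updates=0, warmup_init_lr=0.0):
    if num_updates < warmup_updates:
        return warmup_init_lr + num_updates * (max_lr - warmup_init_lr) / warmup_updates
    k = (num_updates - warmup_updates) // lr_deacy_period  # number of completed decay periods
    return max(max_lr * lr_decay ** k, min_lr)

print([step_lr(u, max_lr=1e-3) for u in (0, 25000, 50000, 75000)])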
fairseq/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
ADDED
@@ -0,0 +1,175 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
from dataclasses import dataclass, field
from typing import Optional, List, Tuple
from omegaconf import II

from fairseq.dataclass import FairseqDataclass
from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler


@dataclass
class TriStageLRScheduleConfig(FairseqDataclass):
    warmup_steps: int = field(
        default=0,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    hold_steps: int = field(
        default=0,
        metadata={"help": "steps in hold stage"},
    )
    decay_steps: int = field(
        default=0,
        metadata={"help": "steps in decay stages"},
    )
    phase_ratio: Optional[Tuple[float, float, float]] = field(
        default=None,
        metadata={
            "help": (
                "if set, automatically sets warmup/hold/decay steps to the ratio "
                "specified here from max_updates. the ratios must add up to 1.0"
            )
        },
    )
    init_lr_scale: float = field(
        default=0.01,
        metadata={"help": "initial learning rate scale during warmup phase"},
    )
    final_lr_scale: float = field(
        default=0.01,
        metadata={"help": "final learning rate scale"},
    )
    max_update: float = II("optimization.max_update")
    lr: List[float] = II("optimization.lr")


@register_lr_scheduler("tri_stage", dataclass=TriStageLRScheduleConfig)
class TriStageLRSchedule(FairseqLRScheduler):
    """Tri-stage learning rate scheduler.

    Implement the learning rate scheduler in https://arxiv.org/pdf/1904.08779.pdf

    Similar to the inverse_square_root scheduler, but tri_stage learning rate employs
    three stages of LR scheduling:

      - warmup stage, starting from `lr` * `init_lr_scale`, linearly
        increased to `lr` in `warmup_steps` iterations

      - hold stage, after `warmup_steps`, keep the LR as `lr` for `hold_steps`
        iterations

      - decay stage, after hold stage, decay LR exponentially to
        `lr` * `final_lr_scale` in `decay_steps`;
        after that LR is kept at `final_lr_scale` * `lr`

    During warmup::

      init_lr = cfg.init_lr_scale * cfg.lr
      lrs = torch.linspace(init_lr, cfg.lr, cfg.warmup_steps)
      lr = lrs[update_num]

    During hold::

      lr = cfg.lr

    During decay::

      decay_factor = - math.log(cfg.final_lr_scale) / cfg.decay_steps
      lr = cfg.lr * exp(- (update_num - warmup_steps - hold_steps) * decay_factor)

    After that::

      lr = cfg.lr * cfg.final_lr_scale
    """

    def __init__(self, cfg: TriStageLRScheduleConfig, optimizer):
        super().__init__(cfg, optimizer)
        if len(cfg.lr) > 1:
            raise ValueError(
                "Cannot use a fixed learning rate schedule with tri-stage lr."
                " Consider --lr-scheduler=fixed instead."
            )

        # calculate LR at each point
        self.peak_lr = cfg.lr[0]
        self.init_lr = cfg.init_lr_scale * cfg.lr[0]
        self.final_lr = cfg.final_lr_scale * cfg.lr[0]

        if cfg.phase_ratio is not None:
            assert cfg.max_update > 0
            assert sum(cfg.phase_ratio) == 1, "phase ratios must add up to 1"
            self.warmup_steps = int(cfg.max_update * cfg.phase_ratio[0])
            self.hold_steps = int(cfg.max_update * cfg.phase_ratio[1])
            self.decay_steps = int(cfg.max_update * cfg.phase_ratio[2])
        else:
            self.warmup_steps = cfg.warmup_steps
            self.hold_steps = cfg.hold_steps
            self.decay_steps = cfg.decay_steps

        assert (
            self.warmup_steps + self.hold_steps + self.decay_steps > 0
        ), "please specify steps or phase_ratio"

        self.warmup_rate = (
            (self.peak_lr - self.init_lr) / self.warmup_steps
            if self.warmup_steps != 0
            else 0
        )
        self.decay_factor = -math.log(cfg.final_lr_scale) / self.decay_steps

        # initial learning rate
        self.lr = self.init_lr
        self.optimizer.set_lr(self.lr)

    def _decide_stage(self, update_step):
        """
        return stage, and the corresponding steps within the current stage
        """
        if update_step < self.warmup_steps:
            # warmup state
            return 0, update_step

        offset = self.warmup_steps

        if update_step < offset + self.hold_steps:
            # hold stage
            return 1, update_step - offset

        offset += self.hold_steps

        if update_step <= offset + self.decay_steps:
            # decay stage
            return 2, update_step - offset

        offset += self.decay_steps

        # still here ? constant lr stage
        return 3, update_step - offset

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        super().step(epoch, val_loss)
        # we don't change the learning rate at epoch boundaries
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        stage, steps_in_stage = self._decide_stage(num_updates)
        if stage == 0:
            self.lr = self.init_lr + self.warmup_rate * steps_in_stage
        elif stage == 1:
            self.lr = self.peak_lr
        elif stage == 2:
            self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage)
        elif stage == 3:
            self.lr = self.final_lr
        else:
            raise ValueError("Undefined stage")

        self.optimizer.set_lr(self.lr)

        return self.lr
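The three stages described in the docstring above can be collapsed into one function; this is an illustrative sketch assuming explicit step counts rather than phase_ratio, and it is not fairseq code.

# Illustrative sketch only (not part of the uploaded files).
import math

def tri_stage_lr(num_updates, peak_lr, warmup_steps, hold_steps, decay_steps,
                 init_lr_scale=0.01, final_lr_scale=0.01):
    init_lr, final_lr = init_lr_scale * peak_lr, final_lr_scale * peak_lr
    if num_updates < warmup_steps:
        return init_lr + (peak_lr - init_lr) * num_updates / warmup_steps
    if num_updates < warmup_steps + hold_steps:
        return peak_lr
    steps_in_decay = num_updates - warmup_steps - hold_steps
    if steps_in_decay <= decay_steps:
        decay_factor = -math.log(final_lr_scale) / decay_steps
        return peak_lr * math.exp(-decay_factor * steps_in_decay)
    return final_lr

print(tri_stage_lr(30000, peak_lr=5e-4, warmup_steps=10000, hold_steps=10000, decay_steps=20000))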
fairseq/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
ADDED
@@ -0,0 +1,83 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
from dataclasses import dataclass, field
from typing import List

from omegaconf import II

from fairseq.dataclass import FairseqDataclass
from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler


@dataclass
class TriangularLRScheduleConfig(FairseqDataclass):
    max_lr: float = field(
        default="???", metadata={"help": "max learning rate, must be more than cfg.lr"}
    )
    lr_period_updates: float = field(
        default=5000,
        metadata={"help": "initial number of updates per period (cycle length)"},
    )
    lr_shrink: float = field(
        default=0.1, metadata={"help": "shrink factor for annealing"}
    )
    shrink_min: bool = field(
        default=False, metadata={"help": "if set, also shrinks min lr"}
    )
    lr: List[float] = II("optimization.lr")


@register_lr_scheduler("triangular", dataclass=TriangularLRScheduleConfig)
class TriangularLRSchedule(FairseqLRScheduler):
    """Assign LR based on a triangular cyclical schedule.

    See https://arxiv.org/pdf/1506.01186.pdf for details.
    """

    def __init__(self, cfg: TriangularLRScheduleConfig, optimizer):
        super().__init__(cfg, optimizer)
        if len(cfg.lr) > 1:
            raise ValueError(
                "Cannot use a fixed learning rate schedule with triangular."
                " Consider --lr-scheduler=fixed instead."
            )

        lr = cfg.lr[0]

        assert cfg.max_lr > lr, "max_lr must be more than lr"
        self.min_lr = lr
        self.max_lr = cfg.max_lr
        self.stepsize = cfg.lr_period_updates // 2
        self.lr_shrink = cfg.lr_shrink
        self.shrink_min = cfg.shrink_min

        # initial learning rate
        self.lr = self.min_lr
        self.optimizer.set_lr(self.lr)

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        super().step(epoch, val_loss)
        # we don't change the learning rate at epoch boundaries
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        cycle = math.floor(num_updates / (2 * self.stepsize))

        lr_shrink = self.lr_shrink**cycle
        max_lr = self.max_lr * lr_shrink
        if self.shrink_min:
            min_lr = self.min_lr * lr_shrink
        else:
            min_lr = self.min_lr

        x = abs(num_updates / self.stepsize - 2 * (cycle + 1) + 1)
        self.lr = min_lr + (max_lr - min_lr) * max(0, (1 - x))

        self.optimizer.set_lr(self.lr)
        return self.lr
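One triangular cycle of the schedule above, written as a plain function for illustration; the argument names and defaults are assumptions, and this is not fairseq code.

# Illustrative sketch only (not part of the uploaded files).
import math

def triangular_lr(num_updates, min_lr, max_lr, lr_period_updates=5000,
                  lr_shrink=0.1, shrink_min=False):
    stepsize = lr_period_updates // 2
    cycle = math.floor(num_updates / (2 * stepsize))
    shrink = lr_shrink ** cycle
    hi = max_lr * shrink
    lo = min_lr * shrink if shrink_min else min_lr
    x = abs(num_updates / stepsize - 2 * (cycle + 1) + 1)  # 1 at cycle edges, 0 at the peak
    return lo + (hi - lo) * max(0, 1 - x)

print([round(triangular_lr(u, 1e-5, 1e-3), 6) for u in (0, 2500, 5000, 7500)])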
fairseq/fairseq/optim/nag.py
ADDED
@@ -0,0 +1,111 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
from collections.abc import Collection
|
7 |
+
from dataclasses import dataclass, field
|
8 |
+
from typing import List
|
9 |
+
|
10 |
+
import torch
|
11 |
+
from fairseq.dataclass import FairseqDataclass
|
12 |
+
from omegaconf import II, DictConfig
|
13 |
+
from torch.optim.optimizer import Optimizer, required
|
14 |
+
|
15 |
+
from . import FairseqOptimizer, register_optimizer
|
16 |
+
|
17 |
+
|
18 |
+
@dataclass
|
19 |
+
class FairseqNAGConfig(FairseqDataclass):
|
20 |
+
momentum: float = field(default=0.99, metadata={"help": "momentum factor"})
|
21 |
+
weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
|
22 |
+
# TODO common vars in parent class
|
23 |
+
lr: List[float] = II("optimization.lr")
|
24 |
+
|
25 |
+
|
26 |
+
@register_optimizer("nag", dataclass=FairseqNAGConfig)
|
27 |
+
class FairseqNAG(FairseqOptimizer):
|
28 |
+
def __init__(self, cfg: DictConfig, params):
|
29 |
+
super().__init__(cfg)
|
30 |
+
self._optimizer = NAG(params, **self.optimizer_config)
|
31 |
+
|
32 |
+
@property
|
33 |
+
def optimizer_config(self):
|
34 |
+
"""
|
35 |
+
Return a kwarg dictionary that will be used to override optimizer
|
36 |
+
args stored in checkpoints. This allows us to load a checkpoint and
|
37 |
+
resume training using a different set of optimizer args, e.g., with a
|
38 |
+
different learning rate.
|
39 |
+
"""
|
40 |
+
return {
|
41 |
+
"lr": self.cfg.lr[0]
|
42 |
+
if isinstance(self.cfg.lr, Collection)
|
43 |
+
else self.cfg.lr,
|
44 |
+
"momentum": self.cfg.momentum,
|
45 |
+
"weight_decay": self.cfg.weight_decay,
|
46 |
+
}
|
47 |
+
|
48 |
+
|
49 |
+
class NAG(Optimizer):
|
50 |
+
def __init__(self, params, lr=required, momentum=0, weight_decay=0):
|
51 |
+
defaults = dict(lr=lr, lr_old=lr, momentum=momentum, weight_decay=weight_decay)
|
52 |
+
super(NAG, self).__init__(params, defaults)
|
53 |
+
|
54 |
+
@property
|
55 |
+
def supports_memory_efficient_fp16(self):
|
56 |
+
return True
|
57 |
+
|
58 |
+
@property
|
59 |
+
def supports_flat_params(self):
|
60 |
+
return True
|
61 |
+
|
62 |
+
def step(self, closure=None):
|
63 |
+
"""Performs a single optimization step.
|
64 |
+
|
65 |
+
Args:
|
66 |
+
closure (callable, optional): A closure that reevaluates the model
|
67 |
+
and returns the loss.
|
68 |
+
"""
|
69 |
+
loss = None
|
70 |
+
if closure is not None:
|
71 |
+
loss = closure()
|
72 |
+
|
73 |
+
for group in self.param_groups:
|
74 |
+
weight_decay = group["weight_decay"]
|
75 |
+
momentum = group["momentum"]
|
76 |
+
lr = group["lr"]
|
77 |
+
lr_old = group.get("lr_old", lr)
|
78 |
+
lr_correct = lr / lr_old if lr_old > 0 else lr
|
79 |
+
|
80 |
+
for p in group["params"]:
|
81 |
+
if p.grad is None:
|
82 |
+
continue
|
83 |
+
|
84 |
+
p_data_fp32 = p.data
|
85 |
+
if p_data_fp32.dtype in {torch.float16, torch.bfloat16}:
|
86 |
+
p_data_fp32 = p_data_fp32.float()
|
87 |
+
|
88 |
+
d_p = p.grad.data.float()
|
89 |
+
param_state = self.state[p]
|
90 |
+
if "momentum_buffer" not in param_state:
|
91 |
+
param_state["momentum_buffer"] = torch.zeros_like(d_p)
|
92 |
+
else:
|
93 |
+
param_state["momentum_buffer"] = param_state["momentum_buffer"].to(
|
94 |
+
d_p
|
95 |
+
)
|
96 |
+
|
97 |
+
buf = param_state["momentum_buffer"]
|
98 |
+
|
99 |
+
if weight_decay != 0:
|
100 |
+
p_data_fp32.mul_(1 - lr * weight_decay)
|
101 |
+
p_data_fp32.add_(buf, alpha=momentum * momentum * lr_correct)
|
102 |
+
p_data_fp32.add_(d_p, alpha=-(1 + momentum) * lr)
|
103 |
+
|
104 |
+
buf.mul_(momentum * lr_correct).add_(d_p, alpha=-lr)
|
105 |
+
|
106 |
+
if p.data.dtype in {torch.float16, torch.bfloat16}:
|
107 |
+
p.data.copy_(p_data_fp32)
|
108 |
+
|
109 |
+
group["lr_old"] = lr
|
110 |
+
|
111 |
+
return loss
|
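A minimal usage sketch for the NAG class defined above, driven directly like any torch.optim optimizer. The toy model, data, and hyperparameters below are assumed for illustration and are not part of the diff.

# Sketch only: toy linear regression stepped with the NAG optimizer above.
import torch

model = torch.nn.Linear(4, 1)
opt = NAG(model.parameters(), lr=0.1, momentum=0.99, weight_decay=0.0)

x, y = torch.randn(8, 4), torch.randn(8, 1)
for _ in range(10):
    opt.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    opt.step()  # applies the Nesterov-style update, including the lr_correct rescaling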
fairseq/fairseq/optim/sgd.py
ADDED
@@ -0,0 +1,43 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch.optim

from . import LegacyFairseqOptimizer, register_optimizer


@register_optimizer("sgd")
class SGD(LegacyFairseqOptimizer):
    def __init__(self, args, params):
        super().__init__(args)
        self._optimizer = torch.optim.SGD(params, **self.optimizer_config)

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--momentum', default=0.0, type=float, metavar='M',
                            help='momentum factor')
        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                            help='weight decay')
        # fmt: on

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        return {
            "lr": self.args.lr[0],
            "momentum": self.args.momentum,
            "weight_decay": self.args.weight_decay,
        }

    @property
    def supports_flat_params(self):
        return True
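A brief sketch of how the add_args hook above wires optimizer flags into a parser. Inside fairseq this is done by the training entry point, so the bare argparse setup and the --lr flag below are only assumed for illustration.

# Illustrative only: standalone argparse usage of SGD.add_args.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--lr", nargs="+", type=float, default=[0.1])
SGD.add_args(parser)  # adds --momentum and --weight-decay as defined above

args = parser.parse_args(["--lr", "0.1", "--momentum", "0.9", "--weight-decay", "1e-4"])
print(args.momentum, args.weight_decay)  # 0.9 0.0001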
fairseq/fairseq/optim/shard.py
ADDED
@@ -0,0 +1,58 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import Any, Dict

from fairseq.distributed import utils


try:
    from fairscale.optim import OSS

    _has_fairscale = True
except ImportError:
    _has_fairscale = False


def shard_(optimizer, group):
    if not _has_fairscale:
        raise ImportError(
            "\n\nPlease install the fairscale package:" "\n\n  pip install fairscale"
        )

    class FairseqOSS(OSS):
        @property
        def disable_mem_eff_fp16_loading_hack(self):
            return True

        def __getattr__(self, name):
            if name.startswith("supports") and hasattr(self.optim, name):
                return getattr(self.optim, name)
            raise AttributeError(
                "'FairseqOSS' object has no attribute {0!r}".format(name)
            )

        def broadcast_global_state_dict(
            self, state_dict: Dict[str, Any]
        ) -> Dict[str, Any]:
            """
            Broadcasts the entire state_dict to all other ranks
            each rank is responsible to load their own partition of data
            """
            return utils.broadcast_object(
                state_dict,
                src_rank=0,
                group=self.group,
            )

    torch_optimizer = optimizer.optimizer
    optim_cls = type(torch_optimizer)

    optimizer.optimizer = FairseqOSS(
        torch_optimizer.param_groups,
        optim_cls,
        group=group,
        **optimizer.optimizer_config
    )
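A hedged sketch of how shard_ is meant to be called: it replaces the inner torch optimizer of an existing FairseqOptimizer wrapper with a fairscale OSS instance that shards optimizer state across a process group. The fairseq_optimizer and data_parallel_group names below are placeholders assumed to already exist in a running distributed job.

# Sketch only: rewrap an existing wrapper's torch optimizer for state sharding.
from fairseq.optim.shard import shard_

shard_(fairseq_optimizer, data_parallel_group)
# afterwards fairseq_optimizer.optimizer is a FairseqOSS instance whose state
# is partitioned across the ranks of data_parallel_group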
fairseq/fairseq/scoring/__init__.py
ADDED
@@ -0,0 +1,55 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import importlib
import os
from abc import ABC, abstractmethod

from fairseq import registry
from omegaconf import DictConfig


class BaseScorer(ABC):
    def __init__(self, cfg):
        self.cfg = cfg
        self.ref = []
        self.pred = []

    def add_string(self, ref, pred):
        self.ref.append(ref)
        self.pred.append(pred)

    @abstractmethod
    def score(self) -> float:
        pass

    @abstractmethod
    def result_string(self) -> str:
        pass


_build_scorer, register_scorer, SCORER_REGISTRY, _ = registry.setup_registry(
    "--scoring", default="bleu"
)


def build_scorer(choice, tgt_dict):
    _choice = choice._name if isinstance(choice, DictConfig) else choice

    if _choice == "bleu":
        from fairseq.scoring import bleu

        return bleu.Scorer(
            bleu.BleuConfig(pad=tgt_dict.pad(), eos=tgt_dict.eos(), unk=tgt_dict.unk())
        )
    return _build_scorer(choice)


# automatically import any Python files in the current directory
for file in sorted(os.listdir(os.path.dirname(__file__))):
    if file.endswith(".py") and not file.startswith("_"):
        module = file[: file.find(".py")]
        importlib.import_module("fairseq.scoring." + module)
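A short sketch of how a new scorer could plug into the BaseScorer/register_scorer contract defined above. The "dummy_accuracy" name and its exact-match logic are invented here purely for illustration and are not part of fairseq.

# Hypothetical example scorer, shown only to illustrate the registry contract.
from fairseq.scoring import BaseScorer, register_scorer


@register_scorer("dummy_accuracy")
class DummyAccuracyScorer(BaseScorer):
    def score(self) -> float:
        # fraction of predictions that exactly match their reference string
        matches = sum(r == p for r, p in zip(self.ref, self.pred))
        return matches / max(len(self.ref), 1)

    def result_string(self) -> str:
        return "Dummy accuracy: {:.4f}".format(self.score())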
fairseq/fairseq/scoring/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.86 kB). View file
fairseq/fairseq/scoring/__pycache__/bertscore.cpython-310.pyc
ADDED
Binary file (1.89 kB). View file
fairseq/fairseq/scoring/__pycache__/bleu.cpython-310.pyc
ADDED
Binary file (6.1 kB). View file
fairseq/fairseq/scoring/__pycache__/chrf.cpython-310.pyc
ADDED
Binary file (1.5 kB). View file