'''
Copyright 2020 The Microsoft DeepSpeed Team
'''

import copy
import typing

import torch
import torch.distributed as dist
import torch.nn.init as init
from torch import nn

from .experts import FusedExperts as Experts
from .gate import TopKGate


class TaskMoE(torch.nn.Module):
    def __init__(self,
                 hidden_size,
                 expert,
                 num_experts=1,
                 k=1,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 noisy_gate_policy: typing.Optional[str] = None,
                 drop_tokens: bool = True,
                 use_rts=True,
                 use_tutel: bool = False,
                 cfg=None):
        """Initialize an MoE layer.

        Arguments:
            hidden_size (int): the hidden dimension of the model; this is also the input and output dimension.

            expert (torch.nn.Module): the torch module that defines the expert; must be an instance of nn.Linear or nn.MultiheadAttention.

            num_experts (int, optional): default=1, the total number of experts per layer.

            k (int, optional): default=1, top-k gating value; only k=1 and k=2 are supported.

            capacity_factor (float, optional): default=1.0, the capacity of each expert at training time.

            eval_capacity_factor (float, optional): default=1.0, the capacity of each expert at eval time.

            min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.

            noisy_gate_policy (str, optional): default=None, noisy gate policy; valid options are 'Jitter', 'RSample', or 'None'.

            drop_tokens (bool, optional): default=True, whether to drop tokens (setting this to False is equivalent to infinite capacity).

            use_rts (bool, optional): default=True, whether to use Random Token Selection.

            use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).

            cfg (optional): default=None, configuration object passed through to the fused experts and the gate.
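
        Example (a minimal usage sketch, not taken from the original codebase; `my_moe_cfg` is a
        hypothetical placeholder for whatever config object your FusedExperts/TopKGate expect)::

            import torch
            from torch import nn

            # A single nn.Linear prototype expert; the layer builds `num_experts` experts of this type.
            moe = TaskMoE(hidden_size=512,
                          expert=nn.Linear(512, 512),
                          num_experts=8,
                          k=2,
                          noisy_gate_policy='RSample',
                          cfg=my_moe_cfg)

            tokens = torch.randn(4, 16, 512)   # (batch, sequence, hidden)
            output, gate_decision = moe(tokens)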
        """
        super().__init__()

        self.num_experts = num_experts

        # The fused expert implementation supports only linear and attention experts.
        if isinstance(expert, nn.Linear):
            self.expert_type = 'linear'
        elif isinstance(expert, nn.MultiheadAttention):
            self.expert_type = 'attention'
        else:
            raise NotImplementedError('unsupported expert type: expected nn.Linear or nn.MultiheadAttention')

        # Build the fused expert bank from the prototype expert.
        experts = Experts(expert, cfg, num_experts)

        # Top-k gate that routes each token to k of the num_experts experts.
        self.gate = TopKGate(hidden_size,
                             num_experts,
                             k,
                             noisy_gate_policy,
                             cfg,
                             moe_type=self.expert_type)
        self.experts = experts

    def forward(self, hidden_states, gate_decision=None, **kwargs):
        """MoE forward.

        Arguments:
            hidden_states (Tensor): input to the layer.

            gate_decision (optional): a previously computed [top_indices, gates] pair; when provided,
                this layer's gate is skipped and the routing decision is reused (see the example below).

        Returns:
            A tuple consisting of:

            * output (Tensor): output of the expert layer.

            * gate_decision (list): the [top_indices, gates] pair used for routing, which can be passed
              to another TaskMoE layer to share the same routing decision.
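
        Example (a minimal sketch; `moe_a` and `moe_b` are assumed to be two TaskMoE layers built
        with the same number of experts)::

            out_a, gate_decision = moe_a(hidden_states)
            # Reuse the routing computed by the first layer, skipping moe_b's gate entirely.
            out_b, _ = moe_b(out_a, gate_decision=gate_decision)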
        """
        # Reuse a routing decision from an earlier layer when one is provided;
        # otherwise compute this layer's own top-k gate decision.
        if gate_decision is not None:
            top_indices, gates = gate_decision
        else:
            top_indices, gates = self.gate(hidden_states, **kwargs)

        expert_output = self.experts(hidden_states, top_indices, gates, **kwargs)

        return expert_output, [top_indices, gates]