'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import torch.nn.init as init
import torch
from torch import nn
import torch.distributed as dist
from .gate import TopKGate
import copy
import typing
from .experts import FusedExperts as Experts


class TaskMoE(torch.nn.Module):
    def __init__(self,
                 hidden_size,
                 expert,
                 num_experts=1,
                 k=1,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 noisy_gate_policy: typing.Optional[str] = None,
                 drop_tokens: bool = True,
                 use_rts=True,
                 use_tutel: bool = False,
                 cfg=None):
        """Initialize an MoE layer.

        Arguments:
            hidden_size (int): the hidden dimension of the model; importantly, this is also the input and output dimension.
            expert (torch.nn.Module): the torch module that defines the expert (e.g., an MLP or torch.nn.Linear).
            num_experts (int, optional): default=1, the total number of experts per layer.
            k (int, optional): default=1, top-k gating value; only k=1 and k=2 are supported.
            capacity_factor (float, optional): default=1.0, the capacity of each expert at training time.
            eval_capacity_factor (float, optional): default=1.0, the capacity of each expert at eval time.
            min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
            noisy_gate_policy (str, optional): default=None, noisy gate policy; valid options are 'Jitter', 'RSample', or 'None'.
            drop_tokens (bool, optional): default=True, whether to drop tokens (setting this to False is equivalent to infinite capacity).
            use_rts (bool, optional): default=True, whether to use Random Token Selection.
            use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
            cfg (optional): default=None, configuration object forwarded to the experts and the gate.
        """
        super().__init__()
        self.num_experts = num_experts

        # Record the expert type; the gate needs to know whether the experts
        # are linear or attention modules.
        if isinstance(expert, nn.Linear):
            self.expert_type = 'linear'
        elif isinstance(expert, nn.MultiheadAttention):
            self.expert_type = 'attention'
        else:
            raise NotImplementedError(
                f'unsupported expert type {type(expert)}; expected nn.Linear or nn.MultiheadAttention')

        experts = Experts(expert, cfg, num_experts)

        self.gate = TopKGate(hidden_size,
                             num_experts,
                             k,
                             noisy_gate_policy,
                             cfg,
                             moe_type=self.expert_type)
        self.experts = experts

    def forward(self, hidden_states, gate_decision=None, **kwargs):
        """MoE forward.

        Arguments:
            hidden_states (Tensor): input to the layer.
            gate_decision (optional): a previously computed [top_indices, gates] pair;
                when provided, the gate is skipped and this routing decision is reused.

        Returns:
            A tuple of:
            * output (Tensor): output of the expert computation.
            * gate_decision (list): the [top_indices, gates] pair that was used,
              which can be passed to another layer to share the same routing.
        """
        if gate_decision is not None:
            # Reuse a routing decision computed by an earlier layer.
            top_indices, gates = gate_decision
        else:
            top_indices, gates = self.gate(hidden_states, **kwargs)

        expert_output = self.experts(hidden_states, top_indices, gates, **kwargs)

        return expert_output, [top_indices, gates]
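

# Example usage (a minimal, hypothetical sketch): a TaskMoE layer wrapping
# nn.Linear experts. The shapes, hidden_size, and cfg=None below are
# illustrative assumptions; TopKGate and FusedExperts may require a populated
# cfg in practice.
#
#   expert = nn.Linear(16, 16)
#   moe = TaskMoE(hidden_size=16, expert=expert, num_experts=4, k=1, cfg=None)
#   x = torch.randn(8, 16)                         # (tokens, hidden_size)
#   out, gate_decision = moe(x)                    # route with this layer's gate
#   out2, _ = moe(x, gate_decision=gate_decision)  # reuse the same routing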