# eP-ALM/models/adapters.py
import torch
import torch.nn as nn
from torchtyping import TensorType


# same as https://github.com/Aleph-Alpha/magma/blob/master/magma/adapters.py
class Adapter(nn.Module):
    """Sequential bottleneck adapter: optional LayerNorm, down-projection,
    activation, up-projection, with a residual connection around the block."""

    def __init__(
        self,
        dim: int,
        downsample_factor: int = 4,
        activation: nn.Module = nn.ReLU,
        add_layernorm: bool = False,
    ):
        super().__init__()
        layers = []
        if add_layernorm:
            layers.append(nn.LayerNorm(dim))
        layers.extend(
            [
                nn.Linear(dim, dim // downsample_factor),
                activation(),
                nn.Linear(dim // downsample_factor, dim),
            ]
        )
        self.adapter = nn.Sequential(*layers)
        self.adapter.apply(self.init_weights)

    def init_weights(self, m: nn.Module, std=1e-3):
        # near-zero, clamped normal init so the adapter starts close to an identity mapping
        if isinstance(m, nn.Linear):
            torch.nn.init.normal_(m.weight, std=std)
            torch.nn.init.normal_(m.bias, std=std)
            m.weight.data = torch.clamp(m.weight.data, min=-2 * std, max=2 * std)
            m.bias.data = torch.clamp(m.bias.data, min=-2 * std, max=2 * std)
        elif isinstance(m, nn.LayerNorm):
            m.bias.data.zero_()
            m.weight.data.fill_(1.0)

    def forward(self, x: TensorType["b", "s", "d"]) -> TensorType["b", "s", "d"]:
        return self.adapter(x) + x
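

# Hedged usage sketch (not part of the original file): the adapter maps a
# (batch, seq, dim) hidden-state tensor to the same shape, so it can be chained
# after any sub-layer with that output shape. The sizes below are illustrative
# assumptions, not values taken from eP-ALM.
def _demo_adapter():  # pragma: no cover
    adapter = Adapter(dim=768, downsample_factor=4, add_layernorm=True)
    hidden = torch.randn(2, 16, 768)  # (batch, seq, dim)
    out = adapter(hidden)  # bottleneck output + residual, same shape as input
    assert out.shape == hidden.shape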


class ParallelAdapter(Adapter):
    """Adapter run in parallel with a wrapped module; the two branch outputs are
    summed, optionally with a learned scalar gate on the adapter branch."""

    def __init__(
        self,
        module: nn.Module,
        dim: int,
        downsample_factor: int = 4,
        scaled: bool = False,
        add_layernorm: bool = False,
        activation: nn.Module = nn.ReLU,
    ):
        super().__init__(
            dim, downsample_factor, add_layernorm=add_layernorm, activation=activation
        )
        self.module = module

        if scaled:
            # init scaling param
            self.adapter_scale = nn.Parameter(torch.ones(1))
        else:
            self.adapter_scale = 1

    def forward(self, x: TensorType["b", "s", "d"], **module_kwargs):
        y = self.module(x, **module_kwargs)
        z = self.adapter(x)
        return y + (z * self.adapter_scale)
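

# Hedged usage sketch (not part of the original file): ParallelAdapter runs the
# bottleneck adapter alongside a wrapped module and sums the two branches. The
# nn.Linear below is a stand-in for a transformer feed-forward sub-layer; all
# sizes are illustrative assumptions.
def _demo_parallel_adapter():  # pragma: no cover
    ff = nn.Linear(768, 768)  # stand-in for the wrapped sub-layer
    block = ParallelAdapter(module=ff, dim=768, downsample_factor=4, scaled=True)
    hidden = torch.randn(2, 16, 768)
    out = block(hidden)  # ff(hidden) + adapter(hidden) * adapter_scale
    assert out.shape == hidden.shape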


class ParallelAdapterWrapper(ParallelAdapter):
    # used to add an adapter in parallel with an attention block that returns a tuple of outputs
    def __init__(
        self,
        module: nn.Module,
        dim: int,
        downsample_factor: int = 4,
        scaled: bool = False,
        add_layernorm: bool = False,
        activation: nn.Module = nn.ReLU,
    ):
        super().__init__(
            module, dim, downsample_factor, scaled, add_layernorm, activation
        )

    def forward(self, x: TensorType["b", "s", "d"], *attn_args, **attn_kwargs):
        attn_outputs = self.module(x, *attn_args, **attn_kwargs)
        attn_output, outputs = (
            attn_outputs[0],
            attn_outputs[1:],
        )  # output_attn: a, present, (attentions)
        hidden_states = attn_output + (self.adapter(x) * self.adapter_scale)
        return (hidden_states,) + outputs
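

# Hedged usage sketch (not part of the original file): ParallelAdapterWrapper
# expects the wrapped module to return a tuple whose first element is the hidden
# states (HF-style attention output). _ToyAttention is a hypothetical stand-in
# used only for illustration.
def _demo_parallel_adapter_wrapper():  # pragma: no cover
    class _ToyAttention(nn.Module):
        def forward(self, x, **kwargs):
            return (x, None)  # (attn_output, present)

    wrapped = ParallelAdapterWrapper(module=_ToyAttention(), dim=768, scaled=True)
    hidden = torch.randn(2, 16, 768)
    out = wrapped(hidden)
    assert out[0].shape == hidden.shape  # adapter branch added to attn_output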


class AdapterWrapper(Adapter):
    # used to add a sequential adapter after the attention block
    def __init__(
        self,
        attn_block: nn.Module,
        dim: int,
        downsample_factor: int = 4,
        activation: nn.Module = nn.ReLU,
        add_layernorm: bool = False,
    ):
        super().__init__(dim, downsample_factor, activation, add_layernorm)
        self.attn_block = attn_block

    def forward(self, x: TensorType["b", "s", "d"] = None, *attn_args, **attn_kwargs):
        if x is None:
            attn_outputs = self.attn_block(*attn_args, **attn_kwargs)
        else:
            attn_outputs = self.attn_block(x, *attn_args, **attn_kwargs)
        attn_output, outputs = (
            attn_outputs[0],
            attn_outputs[1:],
        )  # output_attn: a, present, (attentions)
        hidden_states = self.adapter(attn_output) + attn_output
        return (hidden_states,) + outputs
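

# Hedged usage sketch (not part of the original file): AdapterWrapper applies the
# bottleneck adapter sequentially to the attention output (with a residual), then
# forwards the remaining tuple elements unchanged. _ToyAttention is a hypothetical
# stand-in for an HF-style attention block.
def _demo_adapter_wrapper():  # pragma: no cover
    class _ToyAttention(nn.Module):
        def forward(self, x, **kwargs):
            return (x, None)  # (attn_output, present)

    wrapped = AdapterWrapper(attn_block=_ToyAttention(), dim=768, downsample_factor=4)
    hidden = torch.randn(2, 16, 768)
    out = wrapped(hidden)
    assert out[0].shape == hidden.shape  # adapted hidden states keep the input shape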