File size: 4,038 Bytes
9f13819 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import torch
from torch import nn
import torch.nn.functional as F
class Dense(nn.Module):
    """Dense (soft) gate: a bias-free linear projection followed by a softmax.

    Every expert receives a non-zero weight; weights along the last axis
    sum to 1.

    NOTE(review): the ``dim`` constructor argument is accepted but never
    used -- the projection's input width is pinned to 64. Confirm with the
    callers whether that is intentional before changing it.

    Args:
        dim: Declared input width (currently ignored, see note above).
        num_moe: Number of experts, i.e. the width of the gate output.
    """

    def __init__(self, dim: int, num_moe: int) -> None:
        super().__init__()
        self.num_moe = num_moe
        self.dim = 64  # fixed input width; the ``dim`` argument is not consulted
        self.linear_layer = nn.Linear(self.dim, self.num_moe, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the softmax-normalised expert weights for ``x``."""
        return self.softmax(self.linear_layer(x))
class topK(nn.Module):
    """Sparse gate that keeps only the ``topk`` most probable experts.

    The gate first computes a full softmax over the experts, keeps the
    ``topk`` largest probabilities per row, and then applies a second
    softmax over those kept probabilities. Experts that were not selected
    are filled with ``-inf`` before that second softmax, so they end up
    with a weight of exactly 0.

    NOTE(review): the ``dim`` constructor argument is accepted but never
    used -- the projection's input width is pinned to 64. Confirm with the
    callers whether that is intentional before changing it.

    Args:
        dim: Declared input width (currently ignored, see note above).
        num_moe: Number of experts, i.e. the width of the gate output.
    """

    def __init__(self, dim: int, num_moe: int) -> None:
        super().__init__()
        self.num_moe = num_moe
        self.dim = 64  # fixed input width; the ``dim`` argument is not consulted
        self.linear_layer = nn.Linear(self.dim, self.num_moe, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, topk=1):
        """Return expert weights that are non-zero only for the top-``topk`` experts.

        Args:
            x: Input whose last dimension matches the projection's input width.
            topk: How many experts to keep per row (default 1).
        """
        gate_probs = self.softmax(self.linear_layer(x))
        # Pick the k highest probabilities along the expert axis.
        kept_vals, kept_idx = gate_probs.topk(topk, dim=-1)
        # Start from an all -inf tensor shaped like the probabilities and
        # write the kept probabilities back at their original positions.
        sparse = torch.full_like(gate_probs, float("-inf")).scatter(
            -1, kept_idx, kept_vals
        )
        # The second softmax makes the kept entries sum to 1; -inf maps to 0.
        return self.softmax(sparse)
class MLP(nn.Module):
    """Two-layer perceptron gate: Linear -> GELU -> Linear -> softmax.

    NOTE(review): the ``dim`` constructor argument is accepted but never
    used -- the first layer's input width is pinned to 64. Confirm with the
    callers whether that is intentional before changing it.

    Args:
        dim: Declared input width (currently ignored, see note above).
        num_moe: Number of experts, i.e. the width of the gate output.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, dim: int, num_moe: int, hidden_dim: int = 128) -> None:
        super().__init__()
        self.num_moe = num_moe
        self.dim = 64  # fixed input width; the ``dim`` argument is not consulted
        # Multi-layer perceptron: projection, GELU non-linearity, projection.
        self.linear_layer1 = nn.Linear(self.dim, hidden_dim)
        self.activation = nn.GELU()
        self.linear_layer2 = nn.Linear(hidden_dim, self.num_moe)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the softmax-normalised expert weights for ``x``."""
        hidden = self.activation(self.linear_layer1(x))
        return self.softmax(self.linear_layer2(hidden))
class Noise(nn.Module):
    """Noisy gate: linear projection with Gaussian noise on the logits.

    Zero-mean Gaussian noise scaled by ``noise_std`` is added to the logits
    before the softmax. The noise is applied only while the module is in
    training mode; in ``eval()`` mode the gate is deterministic, which
    matches how the dropout-based gate in this file behaves at inference.

    NOTE(review): the ``dim`` constructor argument is accepted but never
    used -- the projection's input width is pinned to 64. Confirm with the
    callers whether that is intentional before changing it.

    Args:
        dim: Declared input width (currently ignored, see note above).
        num_moe: Number of experts, i.e. the width of the gate output.
        noise_std: Standard deviation of the noise added to the logits.
    """

    def __init__(self, dim: int, num_moe: int, noise_std: float = 0.1) -> None:
        super().__init__()
        self.dim = 64  # fixed input width; the ``dim`` argument is not consulted
        self.num_moe = num_moe
        self.noise_std = noise_std
        self.linear_layer = nn.Linear(self.dim, num_moe, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the softmax-normalised expert weights for ``x``."""
        logits = self.linear_layer(x)
        # Perturb the logits only during training so that inference is
        # reproducible for a given input.
        if self.training:
            logits = logits + torch.randn_like(logits) * self.noise_std
        return self.softmax(logits)
class MLP_noise(nn.Module):
    """Noisy MLP gate: Linear -> ReLU -> Linear, with Gaussian noise on the logits.

    Both linear layers are bias-free. Zero-mean Gaussian noise scaled by
    ``noise_std`` is added to the logits before the softmax. The noise is
    applied only while the module is in training mode; in ``eval()`` mode
    the gate is deterministic, which matches how the dropout-based gate in
    this file behaves at inference.

    NOTE(review): the ``dim`` constructor argument is accepted but never
    used -- the first layer's input width is pinned to 64. Confirm with the
    callers whether that is intentional before changing it.

    Args:
        dim: Declared input width (currently ignored, see note above).
        num_moe: Number of experts, i.e. the width of the gate output.
        hidden_dim: Width of the hidden layer.
        noise_std: Standard deviation of the noise added to the logits.
    """

    def __init__(self, dim: int, num_moe: int, hidden_dim: int = 128, noise_std: float = 0.1) -> None:
        super().__init__()
        self.dim = 64  # fixed input width; the ``dim`` argument is not consulted
        self.num_moe = num_moe
        self.noise_std = noise_std
        self.linear1 = nn.Linear(self.dim, hidden_dim, bias=False)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, num_moe, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the softmax-normalised expert weights for ``x``."""
        hidden = self.relu(self.linear1(x))
        logits = self.linear2(hidden)
        # Perturb the logits only during training so that inference is
        # reproducible for a given input.
        if self.training:
            logits = logits + torch.randn_like(logits) * self.noise_std
        return self.softmax(logits)
class Drop(nn.Module):
    """Dropout gate: linear projection, dropout on the logits, then softmax.

    Dropout is applied to the logits through ``nn.Dropout``, so it is only
    active while the module is in training mode.

    NOTE(review): the ``dim`` constructor argument is accepted but never
    used -- the projection's input width is pinned to 64. Confirm with the
    callers whether that is intentional before changing it.

    Args:
        dim: Declared input width (currently ignored, see note above).
        num_moe: Number of experts, i.e. the width of the gate output.
        dropout_rate: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, dim: int, num_moe: int, dropout_rate: float = 0.1) -> None:
        super().__init__()
        self.num_moe = num_moe
        self.dim = 64  # fixed input width; the ``dim`` argument is not consulted
        self.linear_layer = nn.Linear(self.dim, self.num_moe, bias=False)
        self.dropout = nn.Dropout(dropout_rate)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the softmax-normalised expert weights for ``x``."""
        # Dropout acts on the raw logits, before normalisation.
        dropped_logits = self.dropout(self.linear_layer(x))
        return self.softmax(dropped_logits)
# Registry mapping a gating-strategy name to the module class implementing it.
GATING_TO_MODEL_MAPPING = {
    "Dense": Dense,
    "topK": topK,
    "MLP": MLP,
    "Drop": Drop,
    "MLP_noise": MLP_noise,
    "Noise": Noise,
}