File size: 4,038 Bytes
9f13819
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import torch
from torch import nn
import torch.nn.functional as F

class Dense(nn.Module):
    """Dense (softmax) gating network.

    Maps an input of shape ``(..., dim)`` to a probability distribution
    over ``num_moe`` experts; each output row sums to 1.

    Args:
        dim: Input feature dimension.
        num_moe: Number of experts to gate over.
    """

    def __init__(self, dim: int, num_moe: int) -> None:
        super().__init__()
        # Bug fix: was hard-coded to 64, silently ignoring the `dim` argument
        # and breaking any caller with dim != 64.
        self.dim = dim
        self.num_moe = num_moe
        self.linear_layer = nn.Linear(self.dim, num_moe, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return expert probabilities of shape ``(..., num_moe)``."""
        logits = self.linear_layer(x)
        probs = self.softmax(logits)
        return probs

class topK(nn.Module):
    """Top-k gating network.

    Computes a softmax over experts, keeps only the ``topk`` largest
    probabilities per row, and renormalizes so the surviving weights
    sum to 1. Non-selected experts receive exactly zero weight.

    Args:
        dim: Input feature dimension.
        num_moe: Number of experts to gate over.
    """

    def __init__(self, dim: int, num_moe: int) -> None:
        super().__init__()
        # Bug fix: was hard-coded to 64, silently ignoring the `dim` argument.
        self.dim = dim
        self.num_moe = num_moe
        self.linear_layer = nn.Linear(self.dim, num_moe, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, topk=1):
        """Return sparse expert weights of shape ``(..., num_moe)``.

        Exactly ``topk`` entries per row are nonzero and they sum to 1.
        """
        logits = self.linear_layer(x)
        probs = self.softmax(logits)
        # Select the k highest probabilities per row.
        topk_values, topk_indices = torch.topk(probs, k=topk, dim=-1)
        # Start from -inf so every non-selected expert gets exp(-inf) == 0
        # after the renormalizing softmax below.
        masked = torch.full_like(probs, float('-inf'))
        masked.scatter_(-1, topk_indices, topk_values)
        # Renormalize so the kept top-k weights sum to 1.
        return self.softmax(masked)

class MLP(nn.Module):
    """Two-layer MLP gating network with GELU activation.

    Args:
        dim: Input feature dimension.
        num_moe: Number of experts to gate over.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, dim: int, num_moe: int, hidden_dim: int = 128) -> None:
        super().__init__()
        # Bug fix: was hard-coded to 64, silently ignoring the `dim` argument.
        self.dim = dim
        self.num_moe = num_moe
        # Two-layer perceptron: dim -> hidden_dim -> num_moe.
        self.linear_layer1 = nn.Linear(self.dim, hidden_dim)
        self.activation = nn.GELU()
        self.linear_layer2 = nn.Linear(hidden_dim, self.num_moe)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return expert probabilities of shape ``(..., num_moe)``."""
        x = self.linear_layer1(x)
        x = self.activation(x)
        logits = self.linear_layer2(x)
        probs = self.softmax(logits)
        return probs
    
    
class Noise(nn.Module):
    """Noisy gating network: adds Gaussian noise to the gating logits.

    Args:
        dim: Input feature dimension.
        num_moe: Number of experts to gate over.
        noise_std: Standard deviation of the added Gaussian noise.
    """

    def __init__(self, dim: int, num_moe: int, noise_std: float = 0.1) -> None:
        super().__init__()
        # Bug fix: was hard-coded to 64, silently ignoring the `dim` argument.
        self.dim = dim
        self.num_moe = num_moe
        self.noise_std = noise_std
        self.linear_layer = nn.Linear(self.dim, num_moe, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return expert probabilities of shape ``(..., num_moe)``."""
        logits = self.linear_layer(x)

        # Perturb logits with zero-mean Gaussian noise (scaled by noise_std).
        # NOTE(review): noise is applied in eval mode too; if noise is meant
        # only as a training-time exploration aid, gate on `self.training`.
        noise = torch.randn_like(logits) * self.noise_std
        logits = logits + noise

        probs = self.softmax(logits)
        return probs
    
class MLP_noise(nn.Module):
    """Two-layer MLP gating network (ReLU) with Gaussian noise on the logits.

    Args:
        dim: Input feature dimension.
        num_moe: Number of experts to gate over.
        hidden_dim: Width of the hidden layer.
        noise_std: Standard deviation of the added Gaussian noise.
    """

    def __init__(self, dim: int, num_moe: int, hidden_dim: int = 128, noise_std: float = 0.1) -> None:
        super().__init__()
        # Bug fix: was hard-coded to 64, silently ignoring the `dim` argument.
        self.dim = dim
        self.num_moe = num_moe
        self.noise_std = noise_std
        self.linear1 = nn.Linear(self.dim, hidden_dim, bias=False)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, num_moe, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return expert probabilities of shape ``(..., num_moe)``."""
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        logits = self.linear2(hidden)

        # Perturb logits with zero-mean Gaussian noise (scaled by noise_std).
        # NOTE(review): applied in eval mode too — gate on `self.training`
        # if noise is intended only during training.
        noise = torch.randn_like(logits) * self.noise_std
        logits = logits + noise

        probs = self.softmax(logits)
        return probs

    
class Drop(nn.Module):
    """Gating network with Dropout applied to the logits (training only).

    Args:
        dim: Input feature dimension.
        num_moe: Number of experts to gate over.
        dropout_rate: Probability of zeroing each logit during training.
    """

    def __init__(self, dim: int, num_moe: int, dropout_rate: float = 0.1) -> None:
        super().__init__()
        # Bug fix: was hard-coded to 64, silently ignoring the `dim` argument.
        self.dim = dim
        self.num_moe = num_moe
        self.linear_layer = nn.Linear(self.dim, num_moe, bias=False)
        self.dropout = nn.Dropout(dropout_rate)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return expert probabilities of shape ``(..., num_moe)``."""
        logits = self.linear_layer(x)
        # Dropout on pre-softmax logits (no-op in eval mode).
        # NOTE(review): a dropped logit becomes 0, which still yields a
        # nonzero softmax weight — this regularizes rather than removes
        # experts; confirm that is the intent.
        logits = self.dropout(logits)
        probs = self.softmax(logits)
        return probs
    
# Registry mapping a gating-strategy name (as used in configs) to its
# nn.Module implementation above.
GATING_TO_MODEL_MAPPING = {
    "Dense": Dense,
    "topK": topK,
    "MLP": MLP,
    "Drop": Drop,
    "MLP_noise": MLP_noise,
    "Noise": Noise,
}