Superxixixi committed on
Commit 182c3ce · 1 Parent(s): 29ac319

Delete model

model/.DS_Store DELETED
Binary file (6.15 kB)
 
model/__init__.py DELETED
@@ -1,5 +0,0 @@
- from model.transformer.position_encoding import PositionalEncoding
- from model.transformer.transformer import Transformer
- from model.transformer.transformer import TransformerEncoder, TransformerEncoderLayer
- from model.transformer.transformer import TransformerDecoder, TransformerDecoderLayer
- from model.transformer.utils import layer_norm, generate_square_subsequent_mask, generate_proposal_mask
 
model/__pycache__/__init__.cpython-36.pyc DELETED
Binary file (561 Bytes)
 
model/__pycache__/__init__.cpython-37.pyc DELETED
Binary file (573 Bytes)
 
model/__pycache__/attentionLayer.cpython-37.pyc DELETED
Binary file (1.38 kB)
 
model/__pycache__/convLayer.cpython-37.pyc DELETED
Binary file (1.32 kB)
 
model/__pycache__/loconet_encoder.cpython-37.pyc DELETED
Binary file (3.21 kB)
 
model/__pycache__/position_encoding.cpython-36.pyc DELETED
Binary file (1.26 kB)
 
model/__pycache__/talkNetModel.cpython-37.pyc DELETED
Binary file (6.33 kB)
 
model/__pycache__/transformer.cpython-36.pyc DELETED
Binary file (8.84 kB)
 
model/__pycache__/utils.cpython-36.pyc DELETED
Binary file (1.08 kB)
 
model/__pycache__/visualEncoder.cpython-37.pyc DELETED
Binary file (6.53 kB)
 
model/attentionLayer.py DELETED
@@ -1,39 +0,0 @@
- import torch
- import torch.nn as nn
- from torch.nn import functional as F
- from torch.nn import MultiheadAttention
-
-
- class attentionLayer(nn.Module):
-
-     def __init__(self, d_model, nhead, dropout=0.1):
-         super(attentionLayer, self).__init__()
-         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-
-         self.linear1 = nn.Linear(d_model, d_model * 4)
-         self.dropout = nn.Dropout(dropout)
-         self.linear2 = nn.Linear(d_model * 4, d_model)
-
-         self.norm1 = nn.LayerNorm(d_model)
-         self.norm2 = nn.LayerNorm(d_model)
-         self.dropout1 = nn.Dropout(dropout)
-         self.dropout2 = nn.Dropout(dropout)
-
-         self.activation = F.relu
-
-     def forward(self, src, tar, adjust=False, attn_mask=None):
-         # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor
-         src = src.transpose(0, 1) # B, T, C -> T, B, C
-         tar = tar.transpose(0, 1) # B, T, C -> T, B, C
-         if adjust:
-             src2 = self.self_attn(src, tar, tar, attn_mask=None, key_padding_mask=None)[0]
-         else:
-             src2 = self.self_attn(tar, src, src, attn_mask=None, key_padding_mask=None)[0]
-         src = src + self.dropout1(src2)
-         src = self.norm1(src)
-
-         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-         src = src + self.dropout2(src2)
-         src = self.norm2(src)
-         src = src.transpose(0, 1) # T, B, C -> B, T, C
-         return src
 
model/audioEncoder.py DELETED
@@ -1,108 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
- class SEBasicBlock(nn.Module):
-     expansion = 1
-
-     def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
-         super(SEBasicBlock, self).__init__()
-         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-         self.bn1 = nn.BatchNorm2d(planes)
-         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
-         self.bn2 = nn.BatchNorm2d(planes)
-         self.relu = nn.ReLU(inplace=True)
-         self.se = SELayer(planes, reduction)
-         self.downsample = downsample
-         self.stride = stride
-
-     def forward(self, x):
-         residual = x
-
-         out = self.conv1(x)
-         out = self.relu(out)
-         out = self.bn1(out)
-
-         out = self.conv2(out)
-         out = self.bn2(out)
-         out = self.se(out)
-
-         if self.downsample is not None:
-             residual = self.downsample(x)
-
-         out += residual
-         out = self.relu(out)
-         return out
-
- class SELayer(nn.Module):
-     def __init__(self, channel, reduction=8):
-         super(SELayer, self).__init__()
-         self.avg_pool = nn.AdaptiveAvgPool2d(1)
-         self.fc = nn.Sequential(
-             nn.Linear(channel, channel // reduction),
-             nn.ReLU(inplace=True),
-             nn.Linear(channel // reduction, channel),
-             nn.Sigmoid()
-         )
-
-     def forward(self, x):
-         b, c, _, _ = x.size()
-         y = self.avg_pool(x).view(b, c)
-         y = self.fc(y).view(b, c, 1, 1)
-         return x * y
-
- class audioEncoder(nn.Module):
-     def __init__(self, layers, num_filters, **kwargs):
-         super(audioEncoder, self).__init__()
-         block = SEBasicBlock
-         self.inplanes = num_filters[0]
-
-         self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=7, stride=(2, 1), padding=3,
-                                bias=False)
-         self.bn1 = nn.BatchNorm2d(num_filters[0])
-         self.relu = nn.ReLU(inplace=True)
-
-         self.layer1 = self._make_layer(block, num_filters[0], layers[0])
-         self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2))
-         self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2))
-         self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1))
-         out_dim = num_filters[3] * block.expansion
-
-         for m in self.modules():
-             if isinstance(m, nn.Conv2d):
-                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-             elif isinstance(m, nn.BatchNorm2d):
-                 nn.init.constant_(m.weight, 1)
-                 nn.init.constant_(m.bias, 0)
-
-     def _make_layer(self, block, planes, blocks, stride=1):
-         downsample = None
-         if stride != 1 or self.inplanes != planes * block.expansion:
-             downsample = nn.Sequential(
-                 nn.Conv2d(self.inplanes, planes * block.expansion,
-                           kernel_size=1, stride=stride, bias=False),
-                 nn.BatchNorm2d(planes * block.expansion),
-             )
-
-         layers = []
-         layers.append(block(self.inplanes, planes, stride, downsample))
-         self.inplanes = planes * block.expansion
-         for i in range(1, blocks):
-             layers.append(block(self.inplanes, planes))
-
-         return nn.Sequential(*layers)
-
-     def forward(self, x):
-         x = self.conv1(x)
-         x = self.bn1(x)
-         x = self.relu(x)
-
-         x = self.layer1(x)
-         x = self.layer2(x)
-         x = self.layer3(x)
-         x = self.layer4(x)
-         x = torch.mean(x, dim=2, keepdim=True)
-         x = x.view((x.size()[0], x.size()[1], -1))
-         x = x.transpose(1, 2)
-
-         return x
 
model/convLayer.py DELETED
@@ -1,42 +0,0 @@
- import torch
- import torch.nn as nn
- from torch.nn import functional as F
-
-
- class ConvLayer(nn.Module):
-
-     def __init__(self, cfg):
-         super(ConvLayer, self).__init__()
-         self.cfg = cfg
-         self.s = cfg.MODEL.NUM_SPEAKERS
-         self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (self.s, 7), padding=(0, 3))
-         # below line is speaker parallel 93.88 code
-         # self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (3, 7), padding=(0, 3))
-         self.ln = torch.nn.LayerNorm(256)
-         self.conv2d_1x1 = torch.nn.Conv2d(256, 512, (1, 1), padding=(0, 0))
-         self.conv2d_1x1_2 = torch.nn.Conv2d(512, 256, (1, 1), padding=(0, 0))
-         self.gelu = nn.GELU()
-
-     def forward(self, x, b, s):
-
-         identity = x # b*s, t, c
-         t = x.shape[1]
-         c = x.shape[2]
-         out = x.view(b, s, t, c)
-         out = out.permute(0, 3, 1, 2) # b, c, s, t
-
-         out = self.conv2d(out) # b, s*c, 1, t
-         out = out.view(b, c, s, t)
-         out = out.permute(0, 2, 3, 1) # b, s, t, c
-         out = self.ln(out)
-         out = out.permute(0, 3, 1, 2)
-         out = self.conv2d_1x1(out)
-         out = self.gelu(out)
-         out = self.conv2d_1x1_2(out) # b, c, s, t
-
-         out = out.permute(0, 2, 3, 1) # b, s, t, c
-         out = out.view(b * s, t, c)
-
-         out += identity
-
-         return out, b, s
 
model/faceDetector/README.md DELETED
@@ -1,3 +0,0 @@
- # Face detector
-
- This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.
 
model/faceDetector/__init__.py DELETED
@@ -1 +0,0 @@
- from .s3fd import S3FD
 
model/faceDetector/s3fd/__init__.py DELETED
@@ -1,66 +0,0 @@
- import time, os, sys, subprocess
- import numpy as np
- import cv2
- import torch
- from torchvision import transforms
- from .nets import S3FDNet
- from .box_utils import nms_
-
- PATH_WEIGHT = 'model/faceDetector/s3fd/sfd_face.pth'
- if os.path.isfile(PATH_WEIGHT) == False:
-     Link = "1KafnHz7ccT-3IyddBsL5yi2xGtxAKypt"
-     cmd = "gdown --id %s -O %s"%(Link, PATH_WEIGHT)
-     subprocess.call(cmd, shell=True, stdout=None)
- img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
-
-
- class S3FD():
-
-     def __init__(self, device='cuda'):
-
-         tstamp = time.time()
-         self.device = device
-
-         # print('[S3FD] loading with', self.device)
-         self.net = S3FDNet(device=self.device).to(self.device)
-         PATH = os.path.join(os.getcwd(), PATH_WEIGHT)
-         state_dict = torch.load(PATH, map_location=self.device)
-         self.net.load_state_dict(state_dict)
-         self.net.eval()
-         # print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))
-
-     def detect_faces(self, image, conf_th=0.8, scales=[1]):
-
-         w, h = image.shape[1], image.shape[0]
-
-         bboxes = np.empty(shape=(0, 5))
-
-         with torch.no_grad():
-             for s in scales:
-                 scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)
-
-                 scaled_img = np.swapaxes(scaled_img, 1, 2)
-                 scaled_img = np.swapaxes(scaled_img, 1, 0)
-                 scaled_img = scaled_img[[2, 1, 0], :, :]
-                 scaled_img = scaled_img.astype('float32')
-                 scaled_img -= img_mean
-                 scaled_img = scaled_img[[2, 1, 0], :, :]
-                 x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
-                 y = self.net(x)
-
-                 detections = y.data
-                 scale = torch.Tensor([w, h, w, h])
-
-                 for i in range(detections.size(1)):
-                     j = 0
-                     while detections[0, i, j, 0] > conf_th:
-                         score = detections[0, i, j, 0]
-                         pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
-                         bbox = (pt[0], pt[1], pt[2], pt[3], score)
-                         bboxes = np.vstack((bboxes, bbox))
-                         j += 1
-
-         keep = nms_(bboxes, 0.1)
-         bboxes = bboxes[keep]
-
-         return bboxes
 
model/faceDetector/s3fd/box_utils.py DELETED
@@ -1,217 +0,0 @@
- import numpy as np
- from itertools import product as product
- import torch
- from torch.autograd import Function
-
-
- def nms_(dets, thresh):
-     """
-     Courtesy of Ross Girshick
-     [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]
-     """
-     x1 = dets[:, 0]
-     y1 = dets[:, 1]
-     x2 = dets[:, 2]
-     y2 = dets[:, 3]
-     scores = dets[:, 4]
-
-     areas = (x2 - x1) * (y2 - y1)
-     order = scores.argsort()[::-1]
-
-     keep = []
-     while order.size > 0:
-         i = order[0]
-         keep.append(int(i))
-         xx1 = np.maximum(x1[i], x1[order[1:]])
-         yy1 = np.maximum(y1[i], y1[order[1:]])
-         xx2 = np.minimum(x2[i], x2[order[1:]])
-         yy2 = np.minimum(y2[i], y2[order[1:]])
-
-         w = np.maximum(0.0, xx2 - xx1)
-         h = np.maximum(0.0, yy2 - yy1)
-         inter = w * h
-         ovr = inter / (areas[i] + areas[order[1:]] - inter)
-
-         inds = np.where(ovr <= thresh)[0]
-         order = order[inds + 1]
-
-     return np.array(keep).astype(np.int)
-
-
- def decode(loc, priors, variances):
-     """Decode locations from predictions using priors to undo
-     the encoding we did for offset regression at train time.
-     Args:
-         loc (tensor): location predictions for loc layers,
-             Shape: [num_priors,4]
-         priors (tensor): Prior boxes in center-offset form.
-             Shape: [num_priors,4].
-         variances: (list[float]) Variances of priorboxes
-     Return:
-         decoded bounding box predictions
-     """
-
-     boxes = torch.cat((
-         priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
-         priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
-     boxes[:, :2] -= boxes[:, 2:] / 2
-     boxes[:, 2:] += boxes[:, :2]
-     return boxes
-
-
- def nms(boxes, scores, overlap=0.5, top_k=200):
-     """Apply non-maximum suppression at test time to avoid detecting too many
-     overlapping bounding boxes for a given object.
-     Args:
-         boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
-         scores: (tensor) The class predscores for the img, Shape:[num_priors].
-         overlap: (float) The overlap thresh for suppressing unnecessary boxes.
-         top_k: (int) The Maximum number of box preds to consider.
-     Return:
-         The indices of the kept boxes with respect to num_priors.
-     """
-
-     keep = scores.new(scores.size(0)).zero_().long()
-     if boxes.numel() == 0:
-         return keep, 0
-     x1 = boxes[:, 0]
-     y1 = boxes[:, 1]
-     x2 = boxes[:, 2]
-     y2 = boxes[:, 3]
-     area = torch.mul(x2 - x1, y2 - y1)
-     v, idx = scores.sort(0) # sort in ascending order
-     # I = I[v >= 0.01]
-     idx = idx[-top_k:] # indices of the top-k largest vals
-     xx1 = boxes.new()
-     yy1 = boxes.new()
-     xx2 = boxes.new()
-     yy2 = boxes.new()
-     w = boxes.new()
-     h = boxes.new()
-
-     # keep = torch.Tensor()
-     count = 0
-     while idx.numel() > 0:
-         i = idx[-1] # index of current largest val
-         # keep.append(i)
-         keep[count] = i
-         count += 1
-         if idx.size(0) == 1:
-             break
-         idx = idx[:-1] # remove kept element from view
-         # load bboxes of next highest vals
-         torch.index_select(x1, 0, idx, out=xx1)
-         torch.index_select(y1, 0, idx, out=yy1)
-         torch.index_select(x2, 0, idx, out=xx2)
-         torch.index_select(y2, 0, idx, out=yy2)
-         # store element-wise max with next highest score
-         xx1 = torch.clamp(xx1, min=x1[i])
-         yy1 = torch.clamp(yy1, min=y1[i])
-         xx2 = torch.clamp(xx2, max=x2[i])
-         yy2 = torch.clamp(yy2, max=y2[i])
-         w.resize_as_(xx2)
-         h.resize_as_(yy2)
-         w = xx2 - xx1
-         h = yy2 - yy1
-         # check sizes of xx1 and xx2.. after each iteration
-         w = torch.clamp(w, min=0.0)
-         h = torch.clamp(h, min=0.0)
-         inter = w * h
-         # IoU = i / (area(a) + area(b) - i)
-         rem_areas = torch.index_select(area, 0, idx) # load remaining areas)
-         union = (rem_areas - inter) + area[i]
-         IoU = inter / union # store result in iou
-         # keep only elements with an IoU <= overlap
-         idx = idx[IoU.le(overlap)]
-     return keep, count
-
-
- class Detect(object):
-
-     def __init__(self, num_classes=2,
-                  top_k=750, nms_thresh=0.3, conf_thresh=0.05,
-                  variance=[0.1, 0.2], nms_top_k=5000):
-
-         self.num_classes = num_classes
-         self.top_k = top_k
-         self.nms_thresh = nms_thresh
-         self.conf_thresh = conf_thresh
-         self.variance = variance
-         self.nms_top_k = nms_top_k
-
-     def forward(self, loc_data, conf_data, prior_data):
-
-         num = loc_data.size(0)
-         num_priors = prior_data.size(0)
-
-         conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
-         batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
-         batch_priors = batch_priors.contiguous().view(-1, 4)
-
-         decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
-         decoded_boxes = decoded_boxes.view(num, num_priors, 4)
-
-         output = torch.zeros(num, self.num_classes, self.top_k, 5)
-
-         for i in range(num):
-             boxes = decoded_boxes[i].clone()
-             conf_scores = conf_preds[i].clone()
-
-             for cl in range(1, self.num_classes):
-                 c_mask = conf_scores[cl].gt(self.conf_thresh)
-                 scores = conf_scores[cl][c_mask]
-
-                 if scores.dim() == 0:
-                     continue
-                 l_mask = c_mask.unsqueeze(1).expand_as(boxes)
-                 boxes_ = boxes[l_mask].view(-1, 4)
-                 ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
-                 count = count if count < self.top_k else self.top_k
-
-                 output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)
-
-         return output
-
-
- class PriorBox(object):
-
-     def __init__(self, input_size, feature_maps,
-                  variance=[0.1, 0.2],
-                  min_sizes=[16, 32, 64, 128, 256, 512],
-                  steps=[4, 8, 16, 32, 64, 128],
-                  clip=False):
-
-         super(PriorBox, self).__init__()
-
-         self.imh = input_size[0]
-         self.imw = input_size[1]
-         self.feature_maps = feature_maps
-
-         self.variance = variance
-         self.min_sizes = min_sizes
-         self.steps = steps
-         self.clip = clip
-
-     def forward(self):
-         mean = []
-         for k, fmap in enumerate(self.feature_maps):
-             feath = fmap[0]
-             featw = fmap[1]
-             for i, j in product(range(feath), range(featw)):
-                 f_kw = self.imw / self.steps[k]
-                 f_kh = self.imh / self.steps[k]
-
-                 cx = (j + 0.5) / f_kw
-                 cy = (i + 0.5) / f_kh
-
-                 s_kw = self.min_sizes[k] / self.imw
-                 s_kh = self.min_sizes[k] / self.imh
-
-                 mean += [cx, cy, s_kw, s_kh]
-
-         output = torch.FloatTensor(mean).view(-1, 4)
-
-         if self.clip:
-             output.clamp_(max=1, min=0)
-
-         return output
 
model/faceDetector/s3fd/nets.py DELETED
@@ -1,174 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import torch.nn.init as init
- from .box_utils import Detect, PriorBox
-
-
- class L2Norm(nn.Module):
-
-     def __init__(self, n_channels, scale):
-         super(L2Norm, self).__init__()
-         self.n_channels = n_channels
-         self.gamma = scale or None
-         self.eps = 1e-10
-         self.weight = nn.Parameter(torch.Tensor(self.n_channels))
-         self.reset_parameters()
-
-     def reset_parameters(self):
-         init.constant_(self.weight, self.gamma)
-
-     def forward(self, x):
-         norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
-         x = torch.div(x, norm)
-         out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
-         return out
-
-
- class S3FDNet(nn.Module):
-
-     def __init__(self, device='cuda'):
-         super(S3FDNet, self).__init__()
-         self.device = device
-
-         self.vgg = nn.ModuleList([
-             nn.Conv2d(3, 64, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(64, 64, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2),
-
-             nn.Conv2d(64, 128, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(128, 128, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2),
-
-             nn.Conv2d(128, 256, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(256, 256, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(256, 256, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2, ceil_mode=True),
-
-             nn.Conv2d(256, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2),
-
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2),
-
-             nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(1024, 1024, 1, 1),
-             nn.ReLU(inplace=True),
-         ])
-
-         self.L2Norm3_3 = L2Norm(256, 10)
-         self.L2Norm4_3 = L2Norm(512, 8)
-         self.L2Norm5_3 = L2Norm(512, 5)
-
-         self.extras = nn.ModuleList([
-             nn.Conv2d(1024, 256, 1, 1),
-             nn.Conv2d(256, 512, 3, 2, padding=1),
-             nn.Conv2d(512, 128, 1, 1),
-             nn.Conv2d(128, 256, 3, 2, padding=1),
-         ])
-
-         self.loc = nn.ModuleList([
-             nn.Conv2d(256, 4, 3, 1, padding=1),
-             nn.Conv2d(512, 4, 3, 1, padding=1),
-             nn.Conv2d(512, 4, 3, 1, padding=1),
-             nn.Conv2d(1024, 4, 3, 1, padding=1),
-             nn.Conv2d(512, 4, 3, 1, padding=1),
-             nn.Conv2d(256, 4, 3, 1, padding=1),
-         ])
-
-         self.conf = nn.ModuleList([
-             nn.Conv2d(256, 4, 3, 1, padding=1),
-             nn.Conv2d(512, 2, 3, 1, padding=1),
-             nn.Conv2d(512, 2, 3, 1, padding=1),
-             nn.Conv2d(1024, 2, 3, 1, padding=1),
-             nn.Conv2d(512, 2, 3, 1, padding=1),
-             nn.Conv2d(256, 2, 3, 1, padding=1),
-         ])
-
-         self.softmax = nn.Softmax(dim=-1)
-         self.detect = Detect()
-
-     def forward(self, x):
-         size = x.size()[2:]
-         sources = list()
-         loc = list()
-         conf = list()
-
-         for k in range(16):
-             x = self.vgg[k](x)
-         s = self.L2Norm3_3(x)
-         sources.append(s)
-
-         for k in range(16, 23):
-             x = self.vgg[k](x)
-         s = self.L2Norm4_3(x)
-         sources.append(s)
-
-         for k in range(23, 30):
-             x = self.vgg[k](x)
-         s = self.L2Norm5_3(x)
-         sources.append(s)
-
-         for k in range(30, len(self.vgg)):
-             x = self.vgg[k](x)
-         sources.append(x)
-
-         # apply extra layers and cache source layer outputs
-         for k, v in enumerate(self.extras):
-             x = F.relu(v(x), inplace=True)
-             if k % 2 == 1:
-                 sources.append(x)
-
-         # apply multibox head to source layers
-         loc_x = self.loc[0](sources[0])
-         conf_x = self.conf[0](sources[0])
-
-         max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True)
-         conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1)
-
-         loc.append(loc_x.permute(0, 2, 3, 1).contiguous())
-         conf.append(conf_x.permute(0, 2, 3, 1).contiguous())
-
-         for i in range(1, len(sources)):
-             x = sources[i]
-             conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous())
-             loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous())
-
-         features_maps = []
-         for i in range(len(loc)):
-             feat = []
-             feat += [loc[i].size(1), loc[i].size(2)]
-             features_maps += [feat]
-
-         loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
-         conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
-
-         with torch.no_grad():
-             self.priorbox = PriorBox(size, features_maps)
-             self.priors = self.priorbox.forward()
-
-         output = self.detect.forward(
-             loc.view(loc.size(0), -1, 4),
-             self.softmax(conf.view(conf.size(0), -1, 2)),
-             self.priors.type(type(x.data)).to(self.device)
-         )
-
-         return output
 
model/loconet_encoder.py DELETED
@@ -1,91 +0,0 @@
- import torch
- import torch.nn as nn
-
- # from model.visualEncoder import visualFrontend, visualTCN, visualConv1D
- from model.attentionLayer import attentionLayer
- from model.convLayer import ConvLayer
- from torchvggish import vggish
- from model.visualEncoder import visualFrontend, visualConv1D, visualTCN
-
-
- class locoencoder(nn.Module):
-
-     def __init__(self, cfg):
-         super(locoencoder, self).__init__()
-         self.cfg = cfg
-         # Visual Temporal Encoder
-         self.visualFrontend = visualFrontend(cfg) # Visual Frontend
-         self.visualTCN = visualTCN() # Visual Temporal Network TCN
-         self.visualConv1D = visualConv1D() # Visual Temporal Network Conv1d
-
-         urls = {
-             'vggish':
-                 "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth"
-         }
-         self.audioEncoder = vggish.VGGish(urls, preprocess=False, postprocess=False)
-         self.audio_pool = nn.AdaptiveAvgPool1d(1)
-
-         # Audio-visual Cross Attention
-         self.crossA2V = attentionLayer(d_model=128, nhead=8)
-         self.crossV2A = attentionLayer(d_model=128, nhead=8)
-
-         # Audio-visual Self Attention
-
-         num_layers = self.cfg.MODEL.AV_layers
-         layers = nn.ModuleList()
-         for i in range(num_layers):
-             layers.append(ConvLayer(cfg))
-             layers.append(attentionLayer(d_model=256, nhead=8))
-         self.convAV = layers
-
-     def forward_visual_frontend(self, x):
-
-         B, T, W, H = x.shape
-         x = x.view(B * T, 1, 1, W, H)
-         x = (x / 255 - 0.4161) / 0.1688
-         x = self.visualFrontend(x)
-         x = x.view(B, T, 512)
-         x = x.transpose(1, 2)
-         x = self.visualTCN(x)
-         x = self.visualConv1D(x)
-         x = x.transpose(1, 2)
-         return x
-
-     def forward_audio_frontend(self, x):
-         t = x.shape[-2]
-         numFrames = t // 4
-         pad = 8 - (t % 8)
-         x = torch.nn.functional.pad(x, (0, 0, 0, pad), "constant")
-         # x = x.unsqueeze(1).transpose(2, 3)
-         x = self.audioEncoder(x)
-
-         b, c, t2, freq = x.shape
-         x = x.view(b * c, t2, freq)
-         x = self.audio_pool(x)
-         x = x.view(b, c, t2)[:, :, :numFrames]
-         x = x.permute(0, 2, 1)
-         return x
-
-     def forward_cross_attention(self, x1, x2):
-         x1_c = self.crossA2V(src=x1, tar=x2, adjust=self.cfg.MODEL.ADJUST_ATTENTION)
-         x2_c = self.crossV2A(src=x2, tar=x1, adjust=self.cfg.MODEL.ADJUST_ATTENTION)
-         return x1_c, x2_c
-
-     def forward_audio_visual_backend(self, x1, x2, b=1, s=1):
-         x = torch.cat((x1, x2), 2) # B*S, T, 2C
-         for i, layer in enumerate(self.convAV):
-             if i % 2 == 0:
-                 x, b, s = layer(x, b, s)
-             else:
-                 x = layer(src=x, tar=x)
-
-         x = torch.reshape(x, (-1, 256))
-         return x
-
-     def forward_audio_backend(self, x):
-         x = torch.reshape(x, (-1, 128))
-         return x
-
-     def forward_visual_backend(self, x):
-         x = torch.reshape(x, (-1, 128))
-         return x
 
model/transformer/__pycache__/position_encoding.cpython-37.pyc DELETED
Binary file (1.23 kB)
 
model/transformer/__pycache__/transformer.cpython-37.pyc DELETED
Binary file (8.81 kB)
 
model/transformer/__pycache__/utils.cpython-37.pyc DELETED
Binary file (1.05 kB)
 
model/transformer/position_encoding.py DELETED
@@ -1,28 +0,0 @@
- ##########################################################################
- # We adopt the positional encoding method from the PyTorch Tutorial.
- # Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
- ##########################################################################
- import math
-
- import torch
- import torch.nn as nn
-
-
- class PositionalEncoding(nn.Module):
-
-     def __init__(self, d_model, dropout=0.1, max_len=5000):
-         super(PositionalEncoding, self).__init__()
-
-         self.dropout = nn.Dropout(p=dropout)
-
-         pe = torch.zeros(max_len, d_model)
-         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
-         pe[:, 0::2] = torch.sin(position * div_term)
-         pe[:, 1::2] = torch.cos(position * div_term)
-         pe = pe.unsqueeze(0).transpose(0, 1)
-         self.register_buffer('pe', pe)
-
-     def forward(self, x, padding=0):
-         x = x + self.pe[padding:padding + x.shape[0], :]
-         return self.dropout(x)
 
model/transformer/transformer.py DELETED
@@ -1,334 +0,0 @@
- import copy
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
-
- class DotProductAttention(nn.Module):
-
-     def __init__(self, dropout=0.0):
-         super(DotProductAttention, self).__init__()
-
-         self.dropout = dropout
-
-     def forward(self, q, k, v, attn_mask=None):
-         attn_output_weights = torch.bmm(q, k.transpose(1, 2))
-
-         if attn_mask is not None:
-             attn_output_weights += attn_mask
-
-         attn_output_weights = F.softmax(attn_output_weights, dim=-1)
-         attn_output_weights = F.dropout(attn_output_weights, p=self.dropout, training=self.training)
-         attn_output = torch.bmm(attn_output_weights, v)
-         return attn_output
-
-
- class MultiheadAttention(nn.Module):
-
-     def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, kdim=None, vdim=None):
-         super(MultiheadAttention, self).__init__()
-
-         self.embed_dim = embed_dim
-         self.num_heads = num_heads
-         self.kdim = kdim if kdim is not None else embed_dim
-         self.vdim = vdim if vdim is not None else embed_dim
-         self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
-
-         if self._qkv_same_embed_dim:
-             self.in_proj_weight = nn.Parameter(torch.empty(3 * embed_dim, embed_dim))
-         else:
-             raise RuntimeError('Do not support q, k, v have different dimensions')
-
-         if bias:
-             self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim))
-         else:
-             self.register_parameter('in_proj_bias', None)
-
-         self.out_proj = nn.Linear(embed_dim, embed_dim)
-
-         if self._qkv_same_embed_dim:
-             nn.init.xavier_uniform_(self.in_proj_weight)
-
-         if self.in_proj_bias is not None:
-             nn.init.constant_(self.in_proj_bias, 0.)
-             nn.init.constant_(self.out_proj.bias, 0.)
-
-         self.dotproductattention = DotProductAttention(dropout)
-
-     def forward(self, q, k, v, attn_mask=None, key_padding_mask=None):
-         tsz, bsz, embed_dim = q.shape[0], q.shape[1], q.shape[2]
-
-         head_dim = embed_dim // self.num_heads
-         assert head_dim * self.num_heads == embed_dim, \
-             'embed_dim must be divisible by num_heads'
-         scaling = float(head_dim)**-0.5
-
-         _b = self.in_proj_bias
-         _start = None
-         _end = embed_dim
-         _w = self.in_proj_weight[:_end, :]
-         if _b is not None:
-             _b = _b[:_end]
-         q = F.linear(q, _w, _b)
-
-         _b = self.in_proj_bias
-         _start = embed_dim
-         _end = embed_dim * 2
-         _w = self.in_proj_weight[_start:_end, :]
-         if _b is not None:
-             _b = _b[_start:_end]
-         k = F.linear(k, _w, _b)
-
-         _b = self.in_proj_bias
-         _start = embed_dim * 2
-         _end = None
-         _w = self.in_proj_weight[_start:, :]
-         if _b is not None:
-             _b = _b[_start:]
-         v = F.linear(v, _w, _b)
-
-         q = q * scaling
-
-         q = q.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
-         k = k.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
-         v = v.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
-
-         if attn_mask is not None:
-             attn_mask = attn_mask.unsqueeze(0).repeat(bsz, 1, 1)
-             attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
-             attn_mask = attn_mask.reshape(-1, *attn_mask.shape[2:])
-
-         if key_padding_mask is not None:
-             key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, tsz, 1)
-             key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
-             key_padding_mask = key_padding_mask.reshape(-1, *key_padding_mask.shape[2:])
-
-         if attn_mask is not None and key_padding_mask is not None:
-             mask = attn_mask + key_padding_mask
-         elif attn_mask is not None:
-             mask = attn_mask
-         elif key_padding_mask is not None:
-             mask = key_padding_mask
-         else:
-             mask = None
-
-         attn_output = self.dotproductattention(q, k, v, mask)
-         attn_output = attn_output.transpose(0, 1).contiguous().view(tsz, bsz, self.embed_dim)
-         return self.out_proj(attn_output), None
-
-
- class Transformer(nn.Module):
-
-     def __init__(self,
-                  d_model=512,
-                  nhead=8,
-                  num_encoder_layers=6,
-                  num_decoder_layers=6,
-                  dim_feedforward=2048,
-                  dropout=0.1,
-                  activation='relu',
-                  custom_encoder=None,
-                  custom_decoder=None):
-         super(Transformer, self).__init__()
-
-         if custom_encoder is not None:
-             self.encoder = custom_encoder
-         else:
-             encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,
-                                                     activation)
-             encoder_norm = nn.LayerNorm(d_model)
-             self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
-
-         if custom_decoder is not None:
-             self.decoder = custom_decoder
-         else:
-             decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout,
-                                                     activation)
-             decoder_norm = nn.LayerNorm(d_model)
-             self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
-
-         self.d_model = d_model
-         self.nhead = nhead
-
-     def forward(self,
-                 src,
-                 tgt,
-                 src_mask=None,
-                 tgt_mask=None,
-                 memory_mask=None,
-                 src_key_padding_mask=None,
-                 tgt_key_padding_mask=None,
-                 memory_key_padding_mask=None):
-         if src.size(1) != tgt.size(1):
-             raise RuntimeError('the batch number of src and tgt must be equal')
-
-         if src.size(2) != self.d_model or tgt.size(2) != self.d_model:
-             raise RuntimeError('the feature number of src and tgt must be equal to d_model')
-
-         memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
-         output = self.decoder(tgt,
-                               memory,
-                               tgt_mask=tgt_mask,
-                               memory_mask=memory_mask,
-                               tgt_key_padding_mask=tgt_key_padding_mask,
-                               memory_key_padding_mask=memory_key_padding_mask)
-         return output
-
-
- class TransformerEncoder(nn.Module):
-
-     def __init__(self, encoder_layer, num_layers, norm=None):
-         super(TransformerEncoder, self).__init__()
-
-         self.layers = _get_clones(encoder_layer, num_layers)
-         self.num_layers = num_layers
-         self.norm = norm
-
-     def forward(self, src, src_mask=None, src_key_padding_mask=None):
-         output = src
-
-         for mod in self.layers:
-             output = mod(output, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
-
-         if self.norm is not None:
-             output = self.norm(output)
-
-         return output
-
-
- class TransformerDecoder(nn.Module):
-
-     def __init__(self, decoder_layer, num_layers, norm=None):
-         super(TransformerDecoder, self).__init__()
-
-         self.layers = _get_clones(decoder_layer, num_layers)
-         self.num_layers = num_layers
-         self.norm = norm
-
-     def forward(self,
-                 tgt,
-                 memory,
-                 tgt_mask=None,
-                 memory_mask=None,
-                 tgt_key_padding_mask=None,
-                 memory_key_padding_mask=None):
-         output = tgt
-
-         for mod in self.layers:
-             output = mod(output,
-                          memory,
-                          tgt_mask=tgt_mask,
-                          memory_mask=memory_mask,
-                          tgt_key_padding_mask=tgt_key_padding_mask,
-                          memory_key_padding_mask=memory_key_padding_mask)
-
-         if self.norm is not None:
-             output = self.norm(output)
-
-         return output
-
-
- class TransformerEncoderLayer(nn.Module):
-
-     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'):
-         super(TransformerEncoderLayer, self).__init__()
-
-         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-
-         # Implementation of Feedforward model
-         self.linear1 = nn.Linear(d_model, dim_feedforward)
-         self.dropout = nn.Dropout(dropout)
-         self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-         self.norm1 = nn.LayerNorm(d_model)
-         self.norm2 = nn.LayerNorm(d_model)
-         self.dropout1 = nn.Dropout(dropout)
-         self.dropout2 = nn.Dropout(dropout)
-
-         self.activation = _get_activation_fn(activation)
-
-     def __setstate__(self, state):
-         if 'activation' not in state:
-             state['activation'] = F.relu
-         super(TransformerEncoderLayer, self).__setstate__(state)
-
-     def forward(self, src, src_mask=None, src_key_padding_mask=None):
-         src2 = self.self_attn(src,
-                               src,
-                               src,
-                               attn_mask=src_mask,
-                               key_padding_mask=src_key_padding_mask)[0]
-         src = src + self.dropout1(src2)
-         src = self.norm1(src)
-         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-         src = src + self.dropout2(src2)
-         src = self.norm2(src)
-         return src
-
-
- class TransformerDecoderLayer(nn.Module):
-
-     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'):
-         super(TransformerDecoderLayer, self).__init__()
-
-         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-         self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-
-         # Implementation of Feedforward model
-         self.linear1 = nn.Linear(d_model, dim_feedforward)
-         self.dropout = nn.Dropout(dropout)
-         self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-         self.norm1 = nn.LayerNorm(d_model)
-         self.norm2 = nn.LayerNorm(d_model)
-         self.norm3 = nn.LayerNorm(d_model)
-         self.dropout1 = nn.Dropout(dropout)
-         self.dropout2 = nn.Dropout(dropout)
-         self.dropout3 = nn.Dropout(dropout)
-
-         self.activation = _get_activation_fn(activation)
-
-     def __setstate__(self, state):
-         if 'activation' not in state:
-             state['activation'] = F.relu
-         super(TransformerDecoderLayer, self).__setstate__(state)
-
-     def forward(self,
-                 tgt,
-                 memory,
-                 tgt_mask=None,
-                 memory_mask=None,
-                 tgt_key_padding_mask=None,
-                 memory_key_padding_mask=None):
-         tgt2 = self.self_attn(tgt,
-                               tgt,
-                               tgt,
-                               attn_mask=tgt_mask,
-                               key_padding_mask=tgt_key_padding_mask)[0]
-         tgt = tgt + self.dropout1(tgt2)
-         tgt = self.norm1(tgt)
-         tgt2 = self.multihead_attn(tgt,
-                                    memory,
-                                    memory,
-                                    attn_mask=memory_mask,
-                                    key_padding_mask=memory_key_padding_mask)[0]
-         tgt = tgt + self.dropout2(tgt2)
-         tgt = self.norm2(tgt)
-         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
-         tgt = tgt + self.dropout3(tgt2)
-         tgt = self.norm3(tgt)
-         return tgt
-
-
- def _get_clones(module, N):
-     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
-
-
- def _get_activation_fn(activation):
-     if activation == 'relu':
-         return F.relu
-     elif activation == 'gelu':
-         return F.gelu
-
-     raise RuntimeError('activation should be relu/gelu, not {}'.format(activation))
 
model/transformer/utils.py DELETED
@@ -1,22 +0,0 @@
- import torch
- assert torch.__version__ >= '1.6.0'
- import torch.nn as nn
- import numpy as np
-
-
- def layer_norm(d_model, condition=True):
-     return nn.LayerNorm(d_model) if condition else None
-
-
- def generate_square_subsequent_mask(sz):
-     mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-     mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-     return mask
-
-
- def generate_proposal_mask(T, B):
-     mask = torch.zeros(T, (T + 1) * T // 2)
-     for sz, idx in zip(range(1, T + 1), np.cumsum(range(T))):
-         mask[:sz, idx: idx + sz] = torch.fliplr(torch.tril(torch.ones(sz, sz)))
-     mask = mask.unsqueeze(1).repeat(1, B, 1)
-     return mask
 
model/visualEncoder.py DELETED
@@ -1,199 +0,0 @@
- ##
- # ResNet18 Pretrained network to extract lip embedding
- # This code is modified based on https://github.com/lordmartian/deep_avsr
- ##
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- from model.attentionLayer import attentionLayer
-
-
- class ResNetLayer(nn.Module):
-     """
-     A ResNet layer used to build the ResNet network.
-     Architecture:
-     --> conv-bn-relu -> conv -> + -> bn-relu -> conv-bn-relu -> conv -> + -> bn-relu -->
-          |                      |   |                                   |
-          -----> downsample ---->    ------------------------------------>
-     """
-
-     def __init__(self, inplanes, outplanes, stride):
-         super(ResNetLayer, self).__init__()
-         self.conv1a = nn.Conv2d(inplanes,
-                                 outplanes,
-                                 kernel_size=3,
-                                 stride=stride,
-                                 padding=1,
-                                 bias=False)
-         self.bn1a = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-         self.conv2a = nn.Conv2d(outplanes,
-                                 outplanes,
-                                 kernel_size=3,
-                                 stride=1,
-                                 padding=1,
-                                 bias=False)
-         self.stride = stride
-         if self.stride != 1:
-             self.downsample = nn.Conv2d(inplanes,
-                                         outplanes,
-                                         kernel_size=(1, 1),
-                                         stride=stride,
-                                         bias=False)
-         self.outbna = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-
-         self.conv1b = nn.Conv2d(outplanes,
-                                 outplanes,
-                                 kernel_size=3,
-                                 stride=1,
-                                 padding=1,
-                                 bias=False)
-         self.bn1b = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-         self.conv2b = nn.Conv2d(outplanes,
-                                 outplanes,
-                                 kernel_size=3,
-                                 stride=1,
-                                 padding=1,
-                                 bias=False)
-         self.outbnb = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-         return
-
-     def forward(self, inputBatch):
-         batch = F.relu(self.bn1a(self.conv1a(inputBatch)))
-         batch = self.conv2a(batch)
-         if self.stride == 1:
-             residualBatch = inputBatch
-         else:
-             residualBatch = self.downsample(inputBatch)
-         batch = batch + residualBatch
-         intermediateBatch = batch
-         batch = F.relu(self.outbna(batch))
-
-         batch = F.relu(self.bn1b(self.conv1b(batch)))
-         batch = self.conv2b(batch)
-         residualBatch = intermediateBatch
-         batch = batch + residualBatch
-         outputBatch = F.relu(self.outbnb(batch))
-         return outputBatch
-
-
- class ResNet(nn.Module):
-     """
-     An 18-layer ResNet architecture.
-     """
-
-     def __init__(self):
-         super(ResNet, self).__init__()
-         self.layer1 = ResNetLayer(64, 64, stride=1)
-         self.layer2 = ResNetLayer(64, 128, stride=2)
-         self.layer3 = ResNetLayer(128, 256, stride=2)
-         self.layer4 = ResNetLayer(256, 512, stride=2)
-         self.avgpool = nn.AvgPool2d(kernel_size=(4, 4), stride=(1, 1))
-
-         return
-
-     def forward(self, inputBatch):
-         batch = self.layer1(inputBatch)
-         batch = self.layer2(batch)
-         batch = self.layer3(batch)
-         batch = self.layer4(batch)
-         outputBatch = self.avgpool(batch)
-         return outputBatch
-
-
- class GlobalLayerNorm(nn.Module):
-
-     def __init__(self, channel_size):
-         super(GlobalLayerNorm, self).__init__()
-         self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1]
-         self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1]
-         self.reset_parameters()
-
-     def reset_parameters(self):
-         self.gamma.data.fill_(1)
-         self.beta.data.zero_()
-
-     def forward(self, y):
-         mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) #[M, 1, 1]
-         var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
-         gLN_y = self.gamma * (y - mean) / torch.pow(var + 1e-8, 0.5) + self.beta
-         return gLN_y
-
-
- class visualFrontend(nn.Module):
-     """
-     A visual feature extraction module. Generates a 512-dim feature vector per video frame.
-     Architecture: A 3D convolution block followed by an 18-layer ResNet.
-     """
-
-     def __init__(self, cfg):
-         self.cfg = cfg
-         super(visualFrontend, self).__init__()
-         self.frontend3D = nn.Sequential(
-             nn.Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3),
-                       bias=False), nn.BatchNorm3d(64, momentum=0.01, eps=0.001), nn.ReLU(),
-             nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)))
-         self.resnet = ResNet()
-         return
-
-     def forward(self, inputBatch):
-         inputBatch = inputBatch.transpose(0, 1).transpose(1, 2)
-         batchsize = inputBatch.shape[0]
-         batch = self.frontend3D(inputBatch)
-
-         batch = batch.transpose(1, 2)
-         batch = batch.reshape(batch.shape[0] * batch.shape[1], batch.shape[2], batch.shape[3],
-                               batch.shape[4])
-         outputBatch = self.resnet(batch)
-         outputBatch = outputBatch.reshape(batchsize, -1, 512)
-         outputBatch = outputBatch.transpose(1, 2)
-         outputBatch = outputBatch.transpose(1, 2).transpose(0, 1)
-         return outputBatch
-
-
- class DSConv1d(nn.Module):
-
-     def __init__(self):
-         super(DSConv1d, self).__init__()
-         self.net = nn.Sequential(
-             nn.ReLU(),
-             nn.BatchNorm1d(512),
-             nn.Conv1d(512, 512, 3, stride=1, padding=1, dilation=1, groups=512, bias=False),
-             nn.PReLU(),
-             GlobalLayerNorm(512),
-             nn.Conv1d(512, 512, 1, bias=False),
-         )
-
-     def forward(self, x):
-         out = self.net(x)
-         return out + x
-
-
- class visualTCN(nn.Module):
-
-     def __init__(self):
-         super(visualTCN, self).__init__()
-         stacks = []
-         for x in range(5):
-             stacks += [DSConv1d()]
-         self.net = nn.Sequential(*stacks) # Visual Temporal Network V-TCN
-
-     def forward(self, x):
-         out = self.net(x)
-         return out
-
-
- class visualConv1D(nn.Module):
-
-     def __init__(self):
-         super(visualConv1D, self).__init__()
-         self.net = nn.Sequential(
-             nn.Conv1d(512, 256, 5, stride=1, padding=2),
-             nn.BatchNorm1d(256),
-             nn.ReLU(),
-             nn.Conv1d(256, 128, 1),
-         )
-
-     def forward(self, x):
-         out = self.net(x)
-         return out