Commit 182c3ce
Parent(s): 29ac319

Delete model

Browse files:
- model/.DS_Store +0 -0
- model/__init__.py +0 -5
- model/__pycache__/__init__.cpython-36.pyc +0 -0
- model/__pycache__/__init__.cpython-37.pyc +0 -0
- model/__pycache__/attentionLayer.cpython-37.pyc +0 -0
- model/__pycache__/convLayer.cpython-37.pyc +0 -0
- model/__pycache__/loconet_encoder.cpython-37.pyc +0 -0
- model/__pycache__/position_encoding.cpython-36.pyc +0 -0
- model/__pycache__/talkNetModel.cpython-37.pyc +0 -0
- model/__pycache__/transformer.cpython-36.pyc +0 -0
- model/__pycache__/utils.cpython-36.pyc +0 -0
- model/__pycache__/visualEncoder.cpython-37.pyc +0 -0
- model/attentionLayer.py +0 -39
- model/audioEncoder.py +0 -108
- model/convLayer.py +0 -42
- model/faceDetector/README.md +0 -3
- model/faceDetector/__init__.py +0 -1
- model/faceDetector/s3fd/__init__.py +0 -66
- model/faceDetector/s3fd/box_utils.py +0 -217
- model/faceDetector/s3fd/nets.py +0 -174
- model/loconet_encoder.py +0 -91
- model/transformer/__pycache__/position_encoding.cpython-37.pyc +0 -0
- model/transformer/__pycache__/transformer.cpython-37.pyc +0 -0
- model/transformer/__pycache__/utils.cpython-37.pyc +0 -0
- model/transformer/position_encoding.py +0 -28
- model/transformer/transformer.py +0 -334
- model/transformer/utils.py +0 -22
- model/visualEncoder.py +0 -199
model/.DS_Store
DELETED
Binary file (6.15 kB)
model/__init__.py
DELETED
@@ -1,5 +0,0 @@
-from model.transformer.position_encoding import PositionalEncoding
-from model.transformer.transformer import Transformer
-from model.transformer.transformer import TransformerEncoder, TransformerEncoderLayer
-from model.transformer.transformer import TransformerDecoder, TransformerDecoderLayer
-from model.transformer.utils import layer_norm, generate_square_subsequent_mask, generate_proposal_mask

model/__pycache__/__init__.cpython-36.pyc
DELETED
Binary file (561 Bytes)

model/__pycache__/__init__.cpython-37.pyc
DELETED
Binary file (573 Bytes)

model/__pycache__/attentionLayer.cpython-37.pyc
DELETED
Binary file (1.38 kB)

model/__pycache__/convLayer.cpython-37.pyc
DELETED
Binary file (1.32 kB)

model/__pycache__/loconet_encoder.cpython-37.pyc
DELETED
Binary file (3.21 kB)

model/__pycache__/position_encoding.cpython-36.pyc
DELETED
Binary file (1.26 kB)

model/__pycache__/talkNetModel.cpython-37.pyc
DELETED
Binary file (6.33 kB)

model/__pycache__/transformer.cpython-36.pyc
DELETED
Binary file (8.84 kB)

model/__pycache__/utils.cpython-36.pyc
DELETED
Binary file (1.08 kB)

model/__pycache__/visualEncoder.cpython-37.pyc
DELETED
Binary file (6.53 kB)

model/attentionLayer.py
DELETED
@@ -1,39 +0,0 @@
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-from torch.nn import MultiheadAttention
-
-
-class attentionLayer(nn.Module):
-
-    def __init__(self, d_model, nhead, dropout=0.1):
-        super(attentionLayer, self).__init__()
-        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-
-        self.linear1 = nn.Linear(d_model, d_model * 4)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(d_model * 4, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-
-        self.activation = F.relu
-
-    def forward(self, src, tar, adjust=False, attn_mask=None):
-        # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor
-        src = src.transpose(0, 1)  # B, T, C -> T, B, C
-        tar = tar.transpose(0, 1)  # B, T, C -> T, B, C
-        if adjust:
-            src2 = self.self_attn(src, tar, tar, attn_mask=None, key_padding_mask=None)[0]
-        else:
-            src2 = self.self_attn(tar, src, src, attn_mask=None, key_padding_mask=None)[0]
-        src = src + self.dropout1(src2)
-        src = self.norm1(src)
-
-        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-        src = src + self.dropout2(src2)
-        src = self.norm2(src)
-        src = src.transpose(0, 1)  # T, B, C -> B, T, C
-        return src

model/audioEncoder.py
DELETED
@@ -1,108 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-class SEBasicBlock(nn.Module):
-    expansion = 1
-
-    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
-        super(SEBasicBlock, self).__init__()
-        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(planes)
-        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-        self.relu = nn.ReLU(inplace=True)
-        self.se = SELayer(planes, reduction)
-        self.downsample = downsample
-        self.stride = stride
-
-    def forward(self, x):
-        residual = x
-
-        out = self.conv1(x)
-        out = self.relu(out)
-        out = self.bn1(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-        out = self.se(out)
-
-        if self.downsample is not None:
-            residual = self.downsample(x)
-
-        out += residual
-        out = self.relu(out)
-        return out
-
-class SELayer(nn.Module):
-    def __init__(self, channel, reduction=8):
-        super(SELayer, self).__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.fc = nn.Sequential(
-            nn.Linear(channel, channel // reduction),
-            nn.ReLU(inplace=True),
-            nn.Linear(channel // reduction, channel),
-            nn.Sigmoid()
-        )
-
-    def forward(self, x):
-        b, c, _, _ = x.size()
-        y = self.avg_pool(x).view(b, c)
-        y = self.fc(y).view(b, c, 1, 1)
-        return x * y
-
-class audioEncoder(nn.Module):
-    def __init__(self, layers, num_filters, **kwargs):
-        super(audioEncoder, self).__init__()
-        block = SEBasicBlock
-        self.inplanes = num_filters[0]
-
-        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=7, stride=(2, 1), padding=3,
-                               bias=False)
-        self.bn1 = nn.BatchNorm2d(num_filters[0])
-        self.relu = nn.ReLU(inplace=True)
-
-        self.layer1 = self._make_layer(block, num_filters[0], layers[0])
-        self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2))
-        self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2))
-        self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1))
-        out_dim = num_filters[3] * block.expansion
-
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-            elif isinstance(m, nn.BatchNorm2d):
-                nn.init.constant_(m.weight, 1)
-                nn.init.constant_(m.bias, 0)
-
-    def _make_layer(self, block, planes, blocks, stride=1):
-        downsample = None
-        if stride != 1 or self.inplanes != planes * block.expansion:
-            downsample = nn.Sequential(
-                nn.Conv2d(self.inplanes, planes * block.expansion,
-                          kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(planes * block.expansion),
-            )
-
-        layers = []
-        layers.append(block(self.inplanes, planes, stride, downsample))
-        self.inplanes = planes * block.expansion
-        for i in range(1, blocks):
-            layers.append(block(self.inplanes, planes))
-
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = self.bn1(x)
-        x = self.relu(x)
-
-        x = self.layer1(x)
-        x = self.layer2(x)
-        x = self.layer3(x)
-        x = self.layer4(x)
-        x = torch.mean(x, dim=2, keepdim=True)
-        x = x.view((x.size()[0], x.size()[1], -1))
-        x = x.transpose(1, 2)
-
-        return x

model/convLayer.py
DELETED
@@ -1,42 +0,0 @@
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-
-
-class ConvLayer(nn.Module):
-
-    def __init__(self, cfg):
-        super(ConvLayer, self).__init__()
-        self.cfg = cfg
-        self.s = cfg.MODEL.NUM_SPEAKERS
-        self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (self.s, 7), padding=(0, 3))
-        # below line is speaker parallel 93.88 code
-        # self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (3, 7), padding=(0, 3))
-        self.ln = torch.nn.LayerNorm(256)
-        self.conv2d_1x1 = torch.nn.Conv2d(256, 512, (1, 1), padding=(0, 0))
-        self.conv2d_1x1_2 = torch.nn.Conv2d(512, 256, (1, 1), padding=(0, 0))
-        self.gelu = nn.GELU()
-
-    def forward(self, x, b, s):
-
-        identity = x  # b*s, t, c
-        t = x.shape[1]
-        c = x.shape[2]
-        out = x.view(b, s, t, c)
-        out = out.permute(0, 3, 1, 2)  # b, c, s, t
-
-        out = self.conv2d(out)  # b, s*c, 1, t
-        out = out.view(b, c, s, t)
-        out = out.permute(0, 2, 3, 1)  # b, s, t, c
-        out = self.ln(out)
-        out = out.permute(0, 3, 1, 2)
-        out = self.conv2d_1x1(out)
-        out = self.gelu(out)
-        out = self.conv2d_1x1_2(out)  # b, c, s, t
-
-        out = out.permute(0, 2, 3, 1)  # b, s, t, c
-        out = out.view(b * s, t, c)
-
-        out += identity
-
-        return out, b, s

model/faceDetector/README.md
DELETED
@@ -1,3 +0,0 @@
-# Face detector
-
-This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.

model/faceDetector/__init__.py
DELETED
@@ -1 +0,0 @@
-from .s3fd import S3FD

model/faceDetector/s3fd/__init__.py
DELETED
@@ -1,66 +0,0 @@
-import time, os, sys, subprocess
-import numpy as np
-import cv2
-import torch
-from torchvision import transforms
-from .nets import S3FDNet
-from .box_utils import nms_
-
-PATH_WEIGHT = 'model/faceDetector/s3fd/sfd_face.pth'
-if os.path.isfile(PATH_WEIGHT) == False:
-    Link = "1KafnHz7ccT-3IyddBsL5yi2xGtxAKypt"
-    cmd = "gdown --id %s -O %s"%(Link, PATH_WEIGHT)
-    subprocess.call(cmd, shell=True, stdout=None)
-img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
-
-
-class S3FD():
-
-    def __init__(self, device='cuda'):
-
-        tstamp = time.time()
-        self.device = device
-
-        # print('[S3FD] loading with', self.device)
-        self.net = S3FDNet(device=self.device).to(self.device)
-        PATH = os.path.join(os.getcwd(), PATH_WEIGHT)
-        state_dict = torch.load(PATH, map_location=self.device)
-        self.net.load_state_dict(state_dict)
-        self.net.eval()
-        # print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))
-
-    def detect_faces(self, image, conf_th=0.8, scales=[1]):
-
-        w, h = image.shape[1], image.shape[0]
-
-        bboxes = np.empty(shape=(0, 5))
-
-        with torch.no_grad():
-            for s in scales:
-                scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)
-
-                scaled_img = np.swapaxes(scaled_img, 1, 2)
-                scaled_img = np.swapaxes(scaled_img, 1, 0)
-                scaled_img = scaled_img[[2, 1, 0], :, :]
-                scaled_img = scaled_img.astype('float32')
-                scaled_img -= img_mean
-                scaled_img = scaled_img[[2, 1, 0], :, :]
-                x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
-                y = self.net(x)
-
-                detections = y.data
-                scale = torch.Tensor([w, h, w, h])
-
-                for i in range(detections.size(1)):
-                    j = 0
-                    while detections[0, i, j, 0] > conf_th:
-                        score = detections[0, i, j, 0]
-                        pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
-                        bbox = (pt[0], pt[1], pt[2], pt[3], score)
-                        bboxes = np.vstack((bboxes, bbox))
-                        j += 1
-
-        keep = nms_(bboxes, 0.1)
-        bboxes = bboxes[keep]
-
-        return bboxes

model/faceDetector/s3fd/box_utils.py
DELETED
@@ -1,217 +0,0 @@
-import numpy as np
-from itertools import product as product
-import torch
-from torch.autograd import Function
-
-
-def nms_(dets, thresh):
-    """
-    Courtesy of Ross Girshick
-    [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]
-    """
-    x1 = dets[:, 0]
-    y1 = dets[:, 1]
-    x2 = dets[:, 2]
-    y2 = dets[:, 3]
-    scores = dets[:, 4]
-
-    areas = (x2 - x1) * (y2 - y1)
-    order = scores.argsort()[::-1]
-
-    keep = []
-    while order.size > 0:
-        i = order[0]
-        keep.append(int(i))
-        xx1 = np.maximum(x1[i], x1[order[1:]])
-        yy1 = np.maximum(y1[i], y1[order[1:]])
-        xx2 = np.minimum(x2[i], x2[order[1:]])
-        yy2 = np.minimum(y2[i], y2[order[1:]])
-
-        w = np.maximum(0.0, xx2 - xx1)
-        h = np.maximum(0.0, yy2 - yy1)
-        inter = w * h
-        ovr = inter / (areas[i] + areas[order[1:]] - inter)
-
-        inds = np.where(ovr <= thresh)[0]
-        order = order[inds + 1]
-
-    return np.array(keep).astype(np.int)
-
-
-def decode(loc, priors, variances):
-    """Decode locations from predictions using priors to undo
-    the encoding we did for offset regression at train time.
-    Args:
-        loc (tensor): location predictions for loc layers,
-            Shape: [num_priors,4]
-        priors (tensor): Prior boxes in center-offset form.
-            Shape: [num_priors,4].
-        variances: (list[float]) Variances of priorboxes
-    Return:
-        decoded bounding box predictions
-    """
-
-    boxes = torch.cat((
-        priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
-        priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
-    boxes[:, :2] -= boxes[:, 2:] / 2
-    boxes[:, 2:] += boxes[:, :2]
-    return boxes
-
-
-def nms(boxes, scores, overlap=0.5, top_k=200):
-    """Apply non-maximum suppression at test time to avoid detecting too many
-    overlapping bounding boxes for a given object.
-    Args:
-        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
-        scores: (tensor) The class predscores for the img, Shape:[num_priors].
-        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
-        top_k: (int) The Maximum number of box preds to consider.
-    Return:
-        The indices of the kept boxes with respect to num_priors.
-    """
-
-    keep = scores.new(scores.size(0)).zero_().long()
-    if boxes.numel() == 0:
-        return keep, 0
-    x1 = boxes[:, 0]
-    y1 = boxes[:, 1]
-    x2 = boxes[:, 2]
-    y2 = boxes[:, 3]
-    area = torch.mul(x2 - x1, y2 - y1)
-    v, idx = scores.sort(0)  # sort in ascending order
-    # I = I[v >= 0.01]
-    idx = idx[-top_k:]  # indices of the top-k largest vals
-    xx1 = boxes.new()
-    yy1 = boxes.new()
-    xx2 = boxes.new()
-    yy2 = boxes.new()
-    w = boxes.new()
-    h = boxes.new()
-
-    # keep = torch.Tensor()
-    count = 0
-    while idx.numel() > 0:
-        i = idx[-1]  # index of current largest val
-        # keep.append(i)
-        keep[count] = i
-        count += 1
-        if idx.size(0) == 1:
-            break
-        idx = idx[:-1]  # remove kept element from view
-        # load bboxes of next highest vals
-        torch.index_select(x1, 0, idx, out=xx1)
-        torch.index_select(y1, 0, idx, out=yy1)
-        torch.index_select(x2, 0, idx, out=xx2)
-        torch.index_select(y2, 0, idx, out=yy2)
-        # store element-wise max with next highest score
-        xx1 = torch.clamp(xx1, min=x1[i])
-        yy1 = torch.clamp(yy1, min=y1[i])
-        xx2 = torch.clamp(xx2, max=x2[i])
-        yy2 = torch.clamp(yy2, max=y2[i])
-        w.resize_as_(xx2)
-        h.resize_as_(yy2)
-        w = xx2 - xx1
-        h = yy2 - yy1
-        # check sizes of xx1 and xx2.. after each iteration
-        w = torch.clamp(w, min=0.0)
-        h = torch.clamp(h, min=0.0)
-        inter = w * h
-        # IoU = i / (area(a) + area(b) - i)
-        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas)
-        union = (rem_areas - inter) + area[i]
-        IoU = inter / union  # store result in iou
-        # keep only elements with an IoU <= overlap
-        idx = idx[IoU.le(overlap)]
-    return keep, count
-
-
-class Detect(object):
-
-    def __init__(self, num_classes=2,
-                 top_k=750, nms_thresh=0.3, conf_thresh=0.05,
-                 variance=[0.1, 0.2], nms_top_k=5000):
-
-        self.num_classes = num_classes
-        self.top_k = top_k
-        self.nms_thresh = nms_thresh
-        self.conf_thresh = conf_thresh
-        self.variance = variance
-        self.nms_top_k = nms_top_k
-
-    def forward(self, loc_data, conf_data, prior_data):
-
-        num = loc_data.size(0)
-        num_priors = prior_data.size(0)
-
-        conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
-        batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
-        batch_priors = batch_priors.contiguous().view(-1, 4)
-
-        decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
-        decoded_boxes = decoded_boxes.view(num, num_priors, 4)
-
-        output = torch.zeros(num, self.num_classes, self.top_k, 5)
-
-        for i in range(num):
-            boxes = decoded_boxes[i].clone()
-            conf_scores = conf_preds[i].clone()
-
-            for cl in range(1, self.num_classes):
-                c_mask = conf_scores[cl].gt(self.conf_thresh)
-                scores = conf_scores[cl][c_mask]
-
-                if scores.dim() == 0:
-                    continue
-                l_mask = c_mask.unsqueeze(1).expand_as(boxes)
-                boxes_ = boxes[l_mask].view(-1, 4)
-                ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
-                count = count if count < self.top_k else self.top_k
-
-                output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)
-
-        return output
-
-
-class PriorBox(object):
-
-    def __init__(self, input_size, feature_maps,
-                 variance=[0.1, 0.2],
-                 min_sizes=[16, 32, 64, 128, 256, 512],
-                 steps=[4, 8, 16, 32, 64, 128],
-                 clip=False):
-
-        super(PriorBox, self).__init__()
-
-        self.imh = input_size[0]
-        self.imw = input_size[1]
-        self.feature_maps = feature_maps
-
-        self.variance = variance
-        self.min_sizes = min_sizes
-        self.steps = steps
-        self.clip = clip
-
-    def forward(self):
-        mean = []
-        for k, fmap in enumerate(self.feature_maps):
-            feath = fmap[0]
-            featw = fmap[1]
-            for i, j in product(range(feath), range(featw)):
-                f_kw = self.imw / self.steps[k]
-                f_kh = self.imh / self.steps[k]
-
-                cx = (j + 0.5) / f_kw
-                cy = (i + 0.5) / f_kh
-
-                s_kw = self.min_sizes[k] / self.imw
-                s_kh = self.min_sizes[k] / self.imh
-
-                mean += [cx, cy, s_kw, s_kh]
-
-        output = torch.FloatTensor(mean).view(-1, 4)
-
-        if self.clip:
-            output.clamp_(max=1, min=0)
-
-        return output

model/faceDetector/s3fd/nets.py
DELETED
@@ -1,174 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.nn.init as init
-from .box_utils import Detect, PriorBox
-
-
-class L2Norm(nn.Module):
-
-    def __init__(self, n_channels, scale):
-        super(L2Norm, self).__init__()
-        self.n_channels = n_channels
-        self.gamma = scale or None
-        self.eps = 1e-10
-        self.weight = nn.Parameter(torch.Tensor(self.n_channels))
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        init.constant_(self.weight, self.gamma)
-
-    def forward(self, x):
-        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
-        x = torch.div(x, norm)
-        out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
-        return out
-
-
-class S3FDNet(nn.Module):
-
-    def __init__(self, device='cuda'):
-        super(S3FDNet, self).__init__()
-        self.device = device
-
-        self.vgg = nn.ModuleList([
-            nn.Conv2d(3, 64, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(64, 64, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.MaxPool2d(2, 2),
-
-            nn.Conv2d(64, 128, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(128, 128, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.MaxPool2d(2, 2),
-
-            nn.Conv2d(128, 256, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(256, 256, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(256, 256, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.MaxPool2d(2, 2, ceil_mode=True),
-
-            nn.Conv2d(256, 512, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(512, 512, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(512, 512, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.MaxPool2d(2, 2),
-
-            nn.Conv2d(512, 512, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(512, 512, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(512, 512, 3, 1, padding=1),
-            nn.ReLU(inplace=True),
-            nn.MaxPool2d(2, 2),
-
-            nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(1024, 1024, 1, 1),
-            nn.ReLU(inplace=True),
-        ])
-
-        self.L2Norm3_3 = L2Norm(256, 10)
-        self.L2Norm4_3 = L2Norm(512, 8)
-        self.L2Norm5_3 = L2Norm(512, 5)
-
-        self.extras = nn.ModuleList([
-            nn.Conv2d(1024, 256, 1, 1),
-            nn.Conv2d(256, 512, 3, 2, padding=1),
-            nn.Conv2d(512, 128, 1, 1),
-            nn.Conv2d(128, 256, 3, 2, padding=1),
-        ])
-
-        self.loc = nn.ModuleList([
-            nn.Conv2d(256, 4, 3, 1, padding=1),
-            nn.Conv2d(512, 4, 3, 1, padding=1),
-            nn.Conv2d(512, 4, 3, 1, padding=1),
-            nn.Conv2d(1024, 4, 3, 1, padding=1),
-            nn.Conv2d(512, 4, 3, 1, padding=1),
-            nn.Conv2d(256, 4, 3, 1, padding=1),
-        ])
-
-        self.conf = nn.ModuleList([
-            nn.Conv2d(256, 4, 3, 1, padding=1),
-            nn.Conv2d(512, 2, 3, 1, padding=1),
-            nn.Conv2d(512, 2, 3, 1, padding=1),
-            nn.Conv2d(1024, 2, 3, 1, padding=1),
-            nn.Conv2d(512, 2, 3, 1, padding=1),
-            nn.Conv2d(256, 2, 3, 1, padding=1),
-        ])
-
-        self.softmax = nn.Softmax(dim=-1)
-        self.detect = Detect()
-
-    def forward(self, x):
-        size = x.size()[2:]
-        sources = list()
-        loc = list()
-        conf = list()
-
-        for k in range(16):
-            x = self.vgg[k](x)
-        s = self.L2Norm3_3(x)
-        sources.append(s)
-
-        for k in range(16, 23):
-            x = self.vgg[k](x)
-        s = self.L2Norm4_3(x)
-        sources.append(s)
-
-        for k in range(23, 30):
-            x = self.vgg[k](x)
-        s = self.L2Norm5_3(x)
-        sources.append(s)
-
-        for k in range(30, len(self.vgg)):
-            x = self.vgg[k](x)
-        sources.append(x)
-
-        # apply extra layers and cache source layer outputs
-        for k, v in enumerate(self.extras):
-            x = F.relu(v(x), inplace=True)
-            if k % 2 == 1:
-                sources.append(x)
-
-        # apply multibox head to source layers
-        loc_x = self.loc[0](sources[0])
-        conf_x = self.conf[0](sources[0])
-
-        max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True)
-        conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1)
-
-        loc.append(loc_x.permute(0, 2, 3, 1).contiguous())
-        conf.append(conf_x.permute(0, 2, 3, 1).contiguous())
-
-        for i in range(1, len(sources)):
-            x = sources[i]
-            conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous())
-            loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous())
-
-        features_maps = []
-        for i in range(len(loc)):
-            feat = []
-            feat += [loc[i].size(1), loc[i].size(2)]
-            features_maps += [feat]
-
-        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
-        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
-
-        with torch.no_grad():
-            self.priorbox = PriorBox(size, features_maps)
-            self.priors = self.priorbox.forward()
-
-        output = self.detect.forward(
-            loc.view(loc.size(0), -1, 4),
-            self.softmax(conf.view(conf.size(0), -1, 2)),
-            self.priors.type(type(x.data)).to(self.device)
-        )
-
-        return output

model/loconet_encoder.py
DELETED
@@ -1,91 +0,0 @@
-import torch
-import torch.nn as nn
-
-# from model.visualEncoder import visualFrontend, visualTCN, visualConv1D
-from model.attentionLayer import attentionLayer
-from model.convLayer import ConvLayer
-from torchvggish import vggish
-from model.visualEncoder import visualFrontend, visualConv1D, visualTCN
-
-
-class locoencoder(nn.Module):
-
-    def __init__(self, cfg):
-        super(locoencoder, self).__init__()
-        self.cfg = cfg
-        # Visual Temporal Encoder
-        self.visualFrontend = visualFrontend(cfg)  # Visual Frontend
-        self.visualTCN = visualTCN()  # Visual Temporal Network TCN
-        self.visualConv1D = visualConv1D()  # Visual Temporal Network Conv1d
-
-        urls = {
-            'vggish':
-                "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth"
-        }
-        self.audioEncoder = vggish.VGGish(urls, preprocess=False, postprocess=False)
-        self.audio_pool = nn.AdaptiveAvgPool1d(1)
-
-        # Audio-visual Cross Attention
-        self.crossA2V = attentionLayer(d_model=128, nhead=8)
-        self.crossV2A = attentionLayer(d_model=128, nhead=8)
-
-        # Audio-visual Self Attention
-
-        num_layers = self.cfg.MODEL.AV_layers
-        layers = nn.ModuleList()
-        for i in range(num_layers):
-            layers.append(ConvLayer(cfg))
-            layers.append(attentionLayer(d_model=256, nhead=8))
-        self.convAV = layers
-
-    def forward_visual_frontend(self, x):
-
-        B, T, W, H = x.shape
-        x = x.view(B * T, 1, 1, W, H)
-        x = (x / 255 - 0.4161) / 0.1688
-        x = self.visualFrontend(x)
-        x = x.view(B, T, 512)
-        x = x.transpose(1, 2)
-        x = self.visualTCN(x)
-        x = self.visualConv1D(x)
-        x = x.transpose(1, 2)
-        return x
-
-    def forward_audio_frontend(self, x):
-        t = x.shape[-2]
-        numFrames = t // 4
-        pad = 8 - (t % 8)
-        x = torch.nn.functional.pad(x, (0, 0, 0, pad), "constant")
-        # x = x.unsqueeze(1).transpose(2, 3)
-        x = self.audioEncoder(x)
-
-        b, c, t2, freq = x.shape
-        x = x.view(b * c, t2, freq)
-        x = self.audio_pool(x)
-        x = x.view(b, c, t2)[:, :, :numFrames]
-        x = x.permute(0, 2, 1)
-        return x
-
-    def forward_cross_attention(self, x1, x2):
-        x1_c = self.crossA2V(src=x1, tar=x2, adjust=self.cfg.MODEL.ADJUST_ATTENTION)
-        x2_c = self.crossV2A(src=x2, tar=x1, adjust=self.cfg.MODEL.ADJUST_ATTENTION)
-        return x1_c, x2_c
-
-    def forward_audio_visual_backend(self, x1, x2, b=1, s=1):
-        x = torch.cat((x1, x2), 2)  # B*S, T, 2C
-        for i, layer in enumerate(self.convAV):
-            if i % 2 == 0:
-                x, b, s = layer(x, b, s)
-            else:
-                x = layer(src=x, tar=x)
-
-        x = torch.reshape(x, (-1, 256))
-        return x
-
-    def forward_audio_backend(self, x):
-        x = torch.reshape(x, (-1, 128))
-        return x
-
-    def forward_visual_backend(self, x):
-        x = torch.reshape(x, (-1, 128))
-        return x

model/transformer/__pycache__/position_encoding.cpython-37.pyc
DELETED
Binary file (1.23 kB)

model/transformer/__pycache__/transformer.cpython-37.pyc
DELETED
Binary file (8.81 kB)

model/transformer/__pycache__/utils.cpython-37.pyc
DELETED
Binary file (1.05 kB)

model/transformer/position_encoding.py
DELETED
@@ -1,28 +0,0 @@
-##########################################################################
-# We adopt the positional encoding method from PyTorch Turorial.
-# Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
-##########################################################################
-import math
-
-import torch
-import torch.nn as nn
-
-
-class PositionalEncoding(nn.Module):
-
-    def __init__(self, d_model, dropout=0.1, max_len=5000):
-        super(PositionalEncoding, self).__init__()
-
-        self.dropout = nn.Dropout(p=dropout)
-
-        pe = torch.zeros(max_len, d_model)
-        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
-        pe[:, 0::2] = torch.sin(position * div_term)
-        pe[:, 1::2] = torch.cos(position * div_term)
-        pe = pe.unsqueeze(0).transpose(0, 1)
-        self.register_buffer('pe', pe)
-
-    def forward(self, x, padding=0):
-        x = x + self.pe[padding:padding + x.shape[0], :]
-        return self.dropout(x)

model/transformer/transformer.py
DELETED
@@ -1,334 +0,0 @@
-import copy
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class DotProductAttention(nn.Module):
-
-    def __init__(self, dropout=0.0):
-        super(DotProductAttention, self).__init__()
-
-        self.dropout = dropout
-
-    def forward(self, q, k, v, attn_mask=None):
-        attn_output_weights = torch.bmm(q, k.transpose(1, 2))
-
-        if attn_mask is not None:
-            attn_output_weights += attn_mask
-
-        attn_output_weights = F.softmax(attn_output_weights, dim=-1)
-        attn_output_weights = F.dropout(attn_output_weights, p=self.dropout, training=self.training)
-        attn_output = torch.bmm(attn_output_weights, v)
-        return attn_output
-
-
-class MultiheadAttention(nn.Module):
-
-    def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, kdim=None, vdim=None):
-        super(MultiheadAttention, self).__init__()
-
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.kdim = kdim if kdim is not None else embed_dim
-        self.vdim = vdim if vdim is not None else embed_dim
-        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
-
-        if self._qkv_same_embed_dim:
-            self.in_proj_weight = nn.Parameter(torch.empty(3 * embed_dim, embed_dim))
-        else:
-            raise RuntimeError('Do not support q, k, v have different dimensions')
-
-        if bias:
-            self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim))
-        else:
-            self.register_parameter('in_proj_bias', None)
-
-        self.out_proj = nn.Linear(embed_dim, embed_dim)
-
-        if self._qkv_same_embed_dim:
-            nn.init.xavier_uniform_(self.in_proj_weight)
-
-        if self.in_proj_bias is not None:
-            nn.init.constant_(self.in_proj_bias, 0.)
-            nn.init.constant_(self.out_proj.bias, 0.)
-
-        self.dotproductattention = DotProductAttention(dropout)
-
-    def forward(self, q, k, v, attn_mask=None, key_padding_mask=None):
-        tsz, bsz, embed_dim = q.shape[0], q.shape[1], q.shape[2]
-
-        head_dim = embed_dim // self.num_heads
-        assert head_dim * self.num_heads == embed_dim, \
-            'embed_dim must be divisible by num_heads'
-        scaling = float(head_dim)**-0.5
-
-        _b = self.in_proj_bias
-        _start = None
-        _end = embed_dim
-        _w = self.in_proj_weight[:_end, :]
-        if _b is not None:
-            _b = _b[:_end]
-        q = F.linear(q, _w, _b)
-
-        _b = self.in_proj_bias
-        _start = embed_dim
-        _end = embed_dim * 2
-        _w = self.in_proj_weight[_start:_end, :]
-        if _b is not None:
-            _b = _b[_start:_end]
-        k = F.linear(k, _w, _b)
-
-        _b = self.in_proj_bias
-        _start = embed_dim * 2
-        _end = None
-        _w = self.in_proj_weight[_start:, :]
-        if _b is not None:
-            _b = _b[_start:]
-        v = F.linear(v, _w, _b)
-
-        q = q * scaling
-
-        q = q.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
-        k = k.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
-        v = v.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
-
-        if attn_mask is not None:
-            attn_mask = attn_mask.unsqueeze(0).repeat(bsz, 1, 1)
-            attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
-            attn_mask = attn_mask.reshape(-1, *attn_mask.shape[2:])
-
-        if key_padding_mask is not None:
-            key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, tsz, 1)
-            key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
-            key_padding_mask = key_padding_mask.reshape(-1, *key_padding_mask.shape[2:])
-
-        if attn_mask is not None and key_padding_mask is not None:
-            mask = attn_mask + key_padding_mask
-        elif attn_mask is not None:
-            mask = attn_mask
-        elif key_padding_mask is not None:
-            mask = key_padding_mask
-        else:
-            mask = None
-
-        attn_output = self.dotproductattention(q, k, v, mask)
-        attn_output = attn_output.transpose(0, 1).contiguous().view(tsz, bsz, self.embed_dim)
-        return self.out_proj(attn_output), None
-
-
-class Transformer(nn.Module):
-
-    def __init__(self,
-                 d_model=512,
-                 nhead=8,
-                 num_encoder_layers=6,
-                 num_decoder_layers=6,
-                 dim_feedforward=2048,
-                 dropout=0.1,
-                 activation='relu',
-                 custom_encoder=None,
-                 custom_decoder=None):
-        super(Transformer, self).__init__()
-
-        if custom_encoder is not None:
-            self.encoder = custom_encoder
-        else:
-            encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,
-                                                    activation)
-            encoder_norm = nn.LayerNorm(d_model)
-            self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
-
-        if custom_decoder is not None:
-            self.decoder = custom_decoder
-        else:
-            decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout,
-                                                    activation)
-            decoder_norm = nn.LayerNorm(d_model)
-            self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
-
-        self.d_model = d_model
-        self.nhead = nhead
-
-    def forward(self,
-                src,
-                tgt,
-                src_mask=None,
-                tgt_mask=None,
-                memory_mask=None,
-                src_key_padding_mask=None,
-                tgt_key_padding_mask=None,
-                memory_key_padding_mask=None):
-        if src.size(1) != tgt.size(1):
-            raise RuntimeError('the batch number of src and tgt must be equal')
-
-        if src.size(2) != self.d_model or tgt.size(2) != self.d_model:
-            raise RuntimeError('the feature number of src and tgt must be equal to d_model')
-
-        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
-        output = self.decoder(tgt,
-                              memory,
-                              tgt_mask=tgt_mask,
-                              memory_mask=memory_mask,
-                              tgt_key_padding_mask=tgt_key_padding_mask,
-                              memory_key_padding_mask=memory_key_padding_mask)
-        return output
-
-
-class TransformerEncoder(nn.Module):
-
-    def __init__(self, encoder_layer, num_layers, norm=None):
-        super(TransformerEncoder, self).__init__()
-
-        self.layers = _get_clones(encoder_layer, num_layers)
-        self.num_layers = num_layers
-        self.norm = norm
-
-    def forward(self, src, src_mask=None, src_key_padding_mask=None):
-        output = src
-
-        for mod in self.layers:
-            output = mod(output, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
-
-        if self.norm is not None:
-            output = self.norm(output)
-
-        return output
-
-
-class TransformerDecoder(nn.Module):
-
-    def __init__(self, decoder_layer, num_layers, norm=None):
-        super(TransformerDecoder, self).__init__()
-
-        self.layers = _get_clones(decoder_layer, num_layers)
-        self.num_layers = num_layers
-        self.norm = norm
-
-    def forward(self,
-                tgt,
-                memory,
-                tgt_mask=None,
-                memory_mask=None,
-                tgt_key_padding_mask=None,
-                memory_key_padding_mask=None):
-        output = tgt
-
-        for mod in self.layers:
-            output = mod(output,
-                         memory,
-                         tgt_mask=tgt_mask,
-                         memory_mask=memory_mask,
-                         tgt_key_padding_mask=tgt_key_padding_mask,
-                         memory_key_padding_mask=memory_key_padding_mask)
-
-        if self.norm is not None:
-            output = self.norm(output)
-
-        return output
-
-
-class TransformerEncoderLayer(nn.Module):
-
-    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'):
-        super(TransformerEncoderLayer, self).__init__()
-
-        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-
-        # Implementation of Feedforward model
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-
-        self.activation = _get_activation_fn(activation)
-
-    def __setstate__(self, state):
-        if 'activation' not in state:
-            state['activation'] = F.relu
-        super(TransformerEncoderLayer, self).__setstate__(state)
-
-    def forward(self, src, src_mask=None, src_key_padding_mask=None):
-        src2 = self.self_attn(src,
-                              src,
-                              src,
-                              attn_mask=src_mask,
-                              key_padding_mask=src_key_padding_mask)[0]
-        src = src + self.dropout1(src2)
-        src = self.norm1(src)
-        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-        src = src + self.dropout2(src2)
-        src = self.norm2(src)
-        return src
-
-
-class TransformerDecoderLayer(nn.Module):
-
-    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'):
-        super(TransformerDecoderLayer, self).__init__()
-
-        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-        self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-
-        # Implementation of Feedforward model
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-        self.dropout3 = nn.Dropout(dropout)
-
-        self.activation = _get_activation_fn(activation)
-
-    def __setstate__(self, state):
-        if 'activation' not in state:
-            state['activation'] = F.relu
-        super(TransformerDecoderLayer, self).__setstate__(state)
-
-    def forward(self,
-                tgt,
-                memory,
-                tgt_mask=None,
-                memory_mask=None,
-                tgt_key_padding_mask=None,
-                memory_key_padding_mask=None):
-        tgt2 = self.self_attn(tgt,
-                              tgt,
-                              tgt,
-                              attn_mask=tgt_mask,
-                              key_padding_mask=tgt_key_padding_mask)[0]
-        tgt = tgt + self.dropout1(tgt2)
-        tgt = self.norm1(tgt)
-        tgt2 = self.multihead_attn(tgt,
-                                   memory,
-                                   memory,
-                                   attn_mask=memory_mask,
-                                   key_padding_mask=memory_key_padding_mask)[0]
-        tgt = tgt + self.dropout2(tgt2)
-        tgt = self.norm2(tgt)
-        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
-        tgt = tgt + self.dropout3(tgt2)
-        tgt = self.norm3(tgt)
-        return tgt
-
-
-def _get_clones(module, N):
-    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
-
-
-def _get_activation_fn(activation):
-    if activation == 'relu':
-        return F.relu
-    elif activation == 'gelu':
-        return F.gelu
-
-    raise RuntimeError('activation should be relu/gelu, not {}'.format(activation))

model/transformer/utils.py
DELETED
@@ -1,22 +0,0 @@
-import torch
-assert torch.__version__ >= '1.6.0'
-import torch.nn as nn
-import numpy as np
-
-
-def layer_norm(d_model, condition=True):
-    return nn.LayerNorm(d_model) if condition else None
-
-
-def generate_square_subsequent_mask(sz):
-    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-    return mask
-
-
-def generate_proposal_mask(T, B):
-    mask = torch.zeros(T, (T + 1) * T // 2)
-    for sz, idx in zip(range(1, T + 1), np.cumsum(range(T))):
-        mask[:sz, idx: idx + sz] = torch.fliplr(torch.tril(torch.ones(sz, sz)))
-    mask = mask.unsqueeze(1).repeat(1, B, 1)
-    return mask

model/visualEncoder.py
DELETED
@@ -1,199 +0,0 @@
-##
-# ResNet18 Pretrained network to extract lip embedding
-# This code is modified based on https://github.com/lordmartian/deep_avsr
-##
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from model.attentionLayer import attentionLayer
-
-
-class ResNetLayer(nn.Module):
-    """
-    A ResNet layer used to build the ResNet network.
-    Architecture:
-    --> conv-bn-relu -> conv -> + -> bn-relu -> conv-bn-relu -> conv -> + -> bn-relu -->
-                 |              |                    |                  |
-                 -----> downsample ------>           ------------------------------------->
-    """
-
-    def __init__(self, inplanes, outplanes, stride):
-        super(ResNetLayer, self).__init__()
-        self.conv1a = nn.Conv2d(inplanes,
-                                outplanes,
-                                kernel_size=3,
-                                stride=stride,
-                                padding=1,
-                                bias=False)
-        self.bn1a = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-        self.conv2a = nn.Conv2d(outplanes,
-                                outplanes,
-                                kernel_size=3,
-                                stride=1,
-                                padding=1,
-                                bias=False)
-        self.stride = stride
-        if self.stride != 1:
-            self.downsample = nn.Conv2d(inplanes,
-                                        outplanes,
-                                        kernel_size=(1, 1),
-                                        stride=stride,
-                                        bias=False)
-        self.outbna = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-
-        self.conv1b = nn.Conv2d(outplanes,
-                                outplanes,
-                                kernel_size=3,
-                                stride=1,
-                                padding=1,
-                                bias=False)
-        self.bn1b = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-        self.conv2b = nn.Conv2d(outplanes,
-                                outplanes,
-                                kernel_size=3,
-                                stride=1,
-                                padding=1,
-                                bias=False)
-        self.outbnb = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-        return
-
-    def forward(self, inputBatch):
-        batch = F.relu(self.bn1a(self.conv1a(inputBatch)))
-        batch = self.conv2a(batch)
-        if self.stride == 1:
-            residualBatch = inputBatch
-        else:
-            residualBatch = self.downsample(inputBatch)
-        batch = batch + residualBatch
-        intermediateBatch = batch
-        batch = F.relu(self.outbna(batch))
-
-        batch = F.relu(self.bn1b(self.conv1b(batch)))
-        batch = self.conv2b(batch)
-        residualBatch = intermediateBatch
-        batch = batch + residualBatch
-        outputBatch = F.relu(self.outbnb(batch))
-        return outputBatch
-
-
-class ResNet(nn.Module):
-    """
-    An 18-layer ResNet architecture.
-    """
-
-    def __init__(self):
-        super(ResNet, self).__init__()
-        self.layer1 = ResNetLayer(64, 64, stride=1)
-        self.layer2 = ResNetLayer(64, 128, stride=2)
-        self.layer3 = ResNetLayer(128, 256, stride=2)
-        self.layer4 = ResNetLayer(256, 512, stride=2)
-        self.avgpool = nn.AvgPool2d(kernel_size=(4, 4), stride=(1, 1))
-
-        return
-
-    def forward(self, inputBatch):
-        batch = self.layer1(inputBatch)
-        batch = self.layer2(batch)
-        batch = self.layer3(batch)
-        batch = self.layer4(batch)
-        outputBatch = self.avgpool(batch)
-        return outputBatch
-
-
-class GlobalLayerNorm(nn.Module):
-
-    def __init__(self, channel_size):
-        super(GlobalLayerNorm, self).__init__()
-        self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
-        self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        self.gamma.data.fill_(1)
-        self.beta.data.zero_()
-
-    def forward(self, y):
-        mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)  #[M, 1, 1]
-        var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
-        gLN_y = self.gamma * (y - mean) / torch.pow(var + 1e-8, 0.5) + self.beta
-        return gLN_y
-
-
-class visualFrontend(nn.Module):
-    """
-    A visual feature extraction module. Generates a 512-dim feature vector per video frame.
-    Architecture: A 3D convolution block followed by an 18-layer ResNet.
-    """
-
-    def __init__(self, cfg):
-        self.cfg = cfg
-        super(visualFrontend, self).__init__()
-        self.frontend3D = nn.Sequential(
-            nn.Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3),
-                      bias=False), nn.BatchNorm3d(64, momentum=0.01, eps=0.001), nn.ReLU(),
-            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)))
-        self.resnet = ResNet()
-        return
-
-    def forward(self, inputBatch):
-        inputBatch = inputBatch.transpose(0, 1).transpose(1, 2)
-        batchsize = inputBatch.shape[0]
-        batch = self.frontend3D(inputBatch)
-
-        batch = batch.transpose(1, 2)
-        batch = batch.reshape(batch.shape[0] * batch.shape[1], batch.shape[2], batch.shape[3],
-                              batch.shape[4])
-        outputBatch = self.resnet(batch)
-        outputBatch = outputBatch.reshape(batchsize, -1, 512)
-        outputBatch = outputBatch.transpose(1, 2)
-        outputBatch = outputBatch.transpose(1, 2).transpose(0, 1)
-        return outputBatch
-
-
-class DSConv1d(nn.Module):
-
-    def __init__(self):
-        super(DSConv1d, self).__init__()
-        self.net = nn.Sequential(
-            nn.ReLU(),
-            nn.BatchNorm1d(512),
-            nn.Conv1d(512, 512, 3, stride=1, padding=1, dilation=1, groups=512, bias=False),
-            nn.PReLU(),
-            GlobalLayerNorm(512),
-            nn.Conv1d(512, 512, 1, bias=False),
-        )
-
-    def forward(self, x):
-        out = self.net(x)
-        return out + x
-
-
-class visualTCN(nn.Module):
-
-    def __init__(self):
-        super(visualTCN, self).__init__()
-        stacks = []
-        for x in range(5):
-            stacks += [DSConv1d()]
-        self.net = nn.Sequential(*stacks)  # Visual Temporal Network V-TCN
-
-    def forward(self, x):
-        out = self.net(x)
-        return out
-
-
-class visualConv1D(nn.Module):
-
-    def __init__(self):
-        super(visualConv1D, self).__init__()
-        self.net = nn.Sequential(
-            nn.Conv1d(512, 256, 5, stride=1, padding=2),
-            nn.BatchNorm1d(256),
-            nn.ReLU(),
-            nn.Conv1d(256, 128, 1),
-        )
-
-    def forward(self, x):
-        out = self.net(x)
-        return out