Superxixixi committed on
Commit 182c3ce · 1 Parent(s): 29ac319

Delete model

model/.DS_Store DELETED
Binary file (6.15 kB)
 
model/__init__.py DELETED
@@ -1,5 +0,0 @@
- from model.transformer.position_encoding import PositionalEncoding
- from model.transformer.transformer import Transformer
- from model.transformer.transformer import TransformerEncoder, TransformerEncoderLayer
- from model.transformer.transformer import TransformerDecoder, TransformerDecoderLayer
- from model.transformer.utils import layer_norm, generate_square_subsequent_mask, generate_proposal_mask
 
model/__pycache__/__init__.cpython-36.pyc DELETED
Binary file (561 Bytes)
 
model/__pycache__/__init__.cpython-37.pyc DELETED
Binary file (573 Bytes)
 
model/__pycache__/attentionLayer.cpython-37.pyc DELETED
Binary file (1.38 kB)
 
model/__pycache__/convLayer.cpython-37.pyc DELETED
Binary file (1.32 kB)
 
model/__pycache__/loconet_encoder.cpython-37.pyc DELETED
Binary file (3.21 kB)
 
model/__pycache__/position_encoding.cpython-36.pyc DELETED
Binary file (1.26 kB)
 
model/__pycache__/talkNetModel.cpython-37.pyc DELETED
Binary file (6.33 kB)
 
model/__pycache__/transformer.cpython-36.pyc DELETED
Binary file (8.84 kB)
 
model/__pycache__/utils.cpython-36.pyc DELETED
Binary file (1.08 kB)
 
model/__pycache__/visualEncoder.cpython-37.pyc DELETED
Binary file (6.53 kB)
 
model/attentionLayer.py DELETED
@@ -1,39 +0,0 @@
- import torch
- import torch.nn as nn
- from torch.nn import functional as F
- from torch.nn import MultiheadAttention
-
-
- class attentionLayer(nn.Module):
-
-     def __init__(self, d_model, nhead, dropout=0.1):
-         super(attentionLayer, self).__init__()
-         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-
-         self.linear1 = nn.Linear(d_model, d_model * 4)
-         self.dropout = nn.Dropout(dropout)
-         self.linear2 = nn.Linear(d_model * 4, d_model)
-
-         self.norm1 = nn.LayerNorm(d_model)
-         self.norm2 = nn.LayerNorm(d_model)
-         self.dropout1 = nn.Dropout(dropout)
-         self.dropout2 = nn.Dropout(dropout)
-
-         self.activation = F.relu
-
-     def forward(self, src, tar, adjust=False, attn_mask=None):
-         # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor
-         src = src.transpose(0, 1) # B, T, C -> T, B, C
-         tar = tar.transpose(0, 1) # B, T, C -> T, B, C
-         if adjust:
-             src2 = self.self_attn(src, tar, tar, attn_mask=None, key_padding_mask=None)[0]
-         else:
-             src2 = self.self_attn(tar, src, src, attn_mask=None, key_padding_mask=None)[0]
-         src = src + self.dropout1(src2)
-         src = self.norm1(src)
-
-         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-         src = src + self.dropout2(src2)
-         src = self.norm2(src)
-         src = src.transpose(0, 1) # T, B, C -> B, T, C
-         return src
 
model/audioEncoder.py DELETED
@@ -1,108 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
- class SEBasicBlock(nn.Module):
-     expansion = 1
-
-     def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
-         super(SEBasicBlock, self).__init__()
-         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-         self.bn1 = nn.BatchNorm2d(planes)
-         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
-         self.bn2 = nn.BatchNorm2d(planes)
-         self.relu = nn.ReLU(inplace=True)
-         self.se = SELayer(planes, reduction)
-         self.downsample = downsample
-         self.stride = stride
-
-     def forward(self, x):
-         residual = x
-
-         out = self.conv1(x)
-         out = self.relu(out)
-         out = self.bn1(out)
-
-         out = self.conv2(out)
-         out = self.bn2(out)
-         out = self.se(out)
-
-         if self.downsample is not None:
-             residual = self.downsample(x)
-
-         out += residual
-         out = self.relu(out)
-         return out
-
- class SELayer(nn.Module):
-     def __init__(self, channel, reduction=8):
-         super(SELayer, self).__init__()
-         self.avg_pool = nn.AdaptiveAvgPool2d(1)
-         self.fc = nn.Sequential(
-             nn.Linear(channel, channel // reduction),
-             nn.ReLU(inplace=True),
-             nn.Linear(channel // reduction, channel),
-             nn.Sigmoid()
-         )
-
-     def forward(self, x):
-         b, c, _, _ = x.size()
-         y = self.avg_pool(x).view(b, c)
-         y = self.fc(y).view(b, c, 1, 1)
-         return x * y
-
- class audioEncoder(nn.Module):
-     def __init__(self, layers, num_filters, **kwargs):
-         super(audioEncoder, self).__init__()
-         block = SEBasicBlock
-         self.inplanes = num_filters[0]
-
-         self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=7, stride=(2, 1), padding=3,
-                                bias=False)
-         self.bn1 = nn.BatchNorm2d(num_filters[0])
-         self.relu = nn.ReLU(inplace=True)
-
-         self.layer1 = self._make_layer(block, num_filters[0], layers[0])
-         self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2))
-         self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2))
-         self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1))
-         out_dim = num_filters[3] * block.expansion
-
-         for m in self.modules():
-             if isinstance(m, nn.Conv2d):
-                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-             elif isinstance(m, nn.BatchNorm2d):
-                 nn.init.constant_(m.weight, 1)
-                 nn.init.constant_(m.bias, 0)
-
-     def _make_layer(self, block, planes, blocks, stride=1):
-         downsample = None
-         if stride != 1 or self.inplanes != planes * block.expansion:
-             downsample = nn.Sequential(
-                 nn.Conv2d(self.inplanes, planes * block.expansion,
-                           kernel_size=1, stride=stride, bias=False),
-                 nn.BatchNorm2d(planes * block.expansion),
-             )
-
-         layers = []
-         layers.append(block(self.inplanes, planes, stride, downsample))
-         self.inplanes = planes * block.expansion
-         for i in range(1, blocks):
-             layers.append(block(self.inplanes, planes))
-
-         return nn.Sequential(*layers)
-
-     def forward(self, x):
-         x = self.conv1(x)
-         x = self.bn1(x)
-         x = self.relu(x)
-
-         x = self.layer1(x)
-         x = self.layer2(x)
-         x = self.layer3(x)
-         x = self.layer4(x)
-         x = torch.mean(x, dim=2, keepdim=True)
-         x = x.view((x.size()[0], x.size()[1], -1))
-         x = x.transpose(1, 2)
-
-         return x
 
model/convLayer.py DELETED
@@ -1,42 +0,0 @@
- import torch
- import torch.nn as nn
- from torch.nn import functional as F
-
-
- class ConvLayer(nn.Module):
-
-     def __init__(self, cfg):
-         super(ConvLayer, self).__init__()
-         self.cfg = cfg
-         self.s = cfg.MODEL.NUM_SPEAKERS
-         self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (self.s, 7), padding=(0, 3))
-         # below line is speaker parallel 93.88 code
-         # self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (3, 7), padding=(0, 3))
-         self.ln = torch.nn.LayerNorm(256)
-         self.conv2d_1x1 = torch.nn.Conv2d(256, 512, (1, 1), padding=(0, 0))
-         self.conv2d_1x1_2 = torch.nn.Conv2d(512, 256, (1, 1), padding=(0, 0))
-         self.gelu = nn.GELU()
-
-     def forward(self, x, b, s):
-
-         identity = x # b*s, t, c
-         t = x.shape[1]
-         c = x.shape[2]
-         out = x.view(b, s, t, c)
-         out = out.permute(0, 3, 1, 2) # b, c, s, t
-
-         out = self.conv2d(out) # b, s*c, 1, t
-         out = out.view(b, c, s, t)
-         out = out.permute(0, 2, 3, 1) # b, s, t, c
-         out = self.ln(out)
-         out = out.permute(0, 3, 1, 2)
-         out = self.conv2d_1x1(out)
-         out = self.gelu(out)
-         out = self.conv2d_1x1_2(out) # b, c, s, t
-
-         out = out.permute(0, 2, 3, 1) # b, s, t, c
-         out = out.view(b * s, t, c)
-
-         out += identity
-
-         return out, b, s
 
model/faceDetector/README.md DELETED
@@ -1,3 +0,0 @@
- # Face detector
-
- This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.
 
model/faceDetector/__init__.py DELETED
@@ -1 +0,0 @@
- from .s3fd import S3FD
 
model/faceDetector/s3fd/__init__.py DELETED
@@ -1,66 +0,0 @@
- import time, os, sys, subprocess
- import numpy as np
- import cv2
- import torch
- from torchvision import transforms
- from .nets import S3FDNet
- from .box_utils import nms_
-
- PATH_WEIGHT = 'model/faceDetector/s3fd/sfd_face.pth'
- if os.path.isfile(PATH_WEIGHT) == False:
-     Link = "1KafnHz7ccT-3IyddBsL5yi2xGtxAKypt"
-     cmd = "gdown --id %s -O %s"%(Link, PATH_WEIGHT)
-     subprocess.call(cmd, shell=True, stdout=None)
- img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
-
-
- class S3FD():
-
-     def __init__(self, device='cuda'):
-
-         tstamp = time.time()
-         self.device = device
-
-         # print('[S3FD] loading with', self.device)
-         self.net = S3FDNet(device=self.device).to(self.device)
-         PATH = os.path.join(os.getcwd(), PATH_WEIGHT)
-         state_dict = torch.load(PATH, map_location=self.device)
-         self.net.load_state_dict(state_dict)
-         self.net.eval()
-         # print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))
-
-     def detect_faces(self, image, conf_th=0.8, scales=[1]):
-
-         w, h = image.shape[1], image.shape[0]
-
-         bboxes = np.empty(shape=(0, 5))
-
-         with torch.no_grad():
-             for s in scales:
-                 scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)
-
-                 scaled_img = np.swapaxes(scaled_img, 1, 2)
-                 scaled_img = np.swapaxes(scaled_img, 1, 0)
-                 scaled_img = scaled_img[[2, 1, 0], :, :]
-                 scaled_img = scaled_img.astype('float32')
-                 scaled_img -= img_mean
-                 scaled_img = scaled_img[[2, 1, 0], :, :]
-                 x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
-                 y = self.net(x)
-
-                 detections = y.data
-                 scale = torch.Tensor([w, h, w, h])
-
-                 for i in range(detections.size(1)):
-                     j = 0
-                     while detections[0, i, j, 0] > conf_th:
-                         score = detections[0, i, j, 0]
-                         pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
-                         bbox = (pt[0], pt[1], pt[2], pt[3], score)
-                         bboxes = np.vstack((bboxes, bbox))
-                         j += 1
-
-         keep = nms_(bboxes, 0.1)
-         bboxes = bboxes[keep]
-
-         return bboxes
 
model/faceDetector/s3fd/box_utils.py DELETED
@@ -1,217 +0,0 @@
- import numpy as np
- from itertools import product as product
- import torch
- from torch.autograd import Function
-
-
- def nms_(dets, thresh):
-     """
-     Courtesy of Ross Girshick
-     [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]
-     """
-     x1 = dets[:, 0]
-     y1 = dets[:, 1]
-     x2 = dets[:, 2]
-     y2 = dets[:, 3]
-     scores = dets[:, 4]
-
-     areas = (x2 - x1) * (y2 - y1)
-     order = scores.argsort()[::-1]
-
-     keep = []
-     while order.size > 0:
-         i = order[0]
-         keep.append(int(i))
-         xx1 = np.maximum(x1[i], x1[order[1:]])
-         yy1 = np.maximum(y1[i], y1[order[1:]])
-         xx2 = np.minimum(x2[i], x2[order[1:]])
-         yy2 = np.minimum(y2[i], y2[order[1:]])
-
-         w = np.maximum(0.0, xx2 - xx1)
-         h = np.maximum(0.0, yy2 - yy1)
-         inter = w * h
-         ovr = inter / (areas[i] + areas[order[1:]] - inter)
-
-         inds = np.where(ovr <= thresh)[0]
-         order = order[inds + 1]
-
-     return np.array(keep).astype(np.int)
-
-
- def decode(loc, priors, variances):
-     """Decode locations from predictions using priors to undo
-     the encoding we did for offset regression at train time.
-     Args:
-         loc (tensor): location predictions for loc layers,
-             Shape: [num_priors,4]
-         priors (tensor): Prior boxes in center-offset form.
-             Shape: [num_priors,4].
-         variances: (list[float]) Variances of priorboxes
-     Return:
-         decoded bounding box predictions
-     """
-
-     boxes = torch.cat((
-         priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
-         priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
-     boxes[:, :2] -= boxes[:, 2:] / 2
-     boxes[:, 2:] += boxes[:, :2]
-     return boxes
-
-
- def nms(boxes, scores, overlap=0.5, top_k=200):
-     """Apply non-maximum suppression at test time to avoid detecting too many
-     overlapping bounding boxes for a given object.
-     Args:
-         boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
-         scores: (tensor) The class predscores for the img, Shape:[num_priors].
-         overlap: (float) The overlap thresh for suppressing unnecessary boxes.
-         top_k: (int) The Maximum number of box preds to consider.
-     Return:
-         The indices of the kept boxes with respect to num_priors.
-     """
-
-     keep = scores.new(scores.size(0)).zero_().long()
-     if boxes.numel() == 0:
-         return keep, 0
-     x1 = boxes[:, 0]
-     y1 = boxes[:, 1]
-     x2 = boxes[:, 2]
-     y2 = boxes[:, 3]
-     area = torch.mul(x2 - x1, y2 - y1)
-     v, idx = scores.sort(0) # sort in ascending order
-     # I = I[v >= 0.01]
-     idx = idx[-top_k:] # indices of the top-k largest vals
-     xx1 = boxes.new()
-     yy1 = boxes.new()
-     xx2 = boxes.new()
-     yy2 = boxes.new()
-     w = boxes.new()
-     h = boxes.new()
-
-     # keep = torch.Tensor()
-     count = 0
-     while idx.numel() > 0:
-         i = idx[-1] # index of current largest val
-         # keep.append(i)
-         keep[count] = i
-         count += 1
-         if idx.size(0) == 1:
-             break
-         idx = idx[:-1] # remove kept element from view
-         # load bboxes of next highest vals
-         torch.index_select(x1, 0, idx, out=xx1)
-         torch.index_select(y1, 0, idx, out=yy1)
-         torch.index_select(x2, 0, idx, out=xx2)
-         torch.index_select(y2, 0, idx, out=yy2)
-         # store element-wise max with next highest score
-         xx1 = torch.clamp(xx1, min=x1[i])
-         yy1 = torch.clamp(yy1, min=y1[i])
-         xx2 = torch.clamp(xx2, max=x2[i])
-         yy2 = torch.clamp(yy2, max=y2[i])
-         w.resize_as_(xx2)
-         h.resize_as_(yy2)
-         w = xx2 - xx1
-         h = yy2 - yy1
-         # check sizes of xx1 and xx2.. after each iteration
-         w = torch.clamp(w, min=0.0)
-         h = torch.clamp(h, min=0.0)
-         inter = w * h
-         # IoU = i / (area(a) + area(b) - i)
-         rem_areas = torch.index_select(area, 0, idx) # load remaining areas)
-         union = (rem_areas - inter) + area[i]
-         IoU = inter / union # store result in iou
-         # keep only elements with an IoU <= overlap
-         idx = idx[IoU.le(overlap)]
-     return keep, count
-
-
- class Detect(object):
-
-     def __init__(self, num_classes=2,
-                  top_k=750, nms_thresh=0.3, conf_thresh=0.05,
-                  variance=[0.1, 0.2], nms_top_k=5000):
-
-         self.num_classes = num_classes
-         self.top_k = top_k
-         self.nms_thresh = nms_thresh
-         self.conf_thresh = conf_thresh
-         self.variance = variance
-         self.nms_top_k = nms_top_k
-
-     def forward(self, loc_data, conf_data, prior_data):
-
-         num = loc_data.size(0)
-         num_priors = prior_data.size(0)
-
-         conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
-         batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
-         batch_priors = batch_priors.contiguous().view(-1, 4)
-
-         decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
-         decoded_boxes = decoded_boxes.view(num, num_priors, 4)
-
-         output = torch.zeros(num, self.num_classes, self.top_k, 5)
-
-         for i in range(num):
-             boxes = decoded_boxes[i].clone()
-             conf_scores = conf_preds[i].clone()
-
-             for cl in range(1, self.num_classes):
-                 c_mask = conf_scores[cl].gt(self.conf_thresh)
-                 scores = conf_scores[cl][c_mask]
-
-                 if scores.dim() == 0:
-                     continue
-                 l_mask = c_mask.unsqueeze(1).expand_as(boxes)
-                 boxes_ = boxes[l_mask].view(-1, 4)
-                 ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
-                 count = count if count < self.top_k else self.top_k
-
-                 output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)
-
-         return output
-
-
- class PriorBox(object):
-
-     def __init__(self, input_size, feature_maps,
-                  variance=[0.1, 0.2],
-                  min_sizes=[16, 32, 64, 128, 256, 512],
-                  steps=[4, 8, 16, 32, 64, 128],
-                  clip=False):
-
-         super(PriorBox, self).__init__()
-
-         self.imh = input_size[0]
-         self.imw = input_size[1]
-         self.feature_maps = feature_maps
-
-         self.variance = variance
-         self.min_sizes = min_sizes
-         self.steps = steps
-         self.clip = clip
-
-     def forward(self):
-         mean = []
-         for k, fmap in enumerate(self.feature_maps):
-             feath = fmap[0]
-             featw = fmap[1]
-             for i, j in product(range(feath), range(featw)):
-                 f_kw = self.imw / self.steps[k]
-                 f_kh = self.imh / self.steps[k]
-
-                 cx = (j + 0.5) / f_kw
-                 cy = (i + 0.5) / f_kh
-
-                 s_kw = self.min_sizes[k] / self.imw
-                 s_kh = self.min_sizes[k] / self.imh
-
-                 mean += [cx, cy, s_kw, s_kh]
-
-         output = torch.FloatTensor(mean).view(-1, 4)
-
-         if self.clip:
-             output.clamp_(max=1, min=0)
-
-         return output
 
model/faceDetector/s3fd/nets.py DELETED
@@ -1,174 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import torch.nn.init as init
- from .box_utils import Detect, PriorBox
-
-
- class L2Norm(nn.Module):
-
-     def __init__(self, n_channels, scale):
-         super(L2Norm, self).__init__()
-         self.n_channels = n_channels
-         self.gamma = scale or None
-         self.eps = 1e-10
-         self.weight = nn.Parameter(torch.Tensor(self.n_channels))
-         self.reset_parameters()
-
-     def reset_parameters(self):
-         init.constant_(self.weight, self.gamma)
-
-     def forward(self, x):
-         norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
-         x = torch.div(x, norm)
-         out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
-         return out
-
-
- class S3FDNet(nn.Module):
-
-     def __init__(self, device='cuda'):
-         super(S3FDNet, self).__init__()
-         self.device = device
-
-         self.vgg = nn.ModuleList([
-             nn.Conv2d(3, 64, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(64, 64, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2),
-
-             nn.Conv2d(64, 128, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(128, 128, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2),
-
-             nn.Conv2d(128, 256, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(256, 256, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(256, 256, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2, ceil_mode=True),
-
-             nn.Conv2d(256, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2),
-
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(512, 512, 3, 1, padding=1),
-             nn.ReLU(inplace=True),
-             nn.MaxPool2d(2, 2),
-
-             nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6),
-             nn.ReLU(inplace=True),
-             nn.Conv2d(1024, 1024, 1, 1),
-             nn.ReLU(inplace=True),
-         ])
-
-         self.L2Norm3_3 = L2Norm(256, 10)
-         self.L2Norm4_3 = L2Norm(512, 8)
-         self.L2Norm5_3 = L2Norm(512, 5)
-
-         self.extras = nn.ModuleList([
-             nn.Conv2d(1024, 256, 1, 1),
-             nn.Conv2d(256, 512, 3, 2, padding=1),
-             nn.Conv2d(512, 128, 1, 1),
-             nn.Conv2d(128, 256, 3, 2, padding=1),
-         ])
-
-         self.loc = nn.ModuleList([
-             nn.Conv2d(256, 4, 3, 1, padding=1),
-             nn.Conv2d(512, 4, 3, 1, padding=1),
-             nn.Conv2d(512, 4, 3, 1, padding=1),
-             nn.Conv2d(1024, 4, 3, 1, padding=1),
-             nn.Conv2d(512, 4, 3, 1, padding=1),
-             nn.Conv2d(256, 4, 3, 1, padding=1),
-         ])
-
-         self.conf = nn.ModuleList([
-             nn.Conv2d(256, 4, 3, 1, padding=1),
-             nn.Conv2d(512, 2, 3, 1, padding=1),
-             nn.Conv2d(512, 2, 3, 1, padding=1),
-             nn.Conv2d(1024, 2, 3, 1, padding=1),
-             nn.Conv2d(512, 2, 3, 1, padding=1),
-             nn.Conv2d(256, 2, 3, 1, padding=1),
-         ])
-
-         self.softmax = nn.Softmax(dim=-1)
-         self.detect = Detect()
-
-     def forward(self, x):
-         size = x.size()[2:]
-         sources = list()
-         loc = list()
-         conf = list()
-
-         for k in range(16):
-             x = self.vgg[k](x)
-         s = self.L2Norm3_3(x)
-         sources.append(s)
-
-         for k in range(16, 23):
-             x = self.vgg[k](x)
-         s = self.L2Norm4_3(x)
-         sources.append(s)
-
-         for k in range(23, 30):
-             x = self.vgg[k](x)
-         s = self.L2Norm5_3(x)
-         sources.append(s)
-
-         for k in range(30, len(self.vgg)):
-             x = self.vgg[k](x)
-         sources.append(x)
-
-         # apply extra layers and cache source layer outputs
-         for k, v in enumerate(self.extras):
-             x = F.relu(v(x), inplace=True)
-             if k % 2 == 1:
-                 sources.append(x)
-
-         # apply multibox head to source layers
-         loc_x = self.loc[0](sources[0])
-         conf_x = self.conf[0](sources[0])
-
-         max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True)
-         conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1)
-
-         loc.append(loc_x.permute(0, 2, 3, 1).contiguous())
-         conf.append(conf_x.permute(0, 2, 3, 1).contiguous())
-
-         for i in range(1, len(sources)):
-             x = sources[i]
-             conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous())
-             loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous())
-
-         features_maps = []
-         for i in range(len(loc)):
-             feat = []
-             feat += [loc[i].size(1), loc[i].size(2)]
-             features_maps += [feat]
-
-         loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
-         conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
-
-         with torch.no_grad():
-             self.priorbox = PriorBox(size, features_maps)
-             self.priors = self.priorbox.forward()
-
-         output = self.detect.forward(
-             loc.view(loc.size(0), -1, 4),
-             self.softmax(conf.view(conf.size(0), -1, 2)),
-             self.priors.type(type(x.data)).to(self.device)
-         )
-
-         return output
 
model/loconet_encoder.py DELETED
@@ -1,91 +0,0 @@
- import torch
- import torch.nn as nn
-
- # from model.visualEncoder import visualFrontend, visualTCN, visualConv1D
- from model.attentionLayer import attentionLayer
- from model.convLayer import ConvLayer
- from torchvggish import vggish
- from model.visualEncoder import visualFrontend, visualConv1D, visualTCN
-
-
- class locoencoder(nn.Module):
-
-     def __init__(self, cfg):
-         super(locoencoder, self).__init__()
-         self.cfg = cfg
-         # Visual Temporal Encoder
-         self.visualFrontend = visualFrontend(cfg) # Visual Frontend
-         self.visualTCN = visualTCN() # Visual Temporal Network TCN
-         self.visualConv1D = visualConv1D() # Visual Temporal Network Conv1d
-
-         urls = {
-             'vggish':
-                 "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth"
-         }
-         self.audioEncoder = vggish.VGGish(urls, preprocess=False, postprocess=False)
-         self.audio_pool = nn.AdaptiveAvgPool1d(1)
-
-         # Audio-visual Cross Attention
-         self.crossA2V = attentionLayer(d_model=128, nhead=8)
-         self.crossV2A = attentionLayer(d_model=128, nhead=8)
-
-         # Audio-visual Self Attention
-
-         num_layers = self.cfg.MODEL.AV_layers
-         layers = nn.ModuleList()
-         for i in range(num_layers):
-             layers.append(ConvLayer(cfg))
-             layers.append(attentionLayer(d_model=256, nhead=8))
-         self.convAV = layers
-
-     def forward_visual_frontend(self, x):
-
-         B, T, W, H = x.shape
-         x = x.view(B * T, 1, 1, W, H)
-         x = (x / 255 - 0.4161) / 0.1688
-         x = self.visualFrontend(x)
-         x = x.view(B, T, 512)
-         x = x.transpose(1, 2)
-         x = self.visualTCN(x)
-         x = self.visualConv1D(x)
-         x = x.transpose(1, 2)
-         return x
-
-     def forward_audio_frontend(self, x):
-         t = x.shape[-2]
-         numFrames = t // 4
-         pad = 8 - (t % 8)
-         x = torch.nn.functional.pad(x, (0, 0, 0, pad), "constant")
-         # x = x.unsqueeze(1).transpose(2, 3)
-         x = self.audioEncoder(x)
-
-         b, c, t2, freq = x.shape
-         x = x.view(b * c, t2, freq)
-         x = self.audio_pool(x)
-         x = x.view(b, c, t2)[:, :, :numFrames]
-         x = x.permute(0, 2, 1)
-         return x
-
-     def forward_cross_attention(self, x1, x2):
-         x1_c = self.crossA2V(src=x1, tar=x2, adjust=self.cfg.MODEL.ADJUST_ATTENTION)
-         x2_c = self.crossV2A(src=x2, tar=x1, adjust=self.cfg.MODEL.ADJUST_ATTENTION)
-         return x1_c, x2_c
-
-     def forward_audio_visual_backend(self, x1, x2, b=1, s=1):
-         x = torch.cat((x1, x2), 2) # B*S, T, 2C
-         for i, layer in enumerate(self.convAV):
-             if i % 2 == 0:
-                 x, b, s = layer(x, b, s)
-             else:
-                 x = layer(src=x, tar=x)
-
-         x = torch.reshape(x, (-1, 256))
-         return x
-
-     def forward_audio_backend(self, x):
-         x = torch.reshape(x, (-1, 128))
-         return x
-
-     def forward_visual_backend(self, x):
-         x = torch.reshape(x, (-1, 128))
-         return x
 
model/transformer/__pycache__/position_encoding.cpython-37.pyc DELETED
Binary file (1.23 kB)
 
model/transformer/__pycache__/transformer.cpython-37.pyc DELETED
Binary file (8.81 kB)
 
model/transformer/__pycache__/utils.cpython-37.pyc DELETED
Binary file (1.05 kB)
 
model/transformer/position_encoding.py DELETED
@@ -1,28 +0,0 @@
- ##########################################################################
- # We adopt the positional encoding method from the PyTorch Tutorial.
- # Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
- ##########################################################################
- import math
-
- import torch
- import torch.nn as nn
-
-
- class PositionalEncoding(nn.Module):
-
-     def __init__(self, d_model, dropout=0.1, max_len=5000):
-         super(PositionalEncoding, self).__init__()
-
-         self.dropout = nn.Dropout(p=dropout)
-
-         pe = torch.zeros(max_len, d_model)
-         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
-         pe[:, 0::2] = torch.sin(position * div_term)
-         pe[:, 1::2] = torch.cos(position * div_term)
-         pe = pe.unsqueeze(0).transpose(0, 1)
-         self.register_buffer('pe', pe)
-
-     def forward(self, x, padding=0):
-         x = x + self.pe[padding:padding + x.shape[0], :]
-         return self.dropout(x)
 
model/transformer/transformer.py DELETED
@@ -1,334 +0,0 @@
- import copy
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
-
- class DotProductAttention(nn.Module):
-
-     def __init__(self, dropout=0.0):
-         super(DotProductAttention, self).__init__()
-
-         self.dropout = dropout
-
-     def forward(self, q, k, v, attn_mask=None):
-         attn_output_weights = torch.bmm(q, k.transpose(1, 2))
-
-         if attn_mask is not None:
-             attn_output_weights += attn_mask
-
-         attn_output_weights = F.softmax(attn_output_weights, dim=-1)
-         attn_output_weights = F.dropout(attn_output_weights, p=self.dropout, training=self.training)
-         attn_output = torch.bmm(attn_output_weights, v)
-         return attn_output
-
-
- class MultiheadAttention(nn.Module):
-
-     def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, kdim=None, vdim=None):
-         super(MultiheadAttention, self).__init__()
-
-         self.embed_dim = embed_dim
-         self.num_heads = num_heads
-         self.kdim = kdim if kdim is not None else embed_dim
-         self.vdim = vdim if vdim is not None else embed_dim
-         self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
-
-         if self._qkv_same_embed_dim:
-             self.in_proj_weight = nn.Parameter(torch.empty(3 * embed_dim, embed_dim))
-         else:
-             raise RuntimeError('Do not support q, k, v have different dimensions')
-
-         if bias:
-             self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim))
-         else:
-             self.register_parameter('in_proj_bias', None)
-
-         self.out_proj = nn.Linear(embed_dim, embed_dim)
-
-         if self._qkv_same_embed_dim:
-             nn.init.xavier_uniform_(self.in_proj_weight)
-
-         if self.in_proj_bias is not None:
-             nn.init.constant_(self.in_proj_bias, 0.)
-             nn.init.constant_(self.out_proj.bias, 0.)
-
-         self.dotproductattention = DotProductAttention(dropout)
-
-     def forward(self, q, k, v, attn_mask=None, key_padding_mask=None):
-         tsz, bsz, embed_dim = q.shape[0], q.shape[1], q.shape[2]
-
-         head_dim = embed_dim // self.num_heads
-         assert head_dim * self.num_heads == embed_dim, \
-             'embed_dim must be divisible by num_heads'
-         scaling = float(head_dim)**-0.5
-
-         _b = self.in_proj_bias
-         _start = None
-         _end = embed_dim
-         _w = self.in_proj_weight[:_end, :]
-         if _b is not None:
-             _b = _b[:_end]
-         q = F.linear(q, _w, _b)
-
-         _b = self.in_proj_bias
-         _start = embed_dim
-         _end = embed_dim * 2
-         _w = self.in_proj_weight[_start:_end, :]
-         if _b is not None:
-             _b = _b[_start:_end]
-         k = F.linear(k, _w, _b)
-
-         _b = self.in_proj_bias
-         _start = embed_dim * 2
-         _end = None
-         _w = self.in_proj_weight[_start:, :]
-         if _b is not None:
-             _b = _b[_start:]
-         v = F.linear(v, _w, _b)
-
-         q = q * scaling
-
-         q = q.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
-         k = k.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
-         v = v.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
-
-         if attn_mask is not None:
-             attn_mask = attn_mask.unsqueeze(0).repeat(bsz, 1, 1)
-             attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
-             attn_mask = attn_mask.reshape(-1, *attn_mask.shape[2:])
-
-         if key_padding_mask is not None:
-             key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, tsz, 1)
-             key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
-             key_padding_mask = key_padding_mask.reshape(-1, *key_padding_mask.shape[2:])
-
-         if attn_mask is not None and key_padding_mask is not None:
-             mask = attn_mask + key_padding_mask
-         elif attn_mask is not None:
-             mask = attn_mask
-         elif key_padding_mask is not None:
-             mask = key_padding_mask
-         else:
-             mask = None
-
-         attn_output = self.dotproductattention(q, k, v, mask)
-         attn_output = attn_output.transpose(0, 1).contiguous().view(tsz, bsz, self.embed_dim)
-         return self.out_proj(attn_output), None
-
-
- class Transformer(nn.Module):
-
-     def __init__(self,
-                  d_model=512,
-                  nhead=8,
-                  num_encoder_layers=6,
-                  num_decoder_layers=6,
-                  dim_feedforward=2048,
-                  dropout=0.1,
-                  activation='relu',
-                  custom_encoder=None,
-                  custom_decoder=None):
-         super(Transformer, self).__init__()
-
-         if custom_encoder is not None:
-             self.encoder = custom_encoder
-         else:
-             encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,
-                                                     activation)
-             encoder_norm = nn.LayerNorm(d_model)
-             self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
-
-         if custom_decoder is not None:
-             self.decoder = custom_decoder
-         else:
-             decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout,
-                                                     activation)
-             decoder_norm = nn.LayerNorm(d_model)
-             self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
-
-         self.d_model = d_model
-         self.nhead = nhead
-
-     def forward(self,
-                 src,
-                 tgt,
-                 src_mask=None,
-                 tgt_mask=None,
-                 memory_mask=None,
-                 src_key_padding_mask=None,
-                 tgt_key_padding_mask=None,
-                 memory_key_padding_mask=None):
-         if src.size(1) != tgt.size(1):
-             raise RuntimeError('the batch number of src and tgt must be equal')
-
-         if src.size(2) != self.d_model or tgt.size(2) != self.d_model:
-             raise RuntimeError('the feature number of src and tgt must be equal to d_model')
-
-         memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
-         output = self.decoder(tgt,
-                               memory,
-                               tgt_mask=tgt_mask,
-                               memory_mask=memory_mask,
-                               tgt_key_padding_mask=tgt_key_padding_mask,
-                               memory_key_padding_mask=memory_key_padding_mask)
-         return output
-
-
- class TransformerEncoder(nn.Module):
-
-     def __init__(self, encoder_layer, num_layers, norm=None):
-         super(TransformerEncoder, self).__init__()
-
-         self.layers = _get_clones(encoder_layer, num_layers)
-         self.num_layers = num_layers
-         self.norm = norm
-
-     def forward(self, src, src_mask=None, src_key_padding_mask=None):
-         output = src
-
-         for mod in self.layers:
-             output = mod(output, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
-
-         if self.norm is not None:
-             output = self.norm(output)
-
-         return output
-
-
- class TransformerDecoder(nn.Module):
-
-     def __init__(self, decoder_layer, num_layers, norm=None):
-         super(TransformerDecoder, self).__init__()
-
-         self.layers = _get_clones(decoder_layer, num_layers)
-         self.num_layers = num_layers
-         self.norm = norm
-
-     def forward(self,
-                 tgt,
-                 memory,
-                 tgt_mask=None,
-                 memory_mask=None,
-                 tgt_key_padding_mask=None,
-                 memory_key_padding_mask=None):
-         output = tgt
-
-         for mod in self.layers:
-             output = mod(output,
-                          memory,
-                          tgt_mask=tgt_mask,
-                          memory_mask=memory_mask,
-                          tgt_key_padding_mask=tgt_key_padding_mask,
-                          memory_key_padding_mask=memory_key_padding_mask)
-
-         if self.norm is not None:
-             output = self.norm(output)
-
-         return output
-
-
- class TransformerEncoderLayer(nn.Module):
-
-     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'):
-         super(TransformerEncoderLayer, self).__init__()
-
-         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-
-         # Implementation of Feedforward model
-         self.linear1 = nn.Linear(d_model, dim_feedforward)
-         self.dropout = nn.Dropout(dropout)
-         self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-         self.norm1 = nn.LayerNorm(d_model)
-         self.norm2 = nn.LayerNorm(d_model)
-         self.dropout1 = nn.Dropout(dropout)
-         self.dropout2 = nn.Dropout(dropout)
-
-         self.activation = _get_activation_fn(activation)
-
-     def __setstate__(self, state):
-         if 'activation' not in state:
-             state['activation'] = F.relu
-         super(TransformerEncoderLayer, self).__setstate__(state)
-
-     def forward(self, src, src_mask=None, src_key_padding_mask=None):
-         src2 = self.self_attn(src,
-                               src,
-                               src,
-                               attn_mask=src_mask,
-                               key_padding_mask=src_key_padding_mask)[0]
-         src = src + self.dropout1(src2)
-         src = self.norm1(src)
-         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-         src = src + self.dropout2(src2)
-         src = self.norm2(src)
-         return src
-
-
- class TransformerDecoderLayer(nn.Module):
-
-     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'):
-         super(TransformerDecoderLayer, self).__init__()
-
-         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-         self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
-
-         # Implementation of Feedforward model
-         self.linear1 = nn.Linear(d_model, dim_feedforward)
-         self.dropout = nn.Dropout(dropout)
-         self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-         self.norm1 = nn.LayerNorm(d_model)
-         self.norm2 = nn.LayerNorm(d_model)
-         self.norm3 = nn.LayerNorm(d_model)
-         self.dropout1 = nn.Dropout(dropout)
-         self.dropout2 = nn.Dropout(dropout)
-         self.dropout3 = nn.Dropout(dropout)
-
-         self.activation = _get_activation_fn(activation)
-
-     def __setstate__(self, state):
-         if 'activation' not in state:
-             state['activation'] = F.relu
-         super(TransformerDecoderLayer, self).__setstate__(state)
-
-     def forward(self,
-                 tgt,
-                 memory,
-                 tgt_mask=None,
-                 memory_mask=None,
-                 tgt_key_padding_mask=None,
-                 memory_key_padding_mask=None):
-         tgt2 = self.self_attn(tgt,
-                               tgt,
-                               tgt,
-                               attn_mask=tgt_mask,
-                               key_padding_mask=tgt_key_padding_mask)[0]
-         tgt = tgt + self.dropout1(tgt2)
-         tgt = self.norm1(tgt)
-         tgt2 = self.multihead_attn(tgt,
-                                    memory,
-                                    memory,
-                                    attn_mask=memory_mask,
-                                    key_padding_mask=memory_key_padding_mask)[0]
-         tgt = tgt + self.dropout2(tgt2)
-         tgt = self.norm2(tgt)
-         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
-         tgt = tgt + self.dropout3(tgt2)
-         tgt = self.norm3(tgt)
-         return tgt
-
-
- def _get_clones(module, N):
-     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
-
-
- def _get_activation_fn(activation):
-     if activation == 'relu':
-         return F.relu
-     elif activation == 'gelu':
-         return F.gelu
-
-     raise RuntimeError('activation should be relu/gelu, not {}'.format(activation))
 
model/transformer/utils.py DELETED
@@ -1,22 +0,0 @@
- import torch
- assert torch.__version__ >= '1.6.0'
- import torch.nn as nn
- import numpy as np
-
-
- def layer_norm(d_model, condition=True):
-     return nn.LayerNorm(d_model) if condition else None
-
-
- def generate_square_subsequent_mask(sz):
-     mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-     mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-     return mask
-
-
- def generate_proposal_mask(T, B):
-     mask = torch.zeros(T, (T + 1) * T // 2)
-     for sz, idx in zip(range(1, T + 1), np.cumsum(range(T))):
-         mask[:sz, idx: idx + sz] = torch.fliplr(torch.tril(torch.ones(sz, sz)))
-     mask = mask.unsqueeze(1).repeat(1, B, 1)
-     return mask
 
model/visualEncoder.py DELETED
@@ -1,199 +0,0 @@
- ##
- # ResNet18 Pretrained network to extract lip embedding
- # This code is modified based on https://github.com/lordmartian/deep_avsr
- ##
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- from model.attentionLayer import attentionLayer
-
-
- class ResNetLayer(nn.Module):
-     """
-     A ResNet layer used to build the ResNet network.
-     Architecture:
-     --> conv-bn-relu -> conv -> + -> bn-relu -> conv-bn-relu -> conv -> + -> bn-relu -->
-          |                      |   |                                   |
-          -----> downsample ---->    ------------------------------------>
-     """
-
-     def __init__(self, inplanes, outplanes, stride):
-         super(ResNetLayer, self).__init__()
-         self.conv1a = nn.Conv2d(inplanes,
-                                 outplanes,
-                                 kernel_size=3,
-                                 stride=stride,
-                                 padding=1,
-                                 bias=False)
-         self.bn1a = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-         self.conv2a = nn.Conv2d(outplanes,
-                                 outplanes,
-                                 kernel_size=3,
-                                 stride=1,
-                                 padding=1,
-                                 bias=False)
-         self.stride = stride
-         if self.stride != 1:
-             self.downsample = nn.Conv2d(inplanes,
-                                         outplanes,
-                                         kernel_size=(1, 1),
-                                         stride=stride,
-                                         bias=False)
-         self.outbna = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-
-         self.conv1b = nn.Conv2d(outplanes,
-                                 outplanes,
-                                 kernel_size=3,
-                                 stride=1,
-                                 padding=1,
-                                 bias=False)
-         self.bn1b = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-         self.conv2b = nn.Conv2d(outplanes,
-                                 outplanes,
-                                 kernel_size=3,
-                                 stride=1,
-                                 padding=1,
-                                 bias=False)
-         self.outbnb = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
-         return
-
-     def forward(self, inputBatch):
-         batch = F.relu(self.bn1a(self.conv1a(inputBatch)))
-         batch = self.conv2a(batch)
-         if self.stride == 1:
-             residualBatch = inputBatch
-         else:
-             residualBatch = self.downsample(inputBatch)
-         batch = batch + residualBatch
-         intermediateBatch = batch
-         batch = F.relu(self.outbna(batch))
-
-         batch = F.relu(self.bn1b(self.conv1b(batch)))
-         batch = self.conv2b(batch)
-         residualBatch = intermediateBatch
-         batch = batch + residualBatch
-         outputBatch = F.relu(self.outbnb(batch))
-         return outputBatch
-
-
- class ResNet(nn.Module):
-     """
-     An 18-layer ResNet architecture.
-     """
-
-     def __init__(self):
-         super(ResNet, self).__init__()
-         self.layer1 = ResNetLayer(64, 64, stride=1)
-         self.layer2 = ResNetLayer(64, 128, stride=2)
-         self.layer3 = ResNetLayer(128, 256, stride=2)
-         self.layer4 = ResNetLayer(256, 512, stride=2)
-         self.avgpool = nn.AvgPool2d(kernel_size=(4, 4), stride=(1, 1))
-
-         return
-
-     def forward(self, inputBatch):
-         batch = self.layer1(inputBatch)
-         batch = self.layer2(batch)
-         batch = self.layer3(batch)
-         batch = self.layer4(batch)
-         outputBatch = self.avgpool(batch)
-         return outputBatch
-
-
- class GlobalLayerNorm(nn.Module):
-
-     def __init__(self, channel_size):
-         super(GlobalLayerNorm, self).__init__()
-         self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1]
-         self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1]
-         self.reset_parameters()
-
-     def reset_parameters(self):
-         self.gamma.data.fill_(1)
-         self.beta.data.zero_()
-
-     def forward(self, y):
-         mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) #[M, 1, 1]
-         var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
-         gLN_y = self.gamma * (y - mean) / torch.pow(var + 1e-8, 0.5) + self.beta
-         return gLN_y
-
-
- class visualFrontend(nn.Module):
-     """
-     A visual feature extraction module. Generates a 512-dim feature vector per video frame.
-     Architecture: A 3D convolution block followed by an 18-layer ResNet.
-     """
-
-     def __init__(self, cfg):
-         self.cfg = cfg
-         super(visualFrontend, self).__init__()
-         self.frontend3D = nn.Sequential(
-             nn.Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3),
-                       bias=False), nn.BatchNorm3d(64, momentum=0.01, eps=0.001), nn.ReLU(),
-             nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)))
-         self.resnet = ResNet()
-         return
-
-     def forward(self, inputBatch):
-         inputBatch = inputBatch.transpose(0, 1).transpose(1, 2)
-         batchsize = inputBatch.shape[0]
-         batch = self.frontend3D(inputBatch)
-
-         batch = batch.transpose(1, 2)
-         batch = batch.reshape(batch.shape[0] * batch.shape[1], batch.shape[2], batch.shape[3],
-                               batch.shape[4])
-         outputBatch = self.resnet(batch)
-         outputBatch = outputBatch.reshape(batchsize, -1, 512)
-         outputBatch = outputBatch.transpose(1, 2)
-         outputBatch = outputBatch.transpose(1, 2).transpose(0, 1)
-         return outputBatch
-
-
- class DSConv1d(nn.Module):
-
-     def __init__(self):
-         super(DSConv1d, self).__init__()
-         self.net = nn.Sequential(
-             nn.ReLU(),
-             nn.BatchNorm1d(512),
-             nn.Conv1d(512, 512, 3, stride=1, padding=1, dilation=1, groups=512, bias=False),
-             nn.PReLU(),
-             GlobalLayerNorm(512),
-             nn.Conv1d(512, 512, 1, bias=False),
-         )
-
-     def forward(self, x):
-         out = self.net(x)
-         return out + x
-
-
- class visualTCN(nn.Module):
-
-     def __init__(self):
-         super(visualTCN, self).__init__()
-         stacks = []
-         for x in range(5):
-             stacks += [DSConv1d()]
-         self.net = nn.Sequential(*stacks) # Visual Temporal Network V-TCN
-
-     def forward(self, x):
-         out = self.net(x)
-         return out
-
-
- class visualConv1D(nn.Module):
-
-     def __init__(self):
-         super(visualConv1D, self).__init__()
-         self.net = nn.Sequential(
-             nn.Conv1d(512, 256, 5, stride=1, padding=2),
-             nn.BatchNorm1d(256),
-             nn.ReLU(),
-             nn.Conv1d(256, 128, 1),
-         )
-
-     def forward(self, x):
-         out = self.net(x)
-         return out