from itertools import zip_longest |
|
import numpy as np |
|
from scipy import ndimage |
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
import time |
|
from torchlibrosa.augmentation import SpecAugmentation |
|
from torchlibrosa.stft import Spectrogram, LogmelFilterBank |
|
import math |
|
from sklearn.cluster import KMeans |
|
import os |
|
|
from functools import partial |
|
import warnings
|
import copy |
|
from collections import OrderedDict |
|
import io |
|
import re

# Assumption: _load_checkpoint and load_state_dict used below come from mmcv's
# checkpoint utilities, as in other open-mmlab-style codebases.
from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict
|
DEBUG=0 |
|
event_labels = ['Alarm', 'Alarm_clock', 'Animal', 'Applause', 'Arrow', 'Artillery_fire', |
|
'Babbling', 'Baby_laughter', 'Bark', 'Basketball_bounce', 'Battle_cry', |
|
'Bell', 'Bird', 'Bleat', 'Bouncing', 'Breathing', 'Buzz', 'Camera', |
|
'Cap_gun', 'Car', 'Car_alarm', 'Cat', 'Caw', 'Cheering', 'Child_singing', |
|
'Choir', 'Chop', 'Chopping_(food)', 'Clapping', 'Clickety-clack', 'Clicking', |
|
'Clip-clop', 'Cluck', 'Coin_(dropping)', 'Computer_keyboard', 'Conversation', |
|
'Coo', 'Cough', 'Cowbell', 'Creak', 'Cricket', 'Croak', 'Crow', 'Crowd', 'DTMF', |
|
'Dog', 'Door', 'Drill', 'Drip', 'Engine', 'Engine_starting', 'Explosion', 'Fart', |
|
'Female_singing', 'Filing_(rasp)', 'Finger_snapping', 'Fire', 'Fire_alarm', 'Firecracker', |
|
'Fireworks', 'Frog', 'Gasp', 'Gears', 'Giggle', 'Glass', 'Glass_shatter', 'Gobble', 'Groan', |
|
'Growling', 'Hammer', 'Hands', 'Hiccup', 'Honk', 'Hoot', 'Howl', 'Human_sounds', 'Human_voice', |
|
'Insect', 'Laughter', 'Liquid', 'Machine_gun', 'Male_singing', 'Mechanisms', 'Meow', 'Moo', |
|
'Motorcycle', 'Mouse', 'Music', 'Oink', 'Owl', 'Pant', 'Pant_(dog)', 'Patter', 'Pig', 'Plop', |
|
'Pour', 'Power_tool', 'Purr', 'Quack', 'Radio', 'Rain_on_surface', 'Rapping', 'Rattle', |
|
'Reversing_beeps', 'Ringtone', 'Roar', 'Run', 'Rustle', 'Scissors', 'Scrape', 'Scratch', |
|
'Screaming', 'Sewing_machine', 'Shout', 'Shuffle', 'Shuffling_cards', 'Singing', |
|
'Single-lens_reflex_camera', 'Siren', 'Skateboard', 'Sniff', 'Snoring', 'Speech', |
|
'Speech_synthesizer', 'Spray', 'Squeak', 'Squeal', 'Steam', 'Stir', 'Surface_contact', |
|
'Tap', 'Tap_dance', 'Telephone_bell_ringing', 'Television', 'Tick', 'Tick-tock', 'Tools', |
|
'Train', 'Train_horn', 'Train_wheels_squealing', 'Truck', 'Turkey', 'Typewriter', 'Typing', |
|
'Vehicle', 'Video_game_sound', 'Water', 'Whimper_(dog)', 'Whip', 'Whispering', 'Whistle', |
|
'Whistling', 'Whoop', 'Wind', 'Writing', 'Yip', 'and_pans', 'bird_song', 'bleep', 'clink', |
|
'cock-a-doodle-doo', 'crinkling', 'dove', 'dribble', 'eructation', 'faucet', 'flapping_wings', |
|
'footsteps', 'gunfire', 'heartbeat', 'infant_cry', 'kid_speaking', 'man_speaking', 'mastication', |
|
'mice', 'river', 'rooster', 'silverware', 'skidding', 'smack', 'sobbing', 'speedboat', 'splatter', |
|
'surf', 'thud', 'thwack', 'toot', 'truck_horn', 'tweet', 'vroom', 'waterfowl', 'woman_speaking'] |
|
def load_checkpoint(model, |
|
filename, |
|
map_location=None, |
|
strict=False, |
|
logger=None, |
|
revise_keys=[(r'^module\.', '')]): |
|
"""Load checkpoint from a file or URI. |
|
Args: |
|
model (Module): Module to load checkpoint. |
|
filename (str): Accept local filepath, URL, ``torchvision://xxx``, |
|
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for |
|
details. |
|
map_location (str): Same as :func:`torch.load`. |
|
strict (bool): Whether to allow different params for the model and |
|
checkpoint. |
|
logger (:mod:`logging.Logger` or None): The logger for error message. |
|
revise_keys (list): A list of customized keywords to modify the |
|
state_dict in checkpoint. Each item is a (pattern, replacement) |
|
pair of the regular expression operations. Default: strip |
|
the prefix 'module.' by [(r'^module\\.', '')]. |
|
Returns: |
|
dict or OrderedDict: The loaded checkpoint. |
|
""" |
|
|
|
checkpoint = _load_checkpoint(filename, map_location, logger) |
|
''' |
|
new_proj = torch.nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) |
|
new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1)) |
|
checkpoint['patch_embed1.proj.weight'] = new_proj.weight |
|
new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=2).unsqueeze(2).repeat(1,1,3,1)) |
|
checkpoint['patch_embed1.proj.weight'] = new_proj.weight |
|
new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=3).unsqueeze(3).repeat(1,1,1,3)) |
|
checkpoint['patch_embed1.proj.weight'] = new_proj.weight |
|
''' |
|
    # Adapt the pretrained patch embedding from 3-channel (RGB) input to the
    # single-channel spectrogram input by summing kernels over the input dim.
    new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
    new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
    checkpoint['patch_embed1.proj.weight'] = new_proj.weight
|
|
|
if not isinstance(checkpoint, dict): |
|
raise RuntimeError( |
|
f'No state_dict found in checkpoint file {filename}') |
|
|
|
if 'state_dict' in checkpoint: |
|
state_dict = checkpoint['state_dict'] |
|
else: |
|
state_dict = checkpoint |
|
|
|
|
|
metadata = getattr(state_dict, '_metadata', OrderedDict()) |
|
for p, r in revise_keys: |
|
state_dict = OrderedDict( |
|
{re.sub(p, r, k): v |
|
for k, v in state_dict.items()}) |
|
    # Additionally strip any 'backbone.' prefix left over from detection-style
    # checkpoints.
    state_dict = OrderedDict({k.replace('backbone.', ''): v for k, v in state_dict.items()})
|
|
|
state_dict._metadata = metadata |
|
|
|
|
|
load_state_dict(model, state_dict, strict, logger) |
|
return checkpoint |
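
# Illustrative usage sketch (not from the original code; the backbone class and
# checkpoint path below are hypothetical). The patch_embed1 surgery above
# appears to assume a SegFormer/MiT-style checkpoint whose first projection was
# trained on 3-channel images:
#
#   model = SomeBackbone()
#   _ = load_checkpoint(model, 'pretrained/backbone.pth', map_location='cpu')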
|
|
|
def init_weights(m): |
|
if isinstance(m, (nn.Conv2d, nn.Conv1d)): |
|
nn.init.kaiming_normal_(m.weight) |
|
if m.bias is not None: |
|
nn.init.constant_(m.bias, 0) |
|
elif isinstance(m, nn.BatchNorm2d): |
|
nn.init.constant_(m.weight, 1) |
|
if m.bias is not None: |
|
nn.init.constant_(m.bias, 0) |
|
    elif isinstance(m, nn.Linear):
|
nn.init.kaiming_uniform_(m.weight) |
|
if m.bias is not None: |
|
nn.init.constant_(m.bias, 0) |
|
def init_layer(layer): |
|
"""Initialize a Linear or Convolutional layer. """ |
|
nn.init.xavier_uniform_(layer.weight) |
|
if hasattr(layer, 'bias'): |
|
if layer.bias is not None: |
|
layer.bias.data.fill_(0.) |
|
|
|
|
|
def init_bn(bn): |
|
"""Initialize a Batchnorm layer. """ |
|
bn.bias.data.fill_(0.) |
|
bn.weight.data.fill_(1.) |
|
|
|
class MaxPool(nn.Module): |
|
def __init__(self, pooldim=1): |
|
super().__init__() |
|
self.pooldim = pooldim |
|
|
|
def forward(self, logits, decision): |
|
return torch.max(decision, dim=self.pooldim)[0] |
|
|
|
|
|
class LinearSoftPool(nn.Module): |
|
"""LinearSoftPool |
|
Linear softmax, takes logits and returns a probability, near to the actual maximum value. |
|
Taken from the paper: |
|
A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling |
|
https://arxiv.org/abs/1810.09050 |
|
""" |
|
def __init__(self, pooldim=1): |
|
super().__init__() |
|
self.pooldim = pooldim |
|
|
|
def forward(self, logits, time_decision): |
|
return (time_decision**2).sum(self.pooldim) / (time_decision.sum( |
|
self.pooldim)+1e-7) |
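
# Quick shape check for LinearSoftPool (illustrative, gated behind the DEBUG
# flag defined above): clip-level probabilities are pooled from a
# (batch, time, classes) tensor; the first (logits) argument is unused.
if DEBUG:
    _pool = LinearSoftPool(pooldim=1)
    assert _pool(None, torch.rand(2, 125, 10)).shape == (2, 10)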
|
|
|
class ConvBlock(nn.Module): |
|
def __init__(self, in_channels, out_channels): |
|
|
|
super(ConvBlock, self).__init__() |
|
|
|
self.conv1 = nn.Conv2d(in_channels=in_channels, |
|
out_channels=out_channels, |
|
kernel_size=(3, 3), stride=(1, 1), |
|
padding=(1, 1), bias=False) |
|
|
|
self.conv2 = nn.Conv2d(in_channels=out_channels, |
|
out_channels=out_channels, |
|
kernel_size=(3, 3), stride=(1, 1), |
|
padding=(1, 1), bias=False) |
|
|
|
self.bn1 = nn.BatchNorm2d(out_channels) |
|
self.bn2 = nn.BatchNorm2d(out_channels) |
|
|
|
self.init_weight() |
|
|
|
def init_weight(self): |
|
init_layer(self.conv1) |
|
init_layer(self.conv2) |
|
init_bn(self.bn1) |
|
init_bn(self.bn2) |
|
|
|
|
|
def forward(self, input, pool_size=(2, 2), pool_type='avg'): |
|
|
|
x = input |
|
x = F.relu_(self.bn1(self.conv1(x))) |
|
x = F.relu_(self.bn2(self.conv2(x))) |
|
if pool_type == 'max': |
|
x = F.max_pool2d(x, kernel_size=pool_size) |
|
elif pool_type == 'avg': |
|
x = F.avg_pool2d(x, kernel_size=pool_size) |
|
elif pool_type == 'avg+max': |
|
x1 = F.avg_pool2d(x, kernel_size=pool_size) |
|
x2 = F.max_pool2d(x, kernel_size=pool_size) |
|
x = x1 + x2 |
|
        else:
            raise ValueError(f'Unknown pool_type: {pool_type}')
|
|
|
return x |
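
# Illustrative shape check for ConvBlock (DEBUG-gated): two 3x3 convolutions
# preserve the spatial size, then (2, 2) average pooling halves time and
# frequency.
if DEBUG:
    _block = ConvBlock(in_channels=1, out_channels=64)
    assert _block(torch.randn(2, 1, 500, 64)).shape == (2, 64, 250, 32)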
|
|
|
class ConvBlock_GLU(nn.Module): |
|
def __init__(self, in_channels, out_channels,kernel_size=(3,3)): |
|
super(ConvBlock_GLU, self).__init__() |
|
self.conv1 = nn.Conv2d(in_channels=in_channels, |
|
out_channels=out_channels, |
|
kernel_size=kernel_size, stride=(1, 1), |
|
padding=(1, 1), bias=False) |
|
self.bn1 = nn.BatchNorm2d(out_channels) |
|
self.sigmoid = nn.Sigmoid() |
|
self.init_weight() |
|
|
|
def init_weight(self): |
|
init_layer(self.conv1) |
|
init_bn(self.bn1) |
|
|
|
def forward(self, input, pool_size=(2, 2), pool_type='avg'): |
|
x = input |
|
x = self.bn1(self.conv1(x)) |
|
        # GLU: the first half of the channels gates (via sigmoid) the second
        # half, halving the channel count.
        cnn1 = self.sigmoid(x[:, :x.shape[1]//2, :, :])
        cnn2 = x[:, x.shape[1]//2:, :, :]
        x = cnn1 * cnn2
|
if pool_type == 'max': |
|
x = F.max_pool2d(x, kernel_size=pool_size) |
|
elif pool_type == 'avg': |
|
x = F.avg_pool2d(x, kernel_size=pool_size) |
|
elif pool_type == 'avg+max': |
|
x1 = F.avg_pool2d(x, kernel_size=pool_size) |
|
x2 = F.max_pool2d(x, kernel_size=pool_size) |
|
x = x1 + x2 |
|
        elif pool_type == 'None':
            pass
        elif pool_type == 'LP':
            # 'LP' is accepted but is a no-op here (left unimplemented in the
            # original).
            pass
        else:
            raise ValueError(f'Unknown pool_type: {pool_type}')
|
return x |
|
|
|
class Mul_scale_GLU(nn.Module): |
|
def __init__(self): |
|
super(Mul_scale_GLU,self).__init__() |
|
self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1)) |
|
self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3)) |
|
self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5)) |
|
self.conv_block2 = ConvBlock_GLU(in_channels=96, out_channels=128*2) |
|
|
|
self.conv_block3 = ConvBlock_GLU(in_channels=128, out_channels=128*2) |
|
self.conv_block4 = ConvBlock_GLU(in_channels=128, out_channels=256*2) |
|
self.conv_block5 = ConvBlock_GLU(in_channels=256, out_channels=256*2) |
|
self.conv_block6 = ConvBlock_GLU(in_channels=256, out_channels=512*2) |
|
self.conv_block7 = ConvBlock_GLU(in_channels=512, out_channels=512*2) |
|
self.padding = nn.ReplicationPad2d((0,1,0,1)) |
|
|
|
def forward(self, input, fi=None): |
|
""" |
|
Input: (batch_size, data_length)""" |
|
x1 = self.conv_block1_1(input, pool_size=(2, 2), pool_type='avg') |
|
x1 = x1[:,:,:500,:32] |
|
|
|
x2 = self.conv_block1_2(input,pool_size=(2,2),pool_type='avg') |
|
|
|
x3 = self.conv_block1_3(input,pool_size=(2,2),pool_type='avg') |
|
x3 = self.padding(x3) |
|
|
|
|
|
x = torch.cat([x1,x2],dim=1) |
|
x = torch.cat([x,x3],dim=1) |
|
|
|
x = self.conv_block2(x, pool_size=(2, 2), pool_type='None') |
|
x = self.conv_block3(x,pool_size=(2,2),pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
|
|
x = self.conv_block4(x, pool_size=(2, 4), pool_type='None') |
|
x = self.conv_block5(x,pool_size=(2,4),pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
|
|
|
|
x = self.conv_block6(x, pool_size=(1, 4), pool_type='None') |
|
x = self.conv_block7(x, pool_size=(1, 4), pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
|
|
|
|
return x |
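
# Minimal shape check for Mul_scale_GLU (illustrative, DEBUG-gated). The
# hard-coded [:, :, :500, :32] crop in forward() implies a 1000-frame, 64-mel
# input, for which all three kernel-size branches align at (500, 32).
if DEBUG:
    _glu = Mul_scale_GLU()
    assert _glu(torch.randn(2, 1, 1000, 64)).shape == (2, 512, 125, 1)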
|
|
|
class Cnn14(nn.Module): |
|
def __init__(self, sample_rate=32000, window_size=1024, hop_size=320, mel_bins=64, fmin=50, |
|
fmax=14000, classes_num=527): |
|
|
|
super(Cnn14, self).__init__() |
|
|
|
window = 'hann' |
|
center = True |
|
pad_mode = 'reflect' |
|
ref = 1.0 |
|
amin = 1e-10 |
|
top_db = None |
|
|
|
|
|
self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, |
|
win_length=window_size, window=window, center=center, pad_mode=pad_mode, |
|
freeze_parameters=True) |
|
|
|
|
|
self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, |
|
n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, |
|
freeze_parameters=True) |
|
|
|
|
|
self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, |
|
freq_drop_width=8, freq_stripes_num=2) |
|
|
|
self.bn0 = nn.BatchNorm2d(64) |
|
|
|
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) |
|
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) |
|
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) |
|
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) |
|
self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024) |
|
self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048) |
|
|
|
self.fc1 = nn.Linear(2048, 128, bias=True) |
|
self.fc_audioset = nn.Linear(128, classes_num, bias=True) |
|
|
|
self.init_weight() |
|
|
|
def init_weight(self): |
|
init_layer(self.fc1) |
|
init_layer(self.fc_audioset) |
|
|
|
def forward(self, input_, mixup_lambda=None): |
|
""" |
|
Input: (batch_size, data_length)""" |
|
input_ = input_.unsqueeze(1) |
|
x = self.conv_block1(input_, pool_size=(2, 2), pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block4(x, pool_size=(1, 2), pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block5(x, pool_size=(1, 2), pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block6(x, pool_size=(1, 2), pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
|
|
|
|
        # (batch, 2048, T', 1) -> (batch, T', 2048) -> 128-d frame embeddings.
        # Note: the STFT/log-mel extractors, SpecAugment, bn0 and fc_audioset
        # defined in __init__ are unused in this forward pass; the network is
        # used purely as a frame-level embedding extractor.
        x = x.transpose(1, 2).contiguous().flatten(-2)
        x = self.fc1(x)

        return x
|
|
|
class Cnn10_fi(nn.Module): |
|
def __init__(self): |
|
super(Cnn10_fi, self).__init__() |
|
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) |
|
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) |
|
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) |
|
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) |
|
|
|
|
|
|
|
|
|
|
|
|
|
def forward(self, input, fi=None): |
|
""" |
|
Input: (batch_size, data_length)""" |
|
|
|
x = self.conv_block1(input, pool_size=(2, 2), pool_type='avg') |
|
        # FiLM-style conditioning: fi[:, 0] is a scale (gamma), fi[:, 1] a shift (beta).
        if fi is not None:
|
gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) |
|
beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) |
|
x = (gamma)*x + beta |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') |
|
        if fi is not None:
|
gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) |
|
beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) |
|
x = (gamma)*x + beta |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block3(x, pool_size=(2, 4), pool_type='avg') |
|
        if fi is not None:
|
gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) |
|
beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) |
|
x = (gamma)*x + beta |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block4(x, pool_size=(1, 4), pool_type='avg') |
|
        if fi is not None:
|
gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) |
|
beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) |
|
x = (gamma)*x + beta |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
return x |
|
|
|
class Cnn10_mul_scale(nn.Module): |
|
def __init__(self,scale=8): |
|
super(Cnn10_mul_scale, self).__init__() |
|
self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1)) |
|
self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3)) |
|
self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5)) |
|
self.conv_block2 = ConvBlock(in_channels=96, out_channels=128) |
|
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) |
|
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) |
|
self.scale = scale |
|
self.padding = nn.ReplicationPad2d((0,1,0,1)) |
|
def forward(self, input, pool_size=(2, 2), pool_type='avg'): |
|
""" |
|
Input: (batch_size, data_length)""" |
|
if self.scale == 8: |
|
pool_size1 = (2,2) |
|
pool_size2 = (2,2) |
|
pool_size3 = (2,4) |
|
pool_size4 = (1,4) |
|
elif self.scale == 4: |
|
pool_size1 = (2,2) |
|
pool_size2 = (2,2) |
|
pool_size3 = (1,4) |
|
pool_size4 = (1,4) |
|
elif self.scale == 2: |
|
pool_size1 = (2,2) |
|
pool_size2 = (1,2) |
|
pool_size3 = (1,4) |
|
pool_size4 = (1,4) |
|
else: |
|
pool_size1 = (1,2) |
|
pool_size2 = (1,2) |
|
pool_size3 = (1,4) |
|
pool_size4 = (1,4) |
|
|
|
x1 = self.conv_block1_1(input, pool_size=pool_size1, pool_type='avg') |
|
x1 = x1[:,:,:500,:32] |
|
|
|
x2 = self.conv_block1_2(input, pool_size=pool_size1, pool_type='avg') |
|
|
|
x3 = self.conv_block1_3(input, pool_size=pool_size1, pool_type='avg') |
|
x3 = self.padding(x3) |
|
|
|
|
|
        # Crop every branch to the shortest time length before concatenating.
        m_i = min(x1.shape[2], x2.shape[2], x3.shape[2])

        x = torch.cat([x1[:, :, :m_i, :], x2[:, :, :m_i, :], x3[:, :, :m_i, :]], dim=1)
|
|
|
|
|
|
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
return x |
|
|
|
|
|
class Cnn10(nn.Module): |
|
def __init__(self,scale=8): |
|
super(Cnn10, self).__init__() |
|
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) |
|
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) |
|
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) |
|
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) |
|
self.scale = scale |
|
def forward(self, input, pool_size=(2, 2), pool_type='avg'): |
|
""" |
|
Input: (batch_size, data_length)""" |
|
if self.scale == 8: |
|
pool_size1 = (2,2) |
|
pool_size2 = (2,2) |
|
pool_size3 = (2,4) |
|
pool_size4 = (1,4) |
|
elif self.scale == 4: |
|
pool_size1 = (2,2) |
|
pool_size2 = (2,2) |
|
pool_size3 = (1,4) |
|
pool_size4 = (1,4) |
|
elif self.scale == 2: |
|
pool_size1 = (2,2) |
|
pool_size2 = (1,2) |
|
pool_size3 = (1,4) |
|
pool_size4 = (1,4) |
|
else: |
|
pool_size1 = (1,2) |
|
pool_size2 = (1,2) |
|
pool_size3 = (1,4) |
|
pool_size4 = (1,4) |
|
x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg') |
|
x = F.dropout(x, p=0.2, training=self.training) |
|
return x |
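
# Illustrative shape check for Cnn10 (DEBUG-gated): with scale=8 the four
# pooling stages reduce a (2, 1, 500, 64) log-mel input by 8x in time.
if DEBUG:
    _cnn10 = Cnn10(scale=8)
    assert _cnn10(torch.randn(2, 1, 500, 64)).shape == (2, 512, 62, 1)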
|
|
|
class MeanPool(nn.Module): |
|
def __init__(self, pooldim=1): |
|
super().__init__() |
|
self.pooldim = pooldim |
|
|
|
def forward(self, logits, decision): |
|
return torch.mean(decision, dim=self.pooldim) |
|
|
|
class ResPool(nn.Module):
    def __init__(self, pooldim=1):
        super().__init__()
        self.pooldim = pooldim
        self.linPool = LinearSoftPool(pooldim=1)

    def forward(self, logits, decision):
        # The original defined no forward; delegating to the linear-softmax
        # pooling above is the assumed intent.
        return self.linPool(logits, decision)
|
|
|
class AutoExpPool(nn.Module): |
|
def __init__(self, outputdim=10, pooldim=1): |
|
super().__init__() |
|
self.outputdim = outputdim |
|
        # Fill with a float: an integer tensor cannot require gradients.
        self.alpha = nn.Parameter(torch.full((outputdim, ), 1.0))
|
self.pooldim = pooldim |
|
|
|
def forward(self, logits, decision): |
|
scaled = self.alpha * decision |
|
return (logits * torch.exp(scaled)).sum( |
|
self.pooldim) / torch.exp(scaled).sum(self.pooldim) |
|
|
|
|
|
class SoftPool(nn.Module): |
|
def __init__(self, T=1, pooldim=1): |
|
super().__init__() |
|
self.pooldim = pooldim |
|
self.T = T |
|
|
|
def forward(self, logits, decision): |
|
w = torch.softmax(decision / self.T, dim=self.pooldim) |
|
return torch.sum(decision * w, dim=self.pooldim) |
|
|
|
|
|
class AutoPool(nn.Module): |
|
"""docstring for AutoPool""" |
|
def __init__(self, outputdim=10, pooldim=1): |
|
super().__init__() |
|
self.outputdim = outputdim |
|
self.alpha = nn.Parameter(torch.ones(outputdim)) |
|
self.dim = pooldim |
|
|
|
def forward(self, logits, decision): |
|
scaled = self.alpha * decision |
|
weight = torch.softmax(scaled, dim=self.dim) |
|
return torch.sum(decision * weight, dim=self.dim) |
|
|
|
|
|
class ExtAttentionPool(nn.Module): |
|
def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs): |
|
super().__init__() |
|
self.inputdim = inputdim |
|
self.outputdim = outputdim |
|
self.pooldim = pooldim |
|
self.attention = nn.Linear(inputdim, outputdim) |
|
nn.init.zeros_(self.attention.weight) |
|
nn.init.zeros_(self.attention.bias) |
|
self.activ = nn.Softmax(dim=self.pooldim) |
|
|
|
def forward(self, logits, decision): |
|
|
|
w_x = self.activ(self.attention(logits) / self.outputdim) |
|
h = (logits.permute(0, 2, 1).contiguous().unsqueeze(-2) * |
|
w_x.unsqueeze(-1)).flatten(-2).contiguous() |
|
return torch.sum(h, self.pooldim) |
|
|
|
|
|
class AttentionPool(nn.Module): |
|
"""docstring for AttentionPool""" |
|
def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs): |
|
super().__init__() |
|
self.inputdim = inputdim |
|
self.outputdim = outputdim |
|
self.pooldim = pooldim |
|
self.transform = nn.Linear(inputdim, outputdim) |
|
self.activ = nn.Softmax(dim=self.pooldim) |
|
self.eps = 1e-7 |
|
|
|
def forward(self, logits, decision): |
|
|
|
|
|
w = self.activ(torch.clamp(self.transform(logits), -15, 15)) |
|
detect = (decision * w).sum( |
|
self.pooldim) / (w.sum(self.pooldim) + self.eps) |
|
|
|
return detect |
|
|
|
class Block2D(nn.Module): |
|
def __init__(self, cin, cout, kernel_size=3, padding=1): |
|
super().__init__() |
|
self.block = nn.Sequential( |
|
nn.BatchNorm2d(cin), |
|
nn.Conv2d(cin, |
|
cout, |
|
kernel_size=kernel_size, |
|
padding=padding, |
|
bias=False), |
|
nn.LeakyReLU(inplace=True, negative_slope=0.1)) |
|
|
|
def forward(self, x): |
|
return self.block(x) |
|
|
|
class AudioCNN(nn.Module): |
|
def __init__(self, classes_num): |
|
super(AudioCNN, self).__init__() |
|
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) |
|
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) |
|
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) |
|
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) |
|
self.fc1 = nn.Linear(512,128,bias=True) |
|
self.fc = nn.Linear(128, classes_num, bias=True) |
|
self.init_weights() |
|
|
|
def init_weights(self): |
|
init_layer(self.fc) |
|
|
|
def forward(self, input): |
|
''' |
|
Input: (batch_size, times_steps, freq_bins)''' |
|
|
|
x = input[:, None, :, :] |
|
'''(batch_size, 1, times_steps, freq_bins)''' |
|
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') |
|
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') |
|
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') |
|
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') |
|
'''(batch_size, feature_maps, time_steps, freq_bins)''' |
|
x = torch.mean(x, dim=3) |
|
(x, _) = torch.max(x, dim=2) |
|
x = self.fc1(x) |
|
output = self.fc(x) |
|
return x,output |
|
|
|
def extract(self,input): |
|
'''Input: (batch_size, times_steps, freq_bins)''' |
|
x = input[:, None, :, :] |
|
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') |
|
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') |
|
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') |
|
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') |
|
'''(batch_size, feature_maps, time_steps, freq_bins)''' |
|
x = torch.mean(x, dim=3) |
|
(x, _) = torch.max(x, dim=2) |
|
x = self.fc1(x) |
|
return x |
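
# Illustrative shape check for AudioCNN (DEBUG-gated): forward() returns both
# the 128-d clip embedding and the class logits.
if DEBUG:
    _acnn = AudioCNN(classes_num=10)
    _emb, _logits = _acnn(torch.randn(2, 500, 64))
    assert _emb.shape == (2, 128) and _logits.shape == (2, 10)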
|
|
|
def parse_poolingfunction(poolingfunction_name='mean', **kwargs): |
|
"""parse_poolingfunction |
|
A heler function to parse any temporal pooling |
|
Pooling is done on dimension 1 |
|
:param poolingfunction_name: |
|
:param **kwargs: |
|
""" |
|
poolingfunction_name = poolingfunction_name.lower() |
|
if poolingfunction_name == 'mean': |
|
return MeanPool(pooldim=1) |
|
elif poolingfunction_name == 'max': |
|
return MaxPool(pooldim=1) |
|
elif poolingfunction_name == 'linear': |
|
return LinearSoftPool(pooldim=1) |
|
elif poolingfunction_name == 'expalpha': |
|
return AutoExpPool(outputdim=kwargs['outputdim'], pooldim=1) |
|
|
|
elif poolingfunction_name == 'soft': |
|
return SoftPool(pooldim=1) |
|
elif poolingfunction_name == 'auto': |
|
return AutoPool(outputdim=kwargs['outputdim']) |
|
    elif poolingfunction_name == 'attention':
        return AttentionPool(inputdim=kwargs['inputdim'],
                             outputdim=kwargs['outputdim'])
    else:
        raise ValueError(f'Unknown pooling function: {poolingfunction_name}')
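
# Factory usage (illustrative, DEBUG-gated): attention pooling needs inputdim
# and outputdim; the parameter-free poolings need neither.
if DEBUG:
    _mean_pool = parse_poolingfunction('mean')
    _att_pool = parse_poolingfunction('attention', inputdim=256, outputdim=10)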
|
class conv1d(nn.Module): |
|
def __init__(self, nin, nout, kernel_size=3, stride=1, padding='VALID', dilation=1): |
|
super(conv1d, self).__init__() |
|
if padding == 'VALID': |
|
dconv_pad = 0 |
|
elif padding == 'SAME': |
|
dconv_pad = dilation * ((kernel_size - 1) // 2) |
|
else: |
|
raise ValueError("Padding Mode Error!") |
|
self.conv = nn.Conv1d(nin, nout, kernel_size=kernel_size, stride=stride, padding=dconv_pad) |
|
self.act = nn.ReLU() |
|
self.init_layer(self.conv) |
|
|
|
def init_layer(self, layer, nonlinearity='relu'): |
|
"""Initialize a Linear or Convolutional layer. """ |
|
nn.init.kaiming_normal_(layer.weight, nonlinearity=nonlinearity) |
|
nn.init.constant_(layer.bias, 0.1) |
|
|
|
def forward(self, x): |
|
out = self.act(self.conv(x)) |
|
return out |
|
|
|
class Atten_1(nn.Module): |
|
def __init__(self, input_dim, context=2, dropout_rate=0.2): |
|
super(Atten_1, self).__init__() |
|
self._matrix_k = nn.Linear(input_dim, input_dim // 4) |
|
self._matrix_q = nn.Linear(input_dim, input_dim // 4) |
|
self.relu = nn.ReLU() |
|
self.context = context |
|
self._dropout_layer = nn.Dropout(dropout_rate) |
|
self.init_layer(self._matrix_k) |
|
self.init_layer(self._matrix_q) |
|
|
|
def init_layer(self, layer, nonlinearity='leaky_relu'): |
|
"""Initialize a Linear or Convolutional layer. """ |
|
nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity) |
|
if hasattr(layer, 'bias'): |
|
if layer.bias is not None: |
|
layer.bias.data.fill_(0.) |
|
|
|
def forward(self, input_x): |
|
k_x = input_x |
|
k_x = self.relu(self._matrix_k(k_x)) |
|
k_x = self._dropout_layer(k_x) |
|
|
|
q_x = input_x[:, self.context, :] |
|
|
|
q_x = q_x[:, None, :] |
|
|
|
q_x = self.relu(self._matrix_q(q_x)) |
|
q_x = self._dropout_layer(q_x) |
|
|
|
x_ = torch.matmul(k_x, q_x.transpose(-2, -1) / math.sqrt(k_x.size(-1))) |
|
|
|
x_ = x_.squeeze(2) |
|
alpha = F.softmax(x_, dim=-1) |
|
att_ = alpha |
|
|
|
alpha = alpha.unsqueeze(2).repeat(1,1,input_x.shape[2]) |
|
|
|
|
|
out = alpha * input_x |
|
|
|
|
|
out = out.mean(1) |
|
|
|
|
|
|
|
|
|
out = input_x[:, self.context, :] + out |
|
return out |
|
|
|
class Fusion(nn.Module): |
|
def __init__(self, inputdim, inputdim2, n_fac): |
|
super().__init__() |
|
self.fuse_layer1 = conv1d(inputdim, inputdim2*n_fac,1) |
|
self.fuse_layer2 = conv1d(inputdim2, inputdim2*n_fac,1) |
|
self.avg_pool = nn.AvgPool1d(n_fac, stride=n_fac) |
|
|
|
def forward(self,embedding,mix_embed): |
|
embedding = embedding.permute(0,2,1) |
|
fuse1_out = self.fuse_layer1(embedding) |
|
fuse1_out = fuse1_out.permute(0,2,1) |
|
|
|
mix_embed = mix_embed.permute(0,2,1) |
|
fuse2_out = self.fuse_layer2(mix_embed) |
|
fuse2_out = fuse2_out.permute(0,2,1) |
|
as_embs = torch.mul(fuse1_out, fuse2_out) |
|
|
|
as_embs = self.avg_pool(as_embs) |
|
return as_embs |
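
# Illustrative shape check for Fusion (DEBUG-gated): both streams are projected
# to inputdim2 * n_fac channels, multiplied elementwise, then average-pooled
# back down to inputdim2 along the feature axis.
if DEBUG:
    _fusion = Fusion(inputdim=128, inputdim2=512, n_fac=2)
    assert _fusion(torch.randn(2, 62, 128),
                   torch.randn(2, 62, 512)).shape == (2, 62, 512)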
|
|
|
class CDur_fusion(nn.Module): |
|
def __init__(self, inputdim, outputdim, **kwargs): |
|
super().__init__() |
|
self.features = nn.Sequential( |
|
Block2D(1, 32), |
|
nn.LPPool2d(4, (2, 4)), |
|
Block2D(32, 128), |
|
Block2D(128, 128), |
|
nn.LPPool2d(4, (2, 4)), |
|
Block2D(128, 128), |
|
Block2D(128, 128), |
|
nn.LPPool2d(4, (1, 4)), |
|
nn.Dropout(0.3), |
|
) |
|
with torch.no_grad(): |
|
rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape |
|
rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] |
|
|
|
self.gru = nn.GRU(128, 128, bidirectional=True, batch_first=True) |
|
        # The original called Fusion(128, 2), but Fusion takes
        # (inputdim, inputdim2, n_fac); both streams are 128-d here.
        self.fusion = Fusion(128, 128, 2)
|
self.fc = nn.Linear(256,256) |
|
self.outputlayer = nn.Linear(256, outputdim) |
|
self.features.apply(init_weights) |
|
self.outputlayer.apply(init_weights) |
|
|
|
def forward(self, x, embedding): |
|
batch, time, dim = x.shape |
|
x = x.unsqueeze(1) |
|
x = self.features(x) |
|
x = x.transpose(1, 2).contiguous().flatten(-2) |
|
embedding = embedding.unsqueeze(1) |
|
embedding = embedding.repeat(1, x.shape[1], 1) |
|
x = self.fusion(embedding,x) |
|
|
|
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
            self._flattened = True
|
x, _ = self.gru(x) |
|
x = self.fc(x) |
|
decision_time = torch.softmax(self.outputlayer(x),dim=2) |
|
decision_up = torch.nn.functional.interpolate( |
|
decision_time.transpose(1, 2), |
|
time, |
|
mode='linear', |
|
align_corners=False).transpose(1, 2) |
|
return decision_time[:,:,0],decision_up |
|
|
|
class CDur(nn.Module): |
|
def __init__(self, inputdim, outputdim,time_resolution, **kwargs): |
|
super().__init__() |
|
self.features = nn.Sequential( |
|
Block2D(1, 32), |
|
nn.LPPool2d(4, (2, 4)), |
|
Block2D(32, 128), |
|
Block2D(128, 128), |
|
nn.LPPool2d(4, (2, 4)), |
|
Block2D(128, 128), |
|
Block2D(128, 128), |
|
nn.LPPool2d(4, (2, 4)), |
|
nn.Dropout(0.3), |
|
) |
|
with torch.no_grad(): |
|
rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape |
|
rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] |
|
|
|
self.gru = nn.GRU(256, 256, bidirectional=True, batch_first=True) |
|
self.fc = nn.Linear(512,256) |
|
self.outputlayer = nn.Linear(256, outputdim) |
|
self.features.apply(init_weights) |
|
self.outputlayer.apply(init_weights) |
|
|
|
def forward(self, x, embedding,one_hot=None): |
|
batch, time, dim = x.shape |
|
x = x.unsqueeze(1) |
|
x = self.features(x) |
|
x = x.transpose(1, 2).contiguous().flatten(-2) |
|
embedding = embedding.unsqueeze(1) |
|
embedding = embedding.repeat(1, x.shape[1], 1) |
|
x = torch.cat((x, embedding), dim=2) |
|
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
            self._flattened = True
|
x, _ = self.gru(x) |
|
x = self.fc(x) |
|
decision_time = torch.softmax(self.outputlayer(x),dim=2) |
|
decision_up = torch.nn.functional.interpolate( |
|
decision_time.transpose(1, 2), |
|
time, |
|
mode='linear', |
|
align_corners=False).transpose(1, 2) |
|
return decision_time[:,:,0],decision_up |
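
# Illustrative shape check for CDur (DEBUG-gated): a (batch, 500, 64) log-mel
# clip plus a 128-d reference embedding yields 62 decision frames, which are
# then upsampled back to the 500 input frames.
if DEBUG:
    _cdur = CDur(inputdim=64, outputdim=2, time_resolution=125)
    _frame, _up = _cdur(torch.randn(2, 500, 64), torch.randn(2, 128))
    assert _frame.shape == (2, 62) and _up.shape == (2, 500, 2)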
|
|
|
class CDur_big(nn.Module): |
|
def __init__(self, inputdim, outputdim, **kwargs): |
|
super().__init__() |
|
self.features = nn.Sequential( |
|
Block2D(1, 64), |
|
Block2D(64, 64), |
|
nn.LPPool2d(4, (2, 2)), |
|
Block2D(64, 128), |
|
Block2D(128, 128), |
|
nn.LPPool2d(4, (2, 2)), |
|
Block2D(128, 256), |
|
Block2D(256, 256), |
|
nn.LPPool2d(4, (2, 4)), |
|
Block2D(256, 512), |
|
Block2D(512, 512), |
|
nn.LPPool2d(4, (1, 4)), |
|
nn.Dropout(0.3),) |
|
with torch.no_grad(): |
|
rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape |
|
rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] |
|
self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True) |
|
self.fc = nn.Linear(1024,256) |
|
self.outputlayer = nn.Linear(256, outputdim) |
|
self.features.apply(init_weights) |
|
self.outputlayer.apply(init_weights) |
|
|
|
def forward(self, x, embedding): |
|
batch, time, dim = x.shape |
|
x = x.unsqueeze(1) |
|
x = self.features(x) |
|
x = x.transpose(1, 2).contiguous().flatten(-2) |
|
embedding = embedding.unsqueeze(1) |
|
embedding = embedding.repeat(1, x.shape[1], 1) |
|
x = torch.cat((x, embedding), dim=2) |
|
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
            self._flattened = True
|
x, _ = self.gru(x) |
|
x = self.fc(x) |
|
decision_time = torch.softmax(self.outputlayer(x),dim=2) |
|
decision_up = torch.nn.functional.interpolate( |
|
decision_time.transpose(1, 2), |
|
time, |
|
mode='linear', |
|
align_corners=False).transpose(1, 2) |
|
return decision_time[:,:,0],decision_up |
|
|
|
class CDur_GLU(nn.Module): |
|
def __init__(self, inputdim, outputdim, **kwargs): |
|
super().__init__() |
|
self.features = Mul_scale_GLU() |
|
|
|
|
|
|
|
self.gru = nn.GRU(640, 512,1, bidirectional=True, batch_first=True) |
|
|
|
self.fc = nn.Linear(1024,256) |
|
self.outputlayer = nn.Linear(256, outputdim) |
|
|
|
self.outputlayer.apply(init_weights) |
|
|
|
def forward(self, x, embedding,one_hot=None): |
|
batch, time, dim = x.shape |
|
x = x.unsqueeze(1) |
|
x = self.features(x) |
|
x = x.transpose(1, 2).contiguous().flatten(-2) |
|
|
|
|
|
embedding = embedding.unsqueeze(1) |
|
embedding = embedding.repeat(1, x.shape[1], 1) |
|
|
|
x = torch.cat((x, embedding), dim=2) |
|
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
            self._flattened = True
|
x, _ = self.gru(x) |
|
|
|
x = self.fc(x) |
|
decision_time = torch.softmax(self.outputlayer(x),dim=2) |
|
decision_up = torch.nn.functional.interpolate( |
|
decision_time.transpose(1, 2), |
|
time, |
|
mode='linear', |
|
align_corners=False).transpose(1, 2) |
|
return decision_time[:,:,0],decision_up |
|
|
|
class CDur_CNN14(nn.Module): |
|
def __init__(self, inputdim, outputdim,time_resolution,**kwargs): |
|
super().__init__() |
|
if time_resolution==125: |
|
self.features = Cnn10(8) |
|
elif time_resolution == 250: |
|
|
|
self.features = Cnn10(4) |
|
elif time_resolution == 500: |
|
self.features = Cnn10(2) |
|
else: |
|
self.features = Cnn10(0) |
|
with torch.no_grad(): |
|
rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape |
|
rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] |
|
|
|
self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True) |
|
|
|
self.fc = nn.Linear(1024,256) |
|
self.outputlayer = nn.Linear(256, outputdim) |
|
|
|
self.outputlayer.apply(init_weights) |
|
|
|
def forward(self, x, embedding,one_hot=None): |
|
batch, time, dim = x.shape |
|
x = x.unsqueeze(1) |
|
x = self.features(x) |
|
x = x.transpose(1, 2).contiguous().flatten(-2) |
|
|
|
|
|
embedding = embedding.unsqueeze(1) |
|
embedding = embedding.repeat(1, x.shape[1], 1) |
|
x = torch.cat((x, embedding), dim=2) |
|
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
            self._flattened = True
|
x, _ = self.gru(x) |
|
|
|
x = self.fc(x) |
|
decision_time = torch.softmax(self.outputlayer(x),dim=2) |
|
decision_up = torch.nn.functional.interpolate( |
|
decision_time.transpose(1, 2), |
|
time, |
|
mode='linear', |
|
align_corners=False).transpose(1, 2) |
|
return decision_time[:,:,0],decision_up |
|
|
|
class CDur_CNN_mul_scale(nn.Module): |
|
def __init__(self, inputdim, outputdim,time_resolution,**kwargs): |
|
super().__init__() |
|
if time_resolution==125: |
|
self.features = Cnn10_mul_scale(8) |
|
elif time_resolution == 250: |
|
|
|
self.features = Cnn10_mul_scale(4) |
|
elif time_resolution == 500: |
|
self.features = Cnn10_mul_scale(2) |
|
else: |
|
self.features = Cnn10_mul_scale(0) |
|
|
|
|
|
|
|
|
|
self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True) |
|
|
|
self.fc = nn.Linear(1024,256) |
|
self.outputlayer = nn.Linear(256, outputdim) |
|
|
|
self.outputlayer.apply(init_weights) |
|
|
|
def forward(self, x, embedding,one_hot=None): |
|
|
|
|
|
batch, time, dim = x.shape |
|
x = x.unsqueeze(1) |
|
x = self.features(x) |
|
x = x.transpose(1, 2).contiguous().flatten(-2) |
|
|
|
|
|
embedding = embedding.unsqueeze(1) |
|
embedding = embedding.repeat(1, x.shape[1], 1) |
|
x = torch.cat((x, embedding), dim=2) |
|
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
            self._flattened = True
|
x, _ = self.gru(x) |
|
|
|
x = self.fc(x) |
|
decision_time = torch.softmax(self.outputlayer(x),dim=2) |
|
decision_up = torch.nn.functional.interpolate( |
|
decision_time.transpose(1, 2), |
|
time, |
|
mode='linear', |
|
align_corners=False).transpose(1, 2) |
|
return decision_time[:,:,0],decision_up |
|
|
|
class CDur_CNN_mul_scale_fusion(nn.Module): |
|
def __init__(self, inputdim, outputdim, time_resolution,**kwargs): |
|
super().__init__() |
|
if time_resolution==125: |
|
self.features = Cnn10_mul_scale(8) |
|
elif time_resolution == 250: |
|
|
|
self.features = Cnn10_mul_scale(4) |
|
elif time_resolution == 500: |
|
self.features = Cnn10_mul_scale(2) |
|
else: |
|
self.features = Cnn10_mul_scale(0) |
|
|
|
|
|
|
|
|
|
self.gru = nn.GRU(512, 512, bidirectional=True, batch_first=True) |
|
|
|
self.fc = nn.Linear(1024,256) |
|
self.fusion = Fusion(128,512,2) |
|
self.outputlayer = nn.Linear(256, outputdim) |
|
|
|
self.outputlayer.apply(init_weights) |
|
|
|
def forward(self, x, embedding,one_hot=None): |
|
|
|
|
|
batch, time, dim = x.shape |
|
x = x.unsqueeze(1) |
|
x = self.features(x) |
|
x = x.transpose(1, 2).contiguous().flatten(-2) |
|
|
|
|
|
embedding = embedding.unsqueeze(1) |
|
embedding = embedding.repeat(1, x.shape[1], 1) |
|
x = self.fusion(embedding, x) |
|
|
|
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
            self._flattened = True
|
x, _ = self.gru(x) |
|
|
|
x = self.fc(x) |
|
decision_time = torch.softmax(self.outputlayer(x),dim=2) |
|
decision_up = torch.nn.functional.interpolate( |
|
decision_time.transpose(1, 2), |
|
time, |
|
mode='linear', |
|
align_corners=False).transpose(1, 2) |
|
return decision_time[:,:,0],decision_up |
|
|
|
|
|
class RaDur_fusion(nn.Module): |
|
def __init__(self, model_config, inputdim, outputdim, time_resolution, **kwargs): |
|
super().__init__() |
|
self.encoder = Cnn14() |
|
self.detection = CDur_CNN_mul_scale_fusion(inputdim, outputdim, time_resolution) |
|
self.softmax = nn.Softmax(dim=2) |
|
|
|
|
|
|
|
|
|
|
|
self.q = nn.Linear(128,128) |
|
self.k = nn.Linear(128,128) |
|
self.q_ee = nn.Linear(128, 128) |
|
self.k_ee = nn.Linear(128, 128) |
|
        self.temperature = 11.3  # ~= sqrt(128), the scaled dot-product temperature
|
self.att_pool = model_config['att_pool'] |
|
self.enhancement = model_config['enhancement'] |
|
self.tao = model_config['tao'] |
|
self.top = model_config['top'] |
|
self.bn = nn.BatchNorm1d(128) |
|
self.EE_fusion = Fusion(128, 128, 4) |
|
|
|
def get_w(self,q,k): |
|
q = self.q(q) |
|
k = self.k(k) |
|
q = q.unsqueeze(1) |
|
attn = torch.bmm(q, k.transpose(1, 2)) |
|
attn = attn/self.temperature |
|
attn = self.softmax(attn) |
|
return attn |
|
|
|
def get_w_ee(self,q,k): |
|
q = self.q_ee(q) |
|
k = self.k_ee(k) |
|
q = q.unsqueeze(1) |
|
attn = torch.bmm(q, k.transpose(1, 2)) |
|
attn = attn/self.temperature |
|
attn = self.softmax(attn) |
|
return attn |
|
|
|
def attention_pooling(self, embeddings, mean_embedding): |
|
att_pool_w = self.get_w(mean_embedding,embeddings) |
|
embedding = torch.bmm(att_pool_w, embeddings).squeeze(1) |
|
|
|
|
|
|
|
|
|
return embedding |
|
|
|
    def select_topk_embeddings(self, scores, embeddings, k):
        # Keep the k highest frame scores and gather the corresponding
        # embedding frames.
        sorted_scores, idx_DESC = scores.sort(descending=True, dim=1)
        top_k = sorted_scores[:, :k]

        idx_topk = idx_DESC[:, :k]

        idx_topk = idx_topk.unsqueeze(2).expand([-1, -1, embeddings.shape[2]])
        selected_embeddings = torch.gather(embeddings, 1, idx_topk)
        return selected_embeddings, top_k
|
|
|
def sum_with_attention(self, embedding, top_k, selected_embeddings): |
|
|
|
|
|
att_1 = self.get_w_ee(embedding, selected_embeddings) |
|
att_1 = att_1.squeeze(1) |
|
|
|
        # Zero out frames whose score does not exceed the threshold tao, then
        # weight the attention by the (thresholded) scores.
        larger = top_k > self.tao
        top_k = top_k * larger

        att_1 = att_1 * top_k
|
|
|
|
|
att_2 = att_1.unsqueeze(2).repeat(1,1,128) |
|
Es = selected_embeddings*att_2 |
|
return Es |
|
|
|
    def orcal_EE(self, x, embedding, label):
        # ("orcal" kept as in the original; presumably "oracle".) Re-estimate
        # the reference embedding from the top-scoring mixture frames, fuse it
        # back in, and blend the two detection passes.
|
batch, time, dim = x.shape |
|
|
|
mixture_embedding = self.encoder(x) |
|
mixture_embedding = mixture_embedding.transpose(1,2) |
|
mixture_embedding = self.bn(mixture_embedding) |
|
mixture_embedding = mixture_embedding.transpose(1,2) |
|
|
|
x = x.unsqueeze(1) |
|
x = self.detection.features(x) |
|
x = x.transpose(1, 2).contiguous().flatten(-2) |
|
embedding_pre = embedding.unsqueeze(1) |
|
embedding_pre = embedding_pre.repeat(1, x.shape[1], 1) |
|
f = self.detection.fusion(embedding_pre, x) |
|
|
|
        if not hasattr(self, '_flattened'):
            self.detection.gru.flatten_parameters()
            self._flattened = True
|
f, _ = self.detection.gru(f) |
|
f = self.detection.fc(f) |
|
decision_time = torch.softmax(self.detection.outputlayer(f),dim=2) |
|
|
|
selected_embeddings, top_k = self.select_topk_embeddings(decision_time[:,:,0], mixture_embedding, self.top) |
|
|
|
selected_embeddings = self.sum_with_attention(embedding, top_k, selected_embeddings) |
|
|
|
mix_embedding = selected_embeddings.mean(1).unsqueeze(1) |
|
mix_embedding = mix_embedding.repeat(1, x.shape[1], 1) |
|
embedding = embedding.unsqueeze(1) |
|
embedding = embedding.repeat(1, x.shape[1], 1) |
|
mix_embedding = self.EE_fusion(mix_embedding, embedding) |
|
|
|
|
|
|
|
|
|
|
|
f_now = self.detection.fusion(mix_embedding, x) |
|
|
|
f_now, _ = self.detection.gru(f_now) |
|
f_now = self.detection.fc(f_now) |
|
decision_time_now = torch.softmax(self.detection.outputlayer(f_now), dim=2) |
|
|
|
        # Clip-level confidence: mean of the top-k scores, thresholded by tao
        # and scaled into [0, 0.5]; it blends the two detection passes below.
        top_k = top_k.mean(1)
        larger = top_k > self.tao
        top_k = top_k * larger
        top_k = top_k / 2.0
|
|
|
|
|
|
|
|
|
|
|
neg_w = top_k.unsqueeze(1).unsqueeze(2) |
|
neg_w = neg_w.repeat(1, decision_time_now.shape[1], decision_time_now.shape[2]) |
|
|
|
|
|
pos_w = 1-neg_w |
|
|
|
decision_time_final = decision_time*pos_w + neg_w*decision_time_now |
|
|
|
|
|
|
|
return decision_time_final |
|
|
|
def forward(self, x, ref, label=None): |
|
batch, time, dim = x.shape |
|
        logit = torch.zeros(1).cuda()  # placeholder returned for API compatibility (assumes CUDA)
|
embeddings = self.encoder(ref) |
|
mean_embedding = embeddings.mean(1) |
|
        if self.att_pool:
|
mean_embedding = self.bn(mean_embedding) |
|
embeddings = embeddings.transpose(1,2) |
|
embeddings = self.bn(embeddings) |
|
embeddings = embeddings.transpose(1,2) |
|
embedding = self.attention_pooling(embeddings, mean_embedding) |
|
else: |
|
embedding = mean_embedding |
|
        if self.enhancement:
|
decision_time = self.orcal_EE(x, embedding, label) |
|
decision_up = torch.nn.functional.interpolate( |
|
decision_time.transpose(1, 2), |
|
time, |
|
mode='linear', |
|
align_corners=False).transpose(1, 2) |
|
return decision_time[:,:,0], decision_up, logit |
|
|
|
x = x.unsqueeze(1) |
|
x = self.detection.features(x) |
|
x = x.transpose(1, 2).contiguous().flatten(-2) |
|
embedding = embedding.unsqueeze(1) |
|
embedding = embedding.repeat(1, x.shape[1], 1) |
|
|
|
x = self.detection.fusion(embedding, x) |
|
|
|
|
|
|
|
        if not hasattr(self, '_flattened'):
            self.detection.gru.flatten_parameters()
            self._flattened = True
|
x, _ = self.detection.gru(x) |
|
x = self.detection.fc(x) |
|
decision_time = torch.softmax(self.detection.outputlayer(x),dim=2) |
|
decision_up = torch.nn.functional.interpolate( |
|
decision_time.transpose(1, 2), |
|
time, |
|
mode='linear', |
|
align_corners=False).transpose(1, 2) |
|
return decision_time[:,:,0], decision_up, logit |
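
# End-to-end sketch (illustrative only; gated behind DEBUG). model_config needs
# 'att_pool', 'enhancement', 'tao' and 'top' keys, and forward() allocates a
# CUDA tensor, so a GPU is assumed.
if DEBUG:
    _config = {'att_pool': True, 'enhancement': False, 'tao': 0.5, 'top': 10}
    _radur = RaDur_fusion(_config, inputdim=64, outputdim=2,
                          time_resolution=125).cuda()
    _mix = torch.randn(2, 500, 64).cuda()
    _ref = torch.randn(2, 500, 64).cuda()
    _frame, _up, _logit = _radur(_mix, _ref)
    assert _up.shape == (2, 500, 2)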