from collections import OrderedDict import torch from diffusers import ConfigMixin, ModelMixin class AudioEmotionClassifierModel(ModelMixin, ConfigMixin): num_emotion_classes = 9 def __init__(self, num_classifier_layers=5, num_classifier_channels=2048): super().__init__() if num_classifier_layers == 1: self.layers = torch.nn.Linear(1024, self.num_emotion_classes) else: layer_list = [ ("fc1", torch.nn.Linear(1024, num_classifier_channels)), ("relu1", torch.nn.ReLU()), ] for n in range(num_classifier_layers - 2): layer_list.append((f"fc{n+2}", torch.nn.Linear(num_classifier_channels, num_classifier_channels))) layer_list.append((f"relu{n+2}", torch.nn.ReLU())) layer_list.append( (f"fc{num_classifier_layers}", torch.nn.Linear(num_classifier_channels, self.num_emotion_classes)) ) self.layers = torch.nn.Sequential(OrderedDict(layer_list)) def forward(self, x): x = self.layers(x) x = torch.softmax(x, dim=-1) return x