HoneyTian committed on
Commit
12e00ec
·
1 Parent(s): 68ac03b
examples/vm_sound_classification/conv2d_classifier.yaml CHANGED
@@ -10,29 +10,24 @@ mel_spectrogram_param:
10
  window_fn: hamming
11
  n_mels: 80
12
 
13
- spec_augment_param:
14
- aug_volume_factor_range:
15
- - 0.5
16
- - 2.0
17
-
18
  conv2d_block_param_list:
19
  - batch_norm: true
20
  in_channels: 1
21
- out_channels: 16
22
  kernel_size: 3
23
  stride: 1
24
  dilation: 3
25
  activation: relu
26
  dropout: 0.1
27
- - in_channels: 16
28
- out_channels: 16
29
  kernel_size: 5
30
  stride: 2
31
  dilation: 3
32
  activation: relu
33
  dropout: 0.1
34
- - in_channels: 16
35
- out_channels: 16
36
  kernel_size: 3
37
  stride: 1
38
  dilation: 2
@@ -40,7 +35,7 @@ conv2d_block_param_list:
40
  dropout: 0.1
41
 
42
  cls_head_param:
43
- input_dim: 432
44
  num_layers: 2
45
  hidden_dims:
46
  - 128
 
10
  window_fn: hamming
11
  n_mels: 80
12
 
 
 
 
 
 
13
  conv2d_block_param_list:
14
  - batch_norm: true
15
  in_channels: 1
16
+ out_channels: 4
17
  kernel_size: 3
18
  stride: 1
19
  dilation: 3
20
  activation: relu
21
  dropout: 0.1
22
+ - in_channels: 4
23
+ out_channels: 4
24
  kernel_size: 5
25
  stride: 2
26
  dilation: 3
27
  activation: relu
28
  dropout: 0.1
29
+ - in_channels: 4
30
+ out_channels: 4
31
  kernel_size: 3
32
  stride: 1
33
  dilation: 2
 
35
  dropout: 0.1
36
 
37
  cls_head_param:
38
+ input_dim: 108
39
  num_layers: 2
40
  hidden_dims:
41
  - 128
examples/vm_sound_classification/run.sh CHANGED
@@ -12,8 +12,8 @@ sh run.sh --stage 2 --stop_stage 2 --system_version windows --file_folder_name f
12
  E:/Users/tianx/HuggingDatasets/vm_sound_classification/data/wav_finished/id-ID/wav_finished/*/*.wav" \
13
  --label_plan 4
14
 
15
- sh run.sh --stage 0 --stop_stage 5 --system_version centos --file_folder_name file_dir --final_model_name vm_sound_classification2-ch16 \
16
- --filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" --label_plan 2
17
 
18
  "
19
 
 
12
  E:/Users/tianx/HuggingDatasets/vm_sound_classification/data/wav_finished/id-ID/wav_finished/*/*.wav" \
13
  --label_plan 4
14
 
15
+ sh run.sh --stage 0 --stop_stage 5 --system_version centos --file_folder_name file_dir --final_model_name vm_sound_classification3-ch16 \
16
+ --filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" --label_plan 3
17
 
18
  "
19
 
toolbox/torchaudio/models/cnn_audio_classifier/configuration_cnn_audio_classifier.py CHANGED
@@ -8,7 +8,6 @@ from toolbox.torchaudio.configuration_utils import PretrainedConfig
8
  class CnnAudioClassifierConfig(PretrainedConfig):
9
  def __init__(self,
10
  mel_spectrogram_param: dict,
11
- spec_augment_param: dict,
12
  cls_head_param: dict,
13
  conv1d_block_param_list: List[dict] = None,
14
  conv2d_block_param_list: List[dict] = None,
@@ -16,7 +15,6 @@ class CnnAudioClassifierConfig(PretrainedConfig):
16
  ):
17
  super(CnnAudioClassifierConfig, self).__init__(**kwargs)
18
  self.mel_spectrogram_param = mel_spectrogram_param
19
- self.spec_augment_param = spec_augment_param
20
  self.cls_head_param = cls_head_param
21
  self.conv1d_block_param_list = conv1d_block_param_list
22
  self.conv2d_block_param_list = conv2d_block_param_list
 
8
  class CnnAudioClassifierConfig(PretrainedConfig):
9
  def __init__(self,
10
  mel_spectrogram_param: dict,
 
11
  cls_head_param: dict,
12
  conv1d_block_param_list: List[dict] = None,
13
  conv2d_block_param_list: List[dict] = None,
 
15
  ):
16
  super(CnnAudioClassifierConfig, self).__init__(**kwargs)
17
  self.mel_spectrogram_param = mel_spectrogram_param
 
18
  self.cls_head_param = cls_head_param
19
  self.conv1d_block_param_list = conv1d_block_param_list
20
  self.conv2d_block_param_list = conv2d_block_param_list
toolbox/torchaudio/models/cnn_audio_classifier/modeling_cnn_audio_classifier.py CHANGED
@@ -9,7 +9,6 @@ import torchaudio
9
 
10
  from toolbox.torchaudio.models.cnn_audio_classifier.configuration_cnn_audio_classifier import CnnAudioClassifierConfig
11
  from toolbox.torchaudio.configuration_utils import CONFIG_FILE
12
- from toolbox.torchaudio.augment.spec_augment import SpecAugment
13
 
14
 
15
  MODEL_FILE = "model.pt"
@@ -241,7 +240,6 @@ class SpectrogramEncoder(nn.Module):
241
  class WaveEncoder(nn.Module):
242
  def __init__(self,
243
  mel_spectrogram_param: dict,
244
- spec_augment_param: dict,
245
  conv1d_block_param_list: List[dict] = None,
246
  conv2d_block_param_list: List[dict] = None,
247
  ):
@@ -264,21 +262,11 @@ class WaveEncoder(nn.Module):
264
  ),
265
  )
266
 
267
- self.spec_augment = SpecAugment(
268
- aug_volume_factor_range=spec_augment_param["aug_volume_factor_range"]
269
- )
270
-
271
  self.spectrogram_encoder = SpectrogramEncoder(
272
  conv1d_block_param_list=conv1d_block_param_list,
273
  conv2d_block_param_list=conv2d_block_param_list,
274
  )
275
 
276
- @torch.jit.ignore
277
- def do_spec_augment(self, spec: torch.Tensor) -> torch.Tensor:
278
- if self.training:
279
- spec = self.spec_augment.forward(spec)
280
- return spec
281
-
282
  def forward(self, inputs: torch.Tensor):
283
  # x: [batch_size, spec_dim, seq_length]
284
  x = inputs
@@ -289,8 +277,6 @@ class WaveEncoder(nn.Module):
289
  x = x.log()
290
  x = x - torch.mean(x, dim=-1, keepdim=True)
291
 
292
- x = self.do_spec_augment(x)
293
-
294
  x = x.transpose(1, 2)
295
 
296
  features = self.spectrogram_encoder.forward(x)
 
9
 
10
  from toolbox.torchaudio.models.cnn_audio_classifier.configuration_cnn_audio_classifier import CnnAudioClassifierConfig
11
  from toolbox.torchaudio.configuration_utils import CONFIG_FILE
 
12
 
13
 
14
  MODEL_FILE = "model.pt"
 
240
  class WaveEncoder(nn.Module):
241
  def __init__(self,
242
  mel_spectrogram_param: dict,
 
243
  conv1d_block_param_list: List[dict] = None,
244
  conv2d_block_param_list: List[dict] = None,
245
  ):
 
262
  ),
263
  )
264
 
 
 
 
 
265
  self.spectrogram_encoder = SpectrogramEncoder(
266
  conv1d_block_param_list=conv1d_block_param_list,
267
  conv2d_block_param_list=conv2d_block_param_list,
268
  )
269
 
 
 
 
 
 
 
270
  def forward(self, inputs: torch.Tensor):
271
  # x: [batch_size, spec_dim, seq_length]
272
  x = inputs
 
277
  x = x.log()
278
  x = x - torch.mean(x, dim=-1, keepdim=True)
279
 
 
 
280
  x = x.transpose(1, 2)
281
 
282
  features = self.spectrogram_encoder.forward(x)