alibabasglab committed on
Commit
503168e
·
verified ·
1 Parent(s): 330c178

Update utils/decode.py

Browse files
Files changed (1) hide show
  1. utils/decode.py +9 -1
utils/decode.py CHANGED
@@ -188,6 +188,9 @@ def decode_one_audio_frcrn_se_16k(model, device, inputs, args):
188
  # If no segmentation is required, process the entire input
189
  outputs = model.inference(inputs).detach().cpu().numpy() # Inference on full input
190
 
 
 
 
191
  return outputs # Return the decoded audio output
192
 
193
  def decode_one_audio_mossformergan_se_16k(model, device, inputs, args):
@@ -439,7 +442,12 @@ def decode_one_audio_mossformer2_se_48k(model, device, inputs, args):
439
  # Reconstruct audio from the masked spectrogram
440
  outputs = istft(masked_spec_complex, args, len(audio))
441
 
442
- return outputs.numpy() / MAX_WAV_VALUE # Return the output normalized to [-1, 1]
 
 
 
 
 
443
 
444
  def decode_one_audio_AV_MossFormer2_TSE_16K(model, inputs, args):
445
  """Processes video inputs through the AV mossformer2 model with Target speaker extraction (TSE) for decoding at 16kHz.
 
188
  # If no segmentation is required, process the entire input
189
  outputs = model.inference(inputs).detach().cpu().numpy() # Inference on full input
190
 
191
+ #normalize outputs
192
+ max_abs = max(float(abs(outputs).max()), 1e-6) # Peak absolute value, floored to avoid division by zero
193
+ outputs = outputs / max_abs
194
  return outputs # Return the decoded audio output
195
 
196
  def decode_one_audio_mossformergan_se_16k(model, device, inputs, args):
 
442
  # Reconstruct audio from the masked spectrogram
443
  outputs = istft(masked_spec_complex, args, len(audio))
444
 
445
+ outputs = outputs.numpy() / MAX_WAV_VALUE # Scale the output to [-1, 1]
446
+ #normalize outputs
447
+ max_abs = max(float(abs(outputs).max()), 1e-6) # Peak absolute value, floored to avoid division by zero
448
+ outputs = outputs / max_abs
449
+
450
+ return outputs
451
 
452
  def decode_one_audio_AV_MossFormer2_TSE_16K(model, inputs, args):
453
  """Processes video inputs through the AV mossformer2 model with Target speaker extraction (TSE) for decoding at 16kHz.