Spaces:

alibabasglab
/

ClearVoice

Running on Zero

alibabasglab committed on Oct 22, 2024

Commit

503168e

verified ·

1 Parent(s): 330c178

Update utils/decode.py

Files changed (1) hide show

utils/decode.py CHANGED Viewed

@@ -188,6 +188,9 @@ def decode_one_audio_frcrn_se_16k(model, device, inputs, args):
         # If no segmentation is required, process the entire input
         outputs = model.inference(inputs).detach().cpu().numpy()  # Inference on full input
     return outputs  # Return the decoded audio output
 def decode_one_audio_mossformergan_se_16k(model, device, inputs, args):
@@ -439,7 +442,12 @@ def decode_one_audio_mossformer2_se_48k(model, device, inputs, args):
         # Reconstruct audio from the masked spectrogram
         outputs = istft(masked_spec_complex, args, len(audio))
-    return outputs.numpy() / MAX_WAV_VALUE  # Return the output normalized to [-1, 1]
 def decode_one_audio_AV_MossFormer2_TSE_16K(model, inputs, args):
     """Processes video inputs through the AV mossformer2 model with Target speaker extraction (TSE) for decoding at 16kHz.

         # If no segmentation is required, process the entire input
         outputs = model.inference(inputs).detach().cpu().numpy()  # Inference on full input
+    #normalize outputs
+    max_abs = max(abs(outputs), 1e-6)
+    outputs = outputs / max_abs
     return outputs  # Return the decoded audio output
 def decode_one_audio_mossformergan_se_16k(model, device, inputs, args):
         # Reconstruct audio from the masked spectrogram
         outputs = istft(masked_spec_complex, args, len(audio))
+    outpus = outputs.numpy() / MAX_WAV_VALUE  # Return the output normalized to [-1, 1]
+    #normalize outputs
+    max_abs = max(abs(outputs), 1e-6)
+    outputs = outputs / max_abs
+    return outputs
 def decode_one_audio_AV_MossFormer2_TSE_16K(model, inputs, args):
     """Processes video inputs through the AV mossformer2 model with Target speaker extraction (TSE) for decoding at 16kHz.