Spaces: Running on Zero
Staticaliza committed: Update app.py
app.py
CHANGED
@@ -182,7 +182,7 @@ footer {
 
 @torch.no_grad()
 @torch.inference_mode()
-def voice_conversion(input, reference, steps, guidance, pitch, speed):
+def voice_conversion(input, reference, steps, guidance, speed, pitch):
     print("[INFO] | Voice conversion started.")
 
     inference_module, mel_fn, bigvgan_fn = model, to_mel, bigvgan_model
@@ -203,15 +203,29 @@ def voice_conversion(input, reference, steps, guidance, pitch, speed):
     ref_audio_tensor = torch.tensor(ref_audio).unsqueeze(0).float().to(device)
 
     # Resample to 16kHz
-    ref_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate)
-    converted_waves_16k = torchaudio.functional.resample(source_audio_tensor, sr_current, sampling_rate)
+    ref_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate).to(device)
+    converted_waves_16k = torchaudio.functional.resample(source_audio_tensor, sr_current, sampling_rate).to(device)
 
-    # Generate Whisper features
+    # Generate Whisper features for source audio
     print("[INFO] | Generating Whisper features for source audio.")
     if converted_waves_16k.size(-1) <= sampling_rate * 30:
-        alt_inputs = whisper_feature_extractor(
+        alt_inputs = whisper_feature_extractor(
+            [converted_waves_16k.squeeze(0).cpu().numpy()],
+            return_tensors="pt",
+            return_attention_mask=True,
+            sampling_rate=sampling_rate
+        )
+        alt_input_features = whisper_model._mask_input_features(
+            alt_inputs.input_features,
+            attention_mask=alt_inputs.attention_mask
+        ).to(device)
+        alt_outputs = whisper_model.encoder(
+            alt_input_features.to(torch.float32),
+            head_mask=None,
+            output_attentions=False,
+            output_hidden_states=False,
+            return_dict=True
+        )
         S_alt = alt_outputs.last_hidden_state.to(torch.float32)
         S_alt = S_alt[:, :converted_waves_16k.size(-1) // 320 + 1]
         print(f"[INFO] | S_alt shape: {S_alt.shape}")
@@ -227,13 +241,29 @@ def voice_conversion(input, reference, steps, guidance, pitch, speed):
         total_length = converted_waves_16k.size(-1)
 
         while traversed_time < total_length:
+            end_time = traversed_time + chunk_size
+            if end_time > total_length:
+                end_time = total_length
+            chunk = converted_waves_16k[:, traversed_time:end_time]
+            if buffer is not None:
+                chunk = torch.cat([buffer, chunk], dim=-1)
+            alt_inputs = whisper_feature_extractor(
+                [chunk.squeeze(0).cpu().numpy()],
+                return_tensors="pt",
+                return_attention_mask=True,
+                sampling_rate=sampling_rate
+            )
+            alt_input_features = whisper_model._mask_input_features(
+                alt_inputs.input_features,
+                attention_mask=alt_inputs.attention_mask
+            ).to(device)
+            alt_outputs = whisper_model.encoder(
+                alt_input_features.to(torch.float32),
+                head_mask=None,
+                output_attentions=False,
+                output_hidden_states=False,
+                return_dict=True
+            )
             S_chunk = alt_outputs.last_hidden_state.to(torch.float32)
             S_chunk = S_chunk[:, :chunk.size(-1) // 320 + 1]
             print(f"[INFO] | Processed chunk with S_chunk shape: {S_chunk.shape}")
@@ -250,12 +280,26 @@ def voice_conversion(input, reference, steps, guidance, pitch, speed):
         S_alt = torch.cat(S_alt_list, dim=1)
         print(f"[INFO] | Final S_alt shape after chunk processing: {S_alt.shape}")
 
-    #
+    # Generate Whisper features for reference audio
     print("[INFO] | Generating Whisper features for reference audio.")
-    ori_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate)
-    ori_inputs = whisper_feature_extractor(
+    ori_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate).to(device)
+    ori_inputs = whisper_feature_extractor(
+        [ori_waves_16k.squeeze(0).cpu().numpy()],
+        return_tensors="pt",
+        return_attention_mask=True,
+        sampling_rate=sampling_rate
+    )
+    ori_input_features = whisper_model._mask_input_features(
+        ori_inputs.input_features,
+        attention_mask=ori_inputs.attention_mask
+    ).to(device)
+    ori_outputs = whisper_model.encoder(
+        ori_input_features.to(torch.float32),
+        head_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True
+    )
     S_ori = ori_outputs.last_hidden_state.to(torch.float32)
     S_ori = S_ori[:, :ori_waves_16k.size(-1) // 320 + 1]
     print(f"[INFO] | S_ori shape: {S_ori.shape}")
@@ -267,21 +311,30 @@ def voice_conversion(input, reference, steps, guidance, pitch, speed):
     print(f"[INFO] | Mel spectrogram shapes: mel={mel.shape}, mel2={mel2.shape}")
 
     # Length adjustment
-    target_lengths = torch.LongTensor([int(mel.size(2)
+    target_lengths = torch.LongTensor([int(mel.size(2) * speed)]).to(mel.device)
     target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
     print(f"[INFO] | Target lengths: {target_lengths.item()}, {target2_lengths.item()}")
 
     # Extract style features
     print("[INFO] | Extracting style features from reference audio.")
-    feat2 = torchaudio.compliance.kaldi.fbank(
+    feat2 = torchaudio.compliance.kaldi.fbank(
+        ref_waves_16k,
+        num_mel_bins=80,
+        dither=0,
+        sample_frequency=sampling_rate
+    )
     feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
-    style2 = campplus_model(feat2.unsqueeze(0))
+    style2 = campplus_model(feat2.unsqueeze(0)).to(device)
     print(f"[INFO] | Style2 shape: {style2.shape}")
 
     # Length Regulation
     print("[INFO] | Applying length regulation.")
-    cond, _, _, _, _ = inference_module.length_regulator(
+    cond, _, _, _, _ = inference_module.length_regulator(
+        S_alt, ylens=target_lengths, n_quantizers=3, f0=None
+    )
+    prompt_condition, _, _, _, _ = inference_module.length_regulator(
+        S_ori, ylens=target2_lengths, n_quantizers=3, f0=None
+    )
     print(f"[INFO] | Cond shape: {cond.shape}, Prompt condition shape: {prompt_condition.shape}")
 
     # Initialize variables for audio generation
@@ -297,56 +350,104 @@ def voice_conversion(input, reference, steps, guidance, pitch, speed):
         cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
 
         # Perform inference
-        vc_target = inference_module.cfm.inference(
+        vc_target = inference_module.cfm.inference(
+            cat_condition,
+            torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
+            mel2,
+            style2,
+            None,
+            steps,
+            inference_cfg_rate=guidance
+        )
         vc_target = vc_target[:, :, mel2.size(2):]
         print(f"[INFO] | vc_target shape: {vc_target.shape}")
-        # TEMP
-        output_wave = vc_target[0].cpu().numpy()
-        generated_wave_chunks.append(output_wave)
 
         # Generate waveform using BigVGAN
-        """
         vc_wave = bigvgan_fn(vc_target.float())[0]
         print(f"[INFO] | vc_wave shape: {vc_wave.shape}")
 
         # Handle the generated waveform
-        output_wave = vc_wave
+        output_wave = vc_wave.squeeze(0).cpu().numpy()
         generated_wave_chunks.append(output_wave)
-        """
 
         # Ensure processed_frames increments correctly to avoid infinite loop
         processed_frames += vc_target.size(2)
         print(f"[INFO] | Processed frames updated to: {processed_frames}")
 
     # Concatenate all generated wave chunks
     final_audio = np.concatenate(generated_wave_chunks).astype(np.float32)
 
+    # Normalize the audio to ensure it's within [-1.0, 1.0]
+    max_val = np.max(np.abs(final_audio))
+    if max_val > 1.0:
+        final_audio = final_audio / max_val
+        print("[INFO] | Final audio normalized.")
+
+    # ----------------------------
+    # Audio Processing: Noise Reduction and Pitch Shifting
+    # ----------------------------
+
+    # Noise Reduction using noisereduce
+    print("[INFO] | Applying noise reduction.")
+    try:
+        # Option 1: Using a Noise Sample (first 0.5 seconds)
+        noise_duration = 0.5  # seconds
+        noise_sample = final_audio[:int(noise_duration * sr_current)]
+        final_audio = nr.reduce_noise(
+            y=final_audio,
+            sr=sr_current,
+            y_noise=noise_sample,
+            prop_decrease=1.0
+        )
+        print("[INFO] | Noise reduction applied using a noise sample.")
+    except Exception as e:
+        print(f"[ERROR] | Noise reduction with noise sample failed: {e}")
+        # Option 2: Automatic Noise Estimation
+        try:
+            final_audio = nr.reduce_noise(
+                y=final_audio,
+                sr=sr_current,
+                stationary=False
+            )
+            print("[INFO] | Noise reduction applied with automatic noise estimation.")
+        except Exception as e:
+            print(f"[ERROR] | Noise reduction with automatic estimation failed: {e}")
+
     # Pitch Shifting using librosa
     print("[INFO] | Applying pitch shifting.")
     try:
         if pitch != 0:
-            final_audio = librosa.effects.pitch_shift(
+            final_audio = librosa.effects.pitch_shift(
+                final_audio,
+                sr=sr_current,
+                n_steps=pitch
+            )
             print(f"[INFO] | Pitch shifted by {pitch} semitones.")
         else:
             print("[INFO] | No pitch shift applied.")
     except Exception as e:
         print(f"[ERROR] | Pitch shifting failed: {e}")
-    #
+
+    # Optional: Further Normalization after Pitch Shifting
     max_val = np.max(np.abs(final_audio))
     if max_val > 1.0:
         final_audio = final_audio / max_val
+        print("[INFO] | Final audio normalized after pitch shifting.")
+
+    # ----------------------------
+    # Save the Audio
+    # ----------------------------
+
     # Save the audio to a temporary WAV file
     print("[INFO] | Saving final audio to a temporary WAV file.")
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            sf.write(tmp_file.name, final_audio, sr_current, format='WAV')
+            temp_file_path = tmp_file.name
+            print(f"[INFO] | Final audio saved to {temp_file_path}")
+    except Exception as e:
+        print(f"[ERROR] | Saving audio failed: {e}")
+        return None
 
     return temp_file_path
 
|
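For context, a minimal sketch of how the updated voice_conversion signature (input, reference, steps, guidance, speed, pitch) might be wired into a Gradio interface. The UI code is not part of this diff, so the component choices, labels, default values, and ranges below are assumptions rather than the Space's actual layout.

# Hypothetical wiring sketch -- not from this commit. It only illustrates the
# argument order of the updated voice_conversion(input, reference, steps,
# guidance, speed, pitch); component ranges and defaults are assumptions.
import gradio as gr

demo = gr.Interface(
    fn=voice_conversion,  # defined in app.py as shown in the diff above
    inputs=[
        gr.Audio(type="filepath", label="Source Audio"),
        gr.Audio(type="filepath", label="Reference Audio"),
        gr.Slider(1, 100, value=25, step=1, label="Diffusion Steps"),
        gr.Slider(0.0, 2.0, value=0.7, step=0.1, label="Guidance (CFG rate)"),
        gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed"),
        gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift (semitones)"),
    ],
    outputs=gr.Audio(type="filepath", label="Converted Audio"),
)

if __name__ == "__main__":
    demo.launch()

Since voice_conversion returns the path of a temporary WAV file (or None if saving fails), an Audio output with type="filepath" matches its return value.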