Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
 import numpy as np
 import soundfile as sf
-import noisereduce as nr
 import spaces
 import torch
 import torchaudio
@@ -13,7 +12,6 @@ import os
 from huggingface_hub import hf_hub_download
 from transformers import AutoFeatureExtractor, WhisperModel
 from torch.nn.utils import parametrizations
-from scipy.signal import butter, filtfilt
 
 from modules.commons import build_model, load_checkpoint, recursive_munch
 from modules.campplus.DTDNN import CAMPPlus
@@ -183,7 +181,7 @@ footer {
 
 @torch.no_grad()
 @torch.inference_mode()
-def voice_conversion(input, reference, steps, guidance, speed):
+def voice_conversion(input, reference, steps, guidance, pitch, speed):
     print("[INFO] | Voice conversion started.")
 
     inference_module, mel_fn, bigvgan_fn = model, to_mel, bigvgan_model
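Side note on the stacked decorators kept here: torch.inference_mode() already disables gradient tracking, so combining it with torch.no_grad() is redundant (though harmless). A minimal sketch of the stricter decorator used on its own:

import torch

@torch.inference_mode()  # no autograd graph, no tensor version tracking
def double(x: torch.Tensor) -> torch.Tensor:
    return x * 2

print(double(torch.ones(3)))  # tensor([2., 2., 2.])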
@@ -317,6 +315,17 @@ def voice_conversion(input, reference, steps, guidance, speed):
     # Concatenate all generated wave chunks
     final_audio = np.concatenate(generated_wave_chunks).astype(np.float32)
 
+    # Pitch Shifting using librosa
+    print("[INFO] | Applying pitch shifting.")
+    try:
+        if pitch != 0:
+            final_audio = librosa.effects.pitch_shift(final_audio, sr=sr_current, n_steps=pitch)
+            print(f"[INFO] | Pitch shifted by {pitch} semitones.")
+        else:
+            print("[INFO] | No pitch shift applied.")
+    except Exception as e:
+        print(f"[ERROR] | Pitch shifting failed: {e}")
+
     # Normalize the audio to ensure it's within [-1.0, 1.0]
     max_val = np.max(np.abs(final_audio))
     if max_val > 1.0:
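The added block assumes librosa is importable and that sr_current holds the sample rate of final_audio; neither is introduced by this diff, so both presumably exist elsewhere in app.py. A minimal, self-contained sketch of the same technique, using a synthetic tone and an assumed 44.1 kHz sample rate:

import numpy as np
import librosa

sr = 44100  # assumed sample rate for this sketch
t = np.linspace(0, 1.0, sr, endpoint=False)
audio = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)  # 1 s, 440 Hz tone

# Shift up by two semitones; n_steps may be fractional, matching the 0.1-step slider
shifted = librosa.effects.pitch_shift(audio, sr=sr, n_steps=2.0)

# Peak-normalize back into [-1.0, 1.0], mirroring the normalization step above
peak = np.max(np.abs(shifted))
if peak > 1.0:
    shifted = shifted / peak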
@@ -353,8 +362,9 @@ with gr.Blocks(css=css) as main:
             reference_input = gr.Audio(label="Reference Audio", type="filepath")
 
         with gr.Column():
-            steps = gr.Slider(label="Steps", value=
+            steps = gr.Slider(label="Steps", value=4, minimum=1, maximum=100, step=1)
             guidance = gr.Slider(label="Guidance", value=0.7, minimum=0.0, maximum=1.0, step=0.1)
+            pitch = gr.Slider(label="Pitch", value=0.0, minimum=-12.0, maximum=12.0, step=0.1)
             speed = gr.Slider(label="Speed", value=1.0, minimum=0.5, maximum=2.0, step=0.1)
 
         with gr.Column():
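The new Pitch slider spans -12.0 to +12.0, i.e. up to one octave down or up in 0.1-semitone increments, and Gradio delivers the value to the handler as a float. A hypothetical stand-alone sketch of that wiring pattern (names and layout are illustrative, not the app's):

import gradio as gr

def describe(pitch: float) -> str:
    return f"Pitch shift: {pitch:+.1f} semitones"

with gr.Blocks() as demo:
    pitch = gr.Slider(label="Pitch", value=0.0, minimum=-12.0, maximum=12.0, step=0.1)
    result = gr.Textbox(label="Result")
    pitch.change(describe, inputs=[pitch], outputs=result)

demo.launch()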
@@ -364,7 +374,7 @@ with gr.Blocks(css=css) as main:
         with gr.Column():
             output = gr.Audio(label="Output", type="filepath")
 
-    submit.click(voice_conversion, inputs=[input, reference_input, steps, guidance, speed], outputs=output, queue=False)
+    submit.click(voice_conversion, inputs=[input, reference_input, steps, guidance, pitch, speed], outputs=output, queue=False)
     maintain.click(cloud, inputs=[], outputs=[], queue=False)
 
 main.launch(show_api=True)
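Because main.launch(show_api=True) keeps the API docs exposed, the updated six-argument handler can also be called programmatically. A sketch using gradio_client, where the Space ID, file names, and api_name are placeholders (the real endpoint name depends on how Gradio registered voice_conversion):

from gradio_client import Client, handle_file

client = Client("user/space-name")  # hypothetical Space ID
result = client.predict(
    handle_file("input.wav"),      # input audio
    handle_file("reference.wav"),  # reference audio
    4,                             # steps
    0.7,                           # guidance
    2.0,                           # pitch, in semitones
    1.0,                           # speed
    api_name="/voice_conversion",  # assumed endpoint name
)
print(result)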