ramkamal2000 committed
Commit · 8313242
1 Parent(s): 02f4ff2
'normalization'
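The one-word commit message refers to loudness normalization: before any speaker embeddings or spectrograms are computed, both the source and target clips are now RMS-normalized to -27 dB and resampled to 16 kHz with the ffmpeg-normalize CLI. A minimal sketch of that step, using placeholder file names ("input.wav", "normalized.wav") rather than the per-model paths used in app.py, and assuming ffmpeg-normalize is installed and on PATH:

import subprocess

# RMS-normalize to -27 dB and resample to 16 kHz, overwriting any existing output.
subprocess.run([
    "ffmpeg-normalize", "input.wav",   # placeholder input path
    "-nt", "rms",                      # RMS-based normalization
    "-t=-27",                          # target level of -27 dB
    "-o", "normalized.wav",            # placeholder output path read back by the app
    "-ar", "16000",                    # resample to 16 kHz
    "-f",                              # force overwrite
])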
app.py CHANGED
@@ -107,7 +107,7 @@ print("Loading WavLM for content...")
 cmodel = utils.get_cmodel(device).to(device)
 # cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)

-def voice_conversion_yourtts(da, ta):
+def voice_conversion_yourtts(da, ta, normalize=False):

     # write(target_audio, ta[0], ta[1])
     # write(driving_audio, da[0], da[1])
@@ -118,20 +118,20 @@ def voice_conversion_yourtts(da, ta):

     files = [da, ta]

-
-
+    subprocess.run(["ffmpeg-normalize", da, "-nt", "rms", "-t=-27", "-o", "source_yourtts.wav", "-ar", "16000", "-f"])
+    subprocess.run(["ffmpeg-normalize", ta, "-nt", "rms", "-t=-27", "-o", "target_yourtts.wav", "-ar", "16000", "-f"])

     # ta_ = read(target_audio)

-    target_emb = SE_speaker_manager.compute_d_vector_from_clip([
+    target_emb = SE_speaker_manager.compute_d_vector_from_clip(["target_yourtts.wav"])
     target_emb = torch.FloatTensor(target_emb).unsqueeze(0)

-    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([
+    driving_emb = SE_speaker_manager.compute_d_vector_from_clip(["source_yourtts.wav"])
     driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)

     # Convert the voice

-    driving_spec = compute_spec(
+    driving_spec = compute_spec("source_yourtts.wav")
     y_lengths = torch.tensor([driving_spec.size(-1)])
     if USE_CUDA:
         ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
@@ -145,13 +145,16 @@ def voice_conversion_yourtts(da, ta):

     return (ap.sample_rate, ref_wav_voc)

-
+
+def voice_conversion_freevc(src, tgt, normalize=False):
     with torch.no_grad():
-
+        subprocess.run(["ffmpeg-normalize", tgt, "-nt", "rms", "-t=-27", "-o", "target_fvc.wav", "-ar", "16000", "-f"])
+        wav_tgt, _ = librosa.load("target_fvc.wav", sr=hps.data.sampling_rate)
         wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
         g_tgt = smodel.embed_utterance(wav_tgt)
         g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
-
+        subprocess.run(["ffmpeg-normalize", src, "-nt", "rms", "-t=-27", "-o", "source_fvc.wav", "-ar", "16000", "-f"])
+        wav_src, _ = librosa.load("source_fvc.wav", sr=hps.data.sampling_rate)
         wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
         # c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
         c = utils.get_content(cmodel, wav_src)
@@ -178,9 +181,9 @@ outputs_2 = gr.outputs.Audio(label="Target Speaker - Output Audio", type='filep
 def voice_conversion(mod, sa, ta):

     if mod=='FreeVC':
-        return voice_conversion_yourtts(sa, ta)
-    else:
         return voice_conversion_freevc(sa, ta)
+    else:
+        return voice_conversion_yourtts(sa, ta)

 examples_1 = [['FreeVC', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav'], ['YourTTS', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav']]

@@ -199,5 +202,5 @@ vc_2 = gr.Interface(
     description="Use this cool tool to convert your voice to another person's! \n Upload files in wav format for the target speaker and record the voice of the input speaker using the microphone.\n \nThis demonstration is made by T B Ramkamal, for partial credit towards completion of my Dual Degree Project"
 )

-demo = gr.TabbedInterface([vc_1, vc_2], ["wav Input", "Microphone Input"], title="Voice Conversion")
+demo = gr.TabbedInterface([vc_1, vc_2], ["wav Input", "Microphone Input"], title="Voice Conversion Demo")
 demo.launch(debug='True')
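Not part of the commit, but a quick way to sanity-check one of the normalized intermediates the new code writes (for example source_yourtts.wav): load it back and confirm its sample rate and overall RMS level sit near the requested 16 kHz / -27 dB target.

import numpy as np
import librosa

# Load a normalized intermediate at its native rate and measure its RMS level in dBFS.
wav, sr = librosa.load("source_yourtts.wav", sr=None)
rms_db = 20 * np.log10(np.sqrt(np.mean(wav ** 2)) + 1e-12)
print(f"sample rate: {sr} Hz, RMS: {rms_db:.1f} dBFS")  # expect ~16000 Hz and roughly -27 dBFS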
|