ramkamal2000 committed on
Commit
8313242
·
1 Parent(s): 02f4ff2

'normalization'

Browse files
Files changed (1) hide show
  1. app.py +15 -12
app.py CHANGED
@@ -107,7 +107,7 @@ print("Loading WavLM for content...")
107
  cmodel = utils.get_cmodel(device).to(device)
108
  # cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
109
 
110
- def voice_conversion_yourtts(da, ta):
111
 
112
  # write(target_audio, ta[0], ta[1])
113
  # write(driving_audio, da[0], da[1])
@@ -118,20 +118,20 @@ def voice_conversion_yourtts(da, ta):
118
 
119
  files = [da, ta]
120
 
121
- for file in files:
122
- subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
123
 
124
  # ta_ = read(target_audio)
125
 
126
- target_emb = SE_speaker_manager.compute_d_vector_from_clip([ta])
127
  target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
128
 
129
- driving_emb = SE_speaker_manager.compute_d_vector_from_clip([da])
130
  driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
131
 
132
  # Convert the voice
133
 
134
- driving_spec = compute_spec(da)
135
  y_lengths = torch.tensor([driving_spec.size(-1)])
136
  if USE_CUDA:
137
  ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
@@ -145,13 +145,16 @@ def voice_conversion_yourtts(da, ta):
145
 
146
  return (ap.sample_rate, ref_wav_voc)
147
 
148
- def voice_conversion_freevc(src, tgt):
 
149
  with torch.no_grad():
150
- wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
 
151
  wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
152
  g_tgt = smodel.embed_utterance(wav_tgt)
153
  g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
154
- wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
 
155
  wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
156
  # c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
157
  c = utils.get_content(cmodel, wav_src)
@@ -178,9 +181,9 @@ outputs_2 = gr.outputs.Audio(label="Target Speaker - Output Audio", type='filep
178
  def voice_conversion(mod, sa, ta):
179
 
180
  if mod=='FreeVC':
181
- return voice_conversion_yourtts(sa, ta)
182
- else:
183
  return voice_conversion_freevc(sa, ta)
 
 
184
 
185
  examples_1 = [['FreeVC', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav'], ['YourTTS', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav']]
186
 
@@ -199,5 +202,5 @@ vc_2 = gr.Interface(
199
  description="Use this cool tool to convert your voice to another person's! \n Upload files in wav format for the target speaker and record the voice of the input speaker using the microphone.\n \nThis demonstration is made by T B Ramkamal, for partial credit towards completion of my Dual Degree Project"
200
  )
201
 
202
- demo = gr.TabbedInterface([vc_1, vc_2], ["wav Input", "Microphone Input"], title="Voice Conversion")
203
  demo.launch(debug='True')
 
107
  cmodel = utils.get_cmodel(device).to(device)
108
  # cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
109
 
110
+ def voice_conversion_yourtts(da, ta, normalize=False):
111
 
112
  # write(target_audio, ta[0], ta[1])
113
  # write(driving_audio, da[0], da[1])
 
118
 
119
  files = [da, ta]
120
 
121
+ subprocess.run(["ffmpeg-normalize", da, "-nt", "rms", "-t=-27", "-o", "source_yourtts.wav", "-ar", "16000", "-f"])
122
+ subprocess.run(["ffmpeg-normalize", ta, "-nt", "rms", "-t=-27", "-o", "target_yourtts.wav", "-ar", "16000", "-f"])
123
 
124
  # ta_ = read(target_audio)
125
 
126
+ target_emb = SE_speaker_manager.compute_d_vector_from_clip(["target_yourtts.wav"])
127
  target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
128
 
129
+ driving_emb = SE_speaker_manager.compute_d_vector_from_clip(["source_yourtts.wav"])
130
  driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
131
 
132
  # Convert the voice
133
 
134
+ driving_spec = compute_spec("source_yourtts.wav")
135
  y_lengths = torch.tensor([driving_spec.size(-1)])
136
  if USE_CUDA:
137
  ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
 
145
 
146
  return (ap.sample_rate, ref_wav_voc)
147
 
148
+
149
+ def voice_conversion_freevc(src, tgt, normalize=False):
150
  with torch.no_grad():
151
+ subprocess.run(["ffmpeg-normalize", tgt, "-nt", "rms", "-t=-27", "-o", "target_fvc.wav", "-ar", "16000", "-f"])
152
+ wav_tgt, _ = librosa.load("target_fvc.wav", sr=hps.data.sampling_rate)
153
  wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
154
  g_tgt = smodel.embed_utterance(wav_tgt)
155
  g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
156
+ subprocess.run(["ffmpeg-normalize", src, "-nt", "rms", "-t=-27", "-o", "source_fvc.wav", "-ar", "16000", "-f"])
157
+ wav_src, _ = librosa.load("source_fvc.wav", sr=hps.data.sampling_rate)
158
  wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
159
  # c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
160
  c = utils.get_content(cmodel, wav_src)
 
181
  def voice_conversion(mod, sa, ta):
182
 
183
  if mod=='FreeVC':
 
 
184
  return voice_conversion_freevc(sa, ta)
185
+ else:
186
+ return voice_conversion_yourtts(sa, ta)
187
 
188
  examples_1 = [['FreeVC', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav'], ['YourTTS', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav']]
189
 
 
202
  description="Use this cool tool to convert your voice to another person's! \n Upload files in wav format for the target speaker and record the voice of the input speaker using the microphone.\n \nThis demonstration is made by T B Ramkamal, for partial credit towards completion of my Dual Degree Project"
203
  )
204
 
205
+ demo = gr.TabbedInterface([vc_1, vc_2], ["wav Input", "Microphone Input"], title="Voice Conversion Demo")
206
  demo.launch(debug='True')