Spaces:
Running
Running
Katock
committed on
Commit
·
cf90f08
1
Parent(s):
f8edc9e
Update infer_tool.py
Browse files
- inference/infer_tool.py +5 -36
inference/infer_tool.py
CHANGED
@@ -245,42 +245,11 @@ class Svc(object):
|
|
245 |
with torch.no_grad():
|
246 |
start = time.time()
|
247 |
vol = None
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
else:
|
254 |
-
audio = torch.FloatTensor(wav).to(self.dev)
|
255 |
-
audio_mel = None
|
256 |
-
if self.only_diffusion or self.shallow_diffusion:
|
257 |
-
vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
|
258 |
-
if self.shallow_diffusion and second_encoding:
|
259 |
-
audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
|
260 |
-
audio16k = torch.from_numpy(audio16k).to(self.dev)
|
261 |
-
c = self.hubert_model.encoder(audio16k)
|
262 |
-
c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
|
263 |
-
f0 = f0[:,:,None]
|
264 |
-
c = c.transpose(-1,-2)
|
265 |
-
audio_mel = self.diffusion_model(
|
266 |
-
c,
|
267 |
-
f0,
|
268 |
-
vol,
|
269 |
-
spk_id = sid,
|
270 |
-
spk_mix_dict = None,
|
271 |
-
gt_spec=audio_mel,
|
272 |
-
infer=True,
|
273 |
-
infer_speedup=self.diffusion_args.infer.speedup,
|
274 |
-
method=self.diffusion_args.infer.method,
|
275 |
-
k_step=k_step)
|
276 |
-
audio = self.vocoder.infer(audio_mel, f0).squeeze()
|
277 |
-
if self.nsf_hifigan_enhance:
|
278 |
-
audio, _ = self.enhancer.enhance(
|
279 |
-
audio[None,:],
|
280 |
-
self.target_sample,
|
281 |
-
f0[:,:,None],
|
282 |
-
self.hps_ms.data.hop_length,
|
283 |
-
adaptive_key = enhancer_adaptive_key)
|
284 |
if loudness_envelope_adjustment != 1:
|
285 |
audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
|
286 |
use_time = time.time() - start
|
|
|
245 |
with torch.no_grad():
|
246 |
start = time.time()
|
247 |
vol = None
|
248 |
+
vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
|
249 |
+
audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
|
250 |
+
audio = audio[0,0].data.float()
|
251 |
+
audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
|
252 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
if loudness_envelope_adjustment != 1:
|
254 |
audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
|
255 |
use_time = time.time() - start
|