Katock committed on
Commit
cf90f08
·
1 Parent(s): f8edc9e

Update infer_tool.py

Browse files
Files changed (1) hide show
  1. inference/infer_tool.py +5 -36
inference/infer_tool.py CHANGED
@@ -245,42 +245,11 @@ class Svc(object):
245
  with torch.no_grad():
246
  start = time.time()
247
  vol = None
248
- if not self.only_diffusion:
249
- vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
250
- audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
251
- audio = audio[0,0].data.float()
252
- audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
253
- else:
254
- audio = torch.FloatTensor(wav).to(self.dev)
255
- audio_mel = None
256
- if self.only_diffusion or self.shallow_diffusion:
257
- vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
258
- if self.shallow_diffusion and second_encoding:
259
- audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
260
- audio16k = torch.from_numpy(audio16k).to(self.dev)
261
- c = self.hubert_model.encoder(audio16k)
262
- c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
263
- f0 = f0[:,:,None]
264
- c = c.transpose(-1,-2)
265
- audio_mel = self.diffusion_model(
266
- c,
267
- f0,
268
- vol,
269
- spk_id = sid,
270
- spk_mix_dict = None,
271
- gt_spec=audio_mel,
272
- infer=True,
273
- infer_speedup=self.diffusion_args.infer.speedup,
274
- method=self.diffusion_args.infer.method,
275
- k_step=k_step)
276
- audio = self.vocoder.infer(audio_mel, f0).squeeze()
277
- if self.nsf_hifigan_enhance:
278
- audio, _ = self.enhancer.enhance(
279
- audio[None,:],
280
- self.target_sample,
281
- f0[:,:,None],
282
- self.hps_ms.data.hop_length,
283
- adaptive_key = enhancer_adaptive_key)
284
  if loudness_envelope_adjustment != 1:
285
  audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
286
  use_time = time.time() - start
 
245
  with torch.no_grad():
246
  start = time.time()
247
  vol = None
248
+ vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
249
+ audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
250
+ audio = audio[0,0].data.float()
251
+ audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
252
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  if loudness_envelope_adjustment != 1:
254
  audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
255
  use_time = time.time() - start