Spaces:

GroveStreet
/

GTA_SOVITS

Running

App Files Files Community

Katock committed on Jul 29, 2023

Commit

c7182d9

1 Parent(s): 3d4653f

debug

Browse files

Files changed (2) hide show

app.py +23 -10
inference/infer_tool.py +124 -107

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ import librosa
 import numpy as np
 import soundfile
 from inference.infer_tool import Svc
 logging.getLogger('numba').setLevel(logging.WARNING)
@@ -31,7 +32,7 @@ def audio_postprocess(self, y):
 gr.Audio.postprocess = audio_postprocess
-def create_vc_fn(model, sid):
     def vc_fn(input_audio, vc_transform, auto_f0):
         if input_audio is None:
             return "请先上传音频", None
@@ -39,17 +40,29 @@ def create_vc_fn(model, sid):
         duration = audio.shape[0] / sampling_rate
         if duration > 20 and limitation:
             return "请上传小于20秒的音频，或点击右上角裁剪", None
-        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-        if len(audio.shape) > 1:
-            audio = librosa.to_mono(audio.transpose(1, 0))
-        if sampling_rate != 16000:
-            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
         raw_path = io.BytesIO()
-        soundfile.write(raw_path, audio, 16000, format="wav")
         raw_path.seek(0)
-        out_audio, out_sr = model.infer(sid, vc_transform, raw_path,
-                                        auto_predict_f0=auto_f0,
-                                        )
         return "Success", (44100, out_audio.cpu().numpy())
     return vc_fn

 import numpy as np
 import soundfile
+from inference import infer_tool
 from inference.infer_tool import Svc
 logging.getLogger('numba').setLevel(logging.WARNING)
 gr.Audio.postprocess = audio_postprocess
+def create_vc_fn(model, spk):
     def vc_fn(input_audio, vc_transform, auto_f0):
         if input_audio is None:
             return "请先上传音频", None
         duration = audio.shape[0] / sampling_rate
         if duration > 20 and limitation:
             return "请上传小于20秒的音频，或点击右上角裁剪", None
+        # audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        # if len(audio.shape) > 1:
+        #     audio = librosa.to_mono(audio.transpose(1, 0))
+        # if sampling_rate != 16000:
+        #     audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
         raw_path = io.BytesIO()
+        soundfile.write(raw_path, audio, sampling_rate, format="wav")
         raw_path.seek(0)
+        if "." not in raw_path:
+            raw_path += ".wav"
+        print("path: ", raw_path)
+        infer_tool.format_wav(raw_path)
+        # out_audio, out_sr, _ = model.infer(spk, vc_transform, raw_path,
+        #                                    auto_predict_f0=auto_f0,
+        #                                    )
+        out_audio = model.slice_inference(raw_audio_path=raw_path,
+                                          spk=spk,
+                                          tran=vc_transform,
+                                          slice_db=-40,
+                                          cluster_infer_ratio=0,
+                                          auto_predict_f0=auto_f0,
+                                          noice_scale=0.4)
         return "Success", (44100, out_audio.cpu().numpy())
     return vc_fn

inference/infer_tool.py CHANGED Viewed

@@ -85,16 +85,19 @@ def get_end_file(dir_path, end):
 def get_md5(content):
     """Return the hexadecimal MD5 digest of ``content``.
     ``content`` is fed to ``hashlib`` as-is, so it must be bytes-like.
     """
     hasher = hashlib.new("md5")
     hasher.update(content)
     return hasher.hexdigest()
 def fill_a_to_b(a, b):
     """Pad list ``a`` in place with its first item until it is as long as ``b``.
     Does nothing when ``a`` is already at least as long as ``b``.
     """
     shortfall = len(b) - len(a)
     if shortfall > 0:
         # a[0] never changes while appending, so one extend matches the loop.
         a.extend([a[0]] * shortfall)
 def mkdir(paths: list):
     """Create every entry of ``paths`` that does not exist yet.
     Each path is checked right before creation and made with ``os.mkdir``.
     """
     for target in paths:
         if os.path.exists(target):
             # Already present (file or directory): leave it alone.
             continue
         os.mkdir(target)
 def pad_array(arr, target_length):
     current_length = arr.shape[0]
     if current_length >= target_length:
@@ -105,26 +108,28 @@ def pad_array(arr, target_length):
         pad_right = pad_width - pad_left
         padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
         return padded_arr
 def split_list_by_n(list_collection, n, pre=0):
     for i in range(0, len(list_collection), n):
-        yield list_collection[i-pre if i-pre>=0 else i: i + n]
 class F0FilterException(Exception):
     """Raised when F0 filtering is on and the extracted F0 sums to zero."""
 class Svc(object):
     def __init__(self, net_g_path, config_path,
                  device=None,
                  cluster_model_path="logs/44k/kmeans_10000.pt",
-                 nsf_hifigan_enhance = False,
                  diffusion_model_path="logs/44k/diffusion/model_0.pt",
                  diffusion_config_path="configs/diffusion.yaml",
-                 shallow_diffusion = False,
-                 only_diffusion = False,
-                 spk_mix_enable = False,
-                 feature_retrieval = False
                  ):
         self.net_g_path = net_g_path
         self.only_diffusion = only_diffusion
@@ -152,7 +157,9 @@ class Svc(object):
         self.nsf_hifigan_enhance = nsf_hifigan_enhance
         if self.shallow_diffusion or self.only_diffusion:
             if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
-                self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
                 if self.only_diffusion:
                     self.target_sample = self.diffusion_args.data.sampling_rate
                     self.hop_size = self.diffusion_args.data.block_size
@@ -163,32 +170,32 @@ class Svc(object):
             else:
                 print("No diffusion model or config found. Shallow diffusion mode will False")
                 self.shallow_diffusion = self.only_diffusion = False
         # load hubert and model
         if not self.only_diffusion:
             self.load_model(spk_mix_enable)
-            self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
             self.volume_extractor = utils.Volume_Extractor(self.hop_size)
         else:
-            self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
             self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
         if os.path.exists(cluster_model_path):
             if self.feature_retrieval:
-                with open(cluster_model_path,"rb") as f:
                     self.cluster_model = pickle.load(f)
                 self.big_npy = None
                 self.now_spk_id = -1
             else:
                 self.cluster_model = cluster.get_cluster_model(cluster_model_path)
         else:
-            self.feature_retrieval=False
-        if self.shallow_diffusion : self.nsf_hifigan_enhance = False
         if self.nsf_hifigan_enhance:
             from modules.enhancer import Enhancer
-            self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
     def load_model(self, spk_mix_enable=False):
         # get model configuration
         self.net_g_ms = SynthesizerTrn(
@@ -203,10 +210,12 @@ class Svc(object):
         if spk_mix_enable:
             self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
-    def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
-        f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
         f0, uv = f0_predictor_object.compute_f0_uv(wav)
         if f0_filter and sum(f0) == 0:
             raise F0FilterException("No voice detected")
@@ -222,7 +231,7 @@ class Svc(object):
         c = self.hubert_model.encoder(wav16k)
         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
-        if cluster_infer_ratio !=0:
             if self.feature_retrieval:
                 speaker_id = self.spk2id.get(speaker)
                 if speaker_id is None:
@@ -231,17 +240,17 @@ class Svc(object):
                     if len(self.spk2id.__dict__) >= speaker:
                         speaker_id = speaker
                 feature_index = self.cluster_model[speaker_id]
-                feat_np = c.transpose(0,1).cpu().numpy()
                 if self.big_npy is None or self.now_spk_id != speaker_id:
-                   self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
-                   self.now_spk_id = speaker_id
                 print("starting feature retrieval...")
                 score, ix = feature_index.search(feat_np, k=8)
                 weight = np.square(1 / score)
                 weight /= weight.sum(axis=1, keepdims=True)
                 npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
                 c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
-                c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
                 print("end feature retrieval...")
             else:
                 cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
@@ -257,19 +266,19 @@ class Svc(object):
               noice_scale=0.4,
               f0_filter=False,
               f0_predictor='pm',
-              enhancer_adaptive_key = 0,
-              cr_threshold = 0.05,
-              k_step = 100,
-              frame = 0,
-              spk_mix = False,
-              second_encoding = False,
-              loudness_envelope_adjustment = 1
               ):
         wav, sr = librosa.load(raw_path, sr=self.target_sample)
         if spk_mix:
-            c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
             n_frames = f0.size(1)
-            sid = speaker[:, frame:frame+n_frames].transpose(0,1)
         else:
             speaker_id = self.spk2id.get(speaker)
             if not speaker_id and type(speaker) is int:
@@ -278,7 +287,8 @@ class Svc(object):
             if speaker_id is None:
                 raise RuntimeError("The name you entered is not in the speaker list!")
             sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-            c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
             n_frames = f0.size(1)
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
@@ -286,43 +296,50 @@ class Svc(object):
             start = time.time()
             vol = None
             if not self.only_diffusion:
-                vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
-                audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
-                audio = audio[0,0].data.float()
-                audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
             else:
                 audio = torch.FloatTensor(wav).to(self.dev)
                 audio_mel = None
             if self.only_diffusion or self.shallow_diffusion:
-                vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
                 if self.shallow_diffusion and second_encoding:
-                    audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
                     audio16k = torch.from_numpy(audio16k).to(self.dev)
                     c = self.hubert_model.encoder(audio16k)
                     c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
-                f0 = f0[:,:,None]
-                c = c.transpose(-1,-2)
                 audio_mel = self.diffusion_model(
-                c,
-                f0,
-                vol,
-                spk_id = sid,
-                spk_mix_dict = None,
-                gt_spec=audio_mel,
-                infer=True,
-                infer_speedup=self.diffusion_args.infer.speedup,
-                method=self.diffusion_args.infer.method,
-                k_step=k_step)
                 audio = self.vocoder.infer(audio_mel, f0).squeeze()
             if self.nsf_hifigan_enhance:
                 audio, _ = self.enhancer.enhance(
-                                    audio[None,:],
-                                    self.target_sample,
-                                    f0[:,:,None],
-                                    self.hps_ms.data.hop_length,
-                                    adaptive_key = enhancer_adaptive_key)
             if loudness_envelope_adjustment != 1:
-                audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
             use_time = time.time() - start
             print("vits use time:{}".format(use_time))
         return audio, audio.shape[-1], n_frames
@@ -335,7 +352,7 @@ class Svc(object):
         # unload model
         self.net_g_ms = self.net_g_ms.to("cpu")
         del self.net_g_ms
-        if hasattr(self,"enhancer"):
             self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
             del self.enhancer.enhancer
             del self.enhancer
@@ -352,14 +369,14 @@ class Svc(object):
                         pad_seconds=0.5,
                         clip_seconds=0,
                         lg_num=0,
-                        lgr_num =0.75,
                         f0_predictor='pm',
-                        enhancer_adaptive_key = 0,
-                        cr_threshold = 0.05,
-                        k_step = 100,
-                        use_spk_mix = False,
-                        second_encoding = False,
-                        loudness_envelope_adjustment = 1
                         ):
         if use_spk_mix:
             if len(self.spk2id) == 1:
@@ -368,12 +385,12 @@ class Svc(object):
         wav_path = Path(raw_audio_path).with_suffix('.wav')
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
         audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
-        per_size = int(clip_seconds*audio_sr)
-        lg_size = int(lg_num*audio_sr)
-        lg_size_r = int(lg_size*lgr_num)
-        lg_size_c_l = (lg_size-lg_size_r)//2
-        lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
-        lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
         if use_spk_mix:
             assert len(self.spk2id) == len(spk)
@@ -384,10 +401,10 @@ class Svc(object):
                     audio_length += aud_length // self.hop_size
                     continue
                 if per_size != 0:
-                    datas = split_list_by_n(data, per_size,lg_size)
                 else:
                     datas = [data]
-                for k,dat in enumerate(datas):
                     pad_len = int(audio_sr * pad_seconds)
                     per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
                     a_length = per_length + 2 * pad_len
@@ -397,14 +414,14 @@ class Svc(object):
             for i in range(len(spk)):
                 last_end = None
                 for mix in spk[i]:
-                    if mix[3]<0. or mix[2]<0.:
                         raise RuntimeError("mix value must higer Than zero!")
                     begin = int(audio_length * mix[0])
                     end = int(audio_length * mix[1])
                     length = end - begin
-                    if length<=0:
                         raise RuntimeError("begin Must lower Than end!")
-                    step = (mix[3] - mix[2])/length
                     if last_end is not None:
                         if last_end != begin:
                             raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
@@ -412,20 +429,20 @@ class Svc(object):
                     if step == 0.:
                         spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
                     else:
-                        spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
-                    if(len(spk_mix_data)<length):
                         num_pad = length - len(spk_mix_data)
                         spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
                     spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
-            spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
             # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
             for i, x in enumerate(spk_mix_ten[0]):
                 if x == 0.0:
                     spk_mix_ten[0][i] = 1.0
-                    spk_mix_tensor[:,i] = 1.0 / len(spk)
             spk_mix_tensor = spk_mix_tensor / spk_mix_ten
-            if not ((torch.sum(spk_mix_tensor,dim=0) - 1.)<0.0001).all():
                 raise RuntimeError("sum(spk_mix_tensor) not equal 1")
             spk = spk_mix_tensor
@@ -442,12 +459,12 @@ class Svc(object):
                 global_frame += length // self.hop_size
                 continue
             if per_size != 0:
-                datas = split_list_by_n(data, per_size,lg_size)
             else:
                 datas = [data]
-            for k,dat in enumerate(datas):
-                per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
-                if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
                 # padd
                 pad_len = int(audio_sr * pad_seconds)
                 dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
@@ -455,33 +472,34 @@ class Svc(object):
                 soundfile.write(raw_path, dat, audio_sr, format="wav")
                 raw_path.seek(0)
                 out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
-                                                    cluster_infer_ratio=cluster_infer_ratio,
-                                                    auto_predict_f0=auto_predict_f0,
-                                                    noice_scale=noice_scale,
-                                                    f0_predictor = f0_predictor,
-                                                    enhancer_adaptive_key = enhancer_adaptive_key,
-                                                    cr_threshold = cr_threshold,
-                                                    k_step = k_step,
-                                                    frame = global_frame,
-                                                    spk_mix = use_spk_mix,
-                                                    second_encoding = second_encoding,
-                                                    loudness_envelope_adjustment = loudness_envelope_adjustment
-                                                    )
                 global_frame += out_frame
                 _audio = out_audio.cpu().numpy()
                 pad_len = int(self.target_sample * pad_seconds)
                 _audio = _audio[pad_len:-pad_len]
                 _audio = pad_array(_audio, per_length)
-                if lg_size!=0 and k!=0:
-                    lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
-                    lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r]  if lgr_num != 1 else _audio[0:lg_size]
-                    lg_pre = lg1*(1-lg)+lg2*lg
-                    audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
                     audio.extend(lg_pre)
-                    _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
                 audio.extend(list(_audio))
         return np.array(audio)
 class RealTimeVC:
     def __init__(self):
         self.last_chunk = None
@@ -509,7 +527,7 @@ class RealTimeVC:
                                         auto_predict_f0=auto_predict_f0,
                                         noice_scale=noice_scale,
                                         f0_filter=f0_filter)
             audio = audio.cpu().numpy()
             self.last_chunk = audio[-self.pre_len:]
             self.last_o = audio
@@ -530,4 +548,3 @@ class RealTimeVC:
             self.last_chunk = audio[-self.pre_len:]
             self.last_o = audio
             return ret[self.chunk_len:2 * self.chunk_len]

 def get_md5(content):
     """Return the hexadecimal MD5 digest of ``content``.
     ``content`` is fed to ``hashlib`` as-is, so it must be bytes-like.
     """
     hasher = hashlib.new("md5")
     hasher.update(content)
     return hasher.hexdigest()
 def fill_a_to_b(a, b):
     """Pad list ``a`` in place with its first item until it is as long as ``b``.
     Does nothing when ``a`` is already at least as long as ``b``.
     """
     shortfall = len(b) - len(a)
     if shortfall > 0:
         # a[0] never changes while appending, so one extend matches the loop.
         a.extend([a[0]] * shortfall)
 def mkdir(paths: list):
     """Create every entry of ``paths`` that does not exist yet.
     Each path is checked right before creation and made with ``os.mkdir``.
     """
     for target in paths:
         if os.path.exists(target):
             # Already present (file or directory): leave it alone.
             continue
         os.mkdir(target)
 def pad_array(arr, target_length):
     current_length = arr.shape[0]
     if current_length >= target_length:
         pad_right = pad_width - pad_left
         padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
         return padded_arr
 def split_list_by_n(list_collection, n, pre=0):
     for i in range(0, len(list_collection), n):
+        yield list_collection[i - pre if i - pre >= 0 else i: i + n]
 class F0FilterException(Exception):
     """Raised when F0 filtering is on and the extracted F0 sums to zero."""
 class Svc(object):
     def __init__(self, net_g_path, config_path,
                  device=None,
                  cluster_model_path="logs/44k/kmeans_10000.pt",
+                 nsf_hifigan_enhance=False,
                  diffusion_model_path="logs/44k/diffusion/model_0.pt",
                  diffusion_config_path="configs/diffusion.yaml",
+                 shallow_diffusion=False,
+                 only_diffusion=False,
+                 spk_mix_enable=False,
+                 feature_retrieval=False
                  ):
         self.net_g_path = net_g_path
         self.only_diffusion = only_diffusion
         self.nsf_hifigan_enhance = nsf_hifigan_enhance
         if self.shallow_diffusion or self.only_diffusion:
             if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
+                self.diffusion_model, self.vocoder, self.diffusion_args = load_model_vocoder(diffusion_model_path,
+                                                                                             self.dev,
+                                                                                             config_path=diffusion_config_path)
                 if self.only_diffusion:
                     self.target_sample = self.diffusion_args.data.sampling_rate
                     self.hop_size = self.diffusion_args.data.block_size
             else:
                 print("No diffusion model or config found. Shallow diffusion mode will False")
                 self.shallow_diffusion = self.only_diffusion = False
         # load hubert and model
         if not self.only_diffusion:
             self.load_model(spk_mix_enable)
+            self.hubert_model = utils.get_speech_encoder(self.speech_encoder, device=self.dev)
             self.volume_extractor = utils.Volume_Extractor(self.hop_size)
         else:
+            self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder, device=self.dev)
             self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
         if os.path.exists(cluster_model_path):
             if self.feature_retrieval:
+                with open(cluster_model_path, "rb") as f:
                     self.cluster_model = pickle.load(f)
                 self.big_npy = None
                 self.now_spk_id = -1
             else:
                 self.cluster_model = cluster.get_cluster_model(cluster_model_path)
         else:
+            self.feature_retrieval = False
+        if self.shallow_diffusion: self.nsf_hifigan_enhance = False
         if self.nsf_hifigan_enhance:
             from modules.enhancer import Enhancer
+            self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model', device=self.dev)
     def load_model(self, spk_mix_enable=False):
         # get model configuration
         self.net_g_ms = SynthesizerTrn(
         if spk_mix_enable:
             self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
+    def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor, cr_threshold=0.05):
+        f0_predictor_object = utils.get_f0_predictor(f0_predictor, hop_length=self.hop_size,
+                                                     sampling_rate=self.target_sample, device=self.dev,
+                                                     threshold=cr_threshold)
         f0, uv = f0_predictor_object.compute_f0_uv(wav)
         if f0_filter and sum(f0) == 0:
             raise F0FilterException("No voice detected")
         c = self.hubert_model.encoder(wav16k)
         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
+        if cluster_infer_ratio != 0:
             if self.feature_retrieval:
                 speaker_id = self.spk2id.get(speaker)
                 if speaker_id is None:
                     if len(self.spk2id.__dict__) >= speaker:
                         speaker_id = speaker
                 feature_index = self.cluster_model[speaker_id]
+                feat_np = c.transpose(0, 1).cpu().numpy()
                 if self.big_npy is None or self.now_spk_id != speaker_id:
+                    self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
+                    self.now_spk_id = speaker_id
                 print("starting feature retrieval...")
                 score, ix = feature_index.search(feat_np, k=8)
                 weight = np.square(1 / score)
                 weight /= weight.sum(axis=1, keepdims=True)
                 npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
                 c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
+                c = torch.FloatTensor(c).to(self.dev).transpose(0, 1)
                 print("end feature retrieval...")
             else:
                 cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
               noice_scale=0.4,
               f0_filter=False,
               f0_predictor='pm',
+              enhancer_adaptive_key=0,
+              cr_threshold=0.05,
+              k_step=100,
+              frame=0,
+              spk_mix=False,
+              second_encoding=False,
+              loudness_envelope_adjustment=1
               ):
         wav, sr = librosa.load(raw_path, sr=self.target_sample)
         if spk_mix:
+            c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter, f0_predictor, cr_threshold=cr_threshold)
             n_frames = f0.size(1)
+            sid = speaker[:, frame:frame + n_frames].transpose(0, 1)
         else:
             speaker_id = self.spk2id.get(speaker)
             if not speaker_id and type(speaker) is int:
             if speaker_id is None:
                 raise RuntimeError("The name you entered is not in the speaker list!")
             sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
+            c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor,
+                                         cr_threshold=cr_threshold)
             n_frames = f0.size(1)
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
             start = time.time()
             vol = None
             if not self.only_diffusion:
+                vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None, :])[None, :].to(
+                    self.dev) if self.vol_embedding else None
+                audio, f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0,
+                                                noice_scale=noice_scale, vol=vol)
+                audio = audio[0, 0].data.float()
+                audio_mel = self.vocoder.extract(audio[None, :], self.target_sample) if self.shallow_diffusion else None
             else:
                 audio = torch.FloatTensor(wav).to(self.dev)
                 audio_mel = None
             if self.only_diffusion or self.shallow_diffusion:
+                vol = self.volume_extractor.extract(audio[None, :])[None, :, None].to(self.dev) if vol == None else vol[
+                                                                                                                    :,
+                                                                                                                    :,
+                                                                                                                    None]
                 if self.shallow_diffusion and second_encoding:
+                    audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample,
+                                                target_sr=16000)
                     audio16k = torch.from_numpy(audio16k).to(self.dev)
                     c = self.hubert_model.encoder(audio16k)
                     c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
+                f0 = f0[:, :, None]
+                c = c.transpose(-1, -2)
                 audio_mel = self.diffusion_model(
+                    c,
+                    f0,
+                    vol,
+                    spk_id=sid,
+                    spk_mix_dict=None,
+                    gt_spec=audio_mel,
+                    infer=True,
+                    infer_speedup=self.diffusion_args.infer.speedup,
+                    method=self.diffusion_args.infer.method,
+                    k_step=k_step)
                 audio = self.vocoder.infer(audio_mel, f0).squeeze()
             if self.nsf_hifigan_enhance:
                 audio, _ = self.enhancer.enhance(
+                    audio[None, :],
+                    self.target_sample,
+                    f0[:, :, None],
+                    self.hps_ms.data.hop_length,
+                    adaptive_key=enhancer_adaptive_key)
             if loudness_envelope_adjustment != 1:
+                audio = utils.change_rms(wav, self.target_sample, audio, self.target_sample,
+                                         loudness_envelope_adjustment)
             use_time = time.time() - start
             print("vits use time:{}".format(use_time))
         return audio, audio.shape[-1], n_frames
         # unload model
         self.net_g_ms = self.net_g_ms.to("cpu")
         del self.net_g_ms
+        if hasattr(self, "enhancer"):
             self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
             del self.enhancer.enhancer
             del self.enhancer
                         pad_seconds=0.5,
                         clip_seconds=0,
                         lg_num=0,
+                        lgr_num=0.75,
                         f0_predictor='pm',
+                        enhancer_adaptive_key=0,
+                        cr_threshold=0.05,
+                        k_step=100,
+                        use_spk_mix=False,
+                        second_encoding=False,
+                        loudness_envelope_adjustment=1
                         ):
         if use_spk_mix:
             if len(self.spk2id) == 1:
         wav_path = Path(raw_audio_path).with_suffix('.wav')
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
         audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
+        per_size = int(clip_seconds * audio_sr)
+        lg_size = int(lg_num * audio_sr)
+        lg_size_r = int(lg_size * lgr_num)
+        lg_size_c_l = (lg_size - lg_size_r) // 2
+        lg_size_c_r = lg_size - lg_size_r - lg_size_c_l
+        lg = np.linspace(0, 1, lg_size_r) if lg_size != 0 else 0
         if use_spk_mix:
             assert len(self.spk2id) == len(spk)
                     audio_length += aud_length // self.hop_size
                     continue
                 if per_size != 0:
+                    datas = split_list_by_n(data, per_size, lg_size)
                 else:
                     datas = [data]
+                for k, dat in enumerate(datas):
                     pad_len = int(audio_sr * pad_seconds)
                     per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
                     a_length = per_length + 2 * pad_len
             for i in range(len(spk)):
                 last_end = None
                 for mix in spk[i]:
+                    if mix[3] < 0. or mix[2] < 0.:
                         raise RuntimeError("mix value must higer Than zero!")
                     begin = int(audio_length * mix[0])
                     end = int(audio_length * mix[1])
                     length = end - begin
+                    if length <= 0:
                         raise RuntimeError("begin Must lower Than end!")
+                    step = (mix[3] - mix[2]) / length
                     if last_end is not None:
                         if last_end != begin:
                             raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
                     if step == 0.:
                         spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
                     else:
+                        spk_mix_data = torch.arange(mix[2], mix[3], step).to(self.dev)
+                    if (len(spk_mix_data) < length):
                         num_pad = length - len(spk_mix_data)
                         spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
                     spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
+            spk_mix_ten = torch.sum(spk_mix_tensor, dim=0).unsqueeze(0).to(self.dev)
             # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
             for i, x in enumerate(spk_mix_ten[0]):
                 if x == 0.0:
                     spk_mix_ten[0][i] = 1.0
+                    spk_mix_tensor[:, i] = 1.0 / len(spk)
             spk_mix_tensor = spk_mix_tensor / spk_mix_ten
+            if not ((torch.sum(spk_mix_tensor, dim=0) - 1.) < 0.0001).all():
                 raise RuntimeError("sum(spk_mix_tensor) not equal 1")
             spk = spk_mix_tensor
                 global_frame += length // self.hop_size
                 continue
             if per_size != 0:
+                datas = split_list_by_n(data, per_size, lg_size)
             else:
                 datas = [data]
+            for k, dat in enumerate(datas):
+                per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds != 0 else length
+                if clip_seconds != 0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
                 # pad: add pad_seconds of zeros (silence) to both ends of the segment
                 pad_len = int(audio_sr * pad_seconds)
                 dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
                 soundfile.write(raw_path, dat, audio_sr, format="wav")
                 raw_path.seek(0)
                 out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
+                                                          cluster_infer_ratio=cluster_infer_ratio,
+                                                          auto_predict_f0=auto_predict_f0,
+                                                          noice_scale=noice_scale,
+                                                          f0_predictor=f0_predictor,
+                                                          enhancer_adaptive_key=enhancer_adaptive_key,
+                                                          cr_threshold=cr_threshold,
+                                                          k_step=k_step,
+                                                          frame=global_frame,
+                                                          spk_mix=use_spk_mix,
+                                                          second_encoding=second_encoding,
+                                                          loudness_envelope_adjustment=loudness_envelope_adjustment
+                                                          )
                 global_frame += out_frame
                 _audio = out_audio.cpu().numpy()
                 pad_len = int(self.target_sample * pad_seconds)
                 _audio = _audio[pad_len:-pad_len]
                 _audio = pad_array(_audio, per_length)
+                if lg_size != 0 and k != 0:
+                    lg1 = audio[-(lg_size_r + lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
+                    lg2 = _audio[lg_size_c_l:lg_size_c_l + lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
+                    lg_pre = lg1 * (1 - lg) + lg2 * lg
+                    audio = audio[0:-(lg_size_r + lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
                     audio.extend(lg_pre)
+                    _audio = _audio[lg_size_c_l + lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
                 audio.extend(list(_audio))
         return np.array(audio)
 class RealTimeVC:
     def __init__(self):
         self.last_chunk = None
                                         auto_predict_f0=auto_predict_f0,
                                         noice_scale=noice_scale,
                                         f0_filter=f0_filter)
             audio = audio.cpu().numpy()
             self.last_chunk = audio[-self.pre_len:]
             self.last_o = audio
             self.last_chunk = audio[-self.pre_len:]
             self.last_o = audio
             return ret[self.chunk_len:2 * self.chunk_len]