diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..6d8a89a885c2c7cacc22288043d53d51a596fc75 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +examples/instrumental/blue_instrumental.wav filter=lfs diff=lfs merge=lfs -text +examples/instrumental/Counter_clockwise_Clock_instrumental.wav filter=lfs diff=lfs merge=lfs -text +examples/instrumental/one_last_kiss_instrumental.wav filter=lfs diff=lfs merge=lfs -text +examples/song/blue.wav filter=lfs diff=lfs merge=lfs -text +examples/song/Counter_clockwise_Clock.wav filter=lfs diff=lfs merge=lfs -text +examples/song/one_last_kiss.wav filter=lfs diff=lfs merge=lfs -text +examples/vocals/blue_vocal.wav filter=lfs diff=lfs merge=lfs -text +examples/vocals/Counter_clockwise_Clock_vocal.wav filter=lfs diff=lfs merge=lfs -text +examples/vocals/one_last_kiss_vocal.wav filter=lfs diff=lfs merge=lfs -text +output_2stems/blue-instrumental.wav filter=lfs diff=lfs merge=lfs -text +output_2stems/blue-vocals.wav filter=lfs diff=lfs merge=lfs -text +output_2stems/temp-instrumental.wav filter=lfs diff=lfs merge=lfs -text +output_2stems/temp-vocals.wav filter=lfs diff=lfs merge=lfs -text +raw/1.wav filter=lfs diff=lfs merge=lfs -text +spleeter/2stems_instrumental.pdparams filter=lfs diff=lfs merge=lfs -text +spleeter/2stems_vocals.pdparams filter=lfs diff=lfs merge=lfs -text +trained_models/纳西妲.pdparams filter=lfs diff=lfs merge=lfs -text +trained_models/派蒙.pdparams filter=lfs diff=lfs merge=lfs -text +trained_models/YH.pdparams filter=lfs diff=lfs merge=lfs -text diff --git a/.ipynb_checkpoints/build.gradio-checkpoint.py b/.ipynb_checkpoints/build.gradio-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..655744d20675e2ce6bf7c96fa99af8714b8a14ec --- /dev/null +++ 
def reboot():
    """Restart the app by replacing this process with a fresh interpreter.

    Bound to the "重启程序" button. ``os.execv`` does not return on success.
    """
    # `sys` is not among this script's top-level imports; without this local
    # import the call below raised NameError instead of restarting.
    import sys

    os.execv(sys.executable, ['python'] + sys.argv)


def _to_mono_float32(samples, sampling_rate, target_sr):
    """Return ``samples`` as a mono float32 signal sampled at ``target_sr``.

    Integer PCM is scaled by the maximum of its dtype. Floating-point input
    is only cast, because ``np.iinfo`` rejects float dtypes.
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples / np.iinfo(samples.dtype).max
    samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Input is (frames, channels); librosa.to_mono wants (channels, frames).
        samples = librosa.to_mono(samples.transpose(1, 0))
    if sampling_rate != target_sr:
        samples = librosa.resample(samples, orig_sr=sampling_rate, target_sr=target_sr)
    return samples


def separate_fn(song_input):
    """Split an uploaded song into vocal and instrumental stems.

    Args:
        song_input: ``(sampling_rate, samples)`` tuple, or ``None`` when
            nothing was uploaded.

    Returns:
        ``(message, vocal_path, instrumental_path, vocal_path,
        instrumental_path)``. The two paths are repeated because the click
        handler feeds five output components. On failure the message is the
        formatted traceback and every path is ``None``.
    """
    if song_input is None:
        return "请上传歌曲", None, None, None, None
    try:
        params_2stems = {
            'sample_rate': 44100,
            'frame_length': 4096,
            'frame_step': 1024,
            'T': 512,
            'F': 1024,
            'num_instruments': ['vocals', 'instrumental'],
            'output_dir': build_dir + '/output_2stems',
            'checkpoint_path': build_dir + '/spleeter',
            'use_elu': False,
        }
        sampling_rate, song = song_input
        # NOTE(review): a fixed file name in the working directory is shared
        # by every request; concurrent users would overwrite each other.
        soundfile.write("temp.wav", song, sampling_rate, format="wav")
        # Initialise the separator and run it on the temporary file.
        sep = Separator(params_2stems)
        sep.separate('temp.wav')
        vocal_path = params_2stems["output_dir"] + "/temp-vocals.wav"
        instrumental_path = params_2stems["output_dir"] + "/temp-instrumental.wav"
        return "分离成功,请继续前往体验【转换】和【混音】", vocal_path, instrumental_path, vocal_path, instrumental_path
    except Exception:
        # Surface the failure in the UI text box instead of crashing the app.
        import traceback
        return traceback.format_exc(), None, None, None, None


def convert_fn(model_name, input_audio, input_audio_micro, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale):
    """Convert a vocal recording with the selected voice model.

    Args:
        model_name: Stem of a ``.pdparams`` file under ``trained_models``.
        input_audio: ``(sampling_rate, samples)`` from the upload widget, or ``None``.
        input_audio_micro: Same shape of value from the microphone widget;
            takes priority over ``input_audio`` when present.
        vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale:
            Forwarded unchanged to ``Svc.slice_inference``.

    Returns:
        ``(message, audio, audio)`` where ``audio`` is ``(44100, samples)``
        on success and ``None`` otherwise. On an exception the message is
        the formatted traceback.
    """
    try:
        # Validate the input first so a model is never loaded for a request
        # that cannot be served.
        if input_audio is None and input_audio_micro is None:
            return "请上传音频", None, None

        model = cache_model.get(model_name)
        if model is None:
            if not paddle.device.is_compiled_with_cuda() and cache_model:
                return f"目前运行环境为CPU,受制于平台算力,每次启动本项目只允许加载1个模型,当前已加载{next(iter(cache_model))}", None, None
            config_path = f"{build_dir}/trained_models/config.json"
            model = Svc(f"{build_dir}/trained_models/{model_name}.pdparams", config_path, mode="test")
            cache_model[model_name] = model

        if input_audio_micro is not None:
            input_audio = input_audio_micro
        sampling_rate, audio = input_audio
        # The audio is resampled to 16 kHz before being written for inference.
        audio = _to_mono_float32(audio, sampling_rate, 16000)
        print(audio.shape)
        out_wav_path = "temp.wav"
        soundfile.write(out_wav_path, audio, 16000, format="wav")
        print(cluster_ratio, auto_f0, noise_scale)
        _audio = model.slice_inference(out_wav_path, model_name, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale)
        return "转换成功,请继续前往体验【混音】", (44100, _audio), (44100, _audio)
    except Exception:
        import traceback
        return traceback.format_exc(), None, None


def compose_fn(input_vocal, input_instrumental, mixing_ratio=0.5):
    """Mix a vocal track with an instrumental track at 44.1 kHz.

    Args:
        input_vocal: ``(sampling_rate, samples)`` or ``None``.
        input_instrumental: ``(sampling_rate, samples)`` or ``None``.
        mixing_ratio: Weight of the instrumental in ``[0, 1]``; the vocal is
            weighted by ``1 - mixing_ratio``.

    Returns:
        ``(message, (44100, mixed_float32))`` on success, or
        ``(message, None)`` when an input is missing or an error occurred.
        If the tracks differ in length the longer one is truncated and the
        message says so.
    """
    try:
        outlog = "混音成功"
        if input_vocal is None:
            return "请上传人声", None
        if input_instrumental is None:
            return "请上传伴奏", None

        vocal_sampling_rate, vocal = input_vocal
        vocal = _to_mono_float32(vocal, vocal_sampling_rate, 44100)

        instrumental_sampling_rate, instrumental = input_instrumental
        instrumental = _to_mono_float32(instrumental, instrumental_sampling_rate, 44100)

        if len(vocal) != len(instrumental):
            min_length = min(len(vocal), len(instrumental))
            instrumental = instrumental[:min_length]
            vocal = vocal[:min_length]
            outlog = "人声伴奏长度不一致,已自动截断较长的音频"

        mixed_audio = (1 - mixing_ratio) * vocal + mixing_ratio * instrumental
        mixed_audio_data = mixed_audio.astype(np.float32)
        return outlog, (44100, mixed_audio_data)
    except Exception:
        import traceback
        return traceback.format_exc(), None

SVC歌声转换全流程体验(伴奏分离,转换,混音)

') + btn_reboot = gr.Button("重启程序", variant="primary") + with gr.Tabs() as tabs: + with gr.TabItem("人声伴奏分离"): + gr.Markdown('

该项目人声分离的效果弱于UVR5,如自备分离好的伴奏和人声可跳过该步骤

') + song_input = gr.Audio(label="上传歌曲(tips:上传后点击右上角✏可以进行歌曲剪辑)",interactive=True) + gr.Examples(examples=[build_dir+"/examples/song/blue.wav",build_dir+"/examples/song/Counter_clockwise_Clock.wav",build_dir+"/examples/song/one_last_kiss.wav"],inputs=song_input,label="歌曲样例") + + btn_separate = gr.Button("人声伴奏分离", variant="primary") + text_output1 = gr.Textbox(label="输出信息") + vocal_output1 = gr.Audio(label="输出人声",interactive=False) + instrumental_output1 = gr.Audio(label="输出伴奏",interactive=False) + with gr.TabItem("转换"): + model_name = gr.Dropdown(label="模型", choices=models, value="纳西妲") + vocal_input1 = gr.Audio(label="上传人声",interactive=True) + gr.Examples(examples=[build_dir+"/examples/vocals/blue_vocal.wav",build_dir+"/examples/vocals/Counter_clockwise_Clock_vocal.wav",build_dir+"/examples/vocals/one_last_kiss_vocal.wav"],inputs=vocal_input1,label="人声样例") + btn_use_separate = gr.Button("使用【人声伴奏分离】分离的人声") + micro_input = gr.Audio(label="麦克风输入(优先于上传的人声)",source="microphone",interactive=True) + vc_transform = gr.Number(label="变调(半音数量,升八度12降八度-12)", value=0) + cluster_ratio = gr.Number(label="聚类模型混合比例", value=0,visible=False) + auto_f0 = gr.Checkbox(label="自动预测音高(转换歌声时不要打开,会严重跑调)", value=False) + slice_db = gr.Number(label="静音分贝阈值(嘈杂的音频可以-30,干声保留呼吸可以-50)", value=-50) + noise_scale = gr.Number(label="noise_scale", value=0.2) + btn_convert = gr.Button("转换", variant="primary") + text_output2 = gr.Textbox(label="输出信息") + vc_output2 = gr.Audio(label="输出音频",interactive=False) + + with gr.TabItem("混音"): + vocal_input2 = gr.Audio(label="上传人声",interactive=True) + btn_use_convert = gr.Button("使用【转换】输出的人声") + instrumental_input1 = gr.Audio(label="上传伴奏") + gr.Examples(examples=[build_dir+"/examples/instrumental/blue_instrumental.wav",build_dir+"/examples/instrumental/Counter_clockwise_Clock_instrumental.wav",build_dir+"/examples/instrumental/one_last_kiss_instrumental.wav"],inputs=instrumental_input1,label="伴奏样例") + btn_use_separate2 = gr.Button("使用【人声伴奏分离】分离的伴奏") + mixing_ratio 
def reboot():
    """Terminate the process immediately with exit status 0.

    Bound to the "重启程序" button. ``os._exit`` skips normal interpreter
    cleanup.
    """
    # NOTE(review): this only exits; presumably the hosting platform restarts
    # the app afterwards — confirm.
    os._exit(0)


def _to_mono_float32(samples, sampling_rate, target_sr):
    """Return ``samples`` as a mono float32 signal sampled at ``target_sr``.

    Integer PCM is scaled by the maximum of its dtype. Floating-point input
    is only cast, because ``np.iinfo`` rejects float dtypes.
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples / np.iinfo(samples.dtype).max
    samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Input is (frames, channels); librosa.to_mono wants (channels, frames).
        samples = librosa.to_mono(samples.transpose(1, 0))
    if sampling_rate != target_sr:
        samples = librosa.resample(samples, orig_sr=sampling_rate, target_sr=target_sr)
    return samples


def separate_fn(song_input):
    """Split an uploaded song into vocal and instrumental stems.

    Args:
        song_input: ``(sampling_rate, samples)`` tuple, or ``None`` when
            nothing was uploaded.

    Returns:
        ``(message, vocal_path, instrumental_path, vocal_path,
        instrumental_path)``. The two paths are repeated because the click
        handler feeds five output components. On failure the message is the
        formatted traceback and every path is ``None``.
    """
    if song_input is None:
        return "请上传歌曲", None, None, None, None
    try:
        params_2stems = {
            'sample_rate': 44100,
            'frame_length': 4096,
            'frame_step': 1024,
            'T': 512,
            'F': 1024,
            'num_instruments': ['vocals', 'instrumental'],
            'output_dir': build_dir + '/output_2stems',
            'checkpoint_path': build_dir + '/spleeter',
            'use_elu': False,
        }
        sampling_rate, song = song_input
        # NOTE(review): a fixed file name in the working directory is shared
        # by every request; concurrent users would overwrite each other.
        soundfile.write("temp.wav", song, sampling_rate, format="wav")
        # Initialise the separator and run it on the temporary file.
        sep = Separator(params_2stems)
        sep.separate('temp.wav')
        vocal_path = params_2stems["output_dir"] + "/temp-vocals.wav"
        instrumental_path = params_2stems["output_dir"] + "/temp-instrumental.wav"
        return "分离成功,请继续前往体验【转换】和【混音】", vocal_path, instrumental_path, vocal_path, instrumental_path
    except Exception:
        # Surface the failure in the UI text box instead of crashing the app.
        import traceback
        return traceback.format_exc(), None, None, None, None


def convert_fn(model_name, input_audio, input_audio_micro, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale):
    """Convert a vocal recording with the selected voice model.

    Args:
        model_name: Stem of a ``.pdparams`` file under ``trained_models``.
        input_audio: ``(sampling_rate, samples)`` from the upload widget, or ``None``.
        input_audio_micro: Same shape of value from the microphone widget;
            takes priority over ``input_audio`` when present.
        vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale:
            Forwarded unchanged to ``Svc.slice_inference``.

    Returns:
        ``(message, audio, audio)`` where ``audio`` is ``(44100, samples)``
        on success and ``None`` otherwise. On an exception the message is
        the formatted traceback.
    """
    try:
        # Validate the input first so a model is never loaded for a request
        # that cannot be served.
        if input_audio is None and input_audio_micro is None:
            return "请上传音频", None, None

        model = cache_model.get(model_name)
        if model is None:
            if not paddle.device.is_compiled_with_cuda() and cache_model:
                return f"目前运行环境为CPU,受制于平台算力,每次启动本项目只允许加载1个模型,当前已加载{next(iter(cache_model))}", None, None
            config_path = f"{build_dir}/trained_models/config.json"
            model = Svc(f"{build_dir}/trained_models/{model_name}.pdparams", config_path, mode="test")
            cache_model[model_name] = model

        if input_audio_micro is not None:
            input_audio = input_audio_micro
        sampling_rate, audio = input_audio
        # The audio is resampled to 16 kHz before being written for inference.
        audio = _to_mono_float32(audio, sampling_rate, 16000)
        print(audio.shape)
        out_wav_path = "temp.wav"
        soundfile.write(out_wav_path, audio, 16000, format="wav")
        print(cluster_ratio, auto_f0, noise_scale)
        _audio = model.slice_inference(out_wav_path, model_name, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale)
        return "转换成功,请继续前往体验【混音】", (44100, _audio), (44100, _audio)
    except Exception:
        import traceback
        return traceback.format_exc(), None, None


def compose_fn(input_vocal, input_instrumental, mixing_ratio=0.5):
    """Mix a vocal track with an instrumental track at 44.1 kHz.

    Args:
        input_vocal: ``(sampling_rate, samples)`` or ``None``.
        input_instrumental: ``(sampling_rate, samples)`` or ``None``.
        mixing_ratio: Weight of the instrumental in ``[0, 1]``; the vocal is
            weighted by ``1 - mixing_ratio``.

    Returns:
        ``(message, (44100, mixed_float32))`` on success, or
        ``(message, None)`` when an input is missing or an error occurred.
        If the tracks differ in length the longer one is truncated and the
        message says so.
    """
    try:
        outlog = "混音成功"
        if input_vocal is None:
            return "请上传人声", None
        if input_instrumental is None:
            return "请上传伴奏", None

        vocal_sampling_rate, vocal = input_vocal
        vocal = _to_mono_float32(vocal, vocal_sampling_rate, 44100)

        instrumental_sampling_rate, instrumental = input_instrumental
        instrumental = _to_mono_float32(instrumental, instrumental_sampling_rate, 44100)

        if len(vocal) != len(instrumental):
            min_length = min(len(vocal), len(instrumental))
            instrumental = instrumental[:min_length]
            vocal = vocal[:min_length]
            outlog = "人声伴奏长度不一致,已自动截断较长的音频"

        mixed_audio = (1 - mixing_ratio) * vocal + mixing_ratio * instrumental
        mixed_audio_data = mixed_audio.astype(np.float32)
        return outlog, (44100, mixed_audio_data)
    except Exception:
        import traceback
        return traceback.format_exc(), None

SVC歌声转换全流程体验(伴奏分离,转换,混音)

') + btn_reboot = gr.Button("重启程序", variant="primary") + with gr.Tabs() as tabs: + with gr.TabItem("人声伴奏分离"): + gr.Markdown('

该项目人声分离的效果弱于UVR5,如自备分离好的伴奏和人声可跳过该步骤

') + song_input = gr.Audio(label="上传歌曲(tips:上传后点击右上角✏可以进行歌曲剪辑)",interactive=True) + gr.Examples(examples=[build_dir+"/examples/song/blue.wav",build_dir+"/examples/song/Counter_clockwise_Clock.wav",build_dir+"/examples/song/one_last_kiss.wav"],inputs=song_input,label="歌曲样例") + + btn_separate = gr.Button("人声伴奏分离", variant="primary") + text_output1 = gr.Textbox(label="输出信息") + vocal_output1 = gr.Audio(label="输出人声",interactive=False) + instrumental_output1 = gr.Audio(label="输出伴奏",interactive=False) + with gr.TabItem("转换"): + model_name = gr.Dropdown(label="模型", choices=models, value="纳西妲") + vocal_input1 = gr.Audio(label="上传人声",interactive=True) + gr.Examples(examples=[build_dir+"/examples/vocals/blue_vocal.wav",build_dir+"/examples/vocals/Counter_clockwise_Clock_vocal.wav",build_dir+"/examples/vocals/one_last_kiss_vocal.wav"],inputs=vocal_input1,label="人声样例") + btn_use_separate = gr.Button("使用【人声伴奏分离】分离的人声") + micro_input = gr.Audio(label="麦克风输入(优先于上传的人声)",source="microphone",interactive=True) + vc_transform = gr.Number(label="变调(半音数量,升八度12降八度-12)", value=0) + cluster_ratio = gr.Number(label="聚类模型混合比例", value=0,visible=False) + auto_f0 = gr.Checkbox(label="自动预测音高(转换歌声时不要打开,会严重跑调)", value=False) + slice_db = gr.Number(label="静音分贝阈值(嘈杂的音频可以-30,干声保留呼吸可以-50)", value=-50) + noise_scale = gr.Number(label="noise_scale", value=0.2) + btn_convert = gr.Button("转换", variant="primary") + text_output2 = gr.Textbox(label="输出信息") + vc_output2 = gr.Audio(label="输出音频",interactive=False) + + with gr.TabItem("混音"): + vocal_input2 = gr.Audio(label="上传人声",interactive=True) + btn_use_convert = gr.Button("使用【转换】输出的人声") + instrumental_input1 = gr.Audio(label="上传伴奏") + gr.Examples(examples=[build_dir+"/examples/instrumental/blue_instrumental.wav",build_dir+"/examples/instrumental/Counter_clockwise_Clock_instrumental.wav",build_dir+"/examples/instrumental/one_last_kiss_instrumental.wav"],inputs=instrumental_input1,label="伴奏样例") + btn_use_separate2 = gr.Button("使用【人声伴奏分离】分离的伴奏") + mixing_ratio 
def get_cluster_model(ckpt_path: str):
    """Rebuild one KMeans estimator per speaker from a saved checkpoint.

    The checkpoint maps each speaker name to a dict with the keys
    ``n_features_in_``, ``_n_threads`` and ``cluster_centers_``. The
    estimators are not refitted; the stored attributes are written back so
    that ``predict`` can be used directly.

    Args:
        ckpt_path: Path passed to ``paddle.load``.

    Returns:
        Dict mapping speaker name to a ``KMeans`` instance.
    """
    checkpoint = paddle.load(ckpt_path)
    kmeans_dict = {}
    for spk, ckpt in checkpoint.items():
        centers = ckpt["cluster_centers_"]
        # n_clusters is the number of centroids, not the feature dimension
        # (the original passed ``n_features_in_`` here).
        km = KMeans(n_clusters=len(centers))
        km.__dict__["n_features_in_"] = ckpt["n_features_in_"]
        km.__dict__["_n_threads"] = ckpt["_n_threads"]
        km.__dict__["cluster_centers_"] = centers
        kmeans_dict[spk] = km
    return kmeans_dict


def get_cluster_result(model, x, speaker):
    """Return the cluster index predicted for every row of ``x``.

    Args:
        model: Dict of per-speaker estimators, as built by ``get_cluster_model``.
        x: np.array [t, 256]
        speaker: Key into ``model``.
    """
    return model[speaker].predict(x)


def get_cluster_center_result(model, x, speaker):
    """Return, for every row of ``x``, the centroid of its predicted cluster.

    Args:
        model: Dict of per-speaker estimators.
        x: np.array [t, 256]
        speaker: Key into ``model``.
    """
    predict = model[speaker].predict(x)
    return model[speaker].cluster_centers_[predict]


def get_center(model, x, speaker):
    """Return the centroid(s) of ``speaker`` selected by cluster index ``x``."""
    return model[speaker].cluster_centers_[x]
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False):
    """Fit a k-means model on every ``*.soft.pdtensor`` feature file in a directory.

    Args:
        in_dir: ``pathlib.Path`` of one speaker's feature directory.
        n_clusters: Number of centroids to fit.
        use_minibatch: Use ``MiniBatchKMeans`` (batch size 4096, 80
            iterations) instead of full ``KMeans``.
        verbose: Forwarded to the scikit-learn estimator.

    Returns:
        Dict with the fitted estimator's ``n_features_in_``, ``_n_threads``
        and ``cluster_centers_``.

    Raises:
        ValueError: If ``in_dir`` contains no ``*.soft.pdtensor`` files.
    """
    logger.info(f"正在从{in_dir}加载特征")
    features = []
    for path in tqdm.tqdm(in_dir.glob("*.soft.pdtensor")):
        # Squeeze and transpose each tensor so rows are the samples to
        # cluster — assumes stored shape (1, dim, frames); TODO confirm.
        features.append(paddle.load(str(path)).squeeze(0).numpy().T)
    # Number of feature files loaded (the original counter was never updated).
    nums = len(features)
    if not features:
        raise ValueError(f"no *.soft.pdtensor feature files found in {in_dir}")

    features = np.concatenate(features, axis=0)
    print(nums, features.nbytes / 1024**2, "MB , 形状:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"聚类特征的形状:{features.shape}")
    t = time.time()
    if use_minibatch:
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
    else:
        kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    print(time.time() - t, "s")

    x = {
        "n_features_in_": kmeans.n_features_in_,
        "_n_threads": kmeans._n_threads,
        "cluster_centers_": kmeans.cluster_centers_,
    }
    print("结束")

    return x


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=Path, default="./dataset/44k",
                        help='path of training data directory')
    parser.add_argument('--output', type=Path, default="logs/44k",
                        help='path of model output directory')

    args = parser.parse_args()

    checkpoint_dir = args.output
    dataset = args.dataset
    n_clusters = 10000

    # One k-means model per speaker sub-directory of the dataset.
    ckpt = {}
    for spk in os.listdir(dataset):
        if os.path.isdir(dataset / spk):
            print(f"正在给{spk}训练kmeans中……")
            in_dir = dataset / spk
            x = train_cluster(in_dir, n_clusters, verbose=False)
            ckpt[spk] = x

    checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pdparams"
    checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
    paddle.save(
        ckpt,
        str(checkpoint_path),
    )
class TextAudioSpeakerLoader(paddle.io.Dataset):
    """Dataset yielding per-utterance training features for one wav file.

    For each entry of the file list it loads the waveform, a spectrogram
    cached next to the wav, the f0 curve (``<wav>.f0.npy``), the soft content
    features (``<wav>.soft.pdtensor``) and the speaker id. The speaker id is
    looked up in ``hparams.spk`` from the wav's parent directory name.
    """

    def __init__(self, audiopaths, hparams):
        """Read the file list and copy the needed hyper-parameters.

        Args:
            audiopaths: Path of the file list, read by ``load_filepaths_and_text``.
            hparams: Config object exposing ``data``, ``train`` and ``spk``.
        """
        self.audiopaths = load_filepaths_and_text(audiopaths)
        self.max_wav_value = hparams.data.max_wav_value
        self.sampling_rate = hparams.data.sampling_rate
        self.filter_length = hparams.data.filter_length
        self.hop_length = hparams.data.hop_length
        self.win_length = hparams.data.win_length
        self.use_sr = hparams.train.use_sr
        self.spec_len = hparams.train.max_speclen
        self.spk_map = hparams.spk

        # Fixed seed: the shuffled order is reproducible. Note that this
        # reseeds the process-wide `random` generator.
        random.seed(1234)
        random.shuffle(self.audiopaths)

    def get_audio(self, filename):
        """Load and align all features for one wav file.

        Returns:
            ``(c, f0, spec, audio_norm, spk, uv)``, all trimmed to a common
            frame count. Items longer than 800 frames are cropped to a
            random 790-frame window.

        Raises:
            ValueError: If the wav's sampling rate differs from the configured one.
        """
        filename = filename.replace("\\", "/")
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pdtensor")
        if os.path.exists(spec_filename):
            spec = paddle.load(spec_filename)
        else:
            # Compute once and cache beside the wav for later epochs.
            spec = spectrogram_torch(audio_norm, self.filter_length,
                                     self.sampling_rate, self.hop_length, self.win_length,
                                     center=False)
            spec = paddle.squeeze(spec, 0)
            paddle.save(spec, spec_filename)

        # The speaker name is the wav's parent directory.
        spk = filename.split("/")[-2]
        spk = paddle.to_tensor([self.spk_map[spk]], dtype='int64')

        f0 = np.load(filename + ".f0.npy")
        f0, uv = utils.interpolate_f0(f0)
        f0 = paddle.to_tensor(f0, dtype='float32')
        uv = paddle.to_tensor(uv, dtype='float32')

        c = paddle.load(filename + ".soft.pdtensor")
        c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])

        # Content features and spectrogram may differ by a couple of frames;
        # trim everything to the shorter of the two.
        lmin = min(c.shape[-1], spec.shape[-1])
        assert abs(c.shape[-1] - spec.shape[-1]) < 3, (c.shape[-1], spec.shape[-1], f0.shape, filename)
        assert abs(audio_norm.shape[1] - lmin * self.hop_length) < 3 * self.hop_length
        spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
        audio_norm = audio_norm[:, :lmin * self.hop_length]

        if spec.shape[1] > 800:
            start = random.randint(0, spec.shape[1] - 800)
            end = start + 790
            spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
            audio_norm = audio_norm[:, start * self.hop_length: end * self.hop_length]

        return c, f0, spec, audio_norm, spk, uv

    def __getitem__(self, index):
        """Return the feature tuple for the ``index``-th file-list entry."""
        return self.get_audio(self.audiopaths[index][0])

    def __len__(self):
        """Return the number of file-list entries."""
        return len(self.audiopaths)


class TextAudioCollate:
    """Collate dataset items into zero-padded batch tensors."""

    def __call__(self, batch):
        """Pad and stack a list of ``(c, f0, spec, wav, spk, uv)`` items.

        Items are ordered by content-feature length, longest first.

        Returns:
            ``(c_padded, f0_padded, spec_padded, wav_padded, spkids,
            lengths, uv_padded)``.
        """
        batch = [b for b in batch if b is not None]

        # Sort once; the original also computed the sorted lengths but never
        # used them.
        c_lengths = paddle.to_tensor([x[0].shape[1] for x in batch], dtype='int64')
        ids_sorted_decreasing = paddle.argsort(c_lengths, axis=0, descending=True)

        max_c_len = max([x[0].shape[1] for x in batch])
        max_wav_len = max([x[3].shape[1] for x in batch])

        # Allocate the padding buffers as zeros directly instead of filling
        # them with random numbers and then zeroing them.
        lengths = paddle.zeros((len(batch),), dtype='int64')
        c_padded = paddle.zeros((len(batch), batch[0][0].shape[0], max_c_len), dtype='float32')
        f0_padded = paddle.zeros((len(batch), max_c_len), dtype='float32')
        spec_padded = paddle.zeros((len(batch), batch[0][2].shape[0], max_c_len), dtype='float32')
        wav_padded = paddle.zeros((len(batch), 1, max_wav_len), dtype='float32')
        spkids = paddle.zeros((len(batch), 1), dtype='int64')
        uv_padded = paddle.zeros((len(batch), max_c_len), dtype='float32')

        for i in range(len(ids_sorted_decreasing)):
            row = batch[ids_sorted_decreasing[i]]

            c = row[0]
            c_padded[i, :, :c.shape[1]] = c
            lengths[i] = c.shape[1]

            f0 = row[1]
            f0_padded[i, :f0.shape[0]] = f0

            spec = row[2]
            spec_padded[i, :, :spec.shape[1]] = spec

            wav = row[3]
            wav_padded[i, :, :wav.shape[1]] = wav

            spkids[i, 0] = row[4]

            uv = row[5]
            uv_padded[i, :uv.shape[0]] = uv

        return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded
a/examples/song/blue.wav b/examples/song/blue.wav new file mode 100644 index 0000000000000000000000000000000000000000..21cd366aee65b3277aa6318f94f86540f2111b02 --- /dev/null +++ b/examples/song/blue.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b80bd01d14fcd86bbee2c048f94b8b5927ae0a661ff5b2b99a014110a8c9a49 +size 2709736 diff --git a/examples/song/one_last_kiss.wav b/examples/song/one_last_kiss.wav new file mode 100644 index 0000000000000000000000000000000000000000..061fe0fa9e962439ae413f6fc1cd1486568a030a --- /dev/null +++ b/examples/song/one_last_kiss.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc671b5095cc1dfa00772de09f8fa94741482b573eafbd88188fe909cdb1f173 +size 6575226 diff --git a/examples/vocals/Counter_clockwise_Clock_vocal.wav b/examples/vocals/Counter_clockwise_Clock_vocal.wav new file mode 100644 index 0000000000000000000000000000000000000000..bfe024812c838e01a4492d4b5493a42cd13cf503 --- /dev/null +++ b/examples/vocals/Counter_clockwise_Clock_vocal.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f749f2b9a1c8af5afbc66544956337ce12fd1c5b9d64e69d1212847a36480b9e +size 2062380 diff --git a/examples/vocals/blue_vocal.wav b/examples/vocals/blue_vocal.wav new file mode 100644 index 0000000000000000000000000000000000000000..95fe19867cd090c130fff221e8bdada221d450f0 --- /dev/null +++ b/examples/vocals/blue_vocal.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5014db90a4bd53a18054ae2a2f9b1e733c8c474a6872beace1c3bd716b2cb61f +size 2484268 diff --git a/examples/vocals/one_last_kiss_vocal.wav b/examples/vocals/one_last_kiss_vocal.wav new file mode 100644 index 0000000000000000000000000000000000000000..076bc23649c95b496cde194f17c8d4b4c7b0df45 --- /dev/null +++ b/examples/vocals/one_last_kiss_vocal.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d52245bb0ac3965ebedcf730d5e21bc8d35519b84184c40b9f8ec780420b813c +size 6568602 
diff --git a/filelists/test.txt b/filelists/test.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe103f33b012552dd179637d7c80346e36f1bbb9 --- /dev/null +++ b/filelists/test.txt @@ -0,0 +1,2 @@ +./dataset/44k/yuuka/7_93.wav +./dataset/44k/yuuka/1_98.wav diff --git a/filelists/train.txt b/filelists/train.txt new file mode 100644 index 0000000000000000000000000000000000000000..9db09391ee191fbb130f60b99b81784e134887a9 --- /dev/null +++ b/filelists/train.txt @@ -0,0 +1,857 @@ +./dataset/44k/yuuka/1_44.wav +./dataset/44k/yuuka/6_3.wav +./dataset/44k/yuuka/3_136.wav +./dataset/44k/yuuka/3_63.wav +./dataset/44k/yuuka/4_14.wav +./dataset/44k/yuuka/7_168.wav +./dataset/44k/yuuka/4_12.wav +./dataset/44k/yuuka/364983.wav +./dataset/44k/yuuka/606448.wav +./dataset/44k/yuuka/5_21.wav +./dataset/44k/yuuka/8_23.wav +./dataset/44k/yuuka/9_69.wav +./dataset/44k/yuuka/5_91.wav +./dataset/44k/yuuka/3_163.wav +./dataset/44k/yuuka/6_27.wav +./dataset/44k/yuuka/234705.wav +./dataset/44k/yuuka/3_176.wav +./dataset/44k/yuuka/1_93.wav +./dataset/44k/yuuka/76490.wav +./dataset/44k/yuuka/3_179.wav +./dataset/44k/yuuka/1_69.wav +./dataset/44k/yuuka/192554.wav +./dataset/44k/yuuka/218550.wav +./dataset/44k/yuuka/6_16.wav +./dataset/44k/yuuka/5_35.wav +./dataset/44k/yuuka/207846.wav +./dataset/44k/yuuka/7_145.wav +./dataset/44k/yuuka/audio_28.wav +./dataset/44k/yuuka/2_30.wav +./dataset/44k/yuuka/1_55.wav +./dataset/44k/yuuka/5_6.wav +./dataset/44k/yuuka/3_178.wav +./dataset/44k/yuuka/3_19.wav +./dataset/44k/yuuka/audio_15.wav +./dataset/44k/yuuka/3_134.wav +./dataset/44k/yuuka/1_1.wav +./dataset/44k/yuuka/5_66.wav +./dataset/44k/yuuka/7_61.wav +./dataset/44k/yuuka/7_43.wav +./dataset/44k/yuuka/9_4.wav +./dataset/44k/yuuka/4_42.wav +./dataset/44k/yuuka/7_130.wav +./dataset/44k/yuuka/5_52.wav +./dataset/44k/yuuka/919255.wav +./dataset/44k/yuuka/4_82.wav +./dataset/44k/yuuka/1_114.wav +./dataset/44k/yuuka/7_7.wav +./dataset/44k/yuuka/1_79.wav +./dataset/44k/yuuka/1_33.wav 
+./dataset/44k/yuuka/9_21.wav +./dataset/44k/yuuka/574054.wav +./dataset/44k/yuuka/69301.wav +./dataset/44k/yuuka/5_30.wav +./dataset/44k/yuuka/119619.wav +./dataset/44k/yuuka/7_167.wav +./dataset/44k/yuuka/1_45.wav +./dataset/44k/yuuka/6_23.wav +./dataset/44k/yuuka/656221.wav +./dataset/44k/yuuka/9_65.wav +./dataset/44k/yuuka/3_85.wav +./dataset/44k/yuuka/7_188.wav +./dataset/44k/yuuka/6_13.wav +./dataset/44k/yuuka/3_60.wav +./dataset/44k/yuuka/997011.wav +./dataset/44k/yuuka/audio_1.wav +./dataset/44k/yuuka/6_39.wav +./dataset/44k/yuuka/5_15.wav +./dataset/44k/yuuka/7775.wav +./dataset/44k/yuuka/969593.wav +./dataset/44k/yuuka/7_94.wav +./dataset/44k/yuuka/39442.wav +./dataset/44k/yuuka/9_37.wav +./dataset/44k/yuuka/3_46.wav +./dataset/44k/yuuka/5_41.wav +./dataset/44k/yuuka/517653.wav +./dataset/44k/yuuka/9_38.wav +./dataset/44k/yuuka/2_18.wav +./dataset/44k/yuuka/255268.wav +./dataset/44k/yuuka/8_5.wav +./dataset/44k/yuuka/4_22.wav +./dataset/44k/yuuka/1_81.wav +./dataset/44k/yuuka/3_39.wav +./dataset/44k/yuuka/262686.wav +./dataset/44k/yuuka/3_105.wav +./dataset/44k/yuuka/571657.wav +./dataset/44k/yuuka/8_20.wav +./dataset/44k/yuuka/6_33.wav +./dataset/44k/yuuka/3_165.wav +./dataset/44k/yuuka/594447.wav +./dataset/44k/yuuka/7_181.wav +./dataset/44k/yuuka/695541.wav +./dataset/44k/yuuka/1_32.wav +./dataset/44k/yuuka/1_92.wav +./dataset/44k/yuuka/3_40.wav +./dataset/44k/yuuka/98954.wav +./dataset/44k/yuuka/8_11.wav +./dataset/44k/yuuka/7_25.wav +./dataset/44k/yuuka/4_43.wav +./dataset/44k/yuuka/3_26.wav +./dataset/44k/yuuka/7_23.wav +./dataset/44k/yuuka/1_34.wav +./dataset/44k/yuuka/3_24.wav +./dataset/44k/yuuka/3_135.wav +./dataset/44k/yuuka/4_33.wav +./dataset/44k/yuuka/9_39.wav +./dataset/44k/yuuka/52347.wav +./dataset/44k/yuuka/3_54.wav +./dataset/44k/yuuka/3_145.wav +./dataset/44k/yuuka/1_13.wav +./dataset/44k/yuuka/463536.wav +./dataset/44k/yuuka/3_104.wav +./dataset/44k/yuuka/2_20.wav +./dataset/44k/yuuka/3_66.wav +./dataset/44k/yuuka/7_182.wav 
+./dataset/44k/yuuka/3_1.wav +./dataset/44k/yuuka/5_63.wav +./dataset/44k/yuuka/3_12.wav +./dataset/44k/yuuka/audio_40.wav +./dataset/44k/yuuka/7_127.wav +./dataset/44k/yuuka/1_11.wav +./dataset/44k/yuuka/audio_10.wav +./dataset/44k/yuuka/490130.wav +./dataset/44k/yuuka/3_162.wav +./dataset/44k/yuuka/audio_11.wav +./dataset/44k/yuuka/3_42.wav +./dataset/44k/yuuka/635392.wav +./dataset/44k/yuuka/7_52.wav +./dataset/44k/yuuka/5_51.wav +./dataset/44k/yuuka/8_6.wav +./dataset/44k/yuuka/8_7.wav +./dataset/44k/yuuka/8_1.wav +./dataset/44k/yuuka/3_94.wav +./dataset/44k/yuuka/1_10.wav +./dataset/44k/yuuka/9_76.wav +./dataset/44k/yuuka/7_160.wav +./dataset/44k/yuuka/3_98.wav +./dataset/44k/yuuka/553418.wav +./dataset/44k/yuuka/3_53.wav +./dataset/44k/yuuka/295642.wav +./dataset/44k/yuuka/5_45.wav +./dataset/44k/yuuka/5_50.wav +./dataset/44k/yuuka/1_47.wav +./dataset/44k/yuuka/4_50.wav +./dataset/44k/yuuka/609212.wav +./dataset/44k/yuuka/4_77.wav +./dataset/44k/yuuka/5_13.wav +./dataset/44k/yuuka/2_21.wav +./dataset/44k/yuuka/9_25.wav +./dataset/44k/yuuka/2_6.wav +./dataset/44k/yuuka/4_1.wav +./dataset/44k/yuuka/8_4.wav +./dataset/44k/yuuka/1_14.wav +./dataset/44k/yuuka/7_55.wav +./dataset/44k/yuuka/3_90.wav +./dataset/44k/yuuka/7_98.wav +./dataset/44k/yuuka/1_26.wav +./dataset/44k/yuuka/7_187.wav +./dataset/44k/yuuka/622956.wav +./dataset/44k/yuuka/4_37.wav +./dataset/44k/yuuka/61332.wav +./dataset/44k/yuuka/1_43.wav +./dataset/44k/yuuka/233242.wav +./dataset/44k/yuuka/audio_36.wav +./dataset/44k/yuuka/2_3.wav +./dataset/44k/yuuka/9_80.wav +./dataset/44k/yuuka/3_14.wav +./dataset/44k/yuuka/9_32.wav +./dataset/44k/yuuka/audio_12.wav +./dataset/44k/yuuka/7_164.wav +./dataset/44k/yuuka/4_56.wav +./dataset/44k/yuuka/5_36.wav +./dataset/44k/yuuka/359799.wav +./dataset/44k/yuuka/6_0.wav +./dataset/44k/yuuka/5_85.wav +./dataset/44k/yuuka/6_4.wav +./dataset/44k/yuuka/537888.wav +./dataset/44k/yuuka/7_108.wav +./dataset/44k/yuuka/352307.wav +./dataset/44k/yuuka/3_18.wav 
+./dataset/44k/yuuka/3_51.wav +./dataset/44k/yuuka/8_26.wav +./dataset/44k/yuuka/7_49.wav +./dataset/44k/yuuka/3_129.wav +./dataset/44k/yuuka/2_13.wav +./dataset/44k/yuuka/7_44.wav +./dataset/44k/yuuka/3_50.wav +./dataset/44k/yuuka/5_95.wav +./dataset/44k/yuuka/audio_29.wav +./dataset/44k/yuuka/6_21.wav +./dataset/44k/yuuka/8_15.wav +./dataset/44k/yuuka/963257.wav +./dataset/44k/yuuka/7_95.wav +./dataset/44k/yuuka/2_11.wav +./dataset/44k/yuuka/4_38.wav +./dataset/44k/yuuka/1_68.wav +./dataset/44k/yuuka/1_58.wav +./dataset/44k/yuuka/3_181.wav +./dataset/44k/yuuka/3_29.wav +./dataset/44k/yuuka/7_76.wav +./dataset/44k/yuuka/2_24.wav +./dataset/44k/yuuka/6_7.wav +./dataset/44k/yuuka/7_134.wav +./dataset/44k/yuuka/3_89.wav +./dataset/44k/yuuka/audio_19.wav +./dataset/44k/yuuka/3_41.wav +./dataset/44k/yuuka/9_59.wav +./dataset/44k/yuuka/7_56.wav +./dataset/44k/yuuka/3_167.wav +./dataset/44k/yuuka/1_31.wav +./dataset/44k/yuuka/8_24.wav +./dataset/44k/yuuka/4_54.wav +./dataset/44k/yuuka/7_53.wav +./dataset/44k/yuuka/7_120.wav +./dataset/44k/yuuka/3_132.wav +./dataset/44k/yuuka/9_11.wav +./dataset/44k/yuuka/3_65.wav +./dataset/44k/yuuka/5_8.wav +./dataset/44k/yuuka/3_2.wav +./dataset/44k/yuuka/519313.wav +./dataset/44k/yuuka/audio_16.wav +./dataset/44k/yuuka/283655.wav +./dataset/44k/yuuka/6_41.wav +./dataset/44k/yuuka/5_74.wav +./dataset/44k/yuuka/5_88.wav +./dataset/44k/yuuka/9_56.wav +./dataset/44k/yuuka/2_23.wav +./dataset/44k/yuuka/3_22.wav +./dataset/44k/yuuka/4_41.wav +./dataset/44k/yuuka/9_47.wav +./dataset/44k/yuuka/4_5.wav +./dataset/44k/yuuka/974105.wav +./dataset/44k/yuuka/2_2.wav +./dataset/44k/yuuka/86716.wav +./dataset/44k/yuuka/300145.wav +./dataset/44k/yuuka/9_60.wav +./dataset/44k/yuuka/3_36.wav +./dataset/44k/yuuka/3_77.wav +./dataset/44k/yuuka/287748.wav +./dataset/44k/yuuka/7_37.wav +./dataset/44k/yuuka/3_11.wav +./dataset/44k/yuuka/5_3.wav +./dataset/44k/yuuka/1_62.wav +./dataset/44k/yuuka/8_3.wav +./dataset/44k/yuuka/audio_5.wav 
+./dataset/44k/yuuka/9_14.wav +./dataset/44k/yuuka/5_25.wav +./dataset/44k/yuuka/5_24.wav +./dataset/44k/yuuka/1_66.wav +./dataset/44k/yuuka/484944.wav +./dataset/44k/yuuka/171951.wav +./dataset/44k/yuuka/102566.wav +./dataset/44k/yuuka/3_27.wav +./dataset/44k/yuuka/3_108.wav +./dataset/44k/yuuka/audio_9.wav +./dataset/44k/yuuka/4_80.wav +./dataset/44k/yuuka/3_56.wav +./dataset/44k/yuuka/9_63.wav +./dataset/44k/yuuka/7_51.wav +./dataset/44k/yuuka/1_51.wav +./dataset/44k/yuuka/5_90.wav +./dataset/44k/yuuka/356679.wav +./dataset/44k/yuuka/724651.wav +./dataset/44k/yuuka/7_65.wav +./dataset/44k/yuuka/251091.wav +./dataset/44k/yuuka/9_10.wav +./dataset/44k/yuuka/7_2.wav +./dataset/44k/yuuka/4_10.wav +./dataset/44k/yuuka/3_182.wav +./dataset/44k/yuuka/3_55.wav +./dataset/44k/yuuka/1_9.wav +./dataset/44k/yuuka/3_97.wav +./dataset/44k/yuuka/1_25.wav +./dataset/44k/yuuka/96411.wav +./dataset/44k/yuuka/7_31.wav +./dataset/44k/yuuka/427153.wav +./dataset/44k/yuuka/4_48.wav +./dataset/44k/yuuka/1_89.wav +./dataset/44k/yuuka/audio_30.wav +./dataset/44k/yuuka/135370.wav +./dataset/44k/yuuka/7_166.wav +./dataset/44k/yuuka/7_39.wav +./dataset/44k/yuuka/4_39.wav +./dataset/44k/yuuka/6_36.wav +./dataset/44k/yuuka/8_9.wav +./dataset/44k/yuuka/4_24.wav +./dataset/44k/yuuka/4_72.wav +./dataset/44k/yuuka/5_58.wav +./dataset/44k/yuuka/7_162.wav +./dataset/44k/yuuka/6_11.wav +./dataset/44k/yuuka/audio_13.wav +./dataset/44k/yuuka/3_67.wav +./dataset/44k/yuuka/9_15.wav +./dataset/44k/yuuka/3_93.wav +./dataset/44k/yuuka/8_25.wav +./dataset/44k/yuuka/1_74.wav +./dataset/44k/yuuka/7_137.wav +./dataset/44k/yuuka/9_74.wav +./dataset/44k/yuuka/3_131.wav +./dataset/44k/yuuka/3_31.wav +./dataset/44k/yuuka/3_95.wav +./dataset/44k/yuuka/445323.wav +./dataset/44k/yuuka/1_48.wav +./dataset/44k/yuuka/audio_23.wav +./dataset/44k/yuuka/5_62.wav +./dataset/44k/yuuka/5_67.wav +./dataset/44k/yuuka/4_74.wav +./dataset/44k/yuuka/audio_35.wav +./dataset/44k/yuuka/9_3.wav +./dataset/44k/yuuka/379745.wav 
+./dataset/44k/yuuka/6_17.wav +./dataset/44k/yuuka/audio_4.wav +./dataset/44k/yuuka/3_160.wav +./dataset/44k/yuuka/9_5.wav +./dataset/44k/yuuka/9_22.wav +./dataset/44k/yuuka/1_4.wav +./dataset/44k/yuuka/2_1.wav +./dataset/44k/yuuka/9_51.wav +./dataset/44k/yuuka/6_38.wav +./dataset/44k/yuuka/3_30.wav +./dataset/44k/yuuka/3_180.wav +./dataset/44k/yuuka/5_69.wav +./dataset/44k/yuuka/3_44.wav +./dataset/44k/yuuka/3_45.wav +./dataset/44k/yuuka/7_17.wav +./dataset/44k/yuuka/audio_41.wav +./dataset/44k/yuuka/1_24.wav +./dataset/44k/yuuka/7_132.wav +./dataset/44k/yuuka/9_44.wav +./dataset/44k/yuuka/9_43.wav +./dataset/44k/yuuka/1_15.wav +./dataset/44k/yuuka/469368.wav +./dataset/44k/yuuka/144449.wav +./dataset/44k/yuuka/2_7.wav +./dataset/44k/yuuka/audio_17.wav +./dataset/44k/yuuka/5_92.wav +./dataset/44k/yuuka/7_96.wav +./dataset/44k/yuuka/5_53.wav +./dataset/44k/yuuka/6_20.wav +./dataset/44k/yuuka/8_19.wav +./dataset/44k/yuuka/5_12.wav +./dataset/44k/yuuka/3_38.wav +./dataset/44k/yuuka/7_57.wav +./dataset/44k/yuuka/6_32.wav +./dataset/44k/yuuka/3_48.wav +./dataset/44k/yuuka/5_0.wav +./dataset/44k/yuuka/7_1.wav +./dataset/44k/yuuka/3_52.wav +./dataset/44k/yuuka/7_135.wav +./dataset/44k/yuuka/1_20.wav +./dataset/44k/yuuka/206478.wav +./dataset/44k/yuuka/7_6.wav +./dataset/44k/yuuka/7_165.wav +./dataset/44k/yuuka/271096.wav +./dataset/44k/yuuka/1_17.wav +./dataset/44k/yuuka/6_9.wav +./dataset/44k/yuuka/3_122.wav +./dataset/44k/yuuka/7_10.wav +./dataset/44k/yuuka/audio_38.wav +./dataset/44k/yuuka/507694.wav +./dataset/44k/yuuka/1_28.wav +./dataset/44k/yuuka/9_7.wav +./dataset/44k/yuuka/8_27.wav +./dataset/44k/yuuka/7_8.wav +./dataset/44k/yuuka/8_29.wav +./dataset/44k/yuuka/2_12.wav +./dataset/44k/yuuka/8_12.wav +./dataset/44k/yuuka/7_26.wav +./dataset/44k/yuuka/4_9.wav +./dataset/44k/yuuka/3_21.wav +./dataset/44k/yuuka/3_61.wav +./dataset/44k/yuuka/64918.wav +./dataset/44k/yuuka/7_172.wav +./dataset/44k/yuuka/5_27.wav +./dataset/44k/yuuka/256460.wav 
+./dataset/44k/yuuka/1_7.wav +./dataset/44k/yuuka/7_185.wav +./dataset/44k/yuuka/3_62.wav +./dataset/44k/yuuka/5_40.wav +./dataset/44k/yuuka/6_30.wav +./dataset/44k/yuuka/7_32.wav +./dataset/44k/yuuka/4_76.wav +./dataset/44k/yuuka/3_185.wav +./dataset/44k/yuuka/622091.wav +./dataset/44k/yuuka/3_33.wav +./dataset/44k/yuuka/9_70.wav +./dataset/44k/yuuka/9_66.wav +./dataset/44k/yuuka/7_40.wav +./dataset/44k/yuuka/813875.wav +./dataset/44k/yuuka/7_176.wav +./dataset/44k/yuuka/4_55.wav +./dataset/44k/yuuka/3_91.wav +./dataset/44k/yuuka/4_4.wav +./dataset/44k/yuuka/5_72.wav +./dataset/44k/yuuka/551231.wav +./dataset/44k/yuuka/7_64.wav +./dataset/44k/yuuka/7_143.wav +./dataset/44k/yuuka/1_37.wav +./dataset/44k/yuuka/5_54.wav +./dataset/44k/yuuka/5_26.wav +./dataset/44k/yuuka/audio_0.wav +./dataset/44k/yuuka/3_125.wav +./dataset/44k/yuuka/1_41.wav +./dataset/44k/yuuka/7_142.wav +./dataset/44k/yuuka/4_23.wav +./dataset/44k/yuuka/3_184.wav +./dataset/44k/yuuka/6_6.wav +./dataset/44k/yuuka/9_73.wav +./dataset/44k/yuuka/3_166.wav +./dataset/44k/yuuka/2_28.wav +./dataset/44k/yuuka/9_0.wav +./dataset/44k/yuuka/1_12.wav +./dataset/44k/yuuka/6_18.wav +./dataset/44k/yuuka/349028.wav +./dataset/44k/yuuka/547091.wav +./dataset/44k/yuuka/audio_14.wav +./dataset/44k/yuuka/5_34.wav +./dataset/44k/yuuka/4_19.wav +./dataset/44k/yuuka/7_27.wav +./dataset/44k/yuuka/7_20.wav +./dataset/44k/yuuka/9_35.wav +./dataset/44k/yuuka/8_16.wav +./dataset/44k/yuuka/7_102.wav +./dataset/44k/yuuka/3_6.wav +./dataset/44k/yuuka/798678.wav +./dataset/44k/yuuka/915260.wav +./dataset/44k/yuuka/7_103.wav +./dataset/44k/yuuka/5_18.wav +./dataset/44k/yuuka/3_13.wav +./dataset/44k/yuuka/7_85.wav +./dataset/44k/yuuka/85191.wav +./dataset/44k/yuuka/5_76.wav +./dataset/44k/yuuka/3_4.wav +./dataset/44k/yuuka/3_142.wav +./dataset/44k/yuuka/4_59.wav +./dataset/44k/yuuka/711779.wav +./dataset/44k/yuuka/7_104.wav +./dataset/44k/yuuka/7_129.wav +./dataset/44k/yuuka/470902.wav +./dataset/44k/yuuka/1_49.wav 
+./dataset/44k/yuuka/7_11.wav +./dataset/44k/yuuka/6_1.wav +./dataset/44k/yuuka/1_67.wav +./dataset/44k/yuuka/7_170.wav +./dataset/44k/yuuka/1_39.wav +./dataset/44k/yuuka/550982.wav +./dataset/44k/yuuka/7_125.wav +./dataset/44k/yuuka/9_75.wav +./dataset/44k/yuuka/8_21.wav +./dataset/44k/yuuka/3_0.wav +./dataset/44k/yuuka/1_18.wav +./dataset/44k/yuuka/audio_42.wav +./dataset/44k/yuuka/5_20.wav +./dataset/44k/yuuka/4_46.wav +./dataset/44k/yuuka/253501.wav +./dataset/44k/yuuka/7_184.wav +./dataset/44k/yuuka/8_30.wav +./dataset/44k/yuuka/2_0.wav +./dataset/44k/yuuka/7_140.wav +./dataset/44k/yuuka/9_77.wav +./dataset/44k/yuuka/7_89.wav +./dataset/44k/yuuka/56932.wav +./dataset/44k/yuuka/audio_32.wav +./dataset/44k/yuuka/7_29.wav +./dataset/44k/yuuka/7_50.wav +./dataset/44k/yuuka/9_64.wav +./dataset/44k/yuuka/3_34.wav +./dataset/44k/yuuka/4_36.wav +./dataset/44k/yuuka/1_38.wav +./dataset/44k/yuuka/9_34.wav +./dataset/44k/yuuka/705069.wav +./dataset/44k/yuuka/7_54.wav +./dataset/44k/yuuka/3_121.wav +./dataset/44k/yuuka/7_177.wav +./dataset/44k/yuuka/audio_3.wav +./dataset/44k/yuuka/437916.wav +./dataset/44k/yuuka/7_169.wav +./dataset/44k/yuuka/6_24.wav +./dataset/44k/yuuka/2_5.wav +./dataset/44k/yuuka/5_48.wav +./dataset/44k/yuuka/3_47.wav +./dataset/44k/yuuka/3_78.wav +./dataset/44k/yuuka/3_187.wav +./dataset/44k/yuuka/7_124.wav +./dataset/44k/yuuka/3_169.wav +./dataset/44k/yuuka/321860.wav +./dataset/44k/yuuka/5_46.wav +./dataset/44k/yuuka/3_86.wav +./dataset/44k/yuuka/3_87.wav +./dataset/44k/yuuka/1_83.wav +./dataset/44k/yuuka/1_36.wav +./dataset/44k/yuuka/1_116.wav +./dataset/44k/yuuka/687395.wav +./dataset/44k/yuuka/1_35.wav +./dataset/44k/yuuka/7_186.wav +./dataset/44k/yuuka/9_49.wav +./dataset/44k/yuuka/8_18.wav +./dataset/44k/yuuka/6_14.wav +./dataset/44k/yuuka/5_4.wav +./dataset/44k/yuuka/9_58.wav +./dataset/44k/yuuka/498387.wav +./dataset/44k/yuuka/5_42.wav +./dataset/44k/yuuka/3_17.wav +./dataset/44k/yuuka/4_3.wav +./dataset/44k/yuuka/7_24.wav 
+./dataset/44k/yuuka/314228.wav +./dataset/44k/yuuka/652599.wav +./dataset/44k/yuuka/7_28.wav +./dataset/44k/yuuka/3_140.wav +./dataset/44k/yuuka/7_3.wav +./dataset/44k/yuuka/915926.wav +./dataset/44k/yuuka/3_183.wav +./dataset/44k/yuuka/7_13.wav +./dataset/44k/yuuka/3_137.wav +./dataset/44k/yuuka/968783.wav +./dataset/44k/yuuka/1_52.wav +./dataset/44k/yuuka/4_66.wav +./dataset/44k/yuuka/5_2.wav +./dataset/44k/yuuka/5_9.wav +./dataset/44k/yuuka/5_70.wav +./dataset/44k/yuuka/7_83.wav +./dataset/44k/yuuka/8_17.wav +./dataset/44k/yuuka/5_7.wav +./dataset/44k/yuuka/1_70.wav +./dataset/44k/yuuka/5_89.wav +./dataset/44k/yuuka/3_9.wav +./dataset/44k/yuuka/3_84.wav +./dataset/44k/yuuka/1_29.wav +./dataset/44k/yuuka/6_15.wav +./dataset/44k/yuuka/884738.wav +./dataset/44k/yuuka/1_72.wav +./dataset/44k/yuuka/168875.wav +./dataset/44k/yuuka/833141.wav +./dataset/44k/yuuka/7_68.wav +./dataset/44k/yuuka/7_41.wav +./dataset/44k/yuuka/4_0.wav +./dataset/44k/yuuka/7_48.wav +./dataset/44k/yuuka/9_81.wav +./dataset/44k/yuuka/4_44.wav +./dataset/44k/yuuka/958019.wav +./dataset/44k/yuuka/9_50.wav +./dataset/44k/yuuka/5_68.wav +./dataset/44k/yuuka/3_32.wav +./dataset/44k/yuuka/7_106.wav +./dataset/44k/yuuka/1_16.wav +./dataset/44k/yuuka/5_16.wav +./dataset/44k/yuuka/3_20.wav +./dataset/44k/yuuka/502529.wav +./dataset/44k/yuuka/237547.wav +./dataset/44k/yuuka/3_186.wav +./dataset/44k/yuuka/audio_27.wav +./dataset/44k/yuuka/5_5.wav +./dataset/44k/yuuka/7_35.wav +./dataset/44k/yuuka/3_120.wav +./dataset/44k/yuuka/5_59.wav +./dataset/44k/yuuka/7_78.wav +./dataset/44k/yuuka/650180.wav +./dataset/44k/yuuka/audio_24.wav +./dataset/44k/yuuka/678092.wav +./dataset/44k/yuuka/6_2.wav +./dataset/44k/yuuka/5_23.wav +./dataset/44k/yuuka/1_88.wav +./dataset/44k/yuuka/240071.wav +./dataset/44k/yuuka/1_46.wav +./dataset/44k/yuuka/15940.wav +./dataset/44k/yuuka/8_22.wav +./dataset/44k/yuuka/4_2.wav +./dataset/44k/yuuka/5_93.wav +./dataset/44k/yuuka/7_87.wav +./dataset/44k/yuuka/454791.wav 
+./dataset/44k/yuuka/5_73.wav +./dataset/44k/yuuka/1_0.wav +./dataset/44k/yuuka/7_4.wav +./dataset/44k/yuuka/9_1.wav +./dataset/44k/yuuka/586033.wav +./dataset/44k/yuuka/1_23.wav +./dataset/44k/yuuka/2_27.wav +./dataset/44k/yuuka/5_38.wav +./dataset/44k/yuuka/4_34.wav +./dataset/44k/yuuka/347125.wav +./dataset/44k/yuuka/2_15.wav +./dataset/44k/yuuka/6_12.wav +./dataset/44k/yuuka/1_84.wav +./dataset/44k/yuuka/975179.wav +./dataset/44k/yuuka/2_25.wav +./dataset/44k/yuuka/4_65.wav +./dataset/44k/yuuka/9_72.wav +./dataset/44k/yuuka/6_8.wav +./dataset/44k/yuuka/380298.wav +./dataset/44k/yuuka/1_56.wav +./dataset/44k/yuuka/1_5.wav +./dataset/44k/yuuka/1_94.wav +./dataset/44k/yuuka/7_86.wav +./dataset/44k/yuuka/7_42.wav +./dataset/44k/yuuka/1_64.wav +./dataset/44k/yuuka/5_115.wav +./dataset/44k/yuuka/audio_26.wav +./dataset/44k/yuuka/4_63.wav +./dataset/44k/yuuka/5_83.wav +./dataset/44k/yuuka/5_64.wav +./dataset/44k/yuuka/7_150.wav +./dataset/44k/yuuka/7_179.wav +./dataset/44k/yuuka/6_35.wav +./dataset/44k/yuuka/7_9.wav +./dataset/44k/yuuka/3_127.wav +./dataset/44k/yuuka/1_21.wav +./dataset/44k/yuuka/5_31.wav +./dataset/44k/yuuka/5_78.wav +./dataset/44k/yuuka/9_6.wav +./dataset/44k/yuuka/4_71.wav +./dataset/44k/yuuka/2_9.wav +./dataset/44k/yuuka/617112.wav +./dataset/44k/yuuka/8_28.wav +./dataset/44k/yuuka/5_11.wav +./dataset/44k/yuuka/832708.wav +./dataset/44k/yuuka/8_0.wav +./dataset/44k/yuuka/540598.wav +./dataset/44k/yuuka/7_38.wav +./dataset/44k/yuuka/1_71.wav +./dataset/44k/yuuka/6_37.wav +./dataset/44k/yuuka/394815.wav +./dataset/44k/yuuka/9_24.wav +./dataset/44k/yuuka/5_49.wav +./dataset/44k/yuuka/3_103.wav +./dataset/44k/yuuka/1_90.wav +./dataset/44k/yuuka/9_13.wav +./dataset/44k/yuuka/66052.wav +./dataset/44k/yuuka/1_96.wav +./dataset/44k/yuuka/3_141.wav +./dataset/44k/yuuka/1_40.wav +./dataset/44k/yuuka/2_17.wav +./dataset/44k/yuuka/7_88.wav +./dataset/44k/yuuka/386393.wav +./dataset/44k/yuuka/2_10.wav +./dataset/44k/yuuka/4_79.wav +./dataset/44k/yuuka/9_33.wav 
+./dataset/44k/yuuka/7_91.wav +./dataset/44k/yuuka/audio_33.wav +./dataset/44k/yuuka/7_0.wav +./dataset/44k/yuuka/3_138.wav +./dataset/44k/yuuka/872587.wav +./dataset/44k/yuuka/7_122.wav +./dataset/44k/yuuka/40927.wav +./dataset/44k/yuuka/5_33.wav +./dataset/44k/yuuka/4_11.wav +./dataset/44k/yuuka/4_40.wav +./dataset/44k/yuuka/3_28.wav +./dataset/44k/yuuka/audio_37.wav +./dataset/44k/yuuka/228114.wav +./dataset/44k/yuuka/629546.wav +./dataset/44k/yuuka/1_100.wav +./dataset/44k/yuuka/4_62.wav +./dataset/44k/yuuka/1_6.wav +./dataset/44k/yuuka/650930.wav +./dataset/44k/yuuka/7_121.wav +./dataset/44k/yuuka/1_95.wav +./dataset/44k/yuuka/audio_20.wav +./dataset/44k/yuuka/752079.wav +./dataset/44k/yuuka/7_33.wav +./dataset/44k/yuuka/1_99.wav +./dataset/44k/yuuka/5_1.wav +./dataset/44k/yuuka/571895.wav +./dataset/44k/yuuka/3_8.wav +./dataset/44k/yuuka/5_99.wav +./dataset/44k/yuuka/5_14.wav +./dataset/44k/yuuka/3_68.wav +./dataset/44k/yuuka/7_161.wav +./dataset/44k/yuuka/2_16.wav +./dataset/44k/yuuka/audio_2.wav +./dataset/44k/yuuka/2_4.wav +./dataset/44k/yuuka/1_3.wav +./dataset/44k/yuuka/260935.wav +./dataset/44k/yuuka/3_152.wav +./dataset/44k/yuuka/4_35.wav +./dataset/44k/yuuka/617923.wav +./dataset/44k/yuuka/5_98.wav +./dataset/44k/yuuka/4_49.wav +./dataset/44k/yuuka/audio_43.wav +./dataset/44k/yuuka/73560.wav +./dataset/44k/yuuka/3_144.wav +./dataset/44k/yuuka/5_82.wav +./dataset/44k/yuuka/4_7.wav +./dataset/44k/yuuka/651496.wav +./dataset/44k/yuuka/3_49.wav +./dataset/44k/yuuka/audio_6.wav +./dataset/44k/yuuka/9_71.wav +./dataset/44k/yuuka/1_59.wav +./dataset/44k/yuuka/3_25.wav +./dataset/44k/yuuka/7_92.wav +./dataset/44k/yuuka/7_46.wav +./dataset/44k/yuuka/4_73.wav +./dataset/44k/yuuka/7_136.wav +./dataset/44k/yuuka/9_79.wav +./dataset/44k/yuuka/3_175.wav +./dataset/44k/yuuka/9_41.wav +./dataset/44k/yuuka/9_31.wav +./dataset/44k/yuuka/761979.wav +./dataset/44k/yuuka/3_83.wav +./dataset/44k/yuuka/3_177.wav +./dataset/44k/yuuka/7_163.wav +./dataset/44k/yuuka/3_43.wav 
+./dataset/44k/yuuka/3_37.wav +./dataset/44k/yuuka/8_2.wav +./dataset/44k/yuuka/8_10.wav +./dataset/44k/yuuka/820939.wav +./dataset/44k/yuuka/778182.wav +./dataset/44k/yuuka/6_31.wav +./dataset/44k/yuuka/3_164.wav +./dataset/44k/yuuka/5_32.wav +./dataset/44k/yuuka/215854.wav +./dataset/44k/yuuka/6_5.wav +./dataset/44k/yuuka/3_109.wav +./dataset/44k/yuuka/6_42.wav +./dataset/44k/yuuka/188142.wav +./dataset/44k/yuuka/6_40.wav +./dataset/44k/yuuka/4_30.wav +./dataset/44k/yuuka/3_161.wav +./dataset/44k/yuuka/7_62.wav +./dataset/44k/yuuka/2_22.wav +./dataset/44k/yuuka/4_69.wav +./dataset/44k/yuuka/4_6.wav +./dataset/44k/yuuka/577791.wav +./dataset/44k/yuuka/723735.wav +./dataset/44k/yuuka/7_90.wav +./dataset/44k/yuuka/5_96.wav +./dataset/44k/yuuka/1_82.wav +./dataset/44k/yuuka/225368.wav +./dataset/44k/yuuka/audio_31.wav +./dataset/44k/yuuka/3_130.wav +./dataset/44k/yuuka/audio_39.wav +./dataset/44k/yuuka/7_153.wav +./dataset/44k/yuuka/7_183.wav +./dataset/44k/yuuka/870812.wav +./dataset/44k/yuuka/5_114.wav +./dataset/44k/yuuka/719896.wav +./dataset/44k/yuuka/5_65.wav +./dataset/44k/yuuka/269297.wav +./dataset/44k/yuuka/3_124.wav +./dataset/44k/yuuka/9_57.wav +./dataset/44k/yuuka/7_18.wav +./dataset/44k/yuuka/619226.wav +./dataset/44k/yuuka/207781.wav +./dataset/44k/yuuka/5_28.wav +./dataset/44k/yuuka/5_71.wav +./dataset/44k/yuuka/143646.wav +./dataset/44k/yuuka/383188.wav +./dataset/44k/yuuka/7_131.wav +./dataset/44k/yuuka/3_92.wav +./dataset/44k/yuuka/892318.wav +./dataset/44k/yuuka/1_57.wav +./dataset/44k/yuuka/audio_21.wav +./dataset/44k/yuuka/4_57.wav +./dataset/44k/yuuka/3_96.wav +./dataset/44k/yuuka/7_12.wav +./dataset/44k/yuuka/6_29.wav +./dataset/44k/yuuka/3_3.wav +./dataset/44k/yuuka/7_97.wav +./dataset/44k/yuuka/1_54.wav +./dataset/44k/yuuka/9_78.wav +./dataset/44k/yuuka/76879.wav +./dataset/44k/yuuka/3_123.wav +./dataset/44k/yuuka/1_91.wav +./dataset/44k/yuuka/7_138.wav +./dataset/44k/yuuka/7_77.wav +./dataset/44k/yuuka/5_55.wav +./dataset/44k/yuuka/2_29.wav 
+./dataset/44k/yuuka/2_19.wav +./dataset/44k/yuuka/4_45.wav +./dataset/44k/yuuka/7_84.wav +./dataset/44k/yuuka/3_143.wav +./dataset/44k/yuuka/1_8.wav +./dataset/44k/yuuka/3_64.wav +./dataset/44k/yuuka/1_30.wav +./dataset/44k/yuuka/3_10.wav +./dataset/44k/yuuka/854231.wav +./dataset/44k/yuuka/6_26.wav +./dataset/44k/yuuka/3_23.wav +./dataset/44k/yuuka/5_44.wav +./dataset/44k/yuuka/7_45.wav +./dataset/44k/yuuka/5_10.wav +./dataset/44k/yuuka/7_180.wav +./dataset/44k/yuuka/9_12.wav +./dataset/44k/yuuka/audio_8.wav +./dataset/44k/yuuka/3_35.wav +./dataset/44k/yuuka/565217.wav +./dataset/44k/yuuka/7_63.wav +./dataset/44k/yuuka/5_75.wav +./dataset/44k/yuuka/1_2.wav +./dataset/44k/yuuka/9_8.wav +./dataset/44k/yuuka/5_94.wav +./dataset/44k/yuuka/1_75.wav +./dataset/44k/yuuka/8_8.wav +./dataset/44k/yuuka/7_141.wav +./dataset/44k/yuuka/5_17.wav +./dataset/44k/yuuka/853514.wav +./dataset/44k/yuuka/4_78.wav +./dataset/44k/yuuka/5_116.wav +./dataset/44k/yuuka/448774.wav +./dataset/44k/yuuka/8_13.wav +./dataset/44k/yuuka/4_13.wav +./dataset/44k/yuuka/4_8.wav +./dataset/44k/yuuka/5_60.wav +./dataset/44k/yuuka/321094.wav +./dataset/44k/yuuka/859348.wav +./dataset/44k/yuuka/9_19.wav +./dataset/44k/yuuka/1_27.wav +./dataset/44k/yuuka/4_58.wav +./dataset/44k/yuuka/1_53.wav +./dataset/44k/yuuka/522647.wav +./dataset/44k/yuuka/73298.wav +./dataset/44k/yuuka/9_2.wav +./dataset/44k/yuuka/3_139.wav +./dataset/44k/yuuka/9_55.wav +./dataset/44k/yuuka/1_115.wav +./dataset/44k/yuuka/796714.wav +./dataset/44k/yuuka/5_43.wav +./dataset/44k/yuuka/3_57.wav +./dataset/44k/yuuka/4_70.wav +./dataset/44k/yuuka/4_81.wav +./dataset/44k/yuuka/4_31.wav +./dataset/44k/yuuka/3_88.wav +./dataset/44k/yuuka/5_57.wav +./dataset/44k/yuuka/6_19.wav +./dataset/44k/yuuka/6_10.wav +./dataset/44k/yuuka/9_20.wav +./dataset/44k/yuuka/5_37.wav +./dataset/44k/yuuka/6_43.wav +./dataset/44k/yuuka/9_23.wav +./dataset/44k/yuuka/7_60.wav +./dataset/44k/yuuka/3_159.wav +./dataset/44k/yuuka/182426.wav 
+./dataset/44k/yuuka/1_50.wav +./dataset/44k/yuuka/5_81.wav +./dataset/44k/yuuka/4_64.wav +./dataset/44k/yuuka/5_84.wav +./dataset/44k/yuuka/5_29.wav +./dataset/44k/yuuka/2_8.wav +./dataset/44k/yuuka/4_32.wav +./dataset/44k/yuuka/7_47.wav +./dataset/44k/yuuka/7_109.wav +./dataset/44k/yuuka/130059.wav +./dataset/44k/yuuka/1_42.wav +./dataset/44k/yuuka/5_47.wav +./dataset/44k/yuuka/9_40.wav +./dataset/44k/yuuka/4_75.wav +./dataset/44k/yuuka/1_63.wav +./dataset/44k/yuuka/9_9.wav +./dataset/44k/yuuka/208859.wav +./dataset/44k/yuuka/5_56.wav +./dataset/44k/yuuka/9_36.wav +./dataset/44k/yuuka/187864.wav +./dataset/44k/yuuka/7_21.wav +./dataset/44k/yuuka/7_144.wav +./dataset/44k/yuuka/7_22.wav +./dataset/44k/yuuka/7_34.wav +./dataset/44k/yuuka/1_65.wav +./dataset/44k/yuuka/6_28.wav +./dataset/44k/yuuka/7_19.wav diff --git a/filelists/val.txt b/filelists/val.txt new file mode 100644 index 0000000000000000000000000000000000000000..99b13feeb9f8fa969bb5b9522d0b3745df81b16e --- /dev/null +++ b/filelists/val.txt @@ -0,0 +1,2 @@ +./dataset/44k/yuuka/3_106.wav +./dataset/44k/yuuka/3_7.wav diff --git a/flask_api.py b/flask_api.py new file mode 100644 index 0000000000000000000000000000000000000000..1872019efc90d3e500625ab7dd300791c91136c6 --- /dev/null +++ b/flask_api.py @@ -0,0 +1,57 @@ +import io +import logging + +import librosa +import soundfile +import paddle +import paddle.audio as paddleaudio +from flask import Flask, request, send_file +from flask_cors import CORS + +from inference.infer_tool import Svc, RealTimeVC + +app = Flask(__name__) + +CORS(app) + +logging.getLogger('numba').setLevel(logging.WARNING) + + +@app.route("/voiceChangeModel", methods=["POST"]) +def voice_change_model(): + request_form = request.form + wave_file = request.files.get("sample", None) + # 变调信息 + f_pitch_change = float(request_form.get("fPitchChange", 0)) + # DAW所需的采样率 + daw_sample = int(float(request_form.get("sampleRate", 0))) + speaker_id = int(float(request_form.get("sSpeakId", 0))) + # 
http获得wav文件并转换 + input_wav_path = io.BytesIO(wave_file.read()) + + # 模型推理 + if raw_infer: + out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path) + tar_audio = librosa.resample(out_audio.numpy(), svc_model.target_sample, daw_sample) + else: + out_audio = svc.process(svc_model, speaker_id, f_pitch_change, input_wav_path) + tar_audio = librosa.resample(out_audio, svc_model.target_sample, daw_sample) + # 返回音频 + out_wav_path = io.BytesIO() + soundfile.write(out_wav_path, tar_audio, daw_sample, format="wav") + out_wav_path.seek(0) + return send_file(out_wav_path, download_name="temp.wav", as_attachment=True) + + +if __name__ == '__main__': + # 启用则为直接切片合成,False为交叉淡化方式 + # vst插件调整0.3-0.5s切片时间可以降低延迟,直接切片方法会有连接处爆音、交叉淡化会有轻微重叠声音 + # 自行选择能接受的方法,或将vst最大切片时间调整为1s,此处设为Ture,延迟大音质稳定一些 + raw_infer = True + # 每个模型和config是唯一对应的 + model_name = "logs/44k/G_1005.pdparams" + config_name = "configs/config.json" + svc_model = Svc(model_name, config_name) + svc = RealTimeVC() + # 此处与vst插件对应,不建议更改 + app.run(port=6842, host="0.0.0.0", debug=False, threaded=False) diff --git a/hubert/__init__.py b/hubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/hubert/hubert4.0.onnx b/hubert/hubert4.0.onnx new file mode 100644 index 0000000000000000000000000000000000000000..32c7fd5f911bd94a436201b6db1c36a4bab9d610 --- /dev/null +++ b/hubert/hubert4.0.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc348818d386f8cff3332ab4bcd77e5870c373c492c896664092fd7230122a32 +size 293347531 diff --git a/hubert/hubert_model.py b/hubert/hubert_model.py new file mode 100644 index 0000000000000000000000000000000000000000..50be97aa535011008e257d5ec3db8da7282da93b --- /dev/null +++ b/hubert/hubert_model.py @@ -0,0 +1,226 @@ +import copy +import random +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as t_func +#from 
torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + + +class Hubert(paddle.nn.Layer): + def __init__(self, num_label_embeddings: int = 100, mask: bool = True): + super().__init__() + self._mask = mask + self.feature_extractor = FeatureExtractor() + self.feature_projection = FeatureProjection() + self.positional_embedding = PositionalConvEmbedding() + self.norm = nn.LayerNorm(768) + self.dropout = nn.Dropout(0.1) + self.encoder = TransformerEncoder( + nn.TransformerEncoderLayer( + 768, 12, 3072, activation="gelu" + ), + 12, + ) + self.proj = nn.Linear(768, 256) + + self.masked_spec_embed = paddle.create_parameter([768],dtype = 'float32') + self.label_embedding = nn.Embedding(num_label_embeddings, 256) + + def mask(self, x: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: + mask = None + if self.training and self._mask: + mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, None, 2) + x[mask] = self.masked_spec_embed + return x, mask + + def encode( + self, x: paddle.Tensor, layer: Optional[int] = None + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + x = self.feature_extractor(x) + x = self.feature_projection(x.transpose([0, 2, 1])) + x, mask = self.mask(x) + x = x + self.positional_embedding(x) + x = self.dropout(self.norm(x)) + x = self.encoder(x, output_layer=layer) + return x, mask + + def logits(self, x: paddle.Tensor) -> paddle.Tensor: + logits = t_func.cosine_similarity( + x.unsqueeze(2), + self.label_embedding.weight.unsqueeze(0).unsqueeze(0), + axis=-1, + ) + return logits / 0.1 + + def forward(self, x: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: + x, mask = self.encode(x) + x = self.proj(x) + logits = self.logits(x) + return logits, mask + + +class HubertSoft(Hubert): + def __init__(self): + super().__init__() + + def units(self, wav: paddle.Tensor) -> paddle.Tensor: + wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2),data_format='NCL') + x, _ = self.encode(wav) + return self.proj(x) + + +class 
FeatureExtractor(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.conv0 = nn.Conv1D(1, 512, 10, 5, bias_attr=False) + self.norm0 = nn.GroupNorm(512, 512) + self.conv1 = nn.Conv1D(512, 512, 3, 2, bias_attr=False) + self.conv2 = nn.Conv1D(512, 512, 3, 2, bias_attr=False) + self.conv3 = nn.Conv1D(512, 512, 3, 2, bias_attr=False) + self.conv4 = nn.Conv1D(512, 512, 3, 2, bias_attr=False) + self.conv5 = nn.Conv1D(512, 512, 2, 2, bias_attr=False) + self.conv6 = nn.Conv1D(512, 512, 2, 2, bias_attr=False) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = t_func.gelu(self.norm0(self.conv0(x))) + x = t_func.gelu(self.conv1(x)) + x = t_func.gelu(self.conv2(x)) + x = t_func.gelu(self.conv3(x)) + x = t_func.gelu(self.conv4(x)) + x = t_func.gelu(self.conv5(x)) + x = t_func.gelu(self.conv6(x)) + return x + + +class FeatureProjection(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.norm = nn.LayerNorm(512) + self.projection = nn.Linear(512, 768) + self.dropout = nn.Dropout(0.1) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.norm(x) + x = self.projection(x) + x = self.dropout(x) + return x + + +class PositionalConvEmbedding(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.conv = nn.Conv1D( + 768, + 768, + kernel_size=128, + padding=128 // 2, + groups=16, + ) + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.conv(x.transpose([0, 2, 1])) + x = t_func.gelu(x[:, :, :-1]) + return x.transpose([0, 2, 1]) + + +class TransformerEncoder(paddle.nn.Layer): + def __init__( + self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int + ) -> None: + super(TransformerEncoder, self).__init__() + self.layers = nn.LayerList( + [copy.deepcopy(encoder_layer) for _ in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + self, + src: paddle.Tensor, + mask: paddle.Tensor = None, + src_key_padding_mask: 
paddle.Tensor = None, + output_layer: Optional[int] = None, + ) -> paddle.Tensor: + output = src + for layer in self.layers[:output_layer]: + output = layer( + output, src_mask=mask, src_key_padding_mask=src_key_padding_mask + ) + return output + + +def _compute_mask( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + device: None, + min_masks: int = 0, +) -> paddle.Tensor: + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + ) + + # compute number of masked spans in batch + num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) + num_masked_spans = max(num_masked_spans, min_masks) + + # make sure num masked indices <= sequence_length + if num_masked_spans * mask_length > sequence_length: + num_masked_spans = sequence_length // mask_length + + # SpecAugment mask to fill + mask = paddle.zeros((batch_size, sequence_length), dtype='bool') + + # uniform distribution to sample from, make sure that offset samples are < sequence_length + uniform_dist = paddle.ones( + (batch_size, sequence_length - (mask_length - 1)) + ) + + # get random indices to mask + mask_indices = paddle.multinomial(uniform_dist, num_masked_spans) + + # expand masked indices to masked spans + mask_indices = ( + mask_indices.unsqueeze(dim=-1) + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + offsets = ( + paddle.arange(mask_length)[None, None, :] + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + mask_idxs = mask_indices + offsets + + # scatter indices to mask + mask = mask.scatter(1, mask_idxs, True) + + return mask + + +def hubert_soft( + path: str, +) -> 
HubertSoft: + r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + path (str): path of a pretrained model + """ + hubert = HubertSoft() + checkpoint = paddle.load(path) + #consume_prefix_in_state_dict_if_present(checkpoint, "module.") + hubert.set_state_dict(checkpoint) + hubert.eval() + return hubert + +if __name__ == '__main__': + hubert = HubertSoft() + d = paddle.load(r'E:\trans\hubert\final.pdparams') + hubert.set_state_dict(d) diff --git a/hubert/hubert_model_onnx.py b/hubert/hubert_model_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..86864321a3b8c5e9fc0f688285f1cc72844a63ee --- /dev/null +++ b/hubert/hubert_model_onnx.py @@ -0,0 +1,217 @@ +import copy +import random +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as t_func +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + + +class Hubert(nn.Layer): + def __init__(self, num_label_embeddings: int = 100, mask: bool = True): + super().__init__() + self._mask = mask + self.feature_extractor = FeatureExtractor() + self.feature_projection = FeatureProjection() + self.positional_embedding = PositionalConvEmbedding() + self.norm = nn.LayerNorm(768) + self.dropout = nn.Dropout(0.1) + self.encoder = TransformerEncoder( + nn.TransformerEncoderLayer( + 768, 12, 3072, activation="gelu", batch_first=True + ), + 12, + ) + self.proj = nn.Linear(768, 256) + + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) + self.label_embedding = nn.Embedding(num_label_embeddings, 256) + + def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + mask = None + if self.training and self._mask: + mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) + x[mask] = self.masked_spec_embed.to(x.dtype) + return x, mask + + def encode( + self, x: torch.Tensor, layer: Optional[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + 
class FeatureExtractor(nn.Module):
    """Convolutional waveform front end of HuBERT (PyTorch variant).

    Seven bias-free 1-D convolutions, each followed by GELU; the first one is
    additionally group-normalised. The strides (5, 2, 2, 2, 2, 2, 2) give a
    combined down-sampling factor of 320.

    The base class is ``torch.nn.Module``: ``torch.nn`` has no ``Layer``
    attribute (that is the Paddle name), so the previous base class made the
    module fail on import.
    """

    def __init__(self):
        super().__init__()
        # Attribute names are kept unchanged so existing checkpoints still
        # match the state-dict keys.
        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
        self.norm0 = nn.GroupNorm(512, 512)
        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map ``(batch, 1, samples)`` audio to ``(batch, 512, frames)`` features."""
        x = t_func.gelu(self.norm0(self.conv0(x)))
        x = t_func.gelu(self.conv1(x))
        x = t_func.gelu(self.conv2(x))
        x = t_func.gelu(self.conv3(x))
        x = t_func.gelu(self.conv4(x))
        x = t_func.gelu(self.conv5(x))
        x = t_func.gelu(self.conv6(x))
        return x
class TransformerEncoder(nn.Module):
    """Stack of identical transformer encoder layers with optional early exit.

    Built on ``torch.nn.Module`` / ``torch.nn.ModuleList``; the Paddle names
    ``Layer`` and ``LayerList`` do not exist in ``torch.nn`` and raised
    ``AttributeError`` as soon as the module was imported.
    """

    def __init__(
        self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
    ) -> None:
        """Create ``num_layers`` independent deep copies of ``encoder_layer``."""
        super().__init__()
        self.layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
        )
        self.num_layers = num_layers

    def forward(
        self,
        src: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        src_key_padding_mask: Optional[torch.Tensor] = None,
        output_layer: Optional[int] = None,
    ) -> torch.Tensor:
        """Run ``src`` through the first ``output_layer`` layers.

        Args:
            src: Input sequence, passed unchanged to the first layer.
            mask: Forwarded to every layer as ``src_mask``.
            src_key_padding_mask: Forwarded to every layer.
            output_layer: Number of leading layers to apply; ``None`` applies
                all of them.

        Returns:
            The output of the last applied layer.
        """
        output = src
        for layer in self.layers[:output_layer]:
            output = layer(
                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
            )
        return output
def hubert_soft(
    path: str,
) -> HubertSoft:
    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.

    Builds a ``HubertSoft`` model, loads the checkpoint at ``path`` into it
    and switches it to evaluation mode.

    Args:
        path (str): path of a pretrained model

    Returns:
        The loaded model in ``eval()`` mode.
    """
    hubert = HubertSoft()
    # Deserialize onto the CPU so checkpoints saved from a GPU still load on
    # CPU-only hosts; the freshly built model lives on the CPU anyway.
    checkpoint = torch.load(path, map_location="cpu")
    # Checkpoints saved from a DataParallel wrapper prefix every key with
    # "module."; strip it so the keys match this model.
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
    hubert.load_state_dict(checkpoint)
    hubert.eval()
    return hubert
def read_temp(file_name, max_bytes=50 * 1024 * 1024, max_age=14 * 24 * 3600):
    """Load the JSON cache stored at ``file_name``.

    If the file does not exist it is created with a marker entry and an
    empty dict is returned. If the file is larger than ``max_bytes``,
    entries whose ``"time"`` stamp is older than ``max_age`` seconds are
    dropped from the returned dict (the file itself is not rewritten here).

    Entries that are not dicts carrying a ``"time"`` key -- notably the
    ``"info"`` marker this function writes itself -- are skipped during
    cleanup. Previously the marker raised ``TypeError`` inside the cleanup
    loop, which the fallback handler answered by discarding the whole cache.

    Args:
        file_name: Path of the JSON cache file.
        max_bytes: File size above which expired entries are pruned.
        max_age: Maximum entry age in seconds before it is pruned.

    Returns:
        The cache dict, or ``{"info": "temp_dict"}`` when the file cannot be
        read or parsed.
    """
    if not os.path.exists(file_name):
        with open(file_name, "w") as f:
            f.write(json.dumps({"info": "temp_dict"}))
        return {}

    try:
        with open(file_name, "r") as f:
            data_dict = json.loads(f.read())
        if os.path.getsize(file_name) > max_bytes:
            f_name = str(file_name).replace("\\", "/").split("/")[-1]
            print(f"clean {f_name}")
            now = int(time.time())
            for wav_hash in list(data_dict):
                entry = data_dict[wav_hash]
                # Only timestamped records can expire; keep everything else.
                if not isinstance(entry, dict) or "time" not in entry:
                    continue
                if now - int(entry["time"]) > max_age:
                    del data_dict[wav_hash]
    except Exception as e:
        # Deliberate best effort: a corrupt cache is replaced, not fatal.
        print(e)
        print(f"{file_name} error,auto rebuild file")
        data_dict = {"info": "temp_dict"}
    return data_dict
def mkdir(paths: list):
    """Create every directory in ``paths`` that does not exist yet.

    Missing parent directories are created as well, and a directory that
    appears between the existence check and the creation call (another
    process racing us) is tolerated instead of raising ``FileExistsError``.
    Paths that already exist are left untouched.

    Args:
        paths: Iterable of directory paths.
    """
    for path in paths:
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
def slice_inference(self,raw_audio_path, spk, tran, slice_db,cluster_infer_ratio, auto_predict_f0,noice_scale, pad_seconds=0.5,empty_cache=False):
    """Convert a whole audio file chunk by chunk and return the joined result.

    The file is cut with ``slicer.cut`` using ``slice_db`` as threshold.
    Chunks flagged as silent are replaced by zeros; all others are padded
    with ``pad_seconds`` of silence on both sides, run through
    ``self.infer`` and trimmed by the same padding afterwards.

    Args:
        raw_audio_path: Path of the audio file to convert.
        spk: Speaker forwarded to ``self.infer``.
        tran: Pitch shift forwarded to ``self.infer``.
        slice_db: Threshold handed to ``slicer.cut`` as ``db_thresh``.
        cluster_infer_ratio: Forwarded to ``self.infer``.
        auto_predict_f0: Forwarded to ``self.infer``.
        noice_scale: Forwarded to ``self.infer``.
        pad_seconds: Seconds of silence added around every chunk.
        empty_cache: If truthy, release cached GPU memory after each chunk.

    Returns:
        1-D ``numpy`` array with the concatenated output samples.
    """
    wav_path = raw_audio_path
    chunks = slicer.cut(wav_path, db_thresh=slice_db)
    audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)

    in_pad = int(audio_sr * pad_seconds)
    out_pad = int(self.target_sample * pad_seconds)

    pieces = []
    for (slice_tag, data) in audio_data:
        print(f'#=====分段开始,耗时{round(len(data) / audio_sr, 3)}秒======')
        # padd
        data = np.concatenate([np.zeros([in_pad]), data, np.zeros([in_pad])])
        length = int(np.ceil(len(data) / audio_sr * self.target_sample))
        if slice_tag:
            print('跳过空段')
            _audio = np.zeros(length)
        else:
            # Only voiced chunks need an in-memory WAV for the model.
            raw_path = io.BytesIO()
            soundfile.write(raw_path, data, audio_sr, format="wav")
            raw_path.seek(0)
            out_audio, out_sr = self.infer(spk, tran, raw_path,
                                           cluster_infer_ratio=cluster_infer_ratio,
                                           auto_predict_f0=auto_predict_f0,
                                           noice_scale=noice_scale
                                           )
            _audio = out_audio.cpu().numpy()

        # Explicit end index: `_audio[p:-p]` is empty when p == 0, which
        # silently discarded all audio for pad_seconds=0.
        _audio = _audio[out_pad:len(_audio) - out_pad]
        pieces.append(_audio)
        if empty_cache:
            paddle.device.cuda.empty_cache()
    # Join once at the end instead of growing a list of per-sample objects.
    return np.concatenate(pieces) if pieces else np.array([])
def get_f0(x, p_len,f0_up_key=0):
    """Extract an F0 contour with Praat and quantise it to coarse bins.

    Args:
        x: Waveform samples, analysed at a sampling rate of 16000 Hz.
        p_len: Desired number of frames; a shorter contour is zero-padded
            on both sides up to this length.
        f0_up_key: Pitch shift in semitones applied to the contour.

    Returns:
        ``(f0_coarse, f0)``: the contour quantised on a mel scale to integer
        bins in ``[1, 255]`` (bin 1 holds unvoiced frames), and the shifted
        contour in Hz.
    """
    time_step = 160 / 16000 * 1000  # hop of 160 samples at 16 kHz, in ms
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    # Centre the contour inside p_len frames.
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    # Map voiced frames linearly onto bins 1..255; unvoiced frames (0 Hz)
    # stay at 0 here and are lifted to bin 1 by the clamp below.
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0
class VitsSvc(object):
    """Wrapper that loads a ``SynthesizerTrn`` checkpoint and converts audio.

    NOTE(review): ``get_units``, ``get_unit_pitch`` and ``infer`` reference
    ``torch`` and ``torchaudio``, but the imports visible at the top of this
    file only bring in ``paddle``; unless those names are provided elsewhere,
    these methods raise ``NameError``. The synthesizer is loaded from a
    ``.pdparams`` file while the tensors are built with ``torch`` -- confirm
    which framework this class is meant to run on.
    """

    def __init__(self):
        # Default to the first GPU when Paddle was built with CUDA support.
        self.device = "gpu:0" if paddle.device.is_compiled_with_cuda() else "cpu"
        self.SVCVITS = None    # synthesizer, set by loadCheckpoint()
        self.hps = None        # hyper-parameters, set by loadCheckpoint()
        self.speakers = None   # speaker table (hps.spk), set by loadCheckpoint()
        self.hubert_soft = utils.get_hubert_model()

    def set_device(self, device):
        """Remember ``device`` and move the synthesizer to it if one is loaded."""
        self.device = device
        #self.hubert_soft.to(self.device)
        if self.SVCVITS != None:
            self.SVCVITS.to(self.device)

    def loadCheckpoint(self, path):
        """Load config and weights from ``checkpoints/<path>/``."""
        self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
        self.SVCVITS = SynthesizerTrn(
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            **self.hps.model)
        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pdparams", self.SVCVITS, None)
        _ = self.SVCVITS.eval().to(self.device)
        self.speakers = self.hps.spk

    def get_units(self, source, sr):
        """Return the HuBERT units for ``source``; ``sr`` is not used here."""
        source = source.unsqueeze(0).cuda() if self.device == 'gpu:0' else source.unsqueeze(0).cpu()
        # HuBERT has not been ported to Paddle, so this path was left
        # un-migrated as well.
        with torch.inference_mode():
            units = self.hubert_soft.units(source)
            return units


    def get_unit_pitch(self, in_path, tran):
        """Load ``in_path``, resample to 16 kHz and return ``(units, f0)``."""
        source, sr = torchaudio.load(in_path)
        source = torchaudio.functional.resample(source, sr, 16000)
        # NOTE(review): the down-mix averages axis 0 but the guard tests
        # shape[1]; confirm which axis holds the channels.
        if len(source.shape) == 2 and source.shape[1] >= 2:
            source = torch.mean(source, dim=0).unsqueeze(0)
        soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
        # Two F0 frames are requested per unit frame.
        f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0]*2, tran)
        return soft, f0

    def infer(self, speaker_id, tran, raw_path):
        """Convert one audio file or buffer; returns ``(audio, num_samples)``."""
        speaker_id = self.speakers[speaker_id]
        sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
        soft, pitch = self.get_unit_pitch(raw_path, tran)
        f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
        stn_tst = torch.FloatTensor(soft)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(self.device)
            # Repeat every unit frame twice to match the F0 frame count.
            x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
            audio = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0,0].data.float()
        return audio, audio.shape[-1]

    def inference(self,srcaudio,chara,tran,slice_db):
        """Convert a ``(sampling_rate, samples)`` pair chunk by chunk.

        Returns ``(sampling_rate, int16_samples)`` using the model's
        sampling rate. Writes ``tmpwav.wav`` into the working directory.
        """
        sampling_rate, audio = srcaudio
        # Scale by the maximum of the input dtype; np.iinfo requires an
        # integer dtype.
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        soundfile.write("tmpwav.wav", audio, 16000, format="wav")
        chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
        audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)
        audio = []
        for (slice_tag, data) in audio_data:
            length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
            raw_path = io.BytesIO()
            soundfile.write(raw_path, data, audio_sr, format="wav")
            raw_path.seek(0)
            if slice_tag:
                # Silent chunk: emit zeros of the corresponding output length.
                _audio = np.zeros(length)
            else:
                out_audio, out_sr = self.infer(chara, tran, raw_path)
                _audio = out_audio.cpu().numpy()
            audio.extend(list(_audio))
        audio = (np.array(audio) * 32768.0).astype('int16')
        return (self.hps.data.sampling_rate,audio)
max_sil_kept >= hop_size: + raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size') + min_interval = sr * min_interval / 1000 + self.threshold = 10 ** (threshold / 20.) + self.hop_size = round(sr * hop_size / 1000) + self.win_size = min(round(min_interval), 4 * self.hop_size) + self.min_length = round(sr * min_length / 1000 / self.hop_size) + self.min_interval = round(min_interval / self.hop_size) + self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) + + def _apply_slice(self, waveform, begin, end): + if len(waveform.shape) > 1: + return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)] + else: + return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)] + + # @timeit + def slice(self, waveform): + if len(waveform.shape) > 1: + samples = librosa.to_mono(waveform) + else: + samples = waveform + if samples.shape[0] <= self.min_length: + return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}} + rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0) + sil_tags = [] + silence_start = None + clip_start = 0 + for i, rms in enumerate(rms_list): + # Keep looping while frame is silent. + if rms < self.threshold: + # Record start of silent frames. + if silence_start is None: + silence_start = i + continue + # Keep looping while frame is not silent and silence start has not been recorded. + if silence_start is None: + continue + # Clear recorded silence start if interval is not enough or clip is too short + is_leading_silence = silence_start == 0 and i > self.max_sil_kept + need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length + if not is_leading_silence and not need_slice_middle: + silence_start = None + continue + # Need slicing. Record the range of silent frames to be removed. 
+ if i - silence_start <= self.max_sil_kept: + pos = rms_list[silence_start: i + 1].argmin() + silence_start + if silence_start == 0: + sil_tags.append((0, pos)) + else: + sil_tags.append((pos, pos)) + clip_start = pos + elif i - silence_start <= self.max_sil_kept * 2: + pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin() + pos += i - self.max_sil_kept + pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept + if silence_start == 0: + sil_tags.append((0, pos_r)) + clip_start = pos_r + else: + sil_tags.append((min(pos_l, pos), max(pos_r, pos))) + clip_start = max(pos_r, pos) + else: + pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept + if silence_start == 0: + sil_tags.append((0, pos_r)) + else: + sil_tags.append((pos_l, pos_r)) + clip_start = pos_r + silence_start = None + # Deal with trailing silence. + total_frames = rms_list.shape[0] + if silence_start is not None and total_frames - silence_start >= self.min_interval: + silence_end = min(total_frames, silence_start + self.max_sil_kept) + pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start + sil_tags.append((pos, total_frames + 1)) + # Apply and return slices. 
def chunks2audio(audio_path, chunks):
    """Cut the audio file into the sample ranges described by ``chunks``.

    Args:
        audio_path: Path of the audio file to load.
        chunks: Mapping whose values carry ``"slice"`` (silence flag) and
            ``"split_time"`` (``"start,end"`` sample indices).

    Returns:
        ``(result, sr)`` where ``result`` is a list of
        ``(is_silence, samples)`` tuples for every non-empty range and
        ``sr`` is the sampling rate of the file.
    """
    chunks = dict(chunks)
    audio, sr = paddleaudio.load(audio_path)
    # The down-mix averages over axis 0, so axis 0 is the one that must hold
    # more than one channel (the old test looked at the sample axis).
    if len(audio.shape) == 2 and audio.shape[0] >= 2:
        audio = paddle.mean(audio, axis=0).unsqueeze(0)
    audio = audio.cpu().numpy()[0]
    result = []
    for v in chunks.values():
        tag = v["split_time"].split(",")
        # Skip zero-length ranges.
        if tag[0] != tag[1]:
            result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
    return result, sr
import slicer +from inference.infer_tool import Svc + +logging.getLogger('numba').setLevel(logging.WARNING) +chunks_dict = infer_tool.read_temp("inference/chunks_temp.json") + +# 这里是推理用到的所有参数,从这里修改参数即可 +模型路径:str = "./logs/44k/G_10000.pdparams" # 模型路径 +推理文件列表:list = ["1.wav"] # wav文件名列表,放在raw文件夹下 +音高调整:list = [0] # 音高调整,支持正负(半音) +合成目标说话人名称:list = ['yuuka'] # 合成目标说话人名称 +自动预测音高:bool = False # 语音转换自动预测音高,转换歌声时不要打开这个会严重跑调 +聚类模型路径:str = "logs/44k/kmeans_10000.pdparams" # 聚类模型路径,如果没有训练聚类则随便填 +聚类方案占比:float = 0 # 聚类方案占比,范围0-1,若没有训练聚类模型则填0即可 +静音分贝:int = -40 # 静音分贝阈值,默认-40,嘈杂的音频可以-30,干声保留呼吸可以-50 +推理设备:str or None = None # 推理设备,None则为自动选择cpu和gpu +音频输出格式:str = 'flac' # 音频输出格式 +噪音比例:float = 0.4 # 声音有点电的话可以尝试调高这个,但是会降低音质,较为玄学 + +def main(): + import argparse + + parser = argparse.ArgumentParser(description='飞桨sovits4 推理模块') + parser.add_argument('-m', '--model_path', type=str, default=模型路径, help='模型路径') + parser.add_argument('-c', '--config_path', type=str, default="./logs/44k/config.json", help='配置文件路径') + parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=推理文件列表, help='wav文件名列表,放在raw文件夹下') + parser.add_argument('-t', '--trans', type=int, nargs='+', default=音高调整, help='音高调整,支持正负(半音)') + parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=合成目标说话人名称, help='合成目标说话人名称') + + parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=自动预测音高, + help='语音转换自动预测音高,转换歌声时不要打开这个会严重跑调') + parser.add_argument('-cm', '--cluster_model_path', type=str, default=聚类模型路径, help='聚类模型路径,如果没有训练聚类则随便填') + parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=聚类方案占比, help='聚类方案占比,范围0-1,若没有训练聚类模型则填0即可') + + parser.add_argument('-sd', '--slice_db', type=int, default=静音分贝, help='默认-40,嘈杂的音频可以-30,干声保留呼吸可以-50') + parser.add_argument('-d', '--device', type=str, default=推理设备, help='推理设备,None则为自动选择cpu和gpu') + parser.add_argument('-ns', '--noice_scale', type=float, default=噪音比例, help='噪音级别,会影响咬字和音质,较为玄学') + parser.add_argument('-p', 
'--pad_seconds', type=float, default=0.5, help='推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现') + parser.add_argument('-wf', '--wav_format', type=str, default=音频输出格式, help='音频输出格式') + + args = parser.parse_args() + + svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path) + infer_tool.mkdir(["raw", "results"]) + clean_names = args.clean_names + trans = args.trans + spk_list = args.spk_list + slice_db = args.slice_db + wav_format = args.wav_format + auto_predict_f0 = args.auto_predict_f0 + cluster_infer_ratio = args.cluster_infer_ratio + noice_scale = args.noice_scale + pad_seconds = args.pad_seconds + + infer_tool.fill_a_to_b(trans, clean_names) + for clean_name, tran in zip(clean_names, trans): + raw_audio_path = f"raw/{clean_name}" + if "." not in raw_audio_path: + raw_audio_path += ".wav" + infer_tool.format_wav(raw_audio_path) + wav_path = Path(raw_audio_path).with_suffix('.wav') + chunks = slicer.cut(wav_path, db_thresh=slice_db) + audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks) + + for spk in spk_list: + audio = [] + for (slice_tag, data) in audio_data: + print(f'#=====分段开始,{round(len(data) / audio_sr, 3)}秒======') + + length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample)) + if slice_tag: + print('跳过空段') + _audio = np.zeros(length) + else: + # padd + pad_len = int(audio_sr * pad_seconds) + data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])]) + raw_path = io.BytesIO() + soundfile.write(raw_path, data, audio_sr, format="wav") + raw_path.seek(0) + out_audio, out_sr = svc_model.infer(spk, tran, raw_path, + cluster_infer_ratio=cluster_infer_ratio, + auto_predict_f0=auto_predict_f0, + noice_scale=noice_scale + ) + _audio = out_audio.detach().cpu().numpy() + pad_len = int(svc_model.target_sample * pad_seconds) + _audio = _audio[pad_len:-pad_len] + + audio.extend(list(infer_tool.pad_array(_audio, length))) + key = "auto" if auto_predict_f0 else f"{tran}key" + cluster_name = "" if 
cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}" + res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}' + soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format) + +if __name__ == '__main__': + main() diff --git a/logs/44k/config.json b/logs/44k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c171c964b2740499a804339fda5eabaae3778d7e --- /dev/null +++ b/logs/44k/config.json @@ -0,0 +1,95 @@ +{ + "train": { + "log_interval": 800, + "eval_interval": 400, + "seed": 1234, + "epochs": 114514, + "learning_rate": 0.0001, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-05, + "batch_size": 2, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 10240, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 512, + "port": "8001", + "keep_ckpts": 5 + }, + "data": { + "training_files": "filelists/train.txt", + "validation_files": "filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 44100, + "filter_length": 2048, + "hop_length": 512, + "win_length": 2048, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 22050 + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 8, + 8, + 2, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4, + 4 + ], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 256, + "n_speakers": 200 + }, + "spk": { + "yuuka": 0 + }, + "clean_logs": true, + "trainer": "admin" +} \ No newline at end of file diff --git a/logs/44k/eval/vdlrecords.1690034156.log b/logs/44k/eval/vdlrecords.1690034156.log new file mode 100644 
index 0000000000000000000000000000000000000000..8f2a1af20b6ca2a0107d6c4694ebe0b7286082a1 Binary files /dev/null and b/logs/44k/eval/vdlrecords.1690034156.log differ diff --git a/logs/44k/train.log b/logs/44k/train.log new file mode 100644 index 0000000000000000000000000000000000000000..e9bb9ced77e4b52460ac2c48117cfba41662c7f1 --- /dev/null +++ b/logs/44k/train.log @@ -0,0 +1,8 @@ +2023-07-22 21:55:56,616 44k INFO {'train': {'log_interval': 800, 'eval_interval': 400, 'seed': 1234, 'epochs': 114514, 'learning_rate': 0.0001, 'betas': [0.8, 0.99], 'eps': 1e-05, 'batch_size': 2, 'fp16_run': True, 'lr_decay': 0.999875, 'segment_size': 10240, 'init_lr_ratio': 1, 'warmup_epochs': 0, 'c_mel': 45, 'c_kl': 1.0, 'use_sr': True, 'max_speclen': 512, 'port': '8001', 'keep_ckpts': 5}, 'data': {'training_files': 'filelists/train.txt', 'validation_files': 'filelists/val.txt', 'max_wav_value': 32768.0, 'sampling_rate': 44100, 'filter_length': 2048, 'hop_length': 512, 'win_length': 2048, 'n_mel_channels': 80, 'mel_fmin': 0.0, 'mel_fmax': 22050}, 'model': {'inter_channels': 192, 'hidden_channels': 192, 'filter_channels': 768, 'n_heads': 2, 'n_layers': 6, 'kernel_size': 3, 'p_dropout': 0.1, 'resblock': '1', 'resblock_kernel_sizes': [3, 7, 11], 'resblock_dilation_sizes': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'upsample_rates': [8, 8, 2, 2, 2], 'upsample_initial_channel': 512, 'upsample_kernel_sizes': [16, 16, 4, 4, 4], 'n_layers_q': 3, 'use_spectral_norm': False, 'gin_channels': 256, 'ssl_dim': 256, 'n_speakers': 200}, 'spk': {'yuuka': 0}, 'clean_logs': True, 'trainer': 'admin', 'model_dir': './logs/44k'} +2023-07-22 21:55:56,617 44k WARNING /home/aistudio/build不是git存储库,因此将忽略哈希值比较。 +2023-07-22 21:55:59,336 44k INFO 加载检查点 './logs/44k/G_0.pdparams' (迭代次数 1) +2023-07-22 21:55:59,680 44k INFO 加载检查点 './logs/44k/D_0.pdparams' (迭代次数 1) +2023-07-22 21:56:12,355 44k INFO 训练回合:1 [0%] +2023-07-22 21:56:12,356 44k INFO 损失:[2.723755359649658, 2.7280983924865723, 7.272645950317383, 30.232248306274414, 
3.609935998916626],步数:0,学习率:0.0001 +2023-07-22 21:56:20,694 44k INFO 保存模型和优化器状态位于迭代次数1 到 ./logs/44k/G_0.pdparams +2023-07-22 21:56:22,248 44k INFO 保存模型和优化器状态位于迭代次数1 到 ./logs/44k/D_0.pdparams diff --git a/logs/44k/vdlrecords.1690034156.log b/logs/44k/vdlrecords.1690034156.log new file mode 100644 index 0000000000000000000000000000000000000000..bfe88f10921d2a8040a7e3f4ef39d0df860de04f Binary files /dev/null and b/logs/44k/vdlrecords.1690034156.log differ diff --git a/models.py b/models.py new file mode 100644 index 0000000000000000000000000000000000000000..a0901dd91b5b41cd62c946ffe3bc90a0219739af --- /dev/null +++ b/models.py @@ -0,0 +1,556 @@ +import copy +import math +import paddle +from paddle import nn +from paddle.nn import functional as F + +import modules.attentions as attentions +import modules.commons as commons +import modules.modules as modules + +from paddle.nn import Conv1D, Conv1DTranspose, AvgPool1D, Conv2D +from paddle.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +import utils +from modules.commons import init_weights, get_padding +from vdecoder.hifigan.models import Generator +from utils import f0_to_coarse +import random +import string +import time + +class ResidualCouplingBlock(nn.Layer): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.LayerList() + for i in range(n_flows): + self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, 
g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class Encoder(nn.Layer): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1D(in_channels, hidden_channels, 1) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = nn.Conv1D(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + # print(x.shape,x_lengths.shape) + x_mask = paddle.unsqueeze(commons.sequence_mask(x_lengths, x.shape[2]), 1).cast(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = paddle.split(stats, [self.out_channels,self.out_channels], axis=1) + z = (m + paddle.randn(m.shape,m.dtype) * paddle.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class TextEncoder(nn.Layer): + def __init__(self, + out_channels, + hidden_channels, + kernel_size, + n_layers, + gin_channels=0, + filter_channels=None, + n_heads=None, + p_dropout=None): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.gin_channels = gin_channels + self.proj = nn.Conv1D(hidden_channels, out_channels * 2, 1) + self.f0_emb = nn.Embedding(256, hidden_channels) + + self.enc_ = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout) + + def forward(self, x, x_mask, f0=None, noice_scale=1): + x = x + self.f0_emb(f0).transpose((0,2,1)) + x = self.enc_(x * x_mask, x_mask) + stats = self.proj(x) * 
x_mask + m, logs = paddle.split(stats, [self.out_channels,self.out_channels], axis = 1) + z = (m + paddle.randn(m.shape,m.dtype) * paddle.exp(logs) * noice_scale) * x_mask + return z, m, logs, x_mask + + + +class DiscriminatorP(paddle.nn.Layer): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.LayerList([ + norm_f(Conv2D(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2D(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2D(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2D(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2D(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), + ]) + self.conv_post = norm_f(Conv2D(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect",data_format='NCL') + t = t + n_pad + x = x.reshape((b, c, t // self.period, self.period)) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = paddle.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(paddle.nn.Layer): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.LayerList([ + norm_f(Conv1D(1, 16, 15, 1, padding=7)), + norm_f(Conv1D(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1D(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1D(256, 1024, 41, 4, 
groups=64, padding=20)), + norm_f(Conv1D(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1D(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1D(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = paddle.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(paddle.nn.Layer): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2,3,5,7,11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] + self.discriminators = nn.LayerList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class SpeakerEncoder(paddle.nn.Layer): + def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): + super(SpeakerEncoder, self).__init__() + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + def forward(self, mels): + self.lstm.flatten_parameters() + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / paddle.norm(embeds_raw, axis=1, keepdim=True) + + def compute_partial_slices(self, total_frames, partial_frames, partial_hop): + mel_slices = [] + for i in range(0, total_frames-partial_frames, partial_hop): + mel_range = paddle.arange(i, i+partial_frames) + mel_slices.append(mel_range) + + return mel_slices + + def embed_utterance(self, 
mel, partial_frames=128, partial_hop=64): + mel_len = mel.shape[1] + last_mel = mel[:,-partial_frames:] + + if mel_len > partial_frames: + mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) + mels = list(mel[:,s] for s in mel_slices) + mels.append(last_mel) + mels = paddle.stack(tuple(mels), 0).squeeze(1) + + with paddle.no_grad(): + partial_embeds = self(mels) + embed = paddle.mean(partial_embeds, axis=0).unsqueeze(0) + #embed = embed / torch.linalg.norm(embed, 2) + else: + with paddle.no_grad(): + embed = self(last_mel) + + return embed + +class F0Decoder(nn.Layer): + def __init__(self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + spk_channels=0): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.spk_channels = spk_channels + + self.prenet = nn.Conv1D(hidden_channels, hidden_channels, 3, padding=1) + self.decoder = attentions.FFT( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout) + self.proj = nn.Conv1D(hidden_channels, out_channels, 1) + self.f0_prenet = nn.Conv1D(1, hidden_channels , 3, padding=1) + self.cond = nn.Conv1D(spk_channels, hidden_channels, 1) + + def forward(self, x, norm_f0, x_mask, spk_emb=None): + x = x.detach() + if (spk_emb is not None): + x = x + self.cond(spk_emb) + x += self.f0_prenet(norm_f0) + x = self.prenet(x) * x_mask + x = self.decoder(x * x_mask, x_mask) + x = self.proj(x) * x_mask + return x + +class SynthesizerTrn_test(nn.Layer): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + 
upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + n_speakers, + sampling_rate=44100, + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + + init = paddle.nn.initializer.Normal(0.001,1) + pa = paddle.ParamAttr(f'emb_g_pa_{int(time.time())}',init) + self.emb_g = nn.Embedding(n_speakers, gin_channels, weight_attr = pa) + + init = paddle.nn.initializer.Normal(2.7973e-06,0.0161) + pre_pa = paddle.ParamAttr(f'pre_pa_{int(time.time())}',init) + self.pre = nn.Conv1D(ssl_dim, hidden_channels, kernel_size=5, padding=2, weight_attr = pre_pa) + + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels=filter_channels, + n_heads=n_heads, + n_layers=n_layers, + kernel_size=kernel_size, + p_dropout=p_dropout + ) + hps = { + "sampling_rate": sampling_rate, + "inter_channels": inter_channels, + "resblock": resblock, + "resblock_kernel_sizes": resblock_kernel_sizes, + "resblock_dilation_sizes": resblock_dilation_sizes, + "upsample_rates": upsample_rates, + "upsample_initial_channel": upsample_initial_channel, + "upsample_kernel_sizes": upsample_kernel_sizes, + "gin_channels": gin_channels, + } + self.dec = Generator(h=hps) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, 
hidden_channels, 5, 1, 4, gin_channels=gin_channels) + self.f0_decoder = F0Decoder( + 1, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + spk_channels=gin_channels + ) + initer = paddle.nn.initializer.Normal(mean = 0.202, std = 0.9640, name = f'emb_uv_init_weight_{time.time}') + emb_uv_pa = paddle.ParamAttr(f'emb_uv_init_weight_pa_{int(time.time())}',initer) + self.emb_uv = nn.Embedding(2, hidden_channels, weight_attr = emb_uv_pa) + + def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None): + g = self.emb_g(g).transpose([0,2,1]) + # ssl prenet + x_mask = paddle.unsqueeze(commons.sequence_mask(c_lengths, c.shape[2]), 1).astype(c.dtype) + emb_uv = self.emb_uv(uv.cast('int64')).transpose([0,2,1]) + prec = self.pre(c) + x = prec * x_mask + emb_uv + # f0 predict + lf0 = 2595. * paddle.log10(1. + f0.unsqueeze(1) / 700.) / 500 + norm_lf0 = utils.normalize_f0(lf0, x_mask, uv) + pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) + + # encoder + z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0 = f0_to_coarse(f0)) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + + # flow + z_p = self.flow(z, spec_mask, g=g) + z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size) + # nsf decoder + o = self.dec(z_slice, g=g, f0=pitch_slice) + + return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 + + def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False): + c_lengths = ((paddle.ones((c.shape[0],)) * c.shape[-1])).cpu() if 'cpu'in str(c.place) else ((paddle.ones((c.shape[0],)) * c.shape[-1])).cuda() + g = self.emb_g(g).transpose([0,2,1]) + x_mask = paddle.unsqueeze(commons.sequence_mask(c_lengths, c.shape[2]), 1).astype(c.dtype) + x = self.pre(c) * x_mask + self.emb_uv(uv.astype('int64')).transpose([0,2,1]) + if predict_f0: + lf0 = 2595. * paddle.log10(1. + f0.unsqueeze(1) / 700.) 
/ 500 + norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False) + pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) + f0 = (700 * (paddle.pow(paddle.to_tensor(10.), pred_lf0 * 500 / 2595) - 1)).squeeze(1) + z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) + z = self.flow(z_p, c_mask, g=g, reverse=True) + o = self.dec(z * c_mask, g=g, f0=f0) + return o + +class SynthesizerTrn(nn.Layer): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + n_speakers, + sampling_rate=44100, + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + + init = paddle.nn.initializer.Normal(0.001,1) + pa = paddle.ParamAttr('emb_g_pa',init) + self.emb_g = nn.Embedding(n_speakers, gin_channels, weight_attr = pa) + + init = paddle.nn.initializer.Normal(2.7973e-06,0.0161) + pre_pa = paddle.ParamAttr('pre_pa',init) + self.pre = nn.Conv1D(ssl_dim, hidden_channels, kernel_size=5, padding=2, weight_attr = pre_pa) + + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + 
filter_channels=filter_channels, + n_heads=n_heads, + n_layers=n_layers, + kernel_size=kernel_size, + p_dropout=p_dropout + ) + hps = { + "sampling_rate": sampling_rate, + "inter_channels": inter_channels, + "resblock": resblock, + "resblock_kernel_sizes": resblock_kernel_sizes, + "resblock_dilation_sizes": resblock_dilation_sizes, + "upsample_rates": upsample_rates, + "upsample_initial_channel": upsample_initial_channel, + "upsample_kernel_sizes": upsample_kernel_sizes, + "gin_channels": gin_channels, + } + self.dec = Generator(h=hps) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + self.f0_decoder = F0Decoder( + 1, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + spk_channels=gin_channels + ) + initer = paddle.nn.initializer.Normal(mean = 0.202, std = 0.9640, name = f'emb_uv_init_weight') + emb_uv_pa = paddle.ParamAttr('emb_uv_init_weight_pa',initer) + self.emb_uv = nn.Embedding(2, hidden_channels, weight_attr = emb_uv_pa) + + def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None): + g = self.emb_g(g).transpose([0,2,1]) + # ssl prenet + x_mask = paddle.unsqueeze(commons.sequence_mask(c_lengths, c.shape[2]), 1).astype(c.dtype) + emb_uv = self.emb_uv(uv.cast('int64')).transpose([0,2,1]) + prec = self.pre(c) + x = prec * x_mask + emb_uv + # f0 predict + lf0 = 2595. * paddle.log10(1. + f0.unsqueeze(1) / 700.) 
/ 500 + norm_lf0 = utils.normalize_f0(lf0, x_mask, uv) + pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) + + # encoder + z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0 = f0_to_coarse(f0)) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + + # flow + z_p = self.flow(z, spec_mask, g=g) + z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size) + # nsf decoder + o = self.dec(z_slice, g=g, f0=pitch_slice) + + return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 + + def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False): + c_lengths = ((paddle.ones((c.shape[0],)) * c.shape[-1])).cpu() if 'cpu'in str(c.place) else ((paddle.ones((c.shape[0],)) * c.shape[-1])).cuda() + g = self.emb_g(g).transpose([0,2,1]) + x_mask = paddle.unsqueeze(commons.sequence_mask(c_lengths, c.shape[2]), 1).astype(c.dtype) + x = self.pre(c) * x_mask + self.emb_uv(uv.astype('int64')).transpose([0,2,1]) + if predict_f0: + lf0 = 2595. * paddle.log10(1. + f0.unsqueeze(1) / 700.) 
/ 500 + norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False) + pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) + f0 = (700 * (paddle.pow(paddle.to_tensor(10.), pred_lf0 * 500 / 2595) - 1)).squeeze(1) + z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale) + z = self.flow(z_p, c_mask, g=g, reverse=True) + o = self.dec(z * c_mask, g=g, f0=f0) + return o diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a588e7f157875901a8be92ed112953c4a19a1d8d --- /dev/null +++ b/modules/__init__.py @@ -0,0 +1 @@ +'''由梅花三弄再回首花了一个下午迁移的模块。''' \ No newline at end of file diff --git a/modules/attentions.py b/modules/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca80c90f2d2db2a52eaac8b88881a84193302a5 --- /dev/null +++ b/modules/attentions.py @@ -0,0 +1,377 @@ +import copy +import math +import numpy as np +import paddle +from paddle import nn +from paddle.nn import functional as F + +import modules.commons as commons +import modules.modules as modules +from modules.modules import LayerNorm + + +class FFT(nn.Layer): + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0., + proximal_bias=False, proximal_init=True, **kwargs): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.LayerList() + self.norm_layers_0 = nn.LayerList() + self.ffn_layers = nn.LayerList() + self.norm_layers_1 = nn.LayerList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, 
proximal_bias=proximal_bias, + proximal_init=proximal_init)) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.shape[2]).astype(dtype=x.dtype) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + x = x * x_mask + return x + + +class Encoder(nn.Layer): + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.LayerList() + self.norm_layers_1 = nn.LayerList() + self.ffn_layers = nn.LayerList() + self.norm_layers_2 = nn.LayerList() + for i in range(self.n_layers): + self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + y = 
class Decoder(nn.Layer):
    """Stack of decoder layers: causal self-attention, encoder-decoder attention, causal FFN.

    Each of the ``n_layers`` layers runs the three sub-layers in that order.
    Every sub-layer output goes through dropout, is added back to its input
    and is normalised by a ``LayerNorm``.

    Args:
        hidden_channels: channel width used by every attention, FFN and norm layer.
        filter_channels: inner width handed to each ``FFN``.
        n_heads: number of attention heads.
        n_layers: number of decoder layers.
        kernel_size: convolution kernel size handed to each ``FFN``.
        p_dropout: dropout probability for the attention layers, the FFNs and
            the dropout applied after every sub-layer.
        proximal_bias: forwarded to the self-attention layers.
        proximal_init: forwarded to the self-attention layers.
        **kwargs: accepted and ignored.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.LayerList()
        self.norm_layers_0 = nn.LayerList()
        self.encdec_attn_layers = nn.LayerList()
        self.norm_layers_1 = nn.LayerList()
        self.ffn_layers = nn.LayerList()
        self.norm_layers_2 = nn.LayerList()
        for i in range(self.n_layers):
            self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """Run the decoder stack.

        Args:
            x: decoder input.
            x_mask: mask multiplied into ``x``; its axis 2 gives the sequence
                length used for the causal mask.
            h: encoder output attended to by the encoder-decoder attention.
            h_mask: mask for ``h``, combined with ``x_mask`` into the
                encoder-decoder attention mask.

        Returns:
            The decoded tensor, multiplied by ``x_mask``.
        """
        # Paddle's Tensor.size is the element count (an attribute, not a
        # torch-style method), so the length is read from .shape instead.
        self_attn_mask = commons.subsequent_mask(x_mask.shape[2]).astype(dtype=x.dtype)
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            y = self.self_attn_layers[i](x, x, self_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_0[i](x + y)

            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x
paddle.where(mask, y, x) + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.shape, query.shape[2]) + query = query.reshape((b, self.n_heads, self.k_channels, t_t)).transpose([0,1,3,2]) + key = key.reshape((b, self.n_heads, self.k_channels, t_s)).transpose([0,1,3,2]) + value = value.reshape((b, self.n_heads, self.k_channels, t_s)).transpose([0,1,3,2]) + + scores = paddle.matmul(query / math.sqrt(self.k_channels), key.transpose([0,1,3,2])) # 0 1 2 3 -4 -3 -2 -1 + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." + + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).astype(dtype=scores.dtype) + if mask is not None: + scores = self._masked_fill(scores, mask == 0, -1e4) + if self.block_length is not None: + assert t_s == t_t, "Local attention is only available for self-attention." 
+ block_mask = paddle.tril(paddle.triu(paddle.ones_like(scores), -self.block_length),self.block_length) + scores = self._masked_fill(scores, block_mask == 0, -1e4) + p_attn = F.softmax(scores, axis=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = paddle.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) + output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) + output = output.transpose([0,1,3,2]).reshape((b, d, t_t)) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = paddle.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = paddle.matmul(x, y.unsqueeze(0).transpose([0,1,3,2])) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. 
+ pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padding = commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]) + + padded_relative_embeddings = F.pad( + x = relative_embeddings.unsqueeze(0), + pad = padding[0:4]).squeeze(0) + + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.shape + # Concat columns of pad to shift from relative to absolute indexing. + pad_shape = commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]) + pad_shape = commons.fix_pad_shape(pad_shape, x) + x = F.pad(x, pad_shape) + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.reshape([batch, heads, length * 2 * length]) + pad_shape = commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]) + pad_shape = commons.fix_pad_shape(pad_shape,x_flat) + x_flat = F.pad(x_flat, pad_shape, data_format='NCL') + # Reshape and slice out the padded elements. 
    def _absolute_position_to_relative_position(self, x):
        """Re-index attention weights from absolute to relative positions.

        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]

        The conversion is done purely with pad / reshape / slice steps, so the
        statement order below matters.
        """
        batch, heads, length, _ = x.shape
        # Pad along the column (last) axis: each row grows from l to 2*l-1 entries.
        pad_shape = commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])
        pad_shape = commons.fix_pad_shape(pad_shape, x)
        x = F.pad(x, pad_shape)
        # Flatten the l rows of 2*l-1 entries: l**2 + l*(l-1) values per head.
        x_flat = x.reshape([batch, heads, length**2 + length*(length -1)])
        # Add `length` zeros at the beginning; the total becomes 2*l*l, and the
        # shift skews the elements once they are reshaped into rows of 2*l.
        pad_shape = commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])
        pad_shape = commons.fix_pad_shape(pad_shape, x_flat)
        x_flat = F.pad(x_flat, pad_shape, data_format='NCL')
        # Reshape to l rows of 2*l and drop the first column, leaving 2*l-1.
        x_final = x_flat.reshape([batch, heads, length, 2*length])[:,:,:,1:]
        return x_final
def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
    """Cut one random window of ``segment_size`` steps from each item of ``x`` and ``pitch``.

    The same random start index is used for ``x`` and ``pitch`` of a given
    batch item.

    Args:
        x: tensor whose shape unpacks to ``(b, d, t)``.
        pitch: tensor sliced with the same start indices via
            ``slice_pitch_segments``.
        x_lengths: optional per-item lengths; when ``None`` the full length
            ``t`` is used for every item.
        segment_size: number of steps in every window.

    Returns:
        ``(ret, ret_pitch, ids_str)``: the sliced ``x``, the sliced ``pitch``
        and the int64 start indices.
    """
    b, _, t = x.shape
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    # On the default path ids_str_max is a plain int, which has no astype();
    # only convert when the caller supplied array-like lengths.
    if hasattr(ids_str_max, 'astype'):
        ids_str_max = ids_str_max.astype('float32')
    else:
        ids_str_max = float(ids_str_max)
    ids_str = (paddle.rand([b]) * ids_str_max).astype(dtype='int64')
    ret = slice_segments(x, ids_str, segment_size)
    ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size)
    return ret, ret_pitch, ids_str
def intersperse(lst, item):
    """Return a list with ``item`` before, between and after the elements of ``lst``.

    The result has ``2 * len(lst) + 1`` entries: even positions hold ``item``,
    odd positions hold the elements of ``lst`` in order.
    """
    total_slots = len(lst) * 2 + 1
    elements = iter(lst)
    return [item if pos % 2 == 0 else next(elements) for pos in range(total_slots)]
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: ``tanh`` of the leading channels times ``sigmoid`` of the rest.

    Args:
        input_a: first tensor; indexed as ``[:, channels, :]`` after the add.
        input_b: second tensor, added element-wise to ``input_a``.
        n_channels: indexable whose first element is the number of leading
            channels routed through ``tanh``; the remaining channels go
            through ``sigmoid``.

    Returns:
        The element-wise product of the two activations.
    """
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = paddle.tanh(in_act[:, :n_channels_int, :])
    s_act = paddle.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
    # The leftover debug prints of both activation tensors were removed:
    # they wrote whole tensors to stdout on every call.
    acts = t_act * s_act
    return acts
def generate_path(duration, mask):
    """Expand per-step durations into a step-to-frame alignment path.

    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Returns:
        The path after ``unsqueeze(1)`` and a swap of its last two axes,
        multiplied by ``mask``.
    """
    # The unused `duration.device` read was dropped: it is a torch idiom and
    # nothing in this function consumed the value.
    b, _, t_y, t_x = mask.shape
    cum_duration = paddle.cumsum(duration, -1)

    # The target shape is passed as a list; `(b * t_x)` is a bare int, not a
    # one-element tuple.
    cum_duration_flat = cum_duration.reshape([b * t_x])
    path = sequence_mask(cum_duration_flat, t_y).astype(mask.dtype)
    path = path.reshape((b, t_x, t_y))
    # Subtract the mask shifted by one step along axis 1, so each row keeps
    # only the frames added by its own duration.
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose([0,1,3,2]) * mask
    return path
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Sum the squared-error discriminator terms over paired outputs.

    For every (real, generated) pair the real term is ``mean((1 - real)**2)``
    and the generated term is ``mean(generated**2)``, both computed in float32.

    Returns:
        ``(loss, r_losses, g_losses)``: the accumulated loss and the per-pair
        real / generated terms as Python scalars.
    """
    total = 0
    real_terms = []
    fake_terms = []
    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = paddle.mean((1 - real_out.astype('float32')) ** 2)
        fake_term = paddle.mean(fake_out.astype('float32') ** 2)
        total += (real_term + fake_term)
        real_terms.append(real_term.item())
        fake_terms.append(fake_term.item())

    return total, real_terms, fake_terms
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress ``x`` after flooring it at ``clip_val``.

    Args:
        x: tensor to compress.
        C: compression factor multiplied in before the logarithm.
        clip_val: lower bound applied to ``x`` before scaling.

    Returns:
        ``log(clip(x, min=clip_val) * C)``.
    """
    floored = paddle.clip(x, min=clip_val)
    return paddle.log(floored * C)
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """Project a spectrogram onto a mel filterbank and normalise the result.

    The librosa filterbank is built once per (``fmax``, dtype, place) key and
    cached in the module-level ``mel_basis`` dict.

    Args:
        spec: spectrogram tensor, left-multiplied by the filterbank.
        n_fft, num_mels, sampling_rate, fmin, fmax: filterbank parameters
            passed to ``librosa_mel_fn``.

    Returns:
        ``spectral_normalize_torch`` applied to the mel-projected spectrogram.
    """
    global mel_basis
    place_tag = str(spec.place)[6:-1]
    cache_key = str(fmax) + '_' + str(spec.dtype) + '_' + place_tag
    if cache_key not in mel_basis:
        filterbank = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[cache_key] = paddle.to_tensor(filterbank).astype(spec.dtype)
    mel = paddle.matmul(mel_basis[cache_key], spec)
    return spectral_normalize_torch(mel)
class ConvReluNorm(nn.Layer):
    """Stack of Conv1D -> LayerNorm -> ReLU -> Dropout layers with a residual projection.

    Args:
        in_channels: channels of the input.
        hidden_channels: channels of every hidden convolution.
        out_channels: channels produced by the final 1x1 projection. The
            projection is added to the input, so the two must be addable.
        kernel_size: kernel size of the stacked convolutions, each padded by
            ``kernel_size // 2``.
        n_layers: number of conv/norm layers; must be greater than 1.
        p_dropout: dropout probability applied after every ReLU.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message now states the bound the condition actually enforces.
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.LayerList()
        self.norm_layers = nn.LayerList()
        self.conv_layers.append(nn.Conv1D(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers-1):
            self.conv_layers.append(nn.Conv1D(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        # Zero-initialise the projection so the block starts as an identity
        # mapping (PyTorch original: proj.weight / proj.bias .data.zero_()).
        # The attr is left unnamed on purpose: a named ParamAttr shared by
        # weight and bias would give two parameters the same explicit name.
        zero_init = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=0.0))
        self.proj = nn.Conv1D(hidden_channels, out_channels, 1, weight_attr=zero_init, bias_attr=zero_init)

    def forward(self, x, x_mask):
        """Apply the conv stack to ``x`` and return ``(x + proj(stack(x))) * x_mask``."""
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
class WN(paddle.nn.Layer):
    """Stack of weight-normalised dilated convolutions with gated activations.

    Every layer computes ``tanh(a) * sigmoid(b)`` from a dilated convolution
    of the running signal, optionally offset by a slice of the conditioning
    ``g``. A 1x1 convolution then splits the result into a residual part
    (added to the running signal) and a skip part (accumulated into the
    output). The last layer produces only the skip part.

    Args:
        hidden_channels: channel width of the running signal and the output.
        kernel_size: odd kernel size of the dilated convolutions.
        dilation_rate: base of the per-layer dilation ``dilation_rate ** i``.
        n_layers: number of layers.
        gin_channels: channels of the conditioning input; ``0`` disables it.
        p_dropout: dropout probability applied to the gated activations.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        assert(kernel_size % 2 == 1)
        self.hidden_channels = hidden_channels
        # A stray trailing comma used to store this as the 1-tuple (kernel_size,).
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = paddle.nn.LayerList()
        self.res_skip_layers = paddle.nn.LayerList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One conv produces the conditioning slices of all layers at once.
            cond_layer = paddle.nn.Conv1D(gin_channels, 2*hidden_channels*n_layers, 1)
            self.cond_layer = paddle.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = paddle.nn.Conv1D(hidden_channels, 2*hidden_channels, kernel_size,
                                        dilation=dilation, padding=padding)
            in_layer = paddle.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # The last layer has no residual branch, so it needs only the
            # skip channels.
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = paddle.nn.Conv1D(hidden_channels, res_skip_channels, 1)
            res_skip_layer = paddle.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        """Run the stack and return the accumulated skip output times ``x_mask``.

        Args:
            x: input signal, sliced on axis 1 as channels.
            x_mask: mask multiplied into the running signal and the output.
            g: optional conditioning; passed through ``cond_layer`` and sliced
                per layer along axis 1.
            **kwargs: accepted and ignored.
        """
        output = paddle.zeros_like(x, name='module_WN_forward_output')

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset+2*self.hidden_channels, :]
            else:
                g_l = paddle.zeros_like(x_in, name='module_WN_forward_gl')

            # Gated activation: tanh on the first half of the channels,
            # sigmoid on the second half.
            in_act = x_in + g_l
            t_act = paddle.tanh(in_act[:, :self.hidden_channels, :])
            s_act = paddle.nn.functional.sigmoid(in_act[:, self.hidden_channels:, :])
            acts = t_act * s_act

            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution of the stack."""
        if self.gin_channels != 0:
            paddle.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            paddle.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            paddle.nn.utils.remove_weight_norm(l)
class Log(nn.Layer):
    """Invertible element-wise logarithm with a masked log-determinant."""

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Apply ``log`` (forward) or ``exp`` (reverse), masked by ``x_mask``.

        Forward returns ``(y, logdet)`` where ``y`` is the masked log of ``x``
        clipped below at ``1e-5`` and ``logdet`` is ``sum(-y)`` over axes 1
        and 2. Reverse returns only the masked exponential of ``x``.
        """
        if reverse:
            restored = paddle.exp(x) * x_mask
            return restored
        y = paddle.log(paddle.clip(x, 1e-5)) * x_mask
        logdet = paddle.sum(-y, [1, 2])
        return y, logdet
class ResidualCouplingLayer(nn.Layer):
    """Affine coupling layer: one half of the channels parameterises a transform of the other half.

    The input is split on axis 1 into ``x0`` and ``x1``. ``x0`` passes through
    unchanged and feeds ``pre -> enc (WN) -> post`` to produce a shift ``m``
    and, unless ``mean_only`` is set, a log-scale ``logs`` that are applied
    to ``x1``.
    """

    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 p_dropout=0,
                 gin_channels=0,
                 mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1D(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        # Zero-initialise weight and bias of the output projection, replacing
        # the PyTorch original's post.weight.data.zero_() / post.bias.data.zero_().
        att = paddle.ParamAttr(initializer = paddle.nn.initializer.Constant(value=0.0))
        # (2 - mean_only) is 2 when a log-scale is predicted and 1 when only
        # the mean is predicted.
        self.post = nn.Conv1D(hidden_channels, self.half_channels * (2 - mean_only), 1,weight_attr=att, bias_attr=att)

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the coupling transform, or its inverse when ``reverse`` is true.

        Forward returns ``(x, logdet)`` with ``logdet = sum(logs)`` over axes
        1 and 2; reverse returns only ``x``.
        """
        x0, x1 = paddle.split(x, [self.half_channels]*2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = paddle.split(stats, [self.half_channels]*2, 1)
        else:
            # Mean-only mode: the scale is fixed to exp(0) = 1.
            m = stats
            logs = paddle.zeros_like(m)

        if not reverse:
            x1 = m + x1 * paddle.exp(logs) * x_mask
            x = paddle.concat([x0, x1], 1)
            logdet = paddle.sum(logs, [1,2])
            return x, logdet
        else:
            # Inverse of the forward branch: undo the shift, then the scale.
            x1 = (x1 - m) * paddle.exp(-logs) * x_mask
            x = paddle.concat([x0, x1], 1)
            return x
def infer_shape(program, input_shape_dict):
    """Override input shapes of a static program and re-run shape inference.

    Args:
        program: the loaded inference program; it must expose ``desc`` and
            ``blocks`` (each block exposing ``var``, ``ops`` and ``desc``).
        input_shape_dict: mapping of input variable name to the new shape,
            applied to variables of block 0.

    A warning is printed when the version stored in the model differs from
    the installed paddle version. The program is modified in place.
    """
    import paddle
    paddle.enable_static()

    # Ops without a compute kernel; shape inference is skipped for them.
    OP_WITHOUT_KERNEL_SET = {
        'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad',
        'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
        'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify',
        'gen_bkcl_id', 'c_gen_bkcl_id', 'gen_nccl_id', 'c_gen_nccl_id',
        'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream',
        'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv',
        'c_wait_comm', 'c_wait_compute', 'c_gen_hccl_id', 'c_comm_init_hccl',
        'copy_cross_scope'
    }

    # The stored version is decoded as major * 1_000_000 + minor * 1_000 + patch.
    model_version = program.desc._version()
    paddle_version = paddle.__version__
    major_ver = model_version // 1000000
    minor_ver = (model_version - major_ver * 1000000) // 1000
    patch_ver = model_version - major_ver * 1000000 - minor_ver * 1000
    model_version = "{}.{}.{}".format(major_ver, minor_ver, patch_ver)
    if model_version != paddle_version:
        print(
            "[WARNING] The model is saved by paddlepaddle v{}, but now your paddlepaddle is version of {}, this difference may cause error, it is recommend you reinstall a same version of paddlepaddle for this model".
            format(model_version, paddle_version))

    for name, shape in input_shape_dict.items():
        program.blocks[0].var(name).desc.set_shape(shape)

    # Walk each block's own op list. The previous code bounded the op index
    # by len(blocks[0].ops) for every block, which raised IndexError on
    # shorter sub-blocks and silently skipped trailing ops of longer ones.
    for block in program.blocks:
        for op in block.ops:
            if op.type in OP_WITHOUT_KERNEL_SET:
                continue
            op.desc.infer_shape(block.desc)
def get_wav_duration(file_path):
    """Return the length of a WAV file in seconds (frame count / sample rate)."""
    with wave.open(file_path, 'rb') as reader:
        frame_count = reader.getnframes()
        sample_rate = reader.getframerate()
    return frame_count / float(sample_rate)
def process_one(filename, hmodel):
    """Create the cached feature files for one audio file, if they are missing.

    Two files are written next to ``filename``:
      * ``<filename>.soft.pdtensor`` - content features from ``hmodel``
      * ``<filename>.f0.npy``        - the F0 track

    A file that already exists is left untouched. When both exist the audio
    is not decoded at all, so re-running over a processed dataset is cheap.
    """
    soft_path = filename + ".soft.pdtensor"
    f0_path = filename + ".f0.npy"
    need_soft = not os.path.exists(soft_path)
    need_f0 = not os.path.exists(f0_path)
    if not (need_soft or need_f0):
        return

    wav, _ = librosa.load(filename, sr=sampling_rate)

    if need_soft:
        # The content encoder is fed 16 kHz audio.
        wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
        wav16k = paddle.to_tensor(wav16k)
        if paddle.device.is_compiled_with_cuda():
            wav16k = wav16k.cuda()
        else:
            wav16k = wav16k.cpu()
        c: paddle.Tensor = utils.get_hubert_content(hmodel, wav_16k_tensor=wav16k)
        paddle.save(c.cpu(), soft_path)

    if need_f0:
        f0 = utils.compute_f0_dio(wav, sampling_rate=sampling_rate, hop_length=hop_length)
        np.save(f0_path, f0)
def process(item):
    """Trim, peak-normalise, resample and save one WAV file as 16-bit PCM.

    Args:
        item: ``(spkdir, wav_name, args)``. ``args`` supplies ``in_dir``,
            ``out_dir2`` and ``sr2`` (the target sampling rate). The speaker
            name is the last path component of ``spkdir``.

    Clips that are empty after silence trimming, or whose peak is zero, are
    skipped with a message. Previously such clips raised on an empty array,
    or were divided by zero and written out as NaN-derived samples.
    """
    spkdir, wav_name, args = item
    speaker = spkdir.replace("\\", "/").split("/")[-1]
    wav_path = os.path.join(args.in_dir, speaker, wav_name)

    if not (os.path.exists(wav_path) and '.wav' in wav_path):
        return

    os.makedirs(os.path.join(args.out_dir2, speaker), exist_ok=True)

    wav, sr = librosa.load(wav_path, sr=None)
    wav, _ = librosa.effects.trim(wav, top_db=20)
    if wav.size == 0:
        print("skip (nothing left after trimming silence):", wav_path)
        return

    peak = np.abs(wav).max()
    if peak > 1.0:
        wav = 0.98 * wav / peak

    wav2 = librosa.resample(wav, orig_sr=sr, target_sr=args.sr2)
    # Equals max(wav2.max(), -wav2.min()), with an explicit guard for empty input.
    peak2 = np.abs(wav2).max() if wav2.size else 0.0
    if not peak2 > 0:  # also rejects NaN
        print("skip (silent clip, cannot normalise):", wav_path)
        return
    wav2 /= peak2

    save_path2 = os.path.join(args.out_dir2, speaker, wav_name)
    wavfile.write(
        save_path2,
        args.sr2,
        (wav2 * np.iinfo(np.int16).max).astype(np.int16)
    )
class UNet(nn.Layer):
    """Six-level convolutional encoder/decoder that outputs a masked copy of its input.

    The encoder is six stride-2 ``Conv2D`` layers (2 -> 16 -> ... -> 512
    channels), each followed by batch-norm and an activation. The decoder is
    six stride-2 ``Conv2DTranspose`` blocks whose inputs are concatenated with
    the matching encoder convolution outputs. A final convolution plus sigmoid
    produces a 2-channel mask that is multiplied element-wise with the input.

    NOTE(review): the input is presumably (batch, 2, T, F) magnitude
    spectrogram chunks - confirm against the caller.
    """

    def __init__(self, use_elu=False):
        # use_elu selects ELU instead of LeakyReLU (encoder) / ReLU (decoder).
        super(UNet, self).__init__()
        self.use_elu = use_elu
        # Padding applied in forward() before every strided encoder convolution.
        self.pad = nn.Pad2D(padding=[1, 2, 1, 2])

        ### Encoder ###
        # First Layer
        self.conv1 = nn.Conv2D(2, 16, kernel_size=5, stride=2) ## padding
        self.encoder1 = self.encoder_block(16)
        # Second Layer
        self.conv2 = nn.Conv2D(16, 32, kernel_size=5, stride=2)
        self.encoder2 = self.encoder_block(32)
        # Third Layer
        self.conv3 = nn.Conv2D(32, 64, kernel_size=5, stride=2)
        self.encoder3 = self.encoder_block(64)
        # Fourth Layer
        self.conv4 = nn.Conv2D(64, 128, kernel_size=5, stride=2)
        self.encoder4 = self.encoder_block(128)
        # Fifth Layer
        self.conv5 = nn.Conv2D(128, 256, kernel_size=5, stride=2)
        self.encoder5 = self.encoder_block(256)
        # Sixth Layer
        self.conv6 = nn.Conv2D(256, 512, kernel_size=5, stride=2)
        self.encoder6 = self.encoder_block(512)

        ### Decoder ###
        # From the second block on, in_channel is twice the previous block's
        # out_channel because forward() concatenates a skip tensor first.
        # First Layer
        self.decoder1 = self.decoder_block(512, 256, dropout=True)
        # Second Layer
        self.decoder2 = self.decoder_block(512, 128, dropout=True)
        # Third Layer
        self.decoder3 = self.decoder_block(256, 64, dropout=True)
        # Fourth Layer
        self.decoder4 = self.decoder_block(128, 32)
        # Fifth Layer
        self.decoder5 = self.decoder_block(64, 16)
        # Sixth Layer
        self.decoder6 = self.decoder_block(32, 1)

        # Last Layer
        # Maps the 1-channel decoder output to a 2-channel mask.
        self.mask = nn.Conv2D(1, 2, kernel_size=4, dilation=2, padding=3)
        self.sig = nn.Sigmoid()

    def encoder_block(self, out_channel):
        """Return the batch-norm + activation pair applied after an encoder conv."""
        if not self.use_elu:
            return nn.Sequential(
                nn.BatchNorm2D(out_channel, epsilon=1e-3, momentum=0.01),
                nn.LeakyReLU(0.2)
            )
        else:
            return nn.Sequential(
                nn.BatchNorm2D(out_channel, epsilon=1e-3, momentum=0.01),
                nn.ELU()
            )

    def decoder_block(self, in_channel, out_channel, dropout=False):
        """Return transposed conv -> activation -> batch-norm, plus Dropout(0.5) if ``dropout``."""
        layers = [
            nn.Conv2DTranspose(in_channel, out_channel, kernel_size=5, stride=2)
        ]
        if not self.use_elu:
            layers.append(nn.ReLU())
        else:
            layers.append(nn.ELU())
        layers.append(nn.BatchNorm2D(out_channel, epsilon=1e-3, momentum=0.01))
        if dropout:
            layers.append(nn.Dropout(0.5))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``sigmoid(mask) * x`` for input ``x``."""
        ### Encoder ###
        # skipN is the raw convolution output (before batch-norm/activation);
        # it is what the decoder concatenates later. downN is the activated
        # tensor passed to the next encoder level.
        skip1 = self.pad(x)
        skip1 = self.conv1(skip1)
        down1 = self.encoder1(skip1)

        skip2 = self.pad(down1)
        skip2 = self.conv2(skip2)
        down2 = self.encoder2(skip2)

        skip3 = self.pad(down2)
        skip3 = self.conv3(skip3)
        down3 = self.encoder3(skip3)

        skip4 = self.pad(down3)
        skip4 = self.conv4(skip4)
        down4 = self.encoder4(skip4)

        skip5 = self.pad(down4)
        skip5 = self.conv5(skip5)
        down5 = self.encoder5(skip5)

        skip6 = self.pad(down5)
        skip6 = self.conv6(skip6)
        # NOTE(review): down6 is computed but never read below; decoder1
        # consumes skip6 directly.
        down6 = self.encoder6(skip6)

        ### Decoder ###
        # Each transposed-conv output is cropped by 1 leading and 2 trailing
        # elements on the last two axes, then concatenated with the matching
        # skip tensor along axis 1.
        up1 = self.decoder1(skip6)
        up1 = up1[:, :, 1: -2, 1: -2]
        merge1 = paddle.concat((skip5, up1), 1)

        up2 = self.decoder2(merge1)
        up2 = up2[:, :, 1: -2, 1: -2]
        merge2 = paddle.concat((skip4, up2), 1)

        up3 = self.decoder3(merge2)
        up3 = up3[:, :, 1: -2, 1: -2]
        merge3 = paddle.concat((skip3, up3), 1)

        up4 = self.decoder4(merge3)
        up4 = up4[:, :, 1: -2, 1: -2]
        merge4 = paddle.concat((skip2, up4), 1)

        up5 = self.decoder5(merge4)
        up5 = up5[:, :, 1: -2, 1: -2]
        merge5 = paddle.concat((skip1, up5), 1)

        up6 = self.decoder6(merge5)
        up6 = up6[:, :, 1: -2, 1: -2]

        m = self.mask(up6)

        m = self.sig(m)
        # The sigmoid output scales the input element-wise.
        return m * x
params['num_instruments'] + self.output_dir = params['output_dir'] + self.model_list = nn.LayerList() + + for i, name in enumerate(self.num_instruments): + print('Loading model for instrumment {}'.format(i)) + net = UNet(use_elu=params['use_elu']) + net.eval() + state_dict = paddle.load(os.path.join(params['checkpoint_path'], '%dstems_%s.pdparams' % (len(self.num_instruments), name))) + net.set_dict(state_dict) + self.model_list.append(net) + + self.T = params['T'] + self.F = params['F'] + self.frame_length = params['frame_length'] + self.frame_step = params['frame_step'] + self.samplerate = params['sample_rate'] + + def _load_audio( + self, path, offset=None, duration=None, + sample_rate=None, dtype=np.float32): + """ Loads the audio file denoted by the given path + and returns it data as a waveform. + + :param path: Path of the audio file to load data from. + :param offset: (Optional) Start offset to load from in seconds. + :param duration: (Optional) Duration to load in seconds. + :param sample_rate: (Optional) Sample rate to load audio with. + :param dtype: (Optional) Numpy data type to use, default to float32. + :returns: Loaded data a (waveform, sample_rate) tuple. + :raise SpleeterError: If any error occurs while loading audio. 
+ """ + if not isinstance(path, str): + path = path.decode() + + probe = ffmpeg.probe(path) + + metadata = next( + stream + for stream in probe['streams'] + if stream['codec_type'] == 'audio') + n_channels = metadata['channels'] + if sample_rate is None: + sample_rate = metadata['sample_rate'] + output_kwargs = {'format': 'f32le', 'ar': sample_rate} + process = ( + ffmpeg + .input(path) + .output('pipe:', **output_kwargs) + .run_async(pipe_stdout=True, pipe_stderr=True)) + buffer, _ = process.communicate() + waveform = np.frombuffer(buffer, dtype=' 2: + source_audio = source_audio[:, :2] + + stft = self._stft(source_audio) # L * F * 2 + stft = stft[:, : self.F, :] + + stft_mag = abs(stft) # L * F * 2 + stft_mag = paddle.to_tensor(stft_mag) + stft_mag = stft_mag.unsqueeze(0).transpose([0, 3, 2, 1]) # 1 * 2 * F * L + + L = stft.shape[0] + + stft_mag = self._pad_and_partition( + stft_mag, self.T) # [(L + T) / T] * 2 * F * T + stft_mag = stft_mag.transpose((0, 1, 3, 2)) + # stft_mag : B * 2 * T * F + + B = stft_mag.shape[0] + masks = [] + + stft_mag = stft_mag + + for model, name in zip(self.model_list, self.num_instruments): + mask = model(stft_mag) + masks.append(mask) + paddle.save(model.state_dict(), '2stems_%s.pdparams' % name) + + mask_sum = sum([m ** 2 for m in masks]) + mask_sum += 1e-10 + + for i in range(len(self.num_instruments)): + mask = masks[i] + mask = (mask ** 2 + 1e-10/2) / (mask_sum) + mask = mask.transpose((0, 1, 3, 2)) # B x 2 X F x T + mask = paddle.concat(paddle.split(mask, mask.shape[0], axis=0), axis=3) + mask = mask.squeeze(0)[:, :, :L] # 2 x F x L + mask = mask.transpose([2, 1, 0]) + + # End using GPU + + mask = mask.detach().numpy() + + stft_masked = stft * mask + stft_masked = np.pad( + stft_masked, ((0, 0), (0, 1025), (0, 0)), 'constant') + + wav_masked = self._stft( + stft_masked, inverse=True, length=source_audio.shape[0]) + + save_path = os.path.join( + output_dir, (wav_name + '-' + self.num_instruments[i] + '.wav')) + + 
self._save_to_file(save_path, wav_masked, + samplerate, 'wav', '128k') + + print('Audio {} separated'.format(wav_name)) \ No newline at end of file diff --git a/spleeter/2stems_instrumental.pdparams b/spleeter/2stems_instrumental.pdparams new file mode 100644 index 0000000000000000000000000000000000000000..1f0d0259c094eaf1cbf5feb890e121277fd26450 --- /dev/null +++ b/spleeter/2stems_instrumental.pdparams @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:305d00378a1fccbfa0680fddced78daeeb63a4ae353791e82da36e99af2daf54 +size 59035467 diff --git a/spleeter/2stems_vocals.pdparams b/spleeter/2stems_vocals.pdparams new file mode 100644 index 0000000000000000000000000000000000000000..0fabccfd9edf451a5825149ce848eafc31713d4d --- /dev/null +++ b/spleeter/2stems_vocals.pdparams @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d9e8c9785f70a87f2b8d26fca5eef07f099239ffea7f97538073f0f8f61aad6 +size 59052138 diff --git a/temp.wav b/temp.wav new file mode 100644 index 0000000000000000000000000000000000000000..e4ea5d574334f361f079af34dbe3ef5338a3c750 Binary files /dev/null and b/temp.wav differ diff --git a/train.py b/train.py new file mode 100644 index 0000000000000000000000000000000000000000..6d8319e7133859a94e0c0bd5cbc39ed90b7abb6d --- /dev/null +++ b/train.py @@ -0,0 +1,322 @@ +import logging +import multiprocessing +import time + +logging.getLogger('matplotlib').setLevel(logging.WARNING) +import os +import paddle +#paddle.device.set_device("cpu") #开启可用CPU进行炼丹 +trainer:str = "admin" +from paddle.nn import functional as F +from paddle.io import DataLoader +from visualdl import LogWriter +from paddle.amp import auto_cast, GradScaler + +import modules.commons as commons +import utils +from data_utils import TextAudioSpeakerLoader, TextAudioCollate +from models import ( + SynthesizerTrn, + MultiPeriodDiscriminator, +) +from modules.losses import ( + kl_loss, + generator_loss, discriminator_loss, feature_loss +) + +from 
def main():
    """Load hyper-parameters, export the rendezvous address/port and start training.

    Assumes single-node training only.
    """
    hps = utils.get_hparams()

    n_gpus = paddle.device.cuda.device_count()
    os.environ['MASTER_ADDR'] = 'localhost'
    # os.environ only accepts str values; coerce so a numeric "port" entry in
    # the config does not raise TypeError here.
    os.environ['MASTER_PORT'] = str(hps.train.port)

    run(n_gpus, hps)
paddle.optimizer.AdamW( + parameters = net_d.parameters(), + learning_rate = hps.train.learning_rate, + beta1 = hps.train.betas[0], + beta2 = hps.train.betas[1], + epsilon = hps.train.eps) + + skip_optimizer = False + try: + _, _, _, epoch_str, trainers = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pdparams"), net_g, + optim_g, skip_optimizer) + _, _, _, epoch_str, trainers = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pdparams"), net_d, + optim_d, skip_optimizer) + if trainer not in trainers: + trainers.append(trainer) + epoch_str = max(epoch_str, 1) + global_step = (epoch_str - 1) * len(train_loader) + except Exception as e: + print(e) + logger.info("加载旧检查点失败……") + epoch_str = 1 + global_step = 0 + if skip_optimizer: + epoch_str = 1 + global_step = 0 + + scheduler_g = paddle.optimizer.lr.ExponentialDecay(hps.train.learning_rate, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) + scheduler_d = paddle.optimizer.lr.ExponentialDecay(hps.train.learning_rate, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) + + optim_g = paddle.optimizer.AdamW( + parameters = net_g.parameters(), + learning_rate = scheduler_g, + beta1 = hps.train.betas[0], + beta2 = hps.train.betas[1], + epsilon = hps.train.eps) + optim_d = paddle.optimizer.AdamW( + parameters = net_d.parameters(), + learning_rate = scheduler_d, + beta1 = hps.train.betas[0], + beta2 = hps.train.betas[1], + epsilon = hps.train.eps) + + scaler = GradScaler(enable = hps.train.fp16_run) + + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, + [train_loader, eval_loader], logger, [writer, writer_eval]) + else: + train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, + [train_loader, None], None, None) + scheduler_g.step() + scheduler_d.step() + + +def train_and_evaluate(rank, 
epoch, hps, nets, optims, schedulers, scaler:GradScaler, loaders, logger:logging.Logger, writers:list or None): + net_g, net_d = nets + optim_g, optim_d = optims + scheduler_g, scheduler_d = schedulers + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + # train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + net_d.train() + for batch_idx, items in enumerate(train_loader): + c, f0, spec, y, spk, lengths, uv = items + g = spk + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax) + + with auto_cast(enable=hps.train.fp16_run): + + y_hat, ids_slice, z_mask, \ + (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 = net_g(c, f0, uv, spec, g=g, c_lengths=lengths, + spec_lengths=lengths) + + y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax + ) + y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) + + with auto_cast(enable=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) + loss_disc_all = loss_disc + + optim_d.clear_grad() + scaler.scale(loss_disc_all).backward(retain_graph = True) # 将 Tensor 乘上缩放因子,返回缩放后的输出,返回loss然后反向传播 + scaler.unscale_(optim_d) # 将参数的梯度除去缩放比例。 + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + with auto_cast(enable=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) + with auto_cast(enable=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = 
kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_lf0 = F.mse_loss(pred_lf0, lf0) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_lf0 + optim_g.clear_grad() + scaler.scale(loss_gen_all).backward(retain_graph = True) + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + #lr = optim_g.state_dict()['LR_Scheduler']['last_lr'] # paddle优化器特有的字典 + #losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl] + #logger.info(f"损失:{[x.item() for x in losses]},步数:{global_step},学习率:{lr}") # 梅花自己看的~ + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.state_dict()['LR_Scheduler']['last_lr'] # paddle优化器特有的字典 + losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl] + logger.info('训练回合:{} [{:.0f}%]'.format( + epoch, + 100. * batch_idx / len(train_loader))) + logger.info(f"损失:{[x.item() for x in losses]},步数:{global_step},学习率:{lr}") + + scalar_dict = {"损失/生成器/总损失": loss_gen_all, "损失/判别器/总损失": loss_disc_all, "学习率": lr, + "归一化判别器梯度": grad_norm_d, "归一化生成器梯度": grad_norm_g} + scalar_dict.update({"损失/生成器/特征匹配损失": loss_fm, "损失/生成器/梅尔频谱损失": loss_mel, "损失/生成器/KL散度": loss_kl, + "损失/生成器/基音损失": loss_lf0}) + + image_dict = { + "切片/原始梅尔频谱图": utils.plot_spectrogram_to_numpy(y_mel[0].detach().numpy()), + "切片/生成梅尔频谱图": utils.plot_spectrogram_to_numpy(y_hat_mel[0].detach().numpy()), + "全部/梅尔频谱图": utils.plot_spectrogram_to_numpy(mel[0].detach().numpy()), + "全部/基音损失": utils.plot_data_to_numpy(lf0[0, 0, :].numpy(), + pred_lf0[0, 0, :].detach().numpy()), + "全部/归一化基音损失": utils.plot_data_to_numpy(lf0[0, 0, :].numpy(), + norm_lf0[0, 0, :].detach().numpy()) + } + + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict + ) + if global_step % hps.train.eval_interval == 0: + if hps.clean_logs: + os.system('clear') + 
def evaluate(hps, generator, eval_loader, writer_eval):
    """Run the generator over the eval loader and log audio and mel images.

    Only the first sample of every batch is used. One generated/ground-truth
    audio pair is logged per batch; the mel-spectrogram images come from the
    last batch. The generator is put in eval mode for the pass and switched
    back to train mode before returning.
    """
    generator.eval()
    image_dict = {}
    audio_dict = {}
    # Stay None when eval_loader yields nothing, so the image step below is
    # skipped instead of reading unbound loop variables.
    mel = None
    y_hat_mel = None
    with paddle.no_grad():
        for batch_idx, items in enumerate(eval_loader):
            c, f0, spec, y, spk, _, uv = items
            g = spk[:1]
            spec, y = spec[:1], y[:1]
            c = c[:1]
            f0 = f0[:1]
            uv = uv[:1]
            mel = spec_to_mel_torch(
                spec,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.mel_fmin,
                hps.data.mel_fmax)
            y_hat = generator.infer(c, f0, uv, g=g)
            y_hat_mel = mel_spectrogram_torch(
                y_hat.squeeze(1).cast('float32'),
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )

            audio_dict.update({
                f"生成器测试数据/音频_{batch_idx}": y_hat[0],
                f"地标真实数据/音频_{batch_idx}": y[0]
            })

        # The image keys carry no batch index, so only the final batch's
        # images are ever kept; plot once after the loop.
        if y_hat_mel is not None:
            image_dict.update({
                "生成器测试数据/梅尔频谱图": utils.plot_spectrogram_to_numpy(y_hat_mel[0].numpy()),
                "地标真实数据/梅尔频谱图": utils.plot_spectrogram_to_numpy(mel[0].numpy())
            })
    utils.summarize(
        writer=writer_eval,
        global_step=global_step,
        images=image_dict,
        audios=audio_dict,
        audio_sampling_rate=hps.data.sampling_rate
    )
    generator.train()
a/trained_models/.ipynb_checkpoints/config-checkpoint.json b/trained_models/.ipynb_checkpoints/config-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..6f68cdb320aa4e79d80598bdb6b80bcfdc901e2c --- /dev/null +++ b/trained_models/.ipynb_checkpoints/config-checkpoint.json @@ -0,0 +1,95 @@ +{ + "train": { + "log_interval": 400, + "eval_interval": 800, + "seed": 1234, + "epochs": 114514, + "learning_rate": 0.0001, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-05, + "batch_size": 2, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 10240, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 512, + "port": "8001", + "keep_ckpts": 5 + }, + "data": { + "training_files": "filelists/train.txt", + "validation_files": "filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 44100, + "filter_length": 2048, + "hop_length": 512, + "win_length": 2048, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 22050 + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 8, + 8, + 2, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4, + 4 + ], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 256, + "n_speakers": 200 + }, + "spk": { + "azi": 0 + }, + "clean_logs": true, + "trainer": "admin" +} \ No newline at end of file diff --git a/trained_models/.ipynb_checkpoints/model_list-checkpoint.txt b/trained_models/.ipynb_checkpoints/model_list-checkpoint.txt new file mode 100644 index 0000000000000000000000000000000000000000..73ac4b9d2fa5855481f53ac16f9e8d4d6b4d96b8 --- /dev/null +++ 
b/trained_models/.ipynb_checkpoints/model_list-checkpoint.txt @@ -0,0 +1,3 @@ +纳西妲.pdparams +派蒙.pdparams +YH.pdparams \ No newline at end of file diff --git a/trained_models/YH.pdparams b/trained_models/YH.pdparams new file mode 100644 index 0000000000000000000000000000000000000000..803d02ab1e2e029a851ef4ac5dceef9b4971a75c --- /dev/null +++ b/trained_models/YH.pdparams @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0477da9b09c1bca1289a8ba013f840423a002f4edcc01a1e773371c8f4c74ede +size 430133781 diff --git a/trained_models/config.json b/trained_models/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6f68cdb320aa4e79d80598bdb6b80bcfdc901e2c --- /dev/null +++ b/trained_models/config.json @@ -0,0 +1,95 @@ +{ + "train": { + "log_interval": 400, + "eval_interval": 800, + "seed": 1234, + "epochs": 114514, + "learning_rate": 0.0001, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-05, + "batch_size": 2, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 10240, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 512, + "port": "8001", + "keep_ckpts": 5 + }, + "data": { + "training_files": "filelists/train.txt", + "validation_files": "filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 44100, + "filter_length": 2048, + "hop_length": 512, + "win_length": 2048, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 22050 + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 8, + 8, + 2, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4, + 4 + ], + "n_layers_q": 3, + "use_spectral_norm": 
false, + "gin_channels": 256, + "ssl_dim": 256, + "n_speakers": 200 + }, + "spk": { + "azi": 0 + }, + "clean_logs": true, + "trainer": "admin" +} \ No newline at end of file diff --git a/trained_models/model_list.txt b/trained_models/model_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..73ac4b9d2fa5855481f53ac16f9e8d4d6b4d96b8 --- /dev/null +++ b/trained_models/model_list.txt @@ -0,0 +1,3 @@ +纳西妲.pdparams +派蒙.pdparams +YH.pdparams \ No newline at end of file diff --git "a/trained_models/\346\264\276\350\222\231.pdparams" "b/trained_models/\346\264\276\350\222\231.pdparams" new file mode 100644 index 0000000000000000000000000000000000000000..6169bd345b223b4388776d2c255bbd7800da20db --- /dev/null +++ "b/trained_models/\346\264\276\350\222\231.pdparams" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02ee9416feafb3e206712e7ef2008e5c040ae9fff02368cd8fc03069c368a0d9 +size 180707709 diff --git "a/trained_models/\347\272\263\350\245\277\345\246\262.pdparams" "b/trained_models/\347\272\263\350\245\277\345\246\262.pdparams" new file mode 100644 index 0000000000000000000000000000000000000000..d3183c9a103890c9fe276d16f4874c2a177487fc --- /dev/null +++ "b/trained_models/\347\272\263\350\245\277\345\246\262.pdparams" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4706f1979788b028aadac31ccee3b2f1ef5675b3a10f4e6f88ff5773291c4341 +size 430133781 diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6f33abd40a1ebee5931ac64228aadd2a392d7b13 --- /dev/null +++ b/utils.py @@ -0,0 +1,524 @@ +import os +import glob +import re +import sys +import argparse +import logging +import json +import subprocess +import random + +import visualdl +import librosa +import numpy as np +from scipy.io.wavfile import read +import paddle +import requests + +from paddle.nn import functional as F +from modules.commons import sequence_mask +#from hubert import hubert_model 
+MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.INFO) +logger = logging + +f0_bin = 256 +f0_max = 1100.0 +f0_min = 50.0 +f0_mel_min = float(1127 * np.log(1 + f0_min / 700)) +f0_mel_max = float(1127 * np.log(1 + f0_max / 700)) + +def normalize_f0(f0, x_mask, uv, random_scale = True): + # calculate means based on x_mask + uv_sum = paddle.sum(uv, axis = 1, keepdim = True) + uv_sum[uv_sum == 0] = 9999 + means = paddle.sum(f0[:, 0, :] * uv, axis = 1, keepdim = True) / uv_sum + + if random_scale: + factor = paddle.zeros((f0.shape[0], 1)).uniform_(0.8, 1.2) + else: + factor = paddle.ones([f0.shape[0], 1]) + # normalize f0 based on means and factor + f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1) + if paddle.isnan(f0_norm).any(): # 如果存在非数字 + print('utils.py:44行:存在非数字,退出。') + exit(0) + return f0_norm * x_mask + + +def plot_data_to_numpy(x, y): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10, 2)) + plt.plot(x) + plt.plot(y) + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + + +def interpolate_f0(f0): + ''' + 对F0进行插值处理 + ''' + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 
1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] + last_value = data[i] + + return ip_data[:,0], vuv_vector[:,0] + + +def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512): + import parselmouth + x = wav_numpy + if p_len is None: + p_len = x.shape[0]//hop_length + else: + assert abs(p_len-x.shape[0]//hop_length) < 4, "pad length error" + time_step = hop_length / sampling_rate * 1000 + f0_min = 50 + f0_max = 1100 + f0 = parselmouth.Sound(x, sampling_rate).to_pitch_ac( + time_step=time_step / 1000, voicing_threshold=0.6, + pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] + + pad_size=(p_len - len(f0) + 1) // 2 + if(pad_size>0 or p_len - len(f0) - pad_size>0): + f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') + return f0 + +def resize_f0(x, target_len): + source = np.array(x) + source[source<0.001] = np.nan + target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source) + res = np.nan_to_num(target) + return res + +def compute_f0_dio(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512): + import pyworld + if p_len is None: + p_len = wav_numpy.shape[0]//hop_length + f0, t = pyworld.dio( + wav_numpy.astype(np.double), + fs=sampling_rate, + f0_ceil=800, + frame_period=1000 * hop_length / sampling_rate, + ) + f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + return resize_f0(f0, p_len) + +def 导出的时候才能跑通的f0_to_coarse(f0): + f0_mel_min = paddle.to_tensor(float(1127 * np.log(1 + f0_min / 700))) + f0_mel_max = paddle.to_tensor(float(1127 * np.log(1 + f0_max / 700))) + #is_paddle = isinstance(f0, paddle.Tensor) + f0_mel = 1127 * (1 + f0 / 700).log()# if is_paddle else 1127 * np.log(1 + f0 / 700) + greater:paddle.Tensor = 
f0_mel > 0 + a1 = paddle.masked_select(f0_mel,greater) + a = paddle.subtract(a1 , f0_mel_min) + b = paddle.to_tensor(f0_bin - 2,dtype = 'float32') + c = paddle.subtract(f0_mel_max , f0_mel_min) + left = paddle.to_tensor(a.astype('float32') * b.astype('float32') / c.astype('float32') + 1) + right = f0_mel + f0_mel = paddle.where(greater, left, right) + + less_equal = paddle.less_equal(f0_mel , paddle.to_tensor(1.)) + f0_mel = paddle.where(less_equal,paddle.to_tensor(1.),f0_mel) # float + greater = paddle.greater_than(f0_mel , paddle.to_tensor(f0_bin - 1,dtype = 'float32')) + f0_mel = paddle.where(greater,paddle.to_tensor(f0_bin - 1,dtype = 'float32'),f0_mel) + f0_coarse = (f0_mel + 0.5).astype('int64') #if is_paddle else np.rint(f0_mel).astype(np.int) # 改了 + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) + return f0_coarse + +def f0_to_coarse(f0): + is_paddle = isinstance(f0, paddle.Tensor) + f0_mel = 1127 * (1 + f0 / 700).log() if is_paddle else 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 + + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + f0_coarse = (f0_mel + 0.5).astype('int64') #if is_paddle else np.rint(f0_mel).astype(np.int) # 改了 + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) + return f0_coarse +import os + +def get_hubert_model(): + if os.getcwd() == "/home/aistudio": + vec_path = f"{os.getcwd()}/build/hubert/hubert4.0.onnx" + else: + vec_path = f"{os.getcwd()}/hubert/hubert4.0.onnx" + import onnxruntime as ort + print("从{}加载模型".format(vec_path)) + model = ort.InferenceSession(vec_path,providers=[ 'CUDAExecutionProvider', 'CPUExecutionProvider']) + return model + +def get_hubert_content(hmodel, wav_16k_tensor) -> paddle.Tensor: # 传入的模型和声音数组 + feats = wav_16k_tensor + if feats.dim() == 2: # 双通道 + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + 
feats = feats.reshape((1, 1, -1)).numpy() + outputs = hmodel.run( + None, + {"source": feats.astype(np.float32)}, + )[0] + return paddle.to_tensor(outputs.transpose((0,2,1))) + +def load_checkpoint(checkpoint_path, model, optimizer:paddle.optimizer.Optimizer=None, skip_optimizer:bool=False): + # assert os.path.isfile(checkpoint_path) + checkpoint_dict = paddle.load(checkpoint_path) + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + try: + trainers = checkpoint_dict["trainers"] + except: + trainers = ['最初作者信息丢失'] + if optimizer is not None and not skip_optimizer and checkpoint_dict['optimizer'] is not None: + optimizer.set_state_dict(checkpoint_dict['optimizer']) + saved_state_dict = checkpoint_dict['model'] + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + # assert "dec" in k or "disc" in k + # print("load", k) + new_state_dict[k] = saved_state_dict[k] # 改过的 + assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape) + except Exception as e: + print(e) + print("错误,%s 不在检查点里面" % k) + logger.info("%s 不在检查点里面" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.set_state_dict(new_state_dict) + else: + model.set_state_dict(new_state_dict) + logger.info("加载检查点 '{}' (迭代次数 {})".format( + checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration, trainers + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path, trainers:list[str]): + logger.info("保存模型和优化器状态位于迭代次数{} 到 {}".format( + iteration, checkpoint_path)) + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + paddle.save( + {'model': state_dict, + 'iteration': iteration, + 'optimizer': optimizer.state_dict(), + 'learning_rate': learning_rate, + 'trainers':trainers + }, + checkpoint_path,) + 
+def clean_checkpoints(path_to_models='logs/44k/', n_ckpts_to_keep=2, sort_by_time=True): + """通过删除保存的检查点来释放空间 + + 参数: + path_to_models -- 模型路径 + n_ckpts_to_keep -- 要保留的检查点数量,不包括G_0.pdparams和D_0.pdparams + sort_by_time -- True -> 按时间顺序删除检查点 + False -> 按字典顺序删除检查点 + """ + ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))] + name_key = (lambda _f: int(re.compile('._(\d+)\.pdparams').match(_f).group(1))) + time_key = (lambda _f: os.path.getmtime(os.path.join(path_to_models, _f))) + sort_key = time_key if sort_by_time else name_key + x_sorted = lambda _x: sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith('_0.pdparams')], key=sort_key) + to_del = [os.path.join(path_to_models, fn) for fn in + (x_sorted('G')[:-n_ckpts_to_keep] + x_sorted('D')[:-n_ckpts_to_keep])] + del_info = lambda fn: logger.info(f".. 通过删除模型 {fn} 来释放空间") + del_routine = lambda x: [os.remove(x), del_info(x)] + rs = [del_routine(fn) for fn in to_del] + +def summarize(writer:visualdl.writer.writer.LogWriter, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): + for k, v in scalars.items(): + writer.add_scalar(tag = k, value = v, step = global_step) + for k, v in histograms.items(): + writer.add_histogram(tag = k, values = v, step = global_step) + for k, v in images.items(): + writer.add_image(tag = k, img = v, step = global_step, dataformats='HWC') + for k, v in audios.items(): + writer.add_audio(tag = k, audio_array = v.numpy(), step = global_step, sample_rate = audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pdparams"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = 
logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10,2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', + interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return paddle.to_tensor(data.astype(np.float32), dtype = 'float32'), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding='utf-8') as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init:bool=True): + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, default="./configs/base.json", + help='JSON file for configuration') + parser.add_argument('-m', '--model', type=str, required=True, + help='Model name') 
+ + args = parser.parse_args() + model_dir = os.path.join("./logs", args.model) + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + config_path = args.config + config_save_path = os.path.join(model_dir, "config.json") + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + +def get_hparams_no_args(parser): + + args = parser.parse_args() + model_dir = args.path + + config_path = args.config_path + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn("{}不是git存储库,因此将忽略哈希值比较。".format( + source_dir + )) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn("git散列值不同。{}(已保存)!={}(当前)".format( + saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = 
logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + +def download_file(url, save_path): + response = requests.get(url) + if response.status_code == 200: + with open(save_path, 'wb') as file: + file.write(response.content) + print(f"File downloaded successfully and saved to: {save_path}") + else: + print(f"Failed to download the file from: {url}") + +def repeat_expand_2d(content:paddle.Tensor, target_len): + # content : [h, t] + + src_len = content.shape[-1] + target = paddle.zeros([content.shape[0], target_len], dtype='float32').cpu() \ + if 'cpu' in str(content.place) \ + else paddle.zeros([content.shape[0], target_len], dtype='float32').cuda() + temp = paddle.arange(src_len+1) * target_len / src_len + current_pos = 0 + for i in range(target_len): + if i < temp[current_pos+1]: + target[:, i] = content[:, current_pos] + else: + current_pos += 1 + target[:, i] = content[:, current_pos] + + return target + +# 含有所有前面存下来的输入超参 +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() + diff --git a/vdecoder/__init__.py b/vdecoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vdecoder/hifigan/env.py 
b/vdecoder/hifigan/env.py new file mode 100644 index 0000000000000000000000000000000000000000..2bdbc95d4f7a8bad8fd4f5eef657e2b51d946056 --- /dev/null +++ b/vdecoder/hifigan/env.py @@ -0,0 +1,15 @@ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/vdecoder/hifigan/models.py b/vdecoder/hifigan/models.py new file mode 100644 index 0000000000000000000000000000000000000000..b9ec779218bfc160985b4976f49d5155b465fbab --- /dev/null +++ b/vdecoder/hifigan/models.py @@ -0,0 +1,508 @@ +import os +import json +from .env import AttrDict +import numpy as np +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +from paddle.nn import Conv1D, Conv1DTranspose, AvgPool1D, Conv2D +from paddle.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .utils import init_weights, get_padding + +LRELU_SLOPE = 0.1 + + +def load_model(model_path, device='gpu:0'): + config_file = os.path.join(os.path.split(model_path)[0], 'config.json') + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + h = AttrDict(json_config) + + generator = Generator(h).to(device) + + cp_dict = paddle.load(model_path) + generator.set_state_dict(cp_dict['generator']) + generator.eval() + generator.remove_weight_norm() + del cp_dict + return generator, h + + +class ResBlock1(nn.Layer): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.LayerList([ + weight_norm(Conv1D(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1D(channels, channels, kernel_size, 1, 
dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1D(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + self.convs2 = nn.LayerList([ + weight_norm(Conv1D(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1D(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1D(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(paddle.nn.Layer): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.h = h + self.convs = nn.LayerList([ + weight_norm(Conv1D(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1D(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +def padDiff(x): + x = x.unsqueeze(0) # 为了能用paddle的pad函数 + pad = F.pad(x, (0,0,-1,1), 'constant', 0) + out = F.pad(pad - x, (0,0,0,-1), 'constant', 0) + out = out.squeeze(0) + return out + +class SineGen(paddle.nn.Layer): + """ Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + 
voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__(self, samp_rate, harmonic_num=0, + sine_amp=0.1, noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + self.flag_for_pulse = flag_for_pulse + + def _f02uv(self, f0): + # generate uv signal + uv = (f0 > self.voiced_threshold).astype('float32') + return uv + + def _f02sine(self, f0_values): + """ f0_values: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = paddle.rand((f0_values.shape[0], f0_values.shape[2])) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + if not self.flag_for_pulse: + # for normal case + + # To prevent torch.cumsum numerical overflow, + # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. + # Buffer tmp_over_one_idx indicates the time step to add -1. 
+ # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi + tmp_over_one = paddle.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (padDiff(tmp_over_one)) < 0 + cumsum_shift = paddle.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = paddle.sin(paddle.cumsum(rad_values + cumsum_shift, axis=1) + * 2 * np.pi) + else: + # If necessary, make sure that the first time step of every + # voiced segments is sin(pi) or cos(0) + # This is used for pulse-train generation + + # identify the last time step in unvoiced segments + uv = self._f02uv(f0_values) + uv_1 = paddle.roll(uv, shifts=-1, axis=1) + uv_1[:, -1, :] = 1 + u_loc = (uv < 1) * (uv_1 > 0) + + # get the instantanouse phase + tmp_cumsum = paddle.cumsum(rad_values, axis=1) + # different batch needs to be processed differently + for idx in range(f0_values.shape[0]): + temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] + temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] + # stores the accumulation of i.phase within + # each voiced segments + tmp_cumsum[idx, :, :] = 0 + tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum + + # rad_values - tmp_cumsum: remove the accumulation of i.phase + # within the previous voiced segment. 
+ i_phase = paddle.cumsum(rad_values - tmp_cumsum, axis=1) + + # get the sines + sines = paddle.cos(i_phase * 2 * np.pi) + return sines + + def forward(self, f0): + """ sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with paddle.no_grad(): + f0_buf = paddle.zeros((f0.shape[0], f0.shape[1], self.dim,)) + #device=f0.device) + # fundamental component + fn = paddle.multiply(f0, paddle.to_tensor([[range(1, self.harmonic_num + 2)]],dtype = 'float32')) + + # generate sine waveforms + sine_waves = self._f02sine(fn) * self.sine_amp + + # generate uv signal + # uv = torch.ones(f0.shape) + # uv = uv * (f0 > self.voiced_threshold) + uv = self._f02uv(f0) + + # noise: for unvoiced should be similar to sine_amp + # std = self.sine_amp/3 -> max value ~ self.sine_amp + # . for voiced regions is self.noise_std + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * paddle.randn(sine_waves.shape) + + # first: set the unvoiced part to 0 by uv + # then: additive noise + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(paddle.nn.Layer): + """ SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__(self, 
sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGen(sampling_rate, harmonic_num, + sine_amp, add_noise_std, voiced_threshod) + + # to merge source harmonics into a single excitation + self.l_linear = paddle.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = paddle.nn.Tanh() + + def forward(self, x): + """ + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + """ + # source for harmonic branch + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + # source for noise branch, in the same shape as uv + noise = paddle.randn(uv.shape) * self.sine_amp / 3 + return sine_merge, noise, uv + + +class Generator(paddle.nn.Layer): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + + self.num_kernels = len(h["resblock_kernel_sizes"]) + self.num_upsamples = len(h["upsample_rates"]) + self.f0_upsamp = paddle.nn.Upsample(scale_factor=paddle.prod(paddle.to_tensor(h["upsample_rates"]),dtype = 'float32'),data_format = 'NCW',mode = 'linear') + self.m_source = SourceModuleHnNSF( + sampling_rate=h["sampling_rate"], + harmonic_num=8) + self.noise_convs = nn.LayerList() + self.conv_pre = weight_norm(Conv1D(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) + resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2 + self.ups = nn.LayerList() + for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])): + c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) + self.ups.append(weight_norm( + Conv1DTranspose(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), + k, u, padding=(k - u) // 2))) + if i + 1 < len(h["upsample_rates"]): # + 
@paddle.jit.to_static
def forward(self, x, f0, g=None):
    """Synthesize a waveform from input features and an F0 contour.

    Args:
        x: feature tensor fed to ``conv_pre``.
        f0: F0 contour; a channel axis is inserted with ``f0[:, None]``
            before upsampling (presumably (batch, frames) -- TODO confirm).
        g: optional conditioning tensor projected by ``self.cond``. When it
            is ``None`` the conditioning term is skipped; previously the
            default value crashed inside the conv layer.

    Returns:
        The ``conv_post`` output squashed with ``tanh``.
    """
    # Upsample F0 by the configured factor, then swap the last two axes
    # so the source module receives the time axis second.
    f0 = self.f0_upsamp(f0[:, None])
    f0 = f0.transpose([0, 2, 1])
    har_source, noi_source, uv = self.m_source(f0)
    har_source = har_source.transpose([0, 2, 1])

    x = self.conv_pre(x)
    if g is not None:
        x = x + self.cond(g)

    for i in range(self.num_upsamples):
        x = F.leaky_relu(x, LRELU_SLOPE)
        x = self.ups[i](x)
        # Inject the harmonic source, brought to this stage by noise_convs[i].
        x_source = self.noise_convs[i](har_source)
        x = x + x_source
        # Average the outputs of this stage's parallel residual blocks.
        xs = None
        for j in range(self.num_kernels):
            if xs is None:
                xs = self.resblocks[i * self.num_kernels + j](x)
            else:
                xs += self.resblocks[i * self.num_kernels + j](x)
        x = xs / self.num_kernels
    x = F.leaky_relu(x)
    x = self.conv_post(x)
    x = paddle.tanh(x)

    return x
def forward(self, x):
    """Score a waveform after folding it into a (frames, period) 2-D layout.

    Args:
        x: 3-D tensor whose shape is unpacked as ``(b, c, t)``.

    Returns:
        A tuple ``(score, fmap)``: the ``conv_post`` output flattened from
        axis 1 onward, and the list of activations collected after every
        conv layer plus the un-flattened ``conv_post`` output.
    """
    fmap = []

    # 1d to 2d
    b, c, t = x.shape
    if t % self.period != 0:  # pad first so t is a multiple of the period
        n_pad = self.period - (t % self.period)
        # A 3-D input needs the "NCL" layout spelled out for Paddle's pad.
        x = F.pad(x, [0, n_pad], mode="reflect", data_format="NCL")
        t = t + n_pad
    # Paddle reshapes with a single shape list; the torch-style
    # ``x.view(b, c, h, w)`` call does not match Paddle's Tensor API.
    x = x.reshape([b, c, t // self.period, self.period])

    for l in self.convs:
        x = l(x)
        x = F.leaky_relu(x, LRELU_SLOPE)
        fmap.append(x)
    x = self.conv_post(x)
    fmap.append(x)
    x = paddle.flatten(x, 1, -1)

    return x, fmap
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference of paired feature maps.

    Args:
        fmap_r: per-discriminator lists of feature maps for the real input.
        fmap_g: per-discriminator lists of feature maps for the generated
            input, paired element-wise with ``fmap_r``.

    Returns:
        The accumulated loss times 2; the integer 0 when nothing is paired.
    """
    layer_pairs = (
        (real_layer, fake_layer)
        for real_maps, fake_maps in zip(fmap_r, fmap_g)
        for real_layer, fake_layer in zip(real_maps, fake_maps)
    )
    total = sum(
        paddle.mean(paddle.abs(real_layer - fake_layer))
        for real_layer, fake_layer in layer_pairs
    )
    return total * 2
def generator_loss(disc_outputs):
    """Compute the least-squares generator loss over discriminator outputs.

    Args:
        disc_outputs: iterable of discriminator scores for generated audio.

    Returns:
        A tuple ``(loss, gen_losses)``: the sum of ``mean((1 - score) ** 2)``
        over all outputs (the integer 0 for an empty input) and the list of
        the individual terms, kept as tensors.
    """
    gen_losses = [
        paddle.mean((1 - fake_score) ** 2) for fake_score in disc_outputs
    ]
    total = sum(gen_losses)
    return total, gen_losses
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress *x* after flooring it at *clip_val* (NumPy version).

    Args:
        x: value or array to compress.
        C: multiplier applied after clipping and before the logarithm.
        clip_val: lower bound applied to ``x`` so the logarithm stays finite.

    Returns:
        ``log(clip(x, clip_val) * C)`` computed with NumPy.
    """
    floored = np.clip(x, a_min=clip_val, a_max=None)
    scaled = floored * C
    return np.log(scaled)
def load_checkpoint(filepath, device):
    """Load a checkpoint file with ``paddle.load``.

    Args:
        filepath: path of the checkpoint file; it must exist.
        device: accepted for backward compatibility and ignored.
            ``paddle.load`` has no ``map_location`` option (that keyword is
            a PyTorch one), so it is no longer forwarded.

    Returns:
        The object returned by ``paddle.load``.

    Raises:
        AssertionError: if ``filepath`` is not an existing file.
    """
    assert os.path.isfile(filepath)
    print("加载'{}'".format(filepath))
    checkpoint_dict = paddle.load(filepath)
    print("完成。")
    return checkpoint_dict
def build_env(config, config_name, path):
    """Copy the config file into *path* unless it already is that file.

    Args:
        config: path of the source config file.
        config_name: file name to give the copy inside ``path``.
        path: destination directory; created when missing.
    """
    destination = os.path.join(path, config_name)
    if config == destination:
        # The config already sits at the target location; nothing to do.
        return
    os.makedirs(path, exist_ok=True)
    shutil.copyfile(config, destination)
def load_model(model_path, device='gpu:0'):
    """Build a ``Generator`` from a checkpoint and its sibling ``config.json``.

    The configuration is read from ``config.json`` in the same directory as
    ``model_path`` and is also published as the module-level global ``h``.

    Args:
        model_path: path of the checkpoint; its ``'generator'`` entry is
            loaded into the model.
        device: device string passed to the layer's ``to`` method.

    Returns:
        A tuple ``(generator, h)``: the generator in eval mode with weight
        norm removed, and the ``AttrDict`` configuration.
    """
    global h

    model_dir = os.path.dirname(model_path)
    with open(os.path.join(model_dir, 'config.json')) as config_fp:
        h = AttrDict(json.load(config_fp))

    generator = Generator(h).to(device)
    generator.set_state_dict(paddle.load(model_path)['generator'])
    generator.eval()
    generator.remove_weight_norm()
    return generator, h
def padDiff(x):
    """Return the shifted difference of *x* along its second axis.

    A leading axis is added so the tensor can go through Paddle's pad
    function, which is used here with negative pads to shift and crop.
    Presumably this yields ``x[:, t + 1] - x[:, t]`` for a 3-D input
    (batch, length, dim) -- TODO confirm against the installed Paddle.

    Unlike the earlier in-place version, the caller's tensor is left
    untouched: ``unsqueeze_`` used to leave ``x`` with an extra leading axis.

    Args:
        x: input tensor; not modified.

    Returns:
        The difference tensor with the temporary leading axis removed.
    """
    expanded = x.unsqueeze(0)  # so that paddle's pad function can be used
    # Drop the first step and append a zero step: a one-step left shift.
    shifted = F.pad(expanded, (0, 0, -1, 1), 'constant', 0)
    # Crop the last step, whose shifted partner is only padding.
    out = F.pad(shifted - expanded, (0, 0, 0, -1), 'constant', 0)
    return out.squeeze(0)
indicates fundamental tone and overtones + """ + # convert to F0 in rad. The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = paddle.rand((f0_values.shape[0], f0_values.shape[2])) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + if not self.flag_for_pulse: + # for normal case + + # To prevent torch.cumsum numerical overflow, + # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. + # Buffer tmp_over_one_idx indicates the time step to add -1. + # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi + tmp_over_one = paddle.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (padDiff(tmp_over_one)) < 0 + cumsum_shift = paddle.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = paddle.sin(paddle.cumsum(rad_values + cumsum_shift, axis=1) + * 2 * np.pi) + else: + # If necessary, make sure that the first time step of every + # voiced segments is sin(pi) or cos(0) + # This is used for pulse-train generation + + # identify the last time step in unvoiced segments + uv = self._f02uv(f0_values) + uv_1 = paddle.roll(uv, shifts=-1, axis=1) + uv_1[:, -1, :] = 1 + u_loc = (uv < 1) * (uv_1 > 0) + + # get the instantanouse phase + tmp_cumsum = paddle.cumsum(rad_values, axis=1) + # different batch needs to be processed differently + for idx in range(f0_values.shape[0]): + temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] + temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] + # stores the accumulation of i.phase within + # each voiced segments + tmp_cumsum[idx, :, :] = 0 + tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum + + # rad_values - tmp_cumsum: remove the accumulation of i.phase + # within the previous voiced segment. 
def forward(self, f0):
    """Generate the noisy harmonic sine source for an F0 track.

    Args:
        f0: F0 tensor. The original docstring gives its layout as
            (batchsize=1, length, dim=1), with 0 on unvoiced steps.

    Returns:
        A tuple ``(sine_waves, uv, noise)``:
        sine_waves -- sine source with ``harmonic_num + 1`` channels
            (fundamental plus overtones), masked by ``uv`` and with noise added;
        uv -- voiced/unvoiced mask produced by ``_f02uv``;
        noise -- the Gaussian noise tensor that was added to the sines.
    """
    with paddle.no_grad():
        # Scale F0 by 1 .. harmonic_num + 1 to obtain the fundamental and
        # every overtone in one broadcasted multiply.
        harmonic_factors = paddle.to_tensor(
            [[range(1, self.harmonic_num + 2)]], dtype='float32')
        fn = paddle.multiply(f0, harmonic_factors)

        # generate sine waveforms
        sine_waves = self._f02sine(fn) * self.sine_amp

        # generate uv signal
        uv = self._f02uv(f0)

        # Noise std: noise_std in voiced regions, sine_amp / 3 in unvoiced
        # regions (so its peak is roughly sine_amp).
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * paddle.randn(sine_waves.shape)

        # First zero the unvoiced part through uv, then add the noise.
        sine_waves = sine_waves * uv + noise
    return sine_waves, uv, noise
def forward(self, x):
    """Turn an F0 track into the harmonic and noise excitation sources.

    Args:
        x: F0 tensor handed to the sine generator; the original docstring
            describes it as F0_sampled (batchsize, length, 1).

    Returns:
        A tuple ``(sine_merge, noise, uv)``: the harmonics merged by the
        linear layer and passed through tanh, Gaussian noise shaped like
        ``uv`` and scaled by ``sine_amp / 3``, and the voiced/unvoiced mask
        returned by the sine generator.
    """
    # Harmonic branch: the generator's own noise output is not used here.
    harmonics, uv, _ = self.l_sin_gen(x)
    merged = self.l_linear(harmonics)
    harmonic_source = self.l_tanh(merged)

    # Noise branch: fresh Gaussian noise in the same shape as uv.
    noise_source = paddle.randn(uv.shape) * self.sine_amp / 3
    return harmonic_source, noise_source, uv
def forward(self, x, f0, g=None):
    """Synthesize a waveform from input features and an F0 contour.

    Args:
        x: feature tensor fed to ``conv_pre``.
        f0: F0 contour; a channel axis is inserted with ``f0[:, None]``
            before upsampling (presumably (batch, frames) -- TODO confirm).
        g: optional conditioning tensor projected by ``self.cond``. When it
            is ``None`` the conditioning term is skipped; previously the
            default value crashed inside the conv layer.

    Returns:
        The ``conv_post`` output squashed with ``tanh``.
    """
    # Upsample F0 by the configured factor, then swap the last two axes
    # so the source module receives the time axis second.
    f0 = self.f0_upsamp(f0[:, None])
    f0 = f0.transpose([0, 2, 1])
    har_source, noi_source, uv = self.m_source(f0)
    har_source = har_source.transpose([0, 2, 1])

    x = self.conv_pre(x)
    if g is not None:
        x = x + self.cond(g)

    for i in range(self.num_upsamples):
        x = F.leaky_relu(x, LRELU_SLOPE)
        x = self.ups[i](x)
        # Inject the harmonic source, brought to this stage by noise_convs[i].
        x_source = self.noise_convs[i](har_source)
        x = x + x_source
        # Average the outputs of this stage's parallel residual blocks.
        xs = None
        for j in range(self.num_kernels):
            if xs is None:
                xs = self.resblocks[i * self.num_kernels + j](x)
            else:
                xs += self.resblocks[i * self.num_kernels + j](x)
        x = xs / self.num_kernels
    x = F.leaky_relu(x)
    x = self.conv_post(x)
    x = paddle.tanh(x)

    return x
class MultiPeriodDiscriminator(paddle.nn.Layer):
    """Bank of ``DiscriminatorP`` layers, one per period.

    Args:
        periods: periods to build discriminators for; ``[2, 3, 5, 7, 11]``
            when ``None``.
    """

    def __init__(self, periods=None):
        super().__init__()
        if periods is None:
            periods = [2, 3, 5, 7, 11]
        self.periods = periods
        self.discriminators = nn.LayerList(
            [DiscriminatorP(period) for period in self.periods]
        )

    def forward(self, y, y_hat):
        """Run every period discriminator on the real and generated inputs.

        Args:
            y: real input passed to each discriminator.
            y_hat: generated input passed to each discriminator.

        Returns:
            Four lists, one entry per discriminator: real scores, generated
            scores, real feature maps and generated feature maps.
        """
        real_scores, fake_scores = [], []
        real_fmaps, fake_fmaps = [], []
        for disc in self.discriminators:
            score_r, fmap_r = disc(y)
            score_g, fmap_g = disc(y_hat)
            real_scores.append(score_r)
            real_fmaps.append(fmap_r)
            fake_scores.append(score_g)
            fake_fmaps.append(fmap_g)

        return real_scores, fake_scores, real_fmaps, fake_fmaps
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Compute the least-squares discriminator loss.

    Args:
        disc_real_outputs: discriminator scores for real audio.
        disc_generated_outputs: discriminator scores for generated audio,
            paired element-wise with ``disc_real_outputs``.

    Returns:
        A tuple ``(loss, r_losses, g_losses)``: the summed loss (the integer
        0 when there are no pairs) and the per-discriminator real and
        generated terms converted with ``.item()``.
    """
    terms = [
        (paddle.mean((1 - real_score) ** 2), paddle.mean(fake_score ** 2))
        for real_score, fake_score in zip(disc_real_outputs,
                                          disc_generated_outputs)
    ]

    total = 0
    for real_term, fake_term in terms:
        total += (real_term + fake_term)

    r_losses = [real_term.item() for real_term, _ in terms]
    g_losses = [fake_term.item() for _, fake_term in terms]
    return total, r_losses, g_losses
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
    """Read the first channel of an audio file as a normalized float32 tensor.

    Args:
        full_path: path handed to ``soundfile.read``.
        target_sr: when given and different from the file's rate, the audio
            is resampled to it with librosa.
        return_empty_on_exception: when true, a read failure or inf/NaN
            samples return ``([], rate)`` instead of raising / continuing.

    Returns:
        A tuple ``(data, sampling_rate)``. ``data`` is a 1-D paddle tensor
        divided by the detected full-scale magnitude, or ``[]`` in the
        best-effort cases above.

    Raises:
        Exception: the original read error is re-raised unchanged (type and
            traceback preserved) when ``return_empty_on_exception`` is false.
        AssertionError: if the file holds 2 samples or fewer.
    """
    sampling_rate = None
    try:
        data, sampling_rate = sf.read(full_path, always_2d=True)
    except Exception as ex:
        print(f"读取'{full_path}'失败\n异常:")
        print(ex)
        if return_empty_on_exception:
            return [], sampling_rate or target_sr or 32000
        # Re-raise as-is so callers still see the real exception type.
        raise

    if len(data.shape) > 1:
        data = data[:, 0]  # keep the first channel only
        # More than 2 samples are required; a shorter result would suggest
        # the slice above acted on the wrong dimension.
        assert len(data) > 2

    if np.issubdtype(data.dtype, np.integer):
        # Integer audio: full scale is the magnitude of the dtype's minimum.
        max_mag = -np.iinfo(data.dtype).min
    else:
        # Float audio: guess whether the values are 32-bit-int scaled,
        # 16-bit-int scaled, or already within [-1, 1].
        max_mag = max(np.amax(data), -np.amin(data))
        max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0)

    data = paddle.to_tensor(data.astype(np.float32), dtype='float32') / max_mag

    # Resampling would crash on inf/NaN input, so bail out first when the
    # caller asked for best-effort behavior.
    if (paddle.isinf(data) | paddle.isnan(data)).any() and return_empty_on_exception:
        return [], sampling_rate or target_sr or 32000
    if target_sr is not None and sampling_rate != target_sr:
        data = paddle.to_tensor(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
        sampling_rate = target_sr

    return data, sampling_rate
def plot_spectrogram(spectrogram):
    """Draw ``spectrogram`` as an image with a colorbar and return the figure.

    The figure is rendered once and then closed so it is not kept open by
    pyplot; the returned object can still be passed on by the caller.
    """
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)

    fig.canvas.draw()
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

    return fig


def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise the weight of convolution layers from N(mean, std).

    Layers whose class name does not contain "Conv" are left untouched, so
    the function can be passed to ``Layer.apply``.
    """
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        # Overwrite the values in place: assigning a plain Tensor to a
        # registered parameter attribute is rejected by paddle.nn.Layer.
        new_weight = paddle.normal(mean, std, m.weight.shape)
        m.weight.set_value(new_weight.astype(m.weight.dtype))


def apply_weight_norm(m):
    """Wrap convolution layers (class name contains "Conv") with weight norm."""
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        weight_norm(m)


def get_padding(kernel_size, dilation=1):
    """Return the padding ``int((kernel_size*dilation - dilation) / 2)``."""
    return int((kernel_size*dilation - dilation)/2)


def load_checkpoint(filepath, device):
    """Load a checkpoint saved with ``paddle.save``.

    Args:
        filepath: Path of an existing checkpoint file.
        device: Accepted for interface compatibility only; ``paddle.load``
            has no ``map_location`` option and rejects unknown keyword
            arguments, so the value is not forwarded.

    Returns:
        The object stored in the checkpoint.

    Raises:
        AssertionError: If ``filepath`` is not an existing file.
    """
    assert os.path.isfile(filepath)
    print("加载'{}'".format(filepath))
    checkpoint_dict = paddle.load(filepath)
    print("完成。")
    return checkpoint_dict


def save_checkpoint(filepath, obj):
    """Save ``obj`` to ``filepath`` with ``paddle.save``, logging progress."""
    print("保存检查点到{}".format(filepath))
    paddle.save(obj, filepath)
    print("完成。")


def del_old_checkpoints(cp_dir, prefix, n_models=2):
    """Delete all but the newest ``n_models`` checkpoints in ``cp_dir``.

    Checkpoints are the paths matching ``prefix`` followed by exactly eight
    characters; "newest" means last in lexicographic order.

    Args:
        cp_dir: Directory that holds the checkpoints.
        prefix: File-name prefix of the checkpoint family.
        n_models: How many of the latest checkpoints to keep. ``0`` removes
            every matching checkpoint.

    Raises:
        ValueError: If ``n_models`` is negative.
    """
    if n_models < 0:
        raise ValueError("n_models must be >= 0, got {}".format(n_models))
    pattern = os.path.join(cp_dir, prefix + '????????')
    cp_list = sorted(glob.glob(pattern))  # sort by iteration number
    # Count from the front so that n_models == 0 removes everything
    # (a ``[:-0]`` slice would select nothing).
    n_stale = max(len(cp_list) - n_models, 0)
    for cp in cp_list[:n_stale]:
        # Empty the file before unlinking it, so a copy that is only moved
        # to a trash folder (e.g. on Colab) does not keep using disk space.
        with open(cp, 'w'):
            pass
        os.unlink(cp)


def scan_checkpoint(cp_dir, prefix):
    """Return the latest checkpoint path in ``cp_dir`` or ``None`` if absent.

    Candidates match ``prefix`` followed by exactly eight characters; the
    lexicographically greatest path is considered the latest.
    """
    pattern = os.path.join(cp_dir, prefix + '????????')
    return max(glob.glob(pattern), default=None)