10kwon committed on
Commit 2bfc29a
1 Parent(s): 10f66bf
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. batch.py +43 -0
  2. flask_api.py +54 -0
  3. infer.py +98 -0
  4. infer_tools/__init__.py +0 -0
  5. infer_tools/infer_tool.py +334 -0
  6. infer_tools/slicer.py +158 -0
  7. modules/commons/common_layers.py +671 -0
  8. modules/commons/espnet_positional_embedding.py +113 -0
  9. modules/commons/ssim.py +391 -0
  10. modules/fastspeech/fs2.py +255 -0
  11. modules/fastspeech/pe.py +149 -0
  12. modules/fastspeech/tts_modules.py +364 -0
  13. modules/hifigan/hifigan.py +365 -0
  14. modules/hifigan/mel_utils.py +80 -0
  15. modules/nsf_hifigan/env.py +15 -0
  16. modules/nsf_hifigan/models.py +549 -0
  17. modules/nsf_hifigan/nvSTFT.py +111 -0
  18. modules/nsf_hifigan/utils.py +67 -0
  19. modules/parallel_wavegan/__init__.py +0 -0
  20. modules/parallel_wavegan/layers/__init__.py +5 -0
  21. modules/parallel_wavegan/layers/causal_conv.py +56 -0
  22. modules/parallel_wavegan/layers/pqmf.py +129 -0
  23. modules/parallel_wavegan/layers/residual_block.py +129 -0
  24. modules/parallel_wavegan/layers/residual_stack.py +75 -0
  25. modules/parallel_wavegan/layers/tf_layers.py +129 -0
  26. modules/parallel_wavegan/layers/upsample.py +183 -0
  27. modules/parallel_wavegan/losses/__init__.py +1 -0
  28. modules/parallel_wavegan/losses/stft_loss.py +153 -0
  29. modules/parallel_wavegan/models/__init__.py +2 -0
  30. modules/parallel_wavegan/models/melgan.py +427 -0
  31. modules/parallel_wavegan/models/parallel_wavegan.py +434 -0
  32. modules/parallel_wavegan/models/source.py +538 -0
  33. modules/parallel_wavegan/optimizers/__init__.py +2 -0
  34. modules/parallel_wavegan/optimizers/radam.py +91 -0
  35. modules/parallel_wavegan/stft_loss.py +100 -0
  36. modules/parallel_wavegan/utils/__init__.py +1 -0
  37. modules/parallel_wavegan/utils/utils.py +169 -0
  38. network/diff/candidate_decoder.py +98 -0
  39. network/diff/diffusion.py +332 -0
  40. network/diff/net.py +135 -0
  41. network/hubert/hubert_model.py +276 -0
  42. network/hubert/vec_model.py +60 -0
  43. network/vocoders/__init__.py +2 -0
  44. network/vocoders/base_vocoder.py +39 -0
  45. network/vocoders/hifigan.py +83 -0
  46. network/vocoders/nsf_hifigan.py +92 -0
  47. network/vocoders/pwg.py +137 -0
  48. network/vocoders/vocoder_utils.py +15 -0
  49. preprocessing/SVCpre.py +63 -0
  50. preprocessing/base_binarizer.py +237 -0
batch.py ADDED
@@ -0,0 +1,43 @@
+ import soundfile
+
+ from infer_tools import infer_tool
+ from infer_tools.infer_tool import Svc
+
+
+ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise_step, project_name='', f_name=None,
+              file_path=None, out_path=None):
+     raw_audio_path = f_name
+     infer_tool.format_wav(raw_audio_path)
+     _f0_tst, _f0_pred, _audio = svc_model.infer(raw_audio_path, key=key, acc=acc, singer=True, use_pe=use_pe,
+                                                 use_crepe=use_crepe,
+                                                 thre=thre, use_gt_mel=use_gt_mel, add_noise_step=add_noise_step)
+     # note: the out_path argument is ignored; the output path is always derived from f_name
+     out_path = f'./singer_data/{f_name.split("/")[-1]}'
+     soundfile.write(out_path, _audio, 44100, 'PCM_16')
+
+
+ if __name__ == '__main__':
+     # project folder name (the one used during training)
+     project_name = "firefox"
+     model_path = f'./checkpoints/{project_name}/clean_model_ckpt_steps_100000.ckpt'
+     config_path = f'./checkpoints/{project_name}/config.yaml'
+
+     # multiple wav files are supported; place them in the ./batch folder, with file extensions
+     file_names = infer_tool.get_end_file("./batch", "wav")
+     trans = [-6]  # pitch shift in semitones (positive or negative), one entry per file; padded with the first value if fewer are given
+     # acceleration (PNDM speedup) factor
+     accelerate = 50
+     hubert_gpu = True
+     cut_time = 30
+
+     # no changes needed below this line
+     infer_tool.mkdir(["./batch", "./singer_data"])
+     infer_tool.fill_a_to_b(trans, file_names)
+
+     model = Svc(project_name, config_path, hubert_gpu, model_path)
+     count = 0
+     for f_name, tran in zip(file_names, trans):
+         print(f_name)
+         run_clip(model, key=tran, acc=accelerate, use_crepe=False, thre=0.05, use_pe=False, use_gt_mel=False,
+                  add_noise_step=500, f_name=f_name, project_name=project_name)
+         count += 1
+         print(f"process:{round(count * 100 / len(file_names), 2)}%")
flask_api.py ADDED
@@ -0,0 +1,54 @@
+ import io
+ import logging
+
+ import librosa
+ import soundfile
+ from flask import Flask, request, send_file
+ from flask_cors import CORS
+
+ from infer_tools.infer_tool import Svc
+ from utils.hparams import hparams
+
+ app = Flask(__name__)
+
+ CORS(app)
+
+ logging.getLogger('numba').setLevel(logging.WARNING)
+
+
+ @app.route("/voiceChangeModel", methods=["POST"])
+ def voice_change_model():
+     request_form = request.form
+     wave_file = request.files.get("sample", None)
+     # pitch-shift amount
+     f_pitch_change = float(request_form.get("fPitchChange", 0))
+     # sample rate required by the DAW
+     daw_sample = int(float(request_form.get("sampleRate", 0)))
+     speaker_id = int(float(request_form.get("sSpeakId", 0)))
+     # read the uploaded wav file from the HTTP request
+     input_wav_path = io.BytesIO(wave_file.read())
+     # model inference
+     _f0_tst, _f0_pred, _audio = model.infer(input_wav_path, key=f_pitch_change, acc=accelerate, use_pe=False,
+                                             use_crepe=False)
+     # note: positional sample-rate arguments require librosa < 0.10; newer versions need orig_sr=/target_sr= keywords
+     tar_audio = librosa.resample(_audio, hparams["audio_sample_rate"], daw_sample)
+     # return the audio
+     out_wav_path = io.BytesIO()
+     soundfile.write(out_wav_path, tar_audio, daw_sample, format="wav")
+     out_wav_path.seek(0)
+     return send_file(out_wav_path, download_name="temp.wav", as_attachment=True)
+
+
+ if __name__ == '__main__':
+     # project folder name (the one used during training)
+     project_name = "firefox"
+     model_path = f'./checkpoints/{project_name}/model_ckpt_steps_188000.ckpt'
+     config_path = f'./checkpoints/{project_name}/config.yaml'
+
+     # acceleration (PNDM speedup) factor
+     accelerate = 50
+     hubert_gpu = True
+
+     model = Svc(project_name, config_path, hubert_gpu, model_path)
+
+     # the port must match the VST plugin; changing it is not recommended
+     app.run(port=6842, host="0.0.0.0", debug=False, threaded=False)
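A client for this endpoint only needs a multipart POST carrying the sample file and the three form fields read above. A minimal sketch using the requests library (the input and output paths are placeholders):

    import requests

    with open("raw/input.wav", "rb") as f:
        response = requests.post(
            "http://127.0.0.1:6842/voiceChangeModel",
            files={"sample": f},  # picked up via request.files.get("sample")
            data={"fPitchChange": 0, "sampleRate": 44100, "sSpeakId": 0},
        )
    with open("converted.wav", "wb") as f:
        f.write(response.content)  # the server streams back the converted wav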
infer.py ADDED
@@ -0,0 +1,98 @@
+ import io
+ import time
+ from pathlib import Path
+
+ import librosa
+ import numpy as np
+ import soundfile
+
+ from infer_tools import infer_tool
+ from infer_tools import slicer
+ from infer_tools.infer_tool import Svc
+ from utils.hparams import hparams
+
+ chunks_dict = infer_tool.read_temp("./infer_tools/new_chunks_temp.json")
+
+
+ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise_step, project_name='', f_name=None,
+              file_path=None, out_path=None, slice_db=-40, **kwargs):
+     print('code version: 2022-12-04')
+     use_pe = use_pe if hparams['audio_sample_rate'] == 24000 else False
+     if file_path is None:
+         raw_audio_path = f"./raw/{f_name}"
+         clean_name = f_name[:-4]
+     else:
+         raw_audio_path = file_path
+         clean_name = str(Path(file_path).name)[:-4]
+     infer_tool.format_wav(raw_audio_path)
+     wav_path = Path(raw_audio_path).with_suffix('.wav')
+     global chunks_dict
+     audio, sr = librosa.load(wav_path, mono=True, sr=None)
+     wav_hash = infer_tool.get_md5(audio)
+     if wav_hash in chunks_dict.keys():
+         print("load chunks from temp")
+         chunks = chunks_dict[wav_hash]["chunks"]
+     else:
+         chunks = slicer.cut(wav_path, db_thresh=slice_db)
+         chunks_dict[wav_hash] = {"chunks": chunks, "time": int(time.time())}
+         infer_tool.write_temp("./infer_tools/new_chunks_temp.json", chunks_dict)
+     audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
+
+     count = 0
+     f0_tst = []
+     f0_pred = []
+     audio = []
+     for (slice_tag, data) in audio_data:
+         print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
+         length = int(np.ceil(len(data) / audio_sr * hparams['audio_sample_rate']))
+         raw_path = io.BytesIO()
+         soundfile.write(raw_path, data, audio_sr, format="wav")
+         if hparams['debug']:
+             print(np.mean(data), np.var(data))
+         raw_path.seek(0)
+         if slice_tag:
+             print('skip empty segment')
+             _f0_tst, _f0_pred, _audio = (
+                 np.zeros(int(np.ceil(length / hparams['hop_size']))),
+                 np.zeros(int(np.ceil(length / hparams['hop_size']))),
+                 np.zeros(length))
+         else:
+             _f0_tst, _f0_pred, _audio = svc_model.infer(raw_path, key=key, acc=acc, use_pe=use_pe, use_crepe=use_crepe,
+                                                         thre=thre, use_gt_mel=use_gt_mel, add_noise_step=add_noise_step)
+         fix_audio = np.zeros(length)
+         fix_audio[:] = np.mean(_audio)
+         fix_audio[:len(_audio)] = _audio[0 if len(_audio) < len(fix_audio) else len(_audio) - len(fix_audio):]
+         f0_tst.extend(_f0_tst)
+         f0_pred.extend(_f0_pred)
+         audio.extend(list(fix_audio))
+         count += 1
+     if out_path is None:
+         # note: step and accelerate are module-level globals set in the __main__ block below
+         out_path = f'./results/{clean_name}_{key}key_{project_name}_{hparams["residual_channels"]}_{hparams["residual_layers"]}_{int(step / 1000)}k_{accelerate}x.{kwargs["format"]}'
+     soundfile.write(out_path, audio, hparams["audio_sample_rate"], 'PCM_16', format=out_path.split('.')[-1])
+     return np.array(f0_tst), np.array(f0_pred), audio
+
+
+ if __name__ == '__main__':
+     # project folder name (the one used during training)
+     project_name = "yilanqiu"
+     model_path = f'./checkpoints/{project_name}/model_ckpt_steps_246000.ckpt'
+     config_path = f'./checkpoints/{project_name}/config.yaml'
+
+     # multiple wav/ogg files are supported; place them in the raw folder, with file extensions
+     file_names = ["青花瓷.wav"]
+     trans = [0]  # pitch shift in semitones (positive or negative), one entry per file; padded with the first value if fewer are given
+     # acceleration (PNDM speedup) factor
+     accelerate = 20
+     hubert_gpu = True
+     format = 'flac'
+     step = int(model_path.split("_")[-1].split(".")[0])
+
+     # no changes needed below this line
+     infer_tool.mkdir(["./raw", "./results"])
+     infer_tool.fill_a_to_b(trans, file_names)
+
+     model = Svc(project_name, config_path, hubert_gpu, model_path)
+     for f_name, tran in zip(file_names, trans):
+         if "." not in f_name:
+             f_name += ".wav"
+         run_clip(model, key=tran, acc=accelerate, use_crepe=True, thre=0.05, use_pe=True, use_gt_mel=False,
+                  add_noise_step=500, f_name=f_name, project_name=project_name, format=format)
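run_clip can also be driven programmatically with an explicit file_path and out_path; passing out_path directly sidesteps the default output-name f-string and hence the dependence on the step and accelerate globals set in the __main__ block. A hedged sketch, assuming the checkpoint paths above exist:

    # hypothetical programmatic use; project name and paths are placeholders
    model = Svc("yilanqiu", "./checkpoints/yilanqiu/config.yaml", True,
                "./checkpoints/yilanqiu/model_ckpt_steps_246000.ckpt")
    f0_tst, f0_pred, audio = run_clip(model, key=0, acc=20, use_pe=True, use_crepe=True,
                                      thre=0.05, use_gt_mel=False, add_noise_step=500,
                                      file_path="./raw/song.wav", out_path="./results/song.flac",
                                      project_name="yilanqiu")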
infer_tools/__init__.py ADDED
File without changes
infer_tools/infer_tool.py ADDED
@@ -0,0 +1,334 @@
+ import hashlib
+ import json
+ import os
+ import time
+ from io import BytesIO
+ from pathlib import Path
+
+ import librosa
+ import numpy as np
+ import soundfile
+ import torch
+
+ import utils
+ from modules.fastspeech.pe import PitchExtractor
+ from network.diff.candidate_decoder import FFT
+ from network.diff.diffusion import GaussianDiffusion
+ from network.diff.net import DiffNet
+ from network.vocoders.base_vocoder import VOCODERS, get_vocoder_cls
+ from preprocessing.data_gen_utils import get_pitch_parselmouth, get_pitch_crepe
+ from preprocessing.hubertinfer import Hubertencoder
+ from utils.hparams import hparams, set_hparams
+ from utils.pitch_utils import denorm_f0, norm_interp_f0
+
+ if os.path.exists("chunks_temp.json"):
+     os.remove("chunks_temp.json")
+
+
+ def read_temp(file_name):
+     if not os.path.exists(file_name):
+         with open(file_name, "w") as f:
+             f.write(json.dumps({"info": "temp_dict"}))
+         return {}
+     else:
+         try:
+             with open(file_name, "r") as f:
+                 data = f.read()
+             data_dict = json.loads(data)
+             if os.path.getsize(file_name) > 50 * 1024 * 1024:
+                 f_name = file_name.split("/")[-1]
+                 print(f"clean {f_name}")
+                 # note: the "info" entry has no "time" key, so this cleanup raises and the
+                 # dict is rebuilt through the except branch once the file exceeds 50 MB
+                 for wav_hash in list(data_dict.keys()):
+                     if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
+                         del data_dict[wav_hash]
+         except Exception as e:
+             print(e)
+             print(f"{file_name} error, rebuilding file automatically")
+             data_dict = {"info": "temp_dict"}
+         return data_dict
+
+
+ f0_dict = read_temp("./infer_tools/f0_temp.json")
+
+
+ def write_temp(file_name, data):
+     with open(file_name, "w") as f:
+         f.write(json.dumps(data))
+
+
+ def timeit(func):
+     def run(*args, **kwargs):
+         t = time.time()
+         res = func(*args, **kwargs)
+         print('executing \'%s\' took %.3fs' % (func.__name__, time.time() - t))
+         return res
+
+     return run
+
+
+ def format_wav(audio_path):
+     if Path(audio_path).suffix == '.wav':
+         return
+     raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
+     soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
+
+
+ def fill_a_to_b(a, b):
+     if len(a) < len(b):
+         for _ in range(0, len(b) - len(a)):
+             a.append(a[0])
+
+
+ def get_end_file(dir_path, end):
+     file_lists = []
+     for root, dirs, files in os.walk(dir_path):
+         files = [f for f in files if f[0] != '.']
+         dirs[:] = [d for d in dirs if d[0] != '.']
+         for f_file in files:
+             if f_file.endswith(end):
+                 file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
+     return file_lists
+
+
+ def mkdir(paths: list):
+     for path in paths:
+         if not os.path.exists(path):
+             os.mkdir(path)
+
+
+ def get_md5(content):
+     return hashlib.new("md5", content).hexdigest()
+
+
+ class Svc:
+     def __init__(self, project_name, config_name, hubert_gpu, model_path):
+         self.project_name = project_name
+         self.DIFF_DECODERS = {
+             'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
+             'fft': lambda hp: FFT(
+                 hp['hidden_size'], hp['dec_layers'], hp['dec_ffn_kernel_size'], hp['num_heads']),
+         }
+
+         self.model_path = model_path
+         self.dev = torch.device("cuda")
+
+         self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
+                              reset=True,
+                              hparams_str='',
+                              print_hparams=False)
+
+         self.mel_bins = hparams['audio_num_mel_bins']
+         self.model = GaussianDiffusion(
+             phone_encoder=Hubertencoder(hparams['hubert_path']),
+             out_dims=self.mel_bins, denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
+             timesteps=hparams['timesteps'],
+             K_step=hparams['K_step'],
+             loss_type=hparams['diff_loss_type'],
+             spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
+         )
+         self.load_ckpt()
+         self.model.cuda()
+         hparams['hubert_gpu'] = hubert_gpu
+         self.hubert = Hubertencoder(hparams['hubert_path'])
+         self.pe = PitchExtractor().cuda()
+         utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
+         self.pe.eval()
+         self.vocoder = get_vocoder_cls(hparams)()
+
+     def load_ckpt(self, model_name='model', force=True, strict=True):
+         utils.load_ckpt(self.model, self.model_path, model_name, force, strict)
+
+     def infer(self, in_path, key, acc, use_pe=True, use_crepe=True, thre=0.05, singer=False, **kwargs):
+         batch = self.pre(in_path, acc, use_crepe, thre)
+         spk_embed = batch.get('spk_embed') if not hparams['use_spk_id'] else batch.get('spk_ids')
+         hubert = batch['hubert']
+         ref_mels = batch["mels"]
+         energy = batch['energy']
+         mel2ph = batch['mel2ph']
+         batch['f0'] = batch['f0'] + (key / 12)
+         batch['f0'][batch['f0'] > np.log2(hparams['f0_max'])] = 0
+         f0 = batch['f0']
+         uv = batch['uv']
+
+         @timeit
+         def diff_infer():
+             outputs = self.model(
+                 hubert.cuda(), spk_embed=spk_embed, mel2ph=mel2ph.cuda(), f0=f0.cuda(), uv=uv.cuda(), energy=energy.cuda(),
+                 ref_mels=ref_mels.cuda(),
+                 infer=True, **kwargs)
+             return outputs
+
+         outputs = diff_infer()
+         batch['outputs'] = self.model.out2mel(outputs['mel_out'])
+         batch['mel2ph_pred'] = outputs['mel2ph']
+         batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
+         if use_pe:
+             batch['f0_pred'] = self.pe(outputs['mel_out'])['f0_denorm_pred'].detach()
+         else:
+             batch['f0_pred'] = outputs.get('f0_denorm')
+         return self.after_infer(batch, singer, in_path)
+
+     @timeit
+     def after_infer(self, prediction, singer, in_path):
+         for k, v in prediction.items():
+             if type(v) is torch.Tensor:
+                 prediction[k] = v.cpu().numpy()
+
+         # remove paddings
+         mel_gt = prediction["mels"]
+         mel_gt_mask = np.abs(mel_gt).sum(-1) > 0
+
+         mel_pred = prediction["outputs"]
+         mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
+         mel_pred = mel_pred[mel_pred_mask]
+         mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])
+
+         f0_gt = prediction.get("f0_gt")
+         f0_pred = prediction.get("f0_pred")
+         if f0_pred is not None:
+             f0_gt = f0_gt[mel_gt_mask]
+             if len(f0_pred) > len(mel_pred_mask):
+                 f0_pred = f0_pred[:len(mel_pred_mask)]
+             f0_pred = f0_pred[mel_pred_mask]
+         torch.cuda.is_available() and torch.cuda.empty_cache()
+
+         if singer:
+             data_path = in_path.replace("batch", "singer_data")
+             mel_path = data_path[:-4] + "_mel.npy"
+             f0_path = data_path[:-4] + "_f0.npy"
+             np.save(mel_path, mel_pred)
+             np.save(f0_path, f0_pred)
+         wav_pred = self.vocoder.spec2wav(mel_pred, f0=f0_pred)
+         return f0_gt, f0_pred, wav_pred
+
+     def temporary_dict2processed_input(self, item_name, temp_dict, use_crepe=True, thre=0.05):
+         '''
+         process data in temporary_dicts
+         '''
+
+         binarization_args = hparams['binarization_args']
+
+         @timeit
+         def get_pitch(wav, mel):
+             # get the ground-truth f0 with the selected pitch extraction algorithm
+             global f0_dict
+             if use_crepe:
+                 md5 = get_md5(wav)
+                 if f"{md5}_gt" in f0_dict.keys():
+                     print("load temp crepe f0")
+                     gt_f0 = np.array(f0_dict[f"{md5}_gt"]["f0"])
+                     coarse_f0 = np.array(f0_dict[f"{md5}_coarse"]["f0"])
+                 else:
+                     torch.cuda.is_available() and torch.cuda.empty_cache()
+                     gt_f0, coarse_f0 = get_pitch_crepe(wav, mel, hparams, thre)
+                     f0_dict[f"{md5}_gt"] = {"f0": gt_f0.tolist(), "time": int(time.time())}
+                     f0_dict[f"{md5}_coarse"] = {"f0": coarse_f0.tolist(), "time": int(time.time())}
+                     write_temp("./infer_tools/f0_temp.json", f0_dict)
+             else:
+                 gt_f0, coarse_f0 = get_pitch_parselmouth(wav, mel, hparams)
+             processed_input['f0'] = gt_f0
+             processed_input['pitch'] = coarse_f0
+
+         def get_align(mel, phone_encoded):
+             mel2ph = np.zeros([mel.shape[0]], int)
+             start_frame = 0
+             ph_durs = mel.shape[0] / phone_encoded.shape[0]
+             if hparams['debug']:
+                 print(mel.shape, phone_encoded.shape, mel.shape[0] / phone_encoded.shape[0])
+             for i_ph in range(phone_encoded.shape[0]):
+                 end_frame = int(i_ph * ph_durs + ph_durs + 0.5)
+                 mel2ph[start_frame:end_frame + 1] = i_ph + 1
+                 start_frame = end_frame + 1
+
+             processed_input['mel2ph'] = mel2ph
+
+         if hparams['vocoder'] in VOCODERS:
+             wav, mel = VOCODERS[hparams['vocoder']].wav2spec(temp_dict['wav_fn'])
+         else:
+             wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(temp_dict['wav_fn'])
+         processed_input = {
+             'item_name': item_name, 'mel': mel,
+             'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]
+         }
+         processed_input = {**temp_dict, **processed_input}  # merge two dicts
+
+         if binarization_args['with_f0']:
+             get_pitch(wav, mel)
+         if binarization_args['with_hubert']:
+             st = time.time()
+             hubert_encoded = processed_input['hubert'] = self.hubert.encode(temp_dict['wav_fn'])
+             et = time.time()
+             dev = 'cuda' if hparams['hubert_gpu'] and torch.cuda.is_available() else 'cpu'
+             print(f'hubert (on {dev}) time used {et - st}')
+
+             if binarization_args['with_align']:
+                 get_align(mel, hubert_encoded)
+         return processed_input
+
+     def pre(self, wav_fn, accelerate, use_crepe=True, thre=0.05):
+         if isinstance(wav_fn, BytesIO):
+             item_name = self.project_name
+         else:
+             song_info = wav_fn.split('/')
+             item_name = song_info[-1].split('.')[-2]
+         temp_dict = {'wav_fn': wav_fn, 'spk_id': self.project_name}
+
+         temp_dict = self.temporary_dict2processed_input(item_name, temp_dict, use_crepe, thre)
+         hparams['pndm_speedup'] = accelerate
+         batch = processed_input2batch([getitem(temp_dict)])
+         return batch
+
+
+ def getitem(item):
+     max_frames = hparams['max_frames']
+     spec = torch.Tensor(item['mel'])[:max_frames]
+     energy = (spec.exp() ** 2).sum(-1).sqrt()
+     mel2ph = torch.LongTensor(item['mel2ph'])[:max_frames] if 'mel2ph' in item else None
+     f0, uv = norm_interp_f0(item["f0"][:max_frames], hparams)
+     hubert = torch.Tensor(item['hubert'][:hparams['max_input_tokens']])
+     pitch = torch.LongTensor(item.get("pitch"))[:max_frames]
+     sample = {
+         "item_name": item['item_name'],
+         "hubert": hubert,
+         "mel": spec,
+         "pitch": pitch,
+         "energy": energy,
+         "f0": f0,
+         "uv": uv,
+         "mel2ph": mel2ph,
+         "mel_nonpadding": spec.abs().sum(-1) > 0,
+     }
+     return sample
+
+
+ def processed_input2batch(samples):
+     '''
+     Args:
+         samples: one batch of processed_input
+     NOTE:
+         the batch size is controlled by hparams['max_sentences']
+     '''
+     if len(samples) == 0:
+         return {}
+     item_names = [s['item_name'] for s in samples]
+     hubert = utils.collate_2d([s['hubert'] for s in samples], 0.0)
+     f0 = utils.collate_1d([s['f0'] for s in samples], 0.0)
+     pitch = utils.collate_1d([s['pitch'] for s in samples])
+     uv = utils.collate_1d([s['uv'] for s in samples])
+     energy = utils.collate_1d([s['energy'] for s in samples], 0.0)
+     mel2ph = utils.collate_1d([s['mel2ph'] for s in samples], 0.0) \
+         if samples[0]['mel2ph'] is not None else None
+     mels = utils.collate_2d([s['mel'] for s in samples], 0.0)
+     mel_lengths = torch.LongTensor([s['mel'].shape[0] for s in samples])
+
+     batch = {
+         'item_name': item_names,
+         'nsamples': len(samples),
+         'hubert': hubert,
+         'mels': mels,
+         'mel_lengths': mel_lengths,
+         'mel2ph': mel2ph,
+         'energy': energy,
+         'pitch': pitch,
+         'f0': f0,
+         'uv': uv,
+     }
+     return batch
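get_md5 accepts any bytes-like object; the callers above pass the raw numpy sample buffer of a loaded wav, so identical audio always maps to the same cache key in f0_temp.json and new_chunks_temp.json. A small sketch (the array is a placeholder):

    import numpy as np
    audio = np.zeros(16000, dtype=np.float32)  # any C-contiguous array works via the buffer protocol
    print(get_md5(audio))  # stable hash of the underlying sample bytes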
infer_tools/slicer.py ADDED
@@ -0,0 +1,158 @@
+ import time
+
+ import numpy as np
+ import torch
+ import torchaudio
+ from scipy.ndimage import maximum_filter1d, uniform_filter1d
+
+
+ def timeit(func):
+     def run(*args, **kwargs):
+         t = time.time()
+         res = func(*args, **kwargs)
+         print('executing \'%s\' took %.3fs' % (func.__name__, time.time() - t))
+         return res
+
+     return run
+
+
+ # @timeit
+ def _window_maximum(arr, win_sz):
+     return maximum_filter1d(arr, size=win_sz)[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
+
+
+ # @timeit
+ def _window_rms(arr, win_sz):
+     filtered = np.sqrt(uniform_filter1d(np.power(arr, 2), win_sz) - np.power(uniform_filter1d(arr, win_sz), 2))
+     return filtered[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
+
+
+ def level2db(levels, eps=1e-12):
+     return 20 * np.log10(np.clip(levels, a_min=eps, a_max=1))
+
+
+ def _apply_slice(audio, begin, end):
+     if len(audio.shape) > 1:
+         return audio[:, begin: end]
+     else:
+         return audio[begin: end]
+
+
+ class Slicer:
+     def __init__(self,
+                  sr: int,
+                  db_threshold: float = -40,
+                  min_length: int = 5000,
+                  win_l: int = 300,
+                  win_s: int = 20,
+                  max_silence_kept: int = 500):
+         self.db_threshold = db_threshold
+         self.min_samples = round(sr * min_length / 1000)
+         self.win_ln = round(sr * win_l / 1000)
+         self.win_sn = round(sr * win_s / 1000)
+         self.max_silence = round(sr * max_silence_kept / 1000)
+         if not self.min_samples >= self.win_ln >= self.win_sn:
+             raise ValueError('The following condition must be satisfied: min_length >= win_l >= win_s')
+         if not self.max_silence >= self.win_sn:
+             raise ValueError('The following condition must be satisfied: max_silence_kept >= win_s')
+
+     @timeit
+     def slice(self, audio):
+         samples = audio
+         if samples.shape[0] <= self.min_samples:
+             return {"0": {"slice": False, "split_time": f"0,{len(audio)}"}}
+         # get absolute amplitudes
+         abs_amp = np.abs(samples - np.mean(samples))
+         # calculate local maximum with large window
+         win_max_db = level2db(_window_maximum(abs_amp, win_sz=self.win_ln))
+         sil_tags = []
+         left = right = 0
+         while right < win_max_db.shape[0]:
+             if win_max_db[right] < self.db_threshold:
+                 right += 1
+             elif left == right:
+                 left += 1
+                 right += 1
+             else:
+                 if left == 0:
+                     split_loc_l = left
+                 else:
+                     sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
+                     rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
+                     split_win_l = left + np.argmin(rms_db_left)
+                     split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
+                 if len(sil_tags) != 0 and split_loc_l - sil_tags[-1][1] < self.min_samples and right < win_max_db.shape[0] - 1:
+                     right += 1
+                     left = right
+                     continue
+                 if right == win_max_db.shape[0] - 1:
+                     split_loc_r = right + self.win_ln
+                 else:
+                     sil_right_n = min(self.max_silence, (right + self.win_ln - left) // 2)
+                     rms_db_right = level2db(_window_rms(samples[right + self.win_ln - sil_right_n: right + self.win_ln],
+                                                         win_sz=self.win_sn))
+                     split_win_r = right + self.win_ln - sil_right_n + np.argmin(rms_db_right)
+                     split_loc_r = split_win_r + np.argmin(abs_amp[split_win_r: split_win_r + self.win_sn])
+                 sil_tags.append((split_loc_l, split_loc_r))
+                 right += 1
+                 left = right
+         if left != right:
+             sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
+             rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
+             split_win_l = left + np.argmin(rms_db_left)
+             split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
+             sil_tags.append((split_loc_l, samples.shape[0]))
+         if len(sil_tags) == 0:
+             return {"0": {"slice": False, "split_time": f"0,{len(audio)}"}}
+         else:
+             chunks = []
+             # the first silent segment does not start at the very beginning; prepend the voiced segment
+             if sil_tags[0][0]:
+                 chunks.append({"slice": False, "split_time": f"0,{sil_tags[0][0]}"})
+             for i in range(0, len(sil_tags)):
+                 # mark the voiced segments (skipping the first one)
+                 if i:
+                     chunks.append({"slice": False, "split_time": f"{sil_tags[i - 1][1]},{sil_tags[i][0]}"})
+                 # mark every silent segment
+                 chunks.append({"slice": True, "split_time": f"{sil_tags[i][0]},{sil_tags[i][1]}"})
+             # the last silent segment does not reach the end; append the trailing voiced segment
+             if sil_tags[-1][1] != len(audio):
+                 chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1]},{len(audio)}"})
+             chunk_dict = {}
+             for i in range(len(chunks)):
+                 chunk_dict[str(i)] = chunks[i]
+             return chunk_dict
+
+
+ def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_kept=500):
+     audio, sr = torchaudio.load(audio_path)
+     # downmix to mono (torchaudio returns [channels, samples])
+     if len(audio.shape) == 2 and audio.shape[1] >= 2:
+         audio = torch.mean(audio, dim=0).unsqueeze(0)
+     audio = audio.cpu().numpy()[0]
+
+     slicer = Slicer(
+         sr=sr,
+         db_threshold=db_thresh,
+         min_length=min_len,
+         win_l=win_l,
+         win_s=win_s,
+         max_silence_kept=max_sil_kept
+     )
+     chunks = slicer.slice(audio)
+     return chunks
+
+
+ def chunks2audio(audio_path, chunks):
+     chunks = dict(chunks)
+     audio, sr = torchaudio.load(audio_path)
+     # downmix to mono (torchaudio returns [channels, samples])
+     if len(audio.shape) == 2 and audio.shape[1] >= 2:
+         audio = torch.mean(audio, dim=0).unsqueeze(0)
+     audio = audio.cpu().numpy()[0]
+     result = []
+     for k, v in chunks.items():
+         tag = v["split_time"].split(",")
+         result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
+     return result, sr
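cut and chunks2audio are designed to round-trip: cut returns a JSON-serializable dict of split points (which infer.py caches by wav hash), and chunks2audio re-reads the file and materializes the segments. A minimal sketch with a placeholder path:

    chunks = cut("raw/input.wav", db_thresh=-40)  # {"0": {"slice": ..., "split_time": "begin,end"}, ...}
    segments, sr = chunks2audio("raw/input.wav", chunks)
    for is_silence, data in segments:
        print(is_silence, round(len(data) / sr, 3))  # chunks tagged slice=True are silence and skipped at inference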
modules/commons/common_layers.py ADDED
@@ -0,0 +1,671 @@
+ import math
+ import torch
+ from torch import nn
+ from torch.nn import Parameter
+ import torch.onnx.operators
+ import torch.nn.functional as F
+ import utils
+
+
+ class Reshape(nn.Module):
+     def __init__(self, *args):
+         super(Reshape, self).__init__()
+         self.shape = args
+
+     def forward(self, x):
+         return x.view(self.shape)
+
+
+ class Permute(nn.Module):
+     def __init__(self, *args):
+         super(Permute, self).__init__()
+         self.args = args
+
+     def forward(self, x):
+         return x.permute(self.args)
+
+
+ class LinearNorm(torch.nn.Module):
+     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+         super(LinearNorm, self).__init__()
+         self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+
+         torch.nn.init.xavier_uniform_(
+             self.linear_layer.weight,
+             gain=torch.nn.init.calculate_gain(w_init_gain))
+
+     def forward(self, x):
+         return self.linear_layer(x)
+
+
+ class ConvNorm(torch.nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
+                  padding=None, dilation=1, bias=True, w_init_gain='linear'):
+         super(ConvNorm, self).__init__()
+         if padding is None:
+             assert (kernel_size % 2 == 1)
+             padding = int(dilation * (kernel_size - 1) / 2)
+
+         self.conv = torch.nn.Conv1d(in_channels, out_channels,
+                                     kernel_size=kernel_size, stride=stride,
+                                     padding=padding, dilation=dilation,
+                                     bias=bias)
+
+         torch.nn.init.xavier_uniform_(
+             self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
+
+     def forward(self, signal):
+         conv_signal = self.conv(signal)
+         return conv_signal
+
+
+ def Embedding(num_embeddings, embedding_dim, padding_idx=None):
+     m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+     nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
+     if padding_idx is not None:
+         nn.init.constant_(m.weight[padding_idx], 0)
+     return m
+
+
+ def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
+     if not export and torch.cuda.is_available():
+         try:
+             from apex.normalization import FusedLayerNorm
+             return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
+         except ImportError:
+             pass
+     return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
+
+
+ def Linear(in_features, out_features, bias=True):
+     m = nn.Linear(in_features, out_features, bias)
+     nn.init.xavier_uniform_(m.weight)
+     if bias:
+         nn.init.constant_(m.bias, 0.)
+     return m
+
+
+ class SinusoidalPositionalEmbedding(nn.Module):
+     """This module produces sinusoidal positional embeddings of any length.
+
+     Padding symbols are ignored.
+     """
+
+     def __init__(self, embedding_dim, padding_idx, init_size=1024):
+         super().__init__()
+         self.embedding_dim = embedding_dim
+         self.padding_idx = padding_idx
+         self.weights = SinusoidalPositionalEmbedding.get_embedding(
+             init_size,
+             embedding_dim,
+             padding_idx,
+         )
+         self.register_buffer('_float_tensor', torch.FloatTensor(1))
+
+     @staticmethod
+     def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
+         """Build sinusoidal embeddings.
+
+         This matches the implementation in tensor2tensor, but differs slightly
+         from the description in Section 3.5 of "Attention Is All You Need".
+         """
+         half_dim = embedding_dim // 2
+         emb = math.log(10000) / (half_dim - 1)
+         emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
+         emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
+         emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+         if embedding_dim % 2 == 1:
+             # zero pad
+             emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+         if padding_idx is not None:
+             emb[padding_idx, :] = 0
+         return emb
+
+     def forward(self, input, incremental_state=None, timestep=None, positions=None, **kwargs):
+         """Input is expected to be of size [bsz x seqlen]."""
+         bsz, seq_len = input.shape[:2]
+         max_pos = self.padding_idx + 1 + seq_len
+         if self.weights is None or max_pos > self.weights.size(0):
+             # recompute/expand embeddings if needed
+             self.weights = SinusoidalPositionalEmbedding.get_embedding(
+                 max_pos,
+                 self.embedding_dim,
+                 self.padding_idx,
+             )
+         self.weights = self.weights.to(self._float_tensor)
+
+         if incremental_state is not None:
+             # positions is the same for every token when decoding a single step
+             pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
+             return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1)
+
+         positions = utils.make_positions(input, self.padding_idx) if positions is None else positions
+         return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()
+
+     def max_positions(self):
+         """Maximum number of supported positions."""
+         return int(1e5)  # an arbitrary large number
+
+
+ class ConvTBC(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size, padding=0):
+         super(ConvTBC, self).__init__()
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.kernel_size = kernel_size
+         self.padding = padding
+
+         self.weight = torch.nn.Parameter(torch.Tensor(
+             self.kernel_size, in_channels, out_channels))
+         self.bias = torch.nn.Parameter(torch.Tensor(out_channels))
+
+     def forward(self, input):
+         return torch.conv_tbc(input.contiguous(), self.weight, self.bias, self.padding)
+
+
+ class MultiheadAttention(nn.Module):
+     def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True,
+                  add_bias_kv=False, add_zero_attn=False, self_attention=False,
+                  encoder_decoder_attention=False):
+         super().__init__()
+         self.embed_dim = embed_dim
+         self.kdim = kdim if kdim is not None else embed_dim
+         self.vdim = vdim if vdim is not None else embed_dim
+         self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+         self.num_heads = num_heads
+         self.dropout = dropout
+         self.head_dim = embed_dim // num_heads
+         assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+         self.scaling = self.head_dim ** -0.5
+
+         self.self_attention = self_attention
+         self.encoder_decoder_attention = encoder_decoder_attention
+
+         assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \
+                                                              'value to be of the same size'
+
+         if self.qkv_same_dim:
+             self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim))
+         else:
+             self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
+             self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
+             self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
+
+         if bias:
+             self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim))
+         else:
+             self.register_parameter('in_proj_bias', None)
+
+         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+         if add_bias_kv:
+             self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
+             self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
+         else:
+             self.bias_k = self.bias_v = None
+
+         self.add_zero_attn = add_zero_attn
+
+         self.reset_parameters()
+
+         self.enable_torch_version = False
+         if hasattr(F, "multi_head_attention_forward"):
+             self.enable_torch_version = True
+         else:
+             self.enable_torch_version = False
+         self.last_attn_probs = None
+
+     def reset_parameters(self):
+         if self.qkv_same_dim:
+             nn.init.xavier_uniform_(self.in_proj_weight)
+         else:
+             nn.init.xavier_uniform_(self.k_proj_weight)
+             nn.init.xavier_uniform_(self.v_proj_weight)
+             nn.init.xavier_uniform_(self.q_proj_weight)
+
+         nn.init.xavier_uniform_(self.out_proj.weight)
+         if self.in_proj_bias is not None:
+             nn.init.constant_(self.in_proj_bias, 0.)
+             nn.init.constant_(self.out_proj.bias, 0.)
+         if self.bias_k is not None:
+             nn.init.xavier_normal_(self.bias_k)
+         if self.bias_v is not None:
+             nn.init.xavier_normal_(self.bias_v)
+
+     def forward(
+             self,
+             query, key, value,
+             key_padding_mask=None,
+             incremental_state=None,
+             need_weights=True,
+             static_kv=False,
+             attn_mask=None,
+             before_softmax=False,
+             need_head_weights=False,
+             enc_dec_attn_constraint_mask=None,
+             reset_attn_weight=None
+     ):
+         """Input shape: Time x Batch x Channel
+
+         Args:
+             key_padding_mask (ByteTensor, optional): mask to exclude
+                 keys that are pads, of shape `(batch, src_len)`, where
+                 padding elements are indicated by 1s.
+             need_weights (bool, optional): return the attention weights,
+                 averaged over heads (default: False).
+             attn_mask (ByteTensor, optional): typically used to
+                 implement causal attention, where the mask prevents the
+                 attention from looking forward in time (default: None).
+             before_softmax (bool, optional): return the raw attention
+                 weights and values before the attention softmax.
+             need_head_weights (bool, optional): return the attention
+                 weights for each head. Implies *need_weights*. Default:
+                 return the average attention weights over all heads.
+         """
+         if need_head_weights:
+             need_weights = True
+
+         tgt_len, bsz, embed_dim = query.size()
+         assert embed_dim == self.embed_dim
+         assert list(query.size()) == [tgt_len, bsz, embed_dim]
+
+         if self.enable_torch_version and incremental_state is None and not static_kv and reset_attn_weight is None:
+             if self.qkv_same_dim:
+                 return F.multi_head_attention_forward(query, key, value,
+                                                       self.embed_dim, self.num_heads,
+                                                       self.in_proj_weight,
+                                                       self.in_proj_bias, self.bias_k, self.bias_v,
+                                                       self.add_zero_attn, self.dropout,
+                                                       self.out_proj.weight, self.out_proj.bias,
+                                                       self.training, key_padding_mask, need_weights,
+                                                       attn_mask)
+             else:
+                 return F.multi_head_attention_forward(query, key, value,
+                                                       self.embed_dim, self.num_heads,
+                                                       torch.empty([0]),
+                                                       self.in_proj_bias, self.bias_k, self.bias_v,
+                                                       self.add_zero_attn, self.dropout,
+                                                       self.out_proj.weight, self.out_proj.bias,
+                                                       self.training, key_padding_mask, need_weights,
+                                                       attn_mask, use_separate_proj_weight=True,
+                                                       q_proj_weight=self.q_proj_weight,
+                                                       k_proj_weight=self.k_proj_weight,
+                                                       v_proj_weight=self.v_proj_weight)
+
+         if incremental_state is not None:
+             print('Not implemented error.')
+             exit()
+         else:
+             saved_state = None
+
+         if self.self_attention:
+             # self-attention
+             q, k, v = self.in_proj_qkv(query)
+         elif self.encoder_decoder_attention:
+             # encoder-decoder attention
+             q = self.in_proj_q(query)
+             if key is None:
+                 assert value is None
+                 k = v = None
+             else:
+                 k = self.in_proj_k(key)
+                 v = self.in_proj_v(key)
+
+         else:
+             q = self.in_proj_q(query)
+             k = self.in_proj_k(key)
+             v = self.in_proj_v(value)
+         q *= self.scaling
+
+         if self.bias_k is not None:
+             assert self.bias_v is not None
+             k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
+             v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
+             if attn_mask is not None:
+                 attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
+             if key_padding_mask is not None:
+                 key_padding_mask = torch.cat(
+                     [key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1)
+
+         q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+         if k is not None:
+             k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+         if v is not None:
+             v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+
+         if saved_state is not None:
+             print('Not implemented error.')
+             exit()
+
+         src_len = k.size(1)
+
+         # This is part of a workaround to get around fork/join parallelism
+         # not supporting Optional types.
+         if key_padding_mask is not None and key_padding_mask.shape == torch.Size([]):
+             key_padding_mask = None
+
+         if key_padding_mask is not None:
+             assert key_padding_mask.size(0) == bsz
+             assert key_padding_mask.size(1) == src_len
+
+         if self.add_zero_attn:
+             src_len += 1
+             k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
+             v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
+             if attn_mask is not None:
+                 attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
+             if key_padding_mask is not None:
+                 key_padding_mask = torch.cat(
+                     [key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1)
+
+         attn_weights = torch.bmm(q, k.transpose(1, 2))
+         attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
+
+         assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
+
+         if attn_mask is not None:
+             if len(attn_mask.shape) == 2:
+                 attn_mask = attn_mask.unsqueeze(0)
+             elif len(attn_mask.shape) == 3:
+                 attn_mask = attn_mask[:, None].repeat([1, self.num_heads, 1, 1]).reshape(
+                     bsz * self.num_heads, tgt_len, src_len)
+             attn_weights = attn_weights + attn_mask
+
+         if enc_dec_attn_constraint_mask is not None:  # bs x head x L_kv
+             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+             attn_weights = attn_weights.masked_fill(
+                 enc_dec_attn_constraint_mask.unsqueeze(2).bool(),
+                 -1e9,
+             )
+             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+         if key_padding_mask is not None:
+             # don't attend to padding symbols
+             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+             attn_weights = attn_weights.masked_fill(
+                 key_padding_mask.unsqueeze(1).unsqueeze(2),
+                 -1e9,
+             )
+             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+         attn_logits = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+
+         if before_softmax:
+             return attn_weights, v
+
+         attn_weights_float = utils.softmax(attn_weights, dim=-1)
+         attn_weights = attn_weights_float.type_as(attn_weights)
+         attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training)
+
+         if reset_attn_weight is not None:
+             if reset_attn_weight:
+                 self.last_attn_probs = attn_probs.detach()
+             else:
+                 assert self.last_attn_probs is not None
+                 attn_probs = self.last_attn_probs
+         attn = torch.bmm(attn_probs, v)
+         assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
+         attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+         attn = self.out_proj(attn)
+
+         if need_weights:
+             attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
+             if not need_head_weights:
+                 # average attention weights over heads
+                 attn_weights = attn_weights.mean(dim=0)
+         else:
+             attn_weights = None
+
+         return attn, (attn_weights, attn_logits)
+
+     def in_proj_qkv(self, query):
+         return self._in_proj(query).chunk(3, dim=-1)
+
+     def in_proj_q(self, query):
+         if self.qkv_same_dim:
+             return self._in_proj(query, end=self.embed_dim)
+         else:
+             bias = self.in_proj_bias
+             if bias is not None:
+                 bias = bias[:self.embed_dim]
+             return F.linear(query, self.q_proj_weight, bias)
+
+     def in_proj_k(self, key):
+         if self.qkv_same_dim:
+             return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim)
+         else:
+             weight = self.k_proj_weight
+             bias = self.in_proj_bias
+             if bias is not None:
+                 bias = bias[self.embed_dim:2 * self.embed_dim]
+             return F.linear(key, weight, bias)
+
+     def in_proj_v(self, value):
+         if self.qkv_same_dim:
+             return self._in_proj(value, start=2 * self.embed_dim)
+         else:
+             weight = self.v_proj_weight
+             bias = self.in_proj_bias
+             if bias is not None:
+                 bias = bias[2 * self.embed_dim:]
+             return F.linear(value, weight, bias)
+
+     def _in_proj(self, input, start=0, end=None):
+         weight = self.in_proj_weight
+         bias = self.in_proj_bias
+         weight = weight[start:end, :]
+         if bias is not None:
+             bias = bias[start:end]
+         return F.linear(input, weight, bias)
+
+     def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
+         return attn_weights
+
+
+ class Swish(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, i):
+         result = i * torch.sigmoid(i)
+         ctx.save_for_backward(i)
+         return result
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         i = ctx.saved_variables[0]
+         sigmoid_i = torch.sigmoid(i)
+         return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
+
+
+ class CustomSwish(nn.Module):
+     def forward(self, input_tensor):
+         return Swish.apply(input_tensor)
+
+
+ class Mish(nn.Module):
+     def forward(self, x):
+         return x * torch.tanh(F.softplus(x))
+
+
+ class TransformerFFNLayer(nn.Module):
+     def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'):
+         super().__init__()
+         self.kernel_size = kernel_size
+         self.dropout = dropout
+         self.act = act
+         if padding == 'SAME':
+             self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2)
+         elif padding == 'LEFT':
+             self.ffn_1 = nn.Sequential(
+                 nn.ConstantPad1d((kernel_size - 1, 0), 0.0),
+                 nn.Conv1d(hidden_size, filter_size, kernel_size)
+             )
+         self.ffn_2 = Linear(filter_size, hidden_size)
+         if self.act == 'swish':
+             self.swish_fn = CustomSwish()
+
+     def forward(self, x, incremental_state=None):
+         # x: T x B x C
+         if incremental_state is not None:
+             assert incremental_state is None, 'Nar-generation does not allow this.'
+             exit(1)
+
+         x = self.ffn_1(x.permute(1, 2, 0)).permute(2, 0, 1)
+         x = x * self.kernel_size ** -0.5
+
+         if incremental_state is not None:
+             x = x[-1:]
+         if self.act == 'gelu':
+             x = F.gelu(x)
+         if self.act == 'relu':
+             x = F.relu(x)
+         if self.act == 'swish':
+             x = self.swish_fn(x)
+         x = F.dropout(x, self.dropout, training=self.training)
+         x = self.ffn_2(x)
+         return x
+
+
+ class BatchNorm1dTBC(nn.Module):
+     def __init__(self, c):
+         super(BatchNorm1dTBC, self).__init__()
+         self.bn = nn.BatchNorm1d(c)
+
+     def forward(self, x):
+         """
+
+         :param x: [T, B, C]
+         :return: [T, B, C]
+         """
+         x = x.permute(1, 2, 0)  # [B, C, T]
+         x = self.bn(x)  # [B, C, T]
+         x = x.permute(2, 0, 1)  # [T, B, C]
+         return x
+
+
+ class EncSALayer(nn.Module):
+     def __init__(self, c, num_heads, dropout, attention_dropout=0.1,
+                  relu_dropout=0.1, kernel_size=9, padding='SAME', norm='ln', act='gelu'):
+         super().__init__()
+         self.c = c
+         self.dropout = dropout
+         self.num_heads = num_heads
+         if num_heads > 0:
+             if norm == 'ln':
+                 self.layer_norm1 = LayerNorm(c)
+             elif norm == 'bn':
+                 self.layer_norm1 = BatchNorm1dTBC(c)
+             self.self_attn = MultiheadAttention(
+                 self.c, num_heads, self_attention=True, dropout=attention_dropout, bias=False,
+             )
+         if norm == 'ln':
+             self.layer_norm2 = LayerNorm(c)
+         elif norm == 'bn':
+             self.layer_norm2 = BatchNorm1dTBC(c)
+         self.ffn = TransformerFFNLayer(
+             c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act)
+
+     def forward(self, x, encoder_padding_mask=None, **kwargs):
+         layer_norm_training = kwargs.get('layer_norm_training', None)
+         if layer_norm_training is not None:
+             self.layer_norm1.training = layer_norm_training
+             self.layer_norm2.training = layer_norm_training
+         if self.num_heads > 0:
+             residual = x
+             x = self.layer_norm1(x)
+             x, _ = self.self_attn(
+                 query=x,
+                 key=x,
+                 value=x,
+                 key_padding_mask=encoder_padding_mask
+             )
+             x = F.dropout(x, self.dropout, training=self.training)
+             x = residual + x
+             x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
+
+         residual = x
+         x = self.layer_norm2(x)
+         x = self.ffn(x)
+         x = F.dropout(x, self.dropout, training=self.training)
+         x = residual + x
+         x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
+         return x
+
+
+ class DecSALayer(nn.Module):
+     def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_dropout=0.1, kernel_size=9, act='gelu'):
+         super().__init__()
+         self.c = c
+         self.dropout = dropout
+         self.layer_norm1 = LayerNorm(c)
+         self.self_attn = MultiheadAttention(
+             c, num_heads, self_attention=True, dropout=attention_dropout, bias=False
+         )
+         self.layer_norm2 = LayerNorm(c)
+         self.encoder_attn = MultiheadAttention(
+             c, num_heads, encoder_decoder_attention=True, dropout=attention_dropout, bias=False,
+         )
+         self.layer_norm3 = LayerNorm(c)
+         self.ffn = TransformerFFNLayer(
+             c, 4 * c, padding='LEFT', kernel_size=kernel_size, dropout=relu_dropout, act=act)
+
+     def forward(
+             self,
+             x,
+             encoder_out=None,
+             encoder_padding_mask=None,
+             incremental_state=None,
+             self_attn_mask=None,
+             self_attn_padding_mask=None,
+             attn_out=None,
+             reset_attn_weight=None,
+             **kwargs,
+     ):
+         layer_norm_training = kwargs.get('layer_norm_training', None)
+         if layer_norm_training is not None:
+             self.layer_norm1.training = layer_norm_training
+             self.layer_norm2.training = layer_norm_training
+             self.layer_norm3.training = layer_norm_training
+         residual = x
+         x = self.layer_norm1(x)
+         x, _ = self.self_attn(
+             query=x,
+             key=x,
+             value=x,
+             key_padding_mask=self_attn_padding_mask,
+             incremental_state=incremental_state,
+             attn_mask=self_attn_mask
+         )
+         x = F.dropout(x, self.dropout, training=self.training)
+         x = residual + x
+
+         residual = x
+         x = self.layer_norm2(x)
+         if encoder_out is not None:
+             x, attn = self.encoder_attn(
+                 query=x,
+                 key=encoder_out,
+                 value=encoder_out,
+                 key_padding_mask=encoder_padding_mask,
+                 incremental_state=incremental_state,
+                 static_kv=True,
+                 enc_dec_attn_constraint_mask=None,  # utils.get_incremental_state(self, incremental_state, 'enc_dec_attn_constraint_mask'),
+                 reset_attn_weight=reset_attn_weight
+             )
+             attn_logits = attn[1]
+         else:
+             assert attn_out is not None
+             x = self.encoder_attn.in_proj_v(attn_out.transpose(0, 1))
+             attn_logits = None
+         x = F.dropout(x, self.dropout, training=self.training)
+         x = residual + x
+
+         residual = x
+         x = self.layer_norm3(x)
+         x = self.ffn(x, incremental_state=incremental_state)
+         x = F.dropout(x, self.dropout, training=self.training)
+         x = residual + x
+         # if len(attn_logits.size()) > 3:
+         #     indices = attn_logits.softmax(-1).max(-1).values.sum(-1).argmax(-1)
+         #     attn_logits = attn_logits.gather(1,
+         #         indices[:, None, None, None].repeat(1, 1, attn_logits.size(-2), attn_logits.size(-1))).squeeze(1)
+         return x, attn_logits
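These layers follow the [T, B, C] (or [bsz, seqlen] for embeddings) conventions noted in their docstrings. As a shape sanity check for SinusoidalPositionalEmbedding, here is a small sketch; passing positions explicitly avoids the utils.make_positions dependency:

    import torch
    emb = SinusoidalPositionalEmbedding(embedding_dim=256, padding_idx=0)
    tokens = torch.randint(1, 100, (2, 10))               # [bsz, seqlen]
    positions = torch.arange(1, 11)[None].expand(2, -1)   # valid positions start after padding_idx
    out = emb(tokens, positions=positions)                # -> [2, 10, 256], detached table lookups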
modules/commons/espnet_positional_embedding.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+
4
+
5
+ class PositionalEncoding(torch.nn.Module):
6
+ """Positional encoding.
7
+ Args:
8
+ d_model (int): Embedding dimension.
9
+ dropout_rate (float): Dropout rate.
10
+ max_len (int): Maximum input length.
11
+ reverse (bool): Whether to reverse the input position.
12
+ """
13
+
14
+ def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
15
+ """Construct an PositionalEncoding object."""
16
+ super(PositionalEncoding, self).__init__()
17
+ self.d_model = d_model
18
+ self.reverse = reverse
19
+ self.xscale = math.sqrt(self.d_model)
20
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
21
+ self.pe = None
22
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
23
+
24
+ def extend_pe(self, x):
25
+ """Reset the positional encodings."""
26
+ if self.pe is not None:
27
+ if self.pe.size(1) >= x.size(1):
28
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
29
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
30
+ return
31
+ pe = torch.zeros(x.size(1), self.d_model)
32
+ if self.reverse:
33
+ position = torch.arange(
34
+ x.size(1) - 1, -1, -1.0, dtype=torch.float32
35
+ ).unsqueeze(1)
36
+ else:
37
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
38
+ div_term = torch.exp(
39
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
40
+ * -(math.log(10000.0) / self.d_model)
41
+ )
42
+ pe[:, 0::2] = torch.sin(position * div_term)
43
+ pe[:, 1::2] = torch.cos(position * div_term)
44
+ pe = pe.unsqueeze(0)
45
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
46
+
47
+ def forward(self, x: torch.Tensor):
48
+ """Add positional encoding.
49
+ Args:
50
+ x (torch.Tensor): Input tensor (batch, time, `*`).
51
+ Returns:
52
+ torch.Tensor: Encoded tensor (batch, time, `*`).
53
+ """
54
+ self.extend_pe(x)
55
+ x = x * self.xscale + self.pe[:, : x.size(1)]
56
+ return self.dropout(x)
57
+
58
+
59
+ class ScaledPositionalEncoding(PositionalEncoding):
60
+ """Scaled positional encoding module.
61
+ See Sec. 3.2 https://arxiv.org/abs/1809.08895
62
+ Args:
63
+ d_model (int): Embedding dimension.
64
+ dropout_rate (float): Dropout rate.
65
+ max_len (int): Maximum input length.
66
+ """
67
+
68
+ def __init__(self, d_model, dropout_rate, max_len=5000):
69
+ """Initialize class."""
70
+ super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len)
71
+ self.alpha = torch.nn.Parameter(torch.tensor(1.0))
72
+
73
+ def reset_parameters(self):
74
+ """Reset parameters."""
75
+ self.alpha.data = torch.tensor(1.0)
76
+
77
+ def forward(self, x):
78
+ """Add positional encoding.
79
+ Args:
80
+ x (torch.Tensor): Input tensor (batch, time, `*`).
81
+ Returns:
82
+ torch.Tensor: Encoded tensor (batch, time, `*`).
83
+ """
84
+ self.extend_pe(x)
85
+ x = x + self.alpha * self.pe[:, : x.size(1)]
86
+ return self.dropout(x)
87
+
88
+
89
+ class RelPositionalEncoding(PositionalEncoding):
90
+ """Relative positional encoding module.
91
+ See : Appendix B in https://arxiv.org/abs/1901.02860
92
+ Args:
93
+ d_model (int): Embedding dimension.
94
+ dropout_rate (float): Dropout rate.
95
+ max_len (int): Maximum input length.
96
+ """
97
+
98
+ def __init__(self, d_model, dropout_rate, max_len=5000):
99
+ """Initialize class."""
100
+ super().__init__(d_model, dropout_rate, max_len, reverse=True)
101
+
102
+ def forward(self, x):
103
+ """Compute positional encoding.
104
+ Args:
105
+ x (torch.Tensor): Input tensor (batch, time, `*`).
106
+ Returns:
107
+ torch.Tensor: Encoded tensor with the positional embedding added (batch, time, `*`).
109
+ """
110
+ self.extend_pe(x)
111
+ x = x * self.xscale
112
+ pos_emb = self.pe[:, : x.size(1)]
113
+ return self.dropout(x) + self.dropout(pos_emb)
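
A quick, hedged usage sketch for the encodings above; batch size, length, and d_model are arbitrary, and it assumes the repo root is on PYTHONPATH.

import torch
from modules.commons.espnet_positional_embedding import (
    PositionalEncoding, RelPositionalEncoding)

x = torch.randn(2, 100, 256)                        # (batch, time, d_model)
abs_pe = PositionalEncoding(d_model=256, dropout_rate=0.1)
y = abs_pe(x)                                       # x * sqrt(d_model) + PE, then dropout
rel_pe = RelPositionalEncoding(d_model=256, dropout_rate=0.1)
z = rel_pe(x)                                       # scaled input with the PE summed in
print(y.shape, z.shape)                             # both (2, 100, 256)
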
modules/commons/ssim.py ADDED
@@ -0,0 +1,391 @@
1
+ # '''
2
+ # https://github.com/One-sixth/ms_ssim_pytorch/blob/master/ssim.py
3
+ # '''
4
+ #
5
+ # import torch
6
+ # import torch.jit
7
+ # import torch.nn.functional as F
8
+ #
9
+ #
10
+ # @torch.jit.script
11
+ # def create_window(window_size: int, sigma: float, channel: int):
12
+ # '''
13
+ # Create 1-D gauss kernel
14
+ # :param window_size: the size of gauss kernel
15
+ # :param sigma: sigma of normal distribution
16
+ # :param channel: input channel
17
+ # :return: 1D kernel
18
+ # '''
19
+ # coords = torch.arange(window_size, dtype=torch.float)
20
+ # coords -= window_size // 2
21
+ #
22
+ # g = torch.exp(-(coords ** 2) / (2 * sigma ** 2))
23
+ # g /= g.sum()
24
+ #
25
+ # g = g.reshape(1, 1, 1, -1).repeat(channel, 1, 1, 1)
26
+ # return g
27
+ #
28
+ #
29
+ # @torch.jit.script
30
+ # def _gaussian_filter(x, window_1d, use_padding: bool):
31
+ # '''
32
+ # Blur input with 1-D kernel
33
+ # :param x: batch of tensors to be blured
34
+ # :param window_1d: 1-D gauss kernel
35
+ # :param use_padding: padding image before conv
36
+ # :return: blured tensors
37
+ # '''
38
+ # C = x.shape[1]
39
+ # padding = 0
40
+ # if use_padding:
41
+ # window_size = window_1d.shape[3]
42
+ # padding = window_size // 2
43
+ # out = F.conv2d(x, window_1d, stride=1, padding=(0, padding), groups=C)
44
+ # out = F.conv2d(out, window_1d.transpose(2, 3), stride=1, padding=(padding, 0), groups=C)
45
+ # return out
46
+ #
47
+ #
48
+ # @torch.jit.script
49
+ # def ssim(X, Y, window, data_range: float, use_padding: bool = False):
50
+ # '''
51
+ # Calculate ssim index for X and Y
52
+ # :param X: images [B, C, H, N_bins]
53
+ # :param Y: images [B, C, H, N_bins]
54
+ # :param window: 1-D gauss kernel
55
+ # :param data_range: value range of input images. (usually 1.0 or 255)
56
+ # :param use_padding: padding image before conv
57
+ # :return:
58
+ # '''
59
+ #
60
+ # K1 = 0.01
61
+ # K2 = 0.03
62
+ # compensation = 1.0
63
+ #
64
+ # C1 = (K1 * data_range) ** 2
65
+ # C2 = (K2 * data_range) ** 2
66
+ #
67
+ # mu1 = _gaussian_filter(X, window, use_padding)
68
+ # mu2 = _gaussian_filter(Y, window, use_padding)
69
+ # sigma1_sq = _gaussian_filter(X * X, window, use_padding)
70
+ # sigma2_sq = _gaussian_filter(Y * Y, window, use_padding)
71
+ # sigma12 = _gaussian_filter(X * Y, window, use_padding)
72
+ #
73
+ # mu1_sq = mu1.pow(2)
74
+ # mu2_sq = mu2.pow(2)
75
+ # mu1_mu2 = mu1 * mu2
76
+ #
77
+ # sigma1_sq = compensation * (sigma1_sq - mu1_sq)
78
+ # sigma2_sq = compensation * (sigma2_sq - mu2_sq)
79
+ # sigma12 = compensation * (sigma12 - mu1_mu2)
80
+ #
81
+ # cs_map = (2 * sigma12 + C2) / (sigma1_sq + sigma2_sq + C2)
82
+ # # Fix: clamp cs_map at 0, since negative values caused ms_ssim to output NaN.
83
+ # cs_map = cs_map.clamp_min(0.)
84
+ # ssim_map = ((2 * mu1_mu2 + C1) / (mu1_sq + mu2_sq + C1)) * cs_map
85
+ #
86
+ # ssim_val = ssim_map.mean(dim=(1, 2, 3)) # reduce along CHW
87
+ # cs = cs_map.mean(dim=(1, 2, 3))
88
+ #
89
+ # return ssim_val, cs
90
+ #
91
+ #
92
+ # @torch.jit.script
93
+ # def ms_ssim(X, Y, window, data_range: float, weights, use_padding: bool = False, eps: float = 1e-8):
94
+ # '''
95
+ # interface of ms-ssim
96
+ # :param X: a batch of images, (N,C,H,W)
97
+ # :param Y: a batch of images, (N,C,H,W)
98
+ # :param window: 1-D gauss kernel
99
+ # :param data_range: value range of input images. (usually 1.0 or 255)
100
+ # :param weights: weights for different levels
101
+ # :param use_padding: padding image before conv
102
+ # :param eps: used to avoid NaN gradients.
103
+ # :return:
104
+ # '''
105
+ # levels = weights.shape[0]
106
+ # cs_vals = []
107
+ # ssim_vals = []
108
+ # for _ in range(levels):
109
+ # ssim_val, cs = ssim(X, Y, window=window, data_range=data_range, use_padding=use_padding)
110
+ # # Fix for an issue: when c = a ** b and a is 0, c.backward() makes a.grad become inf.
111
+ # ssim_val = ssim_val.clamp_min(eps)
112
+ # cs = cs.clamp_min(eps)
113
+ # cs_vals.append(cs)
114
+ #
115
+ # ssim_vals.append(ssim_val)
116
+ # padding = (X.shape[2] % 2, X.shape[3] % 2)
117
+ # X = F.avg_pool2d(X, kernel_size=2, stride=2, padding=padding)
118
+ # Y = F.avg_pool2d(Y, kernel_size=2, stride=2, padding=padding)
119
+ #
120
+ # cs_vals = torch.stack(cs_vals, dim=0)
121
+ # ms_ssim_val = torch.prod((cs_vals[:-1] ** weights[:-1].unsqueeze(1)) * (ssim_vals[-1] ** weights[-1]), dim=0)
122
+ # return ms_ssim_val
123
+ #
124
+ #
125
+ # class SSIM(torch.jit.ScriptModule):
126
+ # __constants__ = ['data_range', 'use_padding']
127
+ #
128
+ # def __init__(self, window_size=11, window_sigma=1.5, data_range=255., channel=3, use_padding=False):
129
+ # '''
130
+ # :param window_size: the size of gauss kernel
131
+ # :param window_sigma: sigma of normal distribution
132
+ # :param data_range: value range of input images. (usually 1.0 or 255)
133
+ # :param channel: input channels (default: 3)
134
+ # :param use_padding: padding image before conv
135
+ # '''
136
+ # super().__init__()
137
+ # assert window_size % 2 == 1, 'Window size must be odd.'
138
+ # window = create_window(window_size, window_sigma, channel)
139
+ # self.register_buffer('window', window)
140
+ # self.data_range = data_range
141
+ # self.use_padding = use_padding
142
+ #
143
+ # @torch.jit.script_method
144
+ # def forward(self, X, Y):
145
+ # r = ssim(X, Y, window=self.window, data_range=self.data_range, use_padding=self.use_padding)
146
+ # return r[0]
147
+ #
148
+ #
149
+ # class MS_SSIM(torch.jit.ScriptModule):
150
+ # __constants__ = ['data_range', 'use_padding', 'eps']
151
+ #
152
+ # def __init__(self, window_size=11, window_sigma=1.5, data_range=255., channel=3, use_padding=False, weights=None,
153
+ # levels=None, eps=1e-8):
154
+ # '''
155
+ # class for ms-ssim
156
+ # :param window_size: the size of gauss kernel
157
+ # :param window_sigma: sigma of normal distribution
158
+ # :param data_range: value range of input images. (usually 1.0 or 255)
159
+ # :param channel: input channels
160
+ # :param use_padding: padding image before conv
161
+ # :param weights: weights for different levels. (default [0.0448, 0.2856, 0.3001, 0.2363, 0.1333])
162
+ # :param levels: number of downsampling
163
+ # :param eps: fix for an issue: when c = a ** b and a is 0, c.backward() makes a.grad become inf.
164
+ # '''
165
+ # super().__init__()
166
+ # assert window_size % 2 == 1, 'Window size must be odd.'
167
+ # self.data_range = data_range
168
+ # self.use_padding = use_padding
169
+ # self.eps = eps
170
+ #
171
+ # window = create_window(window_size, window_sigma, channel)
172
+ # self.register_buffer('window', window)
173
+ #
174
+ # if weights is None:
175
+ # weights = [0.0448, 0.2856, 0.3001, 0.2363, 0.1333]
176
+ # weights = torch.tensor(weights, dtype=torch.float)
177
+ #
178
+ # if levels is not None:
179
+ # weights = weights[:levels]
180
+ # weights = weights / weights.sum()
181
+ #
182
+ # self.register_buffer('weights', weights)
183
+ #
184
+ # @torch.jit.script_method
185
+ # def forward(self, X, Y):
186
+ # return ms_ssim(X, Y, window=self.window, data_range=self.data_range, weights=self.weights,
187
+ # use_padding=self.use_padding, eps=self.eps)
188
+ #
189
+ #
190
+ # if __name__ == '__main__':
191
+ # print('Simple Test')
192
+ # im = torch.randint(0, 255, (5, 3, 256, 256), dtype=torch.float, device='cuda')
193
+ # img1 = im / 255
194
+ # img2 = img1 * 0.5
195
+ #
196
+ # losser = SSIM(data_range=1.).cuda()
197
+ # loss = losser(img1, img2).mean()
198
+ #
199
+ # losser2 = MS_SSIM(data_range=1.).cuda()
200
+ # loss2 = losser2(img1, img2).mean()
201
+ #
202
+ # print(loss.item())
203
+ # print(loss2.item())
204
+ #
205
+ # if __name__ == '__main__':
206
+ # print('Training Test')
207
+ # import cv2
208
+ # import torch.optim
209
+ # import numpy as np
210
+ # import imageio
211
+ # import time
212
+ #
213
+ # out_test_video = False
214
+ # # Avoid writing GIF output directly (it gets very large); write an MKV first and convert it to GIF with ffmpeg
215
+ # video_use_gif = False
216
+ #
217
+ # im = cv2.imread('test_img1.jpg', 1)
218
+ # t_im = torch.from_numpy(im).cuda().permute(2, 0, 1).float()[None] / 255.
219
+ #
220
+ # if out_test_video:
221
+ # if video_use_gif:
222
+ # fps = 0.5
223
+ # out_wh = (im.shape[1] // 2, im.shape[0] // 2)
224
+ # suffix = '.gif'
225
+ # else:
226
+ # fps = 5
227
+ # out_wh = (im.shape[1], im.shape[0])
228
+ # suffix = '.mkv'
229
+ # video_last_time = time.perf_counter()
230
+ # video = imageio.get_writer('ssim_test' + suffix, fps=fps)
231
+ #
232
+ # # Test SSIM
233
+ # print('Training SSIM')
234
+ # rand_im = torch.randint_like(t_im, 0, 255, dtype=torch.float32) / 255.
235
+ # rand_im.requires_grad = True
236
+ # optim = torch.optim.Adam([rand_im], 0.003, eps=1e-8)
237
+ # losser = SSIM(data_range=1., channel=t_im.shape[1]).cuda()
238
+ # ssim_score = 0
239
+ # while ssim_score < 0.999:
240
+ # optim.zero_grad()
241
+ # loss = losser(rand_im, t_im)
242
+ # (-loss).sum().backward()
243
+ # ssim_score = loss.item()
244
+ # optim.step()
245
+ # r_im = np.transpose(rand_im.detach().cpu().numpy().clip(0, 1) * 255, [0, 2, 3, 1]).astype(np.uint8)[0]
246
+ # r_im = cv2.putText(r_im, 'ssim %f' % ssim_score, (10, 30), cv2.FONT_HERSHEY_PLAIN, 2, (255, 0, 0), 2)
247
+ #
248
+ # if out_test_video:
249
+ # if time.perf_counter() - video_last_time > 1. / fps:
250
+ # video_last_time = time.perf_counter()
251
+ # out_frame = cv2.cvtColor(r_im, cv2.COLOR_BGR2RGB)
252
+ # out_frame = cv2.resize(out_frame, out_wh, interpolation=cv2.INTER_AREA)
253
+ # if isinstance(out_frame, cv2.UMat):
254
+ # out_frame = out_frame.get()
255
+ # video.append_data(out_frame)
256
+ #
257
+ # cv2.imshow('ssim', r_im)
258
+ # cv2.setWindowTitle('ssim', 'ssim %f' % ssim_score)
259
+ # cv2.waitKey(1)
260
+ #
261
+ # if out_test_video:
262
+ # video.close()
263
+ #
264
+ # # Test MS-SSIM
265
+ # if out_test_video:
266
+ # if video_use_gif:
267
+ # fps = 0.5
268
+ # out_wh = (im.shape[1] // 2, im.shape[0] // 2)
269
+ # suffix = '.gif'
270
+ # else:
271
+ # fps = 5
272
+ # out_wh = (im.shape[1], im.shape[0])
273
+ # suffix = '.mkv'
274
+ # video_last_time = time.perf_counter()
275
+ # video = imageio.get_writer('ms_ssim_test' + suffix, fps=fps)
276
+ #
277
+ # print('Training MS_SSIM')
278
+ # rand_im = torch.randint_like(t_im, 0, 255, dtype=torch.float32) / 255.
279
+ # rand_im.requires_grad = True
280
+ # optim = torch.optim.Adam([rand_im], 0.003, eps=1e-8)
281
+ # losser = MS_SSIM(data_range=1., channel=t_im.shape[1]).cuda()
282
+ # ssim_score = 0
283
+ # while ssim_score < 0.999:
284
+ # optim.zero_grad()
285
+ # loss = losser(rand_im, t_im)
286
+ # (-loss).sum().backward()
287
+ # ssim_score = loss.item()
288
+ # optim.step()
289
+ # r_im = np.transpose(rand_im.detach().cpu().numpy().clip(0, 1) * 255, [0, 2, 3, 1]).astype(np.uint8)[0]
290
+ # r_im = cv2.putText(r_im, 'ms_ssim %f' % ssim_score, (10, 30), cv2.FONT_HERSHEY_PLAIN, 2, (255, 0, 0), 2)
291
+ #
292
+ # if out_test_video:
293
+ # if time.perf_counter() - video_last_time > 1. / fps:
294
+ # video_last_time = time.perf_counter()
295
+ # out_frame = cv2.cvtColor(r_im, cv2.COLOR_BGR2RGB)
296
+ # out_frame = cv2.resize(out_frame, out_wh, interpolation=cv2.INTER_AREA)
297
+ # if isinstance(out_frame, cv2.UMat):
298
+ # out_frame = out_frame.get()
299
+ # video.append_data(out_frame)
300
+ #
301
+ # cv2.imshow('ms_ssim', r_im)
302
+ # cv2.setWindowTitle('ms_ssim', 'ms_ssim %f' % ssim_score)
303
+ # cv2.waitKey(1)
304
+ #
305
+ # if out_test_video:
306
+ # video.close()
307
+
308
+ """
309
+ Adapted from https://github.com/Po-Hsun-Su/pytorch-ssim
310
+ """
311
+
312
+ import torch
313
+ import torch.nn.functional as F
314
+ from torch.autograd import Variable
315
+ import numpy as np
316
+ from math import exp
317
+
318
+
319
+ def gaussian(window_size, sigma):
320
+ gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
321
+ return gauss / gauss.sum()
322
+
323
+
324
+ def create_window(window_size, channel):
325
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
326
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
327
+ window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
328
+ return window
329
+
330
+
331
+ def _ssim(img1, img2, window, window_size, channel, size_average=True):
332
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
333
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
334
+
335
+ mu1_sq = mu1.pow(2)
336
+ mu2_sq = mu2.pow(2)
337
+ mu1_mu2 = mu1 * mu2
338
+
339
+ sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
340
+ sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
341
+ sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2
342
+
343
+ C1 = 0.01 ** 2
344
+ C2 = 0.03 ** 2
345
+
346
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
347
+
348
+ if size_average:
349
+ return ssim_map.mean()
350
+ else:
351
+ return ssim_map.mean(1)
352
+
353
+
354
+ class SSIM(torch.nn.Module):
355
+ def __init__(self, window_size=11, size_average=True):
356
+ super(SSIM, self).__init__()
357
+ self.window_size = window_size
358
+ self.size_average = size_average
359
+ self.channel = 1
360
+ self.window = create_window(window_size, self.channel)
361
+
362
+ def forward(self, img1, img2):
363
+ (_, channel, _, _) = img1.size()
364
+
365
+ if channel == self.channel and self.window.data.type() == img1.data.type():
366
+ window = self.window
367
+ else:
368
+ window = create_window(self.window_size, channel)
369
+
370
+ if img1.is_cuda:
371
+ window = window.cuda(img1.get_device())
372
+ window = window.type_as(img1)
373
+
374
+ self.window = window
375
+ self.channel = channel
376
+
377
+ return _ssim(img1, img2, window, self.window_size, channel, self.size_average)
378
+
379
+
380
+ window = None
381
+
382
+
383
+ def ssim(img1, img2, window_size=11, size_average=True):
384
+ (_, channel, _, _) = img1.size()
385
+ global window
386
+ if window is None:
387
+ window = create_window(window_size, channel)
388
+ if img1.is_cuda:
389
+ window = window.cuda(img1.get_device())
390
+ window = window.type_as(img1)
391
+ return _ssim(img1, img2, window, window_size, channel, size_average)
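
A minimal usage sketch of the functional ssim above, treating mel spectrograms as single-channel images; the [B, 1, T, n_mels] shape and the [0, 1] value range are assumptions for illustration (the hard-coded C1/C2 constants assume a data range of 1).

import torch
from modules.commons.ssim import ssim

mel_a = torch.rand(4, 1, 200, 80)                   # [B, C=1, T, n_mels] in [0, 1]
mel_b = (mel_a + 0.05 * torch.randn_like(mel_a)).clamp(0, 1)
score = ssim(mel_a, mel_b, window_size=11, size_average=True)
print(score.item())                                 # near 1.0 for near-identical inputs
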
modules/fastspeech/fs2.py ADDED
@@ -0,0 +1,255 @@
1
+ from modules.commons.common_layers import *
2
+ from modules.commons.common_layers import Embedding
3
+ from modules.fastspeech.tts_modules import FastspeechDecoder, DurationPredictor, LengthRegulator, PitchPredictor, \
4
+ EnergyPredictor, FastspeechEncoder
5
+ from utils.cwt import cwt2f0
6
+ from utils.hparams import hparams
7
+ from utils.pitch_utils import f0_to_coarse, denorm_f0, norm_f0
8
+
9
+ FS_ENCODERS = {
10
+ 'fft': lambda hp: FastspeechEncoder(
11
+ hp['hidden_size'], hp['enc_layers'], hp['enc_ffn_kernel_size'],
12
+ num_heads=hp['num_heads']),
13
+ }
14
+
15
+ FS_DECODERS = {
16
+ 'fft': lambda hp: FastspeechDecoder(
17
+ hp['hidden_size'], hp['dec_layers'], hp['dec_ffn_kernel_size'], hp['num_heads']),
18
+ }
19
+
20
+
21
+ class FastSpeech2(nn.Module):
22
+ def __init__(self, dictionary, out_dims=None):
23
+ super().__init__()
24
+ # self.dictionary = dictionary
25
+ self.padding_idx = 0
26
+ if not hparams.get('no_fs2', False):
27
+ self.enc_layers = hparams['enc_layers']
28
+ self.dec_layers = hparams['dec_layers']
29
+ self.encoder = FS_ENCODERS[hparams['encoder_type']](hparams)
30
+ self.decoder = FS_DECODERS[hparams['decoder_type']](hparams)
31
+ self.hidden_size = hparams['hidden_size']
32
+ # self.encoder_embed_tokens = self.build_embedding(self.dictionary, self.hidden_size)
33
+ self.out_dims = out_dims
34
+ if out_dims is None:
35
+ self.out_dims = hparams['audio_num_mel_bins']
36
+ self.mel_out = Linear(self.hidden_size, self.out_dims, bias=True)
37
+ #=========not used===========
38
+ # if hparams['use_spk_id']:
39
+ # self.spk_embed_proj = Embedding(hparams['num_spk'] + 1, self.hidden_size)
40
+ # if hparams['use_split_spk_id']:
41
+ # self.spk_embed_f0 = Embedding(hparams['num_spk'] + 1, self.hidden_size)
42
+ # self.spk_embed_dur = Embedding(hparams['num_spk'] + 1, self.hidden_size)
43
+ # elif hparams['use_spk_embed']:
44
+ # self.spk_embed_proj = Linear(256, self.hidden_size, bias=True)
45
+ predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size
46
+ # self.dur_predictor = DurationPredictor(
47
+ # self.hidden_size,
48
+ # n_chans=predictor_hidden,
49
+ # n_layers=hparams['dur_predictor_layers'],
50
+ # dropout_rate=hparams['predictor_dropout'], padding=hparams['ffn_padding'],
51
+ # kernel_size=hparams['dur_predictor_kernel'])
52
+ # self.length_regulator = LengthRegulator()
53
+ if hparams['use_pitch_embed']:
54
+ self.pitch_embed = Embedding(300, self.hidden_size, self.padding_idx)
55
+ if hparams['pitch_type'] == 'cwt':
56
+ h = hparams['cwt_hidden_size']
57
+ cwt_out_dims = 10
58
+ if hparams['use_uv']:
59
+ cwt_out_dims = cwt_out_dims + 1
60
+ self.cwt_predictor = nn.Sequential(
61
+ nn.Linear(self.hidden_size, h),
62
+ PitchPredictor(
63
+ h,
64
+ n_chans=predictor_hidden,
65
+ n_layers=hparams['predictor_layers'],
66
+ dropout_rate=hparams['predictor_dropout'], odim=cwt_out_dims,
67
+ padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel']))
68
+ self.cwt_stats_layers = nn.Sequential(
69
+ nn.Linear(self.hidden_size, h), nn.ReLU(),
70
+ nn.Linear(h, h), nn.ReLU(), nn.Linear(h, 2)
71
+ )
72
+ else:
73
+ self.pitch_predictor = PitchPredictor(
74
+ self.hidden_size,
75
+ n_chans=predictor_hidden,
76
+ n_layers=hparams['predictor_layers'],
77
+ dropout_rate=hparams['predictor_dropout'],
78
+ odim=2 if hparams['pitch_type'] == 'frame' else 1,
79
+ padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel'])
80
+ if hparams['use_energy_embed']:
81
+ self.energy_embed = Embedding(256, self.hidden_size, self.padding_idx)
82
+ # self.energy_predictor = EnergyPredictor(
83
+ # self.hidden_size,
84
+ # n_chans=predictor_hidden,
85
+ # n_layers=hparams['predictor_layers'],
86
+ # dropout_rate=hparams['predictor_dropout'], odim=1,
87
+ # padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel'])
88
+
89
+ # def build_embedding(self, dictionary, embed_dim):
90
+ # num_embeddings = len(dictionary)
91
+ # emb = Embedding(num_embeddings, embed_dim, self.padding_idx)
92
+ # return emb
93
+
94
+ def forward(self, hubert, mel2ph=None, spk_embed=None,
95
+ ref_mels=None, f0=None, uv=None, energy=None, skip_decoder=True,
96
+ spk_embed_dur_id=None, spk_embed_f0_id=None, infer=False, **kwargs):
97
+ ret = {}
98
+ if not hparams.get('no_fs2', False):
99
+ encoder_out = self.encoder(hubert) # [B, T, C]
100
+ else:
101
+ encoder_out = hubert
102
+ src_nonpadding = (hubert != 0).any(-1)[:, :, None]
103
+
104
+ # add ref style embed
105
+ # Not implemented
106
+ # variance encoder
107
+ var_embed = 0
108
+
109
+ # encoder_out_dur denotes encoder outputs for duration predictor
110
+ # in speech adaptation, duration predictor use old speaker embedding
111
+ if hparams['use_spk_embed']:
112
+ spk_embed_dur = spk_embed_f0 = spk_embed = self.spk_embed_proj(spk_embed)[:, None, :]
113
+ elif hparams['use_spk_id']:
114
+ spk_embed_id = spk_embed
115
+ if spk_embed_dur_id is None:
116
+ spk_embed_dur_id = spk_embed_id
117
+ if spk_embed_f0_id is None:
118
+ spk_embed_f0_id = spk_embed_id
119
+ spk_embed = self.spk_embed_proj(spk_embed_id)[:, None, :]
120
+ spk_embed_dur = spk_embed_f0 = spk_embed
121
+ if hparams['use_split_spk_id']:
122
+ spk_embed_dur = self.spk_embed_dur(spk_embed_dur_id)[:, None, :]
123
+ spk_embed_f0 = self.spk_embed_f0(spk_embed_f0_id)[:, None, :]
124
+ else:
125
+ spk_embed_dur = spk_embed_f0 = spk_embed = 0
126
+
127
+ # add dur
128
+ # dur_inp = (encoder_out + var_embed + spk_embed_dur) * src_nonpadding
129
+
130
+ # mel2ph = self.add_dur(dur_inp, mel2ph, hubert, ret)
131
+ ret['mel2ph'] = mel2ph
132
+
133
+ decoder_inp = F.pad(encoder_out, [0, 0, 1, 0])
134
+
135
+ mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]])
136
+ decoder_inp_origin = decoder_inp = torch.gather(decoder_inp, 1, mel2ph_) # [B, T, H]
137
+
138
+ tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
139
+
140
+ # add pitch and energy embed
141
+ pitch_inp = (decoder_inp_origin + var_embed + spk_embed_f0) * tgt_nonpadding
142
+ if hparams['use_pitch_embed']:
143
+ pitch_inp_ph = (encoder_out + var_embed + spk_embed_f0) * src_nonpadding
144
+ decoder_inp = decoder_inp + self.add_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out=pitch_inp_ph)
145
+ if hparams['use_energy_embed']:
146
+ decoder_inp = decoder_inp + self.add_energy(pitch_inp, energy, ret)
147
+
148
+ ret['decoder_inp'] = decoder_inp = (decoder_inp + spk_embed) * tgt_nonpadding
149
+ if not hparams.get('no_fs2', False):
150
+ if skip_decoder:
151
+ return ret
152
+ ret['mel_out'] = self.run_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs)
153
+
154
+ return ret
155
+
156
+ def add_dur(self, dur_input, mel2ph, hubert, ret):
157
+ src_padding = (hubert == 0).all(-1)
158
+ dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
159
+ if mel2ph is None:
160
+ dur, xs = self.dur_predictor.inference(dur_input, src_padding)
161
+ ret['dur'] = xs
162
+ ret['dur_choice'] = dur
163
+ mel2ph = self.length_regulator(dur, src_padding).detach()
164
+ else:
165
+ ret['dur'] = self.dur_predictor(dur_input, src_padding)
166
+ ret['mel2ph'] = mel2ph
167
+ return mel2ph
168
+
169
+ def run_decoder(self, decoder_inp, tgt_nonpadding, ret, infer, **kwargs):
170
+ x = decoder_inp # [B, T, H]
171
+ x = self.decoder(x)
172
+ x = self.mel_out(x)
173
+ return x * tgt_nonpadding
174
+
175
+ def cwt2f0_norm(self, cwt_spec, mean, std, mel2ph):
176
+ f0 = cwt2f0(cwt_spec, mean, std, hparams['cwt_scales'])
177
+ f0 = torch.cat(
178
+ [f0] + [f0[:, -1:]] * (mel2ph.shape[1] - f0.shape[1]), 1)
179
+ f0_norm = norm_f0(f0, None, hparams)
180
+ return f0_norm
181
+
182
+ def out2mel(self, out):
183
+ return out
184
+
185
+ def add_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None):
186
+ # if hparams['pitch_type'] == 'ph':
187
+ # pitch_pred_inp = encoder_out.detach() + hparams['predictor_grad'] * (encoder_out - encoder_out.detach())
188
+ # pitch_padding = (encoder_out.sum().abs() == 0)
189
+ # ret['pitch_pred'] = pitch_pred = self.pitch_predictor(pitch_pred_inp)
190
+ # if f0 is None:
191
+ # f0 = pitch_pred[:, :, 0]
192
+ # ret['f0_denorm'] = f0_denorm = denorm_f0(f0, None, hparams, pitch_padding=pitch_padding)
193
+ # pitch = f0_to_coarse(f0_denorm) # start from 0 [B, T_txt]
194
+ # pitch = F.pad(pitch, [1, 0])
195
+ # pitch = torch.gather(pitch, 1, mel2ph) # [B, T_mel]
196
+ # pitch_embedding = pitch_embed(pitch)
197
+ # return pitch_embedding
198
+
199
+ decoder_inp = decoder_inp.detach() + hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())
200
+
201
+ pitch_padding = (mel2ph == 0)
202
+
203
+ # if hparams['pitch_type'] == 'cwt':
204
+ # # NOTE: this part of script is *isolated* from other scripts, which means
205
+ # # it may not be compatible with the current version.
206
+ # pass
207
+ # # pitch_padding = None
208
+ # # ret['cwt'] = cwt_out = self.cwt_predictor(decoder_inp)
209
+ # # stats_out = self.cwt_stats_layers(encoder_out[:, 0, :]) # [B, 2]
210
+ # # mean = ret['f0_mean'] = stats_out[:, 0]
211
+ # # std = ret['f0_std'] = stats_out[:, 1]
212
+ # # cwt_spec = cwt_out[:, :, :10]
213
+ # # if f0 is None:
214
+ # # std = std * hparams['cwt_std_scale']
215
+ # # f0 = self.cwt2f0_norm(cwt_spec, mean, std, mel2ph)
216
+ # # if hparams['use_uv']:
217
+ # # assert cwt_out.shape[-1] == 11
218
+ # # uv = cwt_out[:, :, -1] > 0
219
+ # elif hparams['pitch_ar']:
220
+ # ret['pitch_pred'] = pitch_pred = self.pitch_predictor(decoder_inp, f0 if is_training else None)
221
+ # if f0 is None:
222
+ # f0 = pitch_pred[:, :, 0]
223
+ # else:
224
+ # ret['pitch_pred'] = pitch_pred = self.pitch_predictor(decoder_inp)
225
+ # if f0 is None:
226
+ # f0 = pitch_pred[:, :, 0]
227
+ # if hparams['use_uv'] and uv is None:
228
+ # uv = pitch_pred[:, :, 1] > 0
229
+ ret['f0_denorm'] = f0_denorm = denorm_f0(f0, uv, hparams, pitch_padding=pitch_padding)
230
+ if pitch_padding is not None:
231
+ f0[pitch_padding] = 0
232
+
233
+ pitch = f0_to_coarse(f0_denorm, hparams) # start from 0
234
+ ret['pitch_pred'] = pitch.unsqueeze(-1)
235
+ # print(ret['pitch_pred'].shape)
236
+ # print(pitch.shape)
237
+ pitch_embedding = self.pitch_embed(pitch)
238
+ return pitch_embedding
239
+
240
+ def add_energy(self, decoder_inp, energy, ret):
241
+ decoder_inp = decoder_inp.detach() + hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())
242
+ ret['energy_pred'] = energy # energy_pred = self.energy_predictor(decoder_inp)[:, :, 0]
243
+ # if energy is None:
244
+ # energy = energy_pred
245
+ energy = torch.clamp(energy * 256 // 4, max=255).long() # energy_to_coarse
246
+ energy_embedding = self.energy_embed(energy)
247
+ return energy_embedding
248
+
249
+ @staticmethod
250
+ def mel_norm(x):
251
+ return (x + 5.5) / (6.3 / 2) - 1
252
+
253
+ @staticmethod
254
+ def mel_denorm(x):
255
+ return (x + 1) * (6.3 / 2) - 5.5
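
mel_norm and mel_denorm above are exact inverses, mapping log-mel values from roughly [-5.5, 0.8] into [-1, 1] and back. A standalone sanity check follows; the two functions are restated here only to avoid importing the full module, which pulls in hparams.

import torch

def mel_norm(x):                       # mirrors FastSpeech2.mel_norm
    return (x + 5.5) / (6.3 / 2) - 1

def mel_denorm(x):                     # mirrors FastSpeech2.mel_denorm
    return (x + 1) * (6.3 / 2) - 5.5

x = torch.linspace(-5.5, 0.8, steps=8)
assert torch.allclose(mel_denorm(mel_norm(x)), x, atol=1e-6)
print(mel_norm(torch.tensor([-5.5, 0.8])))          # tensor([-1., 1.])
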
modules/fastspeech/pe.py ADDED
@@ -0,0 +1,149 @@
1
+ from modules.commons.common_layers import *
2
+ from utils.hparams import hparams
3
+ from modules.fastspeech.tts_modules import PitchPredictor
4
+ from utils.pitch_utils import denorm_f0
5
+
6
+
7
+ class Prenet(nn.Module):
8
+ def __init__(self, in_dim=80, out_dim=256, kernel=5, n_layers=3, strides=None):
9
+ super(Prenet, self).__init__()
10
+ padding = kernel // 2
11
+ self.layers = []
12
+ self.strides = strides if strides is not None else [1] * n_layers
13
+ for l in range(n_layers):
14
+ self.layers.append(nn.Sequential(
15
+ nn.Conv1d(in_dim, out_dim, kernel_size=kernel, padding=padding, stride=self.strides[l]),
16
+ nn.ReLU(),
17
+ nn.BatchNorm1d(out_dim)
18
+ ))
19
+ in_dim = out_dim
20
+ self.layers = nn.ModuleList(self.layers)
21
+ self.out_proj = nn.Linear(out_dim, out_dim)
22
+
23
+ def forward(self, x):
24
+ """
25
+
26
+ :param x: [B, T, 80]
27
+ :return: [L, B, T, H], [B, T, H]
28
+ """
29
+ # padding_mask = x.abs().sum(-1).eq(0).data # [B, T]
30
+ padding_mask = x.abs().sum(-1).eq(0).detach()
31
+ nonpadding_mask_TB = 1 - padding_mask.float()[:, None, :] # [B, 1, T]
32
+ x = x.transpose(1, 2)
33
+ hiddens = []
34
+ for i, l in enumerate(self.layers):
35
+ nonpadding_mask_TB = nonpadding_mask_TB[:, :, ::self.strides[i]]
36
+ x = l(x) * nonpadding_mask_TB
37
+ hiddens.append(x)
38
+ hiddens = torch.stack(hiddens, 0) # [L, B, H, T]
39
+ hiddens = hiddens.transpose(2, 3) # [L, B, T, H]
40
+ x = self.out_proj(x.transpose(1, 2)) # [B, T, H]
41
+ x = x * nonpadding_mask_TB.transpose(1, 2)
42
+ return hiddens, x
43
+
44
+
45
+ class ConvBlock(nn.Module):
46
+ def __init__(self, idim=80, n_chans=256, kernel_size=3, stride=1, norm='gn', dropout=0):
47
+ super().__init__()
48
+ self.conv = ConvNorm(idim, n_chans, kernel_size, stride=stride)
49
+ self.norm = norm
50
+ if self.norm == 'bn':
51
+ self.norm = nn.BatchNorm1d(n_chans)
52
+ elif self.norm == 'in':
53
+ self.norm = nn.InstanceNorm1d(n_chans, affine=True)
54
+ elif self.norm == 'gn':
55
+ self.norm = nn.GroupNorm(n_chans // 16, n_chans)
56
+ elif self.norm == 'ln':
57
+ self.norm = LayerNorm(n_chans // 16, n_chans)
58
+ elif self.norm == 'wn':
59
+ self.conv = torch.nn.utils.weight_norm(self.conv.conv)
60
+ self.dropout = nn.Dropout(dropout)
61
+ self.relu = nn.ReLU()
62
+
63
+ def forward(self, x):
64
+ """
65
+
66
+ :param x: [B, C, T]
67
+ :return: [B, C, T]
68
+ """
69
+ x = self.conv(x)
70
+ if not isinstance(self.norm, str):
71
+ if self.norm == 'none':
72
+ pass
73
+ elif self.norm == 'ln':
74
+ x = self.norm(x.transpose(1, 2)).transpose(1, 2)
75
+ else:
76
+ x = self.norm(x)
77
+ x = self.relu(x)
78
+ x = self.dropout(x)
79
+ return x
80
+
81
+
82
+ class ConvStacks(nn.Module):
83
+ def __init__(self, idim=80, n_layers=5, n_chans=256, odim=32, kernel_size=5, norm='gn',
84
+ dropout=0, strides=None, res=True):
85
+ super().__init__()
86
+ self.conv = torch.nn.ModuleList()
87
+ self.kernel_size = kernel_size
88
+ self.res = res
89
+ self.in_proj = Linear(idim, n_chans)
90
+ if strides is None:
91
+ strides = [1] * n_layers
92
+ else:
93
+ assert len(strides) == n_layers
94
+ for idx in range(n_layers):
95
+ self.conv.append(ConvBlock(
96
+ n_chans, n_chans, kernel_size, stride=strides[idx], norm=norm, dropout=dropout))
97
+ self.out_proj = Linear(n_chans, odim)
98
+
99
+ def forward(self, x, return_hiddens=False):
100
+ """
101
+
102
+ :param x: [B, T, H]
103
+ :return: [B, T, H]
104
+ """
105
+ x = self.in_proj(x)
106
+ x = x.transpose(1, -1) # (B, idim, Tmax)
107
+ hiddens = []
108
+ for f in self.conv:
109
+ x_ = f(x)
110
+ x = x + x_ if self.res else x_ # (B, C, Tmax)
111
+ hiddens.append(x)
112
+ x = x.transpose(1, -1)
113
+ x = self.out_proj(x) # (B, Tmax, H)
114
+ if return_hiddens:
115
+ hiddens = torch.stack(hiddens, 1) # [B, L, C, T]
116
+ return x, hiddens
117
+ return x
118
+
119
+
120
+ class PitchExtractor(nn.Module):
121
+ def __init__(self, n_mel_bins=80, conv_layers=2):
122
+ super().__init__()
123
+ self.hidden_size = hparams['hidden_size']
124
+ self.predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size
125
+ self.conv_layers = conv_layers
126
+
127
+ self.mel_prenet = Prenet(n_mel_bins, self.hidden_size, strides=[1, 1, 1])
128
+ if self.conv_layers > 0:
129
+ self.mel_encoder = ConvStacks(
130
+ idim=self.hidden_size, n_chans=self.hidden_size, odim=self.hidden_size, n_layers=self.conv_layers)
131
+ self.pitch_predictor = PitchPredictor(
132
+ self.hidden_size, n_chans=self.predictor_hidden,
133
+ n_layers=5, dropout_rate=0.1, odim=2,
134
+ padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel'])
135
+
136
+ def forward(self, mel_input=None):
137
+ ret = {}
138
+ mel_hidden = self.mel_prenet(mel_input)[1]
139
+ if self.conv_layers > 0:
140
+ mel_hidden = self.mel_encoder(mel_hidden)
141
+
142
+ ret['pitch_pred'] = pitch_pred = self.pitch_predictor(mel_hidden)
143
+
144
+ pitch_padding = mel_input.abs().sum(-1) == 0
145
+ use_uv = hparams['pitch_type'] == 'frame' # and hparams['use_uv']
146
+ ret['f0_denorm_pred'] = denorm_f0(
147
+ pitch_pred[:, :, 0], (pitch_pred[:, :, 1] > 0) if use_uv else None,
148
+ hparams, pitch_padding=pitch_padding)
149
+ return ret
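
Prenet and ConvStacks above need no hparams, so they can be exercised in isolation. A minimal shape check follows; batch/time sizes are arbitrary, it assumes the repo root is on PYTHONPATH, and it assumes ConvNorm's default padding keeps the time length (as its use inside PitchExtractor implies).

import torch
from modules.fastspeech.pe import Prenet, ConvStacks

mels = torch.randn(2, 300, 80)                      # [B, T, 80]
prenet = Prenet(in_dim=80, out_dim=256)
hiddens, h = prenet(mels)                           # [L=3, B, T, 256], [B, T, 256]
stack = ConvStacks(idim=256, n_chans=256, odim=256)
y = stack(h)                                        # [B, T, 256]
print(hiddens.shape, h.shape, y.shape)
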
modules/fastspeech/tts_modules.py ADDED
@@ -0,0 +1,364 @@
1
+ import logging
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.nn import functional as F
7
+
8
+ from modules.commons.espnet_positional_embedding import RelPositionalEncoding
9
+ from modules.commons.common_layers import SinusoidalPositionalEmbedding, Linear, EncSALayer, DecSALayer, BatchNorm1dTBC
10
+ from utils.hparams import hparams
11
+
12
+ DEFAULT_MAX_SOURCE_POSITIONS = 2000
13
+ DEFAULT_MAX_TARGET_POSITIONS = 2000
14
+
15
+
16
+ class TransformerEncoderLayer(nn.Module):
17
+ def __init__(self, hidden_size, dropout, kernel_size=None, num_heads=2, norm='ln'):
18
+ super().__init__()
19
+ self.hidden_size = hidden_size
20
+ self.dropout = dropout
21
+ self.num_heads = num_heads
22
+ self.op = EncSALayer(
23
+ hidden_size, num_heads, dropout=dropout,
24
+ attention_dropout=0.0, relu_dropout=dropout,
25
+ kernel_size=kernel_size
26
+ if kernel_size is not None else hparams['enc_ffn_kernel_size'],
27
+ padding=hparams['ffn_padding'],
28
+ norm=norm, act=hparams['ffn_act'])
29
+
30
+ def forward(self, x, **kwargs):
31
+ return self.op(x, **kwargs)
32
+
33
+
34
+ ######################
35
+ # fastspeech modules
36
+ ######################
37
+ class LayerNorm(torch.nn.LayerNorm):
38
+ """Layer normalization module.
39
+ :param int nout: output dim size
40
+ :param int dim: dimension to be normalized
41
+ """
42
+
43
+ def __init__(self, nout, dim=-1):
44
+ """Construct an LayerNorm object."""
45
+ super(LayerNorm, self).__init__(nout, eps=1e-12)
46
+ self.dim = dim
47
+
48
+ def forward(self, x):
49
+ """Apply layer normalization.
50
+ :param torch.Tensor x: input tensor
51
+ :return: layer normalized tensor
52
+ :rtype torch.Tensor
53
+ """
54
+ if self.dim == -1:
55
+ return super(LayerNorm, self).forward(x)
56
+ return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)
57
+
58
+
59
+ class DurationPredictor(torch.nn.Module):
60
+ """Duration predictor module.
61
+ This is a module of duration predictor described in `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
62
+ The duration predictor predicts a duration of each frame in log domain from the hidden embeddings of encoder.
63
+ .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
64
+ https://arxiv.org/pdf/1905.09263.pdf
65
+ Note:
66
+ The calculation domain of outputs is different between in `forward` and in `inference`. In `forward`,
67
+ the outputs are calculated in log domain but in `inference`, those are calculated in linear domain.
68
+ """
69
+
70
+ def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0, padding='SAME'):
71
+ """Initilize duration predictor module.
72
+ Args:
73
+ idim (int): Input dimension.
74
+ n_layers (int, optional): Number of convolutional layers.
75
+ n_chans (int, optional): Number of channels of convolutional layers.
76
+ kernel_size (int, optional): Kernel size of convolutional layers.
77
+ dropout_rate (float, optional): Dropout rate.
78
+ offset (float, optional): Offset value to avoid nan in log domain.
79
+ """
80
+ super(DurationPredictor, self).__init__()
81
+ self.offset = offset
82
+ self.conv = torch.nn.ModuleList()
83
+ self.kernel_size = kernel_size
84
+ self.padding = padding
85
+ for idx in range(n_layers):
86
+ in_chans = idim if idx == 0 else n_chans
87
+ self.conv += [torch.nn.Sequential(
88
+ torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2)
89
+ if padding == 'SAME'
90
+ else (kernel_size - 1, 0), 0),
91
+ torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0),
92
+ torch.nn.ReLU(),
93
+ LayerNorm(n_chans, dim=1),
94
+ torch.nn.Dropout(dropout_rate)
95
+ )]
96
+ if hparams['dur_loss'] in ['mse', 'huber']:
97
+ odims = 1
98
+ elif hparams['dur_loss'] == 'mog':
99
+ odims = 15
100
+ elif hparams['dur_loss'] == 'crf':
101
+ odims = 32
102
+ from torchcrf import CRF
103
+ self.crf = CRF(odims, batch_first=True)
104
+ self.linear = torch.nn.Linear(n_chans, odims)
105
+
106
+ def _forward(self, xs, x_masks=None, is_inference=False):
107
+ xs = xs.transpose(1, -1) # (B, idim, Tmax)
108
+ for f in self.conv:
109
+ xs = f(xs) # (B, C, Tmax)
110
+ if x_masks is not None:
111
+ xs = xs * (1 - x_masks.float())[:, None, :]
112
+
113
+ xs = self.linear(xs.transpose(1, -1)) # [B, T, C]
114
+ xs = xs * (1 - x_masks.float())[:, :, None] # (B, T, C)
115
+ if is_inference:
116
+ return self.out2dur(xs), xs
117
+ else:
118
+ if hparams['dur_loss'] in ['mse']:
119
+ xs = xs.squeeze(-1) # (B, Tmax)
120
+ return xs
121
+
122
+ def out2dur(self, xs):
123
+ if hparams['dur_loss'] in ['mse']:
124
+ # NOTE: calculate in log domain
125
+ xs = xs.squeeze(-1) # (B, Tmax)
126
+ dur = torch.clamp(torch.round(xs.exp() - self.offset), min=0).long() # avoid negative value
127
+ elif hparams['dur_loss'] == 'mog':
128
+ return NotImplementedError
129
+ elif hparams['dur_loss'] == 'crf':
130
+ dur = torch.LongTensor(self.crf.decode(xs)).cuda()
131
+ return dur
132
+
133
+ def forward(self, xs, x_masks=None):
134
+ """Calculate forward propagation.
135
+ Args:
136
+ xs (Tensor): Batch of input sequences (B, Tmax, idim).
137
+ x_masks (ByteTensor, optional): Batch of masks indicating padded part (B, Tmax).
138
+ Returns:
139
+ Tensor: Batch of predicted durations in log domain (B, Tmax).
140
+ """
141
+ return self._forward(xs, x_masks, False)
142
+
143
+ def inference(self, xs, x_masks=None):
144
+ """Inference duration.
145
+ Args:
146
+ xs (Tensor): Batch of input sequences (B, Tmax, idim).
147
+ x_masks (ByteTensor, optional): Batch of masks indicating padded part (B, Tmax).
148
+ Returns:
149
+ LongTensor: Batch of predicted durations in linear domain (B, Tmax).
150
+ """
151
+ return self._forward(xs, x_masks, True)
152
+
153
+
154
+ class LengthRegulator(torch.nn.Module):
155
+ def __init__(self, pad_value=0.0):
156
+ super(LengthRegulator, self).__init__()
157
+ self.pad_value = pad_value
158
+
159
+ def forward(self, dur, dur_padding=None, alpha=1.0):
160
+ """
161
+ Example (no batch dim version):
162
+ 1. dur = [2,2,3]
163
+ 2. token_idx = [[1],[2],[3]], dur_cumsum = [2,4,7], dur_cumsum_prev = [0,2,4]
164
+ 3. token_mask = [[1,1,0,0,0,0,0],
165
+ [0,0,1,1,0,0,0],
166
+ [0,0,0,0,1,1,1]]
167
+ 4. token_idx * token_mask = [[1,1,0,0,0,0,0],
168
+ [0,0,2,2,0,0,0],
169
+ [0,0,0,0,3,3,3]]
170
+ 5. (token_idx * token_mask).sum(0) = [1,1,2,2,3,3,3]
171
+
172
+ :param dur: Batch of durations of each frame (B, T_txt)
173
+ :param dur_padding: Batch of padding of each frame (B, T_txt)
174
+ :param alpha: duration rescale coefficient
175
+ :return:
176
+ mel2ph (B, T_speech)
177
+ """
178
+ assert alpha > 0
179
+ dur = torch.round(dur.float() * alpha).long()
180
+ if dur_padding is not None:
181
+ dur = dur * (1 - dur_padding.long())
182
+ token_idx = torch.arange(1, dur.shape[1] + 1)[None, :, None].to(dur.device)
183
+ dur_cumsum = torch.cumsum(dur, 1)
184
+ dur_cumsum_prev = F.pad(dur_cumsum, [1, -1], mode='constant', value=0)
185
+
186
+ pos_idx = torch.arange(dur.sum(-1).max())[None, None].to(dur.device)
187
+ token_mask = (pos_idx >= dur_cumsum_prev[:, :, None]) & (pos_idx < dur_cumsum[:, :, None])
188
+ mel2ph = (token_idx * token_mask.long()).sum(1)
189
+ return mel2ph
190
+
191
+
192
+ class PitchPredictor(torch.nn.Module):
193
+ def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5,
194
+ dropout_rate=0.1, padding='SAME'):
195
+ """Initilize pitch predictor module.
196
+ Args:
197
+ idim (int): Input dimension.
198
+ n_layers (int, optional): Number of convolutional layers.
199
+ n_chans (int, optional): Number of channels of convolutional layers.
200
+ kernel_size (int, optional): Kernel size of convolutional layers.
201
+ dropout_rate (float, optional): Dropout rate.
202
+ """
203
+ super(PitchPredictor, self).__init__()
204
+ self.conv = torch.nn.ModuleList()
205
+ self.kernel_size = kernel_size
206
+ self.padding = padding
207
+ for idx in range(n_layers):
208
+ in_chans = idim if idx == 0 else n_chans
209
+ self.conv += [torch.nn.Sequential(
210
+ torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2)
211
+ if padding == 'SAME'
212
+ else (kernel_size - 1, 0), 0),
213
+ torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0),
214
+ torch.nn.ReLU(),
215
+ LayerNorm(n_chans, dim=1),
216
+ torch.nn.Dropout(dropout_rate)
217
+ )]
218
+ self.linear = torch.nn.Linear(n_chans, odim)
219
+ self.embed_positions = SinusoidalPositionalEmbedding(idim, 0, init_size=4096)
220
+ self.pos_embed_alpha = nn.Parameter(torch.Tensor([1]))
221
+
222
+ def forward(self, xs):
223
+ """
224
+
225
+ :param xs: [B, T, H]
226
+ :return: [B, T, H]
227
+ """
228
+ positions = self.pos_embed_alpha * self.embed_positions(xs[..., 0])
229
+ xs = xs + positions
230
+ xs = xs.transpose(1, -1) # (B, idim, Tmax)
231
+ for f in self.conv:
232
+ xs = f(xs) # (B, C, Tmax)
233
+ # NOTE: calculate in log domain
234
+ xs = self.linear(xs.transpose(1, -1)) # (B, Tmax, H)
235
+ return xs
236
+
237
+
238
+ class EnergyPredictor(PitchPredictor):
239
+ pass
240
+
241
+
242
+ def mel2ph_to_dur(mel2ph, T_txt, max_dur=None):
243
+ B, _ = mel2ph.shape
244
+ dur = mel2ph.new_zeros(B, T_txt + 1).scatter_add(1, mel2ph, torch.ones_like(mel2ph))
245
+ dur = dur[:, 1:]
246
+ if max_dur is not None:
247
+ dur = dur.clamp(max=max_dur)
248
+ return dur
249
+
250
+
251
+ class FFTBlocks(nn.Module):
252
+ def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, dropout=None, num_heads=2,
253
+ use_pos_embed=True, use_last_norm=True, norm='ln', use_pos_embed_alpha=True):
254
+ super().__init__()
255
+ self.num_layers = num_layers
256
+ embed_dim = self.hidden_size = hidden_size
257
+ self.dropout = dropout if dropout is not None else hparams['dropout']
258
+ self.use_pos_embed = use_pos_embed
259
+ self.use_last_norm = use_last_norm
260
+ if use_pos_embed:
261
+ self.max_source_positions = DEFAULT_MAX_TARGET_POSITIONS
262
+ self.padding_idx = 0
263
+ self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) if use_pos_embed_alpha else 1
264
+ self.embed_positions = SinusoidalPositionalEmbedding(
265
+ embed_dim, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS,
266
+ )
267
+
268
+ self.layers = nn.ModuleList([])
269
+ self.layers.extend([
270
+ TransformerEncoderLayer(self.hidden_size, self.dropout,
271
+ kernel_size=ffn_kernel_size, num_heads=num_heads)
272
+ for _ in range(self.num_layers)
273
+ ])
274
+ if self.use_last_norm:
275
+ if norm == 'ln':
276
+ self.layer_norm = nn.LayerNorm(embed_dim)
277
+ elif norm == 'bn':
278
+ self.layer_norm = BatchNorm1dTBC(embed_dim)
279
+ else:
280
+ self.layer_norm = None
281
+
282
+ def forward(self, x, padding_mask=None, attn_mask=None, return_hiddens=False):
283
+ """
284
+ :param x: [B, T, C]
285
+ :param padding_mask: [B, T]
286
+ :return: [B, T, C] or [L, B, T, C]
287
+ """
288
+ # padding_mask = x.abs().sum(-1).eq(0).data if padding_mask is None else padding_mask
289
+ padding_mask = x.abs().sum(-1).eq(0).detach() if padding_mask is None else padding_mask
290
+ nonpadding_mask_TB = 1 - padding_mask.transpose(0, 1).float()[:, :, None] # [T, B, 1]
291
+ if self.use_pos_embed:
292
+ positions = self.pos_embed_alpha * self.embed_positions(x[..., 0])
293
+ x = x + positions
294
+ x = F.dropout(x, p=self.dropout, training=self.training)
295
+ # B x T x C -> T x B x C
296
+ x = x.transpose(0, 1) * nonpadding_mask_TB
297
+ hiddens = []
298
+ for layer in self.layers:
299
+ x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB
300
+ hiddens.append(x)
301
+ if self.use_last_norm:
302
+ x = self.layer_norm(x) * nonpadding_mask_TB
303
+ if return_hiddens:
304
+ x = torch.stack(hiddens, 0) # [L, T, B, C]
305
+ x = x.transpose(1, 2) # [L, B, T, C]
306
+ else:
307
+ x = x.transpose(0, 1) # [B, T, C]
308
+ return x
309
+
310
+
311
+ class FastspeechEncoder(FFTBlocks):
312
+ '''
313
+ compared to FFTBlocks:
314
+ - input is [B, T, H], not [B, T, C]
315
+ - supports "relative" positional encoding
316
+ '''
317
+ def __init__(self, hidden_size=None, num_layers=None, kernel_size=None, num_heads=2):
318
+ hidden_size = hparams['hidden_size'] if hidden_size is None else hidden_size
319
+ kernel_size = hparams['enc_ffn_kernel_size'] if kernel_size is None else kernel_size
320
+ num_layers = hparams['dec_layers'] if num_layers is None else num_layers
321
+ super().__init__(hidden_size, num_layers, kernel_size, num_heads=num_heads,
322
+ use_pos_embed=False) # use_pos_embed_alpha for compatibility
323
+ #self.embed_tokens = embed_tokens
324
+ self.embed_scale = math.sqrt(hidden_size)
325
+ self.padding_idx = 0
326
+ if hparams.get('rel_pos') is not None and hparams['rel_pos']:
327
+ self.embed_positions = RelPositionalEncoding(hidden_size, dropout_rate=0.0)
328
+ else:
329
+ self.embed_positions = SinusoidalPositionalEmbedding(
330
+ hidden_size, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS,
331
+ )
332
+
333
+ def forward(self, hubert):
334
+ """
335
+
336
+ :param hubert: [B, T, H ]
337
+ :return: {
338
+ 'encoder_out': [T x B x C]
339
+ }
340
+ """
341
+ # encoder_padding_mask = txt_tokens.eq(self.padding_idx).data
342
+ encoder_padding_mask = (hubert == 0).all(-1)
343
+ x = self.forward_embedding(hubert) # [B, T, H]
344
+ x = super(FastspeechEncoder, self).forward(x, encoder_padding_mask)
345
+ return x
346
+
347
+ def forward_embedding(self, hubert):
348
+ # embed tokens and positions
349
+ x = self.embed_scale * hubert
350
+ if hparams['use_pos_embed']:
351
+ positions = self.embed_positions(hubert)
352
+ x = x + positions
353
+ x = F.dropout(x, p=self.dropout, training=self.training)
354
+ return x
355
+
356
+
357
+ class FastspeechDecoder(FFTBlocks):
358
+ def __init__(self, hidden_size=None, num_layers=None, kernel_size=None, num_heads=None):
359
+ num_heads = hparams['num_heads'] if num_heads is None else num_heads
360
+ hidden_size = hparams['hidden_size'] if hidden_size is None else hidden_size
361
+ kernel_size = hparams['dec_ffn_kernel_size'] if kernel_size is None else kernel_size
362
+ num_layers = hparams['dec_layers'] if num_layers is None else num_layers
363
+ super().__init__(hidden_size, num_layers, kernel_size, num_heads=num_heads)
364
+
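
The LengthRegulator docstring above already contains a worked example; here it is run end-to-end, with mel2ph_to_dur recovering the original durations (a round trip). Assumes the repo root is on PYTHONPATH.

import torch
from modules.fastspeech.tts_modules import LengthRegulator, mel2ph_to_dur

dur = torch.tensor([[2, 2, 3]])                     # [B=1, T_txt=3]
mel2ph = LengthRegulator()(dur)                     # 1-based token index per output frame
print(mel2ph)                                       # tensor([[1, 1, 2, 2, 3, 3, 3]])
print(mel2ph_to_dur(mel2ph, T_txt=3))               # tensor([[2, 2, 3]]): durations recovered
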
modules/hifigan/hifigan.py ADDED
@@ -0,0 +1,365 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+
7
+ from modules.parallel_wavegan.layers import UpsampleNetwork, ConvInUpsampleNetwork
8
+ from modules.parallel_wavegan.models.source import SourceModuleHnNSF
9
+ import numpy as np
10
+
11
+ LRELU_SLOPE = 0.1
12
+
13
+
14
+ def init_weights(m, mean=0.0, std=0.01):
15
+ classname = m.__class__.__name__
16
+ if classname.find("Conv") != -1:
17
+ m.weight.data.normal_(mean, std)
18
+
19
+
20
+ def apply_weight_norm(m):
21
+ classname = m.__class__.__name__
22
+ if classname.find("Conv") != -1:
23
+ weight_norm(m)
24
+
25
+
26
+ def get_padding(kernel_size, dilation=1):
27
+ return int((kernel_size * dilation - dilation) / 2)
28
+
29
+
30
+ class ResBlock1(torch.nn.Module):
31
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
32
+ super(ResBlock1, self).__init__()
33
+ self.h = h
34
+ self.convs1 = nn.ModuleList([
35
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
36
+ padding=get_padding(kernel_size, dilation[0]))),
37
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
38
+ padding=get_padding(kernel_size, dilation[1]))),
39
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
40
+ padding=get_padding(kernel_size, dilation[2])))
41
+ ])
42
+ self.convs1.apply(init_weights)
43
+
44
+ self.convs2 = nn.ModuleList([
45
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
46
+ padding=get_padding(kernel_size, 1))),
47
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
48
+ padding=get_padding(kernel_size, 1))),
49
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
50
+ padding=get_padding(kernel_size, 1)))
51
+ ])
52
+ self.convs2.apply(init_weights)
53
+
54
+ def forward(self, x):
55
+ for c1, c2 in zip(self.convs1, self.convs2):
56
+ xt = F.leaky_relu(x, LRELU_SLOPE)
57
+ xt = c1(xt)
58
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
59
+ xt = c2(xt)
60
+ x = xt + x
61
+ return x
62
+
63
+ def remove_weight_norm(self):
64
+ for l in self.convs1:
65
+ remove_weight_norm(l)
66
+ for l in self.convs2:
67
+ remove_weight_norm(l)
68
+
69
+
70
+ class ResBlock2(torch.nn.Module):
71
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
72
+ super(ResBlock2, self).__init__()
73
+ self.h = h
74
+ self.convs = nn.ModuleList([
75
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
76
+ padding=get_padding(kernel_size, dilation[0]))),
77
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
78
+ padding=get_padding(kernel_size, dilation[1])))
79
+ ])
80
+ self.convs.apply(init_weights)
81
+
82
+ def forward(self, x):
83
+ for c in self.convs:
84
+ xt = F.leaky_relu(x, LRELU_SLOPE)
85
+ xt = c(xt)
86
+ x = xt + x
87
+ return x
88
+
89
+ def remove_weight_norm(self):
90
+ for l in self.convs:
91
+ remove_weight_norm(l)
92
+
93
+
94
+ class Conv1d1x1(Conv1d):
95
+ """1x1 Conv1d with customized initialization."""
96
+
97
+ def __init__(self, in_channels, out_channels, bias):
98
+ """Initialize 1x1 Conv1d module."""
99
+ super(Conv1d1x1, self).__init__(in_channels, out_channels,
100
+ kernel_size=1, padding=0,
101
+ dilation=1, bias=bias)
102
+
103
+
104
+ class HifiGanGenerator(torch.nn.Module):
105
+ def __init__(self, h, c_out=1):
106
+ super(HifiGanGenerator, self).__init__()
107
+ self.h = h
108
+ self.num_kernels = len(h['resblock_kernel_sizes'])
109
+ self.num_upsamples = len(h['upsample_rates'])
110
+
111
+ if h['use_pitch_embed']:
112
+ self.harmonic_num = 8
113
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h['upsample_rates']))
114
+ self.m_source = SourceModuleHnNSF(
115
+ sampling_rate=h['audio_sample_rate'],
116
+ harmonic_num=self.harmonic_num)
117
+ self.noise_convs = nn.ModuleList()
118
+ self.conv_pre = weight_norm(Conv1d(80, h['upsample_initial_channel'], 7, 1, padding=3))
119
+ resblock = ResBlock1 if h['resblock'] == '1' else ResBlock2
120
+
121
+ self.ups = nn.ModuleList()
122
+ for i, (u, k) in enumerate(zip(h['upsample_rates'], h['upsample_kernel_sizes'])):
123
+ c_cur = h['upsample_initial_channel'] // (2 ** (i + 1))
124
+ self.ups.append(weight_norm(
125
+ ConvTranspose1d(c_cur * 2, c_cur, k, u, padding=(k - u) // 2)))
126
+ if h['use_pitch_embed']:
127
+ if i + 1 < len(h['upsample_rates']):
128
+ stride_f0 = np.prod(h['upsample_rates'][i + 1:])
129
+ self.noise_convs.append(Conv1d(
130
+ 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
131
+ else:
132
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
133
+
134
+ self.resblocks = nn.ModuleList()
135
+ for i in range(len(self.ups)):
136
+ ch = h['upsample_initial_channel'] // (2 ** (i + 1))
137
+ for j, (k, d) in enumerate(zip(h['resblock_kernel_sizes'], h['resblock_dilation_sizes'])):
138
+ self.resblocks.append(resblock(h, ch, k, d))
139
+
140
+ self.conv_post = weight_norm(Conv1d(ch, c_out, 7, 1, padding=3))
141
+ self.ups.apply(init_weights)
142
+ self.conv_post.apply(init_weights)
143
+
144
+ def forward(self, x, f0=None):
145
+ if f0 is not None:
146
+ # harmonic-source signal, noise-source signal, uv flag
147
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)
148
+ har_source, noi_source, uv = self.m_source(f0)
149
+ har_source = har_source.transpose(1, 2)
150
+
151
+ x = self.conv_pre(x)
152
+ for i in range(self.num_upsamples):
153
+ x = F.leaky_relu(x, LRELU_SLOPE)
154
+ x = self.ups[i](x)
155
+ if f0 is not None:
156
+ x_source = self.noise_convs[i](har_source)
157
+ x = x + x_source
158
+ xs = None
159
+ for j in range(self.num_kernels):
160
+ if xs is None:
161
+ xs = self.resblocks[i * self.num_kernels + j](x)
162
+ else:
163
+ xs += self.resblocks[i * self.num_kernels + j](x)
164
+ x = xs / self.num_kernels
165
+ x = F.leaky_relu(x)
166
+ x = self.conv_post(x)
167
+ x = torch.tanh(x)
168
+
169
+ return x
170
+
171
+ def remove_weight_norm(self):
172
+ print('Removing weight norm...')
173
+ for l in self.ups:
174
+ remove_weight_norm(l)
175
+ for l in self.resblocks:
176
+ l.remove_weight_norm()
177
+ remove_weight_norm(self.conv_pre)
178
+ remove_weight_norm(self.conv_post)
179
+
180
+
181
+ class DiscriminatorP(torch.nn.Module):
182
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False, use_cond=False, c_in=1):
183
+ super(DiscriminatorP, self).__init__()
184
+ self.use_cond = use_cond
185
+ if use_cond:
186
+ from utils.hparams import hparams
187
+ t = hparams['hop_size']
188
+ self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2)
189
+ c_in = 2
190
+
191
+ self.period = period
192
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
193
+ self.convs = nn.ModuleList([
194
+ norm_f(Conv2d(c_in, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
195
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
196
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
197
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
198
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
199
+ ])
200
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
201
+
202
+ def forward(self, x, mel):
203
+ fmap = []
204
+ if self.use_cond:
205
+ x_mel = self.cond_net(mel)
206
+ x = torch.cat([x_mel, x], 1)
207
+ # 1d to 2d
208
+ b, c, t = x.shape
209
+ if t % self.period != 0: # pad first
210
+ n_pad = self.period - (t % self.period)
211
+ x = F.pad(x, (0, n_pad), "reflect")
212
+ t = t + n_pad
213
+ x = x.view(b, c, t // self.period, self.period)
214
+
215
+ for l in self.convs:
216
+ x = l(x)
217
+ x = F.leaky_relu(x, LRELU_SLOPE)
218
+ fmap.append(x)
219
+ x = self.conv_post(x)
220
+ fmap.append(x)
221
+ x = torch.flatten(x, 1, -1)
222
+
223
+ return x, fmap
224
+
225
+
226
+ class MultiPeriodDiscriminator(torch.nn.Module):
227
+ def __init__(self, use_cond=False, c_in=1):
228
+ super(MultiPeriodDiscriminator, self).__init__()
229
+ self.discriminators = nn.ModuleList([
230
+ DiscriminatorP(2, use_cond=use_cond, c_in=c_in),
231
+ DiscriminatorP(3, use_cond=use_cond, c_in=c_in),
232
+ DiscriminatorP(5, use_cond=use_cond, c_in=c_in),
233
+ DiscriminatorP(7, use_cond=use_cond, c_in=c_in),
234
+ DiscriminatorP(11, use_cond=use_cond, c_in=c_in),
235
+ ])
236
+
237
+ def forward(self, y, y_hat, mel=None):
238
+ y_d_rs = []
239
+ y_d_gs = []
240
+ fmap_rs = []
241
+ fmap_gs = []
242
+ for i, d in enumerate(self.discriminators):
243
+ y_d_r, fmap_r = d(y, mel)
244
+ y_d_g, fmap_g = d(y_hat, mel)
245
+ y_d_rs.append(y_d_r)
246
+ fmap_rs.append(fmap_r)
247
+ y_d_gs.append(y_d_g)
248
+ fmap_gs.append(fmap_g)
249
+
250
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
251
+
252
+
253
+ class DiscriminatorS(torch.nn.Module):
254
+ def __init__(self, use_spectral_norm=False, use_cond=False, upsample_rates=None, c_in=1):
255
+ super(DiscriminatorS, self).__init__()
256
+ self.use_cond = use_cond
257
+ if use_cond:
258
+ t = np.prod(upsample_rates)
259
+ self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2)
260
+ c_in = 2
261
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
262
+ self.convs = nn.ModuleList([
263
+ norm_f(Conv1d(c_in, 128, 15, 1, padding=7)),
264
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
265
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
266
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
267
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
268
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
269
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
270
+ ])
271
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
272
+
273
+ def forward(self, x, mel):
274
+ if self.use_cond:
275
+ x_mel = self.cond_net(mel)
276
+ x = torch.cat([x_mel, x], 1)
277
+ fmap = []
278
+ for l in self.convs:
279
+ x = l(x)
280
+ x = F.leaky_relu(x, LRELU_SLOPE)
281
+ fmap.append(x)
282
+ x = self.conv_post(x)
283
+ fmap.append(x)
284
+ x = torch.flatten(x, 1, -1)
285
+
286
+ return x, fmap
287
+
288
+
289
+ class MultiScaleDiscriminator(torch.nn.Module):
290
+ def __init__(self, use_cond=False, c_in=1):
291
+ super(MultiScaleDiscriminator, self).__init__()
292
+ from utils.hparams import hparams
293
+ self.discriminators = nn.ModuleList([
294
+ DiscriminatorS(use_spectral_norm=True, use_cond=use_cond,
295
+ upsample_rates=[4, 4, hparams['hop_size'] // 16],
296
+ c_in=c_in),
297
+ DiscriminatorS(use_cond=use_cond,
298
+ upsample_rates=[4, 4, hparams['hop_size'] // 32],
299
+ c_in=c_in),
300
+ DiscriminatorS(use_cond=use_cond,
301
+ upsample_rates=[4, 4, hparams['hop_size'] // 64],
302
+ c_in=c_in),
303
+ ])
304
+ self.meanpools = nn.ModuleList([
305
+ AvgPool1d(4, 2, padding=1),
306
+ AvgPool1d(4, 2, padding=1)
307
+ ])
308
+
309
+ def forward(self, y, y_hat, mel=None):
310
+ y_d_rs = []
311
+ y_d_gs = []
312
+ fmap_rs = []
313
+ fmap_gs = []
314
+ for i, d in enumerate(self.discriminators):
315
+ if i != 0:
316
+ y = self.meanpools[i - 1](y)
317
+ y_hat = self.meanpools[i - 1](y_hat)
318
+ y_d_r, fmap_r = d(y, mel)
319
+ y_d_g, fmap_g = d(y_hat, mel)
320
+ y_d_rs.append(y_d_r)
321
+ fmap_rs.append(fmap_r)
322
+ y_d_gs.append(y_d_g)
323
+ fmap_gs.append(fmap_g)
324
+
325
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
326
+
327
+
328
+ def feature_loss(fmap_r, fmap_g):
329
+ loss = 0
330
+ for dr, dg in zip(fmap_r, fmap_g):
331
+ for rl, gl in zip(dr, dg):
332
+ loss += torch.mean(torch.abs(rl - gl))
333
+
334
+ return loss * 2
335
+
336
+
337
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
338
+ r_losses = 0
339
+ g_losses = 0
340
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
341
+ r_loss = torch.mean((1 - dr) ** 2)
342
+ g_loss = torch.mean(dg ** 2)
343
+ r_losses += r_loss
344
+ g_losses += g_loss
345
+ r_losses = r_losses / len(disc_real_outputs)
346
+ g_losses = g_losses / len(disc_real_outputs)
347
+ return r_losses, g_losses
348
+
349
+
350
+ def cond_discriminator_loss(outputs):
351
+ loss = 0
352
+ for dg in outputs:
353
+ g_loss = torch.mean(dg ** 2)
354
+ loss += g_loss
355
+ loss = loss / len(outputs)
356
+ return loss
357
+
358
+
359
+ def generator_loss(disc_outputs):
360
+ loss = 0
361
+ for dg in disc_outputs:
362
+ l = torch.mean((1 - dg) ** 2)
363
+ loss += l
364
+ loss = loss / len(disc_outputs)
365
+ return loss
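For orientation, here is a minimal, hypothetical sketch of how the discriminators and losses above are typically combined in a GAN training step; the import path, batch shapes, and waveform length are assumptions, not taken from this commit:

import torch
from modules.hifigan.hifigan import (MultiPeriodDiscriminator, discriminator_loss,
                                     generator_loss, feature_loss)

mpd = MultiPeriodDiscriminator()
y = torch.randn(2, 1, 8192)       # real waveform batch (B, 1, T); shape assumed
y_hat = torch.randn(2, 1, 8192)   # stand-in for the generator output

# discriminator step: push real outputs toward 1, fake outputs toward 0
r_loss, g_loss = discriminator_loss(*mpd(y, y_hat.detach())[:2])
loss_d = r_loss + g_loss

# generator step: adversarial term plus feature matching on the fmaps
y_d_r, y_d_g, fmap_r, fmap_g = mpd(y, y_hat)
loss_g = generator_loss(y_d_g) + feature_loss(fmap_r, fmap_g)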
modules/hifigan/mel_utils.py ADDED
@@ -0,0 +1,80 @@
+ import numpy as np
+ import torch
+ import torch.utils.data
+ from librosa.filters import mel as librosa_mel_fn
+ from scipy.io.wavfile import read
+
+ MAX_WAV_VALUE = 32768.0
+
+
+ def load_wav(full_path):
+     sampling_rate, data = read(full_path)
+     return data, sampling_rate
+
+
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
+     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+
+
+ def dynamic_range_decompression(x, C=1):
+     return np.exp(x) / C
+
+
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+     return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+ def dynamic_range_decompression_torch(x, C=1):
+     return torch.exp(x) / C
+
+
+ def spectral_normalize_torch(magnitudes):
+     output = dynamic_range_compression_torch(magnitudes)
+     return output
+
+
+ def spectral_de_normalize_torch(magnitudes):
+     output = dynamic_range_decompression_torch(magnitudes)
+     return output
+
+
+ mel_basis = {}
+ hann_window = {}
+
+
+ def mel_spectrogram(y, hparams, center=False, complex=False):
+     # hop_size: 512   # for 22050 Hz, 275 ≈ 12.5 ms (0.0125 * sample_rate)
+     # win_size: 2048  # for 22050 Hz, 1100 ≈ 50 ms (if None, win_size = fft_size) (0.05 * sample_rate)
+     # fmin: 55        # set to 55 for male speakers; 95 helps remove noise for female speakers
+     #                 # (tune per dataset; pitch ranges: male ~[65, 260], female ~[100, 525])
+     # fmax: 10000     # to be increased/reduced depending on data
+     # fft_size: 2048  # the window is zero-padded to match this parameter
+     n_fft = hparams['fft_size']
+     num_mels = hparams['audio_num_mel_bins']
+     sampling_rate = hparams['audio_sample_rate']
+     hop_size = hparams['hop_size']
+     win_size = hparams['win_size']
+     fmin = hparams['fmin']
+     fmax = hparams['fmax']
+     y = y.clamp(min=-1., max=1.)
+     global mel_basis, hann_window
+     key = str(fmax) + '_' + str(y.device)
+     if key not in mel_basis:  # the original checked the bare `fmax`, which never matches the composed key
+         mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+         mel_basis[key] = torch.from_numpy(mel).float().to(y.device)
+         hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
+
+     y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
+                                 mode='reflect')
+     y = y.squeeze(1)
+
+     spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
+                       center=center, pad_mode='reflect', normalized=False, onesided=True)
+
+     if not complex:
+         spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+         spec = torch.matmul(mel_basis[key], spec)
+         spec = spectral_normalize_torch(spec)
+     else:
+         B, C, T, _ = spec.shape
+         spec = spec.transpose(1, 2)  # [B, T, n_fft, 2]
+     return spec
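A minimal usage sketch for mel_spectrogram (the hparams values are assumptions chosen to be self-consistent; note that torch.stft is called without return_complex above, which presumes a pre-2.0 PyTorch that still returns the real-valued view):

import torch

hparams = {'fft_size': 2048, 'audio_num_mel_bins': 80, 'audio_sample_rate': 44100,
           'hop_size': 512, 'win_size': 2048, 'fmin': 55, 'fmax': 10000}
wav = torch.randn(1, 44100).clamp(-1., 1.)   # (B, T) waveform in [-1, 1]
mel = mel_spectrogram(wav, hparams)          # (B, 80, ~T // hop_size) log-mel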
modules/nsf_hifigan/env.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ import shutil
+
+
+ class AttrDict(dict):
+     def __init__(self, *args, **kwargs):
+         super(AttrDict, self).__init__(*args, **kwargs)
+         self.__dict__ = self
+
+
+ def build_env(config, config_name, path):
+     t_path = os.path.join(path, config_name)
+     if config != t_path:
+         os.makedirs(path, exist_ok=True)
+         shutil.copyfile(config, os.path.join(path, config_name))
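AttrDict aliases the dict's __dict__ to the dict itself, so config values can be read as attributes as well as keys; a tiny illustrative example (the values are assumptions):

h = AttrDict({'num_mels': 128, 'upsample_rates': [8, 8, 2, 2]})
assert h.num_mels == h['num_mels'] == 128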
modules/nsf_hifigan/models.py ADDED
@@ -0,0 +1,549 @@
+ import os
+ import json
+ from .env import AttrDict
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ import torch.nn as nn
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+ from .utils import init_weights, get_padding
+
+ LRELU_SLOPE = 0.1
+
+
+ def load_model(model_path, device='cuda'):
+     config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
+     with open(config_file) as f:
+         data = f.read()
+
+     global h
+     json_config = json.loads(data)
+     h = AttrDict(json_config)
+
+     generator = Generator(h).to(device)
+
+     cp_dict = torch.load(model_path)
+     generator.load_state_dict(cp_dict['generator'])
+     generator.eval()
+     generator.remove_weight_norm()
+     del cp_dict
+     return generator, h
+
+
+ class ResBlock1(torch.nn.Module):
+     def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+         super(ResBlock1, self).__init__()
+         self.h = h
+         self.convs1 = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                                padding=get_padding(kernel_size, dilation[0]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                                padding=get_padding(kernel_size, dilation[1]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                                padding=get_padding(kernel_size, dilation[2])))
+         ])
+         self.convs1.apply(init_weights)
+
+         self.convs2 = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1)))
+         ])
+         self.convs2.apply(init_weights)
+
+     def forward(self, x):
+         for c1, c2 in zip(self.convs1, self.convs2):
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             xt = c1(xt)
+             xt = F.leaky_relu(xt, LRELU_SLOPE)
+             xt = c2(xt)
+             x = xt + x
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs1:
+             remove_weight_norm(l)
+         for l in self.convs2:
+             remove_weight_norm(l)
+
+
+ class ResBlock2(torch.nn.Module):
+     def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
+         super(ResBlock2, self).__init__()
+         self.h = h
+         self.convs = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                                padding=get_padding(kernel_size, dilation[0]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                                padding=get_padding(kernel_size, dilation[1])))
+         ])
+         self.convs.apply(init_weights)
+
+     def forward(self, x):
+         for c in self.convs:
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             xt = c(xt)
+             x = xt + x
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs:
+             remove_weight_norm(l)
+
+
+ class Generator(torch.nn.Module):
+     # Plain HiFi-GAN generator. Note: the NSF variant defined later in this
+     # file reuses the name `Generator` and shadows this class, so load_model()
+     # instantiates the NSF one.
+     def __init__(self, h):
+         super(Generator, self).__init__()
+         self.h = h
+         self.num_kernels = len(h.resblock_kernel_sizes)
+         self.num_upsamples = len(h.upsample_rates)
+         self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
+         resblock = ResBlock1 if h.resblock == '1' else ResBlock2
+
+         self.ups = nn.ModuleList()
+         for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+             self.ups.append(weight_norm(
+                 ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)),
+                                 k, u, padding=(k - u) // 2)))
+
+         self.resblocks = nn.ModuleList()
+         for i in range(len(self.ups)):
+             ch = h.upsample_initial_channel // (2 ** (i + 1))
+             for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
+                 self.resblocks.append(resblock(h, ch, k, d))
+
+         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+         self.ups.apply(init_weights)
+         self.conv_post.apply(init_weights)
+
+     def forward(self, x):
+         x = self.conv_pre(x)
+         for i in range(self.num_upsamples):
+             x = F.leaky_relu(x, LRELU_SLOPE)
+             x = self.ups[i](x)
+             xs = None
+             for j in range(self.num_kernels):
+                 if xs is None:
+                     xs = self.resblocks[i * self.num_kernels + j](x)
+                 else:
+                     xs += self.resblocks[i * self.num_kernels + j](x)
+             x = xs / self.num_kernels
+         x = F.leaky_relu(x)
+         x = self.conv_post(x)
+         x = torch.tanh(x)
+
+         return x
+
+     def remove_weight_norm(self):
+         print('Removing weight norm...')
+         for l in self.ups:
+             remove_weight_norm(l)
+         for l in self.resblocks:
+             l.remove_weight_norm()
+         remove_weight_norm(self.conv_pre)
+         remove_weight_norm(self.conv_post)
+
+
+ class SineGen(torch.nn.Module):
+     """Definition of the sine generator
+     SineGen(samp_rate, harmonic_num=0,
+             sine_amp=0.1, noise_std=0.003,
+             voiced_threshold=0,
+             flag_for_pulse=False)
+     samp_rate: sampling rate in Hz
+     harmonic_num: number of harmonic overtones (default 0)
+     sine_amp: amplitude of the sine waveform (default 0.1)
+     noise_std: std of Gaussian noise (default 0.003)
+     voiced_threshold: F0 threshold for U/V classification (default 0)
+     flag_for_pulse: whether this SineGen is used inside PulseGen (default False)
+     Note: when flag_for_pulse is True, the first time step of a voiced
+     segment is always sin(np.pi) or cos(0)
+     """
+
+     def __init__(self, samp_rate, harmonic_num=0,
+                  sine_amp=0.1, noise_std=0.003,
+                  voiced_threshold=0,
+                  flag_for_pulse=False):
+         super(SineGen, self).__init__()
+         self.sine_amp = sine_amp
+         self.noise_std = noise_std
+         self.harmonic_num = harmonic_num
+         self.dim = self.harmonic_num + 1
+         self.sampling_rate = samp_rate
+         self.voiced_threshold = voiced_threshold
+         self.flag_for_pulse = flag_for_pulse
+
+     def _f02uv(self, f0):
+         # generate the uv signal
+         uv = torch.ones_like(f0)
+         uv = uv * (f0 > self.voiced_threshold)
+         return uv
+
+     def _f02sine(self, f0_values):
+         """f0_values: (batchsize, length, dim)
+         where dim indicates fundamental tone and overtones
+         """
+         # convert to F0 in rad. The integer part n can be ignored
+         # because 2 * np.pi * n doesn't affect phase
+         rad_values = (f0_values / self.sampling_rate) % 1
+
+         # initial phase noise (no noise for fundamental component)
+         rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2],
+                               device=f0_values.device)
+         rand_ini[:, 0] = 0
+         rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+         # instantaneous phase: sine[t] = sin(2*pi \sum_{i=1}^{t} rad)
+         if not self.flag_for_pulse:
+             # normal case
+
+             # To prevent torch.cumsum numerical overflow,
+             # it is necessary to add -1 whenever \sum_{k=1}^{n} rad_value_k > 1.
+             # Buffer tmp_over_one_idx indicates the time step to add -1.
+             # This does not change the F0 of the sine, because (x-1) * 2*pi = x * 2*pi.
+             tmp_over_one = torch.cumsum(rad_values, 1) % 1
+             tmp_over_one_idx = (tmp_over_one[:, 1:, :] -
+                                 tmp_over_one[:, :-1, :]) < 0
+             cumsum_shift = torch.zeros_like(rad_values)
+             cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+             sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
+                               * 2 * np.pi)
+         else:
+             # If necessary, make sure that the first time step of every
+             # voiced segment is sin(pi) or cos(0).
+             # This is used for pulse-train generation.
+
+             # identify the last time step in unvoiced segments
+             uv = self._f02uv(f0_values)
+             uv_1 = torch.roll(uv, shifts=-1, dims=1)
+             uv_1[:, -1, :] = 1
+             u_loc = (uv < 1) * (uv_1 > 0)
+
+             # get the instantaneous phase
+             tmp_cumsum = torch.cumsum(rad_values, dim=1)
+             # each batch item needs to be processed separately
+             for idx in range(f0_values.shape[0]):
+                 temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                 temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                 # stores the accumulation of instantaneous phase within
+                 # each voiced segment
+                 tmp_cumsum[idx, :, :] = 0
+                 tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+
+             # rad_values - tmp_cumsum: remove the accumulation of instantaneous
+             # phase within the previous voiced segment
+             i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+
+             # get the sines
+             sines = torch.cos(i_phase * 2 * np.pi)
+         return sines
+
+     def forward(self, f0):
+         """sine_tensor, uv = forward(f0)
+         input F0: tensor(batchsize=1, length, dim=1)
+         f0 for unvoiced steps should be 0
+         output sine_tensor: tensor(batchsize=1, length, dim)
+         output uv: tensor(batchsize=1, length, 1)
+         """
+         with torch.no_grad():
+             f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
+                                  device=f0.device)
+             # fundamental component
+             f0_buf[:, :, 0] = f0[:, :, 0]
+             for idx in np.arange(self.harmonic_num):
+                 # idx + 2: the (idx+1)-th overtone, i.e. the (idx+2)-th harmonic
+                 f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
+
+             # generate sine waveforms
+             sine_waves = self._f02sine(f0_buf) * self.sine_amp
+
+             # generate the uv signal
+             uv = self._f02uv(f0)
+
+             # noise: for unvoiced parts it should be similar to sine_amp
+             #        (std = self.sine_amp/3 -> max value ~ self.sine_amp);
+             #        for voiced regions it is self.noise_std
+             noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+             noise = noise_amp * torch.randn_like(sine_waves)
+
+             # first: set the unvoiced part to 0 via uv
+             # then: add the noise
+             sine_waves = sine_waves * uv + noise
+         return sine_waves, uv, noise
+
+
+ class SourceModuleHnNSF(torch.nn.Module):
+     """SourceModule for hn-nsf
+     SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                  add_noise_std=0.003, voiced_threshold=0)
+     sampling_rate: sampling rate in Hz
+     harmonic_num: number of harmonics above F0 (default: 0)
+     sine_amp: amplitude of the sine source signal (default: 0.1)
+     add_noise_std: std of additive Gaussian noise (default: 0.003);
+         note that the amplitude of noise in unvoiced parts is decided by sine_amp
+     voiced_threshold: threshold to set U/V given F0 (default: 0)
+     Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+     F0_sampled (batchsize, length, 1)
+     Sine_source (batchsize, length, 1)
+     noise_source (batchsize, length, 1)
+     uv (batchsize, length, 1)
+     """
+
+     def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
+                  add_noise_std=0.003, voiced_threshold=0):
+         super(SourceModuleHnNSF, self).__init__()
+
+         self.sine_amp = sine_amp
+         self.noise_std = add_noise_std
+
+         # to produce sine waveforms
+         self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
+                                  sine_amp, add_noise_std, voiced_threshold)
+
+         # to merge source harmonics into a single excitation
+         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+         self.l_tanh = torch.nn.Tanh()
+
+     def forward(self, x):
+         """
+         Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+         F0_sampled (batchsize, length, 1)
+         Sine_source (batchsize, length, 1)
+         noise_source (batchsize, length, 1)
+         """
+         # source for the harmonic branch
+         sine_wavs, uv, _ = self.l_sin_gen(x)
+         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+
+         # source for the noise branch, in the same shape as uv
+         noise = torch.randn_like(uv) * self.sine_amp / 3
+         return sine_merge, noise, uv
+
+
+ class Generator(torch.nn.Module):
+     # NSF variant: conditions on F0 through a harmonic-plus-noise source
+     # module. This definition shadows the plain Generator above.
+     def __init__(self, h):
+         super(Generator, self).__init__()
+         self.h = h
+         self.num_kernels = len(h.resblock_kernel_sizes)
+         self.num_upsamples = len(h.upsample_rates)
+         self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h.upsample_rates))
+         self.m_source = SourceModuleHnNSF(
+             sampling_rate=h.sampling_rate,
+             harmonic_num=8)
+         self.noise_convs = nn.ModuleList()
+         self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
+         resblock = ResBlock1 if h.resblock == '1' else ResBlock2
+
+         self.ups = nn.ModuleList()
+         for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+             c_cur = h.upsample_initial_channel // (2 ** (i + 1))
+             self.ups.append(weight_norm(
+                 ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)),
+                                 k, u, padding=(k - u) // 2)))
+             if i + 1 < len(h.upsample_rates):
+                 stride_f0 = np.prod(h.upsample_rates[i + 1:])
+                 self.noise_convs.append(Conv1d(
+                     1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
+             else:
+                 self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+         self.resblocks = nn.ModuleList()
+         for i in range(len(self.ups)):
+             ch = h.upsample_initial_channel // (2 ** (i + 1))
+             for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
+                 self.resblocks.append(resblock(h, ch, k, d))
+
+         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+         self.ups.apply(init_weights)
+         self.conv_post.apply(init_weights)
+
+     def forward(self, x, f0):
+         f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # (B, T) -> (B, T * prod(upsample_rates), 1)
+         har_source, noi_source, uv = self.m_source(f0)
+         har_source = har_source.transpose(1, 2)
+         x = self.conv_pre(x)
+         for i in range(self.num_upsamples):
+             x = F.leaky_relu(x, LRELU_SLOPE)
+             x = self.ups[i](x)
+             x_source = self.noise_convs[i](har_source)
+             x = x + x_source
+             xs = None
+             for j in range(self.num_kernels):
+                 if xs is None:
+                     xs = self.resblocks[i * self.num_kernels + j](x)
+                 else:
+                     xs += self.resblocks[i * self.num_kernels + j](x)
+             x = xs / self.num_kernels
+         x = F.leaky_relu(x)
+         x = self.conv_post(x)
+         x = torch.tanh(x)
+
+         return x
+
+     def remove_weight_norm(self):
+         print('Removing weight norm...')
+         for l in self.ups:
+             remove_weight_norm(l)
+         for l in self.resblocks:
+             l.remove_weight_norm()
+         remove_weight_norm(self.conv_pre)
+         remove_weight_norm(self.conv_post)
+
+
+ class DiscriminatorP(torch.nn.Module):
+     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+         super(DiscriminatorP, self).__init__()
+         self.period = period
+         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+             norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+             norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+             norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+             norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
+         ])
+         self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+     def forward(self, x):
+         fmap = []
+
+         # 1d to 2d
+         b, c, t = x.shape
+         if t % self.period != 0:  # pad first
+             n_pad = self.period - (t % self.period)
+             x = F.pad(x, (0, n_pad), "reflect")
+             t = t + n_pad
+         x = x.view(b, c, t // self.period, self.period)
+
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class MultiPeriodDiscriminator(torch.nn.Module):
+     def __init__(self, periods=None):
+         super(MultiPeriodDiscriminator, self).__init__()
+         self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
+         self.discriminators = nn.ModuleList()
+         for period in self.periods:
+             self.discriminators.append(DiscriminatorP(period))
+
+     def forward(self, y, y_hat):
+         y_d_rs = []
+         y_d_gs = []
+         fmap_rs = []
+         fmap_gs = []
+         for i, d in enumerate(self.discriminators):
+             y_d_r, fmap_r = d(y)
+             y_d_g, fmap_g = d(y_hat)
+             y_d_rs.append(y_d_r)
+             fmap_rs.append(fmap_r)
+             y_d_gs.append(y_d_g)
+             fmap_gs.append(fmap_g)
+
+         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+ class DiscriminatorS(torch.nn.Module):
+     def __init__(self, use_spectral_norm=False):
+         super(DiscriminatorS, self).__init__()
+         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv1d(1, 128, 15, 1, padding=7)),
+             norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
+             norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
+             norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
+             norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
+             norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
+             norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+         ])
+         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+     def forward(self, x):
+         fmap = []
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class MultiScaleDiscriminator(torch.nn.Module):
+     def __init__(self):
+         super(MultiScaleDiscriminator, self).__init__()
+         self.discriminators = nn.ModuleList([
+             DiscriminatorS(use_spectral_norm=True),
+             DiscriminatorS(),
+             DiscriminatorS(),
+         ])
+         self.meanpools = nn.ModuleList([
+             AvgPool1d(4, 2, padding=2),
+             AvgPool1d(4, 2, padding=2)
+         ])
+
+     def forward(self, y, y_hat):
+         y_d_rs = []
+         y_d_gs = []
+         fmap_rs = []
+         fmap_gs = []
+         for i, d in enumerate(self.discriminators):
+             if i != 0:
+                 y = self.meanpools[i - 1](y)
+                 y_hat = self.meanpools[i - 1](y_hat)
+             y_d_r, fmap_r = d(y)
+             y_d_g, fmap_g = d(y_hat)
+             y_d_rs.append(y_d_r)
+             fmap_rs.append(fmap_r)
+             y_d_gs.append(y_d_g)
+             fmap_gs.append(fmap_g)
+
+         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+ def feature_loss(fmap_r, fmap_g):
+     loss = 0
+     for dr, dg in zip(fmap_r, fmap_g):
+         for rl, gl in zip(dr, dg):
+             loss += torch.mean(torch.abs(rl - gl))
+
+     return loss * 2
+
+
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+     loss = 0
+     r_losses = []
+     g_losses = []
+     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+         r_loss = torch.mean((1 - dr) ** 2)
+         g_loss = torch.mean(dg ** 2)
+         loss += (r_loss + g_loss)
+         r_losses.append(r_loss.item())
+         g_losses.append(g_loss.item())
+
+     return loss, r_losses, g_losses
+
+
+ def generator_loss(disc_outputs):
+     loss = 0
+     gen_losses = []
+     for dg in disc_outputs:
+         l = torch.mean((1 - dg) ** 2)
+         gen_losses.append(l)
+         loss += l
+
+     return loss, gen_losses
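A hypothetical inference sketch for the NSF generator above (the checkpoint path, mel length, and F0 value are assumptions; config.json must sit next to the checkpoint and provide the h.* keys read above):

import torch
from modules.nsf_hifigan.models import load_model

generator, h = load_model('checkpoints/nsf_hifigan/model', device='cpu')
mel = torch.randn(1, h.num_mels, 100)    # (B, num_mels, frames)
f0 = torch.full((1, 100), 220.0)         # (B, frames) in Hz; 0 marks unvoiced frames
with torch.no_grad():
    wav = generator(mel, f0)             # (B, 1, frames * prod(h.upsample_rates))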
modules/nsf_hifigan/nvSTFT.py ADDED
@@ -0,0 +1,111 @@
+ import math
+ import os
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
+ import random
+ import torch
+ import torch.utils.data
+ import numpy as np
+ import librosa
+ from librosa.util import normalize
+ from librosa.filters import mel as librosa_mel_fn
+ from scipy.io.wavfile import read
+ import soundfile as sf
+
+
+ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
+     sampling_rate = None
+     try:
+         data, sampling_rate = sf.read(full_path, always_2d=True)
+     except Exception as ex:
+         print(f"'{full_path}' failed to load.\nException:")
+         print(ex)
+         if return_empty_on_exception:
+             return [], sampling_rate or target_sr or 48000
+         else:
+             raise Exception(ex)
+
+     if len(data.shape) > 1:
+         data = data[:, 0]
+     # guard against audio files of <= 2 samples (otherwise the slice above hits the wrong dimension)
+     assert len(data) > 2
+
+     if np.issubdtype(data.dtype, np.integer):  # integer audio data
+         max_mag = -np.iinfo(data.dtype).min  # maximum magnitude = min possible value of intXX
+     else:  # float32 audio data
+         max_mag = max(np.amax(data), -np.amin(data))
+         # data should be either 16-bit int, 32-bit int or [-1, 1] float32
+         max_mag = (2 ** 31) + 1 if max_mag > (2 ** 15) else ((2 ** 15) + 1 if max_mag > 1.01 else 1.0)
+
+     data = torch.FloatTensor(data.astype(np.float32)) / max_mag
+
+     # resample crashes on inf/NaN inputs; optionally return an empty array instead of raising
+     if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
+         return [], sampling_rate or target_sr or 48000
+     if target_sr is not None and sampling_rate != target_sr:
+         data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
+         sampling_rate = target_sr
+
+     return data, sampling_rate
+
+
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
+     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+
+
+ def dynamic_range_decompression(x, C=1):
+     return np.exp(x) / C
+
+
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+     return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+ def dynamic_range_decompression_torch(x, C=1):
+     return torch.exp(x) / C
+
+
+ class STFT():
+     def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
+         self.target_sr = sr
+
+         self.n_mels = n_mels
+         self.n_fft = n_fft
+         self.win_size = win_size
+         self.hop_length = hop_length
+         self.fmin = fmin
+         self.fmax = fmax
+         self.clip_val = clip_val
+         self.mel_basis = {}
+         self.hann_window = {}
+
+     def get_mel(self, y, center=False):
+         sampling_rate = self.target_sr
+         n_mels = self.n_mels
+         n_fft = self.n_fft
+         win_size = self.win_size
+         hop_length = self.hop_length
+         fmin = self.fmin
+         fmax = self.fmax
+         clip_val = self.clip_val
+
+         if torch.min(y) < -1.:
+             print('min value is ', torch.min(y))
+         if torch.max(y) > 1.:
+             print('max value is ', torch.max(y))
+
+         key = str(fmax) + '_' + str(y.device)
+         if key not in self.mel_basis:  # the original checked the bare `fmax`, which never matches the composed key
+             mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
+             self.mel_basis[key] = torch.from_numpy(mel).float().to(y.device)
+             self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device)
+
+         y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)),
+                                     mode='reflect')
+         y = y.squeeze(1)
+
+         spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)],
+                           center=center, pad_mode='reflect', normalized=False, onesided=True)
+         spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+         spec = torch.matmul(self.mel_basis[key], spec)
+         spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
+         return spec
+
+     def __call__(self, audiopath):
+         audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
+         spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
+         return spect
+
+
+ stft = STFT()
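A usage sketch for this STFT front end (the wav path and the 44.1 kHz parameter values are assumptions):

stft_44k = STFT(sr=44100, n_mels=128, n_fft=2048, win_size=2048,
                hop_length=512, fmin=40, fmax=16000)
mel = stft_44k('example.wav')   # log-compressed mel spectrogram, shape (n_mels, frames)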
modules/nsf_hifigan/utils.py ADDED
@@ -0,0 +1,67 @@
+ import glob
+ import os
+ import matplotlib
+ import torch
+ from torch.nn.utils import weight_norm
+ matplotlib.use("Agg")
+ import matplotlib.pylab as plt
+
+
+ def plot_spectrogram(spectrogram):
+     fig, ax = plt.subplots(figsize=(10, 2))
+     im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+                    interpolation='none')
+     plt.colorbar(im, ax=ax)
+
+     fig.canvas.draw()
+     plt.close()
+
+     return fig
+
+
+ def init_weights(m, mean=0.0, std=0.01):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         m.weight.data.normal_(mean, std)
+
+
+ def apply_weight_norm(m):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         weight_norm(m)
+
+
+ def get_padding(kernel_size, dilation=1):
+     return int((kernel_size * dilation - dilation) / 2)
+
+
+ def load_checkpoint(filepath, device):
+     assert os.path.isfile(filepath)
+     print("Loading '{}'".format(filepath))
+     checkpoint_dict = torch.load(filepath, map_location=device)
+     print("Complete.")
+     return checkpoint_dict
+
+
+ def save_checkpoint(filepath, obj):
+     print("Saving checkpoint to {}".format(filepath))
+     torch.save(obj, filepath)
+     print("Complete.")
+
+
+ def del_old_checkpoints(cp_dir, prefix, n_models=2):
+     pattern = os.path.join(cp_dir, prefix + '????????')
+     cp_list = glob.glob(pattern)  # get checkpoint paths
+     cp_list = sorted(cp_list)  # sort by iteration
+     if len(cp_list) > n_models:  # if more than n_models checkpoints are found
+         for cp in cp_list[:-n_models]:  # delete all but the latest n_models
+             open(cp, 'w').close()  # empty the file contents
+             os.unlink(cp)  # delete the file (moves to trash when using Colab)
+
+
+ def scan_checkpoint(cp_dir, prefix):
+     pattern = os.path.join(cp_dir, prefix + '????????')
+     cp_list = glob.glob(pattern)
+     if len(cp_list) == 0:
+         return None
+     return sorted(cp_list)[-1]
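A small sketch of the checkpoint helpers (the directory and prefix are assumptions; scan_checkpoint matches prefix plus eight characters, e.g. a hypothetical g_00100000):

ckpt_path = scan_checkpoint('checkpoints/nsf_hifigan', 'g_')
if ckpt_path is not None:
    state = load_checkpoint(ckpt_path, 'cpu')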
modules/parallel_wavegan/__init__.py ADDED
File without changes
modules/parallel_wavegan/layers/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .causal_conv import *  # NOQA
+ from .pqmf import *  # NOQA
+ from .residual_block import *  # NOQA
+ from modules.parallel_wavegan.layers.residual_stack import *  # NOQA
+ from .upsample import *  # NOQA
modules/parallel_wavegan/layers/causal_conv.py ADDED
@@ -0,0 +1,56 @@
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2020 Tomoki Hayashi
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ """Causal convolution layer modules."""
+
+
+ import torch
+
+
+ class CausalConv1d(torch.nn.Module):
+     """CausalConv1d module with customized initialization."""
+
+     def __init__(self, in_channels, out_channels, kernel_size,
+                  dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}):
+         """Initialize CausalConv1d module."""
+         super(CausalConv1d, self).__init__()
+         self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params)
+         self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size,
+                                     dilation=dilation, bias=bias)
+
+     def forward(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input tensor (B, in_channels, T).
+
+         Returns:
+             Tensor: Output tensor (B, out_channels, T).
+
+         """
+         return self.conv(self.pad(x))[:, :, :x.size(2)]
+
+
+ class CausalConvTranspose1d(torch.nn.Module):
+     """CausalConvTranspose1d module with customized initialization."""
+
+     def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True):
+         """Initialize CausalConvTranspose1d module."""
+         super(CausalConvTranspose1d, self).__init__()
+         self.deconv = torch.nn.ConvTranspose1d(
+             in_channels, out_channels, kernel_size, stride, bias=bias)
+         self.stride = stride
+
+     def forward(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input tensor (B, in_channels, T_in).
+
+         Returns:
+             Tensor: Output tensor (B, out_channels, T_out).
+
+         """
+         return self.deconv(x)[:, :, :-self.stride]
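A quick sanity sketch: because CausalConv1d pads by (kernel_size - 1) * dilation and then trims to the input length, the output length equals the input length and no output sample depends on future inputs (shapes assumed):

import torch

conv = CausalConv1d(1, 1, kernel_size=3, dilation=2)
x = torch.randn(1, 1, 16)
y = conv(x)
assert y.shape == x.shape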
modules/parallel_wavegan/layers/pqmf.py ADDED
@@ -0,0 +1,129 @@
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2020 Tomoki Hayashi
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ """Pseudo QMF modules."""
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+
+ # `kaiser` moved out of scipy.signal in newer SciPy releases
+ from scipy.signal.windows import kaiser
+
+
+ def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0):
+     """Design prototype filter for PQMF.
+
+     This method is based on `A Kaiser window approach for the design of prototype
+     filters of cosine modulated filterbanks`_.
+
+     Args:
+         taps (int): The number of filter taps.
+         cutoff_ratio (float): Cut-off frequency ratio.
+         beta (float): Beta coefficient for the Kaiser window.
+
+     Returns:
+         ndarray: Impulse response of the prototype filter (taps + 1,).
+
+     .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
+         https://ieeexplore.ieee.org/abstract/document/681427
+
+     """
+     # check that the arguments are valid
+     assert taps % 2 == 0, "The number of taps must be an even number."
+     assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
+
+     # make the initial filter
+     omega_c = np.pi * cutoff_ratio
+     with np.errstate(invalid='ignore'):
+         h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \
+             / (np.pi * (np.arange(taps + 1) - 0.5 * taps))
+     h_i[taps // 2] = np.cos(0) * cutoff_ratio  # fix nan due to indeterminate form
+
+     # apply the Kaiser window
+     w = kaiser(taps + 1, beta)
+     h = h_i * w
+
+     return h
+
+
+ class PQMF(torch.nn.Module):
+     """PQMF module.
+
+     This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
+
+     .. _`Near-perfect-reconstruction pseudo-QMF banks`:
+         https://ieeexplore.ieee.org/document/258122
+
+     """
+
+     def __init__(self, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0):
+         """Initialize PQMF module.
+
+         Args:
+             subbands (int): The number of subbands.
+             taps (int): The number of filter taps.
+             cutoff_ratio (float): Cut-off frequency ratio.
+             beta (float): Beta coefficient for the Kaiser window.
+
+         """
+         super(PQMF, self).__init__()
+
+         # define filter coefficients
+         h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
+         h_analysis = np.zeros((subbands, len(h_proto)))
+         h_synthesis = np.zeros((subbands, len(h_proto)))
+         for k in range(subbands):
+             h_analysis[k] = 2 * h_proto * np.cos(
+                 (2 * k + 1) * (np.pi / (2 * subbands)) *
+                 (np.arange(taps + 1) - ((taps - 1) / 2)) +
+                 (-1) ** k * np.pi / 4)
+             h_synthesis[k] = 2 * h_proto * np.cos(
+                 (2 * k + 1) * (np.pi / (2 * subbands)) *
+                 (np.arange(taps + 1) - ((taps - 1) / 2)) -
+                 (-1) ** k * np.pi / 4)
+
+         # convert to tensors
+         analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1)
+         synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0)
+
+         # register coefficients as buffers
+         self.register_buffer("analysis_filter", analysis_filter)
+         self.register_buffer("synthesis_filter", synthesis_filter)
+
+         # filter for downsampling & upsampling
+         updown_filter = torch.zeros((subbands, subbands, subbands)).float()
+         for k in range(subbands):
+             updown_filter[k, k, 0] = 1.0
+         self.register_buffer("updown_filter", updown_filter)
+         self.subbands = subbands
+
+         # keep padding info
+         self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
+
+     def analysis(self, x):
+         """Analysis with PQMF.
+
+         Args:
+             x (Tensor): Input tensor (B, 1, T).
+
+         Returns:
+             Tensor: Output tensor (B, subbands, T // subbands).
+
+         """
+         x = F.conv1d(self.pad_fn(x), self.analysis_filter)
+         return F.conv1d(x, self.updown_filter, stride=self.subbands)
+
+     def synthesis(self, x):
+         """Synthesis with PQMF.
+
+         Args:
+             x (Tensor): Input tensor (B, subbands, T // subbands).
+
+         Returns:
+             Tensor: Output tensor (B, 1, T).
+
+         """
+         x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands)
+         return F.conv1d(self.pad_fn(x), self.synthesis_filter)
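A round-trip sketch for PQMF (the batch shape is an assumption): analysis splits a waveform into subbands at 1/subbands of the rate, and synthesis approximately inverts it:

import torch

pqmf = PQMF(subbands=4)
x = torch.randn(1, 1, 16000)
bands = pqmf.analysis(x)       # (1, 4, 4000)
x_hat = pqmf.synthesis(bands)  # (1, 1, 16000), near-perfect reconstruction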
modules/parallel_wavegan/layers/residual_block.py ADDED
@@ -0,0 +1,129 @@
+ # -*- coding: utf-8 -*-
+
+ """Residual block module in WaveNet.
+
+ This code is modified from https://github.com/r9y9/wavenet_vocoder.
+
+ """
+
+ import math
+
+ import torch
+ import torch.nn.functional as F
+
+
+ class Conv1d(torch.nn.Conv1d):
+     """Conv1d module with customized initialization."""
+
+     def __init__(self, *args, **kwargs):
+         """Initialize Conv1d module."""
+         super(Conv1d, self).__init__(*args, **kwargs)
+
+     def reset_parameters(self):
+         """Reset parameters."""
+         torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
+         if self.bias is not None:
+             torch.nn.init.constant_(self.bias, 0.0)
+
+
+ class Conv1d1x1(Conv1d):
+     """1x1 Conv1d with customized initialization."""
+
+     def __init__(self, in_channels, out_channels, bias):
+         """Initialize 1x1 Conv1d module."""
+         super(Conv1d1x1, self).__init__(in_channels, out_channels,
+                                         kernel_size=1, padding=0,
+                                         dilation=1, bias=bias)
+
+
+ class ResidualBlock(torch.nn.Module):
+     """Residual block module in WaveNet."""
+
+     def __init__(self,
+                  kernel_size=3,
+                  residual_channels=64,
+                  gate_channels=128,
+                  skip_channels=64,
+                  aux_channels=80,
+                  dropout=0.0,
+                  dilation=1,
+                  bias=True,
+                  use_causal_conv=False
+                  ):
+         """Initialize ResidualBlock module.
+
+         Args:
+             kernel_size (int): Kernel size of the dilated convolution layer.
+             residual_channels (int): Number of channels for the residual connection.
+             gate_channels (int): Number of channels for the gated activation.
+             skip_channels (int): Number of channels for the skip connection.
+             aux_channels (int): Local conditioning channels, i.e. auxiliary input dimension.
+             dropout (float): Dropout probability.
+             dilation (int): Dilation factor.
+             bias (bool): Whether to add a bias parameter in convolution layers.
+             use_causal_conv (bool): Whether to use causal (rather than non-causal) convolution.
+
+         """
+         super(ResidualBlock, self).__init__()
+         self.dropout = dropout
+         # no future time stamps available
+         if use_causal_conv:
+             padding = (kernel_size - 1) * dilation
+         else:
+             assert (kernel_size - 1) % 2 == 0, "Even kernel sizes are not supported."
+             padding = (kernel_size - 1) // 2 * dilation
+         self.use_causal_conv = use_causal_conv
+
+         # dilated conv
+         self.conv = Conv1d(residual_channels, gate_channels, kernel_size,
+                            padding=padding, dilation=dilation, bias=bias)
+
+         # local conditioning
+         if aux_channels > 0:
+             self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False)
+         else:
+             self.conv1x1_aux = None
+
+         # conv output is split into two groups
+         gate_out_channels = gate_channels // 2
+         self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias)
+         self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias)
+
+     def forward(self, x, c):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input tensor (B, residual_channels, T).
+             c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T).
+
+         Returns:
+             Tensor: Output tensor for residual connection (B, residual_channels, T).
+             Tensor: Output tensor for skip connection (B, skip_channels, T).
+
+         """
+         residual = x
+         x = F.dropout(x, p=self.dropout, training=self.training)
+         x = self.conv(x)
+
+         # remove future time steps if using causal convolution
+         x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x
+
+         # split into two parts for the gated activation
+         splitdim = 1
+         xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim)
+
+         # local conditioning
+         if c is not None:
+             assert self.conv1x1_aux is not None
+             c = self.conv1x1_aux(c)
+             ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim)
+             xa, xb = xa + ca, xb + cb
+
+         x = torch.tanh(xa) * torch.sigmoid(xb)
+
+         # for the skip connection
+         s = self.conv1x1_skip(x)
+
+         # for the residual connection
+         x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5)
+
+         return x, s
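A forward-shape sketch for ResidualBlock (shapes assumed; c is the local conditioning, e.g. an upsampled mel):

import torch

block = ResidualBlock(residual_channels=64, gate_channels=128,
                      skip_channels=64, aux_channels=80)
x = torch.randn(1, 64, 100)
c = torch.randn(1, 80, 100)
res, skip = block(x, c)   # res: (1, 64, 100), skip: (1, 64, 100)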
modules/parallel_wavegan/layers/residual_stack.py ADDED
@@ -0,0 +1,75 @@
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2020 Tomoki Hayashi
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ """Residual stack module in MelGAN."""
+
+ import torch
+
+ from . import CausalConv1d
+
+
+ class ResidualStack(torch.nn.Module):
+     """Residual stack module introduced in MelGAN."""
+
+     def __init__(self,
+                  kernel_size=3,
+                  channels=32,
+                  dilation=1,
+                  bias=True,
+                  nonlinear_activation="LeakyReLU",
+                  nonlinear_activation_params={"negative_slope": 0.2},
+                  pad="ReflectionPad1d",
+                  pad_params={},
+                  use_causal_conv=False,
+                  ):
+         """Initialize ResidualStack module.
+
+         Args:
+             kernel_size (int): Kernel size of the dilated convolution layer.
+             channels (int): Number of channels of convolution layers.
+             dilation (int): Dilation factor.
+             bias (bool): Whether to add a bias parameter in convolution layers.
+             nonlinear_activation (str): Activation function module name.
+             nonlinear_activation_params (dict): Hyperparameters for the activation function.
+             pad (str): Padding function module name before the dilated convolution layer.
+             pad_params (dict): Hyperparameters for the padding function.
+             use_causal_conv (bool): Whether to use causal convolution.
+
+         """
+         super(ResidualStack, self).__init__()
+
+         # define the residual stack part
+         if not use_causal_conv:
+             assert (kernel_size - 1) % 2 == 0, "Even kernel sizes are not supported."
+             self.stack = torch.nn.Sequential(
+                 getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                 getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params),
+                 torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias),
+                 getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                 torch.nn.Conv1d(channels, channels, 1, bias=bias),
+             )
+         else:
+             self.stack = torch.nn.Sequential(
+                 getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                 CausalConv1d(channels, channels, kernel_size, dilation=dilation,
+                              bias=bias, pad=pad, pad_params=pad_params),
+                 getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                 torch.nn.Conv1d(channels, channels, 1, bias=bias),
+             )
+
+         # define the extra layer for the skip connection
+         self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias)
+
+     def forward(self, c):
+         """Calculate forward propagation.
+
+         Args:
+             c (Tensor): Input tensor (B, channels, T).
+
+         Returns:
+             Tensor: Output tensor (B, channels, T).
+
+         """
+         return self.stack(c) + self.skip_layer(c)
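A forward-shape sketch for ResidualStack (shapes assumed):

import torch

stack = ResidualStack(kernel_size=3, channels=32, dilation=2)
c = torch.randn(1, 32, 100)
out = stack(c)   # (1, 32, 100): the dilated branch plus the 1x1 skip branch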
modules/parallel_wavegan/layers/tf_layers.py ADDED
@@ -0,0 +1,129 @@
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2020 MINH ANH (@dathudeptrai)
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ """Tensorflow layer modules compatible with PyTorch."""
+
+ import tensorflow as tf
+
+
+ class TFReflectionPad1d(tf.keras.layers.Layer):
+     """Tensorflow ReflectionPad1d module."""
+
+     def __init__(self, padding_size):
+         """Initialize TFReflectionPad1d module.
+
+         Args:
+             padding_size (int): Padding size.
+
+         """
+         super(TFReflectionPad1d, self).__init__()
+         self.padding_size = padding_size
+
+     @tf.function
+     def call(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input tensor (B, T, 1, C).
+
+         Returns:
+             Tensor: Padded tensor (B, T + 2 * padding_size, 1, C).
+
+         """
+         return tf.pad(x, [[0, 0], [self.padding_size, self.padding_size], [0, 0], [0, 0]], "REFLECT")
+
+
+ class TFConvTranspose1d(tf.keras.layers.Layer):
+     """Tensorflow ConvTranspose1d module."""
+
+     def __init__(self, channels, kernel_size, stride, padding):
+         """Initialize TFConvTranspose1d module.
+
+         Args:
+             channels (int): Number of channels.
+             kernel_size (int): Kernel size.
+             stride (int): Stride width.
+             padding (str): Padding type ("same" or "valid").
+
+         """
+         super(TFConvTranspose1d, self).__init__()
+         self.conv1d_transpose = tf.keras.layers.Conv2DTranspose(
+             filters=channels,
+             kernel_size=(kernel_size, 1),
+             strides=(stride, 1),
+             padding=padding,
+         )
+
+     @tf.function
+     def call(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input tensor (B, T, 1, C).
+
+         Returns:
+             Tensor: Output tensor (B, T', 1, C').
+
+         """
+         x = self.conv1d_transpose(x)
+         return x
+
+
+ class TFResidualStack(tf.keras.layers.Layer):
+     """Tensorflow ResidualStack module."""
+
+     def __init__(self,
+                  kernel_size,
+                  channels,
+                  dilation,
+                  bias,
+                  nonlinear_activation,
+                  nonlinear_activation_params,
+                  padding,
+                  ):
+         """Initialize TFResidualStack module.
+
+         Args:
+             kernel_size (int): Kernel size.
+             channels (int): Number of channels.
+             dilation (int): Dilation rate.
+             bias (bool): Whether to add a bias parameter in convolution layers.
+             nonlinear_activation (str): Activation function module name.
+             nonlinear_activation_params (dict): Hyperparameters for the activation function.
+             padding (str): Padding type ("same" or "valid").
+
+         """
+         super(TFResidualStack, self).__init__()
+         self.block = [
+             getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params),
+             TFReflectionPad1d(dilation),
+             tf.keras.layers.Conv2D(
+                 filters=channels,
+                 kernel_size=(kernel_size, 1),
+                 dilation_rate=(dilation, 1),
+                 use_bias=bias,
+                 padding="valid",
+             ),
+             getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params),
+             tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias)
+         ]
+         self.shortcut = tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias)
+
+     @tf.function
+     def call(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input tensor (B, T, 1, C).
+
+         Returns:
+             Tensor: Output tensor (B, T, 1, C).
+
+         """
+         _x = tf.identity(x)
+         for layer in self.block:
+             _x = layer(_x)
+         shortcut = self.shortcut(x)
+         return shortcut + _x
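A shape sketch for the (B, T, 1, C) layout these TF layers share (assumes tensorflow is installed; values are arbitrary):

import tensorflow as tf

pad = TFReflectionPad1d(padding_size=3)
x = tf.random.normal([2, 100, 1, 32])
y = pad(x)   # (2, 106, 1, 32)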
modules/parallel_wavegan/layers/upsample.py ADDED
@@ -0,0 +1,183 @@
+ # -*- coding: utf-8 -*-
+
+ """Upsampling module.
+
+ This code is modified from https://github.com/r9y9/wavenet_vocoder.
+
+ """
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+
+ from . import Conv1d
+
+
+ class Stretch2d(torch.nn.Module):
+     """Stretch2d module."""
+
+     def __init__(self, x_scale, y_scale, mode="nearest"):
+         """Initialize Stretch2d module.
+
+         Args:
+             x_scale (int): X scaling factor (time axis in spectrogram).
+             y_scale (int): Y scaling factor (frequency axis in spectrogram).
+             mode (str): Interpolation mode.
+
+         """
+         super(Stretch2d, self).__init__()
+         self.x_scale = x_scale
+         self.y_scale = y_scale
+         self.mode = mode
+
+     def forward(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input tensor (B, C, F, T).
+
+         Returns:
+             Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale).
+
+         """
+         return F.interpolate(
+             x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode)
+
+
+ class Conv2d(torch.nn.Conv2d):
+     """Conv2d module with customized initialization."""
+
+     def __init__(self, *args, **kwargs):
+         """Initialize Conv2d module."""
+         super(Conv2d, self).__init__(*args, **kwargs)
+
+     def reset_parameters(self):
+         """Reset parameters."""
+         self.weight.data.fill_(1. / np.prod(self.kernel_size))
+         if self.bias is not None:
+             torch.nn.init.constant_(self.bias, 0.0)
+
+
+ class UpsampleNetwork(torch.nn.Module):
+     """Upsampling network module."""
+
+     def __init__(self,
+                  upsample_scales,
+                  nonlinear_activation=None,
+                  nonlinear_activation_params={},
+                  interpolate_mode="nearest",
+                  freq_axis_kernel_size=1,
+                  use_causal_conv=False,
+                  ):
+         """Initialize upsampling network module.
+
+         Args:
+             upsample_scales (list): List of upsampling scales.
+             nonlinear_activation (str): Activation function name.
+             nonlinear_activation_params (dict): Arguments for the specified activation function.
+             interpolate_mode (str): Interpolation mode.
+             freq_axis_kernel_size (int): Kernel size in the direction of the frequency axis.
+             use_causal_conv (bool): Whether to use causal structure.
+
+         """
+         super(UpsampleNetwork, self).__init__()
+         self.use_causal_conv = use_causal_conv
+         self.up_layers = torch.nn.ModuleList()
+         for scale in upsample_scales:
+             # interpolation layer
+             stretch = Stretch2d(scale, 1, interpolate_mode)
+             self.up_layers += [stretch]
+
+             # conv layer
+             assert (freq_axis_kernel_size - 1) % 2 == 0, "Even freq-axis kernel sizes are not supported."
+             freq_axis_padding = (freq_axis_kernel_size - 1) // 2
+             kernel_size = (freq_axis_kernel_size, scale * 2 + 1)
+             if use_causal_conv:
+                 padding = (freq_axis_padding, scale * 2)
+             else:
+                 padding = (freq_axis_padding, scale)
+             conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False)
+             self.up_layers += [conv]
+
+             # nonlinearity
+             if nonlinear_activation is not None:
+                 nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)
+                 self.up_layers += [nonlinear]
+
+     def forward(self, c):
+         """Calculate forward propagation.
+
+         Args:
+             c (Tensor): Input tensor (B, C, T).
+
+         Returns:
+             Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales).
+
+         """
+         c = c.unsqueeze(1)  # (B, 1, C, T)
+         for f in self.up_layers:
+             if self.use_causal_conv and isinstance(f, Conv2d):
+                 c = f(c)[..., :c.size(-1)]
+             else:
+                 c = f(c)
+         return c.squeeze(1)  # (B, C, T')
+
+
+ class ConvInUpsampleNetwork(torch.nn.Module):
+     """Convolution + upsampling network module."""
+
+     def __init__(self,
+                  upsample_scales,
+                  nonlinear_activation=None,
+                  nonlinear_activation_params={},
+                  interpolate_mode="nearest",
+                  freq_axis_kernel_size=1,
+                  aux_channels=80,
+                  aux_context_window=0,
+                  use_causal_conv=False
+                  ):
+         """Initialize convolution + upsampling network module.
+
+         Args:
+             upsample_scales (list): List of upsampling scales.
+             nonlinear_activation (str): Activation function name.
+             nonlinear_activation_params (dict): Arguments for the specified activation function.
+             interpolate_mode (str): Interpolation mode.
+             freq_axis_kernel_size (int): Kernel size in the direction of the frequency axis.
+             aux_channels (int): Number of channels of the pre-convolutional layer.
+             aux_context_window (int): Context window size of the pre-convolutional layer.
+             use_causal_conv (bool): Whether to use causal structure.
+
+         """
+         super(ConvInUpsampleNetwork, self).__init__()
+         self.aux_context_window = aux_context_window
+         self.use_causal_conv = use_causal_conv and aux_context_window > 0
+         # To capture wide-context information in conditional features
+         kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
+ kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
156
+ # NOTE(kan-bayashi): Here do not use padding because the input is already padded
157
+ self.conv_in = Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False)
158
+ self.upsample = UpsampleNetwork(
159
+ upsample_scales=upsample_scales,
160
+ nonlinear_activation=nonlinear_activation,
161
+ nonlinear_activation_params=nonlinear_activation_params,
162
+ interpolate_mode=interpolate_mode,
163
+ freq_axis_kernel_size=freq_axis_kernel_size,
164
+ use_causal_conv=use_causal_conv,
165
+ )
166
+
167
+ def forward(self, c):
168
+ """Calculate forward propagation.
169
+
170
+ Args:
171
+ c : Input tensor (B, C, T').
172
+
173
+ Returns:
174
+ Tensor: Upsampled tensor (B, C, T),
175
+ where T = (T' - aux_context_window * 2) * prod(upsample_scales).
176
+
177
+ Note:
178
+ The length of inputs considers the context window size.
179
+
180
+ """
181
+ c_ = self.conv_in(c)
182
+ c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_
183
+ return self.upsample(c)
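Since each Stretch2d multiplies only the time axis by its scale and every Conv2d is padded to preserve length, the whole stack stretches T by prod(upsample_scales); for a vocoder this product must match the hop size of the mel extraction. A shape-only sketch, with illustrative scales:

import numpy as np
import torch

from modules.parallel_wavegan.layers.upsample import UpsampleNetwork

scales = [4, 4, 4, 4]  # illustrative; prod(scales) = 256 should equal hop_size
net = UpsampleNetwork(upsample_scales=scales)
c = torch.randn(1, 80, 50)  # (B, C, T): 50 mel frames
out = net(c)
assert out.shape == (1, 80, 50 * int(np.prod(scales)))  # (B, C, T * 256)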
modules/parallel_wavegan/losses/__init__.py ADDED
@@ -0,0 +1 @@
+ from .stft_loss import *  # NOQA
modules/parallel_wavegan/losses/stft_loss.py ADDED
@@ -0,0 +1,153 @@
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2019 Tomoki Hayashi
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ """STFT-based Loss modules."""
+
+ import torch
+ import torch.nn.functional as F
+
+
+ def stft(x, fft_size, hop_size, win_length, window):
+     """Perform STFT and convert to magnitude spectrogram.
+
+     Args:
+         x (Tensor): Input signal tensor (B, T).
+         fft_size (int): FFT size.
+         hop_size (int): Hop size.
+         win_length (int): Window length.
+         window (str): Window function type.
+
+     Returns:
+         Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+
+     """
+     x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
+     real = x_stft[..., 0]
+     imag = x_stft[..., 1]
+
+     # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
+     return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
+
+
+ class SpectralConvergengeLoss(torch.nn.Module):
+     """Spectral convergence loss module."""
+
+     def __init__(self):
+         """Initialize spectral convergence loss module."""
+         super(SpectralConvergengeLoss, self).__init__()
+
+     def forward(self, x_mag, y_mag):
+         """Calculate forward propagation.
+
+         Args:
+             x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+             y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+
+         Returns:
+             Tensor: Spectral convergence loss value.
+
+         """
+         return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro")
+
+
+ class LogSTFTMagnitudeLoss(torch.nn.Module):
+     """Log STFT magnitude loss module."""
+
+     def __init__(self):
+         """Initialize log STFT magnitude loss module."""
+         super(LogSTFTMagnitudeLoss, self).__init__()
+
+     def forward(self, x_mag, y_mag):
+         """Calculate forward propagation.
+
+         Args:
+             x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+             y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+
+         Returns:
+             Tensor: Log STFT magnitude loss value.
+
+         """
+         return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
+
+
+ class STFTLoss(torch.nn.Module):
+     """STFT loss module."""
+
+     def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
+         """Initialize STFT loss module."""
+         super(STFTLoss, self).__init__()
+         self.fft_size = fft_size
+         self.shift_size = shift_size
+         self.win_length = win_length
+         self.window = getattr(torch, window)(win_length)
+         self.spectral_convergenge_loss = SpectralConvergengeLoss()
+         self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
+
+     def forward(self, x, y):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Predicted signal (B, T).
+             y (Tensor): Groundtruth signal (B, T).
+
+         Returns:
+             Tensor: Spectral convergence loss value.
+             Tensor: Log STFT magnitude loss value.
+
+         """
+         x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
+         y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
+         sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
+         mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
+
+         return sc_loss, mag_loss
+
+
+ class MultiResolutionSTFTLoss(torch.nn.Module):
+     """Multi resolution STFT loss module."""
+
+     def __init__(self,
+                  fft_sizes=[1024, 2048, 512],
+                  hop_sizes=[120, 240, 50],
+                  win_lengths=[600, 1200, 240],
+                  window="hann_window"):
+         """Initialize Multi resolution STFT loss module.
+
+         Args:
+             fft_sizes (list): List of FFT sizes.
+             hop_sizes (list): List of hop sizes.
+             win_lengths (list): List of window lengths.
+             window (str): Window function type.
+
+         """
+         super(MultiResolutionSTFTLoss, self).__init__()
+         assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
+         self.stft_losses = torch.nn.ModuleList()
+         for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
+             self.stft_losses += [STFTLoss(fs, ss, wl, window)]
+
+     def forward(self, x, y):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Predicted signal (B, T).
+             y (Tensor): Groundtruth signal (B, T).
+
+         Returns:
+             Tensor: Multi resolution spectral convergence loss value.
+             Tensor: Multi resolution log STFT magnitude loss value.
+
+         """
+         sc_loss = 0.0
+         mag_loss = 0.0
+         for f in self.stft_losses:
+             sc_l, mag_l = f(x, y)
+             sc_loss += sc_l
+             mag_loss += mag_l
+         sc_loss /= len(self.stft_losses)
+         mag_loss /= len(self.stft_losses)
+
+         return sc_loss, mag_loss
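In training, the two returned terms are usually just summed (optionally weighted) into the auxiliary loss. A minimal sketch of the call pattern; note that the stft() helper above uses the positional torch.stft signature, so this assumes a PyTorch version where torch.stft still returns stacked real/imaginary parts rather than requiring return_complex:

import torch

from modules.parallel_wavegan.losses.stft_loss import MultiResolutionSTFTLoss

criterion = MultiResolutionSTFTLoss()  # defaults: FFT sizes 1024/2048/512
y_hat = torch.randn(2, 16000, requires_grad=True)  # predicted waveform (B, T)
y = torch.randn(2, 16000)                          # ground-truth waveform (B, T)
sc_loss, mag_loss = criterion(y_hat, y)            # each averaged over 3 resolutions
loss = sc_loss + mag_loss
loss.backward()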
modules/parallel_wavegan/models/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .melgan import *  # NOQA
+ from .parallel_wavegan import *  # NOQA
modules/parallel_wavegan/models/melgan.py ADDED
@@ -0,0 +1,427 @@
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2020 Tomoki Hayashi
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ """MelGAN Modules."""
+
+ import logging
+
+ import numpy as np
+ import torch
+
+ from modules.parallel_wavegan.layers import CausalConv1d
+ from modules.parallel_wavegan.layers import CausalConvTranspose1d
+ from modules.parallel_wavegan.layers import ResidualStack
+
+
+ class MelGANGenerator(torch.nn.Module):
+     """MelGAN generator module."""
+
+     def __init__(self,
+                  in_channels=80,
+                  out_channels=1,
+                  kernel_size=7,
+                  channels=512,
+                  bias=True,
+                  upsample_scales=[8, 8, 2, 2],
+                  stack_kernel_size=3,
+                  stacks=3,
+                  nonlinear_activation="LeakyReLU",
+                  nonlinear_activation_params={"negative_slope": 0.2},
+                  pad="ReflectionPad1d",
+                  pad_params={},
+                  use_final_nonlinear_activation=True,
+                  use_weight_norm=True,
+                  use_causal_conv=False,
+                  ):
+         """Initialize MelGANGenerator module.
+
+         Args:
+             in_channels (int): Number of input channels.
+             out_channels (int): Number of output channels.
+             kernel_size (int): Kernel size of initial and final conv layer.
+             channels (int): Initial number of channels for conv layer.
+             bias (bool): Whether to add bias parameter in convolution layers.
+             upsample_scales (list): List of upsampling scales.
+             stack_kernel_size (int): Kernel size of dilated conv layers in residual stack.
+             stacks (int): Number of stacks in a single residual stack.
+             nonlinear_activation (str): Activation function module name.
+             nonlinear_activation_params (dict): Hyperparameters for activation function.
+             pad (str): Padding function module name before dilated convolution layer.
+             pad_params (dict): Hyperparameters for padding function.
+             use_final_nonlinear_activation (bool): Whether to apply a final Tanh activation.
+             use_weight_norm (bool): Whether to use weight norm.
+                 If set to true, it will be applied to all of the conv layers.
+             use_causal_conv (bool): Whether to use causal convolution.
+
+         """
+         super(MelGANGenerator, self).__init__()
+
+         # check hyper parameters are valid
+         assert channels >= np.prod(upsample_scales)
+         assert channels % (2 ** len(upsample_scales)) == 0
+         if not use_causal_conv:
+             assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
+
+         # add initial layer
+         layers = []
+         if not use_causal_conv:
+             layers += [
+                 getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params),
+                 torch.nn.Conv1d(in_channels, channels, kernel_size, bias=bias),
+             ]
+         else:
+             layers += [
+                 CausalConv1d(in_channels, channels, kernel_size,
+                              bias=bias, pad=pad, pad_params=pad_params),
+             ]
+
+         for i, upsample_scale in enumerate(upsample_scales):
+             # add upsampling layer
+             layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)]
+             if not use_causal_conv:
+                 layers += [
+                     torch.nn.ConvTranspose1d(
+                         channels // (2 ** i),
+                         channels // (2 ** (i + 1)),
+                         upsample_scale * 2,
+                         stride=upsample_scale,
+                         padding=upsample_scale // 2 + upsample_scale % 2,
+                         output_padding=upsample_scale % 2,
+                         bias=bias,
+                     )
+                 ]
+             else:
+                 layers += [
+                     CausalConvTranspose1d(
+                         channels // (2 ** i),
+                         channels // (2 ** (i + 1)),
+                         upsample_scale * 2,
+                         stride=upsample_scale,
+                         bias=bias,
+                     )
+                 ]
+
+             # add residual stack
+             for j in range(stacks):
+                 layers += [
+                     ResidualStack(
+                         kernel_size=stack_kernel_size,
+                         channels=channels // (2 ** (i + 1)),
+                         dilation=stack_kernel_size ** j,
+                         bias=bias,
+                         nonlinear_activation=nonlinear_activation,
+                         nonlinear_activation_params=nonlinear_activation_params,
+                         pad=pad,
+                         pad_params=pad_params,
+                         use_causal_conv=use_causal_conv,
+                     )
+                 ]
+
+         # add final layer
+         layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)]
+         if not use_causal_conv:
+             layers += [
+                 getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params),
+                 torch.nn.Conv1d(channels // (2 ** (i + 1)), out_channels, kernel_size, bias=bias),
+             ]
+         else:
+             layers += [
+                 CausalConv1d(channels // (2 ** (i + 1)), out_channels, kernel_size,
+                              bias=bias, pad=pad, pad_params=pad_params),
+             ]
+         if use_final_nonlinear_activation:
+             layers += [torch.nn.Tanh()]
+
+         # define the model as a single function
+         self.melgan = torch.nn.Sequential(*layers)
+
+         # apply weight norm
+         if use_weight_norm:
+             self.apply_weight_norm()
+
+         # reset parameters
+         self.reset_parameters()
+
+     def forward(self, c):
+         """Calculate forward propagation.
+
+         Args:
+             c (Tensor): Input tensor (B, channels, T).
+
+         Returns:
+             Tensor: Output tensor (B, 1, T * prod(upsample_scales)).
+
+         """
+         return self.melgan(c)
+
+     def remove_weight_norm(self):
+         """Remove weight normalization module from all of the layers."""
+         def _remove_weight_norm(m):
+             try:
+                 logging.debug(f"Weight norm is removed from {m}.")
+                 torch.nn.utils.remove_weight_norm(m)
+             except ValueError:  # this module didn't have weight norm
+                 return
+
+         self.apply(_remove_weight_norm)
+
+     def apply_weight_norm(self):
+         """Apply weight normalization module to all of the layers."""
+         def _apply_weight_norm(m):
+             if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d):
+                 torch.nn.utils.weight_norm(m)
+                 logging.debug(f"Weight norm is applied to {m}.")
+
+         self.apply(_apply_weight_norm)
+
+     def reset_parameters(self):
+         """Reset parameters.
+
+         This initialization follows official implementation manner.
+         https://github.com/descriptinc/melgan-neurips/blob/master/spec2wav/modules.py
+
+         """
+         def _reset_parameters(m):
+             if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d):
+                 m.weight.data.normal_(0.0, 0.02)
+                 logging.debug(f"Reset parameters in {m}.")
+
+         self.apply(_reset_parameters)
+
+
+ class MelGANDiscriminator(torch.nn.Module):
+     """MelGAN discriminator module."""
+
+     def __init__(self,
+                  in_channels=1,
+                  out_channels=1,
+                  kernel_sizes=[5, 3],
+                  channels=16,
+                  max_downsample_channels=1024,
+                  bias=True,
+                  downsample_scales=[4, 4, 4, 4],
+                  nonlinear_activation="LeakyReLU",
+                  nonlinear_activation_params={"negative_slope": 0.2},
+                  pad="ReflectionPad1d",
+                  pad_params={},
+                  ):
+         """Initialize MelGAN discriminator module.
+
+         Args:
+             in_channels (int): Number of input channels.
+             out_channels (int): Number of output channels.
+             kernel_sizes (list): List of two kernel sizes. The prod will be used for the first conv layer,
+                 and the first and the second kernel sizes will be used for the last two layers.
+                 For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
+                 the last two layers' kernel size will be 5 and 3, respectively.
+             channels (int): Initial number of channels for conv layer.
+             max_downsample_channels (int): Maximum number of channels for downsampling layers.
+             bias (bool): Whether to add bias parameter in convolution layers.
+             downsample_scales (list): List of downsampling scales.
+             nonlinear_activation (str): Activation function module name.
+             nonlinear_activation_params (dict): Hyperparameters for activation function.
+             pad (str): Padding function module name before dilated convolution layer.
+             pad_params (dict): Hyperparameters for padding function.
+
+         """
+         super(MelGANDiscriminator, self).__init__()
+         self.layers = torch.nn.ModuleList()
+
+         # check kernel size is valid
+         assert len(kernel_sizes) == 2
+         assert kernel_sizes[0] % 2 == 1
+         assert kernel_sizes[1] % 2 == 1
+
+         # add first layer
+         self.layers += [
+             torch.nn.Sequential(
+                 getattr(torch.nn, pad)((np.prod(kernel_sizes) - 1) // 2, **pad_params),
+                 torch.nn.Conv1d(in_channels, channels, np.prod(kernel_sizes), bias=bias),
+                 getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+             )
+         ]
+
+         # add downsample layers
+         in_chs = channels
+         for downsample_scale in downsample_scales:
+             out_chs = min(in_chs * downsample_scale, max_downsample_channels)
+             self.layers += [
+                 torch.nn.Sequential(
+                     torch.nn.Conv1d(
+                         in_chs, out_chs,
+                         kernel_size=downsample_scale * 10 + 1,
+                         stride=downsample_scale,
+                         padding=downsample_scale * 5,
+                         groups=in_chs // 4,
+                         bias=bias,
+                     ),
+                     getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                 )
+             ]
+             in_chs = out_chs
+
+         # add final layers
+         out_chs = min(in_chs * 2, max_downsample_channels)
+         self.layers += [
+             torch.nn.Sequential(
+                 torch.nn.Conv1d(
+                     in_chs, out_chs, kernel_sizes[0],
+                     padding=(kernel_sizes[0] - 1) // 2,
+                     bias=bias,
+                 ),
+                 getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+             )
+         ]
+         self.layers += [
+             torch.nn.Conv1d(
+                 out_chs, out_channels, kernel_sizes[1],
+                 padding=(kernel_sizes[1] - 1) // 2,
+                 bias=bias,
+             ),
+         ]
+
+     def forward(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input noise signal (B, 1, T).
+
+         Returns:
+             List: List of output tensors of each layer.
+
+         """
+         outs = []
+         for f in self.layers:
+             x = f(x)
+             outs += [x]
+
+         return outs
+
+
+ class MelGANMultiScaleDiscriminator(torch.nn.Module):
+     """MelGAN multi-scale discriminator module."""
+
+     def __init__(self,
+                  in_channels=1,
+                  out_channels=1,
+                  scales=3,
+                  downsample_pooling="AvgPool1d",
+                  # follow the official implementation setting
+                  downsample_pooling_params={
+                      "kernel_size": 4,
+                      "stride": 2,
+                      "padding": 1,
+                      "count_include_pad": False,
+                  },
+                  kernel_sizes=[5, 3],
+                  channels=16,
+                  max_downsample_channels=1024,
+                  bias=True,
+                  downsample_scales=[4, 4, 4, 4],
+                  nonlinear_activation="LeakyReLU",
+                  nonlinear_activation_params={"negative_slope": 0.2},
+                  pad="ReflectionPad1d",
+                  pad_params={},
+                  use_weight_norm=True,
+                  ):
+         """Initialize MelGAN multi-scale discriminator module.
+
+         Args:
+             in_channels (int): Number of input channels.
+             out_channels (int): Number of output channels.
+             scales (int): Number of discriminators.
+             downsample_pooling (str): Pooling module name for downsampling of the inputs.
+             downsample_pooling_params (dict): Parameters for the above pooling module.
+             kernel_sizes (list): List of two kernel sizes. The prod will be used for the first conv layer,
+                 and the first and the second kernel sizes will be used for the last two layers.
+             channels (int): Initial number of channels for conv layer.
+             max_downsample_channels (int): Maximum number of channels for downsampling layers.
+             bias (bool): Whether to add bias parameter in convolution layers.
+             downsample_scales (list): List of downsampling scales.
+             nonlinear_activation (str): Activation function module name.
+             nonlinear_activation_params (dict): Hyperparameters for activation function.
+             pad (str): Padding function module name before dilated convolution layer.
+             pad_params (dict): Hyperparameters for padding function.
+             use_weight_norm (bool): Whether to use weight norm.
+
+         """
+         super(MelGANMultiScaleDiscriminator, self).__init__()
+         self.discriminators = torch.nn.ModuleList()
+
+         # add discriminators
+         for _ in range(scales):
+             self.discriminators += [
+                 MelGANDiscriminator(
+                     in_channels=in_channels,
+                     out_channels=out_channels,
+                     kernel_sizes=kernel_sizes,
+                     channels=channels,
+                     max_downsample_channels=max_downsample_channels,
+                     bias=bias,
+                     downsample_scales=downsample_scales,
+                     nonlinear_activation=nonlinear_activation,
+                     nonlinear_activation_params=nonlinear_activation_params,
+                     pad=pad,
+                     pad_params=pad_params,
+                 )
+             ]
+         self.pooling = getattr(torch.nn, downsample_pooling)(**downsample_pooling_params)
+
+         # apply weight norm
+         if use_weight_norm:
+             self.apply_weight_norm()
+
+         # reset parameters
+         self.reset_parameters()
+
+     def forward(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input noise signal (B, 1, T).
+
+         Returns:
+             List: List of list of each discriminator outputs, which consists of each layer output tensors.
+
+         """
+         outs = []
+         for f in self.discriminators:
+             outs += [f(x)]
+             x = self.pooling(x)
+
+         return outs
+
+     def remove_weight_norm(self):
+         """Remove weight normalization module from all of the layers."""
+         def _remove_weight_norm(m):
+             try:
+                 logging.debug(f"Weight norm is removed from {m}.")
+                 torch.nn.utils.remove_weight_norm(m)
+             except ValueError:  # this module didn't have weight norm
+                 return
+
+         self.apply(_remove_weight_norm)
+
+     def apply_weight_norm(self):
+         """Apply weight normalization module to all of the layers."""
+         def _apply_weight_norm(m):
+             if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d):
+                 torch.nn.utils.weight_norm(m)
+                 logging.debug(f"Weight norm is applied to {m}.")
+
+         self.apply(_apply_weight_norm)
+
+     def reset_parameters(self):
+         """Reset parameters.
+
+         This initialization follows official implementation manner.
+         https://github.com/descriptinc/melgan-neurips/blob/master/spec2wav/modules.py
+
+         """
+         def _reset_parameters(m):
+             if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d):
+                 m.weight.data.normal_(0.0, 0.02)
+                 logging.debug(f"Reset parameters in {m}.")
+
+         self.apply(_reset_parameters)
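Each transposed convolution in the generator multiplies the time axis by its upsample_scale while halving the channel count, so the default scales [8, 8, 2, 2] turn every input frame into 8 * 8 * 2 * 2 = 256 samples. A quick shape check with the default configuration:

import torch

from modules.parallel_wavegan.models.melgan import MelGANGenerator

gen = MelGANGenerator()     # defaults: 80 mel bins in, scales [8, 8, 2, 2]
c = torch.randn(1, 80, 40)  # (B, in_channels, T): 40 mel frames
wav = gen(c)                # Tanh output in [-1, 1]
assert wav.shape == (1, 1, 40 * 256)  # (B, out_channels, T * prod(scales))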
modules/parallel_wavegan/models/parallel_wavegan.py ADDED
@@ -0,0 +1,434 @@
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2019 Tomoki Hayashi
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ """Parallel WaveGAN Modules."""
+
+ import logging
+ import math
+
+ import torch
+ from torch import nn
+
+ from modules.parallel_wavegan.layers import Conv1d
+ from modules.parallel_wavegan.layers import Conv1d1x1
+ from modules.parallel_wavegan.layers import ResidualBlock
+ from modules.parallel_wavegan.layers import upsample
+ from modules.parallel_wavegan import models
+
+
+ class ParallelWaveGANGenerator(torch.nn.Module):
+     """Parallel WaveGAN Generator module."""
+
+     def __init__(self,
+                  in_channels=1,
+                  out_channels=1,
+                  kernel_size=3,
+                  layers=30,
+                  stacks=3,
+                  residual_channels=64,
+                  gate_channels=128,
+                  skip_channels=64,
+                  aux_channels=80,
+                  aux_context_window=2,
+                  dropout=0.0,
+                  bias=True,
+                  use_weight_norm=True,
+                  use_causal_conv=False,
+                  upsample_conditional_features=True,
+                  upsample_net="ConvInUpsampleNetwork",
+                  upsample_params={"upsample_scales": [4, 4, 4, 4]},
+                  use_pitch_embed=False,
+                  ):
+         """Initialize Parallel WaveGAN Generator module.
+
+         Args:
+             in_channels (int): Number of input channels.
+             out_channels (int): Number of output channels.
+             kernel_size (int): Kernel size of dilated convolution.
+             layers (int): Number of residual block layers.
+             stacks (int): Number of stacks i.e., dilation cycles.
+             residual_channels (int): Number of channels in residual conv.
+             gate_channels (int): Number of channels in gated conv.
+             skip_channels (int): Number of channels in skip conv.
+             aux_channels (int): Number of channels for auxiliary feature conv.
+             aux_context_window (int): Context window size for auxiliary feature.
+             dropout (float): Dropout rate. 0.0 means no dropout applied.
+             bias (bool): Whether to use bias parameter in conv layer.
+             use_weight_norm (bool): Whether to use weight norm.
+                 If set to true, it will be applied to all of the conv layers.
+             use_causal_conv (bool): Whether to use causal structure.
+             upsample_conditional_features (bool): Whether to use upsampling network.
+             upsample_net (str): Upsampling network architecture.
+             upsample_params (dict): Upsampling network parameters.
+
+         """
+         super(ParallelWaveGANGenerator, self).__init__()
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.aux_channels = aux_channels
+         self.layers = layers
+         self.stacks = stacks
+         self.kernel_size = kernel_size
+
+         # check the number of layers and stacks
+         assert layers % stacks == 0
+         layers_per_stack = layers // stacks
+
+         # define first convolution
+         self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)
+
+         # define conv + upsampling network
+         if upsample_conditional_features:
+             upsample_params.update({
+                 "use_causal_conv": use_causal_conv,
+             })
+             if upsample_net == "MelGANGenerator":
+                 assert aux_context_window == 0
+                 upsample_params.update({
+                     "use_weight_norm": False,  # not to apply twice
+                     "use_final_nonlinear_activation": False,
+                 })
+                 self.upsample_net = getattr(models, upsample_net)(**upsample_params)
+             else:
+                 if upsample_net == "ConvInUpsampleNetwork":
+                     upsample_params.update({
+                         "aux_channels": aux_channels,
+                         "aux_context_window": aux_context_window,
+                     })
+                 self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
+         else:
+             self.upsample_net = None
+
+         # define residual blocks
+         self.conv_layers = torch.nn.ModuleList()
+         for layer in range(layers):
+             dilation = 2 ** (layer % layers_per_stack)
+             conv = ResidualBlock(
+                 kernel_size=kernel_size,
+                 residual_channels=residual_channels,
+                 gate_channels=gate_channels,
+                 skip_channels=skip_channels,
+                 aux_channels=aux_channels,
+                 dilation=dilation,
+                 dropout=dropout,
+                 bias=bias,
+                 use_causal_conv=use_causal_conv,
+             )
+             self.conv_layers += [conv]
+
+         # define output layers
+         self.last_conv_layers = torch.nn.ModuleList([
+             torch.nn.ReLU(inplace=True),
+             Conv1d1x1(skip_channels, skip_channels, bias=True),
+             torch.nn.ReLU(inplace=True),
+             Conv1d1x1(skip_channels, out_channels, bias=True),
+         ])
+
+         self.use_pitch_embed = use_pitch_embed
+         if use_pitch_embed:
+             self.pitch_embed = nn.Embedding(300, aux_channels, 0)
+             self.c_proj = nn.Linear(2 * aux_channels, aux_channels)
+
+         # apply weight norm
+         if use_weight_norm:
+             self.apply_weight_norm()
+
+     def forward(self, x, c=None, pitch=None, **kwargs):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input noise signal (B, C_in, T).
+             c (Tensor): Local conditioning auxiliary features (B, C, T').
+             pitch (Tensor): Local conditioning pitch (B, T').
+
+         Returns:
+             Tensor: Output tensor (B, C_out, T)
+
+         """
+         # perform upsampling
+         if c is not None and self.upsample_net is not None:
+             if self.use_pitch_embed:
+                 p = self.pitch_embed(pitch)
+                 c = self.c_proj(torch.cat([c.transpose(1, 2), p], -1)).transpose(1, 2)
+             c = self.upsample_net(c)
+             assert c.size(-1) == x.size(-1), (c.size(-1), x.size(-1))
+
+         # encode to hidden representation
+         x = self.first_conv(x)
+         skips = 0
+         for f in self.conv_layers:
+             x, h = f(x, c)
+             skips += h
+         skips *= math.sqrt(1.0 / len(self.conv_layers))
+
+         # apply final layers
+         x = skips
+         for f in self.last_conv_layers:
+             x = f(x)
+
+         return x
+
+     def remove_weight_norm(self):
+         """Remove weight normalization module from all of the layers."""
+         def _remove_weight_norm(m):
+             try:
+                 logging.debug(f"Weight norm is removed from {m}.")
+                 torch.nn.utils.remove_weight_norm(m)
+             except ValueError:  # this module didn't have weight norm
+                 return
+
+         self.apply(_remove_weight_norm)
+
+     def apply_weight_norm(self):
+         """Apply weight normalization module to all of the layers."""
+         def _apply_weight_norm(m):
+             if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
+                 torch.nn.utils.weight_norm(m)
+                 logging.debug(f"Weight norm is applied to {m}.")
+
+         self.apply(_apply_weight_norm)
+
+     @staticmethod
+     def _get_receptive_field_size(layers, stacks, kernel_size,
+                                   dilation=lambda x: 2 ** x):
+         assert layers % stacks == 0
+         layers_per_cycle = layers // stacks
+         dilations = [dilation(i % layers_per_cycle) for i in range(layers)]
+         return (kernel_size - 1) * sum(dilations) + 1
+
+     @property
+     def receptive_field_size(self):
+         """Return receptive field size."""
+         return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)
+
+
+ class ParallelWaveGANDiscriminator(torch.nn.Module):
+     """Parallel WaveGAN Discriminator module."""
+
+     def __init__(self,
+                  in_channels=1,
+                  out_channels=1,
+                  kernel_size=3,
+                  layers=10,
+                  conv_channels=64,
+                  dilation_factor=1,
+                  nonlinear_activation="LeakyReLU",
+                  nonlinear_activation_params={"negative_slope": 0.2},
+                  bias=True,
+                  use_weight_norm=True,
+                  ):
+         """Initialize Parallel WaveGAN Discriminator module.
+
+         Args:
+             in_channels (int): Number of input channels.
+             out_channels (int): Number of output channels.
+             kernel_size (int): Kernel size of conv layers.
+             layers (int): Number of conv layers.
+             conv_channels (int): Number of channels in conv layers.
+             dilation_factor (int): Dilation factor. For example, if dilation_factor = 2,
+                 the dilation will be 2, 4, 8, ..., and so on.
+             nonlinear_activation (str): Nonlinear function after each conv.
+             nonlinear_activation_params (dict): Nonlinear function parameters
+             bias (bool): Whether to use bias parameter in conv.
+             use_weight_norm (bool): Whether to use weight norm.
+                 If set to true, it will be applied to all of the conv layers.
+
+         """
+         super(ParallelWaveGANDiscriminator, self).__init__()
+         assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
+         assert dilation_factor > 0, "Dilation factor must be > 0."
+         self.conv_layers = torch.nn.ModuleList()
+         conv_in_channels = in_channels
+         for i in range(layers - 1):
+             if i == 0:
+                 dilation = 1
+             else:
+                 dilation = i if dilation_factor == 1 else dilation_factor ** i
+                 conv_in_channels = conv_channels
+             padding = (kernel_size - 1) // 2 * dilation
+             conv_layer = [
+                 Conv1d(conv_in_channels, conv_channels,
+                        kernel_size=kernel_size, padding=padding,
+                        dilation=dilation, bias=bias),
+                 getattr(torch.nn, nonlinear_activation)(inplace=True, **nonlinear_activation_params)
+             ]
+             self.conv_layers += conv_layer
+         padding = (kernel_size - 1) // 2
+         last_conv_layer = Conv1d(
+             conv_in_channels, out_channels,
+             kernel_size=kernel_size, padding=padding, bias=bias)
+         self.conv_layers += [last_conv_layer]
+
+         # apply weight norm
+         if use_weight_norm:
+             self.apply_weight_norm()
+
+     def forward(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input noise signal (B, 1, T).
+
+         Returns:
+             Tensor: Output tensor (B, 1, T)
+
+         """
+         for f in self.conv_layers:
+             x = f(x)
+         return x
+
+     def apply_weight_norm(self):
+         """Apply weight normalization module to all of the layers."""
+         def _apply_weight_norm(m):
+             if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
+                 torch.nn.utils.weight_norm(m)
+                 logging.debug(f"Weight norm is applied to {m}.")
+
+         self.apply(_apply_weight_norm)
+
+     def remove_weight_norm(self):
+         """Remove weight normalization module from all of the layers."""
+         def _remove_weight_norm(m):
+             try:
+                 logging.debug(f"Weight norm is removed from {m}.")
+                 torch.nn.utils.remove_weight_norm(m)
+             except ValueError:  # this module didn't have weight norm
+                 return
+
+         self.apply(_remove_weight_norm)
+
+
+ class ResidualParallelWaveGANDiscriminator(torch.nn.Module):
+     """Parallel WaveGAN Discriminator module."""
+
+     def __init__(self,
+                  in_channels=1,
+                  out_channels=1,
+                  kernel_size=3,
+                  layers=30,
+                  stacks=3,
+                  residual_channels=64,
+                  gate_channels=128,
+                  skip_channels=64,
+                  dropout=0.0,
+                  bias=True,
+                  use_weight_norm=True,
+                  use_causal_conv=False,
+                  nonlinear_activation="LeakyReLU",
+                  nonlinear_activation_params={"negative_slope": 0.2},
+                  ):
+         """Initialize Parallel WaveGAN Discriminator module.
+
+         Args:
+             in_channels (int): Number of input channels.
+             out_channels (int): Number of output channels.
+             kernel_size (int): Kernel size of dilated convolution.
+             layers (int): Number of residual block layers.
+             stacks (int): Number of stacks i.e., dilation cycles.
+             residual_channels (int): Number of channels in residual conv.
+             gate_channels (int): Number of channels in gated conv.
+             skip_channels (int): Number of channels in skip conv.
+             dropout (float): Dropout rate. 0.0 means no dropout applied.
+             bias (bool): Whether to use bias parameter in conv.
+             use_weight_norm (bool): Whether to use weight norm.
+                 If set to true, it will be applied to all of the conv layers.
+             use_causal_conv (bool): Whether to use causal structure.
+             nonlinear_activation (str): Nonlinear function after each conv.
+             nonlinear_activation_params (dict): Nonlinear function parameters
+
+         """
+         super(ResidualParallelWaveGANDiscriminator, self).__init__()
+         assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
+
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.layers = layers
+         self.stacks = stacks
+         self.kernel_size = kernel_size
+
+         # check the number of layers and stacks
+         assert layers % stacks == 0
+         layers_per_stack = layers // stacks
+
+         # define first convolution
+         self.first_conv = torch.nn.Sequential(
+             Conv1d1x1(in_channels, residual_channels, bias=True),
+             getattr(torch.nn, nonlinear_activation)(
+                 inplace=True, **nonlinear_activation_params),
+         )
+
+         # define residual blocks
+         self.conv_layers = torch.nn.ModuleList()
+         for layer in range(layers):
+             dilation = 2 ** (layer % layers_per_stack)
+             conv = ResidualBlock(
+                 kernel_size=kernel_size,
+                 residual_channels=residual_channels,
+                 gate_channels=gate_channels,
+                 skip_channels=skip_channels,
+                 aux_channels=-1,
+                 dilation=dilation,
+                 dropout=dropout,
+                 bias=bias,
+                 use_causal_conv=use_causal_conv,
+             )
+             self.conv_layers += [conv]
+
+         # define output layers
+         self.last_conv_layers = torch.nn.ModuleList([
+             getattr(torch.nn, nonlinear_activation)(
+                 inplace=True, **nonlinear_activation_params),
+             Conv1d1x1(skip_channels, skip_channels, bias=True),
+             getattr(torch.nn, nonlinear_activation)(
+                 inplace=True, **nonlinear_activation_params),
+             Conv1d1x1(skip_channels, out_channels, bias=True),
+         ])
+
+         # apply weight norm
+         if use_weight_norm:
+             self.apply_weight_norm()
+
+     def forward(self, x):
+         """Calculate forward propagation.
+
+         Args:
+             x (Tensor): Input noise signal (B, 1, T).
+
+         Returns:
+             Tensor: Output tensor (B, 1, T)
+
+         """
+         x = self.first_conv(x)
+
+         skips = 0
+         for f in self.conv_layers:
+             x, h = f(x, None)
+             skips += h
+         skips *= math.sqrt(1.0 / len(self.conv_layers))
+
+         # apply final layers
+         x = skips
+         for f in self.last_conv_layers:
+             x = f(x)
+         return x
+
+     def apply_weight_norm(self):
+         """Apply weight normalization module to all of the layers."""
+         def _apply_weight_norm(m):
+             if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
+                 torch.nn.utils.weight_norm(m)
+                 logging.debug(f"Weight norm is applied to {m}.")
+
+         self.apply(_apply_weight_norm)
+
+     def remove_weight_norm(self):
+         """Remove weight normalization module from all of the layers."""
+         def _remove_weight_norm(m):
+             try:
+                 logging.debug(f"Weight norm is removed from {m}.")
+                 torch.nn.utils.remove_weight_norm(m)
+             except ValueError:  # this module didn't have weight norm
+                 return
+
+         self.apply(_remove_weight_norm)
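With the generator defaults (layers=30, stacks=3, kernel_size=3), the dilations run 1, 2, 4, ..., 512 three times over, so sum(dilations) = 3 * 1023 = 3069 and the receptive field is (3 - 1) * 3069 + 1 = 6139 samples. This can be read off the property directly:

from modules.parallel_wavegan.models.parallel_wavegan import ParallelWaveGANGenerator

g = ParallelWaveGANGenerator()  # layers=30, stacks=3, kernel_size=3
print(g.receptive_field_size)   # 6139 = (3 - 1) * 3 * (2**10 - 1) + 1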
modules/parallel_wavegan/models/source.py ADDED
@@ -0,0 +1,538 @@
+ import torch
+ import numpy as np
+ import sys
+ import torch.nn.functional as torch_nn_func
+
+
+ class SineGen(torch.nn.Module):
+     """ Definition of sine generator
+     SineGen(samp_rate, harmonic_num = 0,
+             sine_amp = 0.1, noise_std = 0.003,
+             voiced_threshold = 0,
+             flag_for_pulse=False)
+
+     samp_rate: sampling rate in Hz
+     harmonic_num: number of harmonic overtones (default 0)
+     sine_amp: amplitude of sine-waveform (default 0.1)
+     noise_std: std of Gaussian noise (default 0.003)
+     voiced_threshold: F0 threshold for U/V classification (default 0)
+     flag_for_pulse: this SineGen is used inside PulseGen (default False)
+
+     Note: when flag_for_pulse is True, the first time step of a voiced
+     segment is always sin(np.pi) or cos(0)
+     """
+
+     def __init__(self, samp_rate, harmonic_num=0,
+                  sine_amp=0.1, noise_std=0.003,
+                  voiced_threshold=0,
+                  flag_for_pulse=False):
+         super(SineGen, self).__init__()
+         self.sine_amp = sine_amp
+         self.noise_std = noise_std
+         self.harmonic_num = harmonic_num
+         self.dim = self.harmonic_num + 1
+         self.sampling_rate = samp_rate
+         self.voiced_threshold = voiced_threshold
+         self.flag_for_pulse = flag_for_pulse
+
+     def _f02uv(self, f0):
+         # generate uv signal
+         uv = torch.ones_like(f0)
+         uv = uv * (f0 > self.voiced_threshold)
+         return uv
+
+     def _f02sine(self, f0_values):
+         """ f0_values: (batchsize, length, dim)
+         where dim indicates fundamental tone and overtones
+         """
+         # convert to F0 in rad. The integer part n can be ignored
+         # because 2 * np.pi * n doesn't affect phase
+         rad_values = (f0_values / self.sampling_rate) % 1
+
+         # initial phase noise (no noise for fundamental component)
+         rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2],
+                               device=f0_values.device)
+         rand_ini[:, 0] = 0
+         rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+         # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
+         if not self.flag_for_pulse:
+             # for normal case
+
+             # To prevent torch.cumsum numerical overflow,
+             # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
+             # Buffer tmp_over_one_idx indicates the time step to add -1.
+             # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
+             tmp_over_one = torch.cumsum(rad_values, 1) % 1
+             tmp_over_one_idx = (tmp_over_one[:, 1:, :] -
+                                 tmp_over_one[:, :-1, :]) < 0
+             cumsum_shift = torch.zeros_like(rad_values)
+             cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+             sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
+                               * 2 * np.pi)
+         else:
+             # If necessary, make sure that the first time step of every
+             # voiced segment is sin(pi) or cos(0)
+             # This is used for pulse-train generation
+
+             # identify the last time step in unvoiced segments
+             uv = self._f02uv(f0_values)
+             uv_1 = torch.roll(uv, shifts=-1, dims=1)
+             uv_1[:, -1, :] = 1
+             u_loc = (uv < 1) * (uv_1 > 0)
+
+             # get the instantaneous phase
+             tmp_cumsum = torch.cumsum(rad_values, dim=1)
+             # different batch needs to be processed differently
+             for idx in range(f0_values.shape[0]):
+                 temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                 temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                 # stores the accumulation of i.phase within
+                 # each voiced segment
+                 tmp_cumsum[idx, :, :] = 0
+                 tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+
+             # rad_values - tmp_cumsum: remove the accumulation of i.phase
+             # within the previous voiced segment.
+             i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+
+             # get the sines
+             sines = torch.cos(i_phase * 2 * np.pi)
+         return sines
+
+     def forward(self, f0):
+         """ sine_tensor, uv = forward(f0)
+         input F0: tensor(batchsize=1, length, dim=1)
+             f0 for unvoiced steps should be 0
+         output sine_tensor: tensor(batchsize=1, length, dim)
+         output uv: tensor(batchsize=1, length, 1)
+         """
+         with torch.no_grad():
+             f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
+                                  device=f0.device)
+             # fundamental component
+             f0_buf[:, :, 0] = f0[:, :, 0]
+             for idx in np.arange(self.harmonic_num):
+                 # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+                 f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
+
+             # generate sine waveforms
+             sine_waves = self._f02sine(f0_buf) * self.sine_amp
+
+             # generate uv signal
+             # uv = torch.ones(f0.shape)
+             # uv = uv * (f0 > self.voiced_threshold)
+             uv = self._f02uv(f0)
+
+             # noise: for unvoiced should be similar to sine_amp
+             #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+             #        for voiced regions is self.noise_std
+             noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+             noise = noise_amp * torch.randn_like(sine_waves)
+
+             # first: set the unvoiced part to 0 by uv
+             # then: additive noise
+             sine_waves = sine_waves * uv + noise
+         return sine_waves, uv, noise
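Everything in SineGen runs under torch.no_grad(), so it can be probed directly with a synthetic F0 contour; unvoiced frames (f0 at or below voiced_threshold) come out as pure noise with std sine_amp / 3. A sketch at an assumed 24 kHz sampling rate (note this file defines SineGen a second time further down; the later, functionally identical definition is the one that survives import):

import torch

from modules.parallel_wavegan.models.source import SineGen

sine_gen = SineGen(samp_rate=24000, harmonic_num=8)
f0 = torch.zeros(1, 24000, 1)        # one second of F0, unvoiced by default
f0[:, 8000:16000, 0] = 220.0         # a voiced 220 Hz segment in the middle
sine, uv, noise = sine_gen(f0)
# sine: (1, 24000, 9) fundamental + 8 overtones; uv: (1, 24000, 1) voiced mask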
138
+
139
+
140
+ class PulseGen(torch.nn.Module):
141
+ """ Definition of Pulse train generator
142
+
143
+ There are many ways to implement pulse generator.
144
+ Here, PulseGen is based on SinGen. For a perfect
145
+ """
146
+ def __init__(self, samp_rate, pulse_amp = 0.1,
147
+ noise_std = 0.003, voiced_threshold = 0):
148
+ super(PulseGen, self).__init__()
149
+ self.pulse_amp = pulse_amp
150
+ self.sampling_rate = samp_rate
151
+ self.voiced_threshold = voiced_threshold
152
+ self.noise_std = noise_std
153
+ self.l_sinegen = SineGen(self.sampling_rate, harmonic_num=0, \
154
+ sine_amp=self.pulse_amp, noise_std=0, \
155
+ voiced_threshold=self.voiced_threshold, \
156
+ flag_for_pulse=True)
157
+
158
+ def forward(self, f0):
159
+ """ Pulse train generator
160
+ pulse_train, uv = forward(f0)
161
+ input F0: tensor(batchsize=1, length, dim=1)
162
+ f0 for unvoiced steps should be 0
163
+ output pulse_train: tensor(batchsize=1, length, dim)
164
+ output uv: tensor(batchsize=1, length, 1)
165
+
166
+ Note: self.l_sine doesn't make sure that the initial phase of
167
+ a voiced segment is np.pi, the first pulse in a voiced segment
168
+ may not be at the first time step within a voiced segment
169
+ """
170
+ with torch.no_grad():
171
+ sine_wav, uv, noise = self.l_sinegen(f0)
172
+
173
+ # sine without additive noise
174
+ pure_sine = sine_wav - noise
175
+
176
+ # step t corresponds to a pulse if
177
+ # sine[t] > sine[t+1] & sine[t] > sine[t-1]
178
+ # & sine[t-1], sine[t+1], and sine[t] are voiced
179
+ # or
180
+ # sine[t] is voiced, sine[t-1] is unvoiced
181
+ # we use torch.roll to simulate sine[t+1] and sine[t-1]
182
+ sine_1 = torch.roll(pure_sine, shifts=1, dims=1)
183
+ uv_1 = torch.roll(uv, shifts=1, dims=1)
184
+ uv_1[:, 0, :] = 0
185
+ sine_2 = torch.roll(pure_sine, shifts=-1, dims=1)
186
+ uv_2 = torch.roll(uv, shifts=-1, dims=1)
187
+ uv_2[:, -1, :] = 0
188
+
189
+ loc = (pure_sine > sine_1) * (pure_sine > sine_2) \
190
+ * (uv_1 > 0) * (uv_2 > 0) * (uv > 0) \
191
+ + (uv_1 < 1) * (uv > 0)
192
+
193
+ # pulse train without noise
194
+ pulse_train = pure_sine * loc
195
+
196
+ # additive noise to pulse train
197
+ # note that noise from sinegen is zero in voiced regions
198
+ pulse_noise = torch.randn_like(pure_sine) * self.noise_std
199
+
200
+ # with additive noise on pulse, and unvoiced regions
201
+ pulse_train += pulse_noise * loc + pulse_noise * (1 - uv)
202
+ return pulse_train, sine_wav, uv, pulse_noise
203
+
204
+
205
+ class SignalsConv1d(torch.nn.Module):
206
+ """ Filtering input signal with time invariant filter
207
+ Note: FIRFilter conducted filtering given fixed FIR weight
208
+ SignalsConv1d convolves two signals
209
+ Note: this is based on torch.nn.functional.conv1d
210
+
211
+ """
212
+
213
+ def __init__(self):
214
+ super(SignalsConv1d, self).__init__()
215
+
216
+ def forward(self, signal, system_ir):
217
+ """ output = forward(signal, system_ir)
218
+
219
+ signal: (batchsize, length1, dim)
220
+ system_ir: (length2, dim)
221
+
222
+ output: (batchsize, length1, dim)
223
+ """
224
+ if signal.shape[-1] != system_ir.shape[-1]:
225
+ print("Error: SignalsConv1d expects shape:")
226
+ print("signal (batchsize, length1, dim)")
227
+ print("system_id (batchsize, length2, dim)")
228
+ print("But received signal: {:s}".format(str(signal.shape)))
229
+ print(" system_ir: {:s}".format(str(system_ir.shape)))
230
+ sys.exit(1)
231
+ padding_length = system_ir.shape[0] - 1
232
+ groups = signal.shape[-1]
233
+
234
+ # pad signal on the left
235
+ signal_pad = torch_nn_func.pad(signal.permute(0, 2, 1), \
236
+ (padding_length, 0))
237
+ # prepare system impulse response as (dim, 1, length2)
238
+ # also flip the impulse response
239
+ ir = torch.flip(system_ir.unsqueeze(1).permute(2, 1, 0), \
240
+ dims=[2])
241
+ # convolute
242
+ output = torch_nn_func.conv1d(signal_pad, ir, groups=groups)
243
+ return output.permute(0, 2, 1)
244
+
245
+
246
+ class CyclicNoiseGen_v1(torch.nn.Module):
247
+ """ CyclicnoiseGen_v1
248
+ Cyclic noise with a single parameter of beta.
249
+ Pytorch v1 implementation assumes f_t is also fixed
250
+ """
251
+
252
+ def __init__(self, samp_rate,
253
+ noise_std=0.003, voiced_threshold=0):
254
+ super(CyclicNoiseGen_v1, self).__init__()
255
+ self.samp_rate = samp_rate
256
+ self.noise_std = noise_std
257
+ self.voiced_threshold = voiced_threshold
258
+
259
+ self.l_pulse = PulseGen(samp_rate, pulse_amp=1.0,
260
+ noise_std=noise_std,
261
+ voiced_threshold=voiced_threshold)
262
+ self.l_conv = SignalsConv1d()
263
+
264
+ def noise_decay(self, beta, f0mean):
265
+ """ decayed_noise = noise_decay(beta, f0mean)
266
+ decayed_noise = n[t]exp(-t * f_mean / beta / samp_rate)
267
+
268
+ beta: (dim=1) or (batchsize=1, 1, dim=1)
269
+ f0mean (batchsize=1, 1, dim=1)
270
+
271
+ decayed_noise (batchsize=1, length, dim=1)
272
+ """
273
+ with torch.no_grad():
274
+ # exp(-1.0 n / T) < 0.01 => n > -log(0.01)*T = 4.60*T
275
+ # truncate the noise when decayed by -40 dB
276
+ length = 4.6 * self.samp_rate / f0mean
277
+ length = length.int()
278
+ time_idx = torch.arange(0, length, device=beta.device)
279
+ time_idx = time_idx.unsqueeze(0).unsqueeze(2)
280
+ time_idx = time_idx.repeat(beta.shape[0], 1, beta.shape[2])
281
+
282
+ noise = torch.randn(time_idx.shape, device=beta.device)
283
+
284
+ # due to Pytorch implementation, use f0_mean as the f0 factor
285
+ decay = torch.exp(-time_idx * f0mean / beta / self.samp_rate)
286
+ return noise * self.noise_std * decay
287
+
288
+ def forward(self, f0s, beta):
289
+ """ Producde cyclic-noise
290
+ """
291
+ # pulse train
292
+ pulse_train, sine_wav, uv, noise = self.l_pulse(f0s)
293
+ pure_pulse = pulse_train - noise
294
+
295
+ # decayed_noise (length, dim=1)
296
+ if (uv < 1).all():
297
+ # all unvoiced
298
+ cyc_noise = torch.zeros_like(sine_wav)
299
+ else:
300
+ f0mean = f0s[uv > 0].mean()
301
+
302
+ decayed_noise = self.noise_decay(beta, f0mean)[0, :, :]
303
+ # convolute
304
+ cyc_noise = self.l_conv(pure_pulse, decayed_noise)
305
+
306
+ # add noise in invoiced segments
307
+ cyc_noise = cyc_noise + noise * (1.0 - uv)
308
+ return cyc_noise, pulse_train, sine_wav, uv, noise
309
+
310
+
311
+ class SineGen(torch.nn.Module):
312
+ """ Definition of sine generator
313
+ SineGen(samp_rate, harmonic_num = 0,
314
+ sine_amp = 0.1, noise_std = 0.003,
315
+ voiced_threshold = 0,
316
+ flag_for_pulse=False)
317
+
318
+ samp_rate: sampling rate in Hz
319
+ harmonic_num: number of harmonic overtones (default 0)
320
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
321
+ noise_std: std of Gaussian noise (default 0.003)
322
+ voiced_thoreshold: F0 threshold for U/V classification (default 0)
323
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
324
+
325
+ Note: when flag_for_pulse is True, the first time step of a voiced
326
+ segment is always sin(np.pi) or cos(0)
327
+ """
328
+
329
+ def __init__(self, samp_rate, harmonic_num=0,
330
+ sine_amp=0.1, noise_std=0.003,
331
+ voiced_threshold=0,
332
+ flag_for_pulse=False):
333
+ super(SineGen, self).__init__()
334
+ self.sine_amp = sine_amp
335
+ self.noise_std = noise_std
336
+ self.harmonic_num = harmonic_num
337
+ self.dim = self.harmonic_num + 1
338
+ self.sampling_rate = samp_rate
339
+ self.voiced_threshold = voiced_threshold
340
+ self.flag_for_pulse = flag_for_pulse
341
+
342
+ def _f02uv(self, f0):
343
+ # generate uv signal
344
+ uv = torch.ones_like(f0)
345
+ uv = uv * (f0 > self.voiced_threshold)
346
+ return uv
347
+
348
+ def _f02sine(self, f0_values):
349
+ """ f0_values: (batchsize, length, dim)
350
+ where dim indicates fundamental tone and overtones
351
+ """
352
+ # convert to F0 in rad. The interger part n can be ignored
353
+ # because 2 * np.pi * n doesn't affect phase
354
+ rad_values = (f0_values / self.sampling_rate) % 1
355
+
356
+ # initial phase noise (no noise for fundamental component)
357
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
358
+ device=f0_values.device)
359
+ rand_ini[:, 0] = 0
360
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
361
+
362
+ # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
363
+ if not self.flag_for_pulse:
364
+ # for normal case
365
+
366
+ # To prevent torch.cumsum numerical overflow,
367
+ # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
368
+ # Buffer tmp_over_one_idx indicates the time step to add -1.
369
+ # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
370
+ tmp_over_one = torch.cumsum(rad_values, 1) % 1
371
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] -
372
+ tmp_over_one[:, :-1, :]) < 0
373
+ cumsum_shift = torch.zeros_like(rad_values)
374
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
375
+
376
+ sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
377
+ * 2 * np.pi)
378
+ else:
379
+ # If necessary, make sure that the first time step of every
380
+ # voiced segments is sin(pi) or cos(0)
381
+ # This is used for pulse-train generation
382
+
383
+ # identify the last time step in unvoiced segments
384
+ uv = self._f02uv(f0_values)
385
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
386
+ uv_1[:, -1, :] = 1
387
+ u_loc = (uv < 1) * (uv_1 > 0)
388
+
389
+ # get the instantanouse phase
390
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
391
+ # different batch needs to be processed differently
392
+ for idx in range(f0_values.shape[0]):
393
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
394
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
395
+ # stores the accumulation of i.phase within
396
+ # each voiced segments
397
+ tmp_cumsum[idx, :, :] = 0
398
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
399
+
400
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
401
+ # within the previous voiced segment.
402
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
403
+
404
+ # get the sines
405
+ sines = torch.cos(i_phase * 2 * np.pi)
406
+ return sines
407
+
408
+ def forward(self, f0):
409
+ """ sine_tensor, uv = forward(f0)
410
+ input F0: tensor(batchsize=1, length, dim=1)
411
+ f0 for unvoiced steps should be 0
412
+ output sine_tensor: tensor(batchsize=1, length, dim)
413
+ output uv: tensor(batchsize=1, length, 1)
414
+ """
415
+ with torch.no_grad():
416
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, \
417
+ device=f0.device)
418
+ # fundamental component
419
+ f0_buf[:, :, 0] = f0[:, :, 0]
420
+ for idx in np.arange(self.harmonic_num):
421
+ # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
422
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
423
+
424
+ # generate sine waveforms
425
+ sine_waves = self._f02sine(f0_buf) * self.sine_amp
426
+
427
+ # generate uv signal
428
+ # uv = torch.ones(f0.shape)
429
+ # uv = uv * (f0 > self.voiced_threshold)
430
+ uv = self._f02uv(f0)
431
+
432
+ # noise: for unvoiced should be similar to sine_amp
433
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
434
+ # for voiced regions the std is self.noise_std
435
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
436
+ noise = noise_amp * torch.randn_like(sine_waves)
437
+
438
+ # first: set the unvoiced part to 0 by uv
439
+ # then: additive noise
440
+ sine_waves = sine_waves * uv + noise
441
+ return sine_waves, uv, noise
442
+
443
+
444
+ class SourceModuleCycNoise_v1(torch.nn.Module):
445
+ """ SourceModuleCycNoise_v1
446
+ SourceModule(sampling_rate, noise_std=0.003, voiced_threshod=0)
447
+ sampling_rate: sampling_rate in Hz
448
+
449
+ noise_std: std of Gaussian noise (default: 0.003)
450
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
451
+
452
+ cyc, noise, uv = SourceModuleCycNoise_v1(F0_upsampled, beta)
453
+ F0_upsampled (batchsize, length, 1)
454
+ beta (1)
455
+ cyc (batchsize, length, 1)
456
+ noise (batchsize, length, 1)
457
+ uv (batchsize, length, 1)
458
+ """
459
+
460
+ def __init__(self, sampling_rate, noise_std=0.003, voiced_threshod=0):
461
+ super(SourceModuleCycNoise_v1, self).__init__()
462
+ self.sampling_rate = sampling_rate
463
+ self.noise_std = noise_std
464
+ self.l_cyc_gen = CyclicNoiseGen_v1(sampling_rate, noise_std,
465
+ voiced_threshod)
466
+
467
+ def forward(self, f0_upsamped, beta):
468
+ """
469
+ cyc, noise, uv = SourceModuleCycNoise_v1(F0, beta)
470
+ F0_upsampled (batchsize, length, 1)
471
+ beta (1)
472
+ cyc (batchsize, length, 1)
473
+ noise (batchsize, length, 1)
474
+ uv (batchsize, length, 1)
475
+ """
476
+ # source for harmonic branch
477
+ cyc, pulse, sine, uv, add_noi = self.l_cyc_gen(f0_upsamped, beta)
478
+
479
+ # source for noise branch, in the same shape as uv
480
+ noise = torch.randn_like(uv) * self.noise_std / 3
481
+ return cyc, noise, uv
482
+
483
+
484
+ class SourceModuleHnNSF(torch.nn.Module):
485
+ """ SourceModule for hn-nsf
486
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
487
+ add_noise_std=0.003, voiced_threshod=0)
488
+ sampling_rate: sampling_rate in Hz
489
+ harmonic_num: number of harmonic above F0 (default: 0)
490
+ sine_amp: amplitude of sine source signal (default: 0.1)
491
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
492
+ note that amplitude of noise in unvoiced is decided
493
+ by sine_amp
494
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
495
+
496
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
497
+ F0_sampled (batchsize, length, 1)
498
+ Sine_source (batchsize, length, 1)
499
+ noise_source (batchsize, length 1)
500
+ uv (batchsize, length, 1)
501
+ """
502
+
503
+ def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
504
+ add_noise_std=0.003, voiced_threshod=0):
505
+ super(SourceModuleHnNSF, self).__init__()
506
+
507
+ self.sine_amp = sine_amp
508
+ self.noise_std = add_noise_std
509
+
510
+ # to produce sine waveforms
511
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
512
+ sine_amp, add_noise_std, voiced_threshod)
513
+
514
+ # to merge source harmonics into a single excitation
515
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
516
+ self.l_tanh = torch.nn.Tanh()
517
+
518
+ def forward(self, x):
519
+ """
520
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
521
+ F0_sampled (batchsize, length, 1)
522
+ Sine_source (batchsize, length, 1)
523
+ noise_source (batchsize, length 1)
524
+ """
525
+ # source for harmonic branch
526
+ sine_wavs, uv, _ = self.l_sin_gen(x)
527
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
528
+
529
+ # source for noise branch, in the same shape as uv
530
+ noise = torch.randn_like(uv) * self.sine_amp / 3
531
+ return sine_merge, noise, uv
532
+
533
+
534
+ if __name__ == '__main__':
535
+ source = SourceModuleCycNoise_v1(24000)
536
+ x = torch.randn(16, 25600, 1)
537
+
538
+
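The `__main__` stub above constructs a cyclic-noise source but never runs it. For orientation, here is a minimal sketch (not part of the commit) of how the harmonic source module is driven; the shapes follow the docstrings above, and the 44.1 kHz rate and constant 220 Hz contour are illustrative assumptions:

```python
# Hedged sketch: drive SourceModuleHnNSF with a synthetic upsampled F0 contour.
import torch

source = SourceModuleHnNSF(sampling_rate=44100, harmonic_num=8)
f0 = torch.full((1, 44100, 1), 220.0)  # 1 s of a steady 220 Hz tone, (B, T, 1)
sine_merge, noise, uv = source(f0)     # each output: (1, 44100, 1)
print(sine_merge.shape, uv.mean())     # uv is all ones for a fully voiced contour
```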
modules/parallel_wavegan/optimizers/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from torch.optim import * # NOQA
2
+ from .radam import * # NOQA
modules/parallel_wavegan/optimizers/radam.py ADDED
@@ -0,0 +1,91 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """RAdam optimizer.
4
+
5
+ This code is derived from https://github.com/LiyuanLucasLiu/RAdam.
6
+ """
7
+
8
+ import math
9
+ import torch
10
+
11
+ from torch.optim.optimizer import Optimizer
12
+
13
+
14
+ class RAdam(Optimizer):
15
+ """Rectified Adam optimizer."""
16
+
17
+ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
18
+ """Initilize RAdam optimizer."""
19
+ defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
20
+ self.buffer = [[None, None, None] for ind in range(10)]
21
+ super(RAdam, self).__init__(params, defaults)
22
+
23
+ def __setstate__(self, state):
24
+ """Set state."""
25
+ super(RAdam, self).__setstate__(state)
26
+
27
+ def step(self, closure=None):
28
+ """Run one step."""
29
+ loss = None
30
+ if closure is not None:
31
+ loss = closure()
32
+
33
+ for group in self.param_groups:
34
+
35
+ for p in group['params']:
36
+ if p.grad is None:
37
+ continue
38
+ grad = p.grad.data.float()
39
+ if grad.is_sparse:
40
+ raise RuntimeError('RAdam does not support sparse gradients')
41
+
42
+ p_data_fp32 = p.data.float()
43
+
44
+ state = self.state[p]
45
+
46
+ if len(state) == 0:
47
+ state['step'] = 0
48
+ state['exp_avg'] = torch.zeros_like(p_data_fp32)
49
+ state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
50
+ else:
51
+ state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
52
+ state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
53
+
54
+ exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
55
+ beta1, beta2 = group['betas']
56
+
57
+ exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
58
+ exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
59
+
60
+ state['step'] += 1
61
+ buffered = self.buffer[int(state['step'] % 10)]
62
+ if state['step'] == buffered[0]:
63
+ N_sma, step_size = buffered[1], buffered[2]
64
+ else:
65
+ buffered[0] = state['step']
66
+ beta2_t = beta2 ** state['step']
67
+ N_sma_max = 2 / (1 - beta2) - 1
68
+ N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
69
+ buffered[1] = N_sma
70
+
71
+ # more conservative since it's an approximated value
72
+ if N_sma >= 5:
73
+ step_size = math.sqrt(
74
+ (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) # NOQA
75
+ else:
76
+ step_size = 1.0 / (1 - beta1 ** state['step'])
77
+ buffered[2] = step_size
78
+
79
+ if group['weight_decay'] != 0:
80
+ p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
81
+
82
+ # more conservative since it's an approximated value
83
+ if N_sma >= 5:
84
+ denom = exp_avg_sq.sqrt().add_(group['eps'])
85
+ p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
86
+ else:
87
+ p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
88
+
89
+ p.data.copy_(p_data_fp32)
90
+
91
+ return loss
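For orientation (not part of the commit), RAdam drops into the standard `torch.optim` training loop like any other optimizer; the toy regression below is an illustrative assumption:

```python
# Hedged sketch: RAdam behaves like any torch.optim optimizer.
import torch

model = torch.nn.Linear(10, 1)
opt = RAdam(model.parameters(), lr=1e-3)
x, y = torch.randn(32, 10), torch.randn(32, 1)
for _ in range(100):
    opt.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    opt.step()
```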
modules/parallel_wavegan/stft_loss.py ADDED
@@ -0,0 +1,100 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """STFT-based Loss modules."""
7
+ import librosa
8
+ import torch
9
+
10
+ from modules.parallel_wavegan.losses import LogSTFTMagnitudeLoss, SpectralConvergengeLoss, stft
11
+
12
+
13
+ class STFTLoss(torch.nn.Module):
14
+ """STFT loss module."""
15
+
16
+ def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window",
17
+ use_mel_loss=False):
18
+ """Initialize STFT loss module."""
19
+ super(STFTLoss, self).__init__()
20
+ self.fft_size = fft_size
21
+ self.shift_size = shift_size
22
+ self.win_length = win_length
23
+ self.window = getattr(torch, window)(win_length)
24
+ self.spectral_convergenge_loss = SpectralConvergengeLoss()
25
+ self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
26
+ self.use_mel_loss = use_mel_loss
27
+ self.mel_basis = None
28
+
29
+ def forward(self, x, y):
30
+ """Calculate forward propagation.
31
+
32
+ Args:
33
+ x (Tensor): Predicted signal (B, T).
34
+ y (Tensor): Groundtruth signal (B, T).
35
+
36
+ Returns:
37
+ Tensor: Spectral convergence loss value.
38
+ Tensor: Log STFT magnitude loss value.
39
+
40
+ """
41
+ x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
42
+ y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
43
+ if self.use_mel_loss:
44
+ if self.mel_basis is None:
45
+ self.mel_basis = torch.from_numpy(librosa.filters.mel(sr=22050, n_fft=self.fft_size, n_mels=80)).cuda().T
46
+ x_mag = x_mag @ self.mel_basis
47
+ y_mag = y_mag @ self.mel_basis
48
+
49
+ sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
50
+ mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
51
+
52
+ return sc_loss, mag_loss
53
+
54
+
55
+ class MultiResolutionSTFTLoss(torch.nn.Module):
56
+ """Multi resolution STFT loss module."""
57
+
58
+ def __init__(self,
59
+ fft_sizes=[1024, 2048, 512],
60
+ hop_sizes=[120, 240, 50],
61
+ win_lengths=[600, 1200, 240],
62
+ window="hann_window",
63
+ use_mel_loss=False):
64
+ """Initialize Multi resolution STFT loss module.
65
+
66
+ Args:
67
+ fft_sizes (list): List of FFT sizes.
68
+ hop_sizes (list): List of hop sizes.
69
+ win_lengths (list): List of window lengths.
70
+ window (str): Window function type.
71
+
72
+ """
73
+ super(MultiResolutionSTFTLoss, self).__init__()
74
+ assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
75
+ self.stft_losses = torch.nn.ModuleList()
76
+ for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
77
+ self.stft_losses += [STFTLoss(fs, ss, wl, window, use_mel_loss)]
78
+
79
+ def forward(self, x, y):
80
+ """Calculate forward propagation.
81
+
82
+ Args:
83
+ x (Tensor): Predicted signal (B, T).
84
+ y (Tensor): Groundtruth signal (B, T).
85
+
86
+ Returns:
87
+ Tensor: Multi resolution spectral convergence loss value.
88
+ Tensor: Multi resolution log STFT magnitude loss value.
89
+
90
+ """
91
+ sc_loss = 0.0
92
+ mag_loss = 0.0
93
+ for f in self.stft_losses:
94
+ sc_l, mag_l = f(x, y)
95
+ sc_loss += sc_l
96
+ mag_loss += mag_l
97
+ sc_loss /= len(self.stft_losses)
98
+ mag_loss /= len(self.stft_losses)
99
+
100
+ return sc_loss, mag_loss
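A minimal sketch of how this criterion is used as an auxiliary waveform loss (not part of the commit; the batch size and signal length are illustrative assumptions):

```python
# Hedged sketch: multi-resolution STFT loss over predicted vs. reference audio.
import torch

criterion = MultiResolutionSTFTLoss()              # default 1024/2048/512 resolutions
y_hat = torch.randn(4, 16000, requires_grad=True)  # predicted waveform (B, T)
y = torch.randn(4, 16000)                          # ground-truth waveform (B, T)
sc_loss, mag_loss = criterion(y_hat, y)
(sc_loss + mag_loss).backward()
```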
modules/parallel_wavegan/utils/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .utils import * # NOQA
modules/parallel_wavegan/utils/utils.py ADDED
@@ -0,0 +1,169 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """Utility functions."""
7
+
8
+ import fnmatch
9
+ import logging
10
+ import os
11
+ import sys
12
+
13
+ import h5py
14
+ import numpy as np
15
+
16
+
17
+ def find_files(root_dir, query="*.wav", include_root_dir=True):
18
+ """Find files recursively.
19
+
20
+ Args:
21
+ root_dir (str): Root root_dir to find.
22
+ query (str): Query to find.
23
+ include_root_dir (bool): If False, root_dir name is not included.
24
+
25
+ Returns:
26
+ list: List of found filenames.
27
+
28
+ """
29
+ files = []
30
+ for root, dirnames, filenames in os.walk(root_dir, followlinks=True):
31
+ for filename in fnmatch.filter(filenames, query):
32
+ files.append(os.path.join(root, filename))
33
+ if not include_root_dir:
34
+ files = [file_.replace(root_dir + "/", "") for file_ in files]
35
+
36
+ return files
37
+
38
+
39
+ def read_hdf5(hdf5_name, hdf5_path):
40
+ """Read hdf5 dataset.
41
+
42
+ Args:
43
+ hdf5_name (str): Filename of hdf5 file.
44
+ hdf5_path (str): Dataset name in hdf5 file.
45
+
46
+ Return:
47
+ any: Dataset values.
48
+
49
+ """
50
+ if not os.path.exists(hdf5_name):
51
+ logging.error(f"There is no such a hdf5 file ({hdf5_name}).")
52
+ sys.exit(1)
53
+
54
+ hdf5_file = h5py.File(hdf5_name, "r")
55
+
56
+ if hdf5_path not in hdf5_file:
57
+ logging.error(f"There is no such a data in hdf5 file. ({hdf5_path})")
58
+ sys.exit(1)
59
+
60
+ hdf5_data = hdf5_file[hdf5_path][()]
61
+ hdf5_file.close()
62
+
63
+ return hdf5_data
64
+
65
+
66
+ def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True):
67
+ """Write dataset to hdf5.
68
+
69
+ Args:
70
+ hdf5_name (str): Hdf5 dataset filename.
71
+ hdf5_path (str): Dataset path in hdf5.
72
+ write_data (ndarray): Data to write.
73
+ is_overwrite (bool): Whether to overwrite dataset.
74
+
75
+ """
76
+ # convert to numpy array
77
+ write_data = np.array(write_data)
78
+
79
+ # check folder existence
80
+ folder_name, _ = os.path.split(hdf5_name)
81
+ if not os.path.exists(folder_name) and len(folder_name) != 0:
82
+ os.makedirs(folder_name)
83
+
84
+ # check hdf5 existence
85
+ if os.path.exists(hdf5_name):
86
+ # if already exists, open with r+ mode
87
+ hdf5_file = h5py.File(hdf5_name, "r+")
88
+ # check dataset existence
89
+ if hdf5_path in hdf5_file:
90
+ if is_overwrite:
91
+ logging.warning("Dataset in hdf5 file already exists. "
92
+ "recreate dataset in hdf5.")
93
+ hdf5_file.__delitem__(hdf5_path)
94
+ else:
95
+ logging.error("Dataset in hdf5 file already exists. "
96
+ "if you want to overwrite, please set is_overwrite = True.")
97
+ hdf5_file.close()
98
+ sys.exit(1)
99
+ else:
100
+ # if not exists, open with w mode
101
+ hdf5_file = h5py.File(hdf5_name, "w")
102
+
103
+ # write data to hdf5
104
+ hdf5_file.create_dataset(hdf5_path, data=write_data)
105
+ hdf5_file.flush()
106
+ hdf5_file.close()
107
+
108
+
109
+ class HDF5ScpLoader(object):
110
+ """Loader class for a fests.scp file of hdf5 file.
111
+
112
+ Examples:
113
+ key1 /some/path/a.h5:feats
114
+ key2 /some/path/b.h5:feats
115
+ key3 /some/path/c.h5:feats
116
+ key4 /some/path/d.h5:feats
117
+ ...
118
+ >>> loader = HDF5ScpLoader("hdf5.scp")
119
+ >>> array = loader["key1"]
120
+
121
+ key1 /some/path/a.h5
122
+ key2 /some/path/b.h5
123
+ key3 /some/path/c.h5
124
+ key4 /some/path/d.h5
125
+ ...
126
+ >>> loader = HDF5ScpLoader("hdf5.scp", "feats")
127
+ >>> array = loader["key1"]
128
+
129
+ """
130
+
131
+ def __init__(self, feats_scp, default_hdf5_path="feats"):
132
+ """Initialize HDF5 scp loader.
133
+
134
+ Args:
135
+ feats_scp (str): Kaldi-style feats.scp file with hdf5 format.
136
+ default_hdf5_path (str): Path in hdf5 file. Not used if the scp already contains the path.
137
+
138
+ """
139
+ self.default_hdf5_path = default_hdf5_path
140
+ with open(feats_scp, encoding='utf-8') as f:
141
+ lines = [line.replace("\n", "") for line in f.readlines()]
142
+ self.data = {}
143
+ for line in lines:
144
+ key, value = line.split()
145
+ self.data[key] = value
146
+
147
+ def get_path(self, key):
148
+ """Get hdf5 file path for a given key."""
149
+ return self.data[key]
150
+
151
+ def __getitem__(self, key):
152
+ """Get ndarray for a given key."""
153
+ p = self.data[key]
154
+ if ":" in p:
155
+ return read_hdf5(*p.split(":"))
156
+ else:
157
+ return read_hdf5(p, self.default_hdf5_path)
158
+
159
+ def __len__(self):
160
+ """Return the length of the scp file."""
161
+ return len(self.data)
162
+
163
+ def __iter__(self):
164
+ """Return the iterator of the scp file."""
165
+ return iter(self.data)
166
+
167
+ def keys(self):
168
+ """Return the keys of the scp file."""
169
+ return self.data.keys()
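A minimal round-trip sketch for the hdf5 helpers above (not part of the commit; the file name and dataset path are illustrative assumptions):

```python
# Hedged sketch: write a feature array with write_hdf5, read it back with read_hdf5.
import numpy as np

feats = np.random.randn(100, 80).astype(np.float32)  # e.g. 100 frames of 80-bin mels
write_hdf5("dump/utt1.h5", "feats", feats)
assert np.allclose(read_hdf5("dump/utt1.h5", "feats"), feats)
```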
network/diff/candidate_decoder.py ADDED
@@ -0,0 +1,98 @@
1
+ from modules.fastspeech.tts_modules import FastspeechDecoder
2
+ # from modules.fastspeech.fast_tacotron import DecoderRNN
3
+ # from modules.fastspeech.speedy_speech.speedy_speech import ConvBlocks
4
+ # from modules.fastspeech.conformer.conformer import ConformerDecoder
5
+ import torch
6
+ from torch.nn import functional as F
7
+ import torch.nn as nn
8
+ import math
9
+ from utils.hparams import hparams
10
+ from modules.commons.common_layers import Mish
11
+ Linear = nn.Linear
12
+
13
+ class SinusoidalPosEmb(nn.Module):
14
+ def __init__(self, dim):
15
+ super().__init__()
16
+ self.dim = dim
17
+
18
+ def forward(self, x):
19
+ device = x.device
20
+ half_dim = self.dim // 2
21
+ emb = math.log(10000) / (half_dim - 1)
22
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
23
+ emb = x[:, None] * emb[None, :]
24
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
25
+ return emb
26
+
27
+
28
+ def Conv1d(*args, **kwargs):
29
+ layer = nn.Conv1d(*args, **kwargs)
30
+ nn.init.kaiming_normal_(layer.weight)
31
+ return layer
32
+
33
+
34
+ class FFT(FastspeechDecoder): # unused, because DiffSinger only uses FastspeechEncoder
35
+ # NOTE: this part of script is *isolated* from other scripts, which means
36
+ # it may not be compatible with the current version.
37
+
38
+ def __init__(self, hidden_size=None, num_layers=None, kernel_size=None, num_heads=None):
39
+ super().__init__(hidden_size, num_layers, kernel_size, num_heads=num_heads)
40
+ dim = hparams['residual_channels']
41
+ self.input_projection = Conv1d(hparams['audio_num_mel_bins'], dim, 1)
42
+ self.diffusion_embedding = SinusoidalPosEmb(dim)
43
+ self.mlp = nn.Sequential(
44
+ nn.Linear(dim, dim * 4),
45
+ Mish(),
46
+ nn.Linear(dim * 4, dim)
47
+ )
48
+ self.get_mel_out = Linear(hparams['hidden_size'], 80, bias=True)
49
+ self.get_decode_inp = Linear(hparams['hidden_size'] + dim + dim,
50
+ hparams['hidden_size']) # hs + dim + 80 -> hs
51
+
52
+ def forward(self, spec, diffusion_step, cond, padding_mask=None, attn_mask=None, return_hiddens=False):
53
+ """
54
+ :param spec: [B, 1, 80, T]
55
+ :param diffusion_step: [B, 1]
56
+ :param cond: [B, M, T]
57
+ :return:
58
+ """
59
+ x = spec[:, 0]
60
+ x = self.input_projection(x).permute([0, 2, 1]) # [B, T, residual_channel]
61
+ diffusion_step = self.diffusion_embedding(diffusion_step)
62
+ diffusion_step = self.mlp(diffusion_step) # [B, dim]
63
+ cond = cond.permute([0, 2, 1]) # [B, T, M]
64
+
65
+ seq_len = cond.shape[1] # [T_mel]
66
+ time_embed = diffusion_step[:, None, :] # [B, 1, dim]
67
+ time_embed = time_embed.repeat([1, seq_len, 1]) # # [B, T, dim]
68
+
69
+ decoder_inp = torch.cat([x, cond, time_embed], dim=-1) # [B, T, dim + H + dim]
70
+ decoder_inp = self.get_decode_inp(decoder_inp) # [B, T, H]
71
+ x = decoder_inp
72
+
73
+ '''
74
+ Required x: [B, T, C]
75
+ :return: [B, T, C] or [L, B, T, C]
76
+ '''
77
+ padding_mask = x.abs().sum(-1).eq(0).data if padding_mask is None else padding_mask
78
+ nonpadding_mask_TB = 1 - padding_mask.transpose(0, 1).float()[:, :, None] # [T, B, 1]
79
+ if self.use_pos_embed:
80
+ positions = self.pos_embed_alpha * self.embed_positions(x[..., 0])
81
+ x = x + positions
82
+ x = F.dropout(x, p=self.dropout, training=self.training)
83
+ # B x T x C -> T x B x C
84
+ x = x.transpose(0, 1) * nonpadding_mask_TB
85
+ hiddens = []
86
+ for layer in self.layers:
87
+ x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB
88
+ hiddens.append(x)
89
+ if self.use_last_norm:
90
+ x = self.layer_norm(x) * nonpadding_mask_TB
91
+ if return_hiddens:
92
+ x = torch.stack(hiddens, 0) # [L, T, B, C]
93
+ x = x.transpose(1, 2) # [L, B, T, C]
94
+ else:
95
+ x = x.transpose(0, 1) # [B, T, C]
96
+
97
+ x = self.get_mel_out(x).permute([0, 2, 1]) # [B, 80, T]
98
+ return x[:, None, :, :]
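A quick sketch of the `SinusoidalPosEmb` module defined above (not part of the commit; the embedding dimension of 256 is an illustrative choice):

```python
# Hedged sketch: SinusoidalPosEmb maps diffusion-step indices to dense vectors.
import torch

emb = SinusoidalPosEmb(dim=256)
t = torch.tensor([0.0, 10.0, 999.0])  # diffusion step indices, shape [B]
print(emb(t).shape)                   # torch.Size([3, 256]): sin half then cos half
```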
network/diff/diffusion.py ADDED
@@ -0,0 +1,332 @@
1
+ from collections import deque
2
+ from functools import partial
3
+ from inspect import isfunction
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+ from tqdm import tqdm
9
+
10
+ from modules.fastspeech.fs2 import FastSpeech2
11
+ # from modules.diffsinger_midi.fs2 import FastSpeech2MIDI
12
+ from utils.hparams import hparams
13
+ from training.train_pipeline import Batch2Loss
14
+
15
+
16
+ def exists(x):
17
+ return x is not None
18
+
19
+
20
+ def default(val, d):
21
+ if exists(val):
22
+ return val
23
+ return d() if isfunction(d) else d
24
+
25
+
26
+ # gaussian diffusion trainer class
27
+
28
+ def extract(a, t, x_shape):
29
+ b, *_ = t.shape
30
+ out = a.gather(-1, t)
31
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
32
+
33
+
34
+ def noise_like(shape, device, repeat=False):
35
+ repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
36
+ noise = lambda: torch.randn(shape, device=device)
37
+ return repeat_noise() if repeat else noise()
38
+
39
+
40
+ def linear_beta_schedule(timesteps, max_beta=hparams.get('max_beta', 0.01)):
41
+ """
42
+ linear schedule
43
+ """
44
+ betas = np.linspace(1e-4, max_beta, timesteps)
45
+ return betas
46
+
47
+
48
+ def cosine_beta_schedule(timesteps, s=0.008):
49
+ """
50
+ cosine schedule
51
+ as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
52
+ """
53
+ steps = timesteps + 1
54
+ x = np.linspace(0, steps, steps)
55
+ alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2
56
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
57
+ betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
58
+ return np.clip(betas, a_min=0, a_max=0.999)
59
+
60
+
61
+ beta_schedule = {
62
+ "cosine": cosine_beta_schedule,
63
+ "linear": linear_beta_schedule,
64
+ }
65
+
66
+
67
+ class GaussianDiffusion(nn.Module):
68
+ def __init__(self, phone_encoder, out_dims, denoise_fn,
69
+ timesteps=1000, K_step=1000, loss_type=hparams.get('diff_loss_type', 'l1'), betas=None, spec_min=None,
70
+ spec_max=None):
71
+ super().__init__()
72
+ self.denoise_fn = denoise_fn
73
+ # if hparams.get('use_midi') is not None and hparams['use_midi']:
74
+ # self.fs2 = FastSpeech2MIDI(phone_encoder, out_dims)
75
+ # else:
76
+ self.fs2 = FastSpeech2(phone_encoder, out_dims)
77
+ self.mel_bins = out_dims
78
+
79
+ if exists(betas):
80
+ betas = betas.detach().cpu().numpy() if isinstance(betas, torch.Tensor) else betas
81
+ else:
82
+ if 'schedule_type' in hparams.keys():
83
+ betas = beta_schedule[hparams['schedule_type']](timesteps)
84
+ else:
85
+ betas = cosine_beta_schedule(timesteps)
86
+
87
+ alphas = 1. - betas
88
+ alphas_cumprod = np.cumprod(alphas, axis=0)
89
+ alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
90
+
91
+ timesteps, = betas.shape
92
+ self.num_timesteps = int(timesteps)
93
+ self.K_step = K_step
94
+ self.loss_type = loss_type
95
+
96
+ self.noise_list = deque(maxlen=4)
97
+
98
+ to_torch = partial(torch.tensor, dtype=torch.float32)
99
+
100
+ self.register_buffer('betas', to_torch(betas))
101
+ self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
102
+ self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
103
+
104
+ # calculations for diffusion q(x_t | x_{t-1}) and others
105
+ self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
106
+ self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
107
+ self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
108
+ self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
109
+ self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
110
+
111
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
112
+ posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
113
+ # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
114
+ self.register_buffer('posterior_variance', to_torch(posterior_variance))
115
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
116
+ self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
117
+ self.register_buffer('posterior_mean_coef1', to_torch(
118
+ betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
119
+ self.register_buffer('posterior_mean_coef2', to_torch(
120
+ (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
121
+
122
+ self.register_buffer('spec_min', torch.FloatTensor(spec_min)[None, None, :hparams['keep_bins']])
123
+ self.register_buffer('spec_max', torch.FloatTensor(spec_max)[None, None, :hparams['keep_bins']])
124
+
125
+ def q_mean_variance(self, x_start, t):
126
+ mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
127
+ variance = extract(1. - self.alphas_cumprod, t, x_start.shape)
128
+ log_variance = extract(self.log_one_minus_alphas_cumprod, t, x_start.shape)
129
+ return mean, variance, log_variance
130
+
131
+ def predict_start_from_noise(self, x_t, t, noise):
132
+ return (
133
+ extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
134
+ extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
135
+ )
136
+
137
+ def q_posterior(self, x_start, x_t, t):
138
+ posterior_mean = (
139
+ extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
140
+ extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
141
+ )
142
+ posterior_variance = extract(self.posterior_variance, t, x_t.shape)
143
+ posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
144
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
145
+
146
+ def p_mean_variance(self, x, t, cond, clip_denoised: bool):
147
+ noise_pred = self.denoise_fn(x, t, cond=cond)
148
+ x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)
149
+
150
+ if clip_denoised:
151
+ x_recon.clamp_(-1., 1.)
152
+
153
+ model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
154
+ return model_mean, posterior_variance, posterior_log_variance
155
+
156
+ @torch.no_grad()
157
+ def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
158
+ b, *_, device = *x.shape, x.device
159
+ model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond, clip_denoised=clip_denoised)
160
+ noise = noise_like(x.shape, device, repeat_noise)
161
+ # no noise when t == 0
162
+ nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
163
+ return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
164
+
165
+ @torch.no_grad()
166
+ def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repeat_noise=False):
167
+ """
168
+ Use the PLMS method from [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778).
169
+ """
170
+
171
+ def get_x_pred(x, noise_t, t):
172
+ a_t = extract(self.alphas_cumprod, t, x.shape)
173
+ a_prev = extract(self.alphas_cumprod, torch.max(t-interval, torch.zeros_like(t)), x.shape)
174
+ a_t_sq, a_prev_sq = a_t.sqrt(), a_prev.sqrt()
175
+
176
+ x_delta = (a_prev - a_t) * ((1 / (a_t_sq * (a_t_sq + a_prev_sq))) * x - 1 / (a_t_sq * (((1 - a_prev) * a_t).sqrt() + ((1 - a_t) * a_prev).sqrt())) * noise_t)
177
+ x_pred = x + x_delta
178
+
179
+ return x_pred
180
+
181
+ noise_list = self.noise_list
182
+ noise_pred = self.denoise_fn(x, t, cond=cond)
183
+
184
+ if len(noise_list) == 0:
185
+ x_pred = get_x_pred(x, noise_pred, t)
186
+ noise_pred_prev = self.denoise_fn(x_pred, max(t-interval, 0), cond=cond)
187
+ noise_pred_prime = (noise_pred + noise_pred_prev) / 2
188
+ elif len(noise_list) == 1:
189
+ noise_pred_prime = (3 * noise_pred - noise_list[-1]) / 2
190
+ elif len(noise_list) == 2:
191
+ noise_pred_prime = (23 * noise_pred - 16 * noise_list[-1] + 5 * noise_list[-2]) / 12
192
+ elif len(noise_list) >= 3:
193
+ noise_pred_prime = (55 * noise_pred - 59 * noise_list[-1] + 37 * noise_list[-2] - 9 * noise_list[-3]) / 24
194
+
195
+ x_prev = get_x_pred(x, noise_pred_prime, t)
196
+ noise_list.append(noise_pred)
197
+
198
+ return x_prev
199
+
200
+ def q_sample(self, x_start, t, noise=None):
201
+ noise = default(noise, lambda: torch.randn_like(x_start))
202
+ return (
203
+ extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
204
+ extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
205
+ )
206
+
207
+ def p_losses(self, x_start, t, cond, noise=None, nonpadding=None):
208
+ noise = default(noise, lambda: torch.randn_like(x_start))
209
+
210
+ x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
211
+ x_recon = self.denoise_fn(x_noisy, t, cond)
212
+
213
+ if self.loss_type == 'l1':
214
+ if nonpadding is not None:
215
+ loss = ((noise - x_recon).abs() * nonpadding.unsqueeze(1)).mean()
216
+ else:
217
+ # print('are you sure w/o nonpadding?')
218
+ loss = (noise - x_recon).abs().mean()
219
+
220
+ elif self.loss_type == 'l2':
221
+ loss = F.mse_loss(noise, x_recon)
222
+ else:
223
+ raise NotImplementedError()
224
+
225
+ return loss
226
+
227
+ def forward(self, hubert, mel2ph=None, spk_embed=None,
228
+ ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
229
+ '''
230
+ conditioning diffusion, use fastspeech2 encoder output as the condition
231
+ '''
232
+ ret = self.fs2(hubert, mel2ph, spk_embed, None, f0, uv, energy,
233
+ skip_decoder=True, infer=infer, **kwargs)
234
+ cond = ret['decoder_inp'].transpose(1, 2)
235
+ b, *_, device = *hubert.shape, hubert.device
236
+
237
+ if not infer:
238
+ Batch2Loss.module4(
239
+ self.p_losses,
240
+ self.norm_spec(ref_mels), cond, ret, self.K_step, b, device
241
+ )
242
+ else:
243
+ '''
244
+ ret['fs2_mel'] = ret['mel_out']
245
+ fs2_mels = ret['mel_out']
246
+ t = self.K_step
247
+ fs2_mels = self.norm_spec(fs2_mels)
248
+ fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
249
+ x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
250
+ if hparams.get('gaussian_start') is not None and hparams['gaussian_start']:
251
+ print('===> gaussian start.')
252
+ shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
253
+ x = torch.randn(shape, device=device)
254
+ '''
255
+ if 'use_gt_mel' in kwargs.keys() and kwargs['use_gt_mel']:
256
+ t = kwargs['add_noise_step']
257
+ print('===> using ground-truth mel as start; please make sure the parameter "key==0"!')
258
+ fs2_mels = ref_mels
259
+ fs2_mels = self.norm_spec(fs2_mels)
260
+ fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
261
+ x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
262
+ # for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
263
+ # x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
264
+ else:
265
+ t = self.K_step
266
+ # print('===> gaussian start.')
267
+ shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
268
+ x = torch.randn(shape, device=device)
269
+ if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1:
270
+ self.noise_list = deque(maxlen=4)
271
+ iteration_interval = hparams['pndm_speedup']
272
+ for i in tqdm(reversed(range(0, t, iteration_interval)), desc='sample time step',
273
+ total=t // iteration_interval):
274
+ x = self.p_sample_plms(x, torch.full((b,), i, device=device, dtype=torch.long), iteration_interval,
275
+ cond)
276
+ else:
277
+ for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
278
+ x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
279
+ x = x[:, 0].transpose(1, 2)
280
+ if mel2ph is not None: # for singing
281
+ ret['mel_out'] = self.denorm_spec(x) * ((mel2ph > 0).float()[:, :, None])
282
+ else:
283
+ ret['mel_out'] = self.denorm_spec(x)
284
+ return ret
285
+
286
+ def norm_spec(self, x):
287
+ return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
288
+
289
+ def denorm_spec(self, x):
290
+ return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
291
+
292
+ def cwt2f0_norm(self, cwt_spec, mean, std, mel2ph):
293
+ return self.fs2.cwt2f0_norm(cwt_spec, mean, std, mel2ph)
294
+
295
+ def out2mel(self, x):
296
+ return x
297
+
298
+
299
+ class OfflineGaussianDiffusion(GaussianDiffusion):
300
+ def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
301
+ ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
302
+ b, *_, device = *txt_tokens.shape, txt_tokens.device
303
+
304
+ ret = self.fs2(txt_tokens, mel2ph, spk_embed, ref_mels, f0, uv, energy,
305
+ skip_decoder=True, infer=True, **kwargs)
306
+ cond = ret['decoder_inp'].transpose(1, 2)
307
+ fs2_mels = ref_mels[1]
308
+ ref_mels = ref_mels[0]
309
+
310
+ if not infer:
311
+ t = torch.randint(0, self.K_step, (b,), device=device).long()
312
+ x = ref_mels
313
+ x = self.norm_spec(x)
314
+ x = x.transpose(1, 2)[:, None, :, :] # [B, 1, M, T]
315
+ ret['diff_loss'] = self.p_losses(x, t, cond)
316
+ else:
317
+ t = self.K_step
318
+ fs2_mels = self.norm_spec(fs2_mels)
319
+ fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
320
+
321
+ x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
322
+
323
+ if hparams.get('gaussian_start') is not None and hparams['gaussian_start']:
324
+ print('===> gaussian start.')
325
+ shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
326
+ x = torch.randn(shape, device=device)
327
+ for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
328
+ x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
329
+ x = x[:, 0].transpose(1, 2)
330
+ ret['mel_out'] = self.denorm_spec(x)
331
+
332
+ return ret
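The beta schedules defined near the top of this file determine how quickly the forward process destroys the signal. A minimal sketch (not part of the commit) inspecting the cosine schedule and the closed-form noising q(x_t | x_0) = sqrt(a_t) * x_0 + sqrt(1 - a_t) * eps it induces:

```python
# Hedged sketch: pure numpy, no hparams needed for the cosine schedule itself.
import numpy as np

betas = cosine_beta_schedule(timesteps=1000)
alphas_cumprod = np.cumprod(1.0 - betas)
# The signal coefficient sqrt(alphas_cumprod[t]) is near 1 at t=0 and decays
# toward 0 by the final step, where x_t is almost pure Gaussian noise.
print(betas.shape, alphas_cumprod[0], alphas_cumprod[-1])
```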
network/diff/net.py ADDED
@@ -0,0 +1,135 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from math import sqrt
8
+
9
+ from utils.hparams import hparams
10
+ from modules.commons.common_layers import Mish
11
+
12
+ Linear = nn.Linear
13
+ ConvTranspose2d = nn.ConvTranspose2d
14
+
15
+
16
+ class AttrDict(dict):
17
+ def __init__(self, *args, **kwargs):
18
+ super(AttrDict, self).__init__(*args, **kwargs)
19
+ self.__dict__ = self
20
+
21
+ def override(self, attrs):
22
+ if isinstance(attrs, dict):
23
+ self.__dict__.update(**attrs)
24
+ elif isinstance(attrs, (list, tuple, set)):
25
+ for attr in attrs:
26
+ self.override(attr)
27
+ elif attrs is not None:
28
+ raise NotImplementedError
29
+ return self
30
+
31
+
32
+ class SinusoidalPosEmb(nn.Module):
33
+ def __init__(self, dim):
34
+ super().__init__()
35
+ self.dim = dim
36
+
37
+ def forward(self, x):
38
+ device = x.device
39
+ half_dim = self.dim // 2
40
+ emb = math.log(10000) / (half_dim - 1)
41
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
42
+ emb = x[:, None] * emb[None, :]
43
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
44
+ return emb
45
+
46
+
47
+ def Conv1d(*args, **kwargs):
48
+ layer = nn.Conv1d(*args, **kwargs)
49
+ nn.init.kaiming_normal_(layer.weight)
50
+ return layer
51
+
52
+
53
+ @torch.jit.script
54
+ def silu(x):
55
+ return x * torch.sigmoid(x)
56
+
57
+
58
+ class ResidualBlock(nn.Module):
59
+ def __init__(self, encoder_hidden, residual_channels, dilation):
60
+ super().__init__()
61
+ self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
62
+ self.diffusion_projection = Linear(residual_channels, residual_channels)
63
+ self.conditioner_projection = Conv1d(encoder_hidden, 2 * residual_channels, 1)
64
+ self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)
65
+
66
+ def forward(self, x, conditioner, diffusion_step):
67
+ diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
68
+ conditioner = self.conditioner_projection(conditioner)
69
+ y = x + diffusion_step
70
+
71
+ y = self.dilated_conv(y) + conditioner
72
+
73
+ gate, filter = torch.chunk(y, 2, dim=1)
74
+ # Using torch.split instead of torch.chunk to avoid using onnx::Slice
75
+ # gate, filter = torch.split(y, torch.div(y.shape[1], 2), dim=1)
76
+
77
+ y = torch.sigmoid(gate) * torch.tanh(filter)
78
+
79
+ y = self.output_projection(y)
80
+ residual, skip = torch.chunk(y, 2, dim=1)
81
+ # Using torch.split instead of torch.chunk to avoid using onnx::Slice
82
+ # residual, skip = torch.split(y, torch.div(y.shape[1], 2), dim=1)
83
+
84
+ return (x + residual) / sqrt(2.0), skip
85
+
86
+ class DiffNet(nn.Module):
87
+ def __init__(self, in_dims=80):
88
+ super().__init__()
89
+ self.params = params = AttrDict(
90
+ # Model params
91
+ encoder_hidden=hparams['hidden_size'],
92
+ residual_layers=hparams['residual_layers'],
93
+ residual_channels=hparams['residual_channels'],
94
+ dilation_cycle_length=hparams['dilation_cycle_length'],
95
+ )
96
+ self.input_projection = Conv1d(in_dims, params.residual_channels, 1)
97
+ self.diffusion_embedding = SinusoidalPosEmb(params.residual_channels)
98
+ dim = params.residual_channels
99
+ self.mlp = nn.Sequential(
100
+ nn.Linear(dim, dim * 4),
101
+ Mish(),
102
+ nn.Linear(dim * 4, dim)
103
+ )
104
+ self.residual_layers = nn.ModuleList([
105
+ ResidualBlock(params.encoder_hidden, params.residual_channels, 2 ** (i % params.dilation_cycle_length))
106
+ for i in range(params.residual_layers)
107
+ ])
108
+ self.skip_projection = Conv1d(params.residual_channels, params.residual_channels, 1)
109
+ self.output_projection = Conv1d(params.residual_channels, in_dims, 1)
110
+ nn.init.zeros_(self.output_projection.weight)
111
+
112
+ def forward(self, spec, diffusion_step, cond):
113
+ """
114
+
115
+ :param spec: [B, 1, M, T]
116
+ :param diffusion_step: [B, 1]
117
+ :param cond: [B, M, T]
118
+ :return:
119
+ """
120
+ x = spec[:, 0]
121
+ x = self.input_projection(x) # x [B, residual_channel, T]
122
+
123
+ x = F.relu(x)
124
+ diffusion_step = self.diffusion_embedding(diffusion_step)
125
+ diffusion_step = self.mlp(diffusion_step)
126
+ skip = []
127
+ for layer_id, layer in enumerate(self.residual_layers):
128
+ x, skip_connection = layer(x, cond, diffusion_step)
129
+ skip.append(skip_connection)
130
+
131
+ x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers))
132
+ x = self.skip_projection(x)
133
+ x = F.relu(x)
134
+ x = self.output_projection(x) # [B, 80, T]
135
+ return x[:, None, :, :]
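The core nonlinearity inside `ResidualBlock` above is the WaveNet-style gated activation. A standalone sketch of that one step (not part of the commit; channel and length values are illustrative):

```python
# Hedged sketch: chunk the doubled channels, then sigmoid(gate) * tanh(filter).
import torch

y = torch.randn(2, 2 * 256, 100)        # [B, 2*residual_channels, T]
gate, filt = torch.chunk(y, 2, dim=1)   # two [B, 256, T] halves
out = torch.sigmoid(gate) * torch.tanh(filt)
print(out.shape)                        # torch.Size([2, 256, 100])
```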
network/hubert/hubert_model.py ADDED
@@ -0,0 +1,276 @@
1
+ import copy
2
+ import os
3
+ import random
4
+ from typing import Optional, Tuple
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as t_func
11
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
12
+
13
+ from utils.hparams import hparams
14
+
15
+
16
+ class Hubert(nn.Module):
17
+ def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
18
+ super().__init__()
19
+ self._mask = mask
20
+ self.feature_extractor = FeatureExtractor()
21
+ self.feature_projection = FeatureProjection()
22
+ self.positional_embedding = PositionalConvEmbedding()
23
+ self.norm = nn.LayerNorm(768)
24
+ self.dropout = nn.Dropout(0.1)
25
+ self.encoder = TransformerEncoder(
26
+ nn.TransformerEncoderLayer(
27
+ 768, 12, 3072, activation="gelu", batch_first=True
28
+ ),
29
+ 12,
30
+ )
31
+ self.proj = nn.Linear(768, 256)
32
+
33
+ self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
34
+ self.label_embedding = nn.Embedding(num_label_embeddings, 256)
35
+
36
+ def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
37
+ mask = None
38
+ if self.training and self._mask:
39
+ mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
40
+ x[mask] = self.masked_spec_embed.to(x.dtype)
41
+ return x, mask
42
+
43
+ def encode(
44
+ self, x: torch.Tensor, layer: Optional[int] = None
45
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
46
+ x = self.feature_extractor(x)
47
+ x = self.feature_projection(x.transpose(1, 2))
48
+ x, mask = self.mask(x)
49
+ x = x + self.positional_embedding(x)
50
+ x = self.dropout(self.norm(x))
51
+ x = self.encoder(x, output_layer=layer)
52
+ return x, mask
53
+
54
+ def logits(self, x: torch.Tensor) -> torch.Tensor:
55
+ logits = torch.cosine_similarity(
56
+ x.unsqueeze(2),
57
+ self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
58
+ dim=-1,
59
+ )
60
+ return logits / 0.1
61
+
62
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
63
+ x, mask = self.encode(x)
64
+ x = self.proj(x)
65
+ logits = self.logits(x)
66
+ return logits, mask
67
+
68
+
69
+ class HubertSoft(Hubert):
70
+ def __init__(self):
71
+ super().__init__()
72
+
73
+ # @torch.inference_mode()
74
+ def units(self, wav: torch.Tensor) -> torch.Tensor:
75
+ wav = torch.nn.functional.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
76
+ x, _ = self.encode(wav)
77
+ return self.proj(x)
78
+
79
+ def forward(self, wav: torch.Tensor):
80
+ return self.units(wav)
81
+
82
+
83
+ class FeatureExtractor(nn.Module):
84
+ def __init__(self):
85
+ super().__init__()
86
+ self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
87
+ self.norm0 = nn.GroupNorm(512, 512)
88
+ self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
89
+ self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
90
+ self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
91
+ self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
92
+ self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
93
+ self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
94
+
95
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
96
+ x = t_func.gelu(self.norm0(self.conv0(x)))
97
+ x = t_func.gelu(self.conv1(x))
98
+ x = t_func.gelu(self.conv2(x))
99
+ x = t_func.gelu(self.conv3(x))
100
+ x = t_func.gelu(self.conv4(x))
101
+ x = t_func.gelu(self.conv5(x))
102
+ x = t_func.gelu(self.conv6(x))
103
+ return x
104
+
105
+
106
+ class FeatureProjection(nn.Module):
107
+ def __init__(self):
108
+ super().__init__()
109
+ self.norm = nn.LayerNorm(512)
110
+ self.projection = nn.Linear(512, 768)
111
+ self.dropout = nn.Dropout(0.1)
112
+
113
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
114
+ x = self.norm(x)
115
+ x = self.projection(x)
116
+ x = self.dropout(x)
117
+ return x
118
+
119
+
120
+ class PositionalConvEmbedding(nn.Module):
121
+ def __init__(self):
122
+ super().__init__()
123
+ self.conv = nn.Conv1d(
124
+ 768,
125
+ 768,
126
+ kernel_size=128,
127
+ padding=128 // 2,
128
+ groups=16,
129
+ )
130
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
131
+
132
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
133
+ x = self.conv(x.transpose(1, 2))
134
+ x = t_func.gelu(x[:, :, :-1])
135
+ return x.transpose(1, 2)
136
+
137
+
138
+ class TransformerEncoder(nn.Module):
139
+ def __init__(
140
+ self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
141
+ ) -> None:
142
+ super(TransformerEncoder, self).__init__()
143
+ self.layers = nn.ModuleList(
144
+ [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
145
+ )
146
+ self.num_layers = num_layers
147
+
148
+ def forward(
149
+ self,
150
+ src: torch.Tensor,
151
+ mask: torch.Tensor = None,
152
+ src_key_padding_mask: torch.Tensor = None,
153
+ output_layer: Optional[int] = None,
154
+ ) -> torch.Tensor:
155
+ output = src
156
+ for layer in self.layers[:output_layer]:
157
+ output = layer(
158
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
159
+ )
160
+ return output
161
+
162
+
163
+ def _compute_mask(
164
+ shape: Tuple[int, int],
165
+ mask_prob: float,
166
+ mask_length: int,
167
+ device: torch.device,
168
+ min_masks: int = 0,
169
+ ) -> torch.Tensor:
170
+ batch_size, sequence_length = shape
171
+
172
+ if mask_length < 1:
173
+ raise ValueError("`mask_length` has to be bigger than 0.")
174
+
175
+ if mask_length > sequence_length:
176
+ raise ValueError(
177
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
178
+ )
179
+
180
+ # compute number of masked spans in batch
181
+ num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
182
+ num_masked_spans = max(num_masked_spans, min_masks)
183
+
184
+ # make sure num masked indices <= sequence_length
185
+ if num_masked_spans * mask_length > sequence_length:
186
+ num_masked_spans = sequence_length // mask_length
187
+
188
+ # SpecAugment mask to fill
189
+ mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
190
+
191
+ # uniform distribution to sample from, make sure that offset samples are < sequence_length
192
+ uniform_dist = torch.ones(
193
+ (batch_size, sequence_length - (mask_length - 1)), device=device
194
+ )
195
+
196
+ # get random indices to mask
197
+ mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
198
+
199
+ # expand masked indices to masked spans
200
+ mask_indices = (
201
+ mask_indices.unsqueeze(dim=-1)
202
+ .expand((batch_size, num_masked_spans, mask_length))
203
+ .reshape(batch_size, num_masked_spans * mask_length)
204
+ )
205
+ offsets = (
206
+ torch.arange(mask_length, device=device)[None, None, :]
207
+ .expand((batch_size, num_masked_spans, mask_length))
208
+ .reshape(batch_size, num_masked_spans * mask_length)
209
+ )
210
+ mask_idxs = mask_indices + offsets
211
+
212
+ # scatter indices to mask
213
+ mask = mask.scatter(1, mask_idxs, True)
214
+
215
+ return mask
216
+
217
+
218
+ def hubert_soft(
219
+ path: str
220
+ ) -> HubertSoft:
221
+ r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
222
+ Args:
223
+ path (str): path of a pretrained model
224
+ """
225
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
226
+ hubert = HubertSoft()
227
+ checkpoint = torch.load(path)
228
+ consume_prefix_in_state_dict_if_present(checkpoint, "module.")
229
+ hubert.load_state_dict(checkpoint)
230
+ hubert.eval().to(dev)
231
+ return hubert
232
+
233
+
234
+ def get_units(hbt_soft, raw_wav_path, dev=torch.device('cuda')):
235
+ wav, sr = librosa.load(raw_wav_path, sr=None)
236
+ assert (sr >= 16000)
237
+ if len(wav.shape) > 1:
238
+ wav = librosa.to_mono(wav)
239
+ if sr != 16000:
240
+ wav16 = librosa.resample(wav, sr, 16000)
241
+ else:
242
+ wav16 = wav
243
+ dev = torch.device("cuda" if (dev == torch.device('cuda') and torch.cuda.is_available()) else "cpu")
244
+ torch.cuda.is_available() and torch.cuda.empty_cache()
245
+ with torch.inference_mode():
246
+ units = hbt_soft.units(torch.FloatTensor(wav16.astype(float)).unsqueeze(0).unsqueeze(0).to(dev))
247
+ return units
248
+
249
+
250
+ def get_end_file(dir_path, end):
251
+ file_list = []
252
+ for root, dirs, files in os.walk(dir_path):
253
+ files = [f for f in files if f[0] != '.']
254
+ dirs[:] = [d for d in dirs if d[0] != '.']
255
+ for f_file in files:
256
+ if f_file.endswith(end):
257
+ file_list.append(os.path.join(root, f_file).replace("\\", "/"))
258
+ return file_list
259
+
260
+
261
+ if __name__ == '__main__':
262
+ from pathlib import Path
263
+
264
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
265
+ # hubert的模型路径
266
+ hbt_model = hubert_soft(str(list(Path(hparams['hubert_path']).home().rglob('*.pt'))[0]))
267
+ # 这个不用改,自动在根目录下所有wav的同文件夹生成其对应的npy
268
+ file_lists = list(Path(hparams['raw_data_dir']).rglob('*.wav'))
269
+ nums = len(file_lists)
270
+ count = 0
271
+ for wav_path in file_lists:
272
+ npy_path = wav_path.with_suffix(".npy")
273
+ npy_content = get_units(hbt_model, wav_path).cpu().numpy()[0]
274
+ np.save(str(npy_path), npy_content)
275
+ count += 1
276
+ print(f"hubert process:{round(count * 100 / nums, 2)}%")
network/hubert/vec_model.py ADDED
@@ -0,0 +1,60 @@
1
+ from pathlib import Path
2
+
3
+ import librosa
4
+ import numpy as np
5
+ import torch
6
+
7
+
8
+
9
+ def load_model(vec_path):
10
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
+ print("load model(s) from {}".format(vec_path))
12
+ from fairseq import checkpoint_utils
13
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
14
+ [vec_path],
15
+ suffix="",
16
+ )
17
+ model = models[0]
18
+ model = model.to(device)
19
+ model.eval()
20
+ return model
21
+
22
+
23
+ def get_vec_units(con_model, audio_path, dev):
24
+ audio, sampling_rate = librosa.load(audio_path)
25
+ if len(audio.shape) > 1:
26
+ audio = librosa.to_mono(audio.transpose(1, 0))
27
+ if sampling_rate != 16000:
28
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
29
+
30
+ feats = torch.from_numpy(audio).float()
31
+ if feats.dim() == 2: # double channels
32
+ feats = feats.mean(-1)
33
+ assert feats.dim() == 1, feats.dim()
34
+ feats = feats.view(1, -1)
35
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
36
+ inputs = {
37
+ "source": feats.to(dev),
38
+ "padding_mask": padding_mask.to(dev),
39
+ "output_layer": 9, # layer 9
40
+ }
41
+ with torch.no_grad():
42
+ logits = con_model.extract_features(**inputs)
43
+ feats = con_model.final_proj(logits[0])
44
+ return feats
45
+
46
+
47
+ if __name__ == '__main__':
48
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
49
+ model_path = "../../checkpoints/checkpoint_best_legacy_500.pt" # checkpoint_best_legacy_500.pt
50
+ vec_model = load_model(model_path)
51
+ # 这个不用改,自动在根目录下所有wav的同文件夹生成其对应的npy
52
+ file_lists = list(Path("../../data/vecfox").rglob('*.wav'))
53
+ nums = len(file_lists)
54
+ count = 0
55
+ for wav_path in file_lists:
56
+ npy_path = wav_path.with_suffix(".npy")
57
+ npy_content = get_vec_units(vec_model, str(wav_path), device).cpu().numpy()[0]
58
+ np.save(str(npy_path), npy_content)
59
+ count += 1
60
+ print(f"hubert process:{round(count * 100 / nums, 2)}%")
network/vocoders/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from network.vocoders import hifigan
2
+ from network.vocoders import nsf_hifigan
network/vocoders/base_vocoder.py ADDED
@@ -0,0 +1,39 @@
1
+ import importlib
2
+ VOCODERS = {}
3
+
4
+
5
+ def register_vocoder(cls):
6
+ VOCODERS[cls.__name__.lower()] = cls
7
+ VOCODERS[cls.__name__] = cls
8
+ return cls
9
+
10
+
11
+ def get_vocoder_cls(hparams):
12
+ if hparams['vocoder'] in VOCODERS:
13
+ return VOCODERS[hparams['vocoder']]
14
+ else:
15
+ vocoder_cls = hparams['vocoder']
16
+ pkg = ".".join(vocoder_cls.split(".")[:-1])
17
+ cls_name = vocoder_cls.split(".")[-1]
18
+ vocoder_cls = getattr(importlib.import_module(pkg), cls_name)
19
+ return vocoder_cls
20
+
21
+
22
+ class BaseVocoder:
23
+ def spec2wav(self, mel):
24
+ """
25
+
26
+ :param mel: [T, 80]
27
+ :return: wav: [T']
28
+ """
29
+
30
+ raise NotImplementedError
31
+
32
+ @staticmethod
33
+ def wav2spec(wav_fn):
34
+ """
35
+
36
+ :param wav_fn: str
37
+ :return: wav, mel: [T, 80]
38
+ """
39
+ raise NotImplementedError
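A minimal sketch of how the registry above is meant to be used (not part of the commit; `MyVocoder` is a hypothetical example class):

```python
# Hedged sketch: a decorated subclass becomes resolvable by class name
# (or its lowercase form) through get_vocoder_cls.
@register_vocoder
class MyVocoder(BaseVocoder):
    def spec2wav(self, mel):
        raise NotImplementedError

assert get_vocoder_cls({'vocoder': 'myvocoder'}) is MyVocoder
```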
network/vocoders/hifigan.py ADDED
@@ -0,0 +1,83 @@
1
+ import glob
2
+ import json
3
+ import os
4
+ import re
5
+
6
+ import librosa
7
+ import torch
8
+
9
+ import utils
10
+ from modules.hifigan.hifigan import HifiGanGenerator
11
+ from utils.hparams import hparams, set_hparams
12
+ from network.vocoders.base_vocoder import register_vocoder
13
+ from network.vocoders.pwg import PWG
14
+ from network.vocoders.vocoder_utils import denoise
15
+
16
+
17
+ def load_model(config_path, file_path):
18
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19
+ ext = os.path.splitext(file_path)[-1]
20
+ if ext == '.pth':
21
+ if '.yaml' in config_path:
22
+ config = set_hparams(config_path, global_hparams=False)
23
+ elif '.json' in config_path:
24
+ config = json.load(open(config_path, 'r', encoding='utf-8'))
25
+ model = torch.load(file_path, map_location="cpu")
26
+ elif ext == '.ckpt':
27
+ ckpt_dict = torch.load(file_path, map_location="cpu")
28
+ if '.yaml' in config_path:
29
+ config = set_hparams(config_path, global_hparams=False)
30
+ state = ckpt_dict["state_dict"]["model_gen"]
31
+ elif '.json' in config_path:
32
+ config = json.load(open(config_path, 'r', encoding='utf-8'))
33
+ state = ckpt_dict["generator"]
34
+ model = HifiGanGenerator(config)
35
+ model.load_state_dict(state, strict=True)
36
+ model.remove_weight_norm()
37
+ model = model.eval().to(device)
38
+ print(f"| Loaded model parameters from {file_path}.")
39
+ print(f"| HifiGAN device: {device}.")
40
+ return model, config, device
41
+
42
+
43
+ total_time = 0
44
+
45
+
46
+ @register_vocoder
47
+ class HifiGAN(PWG):
48
+ def __init__(self):
49
+ base_dir = hparams['vocoder_ckpt']
50
+ config_path = f'{base_dir}/config.yaml'
51
+ if os.path.exists(config_path):
52
+ file_path = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.*'), key=
53
+ lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).*', x.replace('\\','/'))[0]))[-1]
54
+ print('| load HifiGAN: ', file_path)
55
+ self.model, self.config, self.device = load_model(config_path=config_path, file_path=file_path)
56
+ else:
57
+ config_path = f'{base_dir}/config.json'
58
+ ckpt = f'{base_dir}/generator_v1'
59
+ if os.path.exists(config_path):
60
+ self.model, self.config, self.device = load_model(config_path=config_path, file_path=ckpt)
61
+
62
+ def spec2wav(self, mel, **kwargs):
63
+ device = self.device
64
+ with torch.no_grad():
65
+ c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(device)
66
+ with utils.Timer('hifigan', print_time=hparams['profile_infer']):
67
+ f0 = kwargs.get('f0')
68
+ if f0 is not None and hparams.get('use_nsf'):
69
+ f0 = torch.FloatTensor(f0[None, :]).to(device)
70
+ y = self.model(c, f0).view(-1)
71
+ else:
72
+ y = self.model(c).view(-1)
73
+ wav_out = y.cpu().numpy()
74
+ if hparams.get('vocoder_denoise_c', 0.0) > 0:
75
+ wav_out = denoise(wav_out, v=hparams['vocoder_denoise_c'])
76
+ return wav_out
77
+
78
+ # @staticmethod
79
+ # def wav2spec(wav_fn, **kwargs):
80
+ # wav, _ = librosa.core.load(wav_fn, sr=hparams['audio_sample_rate'])
81
+ # wav_torch = torch.FloatTensor(wav)[None, :]
82
+ # mel = mel_spectrogram(wav_torch, hparams).numpy()[0]
83
+ # return wav, mel.T
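A minimal usage sketch for this class follows. It assumes `hparams` has already been populated via `set_hparams` and that `vocoder_ckpt` points at a trained HifiGAN directory; the shapes follow the `BaseVocoder` docstrings, and the mel values here are random placeholders:

```python
# Hypothetical usage; random `mel` values will not sound like speech.
import numpy as np

vocoder = HifiGAN()                                  # loads the newest model_ckpt_steps_*.ckpt
mel = np.random.randn(200, 80).astype(np.float32)    # [T, 80] mel frames
f0 = np.full(200, 220.0, dtype=np.float32)           # per-frame F0 in Hz, only used when use_nsf is set
wav = vocoder.spec2wav(mel, f0=f0)                   # 1-D numpy waveform
```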
network/vocoders/nsf_hifigan.py ADDED
@@ -0,0 +1,92 @@
+ import os
+ import torch
+ from modules.nsf_hifigan.models import load_model, Generator
+ from modules.nsf_hifigan.nvSTFT import load_wav_to_torch, STFT
+ from utils.hparams import hparams
+ from network.vocoders.base_vocoder import BaseVocoder, register_vocoder
+
+
+ @register_vocoder
+ class NsfHifiGAN(BaseVocoder):
+     def __init__(self, device=None):
+         if device is None:
+             device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         self.device = device
+         model_path = hparams['vocoder_ckpt']
+         if os.path.exists(model_path):
+             print('| Load HifiGAN: ', model_path)
+             self.model, self.h = load_model(model_path, device=self.device)
+         else:
+             print('Error: HifiGAN model file is not found!')
+
+     def _check_config(self):
+         # Warn when the audio parameters the vocoder was trained with differ
+         # from the current hparams (this block was previously duplicated
+         # verbatim in both spec2wav variants).
+         for key, vocoder_value in [('audio_sample_rate', self.h.sampling_rate),
+                                    ('audio_num_mel_bins', self.h.num_mels),
+                                    ('fft_size', self.h.n_fft),
+                                    ('win_size', self.h.win_size),
+                                    ('hop_size', self.h.hop_size),
+                                    ('fmin', self.h.fmin),
+                                    ('fmax', self.h.fmax)]:
+             if vocoder_value != hparams[key]:
+                 print(f"Mismatch parameters: hparams['{key}']={hparams[key]} != {vocoder_value} (vocoder)")
+
+     def spec2wav_torch(self, mel, **kwargs):  # mel: [B, T, bins]
+         self._check_config()
+         with torch.no_grad():
+             c = mel.transpose(2, 1)  # [B, bins, T]
+             c = 2.30259 * c  # log10 mel -> natural-log mel (ln 10 ≈ 2.30259)
+             f0 = kwargs.get('f0')  # [B, T]
+             if f0 is not None and hparams.get('use_nsf'):
+                 y = self.model(c, f0).view(-1)
+             else:
+                 y = self.model(c).view(-1)
+         return y
+
+     def spec2wav(self, mel, **kwargs):
+         self._check_config()
+         with torch.no_grad():
+             c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(self.device)
+             c = 2.30259 * c  # log10 mel -> natural-log mel
+             f0 = kwargs.get('f0')
+             if f0 is not None and hparams.get('use_nsf'):
+                 f0 = torch.FloatTensor(f0[None, :]).to(self.device)
+                 y = self.model(c, f0).view(-1)
+             else:
+                 y = self.model(c).view(-1)
+         wav_out = y.cpu().numpy()
+         return wav_out
+
+     @staticmethod
+     def wav2spec(inp_path, device=None):
+         if device is None:
+             device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         sampling_rate = hparams['audio_sample_rate']
+         num_mels = hparams['audio_num_mel_bins']
+         n_fft = hparams['fft_size']
+         win_size = hparams['win_size']
+         hop_size = hparams['hop_size']
+         fmin = hparams['fmin']
+         fmax = hparams['fmax']
+         stft = STFT(sampling_rate, num_mels, n_fft, win_size, hop_size, fmin, fmax)
+         with torch.no_grad():
+             wav_torch, _ = load_wav_to_torch(inp_path, target_sr=stft.target_sr)
+             mel_torch = stft.get_mel(wav_torch.unsqueeze(0).to(device)).squeeze(0).T
+             mel_torch = 0.434294 * mel_torch  # natural-log mel -> log10 mel (1/ln 10 ≈ 0.434294)
+         return wav_torch.cpu().numpy(), mel_torch.cpu().numpy()
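The two magic constants are just base-change factors: the acoustic model stores mel spectrograms in log10, while the NSF-HifiGAN checkpoint expects natural log, and ln x = ln 10 · log10 x. A quick self-contained check:

```python
# Sanity check of the base-change constants used above.
import numpy as np

x = np.random.rand(5) + 0.1
log10_mel = np.log10(x)
ln_mel = 2.30259 * log10_mel       # what spec2wav feeds the vocoder; equals np.log(x)
round_trip = 0.434294 * ln_mel     # what wav2spec hands back to the model
assert np.allclose(ln_mel, np.log(x), atol=1e-4)
assert np.allclose(round_trip, log10_mel, atol=1e-4)
```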
network/vocoders/pwg.py ADDED
@@ -0,0 +1,137 @@
+ import glob
+ import re
+
+ import librosa
+ import numpy as np
+ import torch
+ import yaml
+ from sklearn.preprocessing import StandardScaler
+ from torch import nn
+
+ from modules.parallel_wavegan.models import ParallelWaveGANGenerator
+ from modules.parallel_wavegan.utils import read_hdf5
+ from utils.hparams import hparams
+ from utils.pitch_utils import f0_to_coarse
+ from network.vocoders.base_vocoder import BaseVocoder, register_vocoder
+
+
+ def load_pwg_model(config_path, checkpoint_path, stats_path):
+     # load config
+     with open(config_path, encoding='utf-8') as f:
+         config = yaml.load(f, Loader=yaml.Loader)
+
+     # setup
+     if torch.cuda.is_available():
+         device = torch.device("cuda")
+     else:
+         device = torch.device("cpu")
+     model = ParallelWaveGANGenerator(**config["generator_params"])
+
+     ckpt_dict = torch.load(checkpoint_path, map_location="cpu")
+     if 'state_dict' not in ckpt_dict:  # official vocoder
+         # reuse ckpt_dict instead of loading the checkpoint a second time
+         model.load_state_dict(ckpt_dict["model"]["generator"])
+         scaler = StandardScaler()
+         if config["format"] == "hdf5":
+             scaler.mean_ = read_hdf5(stats_path, "mean")
+             scaler.scale_ = read_hdf5(stats_path, "scale")
+         elif config["format"] == "npy":
+             scaler.mean_ = np.load(stats_path)[0]
+             scaler.scale_ = np.load(stats_path)[1]
+         else:
+             raise ValueError("support only hdf5 or npy format.")
+     else:  # custom PWG vocoder
+         fake_task = nn.Module()
+         fake_task.model_gen = model
+         fake_task.load_state_dict(ckpt_dict["state_dict"], strict=False)
+         scaler = None
+
+     model.remove_weight_norm()
+     model = model.eval().to(device)
+     print(f"| Loaded model parameters from {checkpoint_path}.")
+     print(f"| PWG device: {device}.")
+     return model, scaler, config, device
+
+
+ @register_vocoder
+ class PWG(BaseVocoder):
+     def __init__(self):
+         if hparams['vocoder_ckpt'] == '':  # load LJSpeech PWG pretrained model
+             base_dir = 'wavegan_pretrained'
+             ckpts = glob.glob(f'{base_dir}/checkpoint-*steps.pkl')
+             ckpt = sorted(ckpts,
+                           key=lambda x: int(re.findall(rf'{base_dir}/checkpoint-(\d+)steps.pkl', x)[0]))[-1]
+             config_path = f'{base_dir}/config.yaml'
+             print('| load PWG: ', ckpt)
+             self.model, self.scaler, self.config, self.device = load_pwg_model(
+                 config_path=config_path,
+                 checkpoint_path=ckpt,
+                 stats_path=f'{base_dir}/stats.h5',
+             )
+         else:
+             base_dir = hparams['vocoder_ckpt']
+             print(base_dir)
+             config_path = f'{base_dir}/config.yaml'
+             ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'),
+                           key=lambda x: int(re.findall(rf'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1]
+             print('| load PWG: ', ckpt)
+             self.scaler = None
+             self.model, _, self.config, self.device = load_pwg_model(
+                 config_path=config_path,
+                 checkpoint_path=ckpt,
+                 stats_path=f'{base_dir}/stats.h5',
+             )
+
+     def spec2wav(self, mel, **kwargs):
+         # start generation
+         config = self.config
+         device = self.device
+         pad_size = (config["generator_params"]["aux_context_window"],
+                     config["generator_params"]["aux_context_window"])
+         c = mel
+         if self.scaler is not None:
+             c = self.scaler.transform(c)
+
+         with torch.no_grad():
+             z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device)
+             c = np.pad(c, (pad_size, (0, 0)), "edge")
+             c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device)
+             p = kwargs.get('f0')
+             if p is not None:
+                 p = f0_to_coarse(p)
+                 p = np.pad(p, (pad_size,), "edge")
+                 p = torch.LongTensor(p[None, :]).to(device)
+             y = self.model(z, c, p).view(-1)
+         wav_out = y.cpu().numpy()
+         return wav_out
+
+     @staticmethod
+     def wav2spec(wav_fn, return_linear=False):
+         from preprocessing.data_gen_utils import process_utterance
+         res = process_utterance(
+             wav_fn, fft_size=hparams['fft_size'],
+             hop_size=hparams['hop_size'],
+             win_length=hparams['win_size'],
+             num_mels=hparams['audio_num_mel_bins'],
+             fmin=hparams['fmin'],
+             fmax=hparams['fmax'],
+             sample_rate=hparams['audio_sample_rate'],
+             loud_norm=hparams['loud_norm'],
+             min_level_db=hparams['min_level_db'],
+             return_linear=return_linear, vocoder='pwg', eps=float(hparams.get('wav2spec_eps', 1e-10)))
+         if return_linear:
+             return res[0], res[1].T, res[2].T  # [T, 80], [T, n_fft]
+         else:
+             return res[0], res[1].T
+
+     @staticmethod
+     def wav2mfcc(wav_fn):
+         fft_size = hparams['fft_size']
+         hop_size = hparams['hop_size']
+         win_length = hparams['win_size']
+         sample_rate = hparams['audio_sample_rate']
+         wav, _ = librosa.core.load(wav_fn, sr=sample_rate)
+         mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13,
+                                     n_fft=fft_size, hop_length=hop_size,
+                                     win_length=win_length, pad_mode="constant", power=1.0)
+         mfcc_delta = librosa.feature.delta(mfcc, order=1)
+         mfcc_delta_delta = librosa.feature.delta(mfcc, order=2)
+         mfcc = np.concatenate([mfcc, mfcc_delta, mfcc_delta_delta]).T
+         return mfcc
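The "official vocoder" branch of `load_pwg_model` rebuilds the feature normalizer from saved per-bin statistics rather than fitting it on data. A minimal sketch of priming a `StandardScaler` that way, under the assumption of an 80-bin mel and fabricated stats values:

```python
# Sketch: priming sklearn's StandardScaler from precomputed per-bin stats,
# as load_pwg_model does for the "npy" format. The stats values are made up.
import numpy as np
from sklearn.preprocessing import StandardScaler

stats = np.stack([np.zeros(80), np.ones(80)])  # row 0: means, row 1: scales
scaler = StandardScaler()
scaler.mean_ = stats[0]
scaler.scale_ = stats[1]

mel = np.random.randn(100, 80)                 # [T, 80]
normed = scaler.transform(mel)                 # what spec2wav applies before padding
```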
network/vocoders/vocoder_utils.py ADDED
@@ -0,0 +1,15 @@
+ import librosa
+ import numpy as np
+
+ from utils.hparams import hparams
+
+
+ def denoise(wav, v=0.1):
+     # Spectral subtraction: shave a constant noise floor `v` off every STFT
+     # magnitude bin, keep the original phases, and invert back to a waveform.
+     spec = librosa.stft(y=wav, n_fft=hparams['fft_size'], hop_length=hparams['hop_size'],
+                         win_length=hparams['win_size'], pad_mode='constant')
+     spec_m = np.abs(spec)
+     spec_m = np.clip(spec_m - v, a_min=0, a_max=None)
+     spec_a = np.angle(spec)
+
+     return librosa.istft(spec_m * np.exp(1j * spec_a), hop_length=hparams['hop_size'],
+                          win_length=hparams['win_size'])
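A standalone toy run of the same idea, without the `hparams` dependency (the test signal, noise level, and STFT sizes are made up for illustration):

```python
# Self-contained spectral-subtraction demo mirroring denoise() above.
import librosa
import numpy as np

sr = 22050
t = np.linspace(0, 1.0, sr, endpoint=False)
wav = np.sin(2 * np.pi * 440 * t) + 0.01 * np.random.randn(sr)  # tone + noise

spec = librosa.stft(y=wav.astype(np.float32), n_fft=1024, hop_length=256)
mag = np.clip(np.abs(spec) - 0.1, 0, None)    # subtract a constant floor
clean = librosa.istft(mag * np.exp(1j * np.angle(spec)), hop_length=256)
```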
preprocessing/SVCpre.py ADDED
@@ -0,0 +1,63 @@
+ '''
+     item: one piece of data
+     item_name: data id
+     wav_fn: wave file path
+     txt: lyrics
+     ph: phoneme
+     tg_fn: text grid file path (unused)
+     spk: dataset name
+     wdb: word boundary
+     ph_durs: phoneme durations
+     midi: pitch as MIDI notes
+     midi_dur: MIDI note duration
+     is_slur: whether to sustain the voice across note changes (slur)
+ '''
+ import logging
+ from copy import deepcopy
+
+ from preprocessing.process_pipeline import File2Batch
+ from utils.hparams import hparams
+ from preprocessing.base_binarizer import BaseBinarizer
+
+ SVCSINGING_ITEM_ATTRIBUTES = ['wav_fn', 'spk_id']
+
+
+ class SVCBinarizer(BaseBinarizer):
+     def __init__(self, item_attributes=SVCSINGING_ITEM_ATTRIBUTES):
+         super().__init__(item_attributes)
+         print('speakers: ', set(item['spk_id'] for item in self.items.values()))
+         self.item_names = sorted(list(self.items.keys()))
+         self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)
+
+     def split_train_test_set(self, item_names):
+         # Either pick test items by configured name prefixes, or hold out the
+         # last five items; see the sketch after this file.
+         item_names = deepcopy(item_names)
+         if hparams['choose_test_manually']:
+             test_item_names = [x for x in item_names if any(x.startswith(ts) for ts in hparams['test_prefixes'])]
+         else:
+             test_item_names = item_names[-5:]
+         train_item_names = [x for x in item_names if x not in set(test_item_names)]
+         logging.info("train {}".format(len(train_item_names)))
+         logging.info("test {}".format(len(test_item_names)))
+         return train_item_names, test_item_names
+
+     @property
+     def train_item_names(self):
+         return self._train_item_names
+
+     @property
+     def valid_item_names(self):
+         return self._test_item_names
+
+     @property
+     def test_item_names(self):
+         return self._test_item_names
+
+     def load_meta_data(self):
+         self.items = File2Batch.file2temporary_dict()
+
+     def _phone_encoder(self):
+         from preprocessing.hubertinfer import Hubertencoder
+         return Hubertencoder(hparams['hubert_path'])
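To make the split behavior concrete, a small illustration of `split_train_test_set`'s two modes (the item names are invented):

```python
# Illustration of SVCBinarizer.split_train_test_set; all names are made up.
item_names = sorted([f'singerA_{i:03d}' for i in range(8)] + ['demo_000', 'demo_001'])

# choose_test_manually=True with test_prefixes=['demo'] picks by prefix:
test_manual = [x for x in item_names if x.startswith('demo')]   # ['demo_000', 'demo_001']

# choose_test_manually=False holds out the last five sorted items:
test_auto = item_names[-5:]                                     # singerA_003 .. singerA_007
train = [x for x in item_names if x not in set(test_auto)]
```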
preprocessing/base_binarizer.py ADDED
@@ -0,0 +1,237 @@
+ import os
+
+ os.environ["OMP_NUM_THREADS"] = "1"
+
+ import json
+ import random
+
+ import numpy as np
+ import yaml
+ # from resemblyzer import VoiceEncoder
+ from tqdm import tqdm
+
+ from preprocessing.data_gen_utils import get_mel2ph, get_pitch_parselmouth, build_phone_encoder, get_pitch_crepe
+ from utils.hparams import set_hparams, hparams
+ from utils.indexed_datasets import IndexedDatasetBuilder
+ from utils.multiprocess_utils import chunked_multiprocess_run
+
+
+ class BinarizationError(Exception):
+     pass
+
+
+ BASE_ITEM_ATTRIBUTES = ['txt', 'ph', 'wav_fn', 'tg_fn', 'spk_id']
+
+
+ class BaseBinarizer:
+     '''
+     Base class for data processing.
+     1. *process* and *process_data_split*:
+         process the entire dataset and generate the train-test split (supports parallel processing);
+     2. *process_item*:
+         process a single piece of data;
+     3. *get_pitch*:
+         infer the pitch using some algorithm;
+     4. *get_align*:
+         get the alignment in 'mel2ph' format (see https://arxiv.org/abs/1905.09263);
+     5. phoneme encoder, voice encoder, etc.
+
+     Subclasses should define:
+     1. *load_meta_data*:
+         how to read multiple datasets from files;
+     2. *train_item_names*, *valid_item_names*, *test_item_names*:
+         how to split the dataset;
+     3. *load_ph_set*:
+         the phoneme set.
+     '''
+
+     def __init__(self, item_attributes=BASE_ITEM_ATTRIBUTES):
+         self.binarization_args = hparams['binarization_args']
+         # self.pre_align_args = hparams['pre_align_args']
+
+         self.items = {}
+         # every item in self.items has some attributes
+         self.item_attributes = item_attributes
+
+         self.load_meta_data()
+         # sanity check: every key of an item dict must come from the given attribute list
+         assert all([attr in self.item_attributes for attr in list(self.items.values())[0].keys()])
+         self.item_names = sorted(list(self.items.keys()))
+
+         if self.binarization_args['shuffle']:
+             random.seed(1234)
+             random.shuffle(self.item_names)
+
+         # set default get_pitch algorithm
+         if hparams['use_crepe']:
+             self.get_pitch_algorithm = get_pitch_crepe
+         else:
+             self.get_pitch_algorithm = get_pitch_parselmouth
+
+     def load_meta_data(self):
+         raise NotImplementedError
+
+     @property
+     def train_item_names(self):
+         raise NotImplementedError
+
+     @property
+     def valid_item_names(self):
+         raise NotImplementedError
+
+     @property
+     def test_item_names(self):
+         raise NotImplementedError
+
+     def build_spk_map(self):
+         spk_map = set()
+         for item_name in self.item_names:
+             spk_name = self.items[item_name]['spk_id']
+             spk_map.add(spk_name)
+         spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
+         assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+         return spk_map
+
+     def item_name2spk_id(self, item_name):
+         return self.spk_map[self.items[item_name]['spk_id']]
+
+     def _phone_encoder(self):
+         '''
+         use hubert encoder
+         '''
+         raise NotImplementedError
+         # unreachable: legacy phone-set logic kept for reference.
+         # It creates 'phone_set.json' if it doesn't exist.
+         ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
+         ph_set = []
+         if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+             self.load_ph_set(ph_set)
+             ph_set = sorted(set(ph_set))
+             json.dump(ph_set, open(ph_set_fn, 'w', encoding='utf-8'))
+             print("| Build phone set: ", ph_set)
+         else:
+             ph_set = json.load(open(ph_set_fn, 'r', encoding='utf-8'))
+             print("| Load phone set: ", ph_set)
+         return build_phone_encoder(hparams['binary_data_dir'])
+
+     def load_ph_set(self, ph_set):
+         raise NotImplementedError
+
+     def meta_data_iterator(self, prefix):
+         if prefix == 'valid':
+             item_names = self.valid_item_names
+         elif prefix == 'test':
+             item_names = self.test_item_names
+         else:
+             item_names = self.train_item_names
+         for item_name in item_names:
+             meta_data = self.items[item_name]
+             yield item_name, meta_data
+
+     def process(self):
+         os.makedirs(hparams['binary_data_dir'], exist_ok=True)
+         self.spk_map = self.build_spk_map()
+         print("| spk_map: ", self.spk_map)
+         spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
+         json.dump(self.spk_map, open(spk_map_fn, 'w', encoding='utf-8'))
+
+         self.phone_encoder = self._phone_encoder()
+         self.process_data_split('valid')
+         self.process_data_split('test')
+         self.process_data_split('train')
+
+     def process_data_split(self, prefix):
+         data_dir = hparams['binary_data_dir']
+         args = []
+         builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
+         lengths = []
+         f0s = []
+         total_sec = 0
+         # if self.binarization_args['with_spk_embed']:
+         #     voice_encoder = VoiceEncoder().cuda()
+
+         for item_name, meta_data in self.meta_data_iterator(prefix):
+             args.append([item_name, meta_data, self.binarization_args])
+         spec_min = []
+         spec_max = []
+         # code for single cpu processing
+         for i in tqdm(reversed(range(len(args))), total=len(args)):
+             a = args[i]
+             item = self.process_item(*a)
+             if item is None:
+                 continue
+             spec_min.append(item['spec_min'])
+             spec_max.append(item['spec_max'])
+             # item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
+             #     if self.binarization_args['with_spk_embed'] else None
+             if not self.binarization_args['with_wav'] and 'wav' in item:
+                 if hparams['debug']:
+                     print("del wav")
+                 del item['wav']
+             if hparams['debug']:
+                 print(item)
+             builder.add_item(item)
+             lengths.append(item['len'])
+             total_sec += item['sec']
+             # if item.get('f0') is not None:
+             #     f0s.append(item['f0'])
+         if prefix == 'train':
+             # write the per-bin spectrogram range back into the config,
+             # for the diffusion model's spectrogram normalization
+             spec_max = np.max(spec_max, 0)
+             spec_min = np.min(spec_min, 0)
+             print(spec_max.shape)
+             with open(hparams['config_path'], encoding='utf-8') as f:
+                 _hparams = yaml.safe_load(f)
+             _hparams['spec_max'] = spec_max.tolist()
+             _hparams['spec_min'] = spec_min.tolist()
+             with open(hparams['config_path'], 'w', encoding='utf-8') as f:
+                 yaml.safe_dump(_hparams, f)
+         builder.finalize()
+         np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
+         if len(f0s) > 0:
+             f0s = np.concatenate(f0s, 0)
+             f0s = f0s[f0s != 0]
+             np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
+         print(f"| {prefix} total duration: {total_sec:.3f}s")
+
+     def process_item(self, item_name, meta_data, binarization_args):
+         from preprocessing.process_pipeline import File2Batch
+         return File2Batch.temporary_dict2processed_input(item_name, meta_data, self.phone_encoder, binarization_args)
+
+     def get_align(self, meta_data, mel, phone_encoded, res):
+         raise NotImplementedError
+
+     def get_align_from_textgrid(self, meta_data, mel, phone_encoded, res):
+         '''
+         NOTE: this part of the script is *isolated* from the other scripts, which means
+         it may not be compatible with the current version.
+         '''
+         return
+         tg_fn, ph = meta_data['tg_fn'], meta_data['ph']
+         if tg_fn is not None and os.path.exists(tg_fn):
+             mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
+         else:
+             raise BinarizationError(f"Align not found")
+         if mel2ph.max() - 1 >= len(phone_encoded):
+             raise BinarizationError(
+                 f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
+         res['mel2ph'] = mel2ph
+         res['dur'] = dur
+
+     def get_f0cwt(self, f0, res):
+         '''
+         NOTE: this part of the script is *isolated* from the other scripts, which means
+         it may not be compatible with the current version.
+         '''
+         return
+         from utils.cwt import get_cont_lf0, get_lf0_cwt
+         uv, cont_lf0_lpf = get_cont_lf0(f0)
+         logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
+         cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
+         Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
+         if np.any(np.isnan(Wavelet_lf0)):
+             raise BinarizationError("NaN CWT")
+         res['cwt_spec'] = Wavelet_lf0
+         res['cwt_scales'] = scales
+         res['f0_mean'] = logf0s_mean_org
+         res['f0_std'] = logf0s_std_org
+
+
+ if __name__ == "__main__":
+     set_hparams()
+     BaseBinarizer().process()
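For orientation, here is a minimal subclass sketch showing the hooks `BaseBinarizer` expects a subclass to supply (everything below is invented for illustration; the real subclass in this commit is `SVCBinarizer` above):

```python
# Hypothetical minimal subclass; attribute values and item names are placeholders.
class ToyBinarizer(BaseBinarizer):
    def load_meta_data(self):
        # item_name -> dict whose keys must all appear in BASE_ITEM_ATTRIBUTES
        self.items = {
            'demo_000': {'txt': '...', 'ph': '...', 'wav_fn': 'raw/demo_000.wav',
                         'tg_fn': None, 'spk_id': 'demo'},
            'demo_001': {'txt': '...', 'ph': '...', 'wav_fn': 'raw/demo_001.wav',
                         'tg_fn': None, 'spk_id': 'demo'},
        }

    @property
    def train_item_names(self):
        return self.item_names[:-1]

    @property
    def valid_item_names(self):
        return self.item_names[-1:]

    @property
    def test_item_names(self):
        return self.item_names[-1:]
```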