diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..c102e2ca8c51ab623f295f09aab7797a1cdc42b4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -11,7 +11,7 @@ *.mlmodel filter=lfs diff=lfs merge=lfs -text *.model filter=lfs diff=lfs merge=lfs -text *.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text +# *.npy filter=lfs diff=lfs merge=lfs -text *.npz filter=lfs diff=lfs merge=lfs -text *.onnx filter=lfs diff=lfs merge=lfs -text *.ot filter=lfs diff=lfs merge=lfs -text @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +exp/default/g_00700000 filter=lfs diff=lfs merge=lfs -text +Utils/JDC/bst.t7 filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..9b0d55d2b578380d2a42389b5de298ecea42a2f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__pycache__ +flagged +out.wav \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..1ab91dd6113ba955fb0cd3523f6a414b8e3fcba9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Jingyi Li + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index da27b766af69f14237a16f2e7065c4e6ce6f6ac4..fb6d03426ee7431cdc451f7fccb924703a6ae9a4 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ sdk: gradio sdk_version: 4.22.0 app_file: app.py pinned: false +license: mit --- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/Utils/JDC/__init__.py b/Utils/JDC/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/Utils/JDC/__init__.py @@ -0,0 +1 @@ + diff --git a/Utils/JDC/bst.t7.txt b/Utils/JDC/bst.t7.txt new file mode 100644 index 0000000000000000000000000000000000000000..95c37077711f9e5435a00c521f688275dab8feb6 --- /dev/null +++ b/Utils/JDC/bst.t7.txt @@ -0,0 +1 @@ +https://github.com/yl4579/HiFTNet/blob/main/Utils/JDC/bst.t7 \ No newline at end of file diff --git a/Utils/JDC/model.py b/Utils/JDC/model.py new file mode 100644 index 0000000000000000000000000000000000000000..83cd266d1cd6f054d0684e8e1a60496044048605 --- /dev/null +++ b/Utils/JDC/model.py @@ -0,0 +1,190 @@ +""" +Implementation of model from: +Kum et al. 
- "Joint Detection and Classification of Singing Voice Melody Using +Convolutional Recurrent Neural Networks" (2019) +Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d +""" +import torch +from torch import nn + +class JDCNet(nn.Module): + """ + Joint Detection and Classification Network model for singing voice melody. + """ + def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01): + super().__init__() + self.num_class = num_class + + # input = (b, 1, 31, 513), b = batch size + self.conv_block = nn.Sequential( + nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False), # out: (b, 64, 31, 513) + nn.BatchNorm2d(num_features=64), + nn.LeakyReLU(leaky_relu_slope, inplace=True), + nn.Conv2d(64, 64, 3, padding=1, bias=False), # (b, 64, 31, 513) + ) + + # res blocks + self.res_block1 = ResBlock(in_channels=64, out_channels=128) # (b, 128, 31, 128) + self.res_block2 = ResBlock(in_channels=128, out_channels=192) # (b, 192, 31, 32) + self.res_block3 = ResBlock(in_channels=192, out_channels=256) # (b, 256, 31, 8) + + # pool block + self.pool_block = nn.Sequential( + nn.BatchNorm2d(num_features=256), + nn.LeakyReLU(leaky_relu_slope, inplace=True), + nn.MaxPool2d(kernel_size=(1, 4)), # (b, 256, 31, 2) + nn.Dropout(p=0.2), + ) + + # maxpool layers (for auxiliary network inputs) + # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2) + self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40)) + # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2) + self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20)) + # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2) + self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10)) + + # in = (b, 640, 31, 2), out = (b, 256, 31, 2) + self.detector_conv = nn.Sequential( + nn.Conv2d(640, 256, 1, bias=False), + nn.BatchNorm2d(256), + nn.LeakyReLU(leaky_relu_slope, inplace=True), + nn.Dropout(p=0.2), + ) + + # 
input: (b, 31, 512) - resized from (b, 256, 31, 2) + self.bilstm_classifier = nn.LSTM( + input_size=512, hidden_size=256, + batch_first=True, bidirectional=True) # (b, 31, 512) + + # input: (b, 31, 512) - resized from (b, 256, 31, 2) + self.bilstm_detector = nn.LSTM( + input_size=512, hidden_size=256, + batch_first=True, bidirectional=True) # (b, 31, 512) + + # input: (b * 31, 512) + self.classifier = nn.Linear(in_features=512, out_features=self.num_class) # (b * 31, num_class) + + # input: (b * 31, 512) + self.detector = nn.Linear(in_features=512, out_features=2) # (b * 31, 2) - binary classifier + + # initialize weights + self.apply(self.init_weights) + + def get_feature_GAN(self, x): + seq_len = x.shape[-2] + x = x.float().transpose(-1, -2) + + convblock_out = self.conv_block(x) + + resblock1_out = self.res_block1(convblock_out) + resblock2_out = self.res_block2(resblock1_out) + resblock3_out = self.res_block3(resblock2_out) + poolblock_out = self.pool_block[0](resblock3_out) + poolblock_out = self.pool_block[1](poolblock_out) + + return poolblock_out.transpose(-1, -2) + + def get_feature(self, x): + seq_len = x.shape[-2] + x = x.float().transpose(-1, -2) + + convblock_out = self.conv_block(x) + + resblock1_out = self.res_block1(convblock_out) + resblock2_out = self.res_block2(resblock1_out) + resblock3_out = self.res_block3(resblock2_out) + poolblock_out = self.pool_block[0](resblock3_out) + poolblock_out = self.pool_block[1](poolblock_out) + + return self.pool_block[2](poolblock_out) + + def forward(self, x): + """ + Returns: + classification_prediction, detection_prediction + sizes: (b, 31, 722), (b, 31, 2) + """ + ############################### + # forward pass for classifier # + ############################### + seq_len = x.shape[-1] + x = x.float().transpose(-1, -2) + + convblock_out = self.conv_block(x) + + resblock1_out = self.res_block1(convblock_out) + resblock2_out = self.res_block2(resblock1_out) + resblock3_out = self.res_block3(resblock2_out) + + 
+ poolblock_out = self.pool_block[0](resblock3_out) + poolblock_out = self.pool_block[1](poolblock_out) + GAN_feature = poolblock_out.transpose(-1, -2) + poolblock_out = self.pool_block[2](poolblock_out) + + # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512) + classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512)) + classifier_out, _ = self.bilstm_classifier(classifier_out) # ignore the hidden states + + classifier_out = classifier_out.contiguous().view((-1, 512)) # (b * 31, 512) + classifier_out = self.classifier(classifier_out) + classifier_out = classifier_out.view((-1, seq_len, self.num_class)) # (b, 31, num_class) + + # sizes: (b, 31, 722), (b, 31, 2) + # classifier output consists of predicted pitch classes per frame + # detector output consists of: (isvoice, notvoice) estimates per frame + return torch.abs(classifier_out.squeeze()), GAN_feature, poolblock_out + + @staticmethod + def init_weights(m): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv2d): + nn.init.xavier_normal_(m.weight) + elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell): + for p in m.parameters(): + if p.data is None: + continue + + if len(p.shape) >= 2: + nn.init.orthogonal_(p.data) + else: + nn.init.normal_(p.data) + + +class ResBlock(nn.Module): + def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01): + super().__init__() + self.downsample = in_channels != out_channels + + # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper + self.pre_conv = nn.Sequential( + nn.BatchNorm2d(num_features=in_channels), + nn.LeakyReLU(leaky_relu_slope, inplace=True), + nn.MaxPool2d(kernel_size=(1, 2)), # apply downsampling on the y axis only + ) + + # conv layers + self.conv = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=3, padding=1, bias=False), + 
nn.BatchNorm2d(out_channels), + nn.LeakyReLU(leaky_relu_slope, inplace=True), + nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False), + ) + + # 1 x 1 convolution layer to match the feature dimensions + self.conv1by1 = None + if self.downsample: + self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False) + + def forward(self, x): + x = self.pre_conv(x) + if self.downsample: + x = self.conv(x) + self.conv1by1(x) + else: + x = self.conv(x) + x + return x \ No newline at end of file diff --git a/Utils/__init__.py b/Utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/Utils/__init__.py @@ -0,0 +1 @@ + diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..2b5ede249fb31b9df725ab377274f2cfe853469d --- /dev/null +++ b/app.py @@ -0,0 +1,142 @@ +import os +import json +import math + +import torch +import torch.nn.functional as F +import librosa +import numpy as np +import soundfile as sf +import gradio as gr +from transformers import WavLMModel + +from env import AttrDict +from meldataset import mel_spectrogram, MAX_WAV_VALUE +from models import Generator +from stft import TorchSTFT +from Utils.JDC.model import JDCNet + + +# files +hpfile = "config_v1_16k.json" +ptfile = "exp/default/g_00700000" +spk2id_path = "filelists/spk2id.json" +f0_stats_path = "filelists/f0_stats.json" +spk_stats_path = "filelists/spk_stats.json" +spk_emb_dir = "dataset/spk" +spk_wav_dir = "dataset/audio" + +# device +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# load config +with open(hpfile) as f: + data = f.read() +json_config = json.loads(data) +h = AttrDict(json_config) + +# load models +F0_model = JDCNet(num_class=1, seq_len=192) +generator = Generator(h, F0_model).to(device) +stft = TorchSTFT(filter_length=h.gen_istft_n_fft, hop_length=h.gen_istft_hop_size, win_length=h.gen_istft_n_fft).to(device) + 
+state_dict_g = torch.load(ptfile, map_location=device) +generator.load_state_dict(state_dict_g['generator'], strict=True) +generator.remove_weight_norm() +_ = generator.eval() + +wavlm = WavLMModel.from_pretrained("microsoft/wavlm-base-plus") +wavlm.eval() +wavlm.to(device) + +# load stats +with open(spk2id_path) as f: + spk2id = json.load(f) +with open(f0_stats_path) as f: + f0_stats = json.load(f) +with open(spk_stats_path) as f: + spk_stats = json.load(f) + +# tune f0 +threshold = 10 +step = (math.log(1100) - math.log(50)) / 256 +def tune_f0(initial_f0, i): + if i == 0: + return initial_f0 + voiced = initial_f0 > threshold + initial_lf0 = torch.log(initial_f0) + lf0 = initial_lf0 + step * i + f0 = torch.exp(lf0) + f0 = torch.where(voiced, f0, initial_f0) + return f0 + +# convert function +def convert(tgt_spk, src_wav, f0_shift=0): + tgt_ref = spk_stats[tgt_spk]["best_spk_emb"] + tgt_emb = f"{spk_emb_dir}/{tgt_spk}/{tgt_ref}.npy" + + with torch.no_grad(): + # tgt + spk_id = spk2id[tgt_spk] + spk_id = torch.LongTensor([spk_id]).unsqueeze(0).to(device) + + spk_emb = np.load(tgt_emb) + spk_emb = torch.from_numpy(spk_emb).unsqueeze(0).to(device) + + f0_mean_tgt = f0_stats[tgt_spk]["mean"] + + # src + wav, sr = librosa.load(src_wav, sr=16000) + wav = torch.FloatTensor(wav).to(device) + mel = mel_spectrogram(wav.unsqueeze(0), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax) + + x = wavlm(wav.unsqueeze(0)).last_hidden_state + x = x.transpose(1, 2) # (B, C, T) + x = F.pad(x, (0, mel.size(2) - x.size(2)), 'constant') + + # cvt + f0 = generator.get_f0(mel, f0_mean_tgt) + f0 = tune_f0(f0, f0_shift) + x = generator.get_x(x, spk_emb, spk_id) + y = generator.infer(x, f0, stft) + + audio = y.squeeze() + audio = audio / torch.max(torch.abs(audio)) * 0.95 + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype('int16') + + sf.write("out.wav", audio, h.sampling_rate, "PCM_16") + + out_wav = "out.wav" + return out_wav + +# change spk +def 
change_spk(tgt_spk): + tgt_ref = spk_stats[tgt_spk]["best_spk_emb"] + tgt_wav = f"{spk_wav_dir}/{tgt_spk}/{tgt_ref}.wav" + return tgt_wav + +# interface +with gr.Blocks() as demo: + gr.Markdown("# PitchVC") + gr.Markdown("Gradio Demo for PitchVC. ([Github Repo](https://github.com/OlaWod/PitchVC))") + + with gr.Row(): + with gr.Column(): + tgt_spk = gr.Dropdown(choices=spk2id.keys(), type="value", label="Target Speaker") + ref_audio = gr.Audio(label="Reference Audio", type='filepath') + src_audio = gr.Audio(label="Source Audio", type='filepath') + f0_shift = gr.Slider(minimum=-30, maximum=30, value=0, step=1, label="F0 Shift") + with gr.Column(): + out_audio = gr.Audio(label="Output Audio", type='filepath') + submit = gr.Button(value="Submit") + + tgt_spk.change(fn=change_spk, inputs=[tgt_spk], outputs=[ref_audio]) + submit.click(convert, [tgt_spk, src_audio, f0_shift], [out_audio]) + + examples = gr.Examples( + examples=[["p225", 'dataset/audio/p226/p226_341.wav', 0], + ["p226", 'dataset/audio/p225/p225_220.wav', -5]], + inputs=[tgt_spk, src_audio, f0_shift]) + +demo.launch() diff --git a/config_v1_16k.json b/config_v1_16k.json new file mode 100644 index 0000000000000000000000000000000000000000..fb6192e4dbd23cc061dbb975b5bea1ff8442a318 --- /dev/null +++ b/config_v1_16k.json @@ -0,0 +1,42 @@ +{ + "F0_path": "Utils/JDC/bst.t7", + + "use_aug": true, + + "resblock": "1", + "num_gpus": 1, + "batch_size": 16, + "learning_rate": 0.0002, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [10,8], + "upsample_kernel_sizes": [20,16], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "gen_istft_n_fft": 16, + "gen_istft_hop_size": 4, + + "segment_size": 16000, + "num_mels": 80, + "n_fft": 1024, + "hop_size": 320, + "win_size": 1024, + + "sampling_rate": 16000, + + "fmin": 0, + "fmax": 8000, + "fmax_for_loss": null, + + "num_workers": 8, + + 
"dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/dataset/audio/p225/p225_220.wav b/dataset/audio/p225/p225_220.wav new file mode 100644 index 0000000000000000000000000000000000000000..9e3248bcebe7eeec8dfbd833a890fd24712396b4 Binary files /dev/null and b/dataset/audio/p225/p225_220.wav differ diff --git a/dataset/audio/p226/p226_341.wav b/dataset/audio/p226/p226_341.wav new file mode 100644 index 0000000000000000000000000000000000000000..376405ff532be2f9504b3711179fd1f2e1c45765 Binary files /dev/null and b/dataset/audio/p226/p226_341.wav differ diff --git a/dataset/audio/p227/p227_021.wav b/dataset/audio/p227/p227_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..e1e58df9ef15d8bf7911e139f6bd941c6d8318ab Binary files /dev/null and b/dataset/audio/p227/p227_021.wav differ diff --git a/dataset/audio/p228/p228_242.wav b/dataset/audio/p228/p228_242.wav new file mode 100644 index 0000000000000000000000000000000000000000..e145d54f08eec387794e4695d96c2b9e202bef48 Binary files /dev/null and b/dataset/audio/p228/p228_242.wav differ diff --git a/dataset/audio/p229/p229_021.wav b/dataset/audio/p229/p229_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..887fc8fd1456ecee1c48c412b335b54f96cfa9a9 Binary files /dev/null and b/dataset/audio/p229/p229_021.wav differ diff --git a/dataset/audio/p230/p230_361.wav b/dataset/audio/p230/p230_361.wav new file mode 100644 index 0000000000000000000000000000000000000000..dc82f7c09eba92461316b54a541518c6ff8651a9 Binary files /dev/null and b/dataset/audio/p230/p230_361.wav differ diff --git a/dataset/audio/p231/p231_197.wav b/dataset/audio/p231/p231_197.wav new file mode 100644 index 0000000000000000000000000000000000000000..19083e9135bdb4ecfde06bee0c0d3b4fcedd4f6a Binary files /dev/null and b/dataset/audio/p231/p231_197.wav differ diff --git a/dataset/audio/p232/p232_023.wav b/dataset/audio/p232/p232_023.wav new file mode 
100644 index 0000000000000000000000000000000000000000..e35c9993fc7b15822632a6d31ddd3becf03b99af Binary files /dev/null and b/dataset/audio/p232/p232_023.wav differ diff --git a/dataset/audio/p233/p233_323.wav b/dataset/audio/p233/p233_323.wav new file mode 100644 index 0000000000000000000000000000000000000000..e7efda397c7e513a6ff70a4602b30b3e0ac7cc85 Binary files /dev/null and b/dataset/audio/p233/p233_323.wav differ diff --git a/dataset/audio/p234/p234_229.wav b/dataset/audio/p234/p234_229.wav new file mode 100644 index 0000000000000000000000000000000000000000..28ddb3d508f43b06a55955830ba810fd17d53c70 Binary files /dev/null and b/dataset/audio/p234/p234_229.wav differ diff --git a/dataset/audio/p236/p236_068.wav b/dataset/audio/p236/p236_068.wav new file mode 100644 index 0000000000000000000000000000000000000000..15db8728b0804662d689f3042b77bba0dd8e1881 Binary files /dev/null and b/dataset/audio/p236/p236_068.wav differ diff --git a/dataset/audio/p237/p237_023.wav b/dataset/audio/p237/p237_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..84d09fe2e2d255b72616e6dd5a45fcff420cd73b Binary files /dev/null and b/dataset/audio/p237/p237_023.wav differ diff --git a/dataset/audio/p238/p238_023.wav b/dataset/audio/p238/p238_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..6cec5ad932ea5821da99246be700da88a1fa6e24 Binary files /dev/null and b/dataset/audio/p238/p238_023.wav differ diff --git a/dataset/audio/p239/p239_023.wav b/dataset/audio/p239/p239_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..3248b3751193cb477f18e746340a9f5f9848695e Binary files /dev/null and b/dataset/audio/p239/p239_023.wav differ diff --git a/dataset/audio/p240/p240_004.wav b/dataset/audio/p240/p240_004.wav new file mode 100644 index 0000000000000000000000000000000000000000..3b2bd2d00361d86f125807f05d99791328422b93 Binary files /dev/null and b/dataset/audio/p240/p240_004.wav differ diff --git 
a/dataset/audio/p241/p241_050.wav b/dataset/audio/p241/p241_050.wav new file mode 100644 index 0000000000000000000000000000000000000000..31ae79ffbceb63db2444f589e878669cf68ca286 Binary files /dev/null and b/dataset/audio/p241/p241_050.wav differ diff --git a/dataset/audio/p243/p243_087.wav b/dataset/audio/p243/p243_087.wav new file mode 100644 index 0000000000000000000000000000000000000000..4b88110aae082111015ac6a0645f3a2a7580590e Binary files /dev/null and b/dataset/audio/p243/p243_087.wav differ diff --git a/dataset/audio/p244/p244_008.wav b/dataset/audio/p244/p244_008.wav new file mode 100644 index 0000000000000000000000000000000000000000..a698666c42ca4c3d6fbb85ebbe89085bc981a595 Binary files /dev/null and b/dataset/audio/p244/p244_008.wav differ diff --git a/dataset/audio/p245/p245_014.wav b/dataset/audio/p245/p245_014.wav new file mode 100644 index 0000000000000000000000000000000000000000..721df3c591ea0cbdb2771a07a3f220acc2fe827d Binary files /dev/null and b/dataset/audio/p245/p245_014.wav differ diff --git a/dataset/audio/p246/p246_022.wav b/dataset/audio/p246/p246_022.wav new file mode 100644 index 0000000000000000000000000000000000000000..d14ea175907bf64d259c5c78d1d04361e8ab9de5 Binary files /dev/null and b/dataset/audio/p246/p246_022.wav differ diff --git a/dataset/audio/p247/p247_380.wav b/dataset/audio/p247/p247_380.wav new file mode 100644 index 0000000000000000000000000000000000000000..5c138f071c726f2bca0d60f06364ca6d62088b9b Binary files /dev/null and b/dataset/audio/p247/p247_380.wav differ diff --git a/dataset/audio/p248/p248_023.wav b/dataset/audio/p248/p248_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..1aa3d282d68d22c22cc3e07c9bf87c4d550c6426 Binary files /dev/null and b/dataset/audio/p248/p248_023.wav differ diff --git a/dataset/audio/p249/p249_223.wav b/dataset/audio/p249/p249_223.wav new file mode 100644 index 0000000000000000000000000000000000000000..9f4cadb20b6a80f6e55587b27842e83fc31b4d25 Binary files /dev/null 
and b/dataset/audio/p249/p249_223.wav differ diff --git a/dataset/audio/p250/p250_021.wav b/dataset/audio/p250/p250_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..ceec89419a48aa431dcace3407929f391c67302a Binary files /dev/null and b/dataset/audio/p250/p250_021.wav differ diff --git a/dataset/audio/p251/p251_364.wav b/dataset/audio/p251/p251_364.wav new file mode 100644 index 0000000000000000000000000000000000000000..12fe54ac784b9373731ef54ddcd0c89943076934 Binary files /dev/null and b/dataset/audio/p251/p251_364.wav differ diff --git a/dataset/audio/p252/p252_023.wav b/dataset/audio/p252/p252_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..2bcb8f3bed9d93506b14ef27b79b954b72352dbc Binary files /dev/null and b/dataset/audio/p252/p252_023.wav differ diff --git a/dataset/audio/p253/p253_207.wav b/dataset/audio/p253/p253_207.wav new file mode 100644 index 0000000000000000000000000000000000000000..fd99a7f74d77ce33c6f9dca5c25c4f93f2a2f760 Binary files /dev/null and b/dataset/audio/p253/p253_207.wav differ diff --git a/dataset/audio/p254/p254_023.wav b/dataset/audio/p254/p254_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..7efb3d3cd662dad0d8d72cb5f4415b212dcf9713 Binary files /dev/null and b/dataset/audio/p254/p254_023.wav differ diff --git a/dataset/audio/p255/p255_038.wav b/dataset/audio/p255/p255_038.wav new file mode 100644 index 0000000000000000000000000000000000000000..ac150c31cd8687eeed5905a42475090552192f52 Binary files /dev/null and b/dataset/audio/p255/p255_038.wav differ diff --git a/dataset/audio/p256/p256_079.wav b/dataset/audio/p256/p256_079.wav new file mode 100644 index 0000000000000000000000000000000000000000..5d969b7174f3202055067f2a42fc2af89a9f538a Binary files /dev/null and b/dataset/audio/p256/p256_079.wav differ diff --git a/dataset/audio/p257/p257_023.wav b/dataset/audio/p257/p257_023.wav new file mode 100644 index 
0000000000000000000000000000000000000000..1507ae2f1307a14eb8de94ff638d236a4ceeffd3 Binary files /dev/null and b/dataset/audio/p257/p257_023.wav differ diff --git a/dataset/audio/p258/p258_228.wav b/dataset/audio/p258/p258_228.wav new file mode 100644 index 0000000000000000000000000000000000000000..da76c01732d25bed80e2269c0abb56b352b074d5 Binary files /dev/null and b/dataset/audio/p258/p258_228.wav differ diff --git a/dataset/audio/p259/p259_011.wav b/dataset/audio/p259/p259_011.wav new file mode 100644 index 0000000000000000000000000000000000000000..f1a8599dde466dfe532ae80a61fa3f3750065ada Binary files /dev/null and b/dataset/audio/p259/p259_011.wav differ diff --git a/dataset/audio/p260/p260_103.wav b/dataset/audio/p260/p260_103.wav new file mode 100644 index 0000000000000000000000000000000000000000..22eead75daca73ca5de2586591f0ffa972b62144 Binary files /dev/null and b/dataset/audio/p260/p260_103.wav differ diff --git a/dataset/audio/p261/p261_023.wav b/dataset/audio/p261/p261_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..fe9a66aa1207979b2f2cc0a1c1e8ab9100e4081c Binary files /dev/null and b/dataset/audio/p261/p261_023.wav differ diff --git a/dataset/audio/p262/p262_210.wav b/dataset/audio/p262/p262_210.wav new file mode 100644 index 0000000000000000000000000000000000000000..9ff72e1245481ac8a219f1d879ce8566b7e7e719 Binary files /dev/null and b/dataset/audio/p262/p262_210.wav differ diff --git a/dataset/audio/p263/p263_218.wav b/dataset/audio/p263/p263_218.wav new file mode 100644 index 0000000000000000000000000000000000000000..05f34df5eec816416adc4a6624f9514faefa1c7c Binary files /dev/null and b/dataset/audio/p263/p263_218.wav differ diff --git a/dataset/audio/p264/p264_438.wav b/dataset/audio/p264/p264_438.wav new file mode 100644 index 0000000000000000000000000000000000000000..3eb2ae85425958b62007a1ccdd725ab882170b80 Binary files /dev/null and b/dataset/audio/p264/p264_438.wav differ diff --git a/dataset/audio/p265/p265_273.wav 
b/dataset/audio/p265/p265_273.wav new file mode 100644 index 0000000000000000000000000000000000000000..672f26c247938f477fbf0edaf725cb7bd771e305 Binary files /dev/null and b/dataset/audio/p265/p265_273.wav differ diff --git a/dataset/audio/p266/p266_417.wav b/dataset/audio/p266/p266_417.wav new file mode 100644 index 0000000000000000000000000000000000000000..1606a0eb8e2af0821b5bd3202cc122a3e59afb77 Binary files /dev/null and b/dataset/audio/p266/p266_417.wav differ diff --git a/dataset/audio/p267/p267_022.wav b/dataset/audio/p267/p267_022.wav new file mode 100644 index 0000000000000000000000000000000000000000..ea0963b07c06d6f1037790a9a72b2249f4f16ba4 Binary files /dev/null and b/dataset/audio/p267/p267_022.wav differ diff --git a/dataset/audio/p268/p268_021.wav b/dataset/audio/p268/p268_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..b610655320c57ae775046427be6eea246f99b468 Binary files /dev/null and b/dataset/audio/p268/p268_021.wav differ diff --git a/dataset/audio/p269/p269_332.wav b/dataset/audio/p269/p269_332.wav new file mode 100644 index 0000000000000000000000000000000000000000..4d3ddaf59bbaad831755b817df3e770bfb9a1a8a Binary files /dev/null and b/dataset/audio/p269/p269_332.wav differ diff --git a/dataset/audio/p270/p270_297.wav b/dataset/audio/p270/p270_297.wav new file mode 100644 index 0000000000000000000000000000000000000000..b1186069c62705f093d7ed06fc9f4465caa0d48b Binary files /dev/null and b/dataset/audio/p270/p270_297.wav differ diff --git a/dataset/audio/p271/p271_170.wav b/dataset/audio/p271/p271_170.wav new file mode 100644 index 0000000000000000000000000000000000000000..78610a35f95e8b5f57a7289eaa6a05940b8cc6c3 Binary files /dev/null and b/dataset/audio/p271/p271_170.wav differ diff --git a/dataset/audio/p272/p272_257.wav b/dataset/audio/p272/p272_257.wav new file mode 100644 index 0000000000000000000000000000000000000000..a8270c9a4ca6ebe07ed47d28c0364019b2a99ef2 Binary files /dev/null and 
b/dataset/audio/p272/p272_257.wav differ diff --git a/dataset/audio/p273/p273_023.wav b/dataset/audio/p273/p273_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..edbc039fee439fcee79d2412b48e2de66f78f86e Binary files /dev/null and b/dataset/audio/p273/p273_023.wav differ diff --git a/dataset/audio/p274/p274_296.wav b/dataset/audio/p274/p274_296.wav new file mode 100644 index 0000000000000000000000000000000000000000..b8c026d3976cb7e2120750b2e23a768ef7924f21 Binary files /dev/null and b/dataset/audio/p274/p274_296.wav differ diff --git a/dataset/audio/p275/p275_023.wav b/dataset/audio/p275/p275_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..2fdea9fe396fff1f944fac16020e11558359ab86 Binary files /dev/null and b/dataset/audio/p275/p275_023.wav differ diff --git a/dataset/audio/p276/p276_023.wav b/dataset/audio/p276/p276_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..af013e60fafce2d763e9bea445cd5395eaf288fb Binary files /dev/null and b/dataset/audio/p276/p276_023.wav differ diff --git a/dataset/audio/p277/p277_023.wav b/dataset/audio/p277/p277_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..1db692e7fd1d463a231893d04119a0c245a1bb13 Binary files /dev/null and b/dataset/audio/p277/p277_023.wav differ diff --git a/dataset/audio/p278/p278_023.wav b/dataset/audio/p278/p278_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..43f92cbd3da0a455d9daf20aae9864eebf8c6bd7 Binary files /dev/null and b/dataset/audio/p278/p278_023.wav differ diff --git a/dataset/audio/p279/p279_003.wav b/dataset/audio/p279/p279_003.wav new file mode 100644 index 0000000000000000000000000000000000000000..650026998093ce1f324d984cce6293857c6a6ad1 Binary files /dev/null and b/dataset/audio/p279/p279_003.wav differ diff --git a/dataset/audio/p281/p281_022.wav b/dataset/audio/p281/p281_022.wav new file mode 100644 index 
0000000000000000000000000000000000000000..579f836defa2517933d1c486cbc2b817c5942b48 Binary files /dev/null and b/dataset/audio/p281/p281_022.wav differ diff --git a/dataset/audio/p282/p282_023.wav b/dataset/audio/p282/p282_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..513e83a45b90ba8fa6ddc9d147864b6e4c13e701 Binary files /dev/null and b/dataset/audio/p282/p282_023.wav differ diff --git a/dataset/audio/p283/p283_023.wav b/dataset/audio/p283/p283_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..303a4b3919bcf8d6b59130bd32df4c1416ebbdec Binary files /dev/null and b/dataset/audio/p283/p283_023.wav differ diff --git a/dataset/audio/p284/p284_365.wav b/dataset/audio/p284/p284_365.wav new file mode 100644 index 0000000000000000000000000000000000000000..9a6e2c739c25650dee581b770bb7e126dd8fb14d Binary files /dev/null and b/dataset/audio/p284/p284_365.wav differ diff --git a/dataset/audio/p285/p285_364.wav b/dataset/audio/p285/p285_364.wav new file mode 100644 index 0000000000000000000000000000000000000000..537837b3dfc90eef92dc8485fd6a2d0764822a58 Binary files /dev/null and b/dataset/audio/p285/p285_364.wav differ diff --git a/dataset/audio/p286/p286_160.wav b/dataset/audio/p286/p286_160.wav new file mode 100644 index 0000000000000000000000000000000000000000..4a76bb035855f425cb083cb949a1825568dbff84 Binary files /dev/null and b/dataset/audio/p286/p286_160.wav differ diff --git a/dataset/audio/p287/p287_023.wav b/dataset/audio/p287/p287_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..78dc59dc90bddbc8f74c6210f485b776a9bfd89f Binary files /dev/null and b/dataset/audio/p287/p287_023.wav differ diff --git a/dataset/audio/p288/p288_255.wav b/dataset/audio/p288/p288_255.wav new file mode 100644 index 0000000000000000000000000000000000000000..8e8bd385592a17bfab5c7d7e6d800af539be730f Binary files /dev/null and b/dataset/audio/p288/p288_255.wav differ diff --git a/dataset/audio/p292/p292_288.wav 
b/dataset/audio/p292/p292_288.wav new file mode 100644 index 0000000000000000000000000000000000000000..a5b07643eeda6533b83998e45bbf25453f03589c Binary files /dev/null and b/dataset/audio/p292/p292_288.wav differ diff --git a/dataset/audio/p293/p293_023.wav b/dataset/audio/p293/p293_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..4c9797165e72a2d3ca1269e2654834393f64c7da Binary files /dev/null and b/dataset/audio/p293/p293_023.wav differ diff --git a/dataset/audio/p294/p294_016.wav b/dataset/audio/p294/p294_016.wav new file mode 100644 index 0000000000000000000000000000000000000000..2bc1d61a8274e9166bb1d3463af50548032f671b Binary files /dev/null and b/dataset/audio/p294/p294_016.wav differ diff --git a/dataset/audio/p295/p295_022.wav b/dataset/audio/p295/p295_022.wav new file mode 100644 index 0000000000000000000000000000000000000000..b917b7e29732ab6936412b9213f028e076fbc867 Binary files /dev/null and b/dataset/audio/p295/p295_022.wav differ diff --git a/dataset/audio/p297/p297_023.wav b/dataset/audio/p297/p297_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..0ed3d82508420cdf32b21b9a1da5f0012aee3449 Binary files /dev/null and b/dataset/audio/p297/p297_023.wav differ diff --git a/dataset/audio/p298/p298_344.wav b/dataset/audio/p298/p298_344.wav new file mode 100644 index 0000000000000000000000000000000000000000..53d1e13ead849d75d9b3aed1cb58627dd39a6c83 Binary files /dev/null and b/dataset/audio/p298/p298_344.wav differ diff --git a/dataset/audio/p299/p299_213.wav b/dataset/audio/p299/p299_213.wav new file mode 100644 index 0000000000000000000000000000000000000000..0f492c792003be4070a1ecd17dbeb77c9968cc70 Binary files /dev/null and b/dataset/audio/p299/p299_213.wav differ diff --git a/dataset/audio/p300/p300_021.wav b/dataset/audio/p300/p300_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..5de70cf1ad499056d1fd15b84c6795fb9e36454d Binary files /dev/null and 
b/dataset/audio/p300/p300_021.wav differ diff --git a/dataset/audio/p301/p301_023.wav b/dataset/audio/p301/p301_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..99b70abf6e6cf6dfb2f69499ecccdbcebf1cb3c8 Binary files /dev/null and b/dataset/audio/p301/p301_023.wav differ diff --git a/dataset/audio/p302/p302_023.wav b/dataset/audio/p302/p302_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..15480603f8c88f845c642c373cc8900853ce07f4 Binary files /dev/null and b/dataset/audio/p302/p302_023.wav differ diff --git a/dataset/audio/p303/p303_023.wav b/dataset/audio/p303/p303_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..d1704ca780ba0fb351821a1a4d47384223a8621f Binary files /dev/null and b/dataset/audio/p303/p303_023.wav differ diff --git a/dataset/audio/p304/p304_078.wav b/dataset/audio/p304/p304_078.wav new file mode 100644 index 0000000000000000000000000000000000000000..150fcb4d90957fbfb00f9f859a57fd1714663632 Binary files /dev/null and b/dataset/audio/p304/p304_078.wav differ diff --git a/dataset/audio/p305/p305_188.wav b/dataset/audio/p305/p305_188.wav new file mode 100644 index 0000000000000000000000000000000000000000..93dfd08c9d33b366e6c1eaa885f65b609f21d8cb Binary files /dev/null and b/dataset/audio/p305/p305_188.wav differ diff --git a/dataset/audio/p306/p306_021.wav b/dataset/audio/p306/p306_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..2a831aff030166c3619b9ea0e5c9f87c55833961 Binary files /dev/null and b/dataset/audio/p306/p306_021.wav differ diff --git a/dataset/audio/p307/p307_023.wav b/dataset/audio/p307/p307_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..85405ce060b952c3d1deb0a00d9f9db51592e56d Binary files /dev/null and b/dataset/audio/p307/p307_023.wav differ diff --git a/dataset/audio/p308/p308_328.wav b/dataset/audio/p308/p308_328.wav new file mode 100644 index 
0000000000000000000000000000000000000000..173c7d2feb54f062d89a004ccd24408942106c2e Binary files /dev/null and b/dataset/audio/p308/p308_328.wav differ diff --git a/dataset/audio/p310/p310_023.wav b/dataset/audio/p310/p310_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..92e6bbf794391c31a1880c64c96535b080afd052 Binary files /dev/null and b/dataset/audio/p310/p310_023.wav differ diff --git a/dataset/audio/p311/p311_008.wav b/dataset/audio/p311/p311_008.wav new file mode 100644 index 0000000000000000000000000000000000000000..45b528e4a7fe52278552afc2447706e822b82643 Binary files /dev/null and b/dataset/audio/p311/p311_008.wav differ diff --git a/dataset/audio/p312/p312_185.wav b/dataset/audio/p312/p312_185.wav new file mode 100644 index 0000000000000000000000000000000000000000..d6483b912d739de44ceddce4db979c3e745c9997 Binary files /dev/null and b/dataset/audio/p312/p312_185.wav differ diff --git a/dataset/audio/p313/p313_011.wav b/dataset/audio/p313/p313_011.wav new file mode 100644 index 0000000000000000000000000000000000000000..62006129e35f613ee880fe1b44108f6abfafa5d7 Binary files /dev/null and b/dataset/audio/p313/p313_011.wav differ diff --git a/dataset/audio/p314/p314_323.wav b/dataset/audio/p314/p314_323.wav new file mode 100644 index 0000000000000000000000000000000000000000..e3c59ef4a3caa9ae1d75d35076d9e0a6f21af044 Binary files /dev/null and b/dataset/audio/p314/p314_323.wav differ diff --git a/dataset/audio/p316/p316_011.wav b/dataset/audio/p316/p316_011.wav new file mode 100644 index 0000000000000000000000000000000000000000..b5070ef99049bb758aaf799f1042d7a1c9724d25 Binary files /dev/null and b/dataset/audio/p316/p316_011.wav differ diff --git a/dataset/audio/p317/p317_021.wav b/dataset/audio/p317/p317_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..fbd96edd30450386cf4f222fb51131f663c75abd Binary files /dev/null and b/dataset/audio/p317/p317_021.wav differ diff --git a/dataset/audio/p318/p318_023.wav 
b/dataset/audio/p318/p318_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..35a916cd221e942e3a255c057c87f3322ab2db8a Binary files /dev/null and b/dataset/audio/p318/p318_023.wav differ diff --git a/dataset/audio/p323/p323_023.wav b/dataset/audio/p323/p323_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..e8b64e88377ad7cc98ea95b92946edc182906ae2 Binary files /dev/null and b/dataset/audio/p323/p323_023.wav differ diff --git a/dataset/audio/p326/p326_022.wav b/dataset/audio/p326/p326_022.wav new file mode 100644 index 0000000000000000000000000000000000000000..c119d4676419045b9d27cfb9e795dcec0dd55198 Binary files /dev/null and b/dataset/audio/p326/p326_022.wav differ diff --git a/dataset/audio/p329/p329_021.wav b/dataset/audio/p329/p329_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..b0df23d1c155de038a9da6a0e14988c4e0f7fcad Binary files /dev/null and b/dataset/audio/p329/p329_021.wav differ diff --git a/dataset/audio/p330/p330_008.wav b/dataset/audio/p330/p330_008.wav new file mode 100644 index 0000000000000000000000000000000000000000..80e95ec5028472d730a6cc9c1f505c2da6430927 Binary files /dev/null and b/dataset/audio/p330/p330_008.wav differ diff --git a/dataset/audio/p333/p333_023.wav b/dataset/audio/p333/p333_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..a7faa2d20e497d0b76537d0f6b7442243d3fab37 Binary files /dev/null and b/dataset/audio/p333/p333_023.wav differ diff --git a/dataset/audio/p334/p334_184.wav b/dataset/audio/p334/p334_184.wav new file mode 100644 index 0000000000000000000000000000000000000000..f2c85418eb7ab551c644601fa2ae5139c1151415 Binary files /dev/null and b/dataset/audio/p334/p334_184.wav differ diff --git a/dataset/audio/p335/p335_120.wav b/dataset/audio/p335/p335_120.wav new file mode 100644 index 0000000000000000000000000000000000000000..8570c4fc241c17706ee6997a89fe439c3b9e0f6f Binary files /dev/null and 
b/dataset/audio/p335/p335_120.wav differ diff --git a/dataset/audio/p336/p336_021.wav b/dataset/audio/p336/p336_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..f9b0fa3824e4a2edd3b8549936a7764d5b9e3d1c Binary files /dev/null and b/dataset/audio/p336/p336_021.wav differ diff --git a/dataset/audio/p339/p339_021.wav b/dataset/audio/p339/p339_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..a597fc9c47a2a21594174de5a38c223efbb59cb6 Binary files /dev/null and b/dataset/audio/p339/p339_021.wav differ diff --git a/dataset/audio/p340/p340_021.wav b/dataset/audio/p340/p340_021.wav new file mode 100644 index 0000000000000000000000000000000000000000..842b621d3353af15d45c10deffe478924659e0bf Binary files /dev/null and b/dataset/audio/p340/p340_021.wav differ diff --git a/dataset/audio/p341/p341_019.wav b/dataset/audio/p341/p341_019.wav new file mode 100644 index 0000000000000000000000000000000000000000..9ad74414300e63ef58eab43af78be7854fae9e8d Binary files /dev/null and b/dataset/audio/p341/p341_019.wav differ diff --git a/dataset/audio/p343/p343_023.wav b/dataset/audio/p343/p343_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..3a593427cf4a132b0935f9c58076095633e114f8 Binary files /dev/null and b/dataset/audio/p343/p343_023.wav differ diff --git a/dataset/audio/p345/p345_023.wav b/dataset/audio/p345/p345_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..f5fe517292fb2a719bdd89f7331e7eeffedd34fc Binary files /dev/null and b/dataset/audio/p345/p345_023.wav differ diff --git a/dataset/audio/p347/p347_011.wav b/dataset/audio/p347/p347_011.wav new file mode 100644 index 0000000000000000000000000000000000000000..bb195ed266da3fc8e8c449c8572876f995467e2c Binary files /dev/null and b/dataset/audio/p347/p347_011.wav differ diff --git a/dataset/audio/p351/p351_023.wav b/dataset/audio/p351/p351_023.wav new file mode 100644 index 
0000000000000000000000000000000000000000..f7805315d345c3b0ac57253c1773ac5e96331069 Binary files /dev/null and b/dataset/audio/p351/p351_023.wav differ diff --git a/dataset/audio/p360/p360_023.wav b/dataset/audio/p360/p360_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..6107c1cfc24199cc4e72b487666a4bc7cb640a7c Binary files /dev/null and b/dataset/audio/p360/p360_023.wav differ diff --git a/dataset/audio/p361/p361_023.wav b/dataset/audio/p361/p361_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..19be387b0bee6afe48e7d1717a341bcce357777f Binary files /dev/null and b/dataset/audio/p361/p361_023.wav differ diff --git a/dataset/audio/p362/p362_022.wav b/dataset/audio/p362/p362_022.wav new file mode 100644 index 0000000000000000000000000000000000000000..f72ef8004087d372396dd503cd723cf040513d5d Binary files /dev/null and b/dataset/audio/p362/p362_022.wav differ diff --git a/dataset/audio/p363/p363_023.wav b/dataset/audio/p363/p363_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..1816ff51d87ac4efdfd345a9d5b55892dab1dd72 Binary files /dev/null and b/dataset/audio/p363/p363_023.wav differ diff --git a/dataset/audio/p364/p364_023.wav b/dataset/audio/p364/p364_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..2547a3f9235773cd8de30d19b2c4bed9bb0026de Binary files /dev/null and b/dataset/audio/p364/p364_023.wav differ diff --git a/dataset/audio/p374/p374_023.wav b/dataset/audio/p374/p374_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..eb4cf6ce38a83214ae1dc8401712c4bee8304978 Binary files /dev/null and b/dataset/audio/p374/p374_023.wav differ diff --git a/dataset/audio/p376/p376_023.wav b/dataset/audio/p376/p376_023.wav new file mode 100644 index 0000000000000000000000000000000000000000..94cf36834096f650fdcfcfab6e7d8f502e9a7637 Binary files /dev/null and b/dataset/audio/p376/p376_023.wav differ diff --git a/dataset/spk/p225/p225_220.npy 
b/dataset/spk/p225/p225_220.npy new file mode 100644 index 0000000000000000000000000000000000000000..71d5cd852f23f30fe9a4c2608fdd0de5a131b9fb Binary files /dev/null and b/dataset/spk/p225/p225_220.npy differ diff --git a/dataset/spk/p226/p226_341.npy b/dataset/spk/p226/p226_341.npy new file mode 100644 index 0000000000000000000000000000000000000000..7a68dcf8868c7f5bd3c5a89dc04cf087006af1d6 Binary files /dev/null and b/dataset/spk/p226/p226_341.npy differ diff --git a/dataset/spk/p227/p227_021.npy b/dataset/spk/p227/p227_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..46f5187a2dbb5bd2df255a7475dc7932dc35d81c Binary files /dev/null and b/dataset/spk/p227/p227_021.npy differ diff --git a/dataset/spk/p228/p228_242.npy b/dataset/spk/p228/p228_242.npy new file mode 100644 index 0000000000000000000000000000000000000000..6d7aeef00617d1e7166e89badc77e8adbde0a35a Binary files /dev/null and b/dataset/spk/p228/p228_242.npy differ diff --git a/dataset/spk/p229/p229_021.npy b/dataset/spk/p229/p229_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..2dd3ed874ad9f0307dc21a37f2d81ebb8e3be4f6 Binary files /dev/null and b/dataset/spk/p229/p229_021.npy differ diff --git a/dataset/spk/p230/p230_361.npy b/dataset/spk/p230/p230_361.npy new file mode 100644 index 0000000000000000000000000000000000000000..bd698c585582b31277f2751d6cffe8f94274d324 Binary files /dev/null and b/dataset/spk/p230/p230_361.npy differ diff --git a/dataset/spk/p231/p231_197.npy b/dataset/spk/p231/p231_197.npy new file mode 100644 index 0000000000000000000000000000000000000000..3448b4531c5480902ac81131599cd61af4e67ca6 Binary files /dev/null and b/dataset/spk/p231/p231_197.npy differ diff --git a/dataset/spk/p232/p232_023.npy b/dataset/spk/p232/p232_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..0b0e27ca86a1da1df602e3dd83e15be6acce1d87 Binary files /dev/null and b/dataset/spk/p232/p232_023.npy differ diff --git 
a/dataset/spk/p233/p233_323.npy b/dataset/spk/p233/p233_323.npy new file mode 100644 index 0000000000000000000000000000000000000000..0b7621fa0afe94c47d1ef094988dad55a97a9c55 Binary files /dev/null and b/dataset/spk/p233/p233_323.npy differ diff --git a/dataset/spk/p234/p234_229.npy b/dataset/spk/p234/p234_229.npy new file mode 100644 index 0000000000000000000000000000000000000000..3372ec12a4a70453f6b03fee0b868b27fd367599 Binary files /dev/null and b/dataset/spk/p234/p234_229.npy differ diff --git a/dataset/spk/p236/p236_068.npy b/dataset/spk/p236/p236_068.npy new file mode 100644 index 0000000000000000000000000000000000000000..e09e23ddc9c56fa91bb4207f8d6cff0b6c24a553 Binary files /dev/null and b/dataset/spk/p236/p236_068.npy differ diff --git a/dataset/spk/p237/p237_023.npy b/dataset/spk/p237/p237_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..79198b102e958ca49eb5e0bd29fc78f880508758 Binary files /dev/null and b/dataset/spk/p237/p237_023.npy differ diff --git a/dataset/spk/p238/p238_023.npy b/dataset/spk/p238/p238_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..e5a1f4e1f6326b828fb7771428d3078d6ee644a8 Binary files /dev/null and b/dataset/spk/p238/p238_023.npy differ diff --git a/dataset/spk/p239/p239_023.npy b/dataset/spk/p239/p239_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..a10981a12262e20118ef2d3a673de5def645b98c Binary files /dev/null and b/dataset/spk/p239/p239_023.npy differ diff --git a/dataset/spk/p240/p240_004.npy b/dataset/spk/p240/p240_004.npy new file mode 100644 index 0000000000000000000000000000000000000000..1f8c0a6b6f262c5e63581009da5f47cdffd839ad Binary files /dev/null and b/dataset/spk/p240/p240_004.npy differ diff --git a/dataset/spk/p241/p241_050.npy b/dataset/spk/p241/p241_050.npy new file mode 100644 index 0000000000000000000000000000000000000000..f79cf9e83c7cedfdf881b66c48c2204eaff8480d Binary files /dev/null and b/dataset/spk/p241/p241_050.npy differ 
diff --git a/dataset/spk/p243/p243_087.npy b/dataset/spk/p243/p243_087.npy new file mode 100644 index 0000000000000000000000000000000000000000..8797080906ba7ceeeee9f765d9a4e7aa63d91385 Binary files /dev/null and b/dataset/spk/p243/p243_087.npy differ diff --git a/dataset/spk/p244/p244_008.npy b/dataset/spk/p244/p244_008.npy new file mode 100644 index 0000000000000000000000000000000000000000..c7e0f5eb7d067ec768efa1e36e980dd6d4dabe2a Binary files /dev/null and b/dataset/spk/p244/p244_008.npy differ diff --git a/dataset/spk/p245/p245_014.npy b/dataset/spk/p245/p245_014.npy new file mode 100644 index 0000000000000000000000000000000000000000..d392427903cc813ccbc030023cac03d6362b7220 Binary files /dev/null and b/dataset/spk/p245/p245_014.npy differ diff --git a/dataset/spk/p246/p246_022.npy b/dataset/spk/p246/p246_022.npy new file mode 100644 index 0000000000000000000000000000000000000000..eaedca15ff6045c2024f59085e88cbd08b91eda4 Binary files /dev/null and b/dataset/spk/p246/p246_022.npy differ diff --git a/dataset/spk/p247/p247_380.npy b/dataset/spk/p247/p247_380.npy new file mode 100644 index 0000000000000000000000000000000000000000..4902baf91d7b4d45f96e2f08c1a39ee15681e117 Binary files /dev/null and b/dataset/spk/p247/p247_380.npy differ diff --git a/dataset/spk/p248/p248_023.npy b/dataset/spk/p248/p248_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..efb8a61a4435b1cdecce650c2357f263568080e0 Binary files /dev/null and b/dataset/spk/p248/p248_023.npy differ diff --git a/dataset/spk/p249/p249_223.npy b/dataset/spk/p249/p249_223.npy new file mode 100644 index 0000000000000000000000000000000000000000..654b1c3b47408435319a92f6b69d3b89b16e0c38 Binary files /dev/null and b/dataset/spk/p249/p249_223.npy differ diff --git a/dataset/spk/p250/p250_021.npy b/dataset/spk/p250/p250_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..ae08f921d9d692f24d92e5a65a8a5cd5aa6320b7 Binary files /dev/null and 
b/dataset/spk/p250/p250_021.npy differ diff --git a/dataset/spk/p251/p251_364.npy b/dataset/spk/p251/p251_364.npy new file mode 100644 index 0000000000000000000000000000000000000000..ba77f587aab6d3b3fa7bed2f93a57974a058cf66 Binary files /dev/null and b/dataset/spk/p251/p251_364.npy differ diff --git a/dataset/spk/p252/p252_023.npy b/dataset/spk/p252/p252_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..452910de9a78b26505835d0695d6d257c292990d Binary files /dev/null and b/dataset/spk/p252/p252_023.npy differ diff --git a/dataset/spk/p253/p253_207.npy b/dataset/spk/p253/p253_207.npy new file mode 100644 index 0000000000000000000000000000000000000000..3a3c71f8bd8f11d3e45b280bdd55cb121bb79035 Binary files /dev/null and b/dataset/spk/p253/p253_207.npy differ diff --git a/dataset/spk/p254/p254_023.npy b/dataset/spk/p254/p254_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..cb3ff1306dd08b22831ed1d6d37042e87967106e Binary files /dev/null and b/dataset/spk/p254/p254_023.npy differ diff --git a/dataset/spk/p255/p255_038.npy b/dataset/spk/p255/p255_038.npy new file mode 100644 index 0000000000000000000000000000000000000000..c54a6df14ce603b764bf991ba0f29efdb1219a70 Binary files /dev/null and b/dataset/spk/p255/p255_038.npy differ diff --git a/dataset/spk/p256/p256_079.npy b/dataset/spk/p256/p256_079.npy new file mode 100644 index 0000000000000000000000000000000000000000..6ed865fac7c010cb8485e653448e0256c6994da6 Binary files /dev/null and b/dataset/spk/p256/p256_079.npy differ diff --git a/dataset/spk/p257/p257_023.npy b/dataset/spk/p257/p257_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..41686fe8cd1bf6fabfa6b50954b74fac0907e30a Binary files /dev/null and b/dataset/spk/p257/p257_023.npy differ diff --git a/dataset/spk/p258/p258_228.npy b/dataset/spk/p258/p258_228.npy new file mode 100644 index 0000000000000000000000000000000000000000..3ff2edf577fe8ec39a1261773075402d523d2ff6 Binary files 
/dev/null and b/dataset/spk/p258/p258_228.npy differ diff --git a/dataset/spk/p259/p259_011.npy b/dataset/spk/p259/p259_011.npy new file mode 100644 index 0000000000000000000000000000000000000000..2ce0445cca1c2d022398d38ac7f82bea3d47d6d1 Binary files /dev/null and b/dataset/spk/p259/p259_011.npy differ diff --git a/dataset/spk/p260/p260_103.npy b/dataset/spk/p260/p260_103.npy new file mode 100644 index 0000000000000000000000000000000000000000..925da955d031fd5cc5d90023853853234228ef33 Binary files /dev/null and b/dataset/spk/p260/p260_103.npy differ diff --git a/dataset/spk/p261/p261_023.npy b/dataset/spk/p261/p261_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..068f0d8b2783dc8696372e1b2d980fa9dc0c6fc7 Binary files /dev/null and b/dataset/spk/p261/p261_023.npy differ diff --git a/dataset/spk/p262/p262_210.npy b/dataset/spk/p262/p262_210.npy new file mode 100644 index 0000000000000000000000000000000000000000..5f5e4b4ea7238b5b413195da4ff429007d3acb2a Binary files /dev/null and b/dataset/spk/p262/p262_210.npy differ diff --git a/dataset/spk/p263/p263_218.npy b/dataset/spk/p263/p263_218.npy new file mode 100644 index 0000000000000000000000000000000000000000..216bc22e2f5cb3dcce762a0c7c397040aa2b2d1a Binary files /dev/null and b/dataset/spk/p263/p263_218.npy differ diff --git a/dataset/spk/p264/p264_438.npy b/dataset/spk/p264/p264_438.npy new file mode 100644 index 0000000000000000000000000000000000000000..1c40edf3850ce93384d35fd604a26a9f228ec1ac Binary files /dev/null and b/dataset/spk/p264/p264_438.npy differ diff --git a/dataset/spk/p265/p265_273.npy b/dataset/spk/p265/p265_273.npy new file mode 100644 index 0000000000000000000000000000000000000000..1d3b9d8ffa50de887cb95d6f2a86fae07e7d3dbf Binary files /dev/null and b/dataset/spk/p265/p265_273.npy differ diff --git a/dataset/spk/p266/p266_417.npy b/dataset/spk/p266/p266_417.npy new file mode 100644 index 0000000000000000000000000000000000000000..afd442b226efc34ba7def7a020f7f976545fe269 
Binary files /dev/null and b/dataset/spk/p266/p266_417.npy differ diff --git a/dataset/spk/p267/p267_022.npy b/dataset/spk/p267/p267_022.npy new file mode 100644 index 0000000000000000000000000000000000000000..034c913e26c59265bd5206fb5e656682c777e3b2 Binary files /dev/null and b/dataset/spk/p267/p267_022.npy differ diff --git a/dataset/spk/p268/p268_021.npy b/dataset/spk/p268/p268_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..e46d9d96fe72a4fa6b2b84ad194bc437895d45dc Binary files /dev/null and b/dataset/spk/p268/p268_021.npy differ diff --git a/dataset/spk/p269/p269_332.npy b/dataset/spk/p269/p269_332.npy new file mode 100644 index 0000000000000000000000000000000000000000..447b0fad35647f4fbbe7e2fed1d907d4e2b0ed5b Binary files /dev/null and b/dataset/spk/p269/p269_332.npy differ diff --git a/dataset/spk/p270/p270_297.npy b/dataset/spk/p270/p270_297.npy new file mode 100644 index 0000000000000000000000000000000000000000..764bd55854689d1a17ad7c0199139596178c9136 Binary files /dev/null and b/dataset/spk/p270/p270_297.npy differ diff --git a/dataset/spk/p271/p271_170.npy b/dataset/spk/p271/p271_170.npy new file mode 100644 index 0000000000000000000000000000000000000000..0755fd18f7c20cb2640c54007d0a142810c63fe4 Binary files /dev/null and b/dataset/spk/p271/p271_170.npy differ diff --git a/dataset/spk/p272/p272_257.npy b/dataset/spk/p272/p272_257.npy new file mode 100644 index 0000000000000000000000000000000000000000..0a1112b7a1c7e621dc49b5f79d3168207f85d507 Binary files /dev/null and b/dataset/spk/p272/p272_257.npy differ diff --git a/dataset/spk/p273/p273_023.npy b/dataset/spk/p273/p273_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..98502d2417ad75cc47c4713a63e4c401ef29f6e9 Binary files /dev/null and b/dataset/spk/p273/p273_023.npy differ diff --git a/dataset/spk/p274/p274_296.npy b/dataset/spk/p274/p274_296.npy new file mode 100644 index 
0000000000000000000000000000000000000000..ca3217cf6e219fcbb7ef2af08380dcd15c6e2442 Binary files /dev/null and b/dataset/spk/p274/p274_296.npy differ diff --git a/dataset/spk/p275/p275_023.npy b/dataset/spk/p275/p275_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..b0b015b8fbcbec653146b96b13fb03d538608db7 Binary files /dev/null and b/dataset/spk/p275/p275_023.npy differ diff --git a/dataset/spk/p276/p276_023.npy b/dataset/spk/p276/p276_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..9066934d7eee7ffc35be475d699b509593866c35 Binary files /dev/null and b/dataset/spk/p276/p276_023.npy differ diff --git a/dataset/spk/p277/p277_023.npy b/dataset/spk/p277/p277_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..aa4d49ab3b1866756f9fb2fc08a8d82423ee43b7 Binary files /dev/null and b/dataset/spk/p277/p277_023.npy differ diff --git a/dataset/spk/p278/p278_023.npy b/dataset/spk/p278/p278_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..8f6afcf931b955ac14b3ec1fc5cdcffe1bf10371 Binary files /dev/null and b/dataset/spk/p278/p278_023.npy differ diff --git a/dataset/spk/p279/p279_003.npy b/dataset/spk/p279/p279_003.npy new file mode 100644 index 0000000000000000000000000000000000000000..9c056fd3df908355cde1e182323f975d119f38cd Binary files /dev/null and b/dataset/spk/p279/p279_003.npy differ diff --git a/dataset/spk/p281/p281_022.npy b/dataset/spk/p281/p281_022.npy new file mode 100644 index 0000000000000000000000000000000000000000..05e41856a9e700c84683c8d14325835ad79ad24b Binary files /dev/null and b/dataset/spk/p281/p281_022.npy differ diff --git a/dataset/spk/p282/p282_023.npy b/dataset/spk/p282/p282_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..e08a0582e5cab9956f0471baf74b9b350badaaa9 Binary files /dev/null and b/dataset/spk/p282/p282_023.npy differ diff --git a/dataset/spk/p283/p283_023.npy b/dataset/spk/p283/p283_023.npy new file mode 
100644 index 0000000000000000000000000000000000000000..64c202346792b818f175e34e1ffc0fa1a430591e Binary files /dev/null and b/dataset/spk/p283/p283_023.npy differ diff --git a/dataset/spk/p284/p284_365.npy b/dataset/spk/p284/p284_365.npy new file mode 100644 index 0000000000000000000000000000000000000000..5820f48b175d2103f10569dcc6f9f78b27d0c5f8 Binary files /dev/null and b/dataset/spk/p284/p284_365.npy differ diff --git a/dataset/spk/p285/p285_364.npy b/dataset/spk/p285/p285_364.npy new file mode 100644 index 0000000000000000000000000000000000000000..5111cbaa1ec998622d7bdf76129b79052fb499b6 Binary files /dev/null and b/dataset/spk/p285/p285_364.npy differ diff --git a/dataset/spk/p286/p286_160.npy b/dataset/spk/p286/p286_160.npy new file mode 100644 index 0000000000000000000000000000000000000000..5f3875f536dfcf23bfeb0f51a8d3cb960ed0ee48 Binary files /dev/null and b/dataset/spk/p286/p286_160.npy differ diff --git a/dataset/spk/p287/p287_023.npy b/dataset/spk/p287/p287_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..f02d559857af4526e8f4163ef17e48fd2e623d30 Binary files /dev/null and b/dataset/spk/p287/p287_023.npy differ diff --git a/dataset/spk/p288/p288_255.npy b/dataset/spk/p288/p288_255.npy new file mode 100644 index 0000000000000000000000000000000000000000..1d591c4ca3577841e27e5af70c33b6fa064013d2 Binary files /dev/null and b/dataset/spk/p288/p288_255.npy differ diff --git a/dataset/spk/p292/p292_288.npy b/dataset/spk/p292/p292_288.npy new file mode 100644 index 0000000000000000000000000000000000000000..aff4d0b91d7f70af930dc0cd5a87973635212456 Binary files /dev/null and b/dataset/spk/p292/p292_288.npy differ diff --git a/dataset/spk/p293/p293_023.npy b/dataset/spk/p293/p293_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..8f587fb5224e571b16bb7000e2999985ca54d73b Binary files /dev/null and b/dataset/spk/p293/p293_023.npy differ diff --git a/dataset/spk/p294/p294_016.npy b/dataset/spk/p294/p294_016.npy new 
file mode 100644 index 0000000000000000000000000000000000000000..1dedde498666ffc336df1ce17de0480119f0211b Binary files /dev/null and b/dataset/spk/p294/p294_016.npy differ diff --git a/dataset/spk/p295/p295_022.npy b/dataset/spk/p295/p295_022.npy new file mode 100644 index 0000000000000000000000000000000000000000..8f54893aebff003b3c3409fd44a20cc238b735b3 Binary files /dev/null and b/dataset/spk/p295/p295_022.npy differ diff --git a/dataset/spk/p297/p297_023.npy b/dataset/spk/p297/p297_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..1dcf5c2b6fb9ce1e6f24ca31f05eb2c832e99c5a Binary files /dev/null and b/dataset/spk/p297/p297_023.npy differ diff --git a/dataset/spk/p298/p298_344.npy b/dataset/spk/p298/p298_344.npy new file mode 100644 index 0000000000000000000000000000000000000000..01eb9d15015ee0d05c53d832e6166d21755e5dc8 Binary files /dev/null and b/dataset/spk/p298/p298_344.npy differ diff --git a/dataset/spk/p299/p299_213.npy b/dataset/spk/p299/p299_213.npy new file mode 100644 index 0000000000000000000000000000000000000000..4a80f9c4255927445f7fa818134aafb00c36e12e Binary files /dev/null and b/dataset/spk/p299/p299_213.npy differ diff --git a/dataset/spk/p300/p300_021.npy b/dataset/spk/p300/p300_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..9564083590d3985a9f4eed3d3a187115231330a4 Binary files /dev/null and b/dataset/spk/p300/p300_021.npy differ diff --git a/dataset/spk/p301/p301_023.npy b/dataset/spk/p301/p301_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..a408f06f4802e165a598e31bdae7d2e6dacdf843 Binary files /dev/null and b/dataset/spk/p301/p301_023.npy differ diff --git a/dataset/spk/p302/p302_023.npy b/dataset/spk/p302/p302_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..b40efe7742256476bf749dcb832795e40450486c Binary files /dev/null and b/dataset/spk/p302/p302_023.npy differ diff --git a/dataset/spk/p303/p303_023.npy 
b/dataset/spk/p303/p303_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..ec2286c16b7c09f59a3150e8f3220945514482a7 Binary files /dev/null and b/dataset/spk/p303/p303_023.npy differ diff --git a/dataset/spk/p304/p304_078.npy b/dataset/spk/p304/p304_078.npy new file mode 100644 index 0000000000000000000000000000000000000000..1cdc2c825eff1f6d612b02450e36826cfa923679 Binary files /dev/null and b/dataset/spk/p304/p304_078.npy differ diff --git a/dataset/spk/p305/p305_188.npy b/dataset/spk/p305/p305_188.npy new file mode 100644 index 0000000000000000000000000000000000000000..825c30d3b76388158bfebb51dc821843425357de Binary files /dev/null and b/dataset/spk/p305/p305_188.npy differ diff --git a/dataset/spk/p306/p306_021.npy b/dataset/spk/p306/p306_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..217b5f4a523f28cb1ebdf86b42decdecee34d5d3 Binary files /dev/null and b/dataset/spk/p306/p306_021.npy differ diff --git a/dataset/spk/p307/p307_023.npy b/dataset/spk/p307/p307_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..66aa4551e1ae2b737ddd47917c6173fa4cae806f Binary files /dev/null and b/dataset/spk/p307/p307_023.npy differ diff --git a/dataset/spk/p308/p308_328.npy b/dataset/spk/p308/p308_328.npy new file mode 100644 index 0000000000000000000000000000000000000000..8b43ee9f0a638d8c4c0a67897f385f34e2037175 Binary files /dev/null and b/dataset/spk/p308/p308_328.npy differ diff --git a/dataset/spk/p310/p310_023.npy b/dataset/spk/p310/p310_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..85b895f7c093d89085f5326d960aebec5564395c Binary files /dev/null and b/dataset/spk/p310/p310_023.npy differ diff --git a/dataset/spk/p311/p311_008.npy b/dataset/spk/p311/p311_008.npy new file mode 100644 index 0000000000000000000000000000000000000000..03b35abddf279a235bae1f642b85349078a042ea Binary files /dev/null and b/dataset/spk/p311/p311_008.npy differ diff --git 
a/dataset/spk/p312/p312_185.npy b/dataset/spk/p312/p312_185.npy new file mode 100644 index 0000000000000000000000000000000000000000..f18e1704dbd7a2dc12c85c5c359f3b6e6c53c6bd Binary files /dev/null and b/dataset/spk/p312/p312_185.npy differ diff --git a/dataset/spk/p313/p313_011.npy b/dataset/spk/p313/p313_011.npy new file mode 100644 index 0000000000000000000000000000000000000000..b079cbba1daa15215a6033767ee1158f550087f4 Binary files /dev/null and b/dataset/spk/p313/p313_011.npy differ diff --git a/dataset/spk/p314/p314_323.npy b/dataset/spk/p314/p314_323.npy new file mode 100644 index 0000000000000000000000000000000000000000..8f008529f33fd24a578a9478eec26c29991863cd Binary files /dev/null and b/dataset/spk/p314/p314_323.npy differ diff --git a/dataset/spk/p316/p316_011.npy b/dataset/spk/p316/p316_011.npy new file mode 100644 index 0000000000000000000000000000000000000000..b48b9222209e5bce123fb1ef48ad04732c347a01 Binary files /dev/null and b/dataset/spk/p316/p316_011.npy differ diff --git a/dataset/spk/p317/p317_021.npy b/dataset/spk/p317/p317_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..7cd850077da69e8ef0579b4d82f9d8224e3e8e1a Binary files /dev/null and b/dataset/spk/p317/p317_021.npy differ diff --git a/dataset/spk/p318/p318_023.npy b/dataset/spk/p318/p318_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..b5b16e84c1c03bcbb3f519353205d77a761a86cc Binary files /dev/null and b/dataset/spk/p318/p318_023.npy differ diff --git a/dataset/spk/p323/p323_023.npy b/dataset/spk/p323/p323_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..9bca1484006d0972584122eb41f02cbe6c1f0dcd Binary files /dev/null and b/dataset/spk/p323/p323_023.npy differ diff --git a/dataset/spk/p326/p326_022.npy b/dataset/spk/p326/p326_022.npy new file mode 100644 index 0000000000000000000000000000000000000000..df3a9867f369145c93d46d662cb5de86e741407a Binary files /dev/null and b/dataset/spk/p326/p326_022.npy differ 
diff --git a/dataset/spk/p329/p329_021.npy b/dataset/spk/p329/p329_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..5745976e6510c25f15441d1dee025fc7d3b00fc6 Binary files /dev/null and b/dataset/spk/p329/p329_021.npy differ diff --git a/dataset/spk/p330/p330_008.npy b/dataset/spk/p330/p330_008.npy new file mode 100644 index 0000000000000000000000000000000000000000..3eaae43339840da22972a1c798c152418130c8f0 Binary files /dev/null and b/dataset/spk/p330/p330_008.npy differ diff --git a/dataset/spk/p333/p333_023.npy b/dataset/spk/p333/p333_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..51fe7454f8763f7b897e624a1b32895a3ab08b62 Binary files /dev/null and b/dataset/spk/p333/p333_023.npy differ diff --git a/dataset/spk/p334/p334_184.npy b/dataset/spk/p334/p334_184.npy new file mode 100644 index 0000000000000000000000000000000000000000..e703e66092d384530f184239385badbaefac6485 Binary files /dev/null and b/dataset/spk/p334/p334_184.npy differ diff --git a/dataset/spk/p335/p335_120.npy b/dataset/spk/p335/p335_120.npy new file mode 100644 index 0000000000000000000000000000000000000000..687f129e8d99f996fe66a227397aa944b900f7a5 Binary files /dev/null and b/dataset/spk/p335/p335_120.npy differ diff --git a/dataset/spk/p336/p336_021.npy b/dataset/spk/p336/p336_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..2076a7cfea0eaf29482bb074fd1f640771cd729f Binary files /dev/null and b/dataset/spk/p336/p336_021.npy differ diff --git a/dataset/spk/p339/p339_021.npy b/dataset/spk/p339/p339_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..2fc1906ab4dfcff930d1089a62bc04bb8087f08e Binary files /dev/null and b/dataset/spk/p339/p339_021.npy differ diff --git a/dataset/spk/p340/p340_021.npy b/dataset/spk/p340/p340_021.npy new file mode 100644 index 0000000000000000000000000000000000000000..a329a4e1e06e78122446f3c2df4119a6a178e78f Binary files /dev/null and 
b/dataset/spk/p340/p340_021.npy differ diff --git a/dataset/spk/p341/p341_019.npy b/dataset/spk/p341/p341_019.npy new file mode 100644 index 0000000000000000000000000000000000000000..24c4874cbcdecbb61ff9a776fbc53727d8f75eec Binary files /dev/null and b/dataset/spk/p341/p341_019.npy differ diff --git a/dataset/spk/p343/p343_023.npy b/dataset/spk/p343/p343_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..9e37a6b3e9a992adeb84b617c752d18c6daac38e Binary files /dev/null and b/dataset/spk/p343/p343_023.npy differ diff --git a/dataset/spk/p345/p345_023.npy b/dataset/spk/p345/p345_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..5dd55044e53fbf3a09859789b16b1f6f00817535 Binary files /dev/null and b/dataset/spk/p345/p345_023.npy differ diff --git a/dataset/spk/p347/p347_011.npy b/dataset/spk/p347/p347_011.npy new file mode 100644 index 0000000000000000000000000000000000000000..1051d8dfb2e260ffa9d4cc4cbd42ae8ae673af22 Binary files /dev/null and b/dataset/spk/p347/p347_011.npy differ diff --git a/dataset/spk/p351/p351_023.npy b/dataset/spk/p351/p351_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..c8e5c8a8ef2e5619416e20120de0b33df874298f Binary files /dev/null and b/dataset/spk/p351/p351_023.npy differ diff --git a/dataset/spk/p360/p360_023.npy b/dataset/spk/p360/p360_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..eae7009c37ee402ea6d1440a5dd855d335a8ed08 Binary files /dev/null and b/dataset/spk/p360/p360_023.npy differ diff --git a/dataset/spk/p361/p361_023.npy b/dataset/spk/p361/p361_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..bf81fc14b8cc12e6e2337df10f4f104389941517 Binary files /dev/null and b/dataset/spk/p361/p361_023.npy differ diff --git a/dataset/spk/p362/p362_022.npy b/dataset/spk/p362/p362_022.npy new file mode 100644 index 0000000000000000000000000000000000000000..9137b4131292ef288747d48fa60ba8e722599639 Binary files 
/dev/null and b/dataset/spk/p362/p362_022.npy differ diff --git a/dataset/spk/p363/p363_023.npy b/dataset/spk/p363/p363_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..b95495ebc5b130ea6869da32c7d42f00ba67a573 Binary files /dev/null and b/dataset/spk/p363/p363_023.npy differ diff --git a/dataset/spk/p364/p364_023.npy b/dataset/spk/p364/p364_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..6e06ec51248dafc04d0183157a6276c6b6d2d695 Binary files /dev/null and b/dataset/spk/p364/p364_023.npy differ diff --git a/dataset/spk/p374/p374_023.npy b/dataset/spk/p374/p374_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..f7d1914703b92a2b75520de2fe32a4ea2ebfd527 Binary files /dev/null and b/dataset/spk/p374/p374_023.npy differ diff --git a/dataset/spk/p376/p376_023.npy b/dataset/spk/p376/p376_023.npy new file mode 100644 index 0000000000000000000000000000000000000000..81a6611da23c8811d9c3ef18506c3c5bfbb1679e Binary files /dev/null and b/dataset/spk/p376/p376_023.npy differ diff --git a/env.py b/env.py new file mode 100644 index 0000000000000000000000000000000000000000..2bdbc95d4f7a8bad8fd4f5eef657e2b51d946056 --- /dev/null +++ b/env.py @@ -0,0 +1,15 @@ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/exp/default/.gitkeep b/exp/default/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/filelists/f0_stats.json b/filelists/f0_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..57392e92a53477715d1db1c5611a25ecacb43702 --- /dev/null +++ b/filelists/f0_stats.json @@ 
-0,0 +1,644 @@ +{ + "p266": { + "mean": 187.5668047997471, + "std": 21.7111977983326, + "max": 376.1280212402344, + "min": 50.37684631347656 + }, + "p270": { + "mean": 106.23142090840634, + "std": 13.499452414213351, + "max": 172.0406494140625, + "min": 56.70237350463867 + }, + "p237": { + "mean": 89.33731298962593, + "std": 13.803607936017114, + "max": 197.36489868164062, + "min": 40.889896392822266 + }, + "p263": { + "mean": 109.45713694139691, + "std": 17.992642924184093, + "max": 192.3251495361328, + "min": 43.85630416870117 + }, + "p335": { + "mean": 209.05143026652237, + "std": 32.40300665756925, + "max": 371.473876953125, + "min": 88.41365051269531 + }, + "p228": { + "mean": 201.04827754407572, + "std": 28.89781406157594, + "max": 343.4189758300781, + "min": 68.82327270507812 + }, + "p301": { + "mean": 176.83474620049896, + "std": 21.949461432843542, + "max": 257.40008544921875, + "min": 89.48204040527344 + }, + "p326": { + "mean": 86.69441306602499, + "std": 11.763469359904498, + "max": 151.2174835205078, + "min": 40.937156677246094 + }, + "p310": { + "mean": 244.232260080041, + "std": 39.77771002958188, + "max": 440.3475646972656, + "min": 53.8338737487793 + }, + "p288": { + "mean": 192.03983897029107, + "std": 29.06469115595905, + "max": 339.5445861816406, + "min": 77.81259155273438 + }, + "p343": { + "mean": 159.66486628845632, + "std": 26.623411195905877, + "max": 280.691650390625, + "min": 56.7653694152832 + }, + "p264": { + "mean": 198.6177753235762, + "std": 22.851517214249892, + "max": 348.9320983886719, + "min": 40.23796463012695 + }, + "p281": { + "mean": 96.09042569179093, + "std": 13.117764368410802, + "max": 169.49920654296875, + "min": 45.52116012573242 + }, + "p273": { + "mean": 162.20369103876038, + "std": 28.00698942698969, + "max": 388.9765319824219, + "min": 42.892356872558594 + }, + "p339": { + "mean": 206.9092679145247, + "std": 29.845213465322256, + "max": 395.5875549316406, + "min": 69.36454010009766 + }, + "p341": { + "mean": 
186.28873951029638, + "std": 31.371651899882256, + "max": 315.6408386230469, + "min": 55.91461181640625 + }, + "p299": { + "mean": 178.52948559778696, + "std": 30.395621494198092, + "max": 341.9533386230469, + "min": 45.8315544128418 + }, + "p286": { + "mean": 139.46711192680075, + "std": 23.466399928306622, + "max": 240.65101623535156, + "min": 49.577598571777344 + }, + "p268": { + "mean": 209.14686966282886, + "std": 32.66311689938003, + "max": 330.8009948730469, + "min": 79.77095031738281 + }, + "p333": { + "mean": 187.00838148139292, + "std": 28.801170246778963, + "max": 305.175048828125, + "min": 48.803321838378906 + }, + "p276": { + "mean": 232.88111740409414, + "std": 37.57394422698847, + "max": 478.6584167480469, + "min": 54.82196044921875 + }, + "p256": { + "mean": 94.53500113548228, + "std": 14.250615842825113, + "max": 251.59117126464844, + "min": 42.66779708862305 + }, + "p225": { + "mean": 183.69540950253102, + "std": 29.40774095681133, + "max": 335.45989990234375, + "min": 48.01626968383789 + }, + "p376": { + "mean": 104.45483811341958, + "std": 17.707810044656426, + "max": 195.90940856933594, + "min": 42.755516052246094 + }, + "p254": { + "mean": 89.6544123423712, + "std": 17.590927942739878, + "max": 521.3363037109375, + "min": 40.573612213134766 + }, + "p277": { + "mean": 199.0565758160441, + "std": 23.735464473404154, + "max": 284.9627990722656, + "min": 58.1547966003418 + }, + "p340": { + "mean": 209.43921278912748, + "std": 27.835762381963335, + "max": 348.8285827636719, + "min": 41.222232818603516 + }, + "p317": { + "mean": 250.22190927547794, + "std": 36.39590562850019, + "max": 520.8858642578125, + "min": 50.66531753540039 + }, + "p374": { + "mean": 123.33478569335622, + "std": 29.049692519461416, + "max": 296.4808044433594, + "min": 40.45524215698242 + }, + "p292": { + "mean": 108.51144915693115, + "std": 17.966688549629353, + "max": 212.6140899658203, + "min": 41.02241897583008 + }, + "p347": { + "mean": 95.38718437465519, + "std": 
12.18414828864238, + "max": 143.6361083984375, + "min": 50.8834342956543 + }, + "p330": { + "mean": 193.186867767112, + "std": 24.543509483868295, + "max": 297.7118225097656, + "min": 46.03744888305664 + }, + "p245": { + "mean": 95.49357399893047, + "std": 14.289431847160401, + "max": 165.13453674316406, + "min": 42.07103729248047 + }, + "p302": { + "mean": 123.42743079715035, + "std": 18.2750375662803, + "max": 211.37318420410156, + "min": 57.9265022277832 + }, + "p282": { + "mean": 193.55083881484296, + "std": 27.34603035477489, + "max": 388.7534484863281, + "min": 60.87688064575195 + }, + "p257": { + "mean": 210.4240967552056, + "std": 28.9904827494371, + "max": 352.4640808105469, + "min": 96.31816101074219 + }, + "p279": { + "mean": 125.26435818940722, + "std": 24.125652840903125, + "max": 252.1083984375, + "min": 43.52880859375 + }, + "p362": { + "mean": 204.36670107379342, + "std": 34.12163033688038, + "max": 454.8257751464844, + "min": 90.29649353027344 + }, + "p351": { + "mean": 223.32094763809528, + "std": 20.82899177786335, + "max": 314.9055480957031, + "min": 125.82318115234375 + }, + "p303": { + "mean": 219.39558959815898, + "std": 36.319293680515784, + "max": 368.044677734375, + "min": 40.094757080078125 + }, + "p287": { + "mean": 102.16660891590256, + "std": 15.931422844828708, + "max": 187.77085876464844, + "min": 40.18850326538086 + }, + "p265": { + "mean": 196.45972132511133, + "std": 33.14160272904223, + "max": 382.5567321777344, + "min": 60.463096618652344 + }, + "p278": { + "mean": 115.31142658968115, + "std": 18.872993259839767, + "max": 205.69667053222656, + "min": 53.35262680053711 + }, + "p363": { + "mean": 111.65163261899708, + "std": 22.39757343779322, + "max": 278.500244140625, + "min": 40.5815544128418 + }, + "p250": { + "mean": 210.59566406020883, + "std": 35.89387497508014, + "max": 388.6906433105469, + "min": 43.92763137817383 + }, + "p285": { + "mean": 123.9795913506589, + "std": 21.956612957716814, + "max": 236.64837646484375, + 
"min": 46.916378021240234 + }, + "p271": { + "mean": 119.58804562158997, + "std": 15.495521060160133, + "max": 307.2266845703125, + "min": 58.65855407714844 + }, + "p306": { + "mean": 190.63130996069177, + "std": 23.754563419756554, + "max": 316.92498779296875, + "min": 54.29426956176758 + }, + "p230": { + "mean": 193.4971089872057, + "std": 30.180507961667367, + "max": 322.01513671875, + "min": 42.22804641723633 + }, + "p300": { + "mean": 200.74951753921437, + "std": 28.15534994336106, + "max": 336.1123046875, + "min": 69.45367431640625 + }, + "p323": { + "mean": 229.47334493738978, + "std": 58.514650323142696, + "max": 599.419677734375, + "min": 40.60541915893555 + }, + "p234": { + "mean": 190.4669546414671, + "std": 25.502122279718524, + "max": 321.3721923828125, + "min": 53.592079162597656 + }, + "p258": { + "mean": 121.27015130385243, + "std": 17.31227912126549, + "max": 244.57733154296875, + "min": 49.44462585449219 + }, + "p253": { + "mean": 229.03161264490427, + "std": 28.53928268951982, + "max": 366.3857116699219, + "min": 40.31864547729492 + }, + "p259": { + "mean": 121.93712918231527, + "std": 18.875150732571537, + "max": 272.75982666015625, + "min": 56.28487777709961 + }, + "p293": { + "mean": 190.13595722964666, + "std": 25.10802513642931, + "max": 300.9972839355469, + "min": 79.09306335449219 + }, + "p229": { + "mean": 182.9622892384888, + "std": 18.455528497457973, + "max": 263.1113586425781, + "min": 54.22048568725586 + }, + "p248": { + "mean": 237.62166287201683, + "std": 49.04045899983563, + "max": 519.955810546875, + "min": 63.55820846557617 + }, + "p231": { + "mean": 181.81469287206338, + "std": 34.53426854891174, + "max": 314.9605712890625, + "min": 40.432891845703125 + }, + "p249": { + "mean": 171.54162380660486, + "std": 37.10684091580513, + "max": 371.80792236328125, + "min": 40.863765716552734 + }, + "p252": { + "mean": 119.15190215189784, + "std": 16.74519393993888, + "max": 203.45106506347656, + "min": 50.42327880859375 + }, + "p251": { + 
"mean": 123.56003973906452, + "std": 20.046319042814655, + "max": 231.60464477539062, + "min": 40.03911209106445 + }, + "p361": { + "mean": 191.9538478812668, + "std": 33.72349592748691, + "max": 451.9200134277344, + "min": 44.08780288696289 + }, + "p238": { + "mean": 210.4943730033053, + "std": 46.925702855599404, + "max": 459.459716796875, + "min": 41.720279693603516 + }, + "p284": { + "mean": 101.98103076188423, + "std": 21.823536086970158, + "max": 225.5602569580078, + "min": 40.468353271484375 + }, + "p275": { + "mean": 110.28120440089297, + "std": 16.47485999383007, + "max": 192.66773986816406, + "min": 41.00666046142578 + }, + "p295": { + "mean": 191.38950101209218, + "std": 21.419800322989683, + "max": 301.0599365234375, + "min": 64.40604400634766 + }, + "p227": { + "mean": 120.21700638398332, + "std": 19.727900656560276, + "max": 261.1825256347656, + "min": 42.1202278137207 + }, + "p336": { + "mean": 210.0863478194363, + "std": 33.67162723458451, + "max": 370.92694091796875, + "min": 51.08160400390625 + }, + "p255": { + "mean": 148.62940011033183, + "std": 25.74459290585267, + "max": 251.4580078125, + "min": 53.957801818847656 + }, + "p334": { + "mean": 100.99390988737254, + "std": 16.553536054620587, + "max": 182.91346740722656, + "min": 40.570701599121094 + }, + "p305": { + "mean": 238.12860813373442, + "std": 36.43497156738361, + "max": 382.19512939453125, + "min": 40.54560089111328 + }, + "p233": { + "mean": 209.03228425284013, + "std": 27.704784931555448, + "max": 382.94146728515625, + "min": 98.1698226928711 + }, + "p360": { + "mean": 102.27932692487238, + "std": 18.004615610177247, + "max": 198.44578552246094, + "min": 40.58131408691406 + }, + "p311": { + "mean": 105.34755199911065, + "std": 18.59014367334217, + "max": 205.04534912109375, + "min": 40.27201461791992 + }, + "p232": { + "mean": 116.90505563912124, + "std": 25.09924093167438, + "max": 237.59555053710938, + "min": 42.470245361328125 + }, + "p262": { + "mean": 168.66148361188195, + "std": 
29.247341639810053, + "max": 298.7639465332031, + "min": 42.99590301513672 + }, + "p244": { + "mean": 208.03186444281297, + "std": 33.69767833205533, + "max": 396.6988525390625, + "min": 53.796627044677734 + }, + "p260": { + "mean": 109.73949148535377, + "std": 21.290710697184323, + "max": 234.24095153808594, + "min": 40.17658615112305 + }, + "p294": { + "mean": 170.39733518391964, + "std": 35.44397237750051, + "max": 279.1806945800781, + "min": 40.36360549926758 + }, + "p240": { + "mean": 231.23205043901694, + "std": 33.74844475698175, + "max": 473.3512268066406, + "min": 45.20843505859375 + }, + "p239": { + "mean": 198.0565362599245, + "std": 29.525757428273124, + "max": 412.086669921875, + "min": 65.79889678955078 + }, + "p307": { + "mean": 255.02235788596576, + "std": 41.35703831146687, + "max": 443.2418212890625, + "min": 47.97418975830078 + }, + "p272": { + "mean": 120.31454169907771, + "std": 19.925364539606058, + "max": 222.39834594726562, + "min": 40.82168960571289 + }, + "p313": { + "mean": 179.0156566793693, + "std": 27.66029037114466, + "max": 392.4278564453125, + "min": 45.59797286987305 + }, + "p247": { + "mean": 141.14835949411395, + "std": 22.263398239353805, + "max": 239.49876403808594, + "min": 60.11503219604492 + }, + "p269": { + "mean": 191.27523290518943, + "std": 23.647922743845832, + "max": 310.88201904296875, + "min": 72.79782104492188 + }, + "p236": { + "mean": 231.7551339140072, + "std": 33.00736859766793, + "max": 371.03778076171875, + "min": 52.81330871582031 + }, + "p304": { + "mean": 105.72255566466471, + "std": 20.440635337669427, + "max": 205.92649841308594, + "min": 40.22262191772461 + }, + "p329": { + "mean": 212.072124222229, + "std": 29.717528128639145, + "max": 389.5606689453125, + "min": 43.54408645629883 + }, + "p345": { + "mean": 101.55025337903844, + "std": 14.9950937549672, + "max": 169.50440979003906, + "min": 51.92561721801758 + }, + "p316": { + "mean": 100.22608042338832, + "std": 13.356102588454666, + "max": 
171.4343719482422, + "min": 42.968101501464844 + }, + "p308": { + "mean": 189.6860312152447, + "std": 28.671378581571478, + "max": 359.3836364746094, + "min": 43.60274124145508 + }, + "p246": { + "mean": 103.0846538908923, + "std": 13.895091628470002, + "max": 166.29299926757812, + "min": 50.82132339477539 + }, + "p226": { + "mean": 111.71245702419112, + "std": 19.010036930037955, + "max": 229.51353454589844, + "min": 42.755619049072266 + }, + "p267": { + "mean": 187.91824200252267, + "std": 26.142982893619575, + "max": 316.70416259765625, + "min": 70.84942626953125 + }, + "p274": { + "mean": 104.13274922279172, + "std": 17.56780929079296, + "max": 203.72390747070312, + "min": 45.00861740112305 + }, + "p318": { + "mean": 197.00485584182212, + "std": 32.833598138637, + "max": 342.5959167480469, + "min": 50.479679107666016 + }, + "p241": { + "mean": 118.96700060164899, + "std": 17.83242850991315, + "max": 231.78211975097656, + "min": 61.846641540527344 + }, + "p298": { + "mean": 123.0115994901693, + "std": 21.31895438298803, + "max": 227.3868408203125, + "min": 48.60580825805664 + }, + "p261": { + "mean": 244.49336153979283, + "std": 41.48190452638297, + "max": 512.063720703125, + "min": 42.431522369384766 + }, + "p314": { + "mean": 186.63487619528496, + "std": 30.41936838133075, + "max": 323.5397644042969, + "min": 45.96017837524414 + }, + "p364": { + "mean": 103.87505272716182, + "std": 14.114793262086437, + "max": 165.49851989746094, + "min": 51.83555603027344 + }, + "p283": { + "mean": 208.92734572925735, + "std": 30.50503755696407, + "max": 366.21307373046875, + "min": 40.93833541870117 + }, + "p297": { + "mean": 189.8235774629039, + "std": 36.81058428999704, + "max": 379.5383605957031, + "min": 53.18995666503906 + }, + "p243": { + "mean": 120.15555354963577, + "std": 17.355552883223684, + "max": 222.6884307861328, + "min": 55.50260543823242 + }, + "p312": { + "mean": 206.5565351067497, + "std": 29.16553200995325, + "max": 336.6168518066406, + "min": 
81.44790649414062 + } +} \ No newline at end of file diff --git a/filelists/spk2id.json b/filelists/spk2id.json new file mode 100644 index 0000000000000000000000000000000000000000..765db35fc5ff912cc462401c389ffdaebb7feeeb --- /dev/null +++ b/filelists/spk2id.json @@ -0,0 +1,109 @@ +{ + "p225": 1, + "p226": 2, + "p227": 3, + "p228": 4, + "p229": 5, + "p230": 6, + "p231": 7, + "p232": 8, + "p233": 9, + "p234": 10, + "p236": 11, + "p237": 12, + "p238": 13, + "p239": 14, + "p240": 15, + "p241": 16, + "p243": 17, + "p244": 18, + "p245": 19, + "p246": 20, + "p247": 21, + "p248": 22, + "p249": 23, + "p250": 24, + "p251": 25, + "p252": 26, + "p253": 27, + "p254": 28, + "p255": 29, + "p256": 30, + "p257": 31, + "p258": 32, + "p259": 33, + "p260": 34, + "p261": 35, + "p262": 36, + "p263": 37, + "p264": 38, + "p265": 39, + "p266": 40, + "p267": 41, + "p268": 42, + "p269": 43, + "p270": 44, + "p271": 45, + "p272": 46, + "p273": 47, + "p274": 48, + "p275": 49, + "p276": 50, + "p277": 51, + "p278": 52, + "p279": 53, + "p281": 54, + "p282": 55, + "p283": 56, + "p284": 57, + "p285": 58, + "p286": 59, + "p287": 60, + "p288": 61, + "p292": 62, + "p293": 63, + "p294": 64, + "p295": 65, + "p297": 66, + "p298": 67, + "p299": 68, + "p300": 69, + "p301": 70, + "p302": 71, + "p303": 72, + "p304": 73, + "p305": 74, + "p306": 75, + "p307": 76, + "p308": 77, + "p310": 78, + "p311": 79, + "p312": 80, + "p313": 81, + "p314": 82, + "p316": 83, + "p317": 84, + "p318": 85, + "p323": 86, + "p326": 87, + "p329": 88, + "p330": 89, + "p333": 90, + "p334": 91, + "p335": 92, + "p336": 93, + "p339": 94, + "p340": 95, + "p341": 96, + "p343": 97, + "p345": 98, + "p347": 99, + "p351": 100, + "p360": 101, + "p361": 102, + "p362": 103, + "p363": 104, + "p364": 105, + "p374": 106, + "p376": 107 +} \ No newline at end of file diff --git a/filelists/spk_stats.json b/filelists/spk_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..1501c481f4e8b6568b7add41834d4763e15a8e57 --- 
/dev/null +++ b/filelists/spk_stats.json @@ -0,0 +1,430 @@ +{ + "p266": { + "best_spk_emb": "p266_417", + "max_score": 0.970996081829071 + }, + "p270": { + "best_spk_emb": "p270_297", + "max_score": 0.9682378768920898 + }, + "p237": { + "best_spk_emb": "p237_023", + "max_score": 0.9648184180259705 + }, + "p263": { + "best_spk_emb": "p263_218", + "max_score": 0.9591436386108398 + }, + "p335": { + "best_spk_emb": "p335_120", + "max_score": 0.9628006219863892 + }, + "p228": { + "best_spk_emb": "p228_242", + "max_score": 0.9608617424964905 + }, + "p301": { + "best_spk_emb": "p301_023", + "max_score": 0.9769936800003052 + }, + "p326": { + "best_spk_emb": "p326_022", + "max_score": 0.9705225229263306 + }, + "p310": { + "best_spk_emb": "p310_023", + "max_score": 0.9717426896095276 + }, + "p288": { + "best_spk_emb": "p288_255", + "max_score": 0.9637439250946045 + }, + "p343": { + "best_spk_emb": "p343_023", + "max_score": 0.9708293676376343 + }, + "p264": { + "best_spk_emb": "p264_438", + "max_score": 0.9665981531143188 + }, + "p281": { + "best_spk_emb": "p281_022", + "max_score": 0.9601494073867798 + }, + "p273": { + "best_spk_emb": "p273_023", + "max_score": 0.9673324227333069 + }, + "p339": { + "best_spk_emb": "p339_021", + "max_score": 0.9655042886734009 + }, + "p341": { + "best_spk_emb": "p341_019", + "max_score": 0.9662896394729614 + }, + "p299": { + "best_spk_emb": "p299_213", + "max_score": 0.9517490863800049 + }, + "p286": { + "best_spk_emb": "p286_160", + "max_score": 0.973882257938385 + }, + "p268": { + "best_spk_emb": "p268_021", + "max_score": 0.9776462912559509 + }, + "p333": { + "best_spk_emb": "p333_023", + "max_score": 0.9644259810447693 + }, + "p276": { + "best_spk_emb": "p276_023", + "max_score": 0.9704123735427856 + }, + "p256": { + "best_spk_emb": "p256_079", + "max_score": 0.9699604511260986 + }, + "p225": { + "best_spk_emb": "p225_220", + "max_score": 0.9597213864326477 + }, + "p376": { + "best_spk_emb": "p376_023", + "max_score": 0.9645803570747375 
+ }, + "p254": { + "best_spk_emb": "p254_023", + "max_score": 0.9744523763656616 + }, + "p277": { + "best_spk_emb": "p277_023", + "max_score": 0.9669135808944702 + }, + "p340": { + "best_spk_emb": "p340_021", + "max_score": 0.964806318283081 + }, + "p317": { + "best_spk_emb": "p317_021", + "max_score": 0.9730928540229797 + }, + "p374": { + "best_spk_emb": "p374_023", + "max_score": 0.9731501936912537 + }, + "p292": { + "best_spk_emb": "p292_288", + "max_score": 0.9664127826690674 + }, + "p347": { + "best_spk_emb": "p347_011", + "max_score": 0.9828923940658569 + }, + "p330": { + "best_spk_emb": "p330_008", + "max_score": 0.9744166731834412 + }, + "p245": { + "best_spk_emb": "p245_014", + "max_score": 0.975610613822937 + }, + "p302": { + "best_spk_emb": "p302_023", + "max_score": 0.964878499507904 + }, + "p282": { + "best_spk_emb": "p282_023", + "max_score": 0.972954273223877 + }, + "p257": { + "best_spk_emb": "p257_023", + "max_score": 0.9610278606414795 + }, + "p279": { + "best_spk_emb": "p279_003", + "max_score": 0.952425479888916 + }, + "p362": { + "best_spk_emb": "p362_022", + "max_score": 0.9609566330909729 + }, + "p351": { + "best_spk_emb": "p351_023", + "max_score": 0.9633771181106567 + }, + "p303": { + "best_spk_emb": "p303_023", + "max_score": 0.9685837626457214 + }, + "p287": { + "best_spk_emb": "p287_023", + "max_score": 0.9646018147468567 + }, + "p265": { + "best_spk_emb": "p265_273", + "max_score": 0.9665485620498657 + }, + "p278": { + "best_spk_emb": "p278_023", + "max_score": 0.9733871817588806 + }, + "p363": { + "best_spk_emb": "p363_023", + "max_score": 0.9586407542228699 + }, + "p250": { + "best_spk_emb": "p250_021", + "max_score": 0.9690421223640442 + }, + "p285": { + "best_spk_emb": "p285_364", + "max_score": 0.9608680009841919 + }, + "p271": { + "best_spk_emb": "p271_170", + "max_score": 0.9624505043029785 + }, + "p306": { + "best_spk_emb": "p306_021", + "max_score": 0.9801679849624634 + }, + "p230": { + "best_spk_emb": "p230_361", + 
"max_score": 0.9690300226211548 + }, + "p300": { + "best_spk_emb": "p300_021", + "max_score": 0.9783481955528259 + }, + "p323": { + "best_spk_emb": "p323_023", + "max_score": 0.9751206040382385 + }, + "p234": { + "best_spk_emb": "p234_229", + "max_score": 0.9664621949195862 + }, + "p258": { + "best_spk_emb": "p258_228", + "max_score": 0.9660263061523438 + }, + "p253": { + "best_spk_emb": "p253_207", + "max_score": 0.9585143327713013 + }, + "p259": { + "best_spk_emb": "p259_011", + "max_score": 0.9748988151550293 + }, + "p293": { + "best_spk_emb": "p293_023", + "max_score": 0.9742795825004578 + }, + "p229": { + "best_spk_emb": "p229_021", + "max_score": 0.9733553528785706 + }, + "p248": { + "best_spk_emb": "p248_023", + "max_score": 0.972179651260376 + }, + "p231": { + "best_spk_emb": "p231_197", + "max_score": 0.9446225166320801 + }, + "p249": { + "best_spk_emb": "p249_223", + "max_score": 0.9684494733810425 + }, + "p252": { + "best_spk_emb": "p252_023", + "max_score": 0.9722281098365784 + }, + "p251": { + "best_spk_emb": "p251_364", + "max_score": 0.9563640356063843 + }, + "p361": { + "best_spk_emb": "p361_023", + "max_score": 0.9655197262763977 + }, + "p238": { + "best_spk_emb": "p238_023", + "max_score": 0.975940465927124 + }, + "p284": { + "best_spk_emb": "p284_365", + "max_score": 0.9638668298721313 + }, + "p275": { + "best_spk_emb": "p275_023", + "max_score": 0.9620155692100525 + }, + "p295": { + "best_spk_emb": "p295_022", + "max_score": 0.9640949964523315 + }, + "p227": { + "best_spk_emb": "p227_021", + "max_score": 0.9739644527435303 + }, + "p336": { + "best_spk_emb": "p336_021", + "max_score": 0.9751037955284119 + }, + "p255": { + "best_spk_emb": "p255_038", + "max_score": 0.9673135280609131 + }, + "p334": { + "best_spk_emb": "p334_184", + "max_score": 0.9662039279937744 + }, + "p305": { + "best_spk_emb": "p305_188", + "max_score": 0.9668008089065552 + }, + "p233": { + "best_spk_emb": "p233_323", + "max_score": 0.9736969470977783 + }, + "p360": { + 
"best_spk_emb": "p360_023", + "max_score": 0.9581485390663147 + }, + "p311": { + "best_spk_emb": "p311_008", + "max_score": 0.9598737955093384 + }, + "p232": { + "best_spk_emb": "p232_023", + "max_score": 0.9630817174911499 + }, + "p262": { + "best_spk_emb": "p262_210", + "max_score": 0.9644728302955627 + }, + "p244": { + "best_spk_emb": "p244_008", + "max_score": 0.9677988290786743 + }, + "p260": { + "best_spk_emb": "p260_103", + "max_score": 0.9559831619262695 + }, + "p294": { + "best_spk_emb": "p294_016", + "max_score": 0.9600991010665894 + }, + "p240": { + "best_spk_emb": "p240_004", + "max_score": 0.9616067409515381 + }, + "p239": { + "best_spk_emb": "p239_023", + "max_score": 0.9626854658126831 + }, + "p307": { + "best_spk_emb": "p307_023", + "max_score": 0.9625473618507385 + }, + "p272": { + "best_spk_emb": "p272_257", + "max_score": 0.9546929001808167 + }, + "p313": { + "best_spk_emb": "p313_011", + "max_score": 0.9648092985153198 + }, + "p247": { + "best_spk_emb": "p247_380", + "max_score": 0.9651156067848206 + }, + "p269": { + "best_spk_emb": "p269_332", + "max_score": 0.9700820446014404 + }, + "p236": { + "best_spk_emb": "p236_068", + "max_score": 0.9643090963363647 + }, + "p304": { + "best_spk_emb": "p304_078", + "max_score": 0.9588027596473694 + }, + "p329": { + "best_spk_emb": "p329_021", + "max_score": 0.9740150570869446 + }, + "p345": { + "best_spk_emb": "p345_023", + "max_score": 0.9643380641937256 + }, + "p316": { + "best_spk_emb": "p316_011", + "max_score": 0.9680041074752808 + }, + "p308": { + "best_spk_emb": "p308_328", + "max_score": 0.973384439945221 + }, + "p246": { + "best_spk_emb": "p246_022", + "max_score": 0.9762457609176636 + }, + "p226": { + "best_spk_emb": "p226_341", + "max_score": 0.9645286202430725 + }, + "p267": { + "best_spk_emb": "p267_022", + "max_score": 0.9674316644668579 + }, + "p274": { + "best_spk_emb": "p274_296", + "max_score": 0.9561160206794739 + }, + "p318": { + "best_spk_emb": "p318_023", + "max_score": 
0.9727175831794739 + }, + "p241": { + "best_spk_emb": "p241_050", + "max_score": 0.9624007344245911 + }, + "p298": { + "best_spk_emb": "p298_344", + "max_score": 0.9591403603553772 + }, + "p261": { + "best_spk_emb": "p261_023", + "max_score": 0.9671145081520081 + }, + "p314": { + "best_spk_emb": "p314_323", + "max_score": 0.9615058898925781 + }, + "p364": { + "best_spk_emb": "p364_023", + "max_score": 0.9585931301116943 + }, + "p283": { + "best_spk_emb": "p283_023", + "max_score": 0.9633697271347046 + }, + "p297": { + "best_spk_emb": "p297_023", + "max_score": 0.962790846824646 + }, + "p243": { + "best_spk_emb": "p243_087", + "max_score": 0.963236391544342 + }, + "p312": { + "best_spk_emb": "p312_185", + "max_score": 0.9470360279083252 + } +} \ No newline at end of file diff --git a/meldataset.py b/meldataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d46439befd3a790c381fd1fb4df27fb0b1f8cf77 --- /dev/null +++ b/meldataset.py @@ -0,0 +1,211 @@ +import math +import os +import json +import random +import torch +# from torchvision.transforms.functional import resize +import torch.utils.data +import numpy as np +import librosa +from librosa.util import normalize +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn +# from speechbrain.lobes.models.FastSpeech2 import mel_spectogram + +MAX_WAV_VALUE = 32768.0 + + +def load_wav(full_path): + sampling_rate, data = read(full_path) + return data, sampling_rate + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def 
spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + # if torch.min(y) < -1.: + # print('min value is ', torch.min(y)) + # if torch.max(y) > 1.: + # print('max value is ', torch.max(y)) + + global mel_basis, hann_window + if fmax not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) + hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + # complex tensor as default, then use view_as_real for future pytorch compatibility + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) + spec = torch.view_as_real(spec) + spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) + + spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) + spec = spectral_normalize_torch(spec) + + return spec + + +def get_dataset_filelist(a): + training_files =[] + validation_files =[] + total_files = 0 + + audio_dir = "dataset/audio" + + with open("filelists/train.txt") as f: + training_files = f.readlines() + for i, line in enumerate(training_files): + spk, basename = line.strip().split('|') + training_files[i] = f"{audio_dir}/{spk}/{basename}.wav" + + with open("filelists/val.txt") as f: + validation_files = f.readlines() + for i, line in enumerate(validation_files): + spk, basename = line.strip().split('|') + validation_files[i] = f"{audio_dir}/{spk}/{basename}.wav" + + random.seed(1234) + random.shuffle(training_files) + random.shuffle(validation_files) + 
class MelDataset(torch.utils.data.Dataset):
    """Waveform + mel-spectrogram dataset for vocoder training.

    Each item is (mel, audio, filename, mel_loss, spk_emb, spk_id), with an
    additional pitch-augmented mel prepended when ``use_aug`` is True.
    """

    def __init__(self, training_files, segment_size, n_fft, num_mels,
                 hop_size, win_size, sampling_rate, fmin, fmax, shuffle=True, n_cache_reuse=1,
                 device=None, fmax_loss=None, use_aug=False):
        self.audio_files = training_files
        random.seed(1234)  # fixed seed: reproducible shuffle across runs/workers
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        self.fmax = fmax
        # fmax for the loss mel may differ from the generator-input mel's fmax.
        self.fmax_loss = fmax_loss
        self.cached_wav = None
        # A decoded waveform is reused this many times before reloading from disk.
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.use_aug = use_aug

        # speaker name -> integer id, shared with the generator's speaker embedding
        with open("filelists/spk2id.json") as f:
            self.spk2id = json.load(f)

    def __getitem__(self, index):
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            # Cache miss: load, scale to [-1, 1], peak-normalize to 0.95.
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            self._cache_ref_count = self.n_cache_reuse
        else:
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        # Random fixed-length crop; short clips are zero-padded to segment_size.
        if audio.size(1) >= self.segment_size:
            max_audio_start = audio.size(1) - self.segment_size
            audio_start = random.randint(0, max_audio_start)
            audio = audio[:, audio_start:audio_start+self.segment_size]
        else:
            audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

        # Generator-input mel (band-limited by fmax).
        mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
                              self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
                              center=False)

        # Mel used for the reconstruction loss (fmax_loss band).
        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                   self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
                                   center=False)

        # Pre-computed speaker embedding stored next to the audio tree.
        spk_path = filename.replace("audio", "spk").replace(".wav", ".npy")
        spk_emb = torch.from_numpy(np.load(spk_path))  # (256)
        spk = filename.split("/")[-2]
        spk_id = self.spk2id[spk]
        spk_id = torch.LongTensor([spk_id])

        if not self.use_aug:
            return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze(), spk_emb, spk_id)

        # NOTE(review): `mel_spectogram` (sic) is presumably SpeechBrain's
        # helper of that (misspelled) name — confirm the import at file top.
        mel_aug, _ = mel_spectogram(
            audio=audio.squeeze(),
            sample_rate=16000,
            hop_length=256,
            win_length=1024,
            n_mels=80,
            n_fft=1024,
            f_min=0.0,
            f_max=8000.0,
            power=1,
            normalized=False,
            min_max_energy_norm=True,
            norm="slaney",
            mel_scale="slaney",
            compression=True
        )
        # Random frequency-axis stretch as pitch/formant augmentation.
        mel_aug = self.resize_mel(mel_aug.unsqueeze(0)).squeeze(0)

        return (mel_aug.squeeze(), mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze(), spk_emb, spk_id)

    def __len__(self):
        return len(self.audio_files)

    def resize_mel(self, mel):
        """Randomly stretch the mel's frequency axis by 0.85–1.15x, keeping its shape.

        NOTE(review): `resize` is presumably torchvision's
        transforms.functional.resize — confirm the import at file top.
        """
        ratio = 0.85 + 0.3 * torch.rand(1)  # 0.85 ~ 1.15
        height = int(mel.size(-2) * ratio)
        width = mel.size(-1)

        mel_r = resize(mel, (height, width), antialias=True)

        if height >= mel.size(-2):
            # Stretched taller than original: crop back down.
            mel_r = mel_r[:, :mel.size(-2), :]
        else:
            # Shrunk: pad by repeating the top row plus tiny noise to avoid
            # a perfectly constant band.
            pad = mel_r[:, -1:, :].repeat(1, mel.size(-2) - height, 1)
            pad += torch.randn_like(pad) / 1e3
            mel_r = torch.cat((mel_r, pad), 1)

        return mel_r
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: tanh(a+b) * sigmoid(a+b), split along channels.

    ``n_channels`` is a 1-element IntTensor so the function stays scriptable.
    """
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


class WN(torch.nn.Module):
    """WaveNet-style stack of dilated convolutions with gated activations.

    Each layer produces residual + skip channels; skips are summed into the
    output. An optional global conditioning tensor ``g`` (gin_channels) is
    projected once and added per layer.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1, "kernel_size must be odd for symmetric padding"
        self.hidden_channels = hidden_channels
        # BUG FIX: the original read `self.kernel_size = kernel_size,` — the
        # stray trailing comma stored a 1-tuple instead of the int.
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One conditioning projection shared by all layers (sliced per layer).
            cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # Last layer emits skip channels only; earlier layers emit residual+skip.
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        """x: (B, hidden, T); x_mask: broadcastable mask; g: (B, gin, T) or None."""
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                # First half: residual added back into x; second half: skip.
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip weight_norm reparameterization for inference/export."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)
class ResBlock1_old(torch.nn.Module):
    """Legacy HiFi-GAN residual block: (leaky_relu -> dilated conv -> leaky_relu -> conv) x3.

    Kept for checkpoint compatibility; the Snake-activated ``ResBlock1`` above
    is the current variant.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        # BUG FIX: the original called super(ResBlock1, self).__init__() —
        # a copy-paste of the wrong class name, which raises
        # "TypeError: super(type, obj): obj must be an instance ..." the
        # moment this class is instantiated.
        super().__init__()
        self.h = h
        # Dilated convolutions (one per dilation) followed by...
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        # ...undilated convolutions.
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x  # residual connection
        return x

    def remove_weight_norm(self):
        """Strip weight_norm reparameterization for inference/export."""
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)
class SineGen(torch.nn.Module):
    """ Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0,
                 flag_for_pulse=False):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # dim = fundamental + overtones
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse
        # total hop between consecutive f0 frames at waveform rate
        self.upsample_scale = upsample_scale

    def _f02uv(self, f0):
        # generate voiced/unvoiced (1/0) mask from f0
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    def _f02sine(self, f0_values):
        """ f0_values: (batchsize, length, dim)
            where dim indicates fundamental tone and overtones
        """
        # convert to per-sample phase increment in revolutions; the integer
        # part n can be ignored because 2*pi*n does not affect phase
        rad_values = (f0_values / self.sampling_rate) % 1

        # initial phase noise (no noise for fundamental component)
        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
                              device=f0_values.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini

        # instantaneous phase: sine[t] = sin(2*pi * cumsum_{i<=t} rad_i)
        if not self.flag_for_pulse:
            # NOTE: an older cumsum-overflow-shift implementation (padDiff /
            # tmp_over_one bookkeeping) was removed here. The current scheme
            # downsamples the phase increments by upsample_scale, integrates
            # at the low rate (keeping cumsum numerically small), then
            # linearly upsamples the phase back to waveform rate.
            rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
                                                         scale_factor=1/self.upsample_scale,
                                                         mode="linear").transpose(1, 2)

            phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
            # multiply by upsample_scale to compensate for the downsampled increments
            phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
                                                    scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
            sines = torch.sin(phase)

        else:
            # If necessary, make sure that the first time step of every
            # voiced segment is sin(pi) or cos(0); used for pulse-train
            # generation.

            # identify the last time step in unvoiced segments
            uv = self._f02uv(f0_values)
            uv_1 = torch.roll(uv, shifts=-1, dims=1)
            uv_1[:, -1, :] = 1
            u_loc = (uv < 1) * (uv_1 > 0)

            # get the instantaneous phase
            tmp_cumsum = torch.cumsum(rad_values, dim=1)
            # different batch items must be processed separately
            for idx in range(f0_values.shape[0]):
                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
                # stores the accumulation of instantaneous phase within
                # each voiced segment
                tmp_cumsum[idx, :, :] = 0
                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum

            # rad_values - tmp_cumsum: remove the accumulation of phase
            # carried over from the previous voiced segment
            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)

            # get the sines (cos so voiced onsets start at peak)
            sines = torch.cos(i_phase * 2 * np.pi)
        return sines

    def forward(self, f0):
        """ sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        """
        f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
                             device=f0.device)
        # fundamental + harmonics: multiply f0 by 1..harmonic_num+1
        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))

        # generate sine waveforms
        sine_waves = self._f02sine(fn) * self.sine_amp

        # generate voiced/unvoiced mask
        uv = self._f02uv(f0)

        # noise: in unvoiced regions amplitude ~ sine_amp (std = sine_amp/3);
        # in voiced regions use the small noise_std
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)

        # first: zero the unvoiced part via uv; then: add noise
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
    """ SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # to produce sine waveforms (fundamental + harmonics)
        self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # to merge source harmonics into a single excitation channel
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length 1)
        """
        # source for harmonic branch; sine generation itself carries no
        # gradient — only the merging linear layer is trained
        with torch.no_grad():
            sine_wavs, uv, _ = self.l_sin_gen(x)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv


def padDiff(x):
    # first-order difference along time with zero padding at both ends
    # (kept for compatibility with the legacy commented-out cumsum code)
    return F.pad(F.pad(x, (0, 0, -1, 1), 'constant', 0) - x, (0, 0, 0, -1), 'constant', 0)
    def forward(self, x, mel, spk_emb, spk_id):
        """Training-time forward pass.

        x: content features (B, 768, T); mel: mel spectrogram for F0
        prediction; spk_emb: (B, 256) speaker embedding; spk_id: (B, 1)
        integer speaker ids. Returns (spec, phase) for iSTFT synthesis.
        """
        # Global speaker conditioning: learned embedding + external embedding.
        g = self.embed_spk(spk_id).transpose(1, 2)
        g = g + spk_emb.unsqueeze(-1)

        f0, _, _ = self.F0_model(mel.unsqueeze(1))
        if len(f0.shape) == 1:
            f0 = f0.unsqueeze(0)

        # Upsample frame-rate F0 to waveform rate for the NSF source.
        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t

        # Harmonic-plus-noise excitation -> STFT-domain source features.
        har_source, _, _ = self.m_source(f0)
        har_source = har_source.transpose(1, 2).squeeze(1)
        har_spec, har_phase = self.stft.transform(har_source)
        har = torch.cat([har_spec, har_phase], dim=1)

        # Content encoder + speaker-conditioned decoder, then concat g.
        x = self.enc(x)
        x = self.dec(x, g=g)
        g = g.repeat(1, 1, x.shape[-1])
        x = torch.cat([x, g], dim=1)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            # Inject the excitation at the matching temporal resolution.
            x_source = self.noise_convs[i](har)
            x_source = self.noise_res[i](x_source)

            x = self.ups[i](x)
            if i == self.num_upsamples - 1:
                x = self.reflection_pad(x)

            x = x + x_source
            # Average over the parallel multi-kernel resblocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i*self.num_kernels+j](x)
                else:
                    xs += self.resblocks[i*self.num_kernels+j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        # First half of channels -> log-magnitude, second half -> phase.
        spec = torch.exp(x[:, :self.post_n_fft // 2 + 1, :])
        phase = torch.sin(x[:, self.post_n_fft // 2 + 1:, :])

        return spec, phase

    def get_f0(self, mel, f0_mean_tgt, voiced_threshold=10, interp=True):
        """Predict F0 from mel and shift its voiced log-mean to f0_mean_tgt (Hz).

        Used for pitch conversion at inference: preserves the contour while
        matching the target speaker's mean pitch.
        """
        f0, _, _ = self.F0_model(mel.unsqueeze(1))

        voiced = f0 > voiced_threshold

        # Shift in log-F0 space so relative intonation is preserved.
        lf0 = torch.log(f0)
        lf0_mean = lf0[voiced].mean()
        lf0_adj = lf0 - lf0_mean + math.log(f0_mean_tgt)
        f0_adj = torch.exp(lf0_adj)

        f0_adj = torch.where(voiced, f0_adj, 0)

        # interpolate F0 over unvoiced-but-not-silent frames
        if interp:
            f0_adj = self.interp_f0(f0_adj.unsqueeze(0), voiced.unsqueeze(0)).squeeze(0)
            # NOTE(review): summed log-mel energy > -700 as a crude VAD —
            # threshold presumably tuned for this mel setup; confirm.
            energy = torch.sum(mel.squeeze(0), dim=0)  # simple vad
            unsilent = energy > -700
            unsilent = unsilent | voiced
            f0_adj = torch.where(unsilent, f0_adj, 0)

        if len(f0_adj.shape) == 1:
            f0_adj = f0_adj.unsqueeze(0)

        return f0_adj

    def get_x(self, x, spk_emb, spk_id):
        """Encode content x and condition it on the speaker; mirrors the first
        half of forward() (everything before upsampling)."""
        g = self.embed_spk(spk_id).transpose(1, 2)
        g = g + spk_emb.unsqueeze(-1)

        x = self.enc(x)
        x = self.dec(x, g=g)
        g = g.repeat(1, 1, x.shape[-1])
        x = torch.cat([x, g], dim=1)

        return x

    def infer(self, x, f0, stft):
        """Inference: synthesize a waveform from conditioned features x and F0.

        ``stft`` is a TorchSTFT used for the final inverse transform (may
        differ from self.stft, e.g. device placement). Mirrors the second
        half of forward() but returns audio instead of (spec, phase).
        """
        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t

        har_source, _, _ = self.m_source(f0)
        har_source = har_source.transpose(1, 2).squeeze(1)
        har_spec, har_phase = self.stft.transform(har_source)
        har = torch.cat([har_spec, har_phase], dim=1)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x_source = self.noise_convs[i](har)
            x_source = self.noise_res[i](x_source)

            x = self.ups[i](x)
            if i == self.num_upsamples - 1:
                x = self.reflection_pad(x)

            x = x + x_source
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i*self.num_kernels+j](x)
                else:
                    xs += self.resblocks[i*self.num_kernels+j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        spec = torch.exp(x[:, :self.post_n_fft // 2 + 1, :])
        phase = torch.sin(x[:, self.post_n_fft // 2 + 1:, :])

        y = stft.inverse(spec, phase)

        return y

    def interp_f0(self, pitch, voiced):
        """Fill unvoiced regions via linear interpolation"""

        # Handle no voiced frames
        if not voiced.any():
            return pitch

        # Pitch is linear in base-2 log-space
        pitch = torch.log2(pitch)

        # Anchor endpoints so interpolation covers the whole sequence
        pitch[..., 0] = pitch[voiced][..., 0]
        pitch[..., -1] = pitch[voiced][..., -1]
        voiced[..., 0] = True
        voiced[..., -1] = True

        # Interpolate
        # NOTE(review): indexing voiced[0] / pitch[voiced][None] assumes
        # batch size 1 here — confirm callers never pass a larger batch.
        pitch[~voiced] = self.interp(
            torch.where(~voiced[0])[0][None],
            torch.where(voiced[0])[0][None],
            pitch[voiced][None])

        return 2 ** pitch

    @staticmethod
    def interp(x, xp, fp):
        """1D linear interpolation for monotonically increasing sample points"""
        # Handle edge cases
        if xp.shape[-1] == 0:
            return x
        if xp.shape[-1] == 1:
            return torch.full(
                x.shape,
                fp.squeeze(),
                device=fp.device,
                dtype=fp.dtype)

        # Get slope and intercept using right-side first-differences
        m = (fp[:, 1:] - fp[:, :-1]) / (xp[:, 1:] - xp[:, :-1])
        b = fp[:, :-1] - (m.mul(xp[:, :-1]))

        # Get indices to sample slope and intercept
        indicies = torch.sum(torch.ge(x[:, :, None], xp[:, None, :]), -1) - 1
        indicies = torch.clamp(indicies, 0, m.shape[-1] - 1)
        # NOTE(review): linspace(0, N, steps=1) yields a single 0, so
        # line_idx is all zeros after expand — i.e. row 0 is always used.
        # Correct only for batch size 1; confirm that is the intended use.
        line_idx = torch.linspace(
            0,
            indicies.shape[0],
            1,
            device=indicies.device).to(torch.long).expand(indicies.shape)

        # Interpolate
        return m[line_idx, indicies].mul(x) + b[line_idx, indicies]
def stft(x, fft_size, hop_size, win_length, window):
    """Perform STFT and convert to magnitude spectrogram.
    Args:
        x (Tensor): Input signal tensor (B, T).
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length.
        window (Tensor): Window tensor (e.g. torch.hann_window(win_length)).
    Returns:
        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
    """
    x_stft = torch.stft(x, fft_size, hop_size, win_length, window,
                        return_complex=True)
    # BUG FIX (dead code removed): the original computed
    #   real = x_stft[..., 0]; imag = x_stft[..., 1]
    # which, on a complex tensor, slices *time frames*, not real/imag parts
    # (a leftover from the pre-return_complex layout). Both values were
    # unused; torch.abs on the complex tensor is the correct magnitude and
    # already avoids nan/inf at zero bins.
    return torch.abs(x_stft).transpose(2, 1)
class SpecDiscriminator(nn.Module):
    """Discriminator operating on a single-resolution STFT magnitude spectrogram.

    Input: waveform (B, 1, T). Output: (flattened logits, list of feature maps)
    for feature-matching losses.
    """

    def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", use_spectral_norm=False):
        super(SpecDiscriminator, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.fft_size = fft_size
        self.shift_size = shift_size
        self.win_length = win_length
        # e.g. torch.hann_window(win_length); resolved by name for configurability
        self.window = getattr(torch, window)(win_length)
        self.discriminators = nn.ModuleList([
            norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
        ])

        self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))

    def forward(self, y):
        """y: (B, 1, T) waveform -> (logits (B, N), feature maps [6 tensors])."""
        fmap = []
        y = y.squeeze(1)
        # BUG FIX: the original moved the window with
        # self.window.to(y.get_device()); Tensor.get_device() returns -1 for
        # CPU tensors, which is not a valid .to() target, so CPU execution
        # crashed. y.device works uniformly for CPU and CUDA.
        y = stft(y, self.fft_size, self.shift_size, self.win_length, self.window.to(y.device))
        y = y.unsqueeze(1)
        for i, d in enumerate(self.discriminators):
            y = d(y)
            y = F.leaky_relu(y, LRELU_SLOPE)
            fmap.append(y)

        y = self.out(y)
        fmap.append(y)

        return torch.flatten(y, 1, -1), fmap
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of strided, grouped 1-D convolutions over
    the raw waveform, returning flattened logits plus per-layer feature maps
    for feature matching."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # (in_ch, out_ch, kernel, stride, groups, padding) per layer
        layer_specs = [
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, k, s, groups=g, padding=p))
            for c_in, c_out, k, s, g, p in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """x: (B, 1, T) -> (logits (B, N), feature maps [8 tensors])."""
        fmap = []
        h = x
        for conv in self.convs:
            h = F.leaky_relu(conv(h), LRELU_SLOPE)
            fmap.append(h)
        h = self.conv_post(h)
        fmap.append(h)
        return torch.flatten(h, 1, -1), fmap
def feature_loss(fmap_r, fmap_g):
    """L1 feature-matching loss between real and generated discriminator
    feature maps, summed over all discriminators and layers, scaled by 2."""
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real_feat, gen_feat in zip(real_maps, gen_maps):
            total = total + torch.mean(torch.abs(real_feat - gen_feat))
    return total * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """LSGAN discriminator loss: real outputs pushed to 1, fakes to 0.

    Returns (total_loss, per-discriminator real losses, fake losses) with the
    per-term losses detached as Python floats for logging.
    """
    total = 0
    r_losses = []
    g_losses = []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out) ** 2)
        gen_term = torch.mean(gen_out ** 2)
        total = total + real_term + gen_term
        r_losses.append(real_term.item())
        g_losses.append(gen_term.item())
    return total, r_losses, g_losses


def generator_loss(disc_outputs):
    """LSGAN generator loss: fake outputs pushed toward 1.

    Returns (total_loss, list of per-discriminator loss tensors)."""
    gen_losses = [torch.mean((1 - gen_out) ** 2) for gen_out in disc_outputs]
    total = 0
    for term in gen_losses:
        total = total + term
    return total, gen_losses


def _tprls_term(dr, dg, tau=0.04):
    """Truncated pointwise relativistic LS term, clipped at tau.

    Only the pairs where the real-vs-fake margin falls below its median
    contribute; the hinge caps each discriminator's contribution at tau.
    """
    m_DG = torch.median(dr - dg)
    L_rel = torch.mean(((dr - dg - m_DG) ** 2)[dr < dg + m_DG])
    return tau - F.relu(tau - L_rel)


def discriminator_TPRLS_loss(disc_real_outputs, disc_generated_outputs):
    """TPRLS loss for the discriminator (margin = real - fake)."""
    return sum(_tprls_term(real_out, gen_out)
               for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs))


def generator_TPRLS_loss(disc_real_outputs, disc_generated_outputs):
    """TPRLS loss for the generator: same term with the roles reversed
    (margin = fake - real), matching the original swapped iteration order."""
    return sum(_tprls_term(gen_out, real_out)
               for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs))
\ No newline at end of file diff --git a/stft.py b/stft.py new file mode 100644 index 0000000000000000000000000000000000000000..d352fcd89ef99e1b9422155bc1c2a483c737fc5b --- /dev/null +++ b/stft.py @@ -0,0 +1,209 @@ +""" +BSD 3-Clause License +Copyright (c) 2017, Prem Seetharaman +All rights reserved. +* Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + +import torch +import numpy as np +import torch.nn.functional as F +from torch.autograd import Variable +from scipy.signal import get_window +from librosa.util import pad_center, tiny +import librosa.util as librosa_util + +def window_sumsquare(window, n_frames, hop_length=200, win_length=800, + n_fft=800, dtype=np.float32, norm=None): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + n_frames : int > 0 + The number of analysis frames + hop_length : int > 0 + The number of samples to advance between frames + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + n_fft : int > 0 + The length of each analysis frame. + dtype : np.dtype + The data type of the output + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = librosa_util.normalize(win_sq, norm=norm)**2 + win_sq = librosa_util.pad_center(win_sq, n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] + return x + + +class STFT(torch.nn.Module): + """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" + def __init__(self, filter_length=800, hop_length=200, win_length=800, + window='hann'): + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length 
+ self.window = window + self.forward_transform = None + scale = self.filter_length / self.hop_length + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), + np.imag(fourier_basis[:cutoff, :])]) + + forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) + inverse_basis = torch.FloatTensor( + np.linalg.pinv(scale * fourier_basis).T[:, None, :]) + + if window is not None: + assert(filter_length >= win_length) + # get window and zero center pad it to filter_length + fft_window = get_window(window, win_length, fftbins=True) + fft_window = pad_center(fft_window, filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer('forward_basis', forward_basis.float()) + self.register_buffer('inverse_basis', inverse_basis.float()) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode='reflect') + input_data = input_data.squeeze(1) + + forward_transform = F.conv1d( + input_data, + Variable(self.forward_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + + magnitude = torch.sqrt(real_part**2 + imag_part**2) + phase = torch.autograd.Variable( + torch.atan2(imag_part.data, real_part.data)) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) + + 
inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, + Variable(self.inverse_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, magnitude.size(-1), hop_length=self.hop_length, + win_length=self.win_length, n_fft=self.filter_length, + dtype=np.float32) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0]) + window_sum = torch.autograd.Variable( + torch.from_numpy(window_sum), requires_grad=False) + window_sum = window_sum.to(inverse_transform.device()) if magnitude.is_cuda else window_sum + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] + + # scale by hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] + inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] + + return inverse_transform + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction + + +class TorchSTFT(torch.nn.Module): + def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'): + super().__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length + self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32)) + + def transform(self, input_data): + forward_transform = torch.stft( + input_data, + self.filter_length, self.hop_length, self.win_length, window=self.window.to(input_data.device), + return_complex=True) + + return torch.abs(forward_transform), torch.angle(forward_transform) + + def inverse(self, magnitude, phase): + inverse_transform = torch.istft( + magnitude * torch.exp(phase * 1j), + self.filter_length, self.hop_length, 
self.win_length, window=self.window.to(magnitude.device)) + + return inverse_transform.unsqueeze(-2) # unsqueeze to stay consistent with conv_transpose1d implementation + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction + + diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2a536e6205ba4252b41bdfe05b491908b500be --- /dev/null +++ b/utils.py @@ -0,0 +1,58 @@ +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm +matplotlib.use("Agg") +import matplotlib.pylab as plt + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] +