{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "HwaQq4GRU_Nw" }, "source": [ "# StarGANv2-VC Demo (VCTK 20 Speakers)" ] }, { "cell_type": "markdown", "metadata": { "id": "hCpoXuZeGKAn" }, "source": [ "### Utils" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%cd .." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 24923, "status": "ok", "timestamp": 1613984920200, "user": { "displayName": "Yinghao Li", "photoUrl": "", "userId": "12798981472803960591" }, "user_tz": 300 }, "id": "3on9IjGhVGTP", "outputId": "63a799f8-564d-48c2-fb0f-e66c0cd9fdb8" }, "outputs": [], "source": [ "# load packages\n", "import random\n", "import yaml\n", "from munch import Munch\n", "import numpy as np\n", "import paddle\n", "from paddle import nn\n", "import paddle.nn.functional as F\n", "import paddleaudio\n", "import librosa\n", "\n", "from starganv2vc_paddle.Utils.ASR.models import ASRCNN\n", "from starganv2vc_paddle.Utils.JDC.model import JDCNet\n", "from starganv2vc_paddle.models import Generator, MappingNetwork, StyleEncoder\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Source: http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/en_speaker_used.txt\n", "# Source: https://github.com/jjery2243542/voice_conversion\n", "\n", "speakers = [225,228,229,230,231,233,236,239,240,244,226,227,232,243,254,256,258,259,270,273]\n", "\n", "to_mel = paddleaudio.features.MelSpectrogram(\n", " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n", "to_mel.fbank_matrix[:] = paddle.load('starganv2vc_paddle/fbank_matrix.pd')['fbank_matrix']\n", "mean, std = -4, 4\n", "\n", "def preprocess(wave):\n", " wave_tensor = paddle.to_tensor(wave).astype(paddle.float32)\n", " mel_tensor = to_mel(wave_tensor)\n", " mel_tensor = (paddle.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n", " return mel_tensor\n", "\n", "def build_model(model_params={}):\n", " args = Munch(model_params)\n", " generator = Generator(args.dim_in, args.style_dim, args.max_conv_dim, w_hpf=args.w_hpf, F0_channel=args.F0_channel)\n", " mapping_network = MappingNetwork(args.latent_dim, args.style_dim, args.num_domains, hidden_dim=args.max_conv_dim)\n", " style_encoder = StyleEncoder(args.dim_in, args.style_dim, args.num_domains, args.max_conv_dim)\n", " \n", " nets_ema = Munch(generator=generator,\n", " mapping_network=mapping_network,\n", " style_encoder=style_encoder)\n", "\n", " return nets_ema\n", "\n", "def compute_style(speaker_dicts):\n", " reference_embeddings = {}\n", " for key, (path, speaker) in speaker_dicts.items():\n", " if path == \"\":\n", " label = paddle.to_tensor([speaker], dtype=paddle.int64)\n", " latent_dim = starganv2.mapping_network.shared[0].weight.shape[0]\n", " ref = starganv2.mapping_network(paddle.randn([1, latent_dim]), label)\n", " else:\n", " wave, sr = librosa.load(path, sr=24000)\n", " audio, index = librosa.effects.trim(wave, top_db=30)\n", " if sr != 24000:\n", " wave = librosa.resample(wave, sr, 24000)\n", " mel_tensor = preprocess(wave)\n", "\n", " with paddle.no_grad():\n", " label = paddle.to_tensor([speaker], dtype=paddle.int64)\n", " ref = starganv2.style_encoder(mel_tensor.unsqueeze(1), label)\n", " reference_embeddings[key] = (ref, label)\n", " \n", " return reference_embeddings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load models" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load F0 model\n", "\n", "F0_model = JDCNet(num_class=1, seq_len=192)\n", "params = paddle.load(\"Models/bst.pd\")['net']\n", "F0_model.set_state_dict(params)\n", "_ = F0_model.eval()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "executionInfo": { "elapsed": 43003, "status": "ok", "timestamp": 1613984938321, "user": { "displayName": "Yinghao Li", "photoUrl": "", "userId": "12798981472803960591" }, "user_tz": 300 }, "id": "NZA3ot-oF5t-" }, "outputs": [], "source": [ "# load vocoder\n", "\n", "import yaml\n", "import paddle\n", "\n", "from yacs.config import CfgNode\n", "from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator\n", "\n", "with open('Vocoder/config.yml') as f:\n", " voc_config = CfgNode(yaml.safe_load(f))\n", "voc_config[\"generator_params\"].pop(\"upsample_net\")\n", "voc_config[\"generator_params\"][\"upsample_scales\"] = voc_config[\"generator_params\"].pop(\"upsample_params\")[\"upsample_scales\"]\n", "vocoder = PWGGenerator(**voc_config[\"generator_params\"])\n", "vocoder.remove_weight_norm()\n", "vocoder.eval()\n", "vocoder.set_state_dict(paddle.load('Vocoder/checkpoint-400000steps.pd'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 24462, "status": "ok", "timestamp": 1613985522414, "user": { "displayName": "Yinghao Li", "photoUrl": "", "userId": "12798981472803960591" }, "user_tz": 300 }, "id": "Ou4367LCyefA", "outputId": "19c61f6f-f39a-43b9-9275-09418c2aebb4" }, "outputs": [], "source": [ "# load starganv2\n", "\n", "model_path = 'Models/vc_ema.pd'\n", "\n", "with open('Models/config.yml') as f:\n", " starganv2_config = yaml.safe_load(f)\n", "starganv2 = build_model(model_params=starganv2_config[\"model_params\"])\n", "params = paddle.load(model_path)\n", "params = params['model_ema']\n", "_ = [starganv2[key].set_state_dict(params[key]) for key in starganv2]\n", "_ = [starganv2[key].eval() for key in starganv2]\n", "starganv2.style_encoder = starganv2.style_encoder\n", "starganv2.mapping_network = starganv2.mapping_network\n", "starganv2.generator = starganv2.generator" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Conversion" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load input wave\n", "selected_speakers = [273, 259, 258, 243, 254, 244, 236, 233, 230, 228]\n", "k = random.choice(selected_speakers)\n", "wav_path = 'Demo/VCTK-corpus/p' + str(k) + '/p' + str(k) + '_023.wav'\n", "audio, source_sr = librosa.load(wav_path, sr=24000)\n", "audio = audio / np.max(np.abs(audio))\n", "audio.dtype = np.float32" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Convert by style encoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with reference, using style encoder\n", "speaker_dicts = {}\n", "for s in selected_speakers:\n", " k = s\n", " speaker_dicts['p' + str(s)] = ('Demo/VCTK-corpus/p' + str(k) + '/p' + str(k) + '_023.wav', speakers.index(s))\n", "\n", "reference_embeddings = compute_style(speaker_dicts)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 333 }, "executionInfo": { "elapsed": 1424, "status": "ok", "timestamp": 1613986299525, "user": { "displayName": "Yinghao Li", "photoUrl": "", "userId": "12798981472803960591" }, 
"user_tz": 300 }, "id": "T5tahObUyN-d", "outputId": "f4f38742-2235-4f59-cb2a-5008912cd870", "scrolled": true }, "outputs": [], "source": [ "# conversion \n", "import time\n", "start = time.time()\n", " \n", "source = preprocess(audio)\n", "keys = []\n", "converted_samples = {}\n", "reconstructed_samples = {}\n", "converted_mels = {}\n", "\n", "for key, (ref, _) in reference_embeddings.items():\n", " with paddle.no_grad():\n", " f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))\n", " out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)\n", " \n", " c = out.transpose([0,1,3,2]).squeeze()\n", " y_out = vocoder.inference(c)\n", " y_out = y_out.reshape([-1])\n", "\n", " if key not in speaker_dicts or speaker_dicts[key][0] == \"\":\n", " recon = None\n", " else:\n", " wave, sr = librosa.load(speaker_dicts[key][0], sr=24000)\n", " mel = preprocess(wave)\n", " c = mel.transpose([0,2,1]).squeeze()\n", " recon = vocoder.inference(c)\n", " recon = recon.reshape([-1]).numpy()\n", "\n", " converted_samples[key] = y_out.numpy()\n", " reconstructed_samples[key] = recon\n", "\n", " converted_mels[key] = out\n", " \n", " keys.append(key)\n", "end = time.time()\n", "print('total processing time: %.3f sec' % (end - start) )\n", "\n", "import IPython.display as ipd\n", "for key, wave in converted_samples.items():\n", " print('Converted: %s' % key)\n", " display(ipd.Audio(wave, rate=24000))\n", " print('Reference (vocoder): %s' % key)\n", " if reconstructed_samples[key] is not None:\n", " display(ipd.Audio(reconstructed_samples[key], rate=24000))\n", "\n", "print('Original (vocoder):')\n", "wave, sr = librosa.load(wav_path, sr=24000)\n", "mel = preprocess(wave)\n", "c = mel.transpose([0,2,1]).squeeze()\n", "with paddle.no_grad():\n", " recon = vocoder.inference(c)\n", " recon = recon.reshape([-1]).numpy()\n", "display(ipd.Audio(recon, rate=24000))\n", "print('Original:')\n", "display(ipd.Audio(wav_path, rate=24000))" ] }, { "cell_type": "markdown", "metadata": { "id": "SWh3o9hvGvJt" }, "source": [ "#### Convert by mapping network" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# no reference, using mapping network\n", "speaker_dicts = {}\n", "selected_speakers = [273, 259, 258, 243, 254, 244, 236, 233, 230, 228]\n", "for s in selected_speakers:\n", " k = s\n", " speaker_dicts['p' + str(s)] = ('', speakers.index(s))\n", "\n", "reference_embeddings = compute_style(speaker_dicts)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# conversion \n", "import time\n", "start = time.time()\n", " \n", "source = preprocess(audio)\n", "keys = []\n", "converted_samples = {}\n", "reconstructed_samples = {}\n", "converted_mels = {}\n", "\n", "for key, (ref, _) in reference_embeddings.items():\n", " with paddle.no_grad():\n", " f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))\n", " out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)\n", " \n", " c = out.transpose([0,1,3,2]).squeeze()\n", " y_out = vocoder.inference(c)\n", " y_out = y_out.reshape([-1])\n", "\n", " if key not in speaker_dicts or speaker_dicts[key][0] == \"\":\n", " recon = None\n", " else:\n", " wave, sr = librosa.load(speaker_dicts[key][0], sr=24000)\n", " mel = preprocess(wave)\n", " c = mel.transpose([0,2,1]).squeeze()\n", " recon = vocoder.inference(c)\n", " recon = recon.reshape([-1]).numpy()\n", "\n", " converted_samples[key] = y_out.numpy()\n", " reconstructed_samples[key] = recon\n", "\n", " 
converted_mels[key] = out\n", " \n", " keys.append(key)\n", "end = time.time()\n", "print('total processing time: %.3f sec' % (end - start) )\n", "\n", "import IPython.display as ipd\n", "for key, wave in converted_samples.items():\n", " print('Converted: %s' % key)\n", " display(ipd.Audio(wave, rate=24000))\n", " print('Reference (vocoder): %s' % key)\n", " if reconstructed_samples[key] is not None:\n", " display(ipd.Audio(reconstructed_samples[key], rate=24000))\n", "\n", "print('Original (vocoder):')\n", "wave, sr = librosa.load(wav_path, sr=24000)\n", "mel = preprocess(wave)\n", "c = mel.transpose([0,2,1]).squeeze().to('cuda')\n", "with paddle.no_grad():\n", " recon = vocoder.inference(c)\n", " recon = recon.reshape([-1]).numpy()\n", "display(ipd.Audio(recon, rate=24000))\n", "print('Original:')\n", "display(ipd.Audio(wav_path, rate=24000))" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [ "hCpoXuZeGKAn" ], "name": "Starganv2_vc.ipynb", "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 1 }