Commit 14cff58
Parent(s): 640063e
Upload 45 files
- .gitattributes +8 -26
- .gitignore +160 -0
- MANIFEST.in +4 -0
- README.md +28 -6
- app.py +86 -0
- data/azusa/azusa.pt +3 -0
- data/encoder.pt +3 -0
- data/g_hifigan.pt +3 -0
- data/ltyai/ltyai.pt +3 -0
- data/nanmei/nanmei.pt +3 -0
- data/tianyi/tianyi.pt +3 -0
- data/wavernn.pt +3 -0
- mockingbirdforuse/__init__.py +120 -0
- mockingbirdforuse/encoder/__init__.py +0 -0
- mockingbirdforuse/encoder/audio.py +121 -0
- mockingbirdforuse/encoder/hparams.py +42 -0
- mockingbirdforuse/encoder/inference.py +154 -0
- mockingbirdforuse/encoder/model.py +145 -0
- mockingbirdforuse/log.py +40 -0
- mockingbirdforuse/synthesizer/__init__.py +0 -0
- mockingbirdforuse/synthesizer/gst_hyperparameters.py +19 -0
- mockingbirdforuse/synthesizer/hparams.py +113 -0
- mockingbirdforuse/synthesizer/inference.py +151 -0
- mockingbirdforuse/synthesizer/models/global_style_token.py +175 -0
- mockingbirdforuse/synthesizer/models/tacotron.py +678 -0
- mockingbirdforuse/synthesizer/utils/__init__.py +46 -0
- mockingbirdforuse/synthesizer/utils/cleaners.py +91 -0
- mockingbirdforuse/synthesizer/utils/logmmse.py +245 -0
- mockingbirdforuse/synthesizer/utils/numbers.py +70 -0
- mockingbirdforuse/synthesizer/utils/symbols.py +20 -0
- mockingbirdforuse/synthesizer/utils/text.py +74 -0
- mockingbirdforuse/vocoder/__init__.py +0 -0
- mockingbirdforuse/vocoder/distribution.py +136 -0
- mockingbirdforuse/vocoder/hifigan/__init__.py +0 -0
- mockingbirdforuse/vocoder/hifigan/hparams.py +37 -0
- mockingbirdforuse/vocoder/hifigan/inference.py +32 -0
- mockingbirdforuse/vocoder/hifigan/models.py +460 -0
- mockingbirdforuse/vocoder/wavernn/__init__.py +0 -0
- mockingbirdforuse/vocoder/wavernn/audio.py +118 -0
- mockingbirdforuse/vocoder/wavernn/hparams.py +53 -0
- mockingbirdforuse/vocoder/wavernn/inference.py +56 -0
- mockingbirdforuse/vocoder/wavernn/models/deepmind_version.py +180 -0
- mockingbirdforuse/vocoder/wavernn/models/fatchord_version.py +445 -0
- packages.txt +3 -0
- requirements.txt +13 -0
.gitattributes
CHANGED
@@ -1,34 +1,16 @@
-*.
-*.
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
MANIFEST.in
ADDED
@@ -0,0 +1,4 @@
include assets/*
include inputs/*
include LICENSE
include requirements.txt
README.md
CHANGED
@@ -1,12 +1,34 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: MockingBird
+emoji: 🏃
+colorFrom: blue
+colorTo: red
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.1.7
 app_file: app.py
 pinned: false
 ---
 
-
+# Configuration
+
+`title`: _string_
+Display title for the Space
+
+`emoji`: _string_
+Space emoji (emoji-only character allowed)
+
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`sdk`: _string_
+Can be either `gradio` or `streamlit`
+
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code).
+Path is relative to the root of the repository.
+
+`pinned`: _boolean_
+Whether the Space stays on top of your list.
app.py
ADDED
@@ -0,0 +1,86 @@
import os
import httpx
import torch
import gradio as gr
from tempfile import NamedTemporaryFile
from pathlib import Path

from mockingbirdforuse import MockingBird


mockingbird = MockingBird()
mockingbird_path = Path(os.path.dirname(__file__)) / "data"
base_url = "https://al.smoe.top/d/Home/source/mockingbird/"

for sy in ["encoder.pt", "g_hifigan.pt", "wavernn.pt"]:
    if not os.path.exists(os.path.join(mockingbird_path, sy)):
        torch.hub.download_url_to_file(f"{base_url}/{sy}", mockingbird_path / sy)

for model in ["azusa", "nanmei", "ltyai", "tianyi"]:
    model_path = mockingbird_path / model
    model_path.mkdir(parents=True, exist_ok=True)
    for file_name in ["record.wav", f"{model}.pt"]:
        if not os.path.exists(os.path.join(model_path, file_name)):
            torch.hub.download_url_to_file(
                f"{base_url}/{model}/{file_name}", model_path / file_name
            )

mockingbird.load_model(
    Path(os.path.join(mockingbird_path, "encoder.pt")),
    Path(os.path.join(mockingbird_path, "g_hifigan.pt")),
    Path(os.path.join(mockingbird_path, "wavernn.pt")),
)


def inference(
    text: str,
    model_name: str,
    vocoder_type: str = "HifiGan",
    style_idx: int = 0,
    min_stop_token: int = 9,
    steps: int = 2000,
):
    model_path = mockingbird_path / model_name
    mockingbird.set_synthesizer(Path(os.path.join(model_path, f"{model_name}.pt")))
    fd = NamedTemporaryFile(suffix=".wav", delete=False)
    record = mockingbird.synthesize(
        text=str(text),
        input_wav=model_path / "record.wav",
        vocoder_type=vocoder_type,
        style_idx=style_idx,
        min_stop_token=min_stop_token,
        steps=steps,
    )
    with open(fd.name, "wb") as file:
        file.write(record.getvalue())
    return fd.name


title = "MockingBird"
description = "🚀AI拟声: 5秒内克隆您的声音并生成任意语音内容 Clone a voice in 5 seconds to generate arbitrary speech in real-time"
article = "<a href='https://github.com/babysor/MockingBird'>Github Repo</a></p>"

gr.Interface(
    inference,
    [
        gr.Textbox(label="Input"),
        gr.Radio(
            ["azusa", "nanmei", "ltyai", "tianyi"],
            label="model type",
            value="azusa",
        ),
        gr.Radio(
            ["HifiGan", "WaveRNN"],
            label="Vocoder type",
            value="HifiGan",
        ),
        gr.Slider(minimum=-1, maximum=9, step=1, label="style idx", value=0),
        gr.Slider(minimum=3, maximum=9, label="min stop token", value=9),
        gr.Slider(minimum=200, maximum=2000, label="steps", value=2000),
    ],
    gr.Audio(type="filepath", label="Output"),
    title=title,
    description=description,
    article=article,
    examples=[["阿梓不是你的电子播放器", "azusa", "HifiGan", 0, 9, 2000], ["不是", "nanmei", "HifiGan", 0, 9, 2000]],
).launch()
data/azusa/azusa.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0f5cc81057c8c7a5c8000ac8f5dd0335f878484640e69e2bb1f7a84d9b0bbf90
size 526153469
data/encoder.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:57715adc6f36047166ab06e37b904240aee2f4d10fc88f78ed91510cf4b38666
size 17095158
data/g_hifigan.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0c5b29830f9b42c481c108cb0b89d56f380928d4d46e1d30d65c92340ddc694e
size 51985448
data/ltyai/ltyai.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4bd4b759a30efd70d0064628c3b107aa7cd9d0bff8a36a242946a46d7c5235c
size 526153021
data/nanmei/nanmei.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:95e90985b4c6b8090d8b328e7b23078eb00cffa0464ca9982464f0000b44a2a9
size 526153469
data/tianyi/tianyi.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f9140c057ad8f4243e47a18103e773e0f823c4423927eec67dd47a3c3e9a9293
size 526153469
data/wavernn.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1d7a6861589e927e0fbdaa5849ca022258fe2b58a20cc7bfb8fb598ccf936169
size 53845290
mockingbirdforuse/__init__.py
ADDED
@@ -0,0 +1,120 @@
import re
import librosa
import numpy as np
from io import BytesIO
from pathlib import Path
from scipy.io import wavfile
from typing import List, Literal, Optional

from .encoder.inference import Encoder, preprocess_wav
from .synthesizer.inference import Synthesizer
from .vocoder.hifigan.inference import HifiGanVocoder
from .vocoder.wavernn.inference import WaveRNNVocoder
from .log import logger


def process_text(text: str) -> List[str]:
    punctuation = "!,。、,?!,"  # punctuate and split/clean text
    processed_texts = []
    text = re.sub(r"[{}]+".format(punctuation), "\n", text)
    for processed_text in text.split("\n"):
        if processed_text:
            processed_texts.append(processed_text.strip())
    return processed_texts


class MockingBird:
    def __init__(self):
        self.encoder: Optional[Encoder] = None
        self.gan_vocoder: Optional[HifiGanVocoder] = None
        self.rnn_vocoder: Optional[WaveRNNVocoder] = None
        self.synthesizer: Optional[Synthesizer] = None

    def load_model(
        self,
        encoder_path: Path,
        gan_vocoder_path: Optional[Path] = None,
        rnn_vocoder_path: Optional[Path] = None,
    ):
        """
        设置 Encoder模型 和 Vocoder模型 路径

        Args:
            encoder_path (Path): Encoder模型路径
            gan_vocoder_path (Path): HifiGan Vocoder模型路径,可选,需要用到 HifiGan 类型时必须填写
            rnn_vocoder_path (Path): WaveRNN Vocoder模型路径,可选,需要用到 WaveRNN 类型时必须填写
        """
        self.encoder = Encoder(encoder_path)
        if gan_vocoder_path:
            self.gan_vocoder = HifiGanVocoder(gan_vocoder_path)
        if rnn_vocoder_path:
            self.rnn_vocoder = WaveRNNVocoder(rnn_vocoder_path)

    def set_synthesizer(self, synthesizer_path: Path):
        """
        设置Synthesizer模型路径

        Args:
            synthesizer_path (Path): Synthesizer模型路径
        """
        self.synthesizer = Synthesizer(synthesizer_path)
        logger.info(f"using synthesizer model: {synthesizer_path}")

    def synthesize(
        self,
        text: str,
        input_wav: Path,
        vocoder_type: Literal["HifiGan", "WaveRNN"] = "HifiGan",
        style_idx: int = 0,
        min_stop_token: int = 5,
        steps: int = 1000,
    ) -> BytesIO:
        """
        生成语音

        Args:
            text (str): 目标文字
            input_wav (Path): 目标录音路径
            vocoder_type (HifiGan / WaveRNN): Vocoder模型,默认使用HifiGan
            style_idx (int, optional): Style 范围 -1~9,默认为 0
            min_stop_token (int, optional): Accuracy(精度) 范围3~9,默认为 5
            steps (int, optional): MaxLength(最大句长) 范围200~2000,默认为 1000
        """
        if not self.encoder:
            raise Exception("Please set encoder path first")

        if not self.synthesizer:
            raise Exception("Please set synthesizer path first")

        # Load input wav
        wav, sample_rate = librosa.load(input_wav)

        encoder_wav = preprocess_wav(wav, sample_rate)
        embed, _, _ = self.encoder.embed_utterance(encoder_wav, return_partials=True)

        # Load input text
        texts = process_text(text)

        # synthesize and vocode
        embeds = [embed] * len(texts)
        specs = self.synthesizer.synthesize_spectrograms(
            texts,
            embeds,
            style_idx=style_idx,
            min_stop_token=min_stop_token,
            steps=steps,
        )
        spec = np.concatenate(specs, axis=1)
        if vocoder_type == "WaveRNN":
            if not self.rnn_vocoder:
                raise Exception("Please set wavernn vocoder path first")
            wav, sample_rate = self.rnn_vocoder.infer_waveform(spec)
        else:
            if not self.gan_vocoder:
                raise Exception("Please set hifigan vocoder path first")
            wav, sample_rate = self.gan_vocoder.infer_waveform(spec)

        # Return cooked wav
        out = BytesIO()
        wavfile.write(out, sample_rate, wav.astype(np.float32))
        return out
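For reference, a minimal usage sketch of the MockingBird API defined above, mirroring what app.py does. The checkpoint filenames are the files uploaded under data/ in this commit; where they sit on disk is an assumption for the sketch, not part of the library.

from pathlib import Path
from mockingbirdforuse import MockingBird

# Assumed local copies of the checkpoints from this commit's data/ directory.
mb = MockingBird()
mb.load_model(
    encoder_path=Path("data/encoder.pt"),
    gan_vocoder_path=Path("data/g_hifigan.pt"),
    rnn_vocoder_path=Path("data/wavernn.pt"),
)
mb.set_synthesizer(Path("data/azusa/azusa.pt"))

# synthesize() returns a BytesIO holding a WAV stream.
wav_bytes = mb.synthesize(
    text="阿梓不是你的电子播放器",
    input_wav=Path("data/azusa/record.wav"),
    vocoder_type="HifiGan",
    style_idx=0,
    min_stop_token=9,
    steps=2000,
)
with open("output.wav", "wb") as f:
    f.write(wav_bytes.getvalue())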
mockingbirdforuse/encoder/__init__.py
ADDED
File without changes
mockingbirdforuse/encoder/audio.py
ADDED
@@ -0,0 +1,121 @@
import struct
import librosa
import webrtcvad
import numpy as np
from pathlib import Path
from typing import Optional, Union
from scipy.ndimage.morphology import binary_dilation

from .hparams import hparams as hp


def preprocess_wav(
    fpath_or_wav: Union[str, Path, np.ndarray],
    source_sr: Optional[int] = None,
    normalize: Optional[bool] = True,
    trim_silence: Optional[bool] = True,
):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), either the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
        wav, source_sr = librosa.load(str(fpath_or_wav))
    else:
        wav = fpath_or_wav

    # Resample the wav if needed
    if source_sr is not None and source_sr != hp.sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=hp.sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    if normalize:
        wav = normalize_volume(wav, hp.audio_norm_target_dBFS, increase_only=True)
    if webrtcvad and trim_silence:
        wav = trim_long_silences(wav)

    return wav


def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this not a log-mel spectrogram.
    """
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=hp.sampling_rate,
        n_fft=int(hp.sampling_rate * hp.mel_window_length / 1000),
        hop_length=int(hp.sampling_rate * hp.mel_window_step / 1000),
        n_mels=hp.mel_n_channels,
    )
    return frames.astype(np.float32).T


def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (hp.vad_window_length * hp.sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[: len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    int16_max = (2**15) - 1
    pcm_wave = struct.pack(
        "%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)
    )

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(
            vad.is_speech(
                pcm_wave[window_start * 2 : window_end * 2],
                sample_rate=hp.sampling_rate,
            )
        )
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate(
            (np.zeros((width - 1) // 2), array, np.zeros(width // 2))
        )
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1 :] / width

    audio_mask = moving_average(voice_flags, hp.vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(np.bool8)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(hp.vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask == True]


def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
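A small sketch of how these helpers fit together when preparing audio for the speaker encoder. The input filename is hypothetical; any format librosa can read works.

from pathlib import Path
from mockingbirdforuse.encoder import audio
from mockingbirdforuse.encoder.hparams import hparams as hp

# Hypothetical reference clip; preprocess_wav resamples to hp.sampling_rate (16 kHz),
# normalizes the volume, and trims long silences with WebRTC VAD.
wav = audio.preprocess_wav(Path("reference.wav"))
# Mel frames for the encoder: float32 array of shape (n_frames, hp.mel_n_channels).
mels = audio.wav_to_mel_spectrogram(wav)
print(hp.sampling_rate, mels.shape)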
mockingbirdforuse/encoder/hparams.py
ADDED
@@ -0,0 +1,42 @@
from dataclasses import dataclass


@dataclass
class HParams:
    ## Mel-filterbank
    mel_window_length = 25  # In milliseconds
    mel_window_step = 10  # In milliseconds
    mel_n_channels = 40

    ## Audio
    sampling_rate = 16000
    # Number of spectrogram frames in a partial utterance
    partials_n_frames = 160  # 1600 ms
    # Number of spectrogram frames at inference
    inference_n_frames = 80  # 800 ms

    ## Voice Activation Detection
    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
    # This sets the granularity of the VAD. Should not need to be changed.
    vad_window_length = 30  # In milliseconds
    # Number of frames to average together when performing the moving average smoothing.
    # The larger this value, the larger the VAD variations must be to not get smoothed out.
    vad_moving_average_width = 8
    # Maximum number of consecutive silent frames a segment can have.
    vad_max_silence_length = 6

    ## Audio volume normalization
    audio_norm_target_dBFS = -30

    ## Model parameters
    model_hidden_size = 256
    model_embedding_size = 256
    model_num_layers = 3

    ## Training parameters
    learning_rate_init = 1e-4
    speakers_per_batch = 64
    utterances_per_speaker = 10


hparams = HParams()
mockingbirdforuse/encoder/inference.py
ADDED
@@ -0,0 +1,154 @@
import torch
import numpy as np
from pathlib import Path

from . import audio
from .model import SpeakerEncoder
from .audio import preprocess_wav  # We want to expose this function from here
from .hparams import hparams as hp
from ..log import logger


class Encoder:
    def __init__(self, model_path: Path):
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._model = SpeakerEncoder(self._device, torch.device("cpu"))
        checkpoint = torch.load(model_path, self._device)
        self._model.load_state_dict(checkpoint["model_state"])
        self._model.eval()
        logger.info(
            f"Loaded encoder {model_path.name} trained to step {checkpoint['step']}"
        )

    def embed_frames_batch(self, frames_batch):
        """
        Computes embeddings for a batch of mel spectrogram.

        :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape
        (batch_size, n_frames, n_channels)
        :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
        """

        frames = torch.from_numpy(frames_batch).to(self._device)
        embed = self._model.forward(frames).detach().cpu().numpy()
        return embed

    def compute_partial_slices(
        self,
        n_samples,
        partial_utterance_n_frames=hp.partials_n_frames,
        min_pad_coverage=0.75,
        overlap=0.5,
        rate=None,
    ):
        """
        Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
        partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
        spectrogram slices are returned, so as to make each partial utterance waveform correspond to
        its spectrogram. This function assumes that the mel spectrogram parameters used are those
        defined in params_data.py.

        The returned ranges may be indexing further than the length of the waveform. It is
        recommended that you pad the waveform with zeros up to wave_slices[-1].stop.

        :param n_samples: the number of samples in the waveform
        :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
        utterance
        :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
        enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
        then the last partial utterance will be considered, as if we padded the audio. Otherwise,
        it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
        utterance, this parameter is ignored so that the function always returns at least 1 slice.
        :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
        utterances are entirely disjoint.
        :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
        respectively the waveform and the mel spectrogram with these slices to obtain the partial
        utterances.
        """
        assert 0 <= overlap < 1
        assert 0 < min_pad_coverage <= 1

        if rate != None:
            samples_per_frame = int((hp.sampling_rate * hp.mel_window_step / 1000))
            n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
            frame_step = int(np.round((hp.sampling_rate / rate) / samples_per_frame))
        else:
            samples_per_frame = int((hp.sampling_rate * hp.mel_window_step / 1000))
            n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
            frame_step = max(
                int(np.round(partial_utterance_n_frames * (1 - overlap))), 1
            )

        assert 0 < frame_step, "The rate is too high"
        assert (
            frame_step <= hp.partials_n_frames
        ), "The rate is too low, it should be %f at least" % (
            hp.sampling_rate / (samples_per_frame * hp.partials_n_frames)
        )

        # Compute the slices
        wav_slices, mel_slices = [], []
        steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
        for i in range(0, steps, frame_step):
            mel_range = np.array([i, i + partial_utterance_n_frames])
            wav_range = mel_range * samples_per_frame
            mel_slices.append(slice(*mel_range))
            wav_slices.append(slice(*wav_range))

        # Evaluate whether extra padding is warranted or not
        last_wav_range = wav_slices[-1]
        coverage = (n_samples - last_wav_range.start) / (
            last_wav_range.stop - last_wav_range.start
        )
        if coverage < min_pad_coverage and len(mel_slices) > 1:
            mel_slices = mel_slices[:-1]
            wav_slices = wav_slices[:-1]

        return wav_slices, mel_slices

    def embed_utterance(
        self, wav, using_partials: bool = True, return_partials: bool = False, **kwargs
    ):
        """
        Computes an embedding for a single utterance.

        :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
        :param using_partials: if True, then the utterance is split in partial utterances of
        <partial_utterance_n_frames> frames and the utterance embedding is computed from their
        normalized average. If False, the utterance is instead computed from feeding the entire
        spectogram to the network.
        :param return_partials: if True, the partial embeddings will also be returned along with the
        wav slices that correspond to the partial embeddings.
        :param kwargs: additional arguments to compute_partial_splits()
        :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
        <return_partials> is True, the partial utterances as a numpy array of float32 of shape
        (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
        returned. If <using_partials> is simultaneously set to False, both these values will be None
        instead.
        """
        # Process the entire utterance if not using partials
        if not using_partials:
            frames = audio.wav_to_mel_spectrogram(wav)
            embed = self.embed_frames_batch(frames[None, ...])[0]
            if return_partials:
                return embed, None, None
            return embed

        # Compute where to split the utterance into partials and pad if necessary
        wave_slices, mel_slices = self.compute_partial_slices(len(wav), **kwargs)
        max_wave_length = wave_slices[-1].stop
        if max_wave_length >= len(wav):
            wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

        # Split the utterance into partials
        frames = audio.wav_to_mel_spectrogram(wav)
        frames_batch = np.array([frames[s] for s in mel_slices])
        partial_embeds = self.embed_frames_batch(frames_batch)

        # Compute the utterance embedding from the partial embeddings
        raw_embed = np.mean(partial_embeds, axis=0)
        embed = raw_embed / np.linalg.norm(raw_embed, 2)

        if return_partials:
            return embed, partial_embeds, wave_slices
        return embed
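For reference, a minimal sketch of driving this Encoder directly, as the package's __init__.py does. The encoder checkpoint is the data/encoder.pt file from this commit; the reference clip path is hypothetical.

from pathlib import Path
from mockingbirdforuse.encoder.inference import Encoder, preprocess_wav

# data/encoder.pt is the checkpoint uploaded in this commit; "reference.wav" is a placeholder.
encoder = Encoder(Path("data/encoder.pt"))
wav = preprocess_wav(Path("reference.wav"))
# L2-normalized float32 vector of length hp.model_embedding_size (256).
embed = encoder.embed_utterance(wav)
print(embed.shape)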
mockingbirdforuse/encoder/model.py
ADDED
@@ -0,0 +1,145 @@
import torch
import numpy as np
from torch import nn
from scipy.optimize import brentq
from sklearn.metrics import roc_curve
from scipy.interpolate import interp1d
from torch.nn.parameter import Parameter
from torch.nn.utils.clip_grad import clip_grad_norm_

from .hparams import hparams as hp


class SpeakerEncoder(nn.Module):
    def __init__(self, device, loss_device):
        super().__init__()
        self.loss_device = loss_device

        # Network defition
        self.lstm = nn.LSTM(
            input_size=hp.mel_n_channels,
            hidden_size=hp.model_hidden_size,
            num_layers=hp.model_num_layers,
            batch_first=True,
        ).to(device)
        self.linear = nn.Linear(
            in_features=hp.model_hidden_size, out_features=hp.model_embedding_size
        ).to(device)
        self.relu = torch.nn.ReLU().to(device)

        # Cosine similarity scaling (with fixed initial parameter values)
        self.similarity_weight = Parameter(torch.tensor([10.0])).to(loss_device)
        self.similarity_bias = Parameter(torch.tensor([-5.0])).to(loss_device)

        # Loss
        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)

    def do_gradient_ops(self):
        # Gradient scale
        self.similarity_weight.grad *= 0.01
        self.similarity_bias.grad *= 0.01

        # Gradient clipping
        clip_grad_norm_(self.parameters(), 3, norm_type=2)

    def forward(self, utterances, hidden_init=None):
        """
        Computes the embeddings of a batch of utterance spectrograms.

        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
        batch_size, hidden_size). Will default to a tensor of zeros if None.
        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
        """
        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
        # and the final cell state.
        out, (hidden, cell) = self.lstm(utterances, hidden_init)

        # We take only the hidden state of the last layer
        embeds_raw = self.relu(self.linear(hidden[-1]))

        # L2-normalize it
        embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)

        return embeds

    def similarity_matrix(self, embeds):
        """
        Computes the similarity matrix according the section 2.1 of GE2E.

        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, embedding_size)
        :return: the similarity matrix as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, speakers_per_batch)
        """
        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]

        # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
        centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
        centroids_incl = centroids_incl.clone() / (
            torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5
        )

        # Exclusive centroids (1 per utterance)
        centroids_excl = torch.sum(embeds, dim=1, keepdim=True) - embeds
        centroids_excl /= utterances_per_speaker - 1
        centroids_excl = centroids_excl.clone() / (
            torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5
        )

        # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
        # product of these vectors (which is just an element-wise multiplication reduced by a sum).
        # We vectorize the computation for efficiency.
        sim_matrix = torch.zeros(
            speakers_per_batch, utterances_per_speaker, speakers_per_batch
        ).to(self.loss_device)
        mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int32)
        for j in range(speakers_per_batch):
            mask = np.where(mask_matrix[j])[0]
            sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
            sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)

        ## Even more vectorized version (slower maybe because of transpose)
        # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
        # ).to(self.loss_device)
        # eye = np.eye(speakers_per_batch, dtype=np.int)
        # mask = np.where(1 - eye)
        # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
        # mask = np.where(eye)
        # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
        # sim_matrix2 = sim_matrix2.transpose(1, 2)

        sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
        return sim_matrix

    def loss(self, embeds):
        """
        Computes the softmax loss according the section 2.1 of GE2E.

        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, embedding_size)
        :return: the loss and the EER for this batch of embeddings.
        """
        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]

        # Loss
        sim_matrix = self.similarity_matrix(embeds)
        sim_matrix = sim_matrix.reshape(
            (speakers_per_batch * utterances_per_speaker, speakers_per_batch)
        )
        ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
        target = torch.from_numpy(ground_truth).long().to(self.loss_device)
        loss = self.loss_fn(sim_matrix, target)

        # EER (not backpropagated)
        with torch.no_grad():
            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int32)[0]
            labels = np.array([inv_argmax(i) for i in ground_truth])
            preds = sim_matrix.detach().cpu().numpy()

            # Snippet from https://yangcha.github.io/EER-ROC/
            fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
            eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0)

        return loss, eer
mockingbirdforuse/log.py
ADDED
@@ -0,0 +1,40 @@
import sys
import loguru

from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
    from loguru import Logger

logger: "Logger" = loguru.logger


class Filter:
    def __init__(self) -> None:
        self.level: Union[int, str] = "DEBUG"

    def __call__(self, record):
        module_name: str = record["name"]
        record["name"] = module_name.split(".")[0]
        levelno = (
            logger.level(self.level).no if isinstance(self.level, str) else self.level
        )
        return record["level"].no >= levelno


logger.remove()
default_filter: Filter = Filter()
default_format: str = (
    "<g>{time:MM-DD HH:mm:ss}</g> "
    "[<lvl>{level}</lvl>] "
    "<c><u>{name}</u></c> | "
    "{message}"
)
logger.add(
    sys.stdout,
    level=0,
    colorize=True,
    diagnose=False,
    filter=default_filter,
    format=default_format,
)
mockingbirdforuse/synthesizer/__init__.py
ADDED
File without changes
mockingbirdforuse/synthesizer/gst_hyperparameters.py
ADDED
@@ -0,0 +1,19 @@
from dataclasses import dataclass


@dataclass
class GSTHyperparameters:
    E = 512

    # reference encoder
    ref_enc_filters = [32, 32, 64, 64, 128, 128]

    # style token layer
    token_num = 10
    # token_emb_size = 256
    num_heads = 8

    n_mels = 256  # Number of Mel banks to generate


hparams = GSTHyperparameters()
mockingbirdforuse/synthesizer/hparams.py
ADDED
@@ -0,0 +1,113 @@
from dataclasses import dataclass


@dataclass
class HParams:
    ### Signal Processing (used in both synthesizer and vocoder)
    sample_rate = 16000
    n_fft = 800
    num_mels = 80
    hop_size = 200
    """Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)"""
    win_size = 800
    """Tacotron uses 50 ms frame length (set to sample_rate * 0.050)"""
    fmin = 55
    min_level_db = -100
    ref_level_db = 20
    max_abs_value = 4.0
    """Gradient explodes if too big, premature convergence if too small."""
    preemphasis = 0.97
    """Filter coefficient to use if preemphasize is True"""
    preemphasize = True

    ### Tacotron Text-to-Speech (TTS)
    tts_embed_dims = 512
    """Embedding dimension for the graphemes/phoneme inputs"""
    tts_encoder_dims = 256
    tts_decoder_dims = 128
    tts_postnet_dims = 512
    tts_encoder_K = 5
    tts_lstm_dims = 1024
    tts_postnet_K = 5
    tts_num_highways = 4
    tts_dropout = 0.5
    tts_cleaner_names = ["basic_cleaners"]
    tts_stop_threshold = -3.4
    """
    Value below which audio generation ends.
    For example, for a range of [-4, 4], this
    will terminate the sequence at the first
    frame that has all values < -3.4
    """

    ### Tacotron Training
    tts_schedule = [
        (2, 1e-3, 10_000, 12),
        (2, 5e-4, 15_000, 12),
        (2, 2e-4, 20_000, 12),
        (2, 1e-4, 30_000, 12),
        (2, 5e-5, 40_000, 12),
        (2, 1e-5, 60_000, 12),
        (2, 5e-6, 160_000, 12),
        (2, 3e-6, 320_000, 12),
        (2, 1e-6, 640_000, 12),
    ]
    """
    Progressive training schedule
    (r, lr, step, batch_size)
    r = reduction factor (# of mel frames synthesized for each decoder iteration)
    lr = learning rate
    """

    tts_clip_grad_norm = 1.0
    """clips the gradient norm to prevent explosion - set to None if not needed"""
    tts_eval_interval = 500
    """
    Number of steps between model evaluation (sample generation)
    Set to -1 to generate after completing epoch, or 0 to disable
    """
    tts_eval_num_samples = 1
    """Makes this number of samples"""
    tts_finetune_layers = []
    """For finetune usage, if set, only selected layers will be trained, available: encoder,encoder_proj,gst,decoder,postnet,post_proj"""

    ### Data Preprocessing
    max_mel_frames = 900
    rescale = True
    rescaling_max = 0.9
    synthesis_batch_size = 16
    """For vocoder preprocessing and inference."""

    ### Mel Visualization and Griffin-Lim
    signal_normalization = True
    power = 1.5
    griffin_lim_iters = 60

    ### Audio processing options
    fmax = 7600
    """Should not exceed (sample_rate // 2)"""
    allow_clipping_in_normalization = True
    """Used when signal_normalization = True"""
    clip_mels_length = True
    """If true, discards samples exceeding max_mel_frames"""
    use_lws = False
    """Fast spectrogram phase recovery using local weighted sums"""
    symmetric_mels = True
    """Sets mel range to [-max_abs_value, max_abs_value] if True, and [0, max_abs_value] if False"""
    trim_silence = True
    """Use with sample_rate of 16000 for best results"""

    ### SV2TTS
    speaker_embedding_size = 256
    """Dimension for the speaker embedding"""
    silence_min_duration_split = 0.4
    """Duration in seconds of a silence for an utterance to be split"""
    utterance_min_duration = 1.6
    """Duration in seconds below which utterances are discarded"""
    use_gst = True
    """Whether to use global style token"""
    use_ser_for_gst = True
    """Whether to use speaker embedding referenced for global style token"""


hparams = HParams()
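A quick sanity check of the frame parameters documented above, assuming the 16 kHz sample rate set in this file:

# hop_size and win_size correspond to the 12.5 ms frame shift and 50 ms frame
# length described in the docstrings above.
sample_rate = 16000
assert int(sample_rate * 0.0125) == 200  # hop_size
assert int(sample_rate * 0.050) == 800   # win_size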
mockingbirdforuse/synthesizer/inference.py
ADDED
@@ -0,0 +1,151 @@
1 |
+
import torch
|
2 |
+
import librosa
|
3 |
+
import numpy as np
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import Union, List
|
6 |
+
from pypinyin import lazy_pinyin, Style
|
7 |
+
|
8 |
+
from .hparams import hparams as hp
|
9 |
+
from .utils.symbols import symbols
|
10 |
+
from .models.tacotron import Tacotron
|
11 |
+
from .utils.text import text_to_sequence
|
12 |
+
from .utils.logmmse import denoise, profile_noise
|
13 |
+
from ..log import logger
|
14 |
+
|
15 |
+
|
16 |
+
class Synthesizer:
|
17 |
+
def __init__(self, model_path: Path):
|
18 |
+
# Check for GPU
|
19 |
+
if torch.cuda.is_available():
|
20 |
+
self.device = torch.device("cuda")
|
21 |
+
else:
|
22 |
+
self.device = torch.device("cpu")
|
23 |
+
logger.info(f"Synthesizer using device: {self.device}")
|
24 |
+
|
25 |
+
self._model = Tacotron(
|
26 |
+
embed_dims=hp.tts_embed_dims,
|
27 |
+
num_chars=len(symbols),
|
28 |
+
encoder_dims=hp.tts_encoder_dims,
|
29 |
+
decoder_dims=hp.tts_decoder_dims,
|
30 |
+
n_mels=hp.num_mels,
|
31 |
+
fft_bins=hp.num_mels,
|
32 |
+
postnet_dims=hp.tts_postnet_dims,
|
33 |
+
encoder_K=hp.tts_encoder_K,
|
34 |
+
lstm_dims=hp.tts_lstm_dims,
|
35 |
+
postnet_K=hp.tts_postnet_K,
|
36 |
+
num_highways=hp.tts_num_highways,
|
37 |
+
dropout=hp.tts_dropout,
|
38 |
+
stop_threshold=hp.tts_stop_threshold,
|
39 |
+
speaker_embedding_size=hp.speaker_embedding_size,
|
40 |
+
).to(self.device)
|
41 |
+
|
42 |
+
self._model.load(model_path, self.device)
|
43 |
+
self._model.eval()
|
44 |
+
|
45 |
+
logger.info(
|
46 |
+
'Loaded synthesizer "%s" trained to step %d'
|
47 |
+
% (model_path.name, self._model.state_dict()["step"])
|
48 |
+
)
|
49 |
+
|
50 |
+
def synthesize_spectrograms(
|
51 |
+
self,
|
52 |
+
texts: List[str],
|
53 |
+
embeddings: Union[np.ndarray, List[np.ndarray]],
|
54 |
+
return_alignments=False,
|
55 |
+
style_idx=0,
|
56 |
+
min_stop_token=5,
|
57 |
+
steps=2000,
|
58 |
+
):
|
59 |
+
"""
|
60 |
+
Synthesizes mel spectrograms from texts and speaker embeddings.
|
61 |
+
|
62 |
+
:param texts: a list of N text prompts to be synthesized
|
63 |
+
:param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
|
64 |
+
:param return_alignments: if True, a matrix representing the alignments between the
|
65 |
+
characters
|
66 |
+
and each decoder output step will be returned for each spectrogram
|
67 |
+
:return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
|
68 |
+
sequence length of spectrogram i, and possibly the alignments.
|
69 |
+
"""
|
70 |
+
|
71 |
+
logger.debug("Read " + str(texts))
|
72 |
+
texts = [
|
73 |
+
" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True))
|
74 |
+
for v in texts
|
75 |
+
]
|
76 |
+
logger.debug("Synthesizing " + str(texts))
|
77 |
+
# Preprocess text inputs
|
78 |
+
inputs = [text_to_sequence(text, hp.tts_cleaner_names) for text in texts]
|
79 |
+
if not isinstance(embeddings, list):
|
80 |
+
embeddings = [embeddings]
|
81 |
+
|
82 |
+
# Batch inputs
|
83 |
+
batched_inputs = [
|
84 |
+
inputs[i : i + hp.synthesis_batch_size]
|
85 |
+
for i in range(0, len(inputs), hp.synthesis_batch_size)
|
86 |
+
]
|
87 |
+
batched_embeds = [
|
88 |
+
embeddings[i : i + hp.synthesis_batch_size]
|
89 |
+
for i in range(0, len(embeddings), hp.synthesis_batch_size)
|
90 |
+
]
|
91 |
+
|
92 |
+
specs = []
|
93 |
+
alignments = []
|
94 |
+
for i, batch in enumerate(batched_inputs, 1):
|
95 |
+
logger.debug(f"\n| Generating {i}/{len(batched_inputs)}")
|
96 |
+
|
97 |
+
# Pad texts so they are all the same length
|
98 |
+
text_lens = [len(text) for text in batch]
|
99 |
+
max_text_len = max(text_lens)
|
100 |
+
chars = [pad1d(text, max_text_len) for text in batch]
|
101 |
+
chars = np.stack(chars)
|
102 |
+
|
103 |
+
# Stack speaker embeddings into 2D array for batch processing
|
104 |
+
speaker_embeds = np.stack(batched_embeds[i - 1])
|
105 |
+
|
106 |
+
# Convert to tensor
|
107 |
+
chars = torch.tensor(chars).long().to(self.device)
|
108 |
+
speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)
|
109 |
+
|
110 |
+
# Inference
|
111 |
+
_, mels, alignments = self._model.generate(
|
112 |
+
chars,
|
113 |
+
speaker_embeddings,
|
114 |
+
style_idx=style_idx,
|
115 |
+
min_stop_token=min_stop_token,
|
116 |
+
steps=steps,
|
117 |
+
)
|
118 |
+
mels = mels.detach().cpu().numpy()
|
119 |
+
for m in mels:
|
120 |
+
# Trim silence from end of each spectrogram
|
121 |
+
while np.max(m[:, -1]) < hp.tts_stop_threshold:
|
122 |
+
m = m[:, :-1]
|
123 |
+
specs.append(m)
|
124 |
+
|
125 |
+
logger.debug("\n\nDone.\n")
|
126 |
+
return (specs, alignments) if return_alignments else specs
|
127 |
+
|
128 |
+
@staticmethod
|
129 |
+
def load_preprocess_wav(fpath):
|
130 |
+
"""
|
131 |
+
Loads and preprocesses an audio file under the same conditions the audio files were used to
|
132 |
+
train the synthesizer.
|
133 |
+
"""
|
134 |
+
wav = librosa.load(path=str(fpath), sr=hp.sample_rate)[0]
|
135 |
+
if hp.rescale:
|
136 |
+
wav = wav / np.abs(wav).max() * hp.rescaling_max
|
137 |
+
# denoise
|
138 |
+
if len(wav) > hp.sample_rate * (0.3 + 0.1):
|
139 |
+
noise_wav = np.concatenate(
|
140 |
+
[
|
141 |
+
wav[: int(hp.sample_rate * 0.15)],
|
142 |
+
wav[-int(hp.sample_rate * 0.15) :],
|
143 |
+
]
|
144 |
+
)
|
145 |
+
profile = profile_noise(noise_wav, hp.sample_rate)
|
146 |
+
wav = denoise(wav, profile)
|
147 |
+
return wav
|
148 |
+
|
149 |
+
|
150 |
+
def pad1d(x, max_len, pad_value=0):
|
151 |
+
return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
|
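The padding-and-batching step above (pad1d plus np.stack) can be checked in isolation. The following standalone sketch uses made-up symbol IDs and only numpy; it is an illustration of that step, not a file in the repository.

import numpy as np

def pad1d(x, max_len, pad_value=0):
    # Right-pad a 1-D sequence of symbol IDs to max_len, as in the inference code above.
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)

# Three encoded texts of unequal length are padded to the longest one and stacked,
# giving a rectangular (batch, max_text_len) array ready to become a LongTensor.
inputs = [np.array([5, 3, 8]), np.array([1, 2]), np.array([9, 9, 9, 9])]
max_text_len = max(len(t) for t in inputs)
chars = np.stack([pad1d(t, max_text_len) for t in inputs])
print(chars.shape)  # (3, 4)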
mockingbirdforuse/synthesizer/models/global_style_token.py
ADDED
@@ -0,0 +1,175 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.init as init
|
4 |
+
from torch.nn.parameter import Parameter
|
5 |
+
import torch.nn.functional as tFunctional
|
6 |
+
|
7 |
+
from ..hparams import hparams as hp
|
8 |
+
from ..gst_hyperparameters import hparams as gst_hp
|
9 |
+
|
10 |
+
|
11 |
+
class GlobalStyleToken(nn.Module):
|
12 |
+
"""
|
13 |
+
inputs: style mel spectrograms [batch_size, num_spec_frames, num_mel]
|
14 |
+
speaker_embedding: optional speaker embedding [batch_size, speaker_embedding_dim]
|
15 |
+
outputs: [batch_size, embedding_dim]
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self, speaker_embedding_dim=None):
|
19 |
+
|
20 |
+
super().__init__()
|
21 |
+
self.encoder = ReferenceEncoder()
|
22 |
+
self.stl = STL(speaker_embedding_dim)
|
23 |
+
|
24 |
+
def forward(self, inputs, speaker_embedding=None):
|
25 |
+
enc_out = self.encoder(inputs)
|
26 |
+
# concat speaker_embedding according to https://github.com/mozilla/TTS/blob/master/TTS/tts/layers/gst_layers.py
|
27 |
+
if hp.use_ser_for_gst and speaker_embedding is not None:
|
28 |
+
enc_out = torch.cat([enc_out, speaker_embedding], dim=-1)
|
29 |
+
style_embed = self.stl(enc_out)
|
30 |
+
|
31 |
+
return style_embed
|
32 |
+
|
33 |
+
|
34 |
+
class ReferenceEncoder(nn.Module):
|
35 |
+
"""
|
36 |
+
inputs --- [N, Ty/r, n_mels*r] mels
|
37 |
+
outputs --- [N, ref_enc_gru_size]
|
38 |
+
"""
|
39 |
+
|
40 |
+
def __init__(self):
|
41 |
+
|
42 |
+
super().__init__()
|
43 |
+
K = len(gst_hp.ref_enc_filters)
|
44 |
+
filters = [1] + gst_hp.ref_enc_filters
|
45 |
+
convs = [
|
46 |
+
nn.Conv2d(
|
47 |
+
in_channels=filters[i],
|
48 |
+
out_channels=filters[i + 1],
|
49 |
+
kernel_size=(3, 3),
|
50 |
+
stride=(2, 2),
|
51 |
+
padding=(1, 1),
|
52 |
+
)
|
53 |
+
for i in range(K)
|
54 |
+
]
|
55 |
+
self.convs = nn.ModuleList(convs)
|
56 |
+
self.bns = nn.ModuleList(
|
57 |
+
[nn.BatchNorm2d(num_features=gst_hp.ref_enc_filters[i]) for i in range(K)]
|
58 |
+
)
|
59 |
+
|
60 |
+
out_channels = self.calculate_channels(gst_hp.n_mels, 3, 2, 1, K)
|
61 |
+
self.gru = nn.GRU(
|
62 |
+
input_size=gst_hp.ref_enc_filters[-1] * out_channels,
|
63 |
+
hidden_size=gst_hp.E // 2,
|
64 |
+
batch_first=True,
|
65 |
+
)
|
66 |
+
|
67 |
+
def forward(self, inputs):
|
68 |
+
N = inputs.size(0)
|
69 |
+
out = inputs.view(N, 1, -1, gst_hp.n_mels) # [N, 1, Ty, n_mels]
|
70 |
+
for conv, bn in zip(self.convs, self.bns):
|
71 |
+
out = conv(out)
|
72 |
+
out = bn(out)
|
73 |
+
out = tFunctional.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
|
74 |
+
|
75 |
+
out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
|
76 |
+
T = out.size(1)
|
77 |
+
N = out.size(0)
|
78 |
+
out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
|
79 |
+
|
80 |
+
self.gru.flatten_parameters()
|
81 |
+
memory, out = self.gru(out) # out --- [1, N, E//2]
|
82 |
+
|
83 |
+
return out.squeeze(0)
|
84 |
+
|
85 |
+
def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
|
86 |
+
for i in range(n_convs):
|
87 |
+
L = (L - kernel_size + 2 * pad) // stride + 1
|
88 |
+
return L
|
89 |
+
|
90 |
+
|
91 |
+
class STL(nn.Module):
|
92 |
+
"""
|
93 |
+
inputs --- [N, E//2]
|
94 |
+
"""
|
95 |
+
|
96 |
+
def __init__(self, speaker_embedding_dim=None):
|
97 |
+
|
98 |
+
super().__init__()
|
99 |
+
self.embed = Parameter(
|
100 |
+
torch.FloatTensor(gst_hp.token_num, gst_hp.E // gst_hp.num_heads)
|
101 |
+
)
|
102 |
+
d_q = gst_hp.E // 2
|
103 |
+
d_k = gst_hp.E // gst_hp.num_heads
|
104 |
+
# self.attention = MultiHeadAttention(gst_hp.num_heads, d_model, d_q, d_v)
|
105 |
+
if hp.use_ser_for_gst and speaker_embedding_dim is not None:
|
106 |
+
d_q += speaker_embedding_dim
|
107 |
+
self.attention = MultiHeadAttention(
|
108 |
+
query_dim=d_q, key_dim=d_k, num_units=gst_hp.E, num_heads=gst_hp.num_heads
|
109 |
+
)
|
110 |
+
|
111 |
+
init.normal_(self.embed, mean=0, std=0.5)
|
112 |
+
|
113 |
+
def forward(self, inputs):
|
114 |
+
N = inputs.size(0)
|
115 |
+
query = inputs.unsqueeze(1) # [N, 1, E//2]
|
116 |
+
keys = (
|
117 |
+
torch.tanh(self.embed).unsqueeze(0).expand(N, -1, -1)
|
118 |
+
) # [N, token_num, E // num_heads]
|
119 |
+
style_embed = self.attention(query, keys)
|
120 |
+
|
121 |
+
return style_embed
|
122 |
+
|
123 |
+
|
124 |
+
class MultiHeadAttention(nn.Module):
|
125 |
+
"""
|
126 |
+
input:
|
127 |
+
query --- [N, T_q, query_dim]
|
128 |
+
key --- [N, T_k, key_dim]
|
129 |
+
output:
|
130 |
+
out --- [N, T_q, num_units]
|
131 |
+
"""
|
132 |
+
|
133 |
+
def __init__(self, query_dim, key_dim, num_units, num_heads):
|
134 |
+
|
135 |
+
super().__init__()
|
136 |
+
self.num_units = num_units
|
137 |
+
self.num_heads = num_heads
|
138 |
+
self.key_dim = key_dim
|
139 |
+
|
140 |
+
self.W_query = nn.Linear(
|
141 |
+
in_features=query_dim, out_features=num_units, bias=False
|
142 |
+
)
|
143 |
+
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
144 |
+
self.W_value = nn.Linear(
|
145 |
+
in_features=key_dim, out_features=num_units, bias=False
|
146 |
+
)
|
147 |
+
|
148 |
+
def forward(self, query, key):
|
149 |
+
querys = self.W_query(query) # [N, T_q, num_units]
|
150 |
+
keys = self.W_key(key) # [N, T_k, num_units]
|
151 |
+
values = self.W_value(key)
|
152 |
+
|
153 |
+
split_size = self.num_units // self.num_heads
|
154 |
+
querys = torch.stack(
|
155 |
+
torch.split(querys, split_size, dim=2), dim=0
|
156 |
+
) # [h, N, T_q, num_units/h]
|
157 |
+
keys = torch.stack(
|
158 |
+
torch.split(keys, split_size, dim=2), dim=0
|
159 |
+
) # [h, N, T_k, num_units/h]
|
160 |
+
values = torch.stack(
|
161 |
+
torch.split(values, split_size, dim=2), dim=0
|
162 |
+
) # [h, N, T_k, num_units/h]
|
163 |
+
|
164 |
+
# score = softmax(QK^T / (d_k ** 0.5))
|
165 |
+
scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
|
166 |
+
scores = scores / (self.key_dim**0.5)
|
167 |
+
scores = tFunctional.softmax(scores, dim=3)
|
168 |
+
|
169 |
+
# out = score * V
|
170 |
+
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
|
171 |
+
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(
|
172 |
+
0
|
173 |
+
) # [N, T_q, num_units]
|
174 |
+
|
175 |
+
return out
|
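MultiHeadAttention above splits num_units across num_heads, attends per head, then concatenates the heads back. Below is a minimal shape-only sketch of that bookkeeping, using random tensors and illustrative dimensions; it is not the trained module.

import torch

N, T_q, T_k, num_units, num_heads = 2, 1, 10, 256, 8
split = num_units // num_heads  # per-head width, also used to scale the scores

querys = torch.randn(N, T_q, num_units)
keys = torch.randn(N, T_k, num_units)
values = torch.randn(N, T_k, num_units)

# Stack the per-head slices on a new leading axis: [h, N, T, num_units/h]
q = torch.stack(torch.split(querys, split, dim=2), dim=0)
k = torch.stack(torch.split(keys, split, dim=2), dim=0)
v = torch.stack(torch.split(values, split, dim=2), dim=0)

# Scaled dot-product attention per head, then merge the heads back on the last axis
scores = torch.softmax(q @ k.transpose(2, 3) / split ** 0.5, dim=3)
out = torch.cat(torch.split(scores @ v, 1, dim=0), dim=3).squeeze(0)
print(out.shape)  # torch.Size([2, 1, 256])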
mockingbirdforuse/synthesizer/models/tacotron.py
ADDED
@@ -0,0 +1,678 @@
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
import torch.nn as nn
|
4 |
+
import torch.nn.functional as F
|
5 |
+
|
6 |
+
from ..hparams import hparams as hp
|
7 |
+
from .global_style_token import GlobalStyleToken
|
8 |
+
from ..gst_hyperparameters import hparams as gst_hp
|
9 |
+
from ...log import logger
|
10 |
+
|
11 |
+
|
12 |
+
class HighwayNetwork(nn.Module):
|
13 |
+
def __init__(self, size):
|
14 |
+
super().__init__()
|
15 |
+
self.W1 = nn.Linear(size, size)
|
16 |
+
self.W2 = nn.Linear(size, size)
|
17 |
+
self.W1.bias.data.fill_(0.0)
|
18 |
+
|
19 |
+
def forward(self, x):
|
20 |
+
x1 = self.W1(x)
|
21 |
+
x2 = self.W2(x)
|
22 |
+
g = torch.sigmoid(x2)
|
23 |
+
y = g * F.relu(x1) + (1.0 - g) * x
|
24 |
+
return y
|
25 |
+
|
26 |
+
|
27 |
+
class Encoder(nn.Module):
|
28 |
+
def __init__(self, embed_dims, num_chars, encoder_dims, K, num_highways, dropout):
|
29 |
+
super().__init__()
|
30 |
+
prenet_dims = (encoder_dims, encoder_dims)
|
31 |
+
cbhg_channels = encoder_dims
|
32 |
+
self.embedding = nn.Embedding(num_chars, embed_dims)
|
33 |
+
self.pre_net = PreNet(
|
34 |
+
embed_dims,
|
35 |
+
fc1_dims=prenet_dims[0],
|
36 |
+
fc2_dims=prenet_dims[1],
|
37 |
+
dropout=dropout,
|
38 |
+
)
|
39 |
+
self.cbhg = CBHG(
|
40 |
+
K=K,
|
41 |
+
in_channels=cbhg_channels,
|
42 |
+
channels=cbhg_channels,
|
43 |
+
proj_channels=[cbhg_channels, cbhg_channels],
|
44 |
+
num_highways=num_highways,
|
45 |
+
)
|
46 |
+
|
47 |
+
def forward(self, x, speaker_embedding=None):
|
48 |
+
x = self.embedding(x)
|
49 |
+
x = self.pre_net(x)
|
50 |
+
x.transpose_(1, 2)
|
51 |
+
x = self.cbhg(x)
|
52 |
+
if speaker_embedding is not None:
|
53 |
+
x = self.add_speaker_embedding(x, speaker_embedding)
|
54 |
+
return x
|
55 |
+
|
56 |
+
def add_speaker_embedding(self, x, speaker_embedding):
|
57 |
+
# SV2TTS
|
58 |
+
# The input x is the encoder output and is a 3D tensor with size (batch_size, num_chars, tts_embed_dims)
|
59 |
+
# When training, speaker_embedding is also a 2D tensor with size (batch_size, speaker_embedding_size)
|
60 |
+
# (for inference, speaker_embedding is a 1D tensor with size (speaker_embedding_size))
|
61 |
+
# This concats the speaker embedding for each char in the encoder output
|
62 |
+
|
63 |
+
# Save the dimensions as human-readable names
|
64 |
+
batch_size = x.size()[0]
|
65 |
+
num_chars = x.size()[1]
|
66 |
+
|
67 |
+
if speaker_embedding.dim() == 1:
|
68 |
+
idx = 0
|
69 |
+
else:
|
70 |
+
idx = 1
|
71 |
+
|
72 |
+
# Start by making a copy of each speaker embedding to match the input text length
|
73 |
+
# The output of this has size (batch_size, num_chars * speaker_embedding_size)
|
74 |
+
speaker_embedding_size = speaker_embedding.size()[idx]
|
75 |
+
e = speaker_embedding.repeat_interleave(num_chars, dim=idx)
|
76 |
+
|
77 |
+
# Reshape it and transpose
|
78 |
+
e = e.reshape(batch_size, speaker_embedding_size, num_chars)
|
79 |
+
e = e.transpose(1, 2)
|
80 |
+
|
81 |
+
# Concatenate the tiled speaker embedding with the encoder output
|
82 |
+
x = torch.cat((x, e), 2)
|
83 |
+
return x
|
84 |
+
|
85 |
+
|
86 |
+
class BatchNormConv(nn.Module):
|
87 |
+
def __init__(self, in_channels, out_channels, kernel, relu=True):
|
88 |
+
super().__init__()
|
89 |
+
self.conv = nn.Conv1d(
|
90 |
+
in_channels, out_channels, kernel, stride=1, padding=kernel // 2, bias=False
|
91 |
+
)
|
92 |
+
self.bnorm = nn.BatchNorm1d(out_channels)
|
93 |
+
self.relu = relu
|
94 |
+
|
95 |
+
def forward(self, x):
|
96 |
+
x = self.conv(x)
|
97 |
+
x = F.relu(x) if self.relu is True else x
|
98 |
+
return self.bnorm(x)
|
99 |
+
|
100 |
+
|
101 |
+
class CBHG(nn.Module):
|
102 |
+
def __init__(self, K, in_channels, channels, proj_channels, num_highways):
|
103 |
+
super().__init__()
|
104 |
+
|
105 |
+
# List of all rnns to call `flatten_parameters()` on
|
106 |
+
self._to_flatten = []
|
107 |
+
|
108 |
+
self.bank_kernels = [i for i in range(1, K + 1)]
|
109 |
+
self.conv1d_bank = nn.ModuleList()
|
110 |
+
for k in self.bank_kernels:
|
111 |
+
conv = BatchNormConv(in_channels, channels, k)
|
112 |
+
self.conv1d_bank.append(conv)
|
113 |
+
|
114 |
+
self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
|
115 |
+
|
116 |
+
self.conv_project1 = BatchNormConv(
|
117 |
+
len(self.bank_kernels) * channels, proj_channels[0], 3
|
118 |
+
)
|
119 |
+
self.conv_project2 = BatchNormConv(
|
120 |
+
proj_channels[0], proj_channels[1], 3, relu=False
|
121 |
+
)
|
122 |
+
|
123 |
+
# Fix the highway input if necessary
|
124 |
+
if proj_channels[-1] != channels:
|
125 |
+
self.highway_mismatch = True
|
126 |
+
self.pre_highway = nn.Linear(proj_channels[-1], channels, bias=False)
|
127 |
+
else:
|
128 |
+
self.highway_mismatch = False
|
129 |
+
|
130 |
+
self.highways = nn.ModuleList()
|
131 |
+
for i in range(num_highways):
|
132 |
+
hn = HighwayNetwork(channels)
|
133 |
+
self.highways.append(hn)
|
134 |
+
|
135 |
+
self.rnn = nn.GRU(channels, channels // 2, batch_first=True, bidirectional=True)
|
136 |
+
self._to_flatten.append(self.rnn)
|
137 |
+
|
138 |
+
# Avoid fragmentation of RNN parameters and associated warning
|
139 |
+
self._flatten_parameters()
|
140 |
+
|
141 |
+
def forward(self, x):
|
142 |
+
# Although we `_flatten_parameters()` on init, when using DataParallel
|
143 |
+
# the model gets replicated, making it no longer guaranteed that the
|
144 |
+
# weights are contiguous in GPU memory. Hence, we must call it again
|
145 |
+
self.rnn.flatten_parameters()
|
146 |
+
|
147 |
+
# Save these for later
|
148 |
+
residual = x
|
149 |
+
seq_len = x.size(-1)
|
150 |
+
conv_bank = []
|
151 |
+
|
152 |
+
# Convolution Bank
|
153 |
+
for conv in self.conv1d_bank:
|
154 |
+
c = conv(x) # Convolution
|
155 |
+
conv_bank.append(c[:, :, :seq_len])
|
156 |
+
|
157 |
+
# Stack along the channel axis
|
158 |
+
conv_bank = torch.cat(conv_bank, dim=1)
|
159 |
+
|
160 |
+
# dump the last padding to fit residual
|
161 |
+
x = self.maxpool(conv_bank)[:, :, :seq_len]
|
162 |
+
|
163 |
+
# Conv1d projections
|
164 |
+
x = self.conv_project1(x)
|
165 |
+
x = self.conv_project2(x)
|
166 |
+
|
167 |
+
# Residual Connect
|
168 |
+
x = x + residual
|
169 |
+
|
170 |
+
# Through the highways
|
171 |
+
x = x.transpose(1, 2)
|
172 |
+
if self.highway_mismatch is True:
|
173 |
+
x = self.pre_highway(x)
|
174 |
+
for h in self.highways:
|
175 |
+
x = h(x)
|
176 |
+
|
177 |
+
# And then the RNN
|
178 |
+
x, _ = self.rnn(x)
|
179 |
+
return x
|
180 |
+
|
181 |
+
def _flatten_parameters(self):
|
182 |
+
"""Calls `flatten_parameters` on all the rnns used by the WaveRNN. Used
|
183 |
+
to improve efficiency and avoid PyTorch yelling at us."""
|
184 |
+
[m.flatten_parameters() for m in self._to_flatten]
|
185 |
+
|
186 |
+
|
187 |
+
class PreNet(nn.Module):
|
188 |
+
def __init__(self, in_dims, fc1_dims=256, fc2_dims=128, dropout=0.5):
|
189 |
+
super().__init__()
|
190 |
+
self.fc1 = nn.Linear(in_dims, fc1_dims)
|
191 |
+
self.fc2 = nn.Linear(fc1_dims, fc2_dims)
|
192 |
+
self.p = dropout
|
193 |
+
|
194 |
+
def forward(self, x):
|
195 |
+
x = self.fc1(x)
|
196 |
+
x = F.relu(x)
|
197 |
+
x = F.dropout(x, self.p, training=True)
|
198 |
+
x = self.fc2(x)
|
199 |
+
x = F.relu(x)
|
200 |
+
x = F.dropout(x, self.p, training=True)
|
201 |
+
return x
|
202 |
+
|
203 |
+
|
204 |
+
class Attention(nn.Module):
|
205 |
+
def __init__(self, attn_dims):
|
206 |
+
super().__init__()
|
207 |
+
self.W = nn.Linear(attn_dims, attn_dims, bias=False)
|
208 |
+
self.v = nn.Linear(attn_dims, 1, bias=False)
|
209 |
+
|
210 |
+
def forward(self, encoder_seq_proj, query, t):
|
211 |
+
# Transform the query vector
|
212 |
+
query_proj = self.W(query).unsqueeze(1)
|
213 |
+
|
214 |
+
# Compute the scores
|
215 |
+
u = self.v(torch.tanh(encoder_seq_proj + query_proj))
|
216 |
+
scores = F.softmax(u, dim=1)
|
217 |
+
|
218 |
+
return scores.transpose(1, 2)
|
219 |
+
|
220 |
+
|
221 |
+
class LSA(nn.Module):
|
222 |
+
def __init__(self, attn_dim, kernel_size=31, filters=32):
|
223 |
+
super().__init__()
|
224 |
+
self.conv = nn.Conv1d(
|
225 |
+
1,
|
226 |
+
filters,
|
227 |
+
padding=(kernel_size - 1) // 2,
|
228 |
+
kernel_size=kernel_size,
|
229 |
+
bias=True,
|
230 |
+
)
|
231 |
+
self.L = nn.Linear(filters, attn_dim, bias=False)
|
232 |
+
self.W = nn.Linear(
|
233 |
+
attn_dim, attn_dim, bias=True
|
234 |
+
) # Include the attention bias in this term
|
235 |
+
self.v = nn.Linear(attn_dim, 1, bias=False)
|
236 |
+
self.cumulative = None
|
237 |
+
self.attention = None
|
238 |
+
|
239 |
+
def init_attention(self, encoder_seq_proj):
|
240 |
+
device = encoder_seq_proj.device # use same device as parameters
|
241 |
+
b, t, c = encoder_seq_proj.size()
|
242 |
+
self.cumulative = torch.zeros(b, t, device=device)
|
243 |
+
self.attention = torch.zeros(b, t, device=device)
|
244 |
+
|
245 |
+
def forward(self, encoder_seq_proj, query, t, chars):
|
246 |
+
|
247 |
+
if t == 0:
|
248 |
+
self.init_attention(encoder_seq_proj)
|
249 |
+
|
250 |
+
processed_query = self.W(query).unsqueeze(1)
|
251 |
+
|
252 |
+
location = self.cumulative.unsqueeze(1)
|
253 |
+
processed_loc = self.L(self.conv(location).transpose(1, 2))
|
254 |
+
|
255 |
+
u = self.v(torch.tanh(processed_query + encoder_seq_proj + processed_loc))
|
256 |
+
u = u.squeeze(-1)
|
257 |
+
|
258 |
+
# Mask zero padding chars
|
259 |
+
u = u * (chars != 0).float()
|
260 |
+
|
261 |
+
# Smooth Attention
|
262 |
+
# scores = torch.sigmoid(u) / torch.sigmoid(u).sum(dim=1, keepdim=True)
|
263 |
+
scores = F.softmax(u, dim=1)
|
264 |
+
self.attention = scores
|
265 |
+
self.cumulative = self.cumulative + self.attention
|
266 |
+
|
267 |
+
return scores.unsqueeze(-1).transpose(1, 2)
|
268 |
+
|
269 |
+
|
270 |
+
class Decoder(nn.Module):
|
271 |
+
# Class variable because its value doesn't change between classes
|
272 |
+
# yet ought to be scoped by class because it's a property of a Decoder
|
273 |
+
max_r = 20
|
274 |
+
|
275 |
+
def __init__(
|
276 |
+
self,
|
277 |
+
n_mels,
|
278 |
+
encoder_dims,
|
279 |
+
decoder_dims,
|
280 |
+
lstm_dims,
|
281 |
+
dropout,
|
282 |
+
speaker_embedding_size,
|
283 |
+
):
|
284 |
+
super().__init__()
|
285 |
+
self.register_buffer("r", torch.tensor(1, dtype=torch.int))
|
286 |
+
self.n_mels = n_mels
|
287 |
+
prenet_dims = (decoder_dims * 2, decoder_dims * 2)
|
288 |
+
self.prenet = PreNet(
|
289 |
+
n_mels, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1], dropout=dropout
|
290 |
+
)
|
291 |
+
self.attn_net = LSA(decoder_dims)
|
292 |
+
if hp.use_gst:
|
293 |
+
speaker_embedding_size += gst_hp.E
|
294 |
+
self.attn_rnn = nn.GRUCell(
|
295 |
+
encoder_dims + prenet_dims[1] + speaker_embedding_size, decoder_dims
|
296 |
+
)
|
297 |
+
self.rnn_input = nn.Linear(
|
298 |
+
encoder_dims + decoder_dims + speaker_embedding_size, lstm_dims
|
299 |
+
)
|
300 |
+
self.res_rnn1 = nn.LSTMCell(lstm_dims, lstm_dims)
|
301 |
+
self.res_rnn2 = nn.LSTMCell(lstm_dims, lstm_dims)
|
302 |
+
self.mel_proj = nn.Linear(lstm_dims, n_mels * self.max_r, bias=False)
|
303 |
+
self.stop_proj = nn.Linear(encoder_dims + speaker_embedding_size + lstm_dims, 1)
|
304 |
+
|
305 |
+
def zoneout(self, prev, current, device, p=0.1):
|
306 |
+
mask = torch.zeros(prev.size(), device=device).bernoulli_(p)
|
307 |
+
return prev * mask + current * (1 - mask)
|
308 |
+
|
309 |
+
def forward(
|
310 |
+
self,
|
311 |
+
encoder_seq,
|
312 |
+
encoder_seq_proj,
|
313 |
+
prenet_in,
|
314 |
+
hidden_states,
|
315 |
+
cell_states,
|
316 |
+
context_vec,
|
317 |
+
t,
|
318 |
+
chars,
|
319 |
+
):
|
320 |
+
|
321 |
+
# Need this for reshaping mels
|
322 |
+
batch_size = encoder_seq.size(0)
|
323 |
+
device = encoder_seq.device
|
324 |
+
# Unpack the hidden and cell states
|
325 |
+
attn_hidden, rnn1_hidden, rnn2_hidden = hidden_states
|
326 |
+
rnn1_cell, rnn2_cell = cell_states
|
327 |
+
|
328 |
+
# PreNet for the Attention RNN
|
329 |
+
prenet_out = self.prenet(prenet_in)
|
330 |
+
|
331 |
+
# Compute the Attention RNN hidden state
|
332 |
+
attn_rnn_in = torch.cat([context_vec, prenet_out], dim=-1)
|
333 |
+
attn_hidden = self.attn_rnn(attn_rnn_in.squeeze(1), attn_hidden)
|
334 |
+
|
335 |
+
# Compute the attention scores
|
336 |
+
scores = self.attn_net(encoder_seq_proj, attn_hidden, t, chars)
|
337 |
+
|
338 |
+
# Dot product to create the context vector
|
339 |
+
context_vec = scores @ encoder_seq
|
340 |
+
context_vec = context_vec.squeeze(1)
|
341 |
+
|
342 |
+
# Concat Attention RNN output w. Context Vector & project
|
343 |
+
x = torch.cat([context_vec, attn_hidden], dim=1)
|
344 |
+
x = self.rnn_input(x)
|
345 |
+
|
346 |
+
# Compute first Residual RNN
|
347 |
+
rnn1_hidden_next, rnn1_cell = self.res_rnn1(x, (rnn1_hidden, rnn1_cell))
|
348 |
+
if self.training:
|
349 |
+
rnn1_hidden = self.zoneout(rnn1_hidden, rnn1_hidden_next, device=device)
|
350 |
+
else:
|
351 |
+
rnn1_hidden = rnn1_hidden_next
|
352 |
+
x = x + rnn1_hidden
|
353 |
+
|
354 |
+
# Compute second Residual RNN
|
355 |
+
rnn2_hidden_next, rnn2_cell = self.res_rnn2(x, (rnn2_hidden, rnn2_cell))
|
356 |
+
if self.training:
|
357 |
+
rnn2_hidden = self.zoneout(rnn2_hidden, rnn2_hidden_next, device=device)
|
358 |
+
else:
|
359 |
+
rnn2_hidden = rnn2_hidden_next
|
360 |
+
x = x + rnn2_hidden
|
361 |
+
|
362 |
+
# Project Mels
|
363 |
+
mels = self.mel_proj(x)
|
364 |
+
mels = mels.view(batch_size, self.n_mels, self.max_r)[:, :, : self.r]
|
365 |
+
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
|
366 |
+
cell_states = (rnn1_cell, rnn2_cell)
|
367 |
+
|
368 |
+
# Stop token prediction
|
369 |
+
s = torch.cat((x, context_vec), dim=1)
|
370 |
+
s = self.stop_proj(s)
|
371 |
+
stop_tokens = torch.sigmoid(s)
|
372 |
+
|
373 |
+
return mels, scores, hidden_states, cell_states, context_vec, stop_tokens
|
374 |
+
|
375 |
+
|
376 |
+
class Tacotron(nn.Module):
|
377 |
+
def __init__(
|
378 |
+
self,
|
379 |
+
embed_dims,
|
380 |
+
num_chars,
|
381 |
+
encoder_dims,
|
382 |
+
decoder_dims,
|
383 |
+
n_mels,
|
384 |
+
fft_bins,
|
385 |
+
postnet_dims,
|
386 |
+
encoder_K,
|
387 |
+
lstm_dims,
|
388 |
+
postnet_K,
|
389 |
+
num_highways,
|
390 |
+
dropout,
|
391 |
+
stop_threshold,
|
392 |
+
speaker_embedding_size,
|
393 |
+
):
|
394 |
+
super().__init__()
|
395 |
+
self.n_mels = n_mels
|
396 |
+
self.lstm_dims = lstm_dims
|
397 |
+
self.encoder_dims = encoder_dims
|
398 |
+
self.decoder_dims = decoder_dims
|
399 |
+
self.speaker_embedding_size = speaker_embedding_size
|
400 |
+
self.encoder = Encoder(
|
401 |
+
embed_dims, num_chars, encoder_dims, encoder_K, num_highways, dropout
|
402 |
+
)
|
403 |
+
project_dims = encoder_dims + speaker_embedding_size
|
404 |
+
if hp.use_gst:
|
405 |
+
project_dims += gst_hp.E
|
406 |
+
self.encoder_proj = nn.Linear(project_dims, decoder_dims, bias=False)
|
407 |
+
if hp.use_gst:
|
408 |
+
self.gst = GlobalStyleToken(speaker_embedding_size)
|
409 |
+
self.decoder = Decoder(
|
410 |
+
n_mels,
|
411 |
+
encoder_dims,
|
412 |
+
decoder_dims,
|
413 |
+
lstm_dims,
|
414 |
+
dropout,
|
415 |
+
speaker_embedding_size,
|
416 |
+
)
|
417 |
+
self.postnet = CBHG(
|
418 |
+
postnet_K, n_mels, postnet_dims, [postnet_dims, fft_bins], num_highways
|
419 |
+
)
|
420 |
+
self.post_proj = nn.Linear(postnet_dims, fft_bins, bias=False)
|
421 |
+
|
422 |
+
self.init_model()
|
423 |
+
self.num_params()
|
424 |
+
|
425 |
+
self.register_buffer("step", torch.zeros(1, dtype=torch.long))
|
426 |
+
self.register_buffer(
|
427 |
+
"stop_threshold", torch.tensor(stop_threshold, dtype=torch.float32)
|
428 |
+
)
|
429 |
+
|
430 |
+
@property
|
431 |
+
def r(self):
|
432 |
+
return self.decoder.r.item()
|
433 |
+
|
434 |
+
@r.setter
|
435 |
+
def r(self, value):
|
436 |
+
self.decoder.r = self.decoder.r.new_tensor(value, requires_grad=False)
|
437 |
+
|
438 |
+
@staticmethod
|
439 |
+
def _concat_speaker_embedding(outputs, speaker_embeddings):
|
440 |
+
speaker_embeddings_ = speaker_embeddings.expand(
|
441 |
+
outputs.size(0), outputs.size(1), -1
|
442 |
+
)
|
443 |
+
outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
|
444 |
+
return outputs
|
445 |
+
|
446 |
+
def forward(self, texts, mels, speaker_embedding):
|
447 |
+
device = texts.device # use same device as parameters
|
448 |
+
|
449 |
+
self.step += 1
|
450 |
+
batch_size, _, steps = mels.size()
|
451 |
+
|
452 |
+
# Initialise all hidden states and pack into tuple
|
453 |
+
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
|
454 |
+
rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
455 |
+
rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
456 |
+
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
|
457 |
+
|
458 |
+
# Initialise all lstm cell states and pack into tuple
|
459 |
+
rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
460 |
+
rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
461 |
+
cell_states = (rnn1_cell, rnn2_cell)
|
462 |
+
|
463 |
+
# <GO> Frame for start of decoder loop
|
464 |
+
go_frame = torch.zeros(batch_size, self.n_mels, device=device)
|
465 |
+
|
466 |
+
# Need an initial context vector
|
467 |
+
size = self.encoder_dims + self.speaker_embedding_size
|
468 |
+
if hp.use_gst:
|
469 |
+
size += gst_hp.E
|
470 |
+
context_vec = torch.zeros(batch_size, size, device=device)
|
471 |
+
|
472 |
+
# SV2TTS: Run the encoder with the speaker embedding
|
473 |
+
# The projection avoids unnecessary matmuls in the decoder loop
|
474 |
+
encoder_seq = self.encoder(texts, speaker_embedding)
|
475 |
+
# put after encoder
|
476 |
+
if hp.use_gst and self.gst is not None:
|
477 |
+
style_embed = self.gst(
|
478 |
+
speaker_embedding, speaker_embedding
|
479 |
+
) # for training, speaker embedding can represent both style inputs and referenced
|
480 |
+
# style_embed = style_embed.expand_as(encoder_seq)
|
481 |
+
# encoder_seq = torch.cat((encoder_seq, style_embed), 2)
|
482 |
+
encoder_seq = self._concat_speaker_embedding(encoder_seq, style_embed)
|
483 |
+
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
484 |
+
|
485 |
+
# Need a couple of lists for outputs
|
486 |
+
mel_outputs, attn_scores, stop_outputs = [], [], []
|
487 |
+
|
488 |
+
# Run the decoder loop
|
489 |
+
for t in range(0, steps, self.r):
|
490 |
+
prenet_in = mels[:, :, t - 1] if t > 0 else go_frame
|
491 |
+
(
|
492 |
+
mel_frames,
|
493 |
+
scores,
|
494 |
+
hidden_states,
|
495 |
+
cell_states,
|
496 |
+
context_vec,
|
497 |
+
stop_tokens,
|
498 |
+
) = self.decoder(
|
499 |
+
encoder_seq,
|
500 |
+
encoder_seq_proj,
|
501 |
+
prenet_in,
|
502 |
+
hidden_states,
|
503 |
+
cell_states,
|
504 |
+
context_vec,
|
505 |
+
t,
|
506 |
+
texts,
|
507 |
+
)
|
508 |
+
mel_outputs.append(mel_frames)
|
509 |
+
attn_scores.append(scores)
|
510 |
+
stop_outputs.extend([stop_tokens] * self.r)
|
511 |
+
|
512 |
+
# Concat the mel outputs into sequence
|
513 |
+
mel_outputs = torch.cat(mel_outputs, dim=2)
|
514 |
+
|
515 |
+
# Post-Process for Linear Spectrograms
|
516 |
+
postnet_out = self.postnet(mel_outputs)
|
517 |
+
linear = self.post_proj(postnet_out)
|
518 |
+
linear = linear.transpose(1, 2)
|
519 |
+
|
520 |
+
# For easy visualisation
|
521 |
+
attn_scores = torch.cat(attn_scores, 1)
|
522 |
+
# attn_scores = attn_scores.cpu().data.numpy()
|
523 |
+
stop_outputs = torch.cat(stop_outputs, 1)
|
524 |
+
|
525 |
+
return mel_outputs, linear, attn_scores, stop_outputs
|
526 |
+
|
527 |
+
def generate(
|
528 |
+
self, x, speaker_embedding=None, steps=2000, style_idx=0, min_stop_token=5
|
529 |
+
):
|
530 |
+
self.eval()
|
531 |
+
device = x.device # use same device as parameters
|
532 |
+
|
533 |
+
batch_size, _ = x.size()
|
534 |
+
|
535 |
+
# Need to initialise all hidden states and pack into tuple for tidiness
|
536 |
+
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
|
537 |
+
rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
538 |
+
rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
539 |
+
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
|
540 |
+
|
541 |
+
# Need to initialise all lstm cell states and pack into tuple for tidiness
|
542 |
+
rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
543 |
+
rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
544 |
+
cell_states = (rnn1_cell, rnn2_cell)
|
545 |
+
|
546 |
+
# Need a <GO> Frame for start of decoder loop
|
547 |
+
go_frame = torch.zeros(batch_size, self.n_mels, device=device)
|
548 |
+
|
549 |
+
# Need an initial context vector
|
550 |
+
size = self.encoder_dims + self.speaker_embedding_size
|
551 |
+
if hp.use_gst:
|
552 |
+
size += gst_hp.E
|
553 |
+
context_vec = torch.zeros(batch_size, size, device=device)
|
554 |
+
|
555 |
+
# SV2TTS: Run the encoder with the speaker embedding
|
556 |
+
# The projection avoids unnecessary matmuls in the decoder loop
|
557 |
+
encoder_seq = self.encoder(x, speaker_embedding)
|
558 |
+
|
559 |
+
# put after encoder
|
560 |
+
if hp.use_gst and self.gst is not None:
|
561 |
+
if style_idx >= 0 and style_idx < 10:
|
562 |
+
query = torch.zeros(1, 1, self.gst.stl.attention.num_units)
|
563 |
+
if device.type == "cuda":
|
564 |
+
query = query.cuda()
|
565 |
+
gst_embed = torch.tanh(self.gst.stl.embed)
|
566 |
+
key = gst_embed[style_idx].unsqueeze(0).expand(1, -1, -1)
|
567 |
+
style_embed = self.gst.stl.attention(query, key)
|
568 |
+
else:
|
569 |
+
speaker_embedding_style = torch.zeros(
|
570 |
+
speaker_embedding.size()[0], 1, self.speaker_embedding_size
|
571 |
+
).to(device)
|
572 |
+
style_embed = self.gst(speaker_embedding_style, speaker_embedding)
|
573 |
+
encoder_seq = self._concat_speaker_embedding(encoder_seq, style_embed)
|
574 |
+
# style_embed = style_embed.expand_as(encoder_seq)
|
575 |
+
# encoder_seq = torch.cat((encoder_seq, style_embed), 2)
|
576 |
+
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
577 |
+
|
578 |
+
# Need a couple of lists for outputs
|
579 |
+
mel_outputs, attn_scores, stop_outputs = [], [], []
|
580 |
+
|
581 |
+
# Run the decoder loop
|
582 |
+
for t in range(0, steps, self.r):
|
583 |
+
prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
|
584 |
+
(
|
585 |
+
mel_frames,
|
586 |
+
scores,
|
587 |
+
hidden_states,
|
588 |
+
cell_states,
|
589 |
+
context_vec,
|
590 |
+
stop_tokens,
|
591 |
+
) = self.decoder(
|
592 |
+
encoder_seq,
|
593 |
+
encoder_seq_proj,
|
594 |
+
prenet_in,
|
595 |
+
hidden_states,
|
596 |
+
cell_states,
|
597 |
+
context_vec,
|
598 |
+
t,
|
599 |
+
x,
|
600 |
+
)
|
601 |
+
mel_outputs.append(mel_frames)
|
602 |
+
attn_scores.append(scores)
|
603 |
+
stop_outputs.extend([stop_tokens] * self.r)
|
604 |
+
# Stop the loop when all stop tokens in batch exceed threshold
|
605 |
+
if (stop_tokens * 10 > min_stop_token).all() and t > 10:
|
606 |
+
break
|
607 |
+
|
608 |
+
# Concat the mel outputs into sequence
|
609 |
+
mel_outputs = torch.cat(mel_outputs, dim=2)
|
610 |
+
|
611 |
+
# Post-Process for Linear Spectrograms
|
612 |
+
postnet_out = self.postnet(mel_outputs)
|
613 |
+
linear = self.post_proj(postnet_out)
|
614 |
+
|
615 |
+
linear = linear.transpose(1, 2)
|
616 |
+
|
617 |
+
# For easy visualisation
|
618 |
+
attn_scores = torch.cat(attn_scores, 1)
|
619 |
+
stop_outputs = torch.cat(stop_outputs, 1)
|
620 |
+
|
621 |
+
self.train()
|
622 |
+
|
623 |
+
return mel_outputs, linear, attn_scores
|
624 |
+
|
625 |
+
def init_model(self):
|
626 |
+
for p in self.parameters():
|
627 |
+
if p.dim() > 1:
|
628 |
+
nn.init.xavier_uniform_(p)
|
629 |
+
|
630 |
+
def finetune_partial(self, whitelist_layers):
|
631 |
+
self.zero_grad()
|
632 |
+
for name, child in self.named_children():
|
633 |
+
if name in whitelist_layers:
|
634 |
+
logger.debug("Trainable Layer: %s" % name)
|
635 |
+
logger.debug(
|
636 |
+
"Trainable Parameters: %.3f"
|
637 |
+
% sum([np.prod(p.size()) for p in child.parameters()])
|
638 |
+
)
|
639 |
+
for param in child.parameters():
|
640 |
+
param.requires_grad = False
|
641 |
+
|
642 |
+
def get_step(self):
|
643 |
+
return self.step.data.item()
|
644 |
+
|
645 |
+
def reset_step(self):
|
646 |
+
# assignment to parameters or buffers is overloaded, updates internal dict entry
|
647 |
+
self.step = self.step.data.new_tensor(1)
|
648 |
+
|
649 |
+
def load(self, path, device, optimizer=None):
|
650 |
+
# Use device of model params as location for loaded state
|
651 |
+
checkpoint = torch.load(str(path), map_location=device)
|
652 |
+
self.load_state_dict(checkpoint["model_state"], strict=False)
|
653 |
+
|
654 |
+
if "optimizer_state" in checkpoint and optimizer is not None:
|
655 |
+
optimizer.load_state_dict(checkpoint["optimizer_state"])
|
656 |
+
|
657 |
+
def save(self, path, optimizer=None):
|
658 |
+
if optimizer is not None:
|
659 |
+
torch.save(
|
660 |
+
{
|
661 |
+
"model_state": self.state_dict(),
|
662 |
+
"optimizer_state": optimizer.state_dict(),
|
663 |
+
},
|
664 |
+
str(path),
|
665 |
+
)
|
666 |
+
else:
|
667 |
+
torch.save(
|
668 |
+
{
|
669 |
+
"model_state": self.state_dict(),
|
670 |
+
},
|
671 |
+
str(path),
|
672 |
+
)
|
673 |
+
|
674 |
+
def num_params(self):
|
675 |
+
parameters = filter(lambda p: p.requires_grad, self.parameters())
|
676 |
+
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
|
677 |
+
logger.debug("Trainable Parameters: %.3fM" % parameters)
|
678 |
+
return parameters
|
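The Decoder above regularises its recurrent states with zoneout: during training, each element of the new hidden state is kept or swapped back to the previous one at random. A self-contained sketch of that rule follows (illustration only, not the trained model).

import torch

def zoneout(prev, current, p=0.1):
    # With probability p an element keeps its previous value, otherwise the new one is used.
    mask = torch.zeros_like(prev).bernoulli_(p)
    return prev * mask + current * (1 - mask)

prev_h = torch.zeros(4, 8)
new_h = torch.ones(4, 8)
mixed = zoneout(prev_h, new_h)
print(mixed.mean())  # close to 1 - p: most elements take the new state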
mockingbirdforuse/synthesizer/utils/__init__.py
ADDED
@@ -0,0 +1,46 @@
1 |
+
import torch
|
2 |
+
|
3 |
+
|
4 |
+
_output_ref = None
|
5 |
+
_replicas_ref = None
|
6 |
+
|
7 |
+
|
8 |
+
def data_parallel_workaround(model, *input):
|
9 |
+
global _output_ref
|
10 |
+
global _replicas_ref
|
11 |
+
device_ids = list(range(torch.cuda.device_count()))
|
12 |
+
output_device = device_ids[0]
|
13 |
+
replicas = torch.nn.parallel.replicate(model, device_ids)
|
14 |
+
# input.shape = (num_args, batch, ...)
|
15 |
+
inputs = torch.nn.parallel.scatter(input, device_ids)
|
16 |
+
# inputs.shape = (num_gpus, num_args, batch/num_gpus, ...)
|
17 |
+
replicas = replicas[: len(inputs)]
|
18 |
+
outputs = torch.nn.parallel.parallel_apply(replicas, inputs)
|
19 |
+
y_hat = torch.nn.parallel.gather(outputs, output_device)
|
20 |
+
_output_ref = outputs
|
21 |
+
_replicas_ref = replicas
|
22 |
+
return y_hat
|
23 |
+
|
24 |
+
|
25 |
+
class ValueWindow:
|
26 |
+
def __init__(self, window_size=100):
|
27 |
+
self._window_size = window_size
|
28 |
+
self._values = []
|
29 |
+
|
30 |
+
def append(self, x):
|
31 |
+
self._values = self._values[-(self._window_size - 1) :] + [x]
|
32 |
+
|
33 |
+
@property
|
34 |
+
def sum(self):
|
35 |
+
return sum(self._values)
|
36 |
+
|
37 |
+
@property
|
38 |
+
def count(self):
|
39 |
+
return len(self._values)
|
40 |
+
|
41 |
+
@property
|
42 |
+
def average(self):
|
43 |
+
return self.sum / max(1, self.count)
|
44 |
+
|
45 |
+
def reset(self):
|
46 |
+
self._values = []
|
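ValueWindow above is a small running window used to smooth training metrics. A usage sketch, assuming the package is importable as mockingbirdforuse.synthesizer.utils:

from mockingbirdforuse.synthesizer.utils import ValueWindow

window = ValueWindow(window_size=3)
for loss in (4.0, 2.0, 1.0, 1.0):
    window.append(loss)   # only the last three values are retained
print(window.count)       # 3
print(window.average)     # (2.0 + 1.0 + 1.0) / 3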
mockingbirdforuse/synthesizer/utils/cleaners.py
ADDED
@@ -0,0 +1,91 @@
1 |
+
"""
|
2 |
+
Cleaners are transformations that run over the input text at both training and eval time.
|
3 |
+
|
4 |
+
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
|
5 |
+
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
|
6 |
+
1. "english_cleaners" for English text
|
7 |
+
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
|
8 |
+
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
9 |
+
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
10 |
+
the symbols in symbols.py to match your data).
|
11 |
+
"""
|
12 |
+
|
13 |
+
import re
|
14 |
+
from unidecode import unidecode
|
15 |
+
from .numbers import normalize_numbers
|
16 |
+
|
17 |
+
# Regular expression matching whitespace:
|
18 |
+
_whitespace_re = re.compile(r"\s+")
|
19 |
+
|
20 |
+
# List of (regular expression, replacement) pairs for abbreviations:
|
21 |
+
_abbreviations = [
|
22 |
+
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
|
23 |
+
for x in [
|
24 |
+
("mrs", "misess"),
|
25 |
+
("mr", "mister"),
|
26 |
+
("dr", "doctor"),
|
27 |
+
("st", "saint"),
|
28 |
+
("co", "company"),
|
29 |
+
("jr", "junior"),
|
30 |
+
("maj", "major"),
|
31 |
+
("gen", "general"),
|
32 |
+
("drs", "doctors"),
|
33 |
+
("rev", "reverend"),
|
34 |
+
("lt", "lieutenant"),
|
35 |
+
("hon", "honorable"),
|
36 |
+
("sgt", "sergeant"),
|
37 |
+
("capt", "captain"),
|
38 |
+
("esq", "esquire"),
|
39 |
+
("ltd", "limited"),
|
40 |
+
("col", "colonel"),
|
41 |
+
("ft", "fort"),
|
42 |
+
]
|
43 |
+
]
|
44 |
+
|
45 |
+
|
46 |
+
def expand_abbreviations(text):
|
47 |
+
for regex, replacement in _abbreviations:
|
48 |
+
text = re.sub(regex, replacement, text)
|
49 |
+
return text
|
50 |
+
|
51 |
+
|
52 |
+
def expand_numbers(text):
|
53 |
+
return normalize_numbers(text)
|
54 |
+
|
55 |
+
|
56 |
+
def lowercase(text):
|
57 |
+
"""lowercase input tokens."""
|
58 |
+
return text.lower()
|
59 |
+
|
60 |
+
|
61 |
+
def collapse_whitespace(text):
|
62 |
+
return re.sub(_whitespace_re, " ", text)
|
63 |
+
|
64 |
+
|
65 |
+
def convert_to_ascii(text):
|
66 |
+
return unidecode(text)
|
67 |
+
|
68 |
+
|
69 |
+
def basic_cleaners(text):
|
70 |
+
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
|
71 |
+
text = lowercase(text)
|
72 |
+
text = collapse_whitespace(text)
|
73 |
+
return text
|
74 |
+
|
75 |
+
|
76 |
+
def transliteration_cleaners(text):
|
77 |
+
"""Pipeline for non-English text that transliterates to ASCII."""
|
78 |
+
text = convert_to_ascii(text)
|
79 |
+
text = lowercase(text)
|
80 |
+
text = collapse_whitespace(text)
|
81 |
+
return text
|
82 |
+
|
83 |
+
|
84 |
+
def english_cleaners(text):
|
85 |
+
"""Pipeline for English text, including number and abbreviation expansion."""
|
86 |
+
text = convert_to_ascii(text)
|
87 |
+
text = lowercase(text)
|
88 |
+
text = expand_numbers(text)
|
89 |
+
text = expand_abbreviations(text)
|
90 |
+
text = collapse_whitespace(text)
|
91 |
+
return text
|
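The cleaner pipelines above compose these small normalisation helpers. A usage sketch (import path assumed from this repository layout; the exact English wording of expanded numbers comes from inflect):

from mockingbirdforuse.synthesizer.utils.cleaners import basic_cleaners, english_cleaners

print(basic_cleaners("  Hello   WORLD "))          # lowercased, whitespace collapsed
print(english_cleaners("Dr. Smith paid $15.50"))   # e.g. "doctor smith paid fifteen dollars, fifty cents"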
mockingbirdforuse/synthesizer/utils/logmmse.py
ADDED
@@ -0,0 +1,245 @@
1 |
+
# The MIT License (MIT)
|
2 |
+
#
|
3 |
+
# Copyright (c) 2015 braindead
|
4 |
+
#
|
5 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
# of this software and associated documentation files (the "Software"), to deal
|
7 |
+
# in the Software without restriction, including without limitation the rights
|
8 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
# copies of the Software, and to permit persons to whom the Software is
|
10 |
+
# furnished to do so, subject to the following conditions:
|
11 |
+
#
|
12 |
+
# The above copyright notice and this permission notice shall be included in all
|
13 |
+
# copies or substantial portions of the Software.
|
14 |
+
#
|
15 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
# SOFTWARE.
|
22 |
+
#
|
23 |
+
#
|
24 |
+
# This code was extracted from the logmmse package (https://pypi.org/project/logmmse/) and I
|
25 |
+
# simply modified the interface to meet my needs.
|
26 |
+
|
27 |
+
|
28 |
+
import numpy as np
|
29 |
+
import math
|
30 |
+
from scipy.special import expn
|
31 |
+
from collections import namedtuple
|
32 |
+
|
33 |
+
NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2")
|
34 |
+
|
35 |
+
|
36 |
+
def profile_noise(noise, sampling_rate, window_size=0):
|
37 |
+
"""
|
38 |
+
Creates a profile of the noise in a given waveform.
|
39 |
+
|
40 |
+
:param noise: a waveform containing noise ONLY, as a numpy array of floats or ints.
|
41 |
+
:param sampling_rate: the sampling rate of the audio
|
42 |
+
:param window_size: the size of the window the logmmse algorithm operates on. A default value
|
43 |
+
will be picked if left as 0.
|
44 |
+
:return: a NoiseProfile object
|
45 |
+
"""
|
46 |
+
noise, dtype = to_float(noise)
|
47 |
+
noise += np.finfo(np.float64).eps
|
48 |
+
|
49 |
+
if window_size == 0:
|
50 |
+
window_size = int(math.floor(0.02 * sampling_rate))
|
51 |
+
|
52 |
+
if window_size % 2 == 1:
|
53 |
+
window_size = window_size + 1
|
54 |
+
|
55 |
+
perc = 50
|
56 |
+
len1 = int(math.floor(window_size * perc / 100))
|
57 |
+
len2 = int(window_size - len1)
|
58 |
+
|
59 |
+
win = np.hanning(window_size)
|
60 |
+
win = win * len2 / np.sum(win)
|
61 |
+
n_fft = 2 * window_size
|
62 |
+
|
63 |
+
noise_mean = np.zeros(n_fft)
|
64 |
+
n_frames = len(noise) // window_size
|
65 |
+
for j in range(0, window_size * n_frames, window_size):
|
66 |
+
noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0))
|
67 |
+
noise_mu2 = (noise_mean / n_frames) ** 2
|
68 |
+
|
69 |
+
return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2)
|
70 |
+
|
71 |
+
|
72 |
+
def denoise(wav, noise_profile: NoiseProfile, eta=0.15):
|
73 |
+
"""
|
74 |
+
Cleans the noise from a speech waveform given a noise profile. The waveform must have the
|
75 |
+
same sampling rate as the one used to create the noise profile.
|
76 |
+
|
77 |
+
:param wav: a speech waveform as a numpy array of floats or ints.
|
78 |
+
:param noise_profile: a NoiseProfile object that was created from a similar (or a segment of
|
79 |
+
the same) waveform.
|
80 |
+
:param eta: voice threshold for noise update. While the voice activation detection value is
|
81 |
+
below this threshold, the noise profile will be continuously updated throughout the audio.
|
82 |
+
Set to 0 to disable updating the noise profile.
|
83 |
+
:return: the clean wav as a numpy array of floats or ints of the same length.
|
84 |
+
"""
|
85 |
+
wav, dtype = to_float(wav)
|
86 |
+
wav += np.finfo(np.float64).eps
|
87 |
+
p = noise_profile
|
88 |
+
|
89 |
+
nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2))
|
90 |
+
x_final = np.zeros(nframes * p.len2)
|
91 |
+
|
92 |
+
aa = 0.98
|
93 |
+
mu = 0.98
|
94 |
+
ksi_min = 10 ** (-25 / 10)
|
95 |
+
|
96 |
+
x_old = np.zeros(p.len1)
|
97 |
+
xk_prev = np.zeros(p.len1)
|
98 |
+
noise_mu2 = p.noise_mu2
|
99 |
+
for k in range(0, nframes * p.len2, p.len2):
|
100 |
+
insign = p.win * wav[k:k + p.window_size]
|
101 |
+
|
102 |
+
spec = np.fft.fft(insign, p.n_fft, axis=0)
|
103 |
+
sig = np.absolute(spec)
|
104 |
+
sig2 = sig ** 2
|
105 |
+
|
106 |
+
gammak = np.minimum(sig2 / noise_mu2, 40)
|
107 |
+
|
108 |
+
if xk_prev.all() == 0:
|
109 |
+
ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
|
110 |
+
else:
|
111 |
+
ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
|
112 |
+
ksi = np.maximum(ksi_min, ksi)
|
113 |
+
|
114 |
+
log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi)
|
115 |
+
vad_decision = np.sum(log_sigma_k) / p.window_size
|
116 |
+
if vad_decision < eta:
|
117 |
+
noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
|
118 |
+
|
119 |
+
a = ksi / (1 + ksi)
|
120 |
+
vk = a * gammak
|
121 |
+
ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
|
122 |
+
hw = a * np.exp(ei_vk)
|
123 |
+
sig = sig * hw
|
124 |
+
xk_prev = sig ** 2
|
125 |
+
xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0)
|
126 |
+
xi_w = np.real(xi_w)
|
127 |
+
|
128 |
+
x_final[k:k + p.len2] = x_old + xi_w[0:p.len1]
|
129 |
+
x_old = xi_w[p.len1:p.window_size]
|
130 |
+
|
131 |
+
output = from_float(x_final, dtype)
|
132 |
+
output = np.pad(output, (0, len(wav) - len(output)), mode="constant")
|
133 |
+
return output
|
134 |
+
|
135 |
+
|
136 |
+
## Alternative VAD algorithm to webrtcvad. It has the advantage of not requiring installation of that
|
137 |
+
## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of
|
138 |
+
## webrtcvad
|
139 |
+
# def vad(wav, sampling_rate, eta=0.15, window_size=0):
|
140 |
+
# """
|
141 |
+
# TODO: fix doc
|
142 |
+
# Creates a profile of the noise in a given waveform.
|
143 |
+
#
|
144 |
+
# :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints.
|
145 |
+
# :param sampling_rate: the sampling rate of the audio
|
146 |
+
# :param window_size: the size of the window the logmmse algorithm operates on. A default value
|
147 |
+
# will be picked if left as 0.
|
148 |
+
# :param eta: voice threshold for noise update. While the voice activation detection value is
|
149 |
+
# below this threshold, the noise profile will be continuously updated throughout the audio.
|
150 |
+
# Set to 0 to disable updating the noise profile.
|
151 |
+
# """
|
152 |
+
# wav, dtype = to_float(wav)
|
153 |
+
# wav += np.finfo(np.float64).eps
|
154 |
+
#
|
155 |
+
# if window_size == 0:
|
156 |
+
# window_size = int(math.floor(0.02 * sampling_rate))
|
157 |
+
#
|
158 |
+
# if window_size % 2 == 1:
|
159 |
+
# window_size = window_size + 1
|
160 |
+
#
|
161 |
+
# perc = 50
|
162 |
+
# len1 = int(math.floor(window_size * perc / 100))
|
163 |
+
# len2 = int(window_size - len1)
|
164 |
+
#
|
165 |
+
# win = np.hanning(window_size)
|
166 |
+
# win = win * len2 / np.sum(win)
|
167 |
+
# n_fft = 2 * window_size
|
168 |
+
#
|
169 |
+
# wav_mean = np.zeros(n_fft)
|
170 |
+
# n_frames = len(wav) // window_size
|
171 |
+
# for j in range(0, window_size * n_frames, window_size):
|
172 |
+
# wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0))
|
173 |
+
# noise_mu2 = (wav_mean / n_frames) ** 2
|
174 |
+
#
|
175 |
+
# wav, dtype = to_float(wav)
|
176 |
+
# wav += np.finfo(np.float64).eps
|
177 |
+
#
|
178 |
+
# nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2))
|
179 |
+
# vad = np.zeros(nframes * len2, dtype=np.bool)
|
180 |
+
#
|
181 |
+
# aa = 0.98
|
182 |
+
# mu = 0.98
|
183 |
+
# ksi_min = 10 ** (-25 / 10)
|
184 |
+
#
|
185 |
+
# xk_prev = np.zeros(len1)
|
186 |
+
# noise_mu2 = noise_mu2
|
187 |
+
# for k in range(0, nframes * len2, len2):
|
188 |
+
# insign = win * wav[k:k + window_size]
|
189 |
+
#
|
190 |
+
# spec = np.fft.fft(insign, n_fft, axis=0)
|
191 |
+
# sig = np.absolute(spec)
|
192 |
+
# sig2 = sig ** 2
|
193 |
+
#
|
194 |
+
# gammak = np.minimum(sig2 / noise_mu2, 40)
|
195 |
+
#
|
196 |
+
# if xk_prev.all() == 0:
|
197 |
+
# ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
|
198 |
+
# else:
|
199 |
+
# ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
|
200 |
+
# ksi = np.maximum(ksi_min, ksi)
|
201 |
+
#
|
202 |
+
# log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi)
|
203 |
+
# vad_decision = np.sum(log_sigma_k) / window_size
|
204 |
+
# if vad_decision < eta:
|
205 |
+
# noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
|
206 |
+
#
|
207 |
+
# a = ksi / (1 + ksi)
|
208 |
+
# vk = a * gammak
|
209 |
+
# ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
|
210 |
+
# hw = a * np.exp(ei_vk)
|
211 |
+
# sig = sig * hw
|
212 |
+
# xk_prev = sig ** 2
|
213 |
+
#
|
214 |
+
# vad[k:k + len2] = vad_decision >= eta
|
215 |
+
#
|
216 |
+
# vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant")
|
217 |
+
# return vad
|
218 |
+
|
219 |
+
|
220 |
+
def to_float(_input):
|
221 |
+
if _input.dtype == np.float64:
|
222 |
+
return _input, _input.dtype
|
223 |
+
elif _input.dtype == np.float32:
|
224 |
+
return _input.astype(np.float64), _input.dtype
|
225 |
+
elif _input.dtype == np.uint8:
|
226 |
+
return (_input - 128) / 128., _input.dtype
|
227 |
+
elif _input.dtype == np.int16:
|
228 |
+
return _input / 32768., _input.dtype
|
229 |
+
elif _input.dtype == np.int32:
|
230 |
+
return _input / 2147483648., _input.dtype
|
231 |
+
raise ValueError('Unsupported wave file format')
|
232 |
+
|
233 |
+
|
234 |
+
def from_float(_input, dtype):
|
235 |
+
if dtype == np.float64:
|
236 |
+
return _input, np.float64
|
237 |
+
elif dtype == np.float32:
|
238 |
+
return _input.astype(np.float32)
|
239 |
+
elif dtype == np.uint8:
|
240 |
+
return ((_input * 128) + 128).astype(np.uint8)
|
241 |
+
elif dtype == np.int16:
|
242 |
+
return (_input * 32768).astype(np.int16)
|
243 |
+
elif dtype == np.int32:
|
244 |
+
return (_input * 2147483648).astype(np.int32)
|
245 |
+
raise ValueError('Unsupported wave file format')
|
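profile_noise and denoise above are what load_preprocess_wav in the synthesizer relies on: a noise-only segment is profiled, then the whole waveform is cleaned against that profile. A standalone sketch on synthetic audio (import path assumed; requires numpy and scipy; float32 input mirrors what librosa.load returns):

import numpy as np
from mockingbirdforuse.synthesizer.utils.logmmse import profile_noise, denoise

sr = 16000
rng = np.random.default_rng(0)
noise = rng.normal(scale=0.01, size=sr).astype(np.float32)                # 1 s of noise-only audio
tone = np.sin(2 * np.pi * 220.0 * np.arange(2 * sr) / sr)
speech = (tone + rng.normal(scale=0.01, size=2 * sr)).astype(np.float32)  # noisy 2 s signal

profile = profile_noise(noise, sr)    # estimate the noise spectrum from the noise-only segment
clean = denoise(speech, profile)      # suppress that noise in the full signal
print(clean.shape == speech.shape)    # True: the output is padded back to the input length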
mockingbirdforuse/synthesizer/utils/numbers.py
ADDED
@@ -0,0 +1,70 @@
1 |
+
import re
|
2 |
+
import inflect
|
3 |
+
|
4 |
+
_inflect = inflect.engine()
|
5 |
+
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
|
6 |
+
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
|
7 |
+
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
|
8 |
+
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
|
9 |
+
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
|
10 |
+
_number_re = re.compile(r"[0-9]+")
|
11 |
+
|
12 |
+
|
13 |
+
def _remove_commas(m):
|
14 |
+
return m.group(1).replace(",", "")
|
15 |
+
|
16 |
+
|
17 |
+
def _expand_decimal_point(m):
|
18 |
+
return m.group(1).replace(".", " point ")
|
19 |
+
|
20 |
+
|
21 |
+
def _expand_dollars(m):
|
22 |
+
match = m.group(1)
|
23 |
+
parts = match.split(".")
|
24 |
+
if len(parts) > 2:
|
25 |
+
return match + " dollars" # Unexpected format
|
26 |
+
dollars = int(parts[0]) if parts[0] else 0
|
27 |
+
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
28 |
+
if dollars and cents:
|
29 |
+
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
30 |
+
cent_unit = "cent" if cents == 1 else "cents"
|
31 |
+
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
|
32 |
+
elif dollars:
|
33 |
+
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
34 |
+
return "%s %s" % (dollars, dollar_unit)
|
35 |
+
elif cents:
|
36 |
+
cent_unit = "cent" if cents == 1 else "cents"
|
37 |
+
return "%s %s" % (cents, cent_unit)
|
38 |
+
else:
|
39 |
+
return "zero dollars"
|
40 |
+
|
41 |
+
|
42 |
+
def _expand_ordinal(m):
|
43 |
+
return _inflect.number_to_words(m.group(0))
|
44 |
+
|
45 |
+
|
46 |
+
def _expand_number(m):
|
47 |
+
num = int(m.group(0))
|
48 |
+
if num > 1000 and num < 3000:
|
49 |
+
if num == 2000:
|
50 |
+
return "two thousand"
|
51 |
+
elif num > 2000 and num < 2010:
|
52 |
+
return "two thousand " + _inflect.number_to_words(num % 100)
|
53 |
+
elif num % 100 == 0:
|
54 |
+
return _inflect.number_to_words(num // 100) + " hundred"
|
55 |
+
else:
|
56 |
+
return _inflect.number_to_words(
|
57 |
+
num, andword="", zero="oh", group=2
|
58 |
+
).replace(", ", " ")
|
59 |
+
else:
|
60 |
+
return _inflect.number_to_words(num, andword="")
|
61 |
+
|
62 |
+
|
63 |
+
def normalize_numbers(text):
|
64 |
+
text = re.sub(_comma_number_re, _remove_commas, text)
|
65 |
+
text = re.sub(_pounds_re, r"\1 pounds", text)
|
66 |
+
text = re.sub(_dollars_re, _expand_dollars, text)
|
67 |
+
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
|
68 |
+
text = re.sub(_ordinal_re, _expand_ordinal, text)
|
69 |
+
text = re.sub(_number_re, _expand_number, text)
|
70 |
+
return text
|
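normalize_numbers above applies the substitutions in a fixed order (commas, pounds, dollars, decimals, ordinals, plain numbers). A quick usage sketch (import path assumed; wording produced by inflect):

from mockingbirdforuse.synthesizer.utils.numbers import normalize_numbers

print(normalize_numbers("I paid $3 for 2 tickets on May 3rd, 1969."))
# e.g. "I paid three dollars for two tickets on May third, nineteen sixty-nine."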
mockingbirdforuse/synthesizer/utils/symbols.py
ADDED
@@ -0,0 +1,20 @@
1 |
+
"""
|
2 |
+
Defines the set of symbols used in text input to the model.
|
3 |
+
|
4 |
+
The default is a set of ASCII characters that works well for English or text that has been run
|
5 |
+
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
|
6 |
+
"""
|
7 |
+
# from . import cmudict
|
8 |
+
|
9 |
+
_pad = "_"
|
10 |
+
_eos = "~"
|
11 |
+
_characters = (
|
12 |
+
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!'(),-.:;? "
|
13 |
+
)
|
14 |
+
|
15 |
+
# _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? ' # use this old one if you want to train old model
|
16 |
+
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
17 |
+
# _arpabet = ["@' + s for s in cmudict.valid_symbols]
|
18 |
+
|
19 |
+
# Export all symbols:
|
20 |
+
symbols = [_pad, _eos] + list(_characters) # + _arpabet
|
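The symbol inventory above is just the padding and EOS markers followed by the character set; the position of each entry is the integer ID used by text_to_sequence in the next file. A quick check (import path assumed):

from mockingbirdforuse.synthesizer.utils.symbols import symbols

print(symbols[:5])         # ['_', '~', 'A', 'B', 'C']
print(symbols.index(" "))  # ID of the space that separates pinyin syllables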
mockingbirdforuse/synthesizer/utils/text.py
ADDED
@@ -0,0 +1,74 @@
1 |
+
from .symbols import symbols
|
2 |
+
from . import cleaners
|
3 |
+
import re
|
4 |
+
|
5 |
+
# Mappings from symbol to numeric ID and vice versa:
|
6 |
+
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
7 |
+
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
8 |
+
|
9 |
+
# Regular expression matching text enclosed in curly braces:
|
10 |
+
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
|
11 |
+
|
12 |
+
|
13 |
+
def text_to_sequence(text, cleaner_names):
|
14 |
+
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
15 |
+
|
16 |
+
The text can optionally have ARPAbet sequences enclosed in curly braces embedded
|
17 |
+
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
|
18 |
+
|
19 |
+
Args:
|
20 |
+
text: string to convert to a sequence
|
21 |
+
cleaner_names: names of the cleaner functions to run the text through
|
22 |
+
|
23 |
+
Returns:
|
24 |
+
List of integers corresponding to the symbols in the text
|
25 |
+
"""
|
26 |
+
sequence = []
|
27 |
+
|
28 |
+
# Check for curly braces and treat their contents as ARPAbet:
|
29 |
+
while len(text):
|
30 |
+
m = _curly_re.match(text)
|
31 |
+
if not m:
|
32 |
+
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
|
33 |
+
break
|
34 |
+
sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
|
35 |
+
sequence += _arpabet_to_sequence(m.group(2))
|
36 |
+
text = m.group(3)
|
37 |
+
|
38 |
+
# Append EOS token
|
39 |
+
sequence.append(_symbol_to_id["~"])
|
40 |
+
return sequence
|
41 |
+
|
42 |
+
|
43 |
+
def sequence_to_text(sequence):
|
44 |
+
"""Converts a sequence of IDs back to a string"""
|
45 |
+
result = ""
|
46 |
+
for symbol_id in sequence:
|
47 |
+
if symbol_id in _id_to_symbol:
|
48 |
+
s = _id_to_symbol[symbol_id]
|
49 |
+
# Enclose ARPAbet back in curly braces:
|
50 |
+
if len(s) > 1 and s[0] == "@":
|
51 |
+
s = "{%s}" % s[1:]
|
52 |
+
result += s
|
53 |
+
return result.replace("}{", " ")
|
54 |
+
|
55 |
+
|
56 |
+
def _clean_text(text, cleaner_names):
|
57 |
+
for name in cleaner_names:
|
58 |
+
cleaner = getattr(cleaners, name)
|
59 |
+
if not cleaner:
|
60 |
+
raise Exception("Unknown cleaner: %s" % name)
|
61 |
+
text = cleaner(text)
|
62 |
+
return text
|
63 |
+
|
64 |
+
|
65 |
+
def _symbols_to_sequence(symbols):
|
66 |
+
return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
|
67 |
+
|
68 |
+
|
69 |
+
def _arpabet_to_sequence(text):
|
70 |
+
return _symbols_to_sequence(["@" + s for s in text.split()])
|
71 |
+
|
72 |
+
|
73 |
+
def _should_keep_symbol(s):
|
74 |
+
return s in _symbol_to_id and s not in ("_", "~")
|
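A quick illustration of the text_to_sequence interface documented above (a sketch; "english_cleaners" is assumed to be one of the cleaner functions defined in cleaners.py elsewhere in this commit). Note that with the ARPAbet entries commented out in symbols.py, the "@"-prefixed tokens produced for curly-brace content are silently dropped by _should_keep_symbol:

    from mockingbirdforuse.synthesizer.utils.text import text_to_sequence, sequence_to_text

    seq = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.", ["english_cleaners"])
    print(sequence_to_text(seq))  # round-trips the plain characters; the ARPAbet chunk is filtered out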
mockingbirdforuse/vocoder/__init__.py
ADDED
File without changes
|
mockingbirdforuse/vocoder/distribution.py
ADDED
@@ -0,0 +1,136 @@
import torch
import numpy as np
import torch.nn.functional as F


def log_sum_exp(x):
    """numerically stable log_sum_exp implementation that prevents overflow"""
    # TF ordering
    axis = len(x.size()) - 1
    m, _ = torch.max(x, dim=axis)
    m2, _ = torch.max(x, dim=axis, keepdim=True)
    return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis))


# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
def discretized_mix_logistic_loss(
    y_hat, y, num_classes=65536, log_scale_min=None, reduce=True
):
    if log_scale_min is None:
        log_scale_min = float(np.log(1e-14))
    y_hat = y_hat.permute(0, 2, 1)
    assert y_hat.dim() == 3
    assert y_hat.size(1) % 3 == 0
    nr_mix = y_hat.size(1) // 3

    # (B x T x C)
    y_hat = y_hat.transpose(1, 2)

    # unpack parameters. (B, T, num_mixtures) x 3
    logit_probs = y_hat[:, :, :nr_mix]
    means = y_hat[:, :, nr_mix : 2 * nr_mix]
    log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min)

    # B x T x 1 -> B x T x num_mixtures
    y = y.expand_as(means)

    centered_y = y - means
    inv_stdv = torch.exp(-log_scales)
    plus_in = inv_stdv * (centered_y + 1.0 / (num_classes - 1))
    cdf_plus = torch.sigmoid(plus_in)
    min_in = inv_stdv * (centered_y - 1.0 / (num_classes - 1))
    cdf_min = torch.sigmoid(min_in)

    # log probability for edge case of 0 (before scaling)
    # equivalent: torch.log(F.sigmoid(plus_in))
    log_cdf_plus = plus_in - F.softplus(plus_in)

    # log probability for edge case of 255 (before scaling)
    # equivalent: (1 - F.sigmoid(min_in)).log()
    log_one_minus_cdf_min = -F.softplus(min_in)

    # probability for all other cases
    cdf_delta = cdf_plus - cdf_min

    mid_in = inv_stdv * centered_y
    # log probability in the center of the bin, to be used in extreme cases
    # (not actually used in our code)
    log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in)

    # tf equivalent
    """
    log_probs = tf.where(x < -0.999, log_cdf_plus,
                         tf.where(x > 0.999, log_one_minus_cdf_min,
                                  tf.where(cdf_delta > 1e-5,
                                           tf.log(tf.maximum(cdf_delta, 1e-12)),
                                           log_pdf_mid - np.log(127.5))))
    """
    # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
    # for num_classes=65536 case? 1e-7? not sure..
    inner_inner_cond = (cdf_delta > 1e-5).float()

    inner_inner_out = inner_inner_cond * torch.log(
        torch.clamp(cdf_delta, min=1e-12)
    ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
    inner_cond = (y > 0.999).float()
    inner_out = (
        inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out
    )
    cond = (y < -0.999).float()
    log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out

    log_probs = log_probs + F.log_softmax(logit_probs, -1)

    if reduce:
        return -torch.mean(log_sum_exp(log_probs))
    else:
        return -log_sum_exp(log_probs).unsqueeze(-1)


def sample_from_discretized_mix_logistic(y, log_scale_min=None):
    """
    Sample from discretized mixture of logistic distributions
    Args:
        y (Tensor): B x C x T
        log_scale_min (float): Log scale minimum value
    Returns:
        Tensor: sample in range of [-1, 1].
    """
    if log_scale_min is None:
        log_scale_min = float(np.log(1e-14))
    assert y.size(1) % 3 == 0
    nr_mix = y.size(1) // 3

    # B x T x C
    y = y.transpose(1, 2)
    logit_probs = y[:, :, :nr_mix]

    # sample mixture indicator from softmax
    temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
    temp = logit_probs.data - torch.log(-torch.log(temp))
    _, argmax = temp.max(dim=-1)

    # (B, T) -> (B, T, nr_mix)
    one_hot = to_one_hot(argmax, nr_mix)
    # select logistic parameters
    means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1)
    log_scales = torch.clamp(
        torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min
    )
    # sample from logistic & clip to interval
    # we don't actually round to the nearest 8bit value when sampling
    u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
    x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1.0 - u))

    x = torch.clamp(torch.clamp(x, min=-1.0), max=1.0)

    return x


def to_one_hot(tensor, n, fill_with=1.0):
    # we perform one hot encoding with respect to the last axis
    one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
    if tensor.is_cuda:
        one_hot = one_hot.cuda()
    one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
    return one_hot
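A shape-level sanity check for the sampling routine above (a sketch with 2 mixture components, so the channel dimension is 3 * 2 = 6):

    import torch
    from mockingbirdforuse.vocoder.distribution import sample_from_discretized_mix_logistic

    y = torch.randn(1, 6, 100)                   # B x C x T, with C = 3 * nr_mix
    x = sample_from_discretized_mix_logistic(y)  # -> B x T, values clipped to [-1, 1]
    print(x.shape, float(x.min()), float(x.max()))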
mockingbirdforuse/vocoder/hifigan/__init__.py
ADDED
File without changes
|
mockingbirdforuse/vocoder/hifigan/hparams.py
ADDED
@@ -0,0 +1,37 @@
from dataclasses import dataclass


@dataclass
class HParams:
    resblock = "1"
    num_gpus = 0
    batch_size = 16
    learning_rate = 0.0002
    adam_b1 = 0.8
    adam_b2 = 0.99
    lr_decay = 0.999
    seed = 1234

    upsample_rates = [5, 5, 4, 2]
    upsample_kernel_sizes = [10, 10, 8, 4]
    upsample_initial_channel = 512
    resblock_kernel_sizes = [3, 7, 11]
    resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]

    segment_size = 6400
    num_mels = 80
    num_freq = 1025
    n_fft = 1024
    hop_size = 200
    win_size = 800

    sampling_rate = 16000

    fmin = 0
    fmax = 7600
    fmax_for_loss = None

    num_workers = 4


hparams = HParams()
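One relationship worth noting in these settings: the product of upsample_rates is 5 * 5 * 4 * 2 = 200, which equals hop_size, so the generator expands each mel frame into exactly one hop of audio; at sampling_rate 16000 that corresponds to 16000 / 200 = 80 mel frames per second.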
mockingbirdforuse/vocoder/hifigan/inference.py
ADDED
@@ -0,0 +1,32 @@
import torch
from pathlib import Path

from .hparams import hparams as hp
from .models import Generator
from ...log import logger


class HifiGanVocoder:
    def __init__(self, model_path: Path):
        torch.manual_seed(hp.seed)
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.generator = Generator(hp).to(self._device)

        logger.debug("Loading '{}'".format(model_path))
        state_dict_g = torch.load(model_path, map_location=self._device)
        logger.debug("Complete.")

        self.generator.load_state_dict(state_dict_g["generator"])
        self.generator.eval()
        self.generator.remove_weight_norm()

    def infer_waveform(self, mel):
        mel = torch.FloatTensor(mel).to(self._device)
        mel = mel.unsqueeze(0)

        with torch.no_grad():
            y_g_hat = self.generator(mel)
            audio = y_g_hat.squeeze()
            audio = audio.cpu().numpy()

        return audio, hp.sampling_rate
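A minimal usage sketch for the wrapper above (the checkpoint path is illustrative; mel is assumed to be a (num_mels, frames) array produced by the synthesizer):

    from pathlib import Path
    from mockingbirdforuse.vocoder.hifigan.inference import HifiGanVocoder

    vocoder = HifiGanVocoder(Path("data/g_hifigan.pt"))
    audio, sr = vocoder.infer_waveform(mel)  # float waveform plus hp.sampling_rate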
mockingbirdforuse/vocoder/hifigan/models.py
ADDED
@@ -0,0 +1,460 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.spectral_norm import spectral_norm
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils.weight_norm import weight_norm, remove_weight_norm
from ...log import logger

LRELU_SLOPE = 0.1


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


class ResBlock1(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h
        self.convs1 = nn.ModuleList(
            [
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))),
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))),
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], padding=get_padding(kernel_size, dilation[2]))),
            ]
        )
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList(
            [
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
            ]
        )
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.h = h
        self.convs = nn.ModuleList(
            [
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))),
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))),
            ]
        )
        self.convs.apply(init_weights)

    def forward(self, x):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class InterpolationBlock(torch.nn.Module):
    def __init__(self, scale_factor, mode="nearest", align_corners=None, downsample=False):
        super(InterpolationBlock, self).__init__()
        self.downsample = downsample
        self.scale_factor = scale_factor
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        outputs = F.interpolate(
            x,
            size=x.shape[-1] * self.scale_factor
            if not self.downsample
            else x.shape[-1] // self.scale_factor,
            mode=self.mode,
            align_corners=self.align_corners,
            recompute_scale_factor=False,
        )
        return outputs


class Generator(torch.nn.Module):
    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == "1" else ResBlock2

        self.ups = nn.ModuleList()
        # for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
        #     self.ups.append(weight_norm(
        #         ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
        #                         k, u, padding=(k-u)//2)))
        if h.sampling_rate == 24000:
            for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
                self.ups.append(
                    torch.nn.Sequential(
                        InterpolationBlock(u),
                        weight_norm(
                            torch.nn.Conv1d(
                                h.upsample_initial_channel // (2**i),
                                h.upsample_initial_channel // (2 ** (i + 1)),
                                k,
                                padding=(k - 1) // 2,
                            )
                        ),
                    )
                )
        else:
            for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
                self.ups.append(
                    weight_norm(
                        ConvTranspose1d(
                            h.upsample_initial_channel // (2**i),
                            h.upsample_initial_channel // (2 ** (i + 1)),
                            k,
                            u,
                            padding=(u // 2 + u % 2),
                            output_padding=u % 2,
                        )
                    )
                )
        self.resblocks = nn.ModuleList()
        ch = 0
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        logger.debug("Removing weight norm...")
        for module in self.ups:
            if self.h.sampling_rate == 24000:
                remove_weight_norm(module[-1])
            else:
                remove_weight_norm(module)
        for module in self.resblocks:
            module.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
                norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
                norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
                norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
                norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self):
        super(MultiPeriodDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList(
            [
                DiscriminatorP(2),
                DiscriminatorP(3),
                DiscriminatorP(5),
                DiscriminatorP(7),
                DiscriminatorP(11),
            ]
        )

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 128, 15, 1, padding=7)),
                norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
                norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
                norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiScaleDiscriminator(torch.nn.Module):
    def __init__(self):
        super(MultiScaleDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList(
            [
                DiscriminatorS(use_spectral_norm=True),
                DiscriminatorS(),
                DiscriminatorS(),
            ]
        )
        self.meanpools = nn.ModuleList(
            [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]
        )

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            if i != 0:
                y = self.meanpools[i - 1](y)
                y_hat = self.meanpools[i - 1](y_hat)
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            loss += torch.mean(torch.abs(rl - gl))

    return loss * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        r_loss = torch.mean((1 - dr) ** 2)
        g_loss = torch.mean(dg**2)
        loss += r_loss + g_loss
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())

    return loss, r_losses, g_losses


def generator_loss(disc_outputs):
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        l = torch.mean((1 - dg) ** 2)
        gen_losses.append(l)
        loss += l

    return loss, gen_losses
mockingbirdforuse/vocoder/wavernn/__init__.py
ADDED
File without changes
|
mockingbirdforuse/vocoder/wavernn/audio.py
ADDED
@@ -0,0 +1,118 @@
import math
import librosa
import numpy as np
import soundfile as sf
from scipy.signal import lfilter

from .hparams import hparams as hp


def label_2_float(x, bits):
    return 2 * x / (2**bits - 1.0) - 1.0


def float_2_label(x, bits):
    assert abs(x).max() <= 1.0
    x = (x + 1.0) * (2**bits - 1) / 2
    return x.clip(0, 2**bits - 1)


def load_wav(path):
    return librosa.load(str(path), sr=hp.sample_rate)[0]


def save_wav(x, path):
    sf.write(path, x.astype(np.float32), hp.sample_rate)


def split_signal(x):
    unsigned = x + 2**15
    coarse = unsigned // 256
    fine = unsigned % 256
    return coarse, fine


def combine_signal(coarse, fine):
    return coarse * 256 + fine - 2**15


def encode_16bits(x):
    return np.clip(x * 2**15, -(2**15), 2**15 - 1).astype(np.int16)


mel_basis = None


def linear_to_mel(spectrogram):
    global mel_basis
    if mel_basis is None:
        mel_basis = build_mel_basis()
    return np.dot(mel_basis, spectrogram)


def build_mel_basis():
    return librosa.filters.mel(
        sr=hp.sample_rate,
        n_fft=hp.n_fft,
        n_mels=hp.num_mels,
        fmin=hp.fmin,
    )


def normalize(S):
    return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1)


def denormalize(S):
    return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db


def amp_to_db(x):
    return 20 * np.log10(np.maximum(1e-5, x))


def db_to_amp(x):
    return np.power(10.0, x * 0.05)


def spectrogram(y):
    D = stft(y)
    S = amp_to_db(np.abs(D)) - hp.ref_level_db
    return normalize(S)


def melspectrogram(y):
    D = stft(y)
    S = amp_to_db(linear_to_mel(np.abs(D)))
    return normalize(S)


def stft(y):
    return librosa.stft(
        y=y,
        n_fft=hp.n_fft,
        hop_length=hp.hop_length,
        win_length=hp.win_length,
    )


def pre_emphasis(x):
    return lfilter([1, -hp.preemphasis], [1], x)


def de_emphasis(x):
    return lfilter([1], [1, -hp.preemphasis], x)


def encode_mu_law(x, mu):
    mu = mu - 1
    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
    return np.floor((fx + 1) / 2 * mu + 0.5)


def decode_mu_law(y, mu, from_labels=True):
    if from_labels:
        y = label_2_float(y, math.log2(mu))
    mu = mu - 1
    x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1)
    return x
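The mu-law pair above is used with 2**bits classes; a small round-trip sketch at the 9-bit setting used by the WaveRNN hparams below:

    import numpy as np
    from mockingbirdforuse.vocoder.wavernn.audio import encode_mu_law, decode_mu_law

    x = np.linspace(-1, 1, 5)
    labels = encode_mu_law(x, mu=2**9)       # float labels in [0, 511]
    x_back = decode_mu_law(labels, mu=2**9)  # from_labels=True rescales via label_2_float first
    print(np.max(np.abs(x - x_back)))        # small quantisation error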
mockingbirdforuse/vocoder/wavernn/hparams.py
ADDED
@@ -0,0 +1,53 @@
from dataclasses import dataclass
from ...synthesizer.hparams import hparams as _syn_hp


@dataclass
class HParams:
    # Audio settings ------------------------------------------------------------------------
    # Match the values of the synthesizer
    sample_rate = _syn_hp.sample_rate
    n_fft = _syn_hp.n_fft
    num_mels = _syn_hp.num_mels
    hop_length = _syn_hp.hop_size
    win_length = _syn_hp.win_size
    fmin = _syn_hp.fmin
    min_level_db = _syn_hp.min_level_db
    ref_level_db = _syn_hp.ref_level_db
    mel_max_abs_value = _syn_hp.max_abs_value
    preemphasis = _syn_hp.preemphasis
    apply_preemphasis = _syn_hp.preemphasize

    bits = 9  # bit depth of signal
    mu_law = True  # Recommended to suppress noise if using raw bits in hp.voc_mode below

    # WAVERNN / VOCODER ----------------------------------------------------------------------
    voc_mode = "RAW"  # either 'RAW' (softmax on raw bits) or 'MOL' (sample from mixture of logistics)
    voc_upsample_factors = (5, 5, 8)  # NB - this needs to correctly factorise hop_length
    voc_rnn_dims = 512
    voc_fc_dims = 512
    voc_compute_dims = 128
    voc_res_out_dims = 128
    voc_res_blocks = 10

    # Training
    voc_batch_size = 100
    voc_lr = 1e-4
    voc_gen_at_checkpoint = 5  # number of samples to generate at each checkpoint
    voc_pad = 2  # this will pad the input so that the resnet can 'see' wider than input length
    voc_seq_len = hop_length * 5  # must be a multiple of hop_length

    # Generating / Synthesizing
    voc_gen_batched = True  # very fast (realtime+) single utterance batched generation
    voc_target = 8000  # target number of samples to be generated in each batch entry
    voc_overlap = 400  # number of samples for crossfading between batches


hparams = HParams()
mockingbirdforuse/vocoder/wavernn/inference.py
ADDED
@@ -0,0 +1,56 @@
import torch
from pathlib import Path

from .hparams import hparams as hp
from .models.fatchord_version import WaveRNN
from ...log import logger


class WaveRNNVocoder:
    def __init__(self, model_path: Path):
        logger.debug("Building Wave-RNN")
        self._model = WaveRNN(
            rnn_dims=hp.voc_rnn_dims,
            fc_dims=hp.voc_fc_dims,
            bits=hp.bits,
            pad=hp.voc_pad,
            upsample_factors=hp.voc_upsample_factors,
            feat_dims=hp.num_mels,
            compute_dims=hp.voc_compute_dims,
            res_out_dims=hp.voc_res_out_dims,
            res_blocks=hp.voc_res_blocks,
            hop_length=hp.hop_length,
            sample_rate=hp.sample_rate,
            mode=hp.voc_mode,
        )

        if torch.cuda.is_available():
            self._model = self._model.cuda()
            self._device = torch.device("cuda")
        else:
            self._device = torch.device("cpu")

        logger.debug("Loading model weights at %s" % model_path)
        checkpoint = torch.load(model_path, self._device)
        self._model.load_state_dict(checkpoint["model_state"])
        self._model.eval()

    def infer_waveform(
        self, mel, normalize=True, batched=True, target=8000, overlap=800
    ):
        """
        Infers the waveform of a mel spectrogram output by the synthesizer (the format must match
        that of the synthesizer!)

        :param normalize:
        :param batched:
        :param target:
        :param overlap:
        :return:
        """

        if normalize:
            mel = mel / hp.mel_max_abs_value
        mel = torch.from_numpy(mel[None, ...])
        wav = self._model.generate(mel, batched, target, overlap, hp.mu_law)
        return wav, hp.sample_rate
|
mockingbirdforuse/vocoder/wavernn/models/deepmind_version.py
ADDED
@@ -0,0 +1,180 @@
import time
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

from ..audio import combine_signal
from ....log import logger


class WaveRNN(nn.Module):
    def __init__(self, hidden_size=896, quantisation=256):
        super(WaveRNN, self).__init__()

        self.hidden_size = hidden_size
        self.split_size = hidden_size // 2

        # The main matmul
        self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)

        # Output fc layers
        self.O1 = nn.Linear(self.split_size, self.split_size)
        self.O2 = nn.Linear(self.split_size, quantisation)
        self.O3 = nn.Linear(self.split_size, self.split_size)
        self.O4 = nn.Linear(self.split_size, quantisation)

        # Input fc layers
        self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
        self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)

        # biases for the gates
        self.bias_u = Parameter(torch.zeros(self.hidden_size))
        self.bias_r = Parameter(torch.zeros(self.hidden_size))
        self.bias_e = Parameter(torch.zeros(self.hidden_size))

        # display num params
        self.num_params()

    def forward(self, prev_y, prev_hidden, current_coarse):
        # Main matmul - the projection is split 3 ways
        R_hidden = self.R(prev_hidden)
        R_u, R_r, R_e = torch.split(R_hidden, self.hidden_size, dim=1)

        # Project the prev input
        coarse_input_proj = self.I_coarse(prev_y)
        I_coarse_u, I_coarse_r, I_coarse_e = torch.split(
            coarse_input_proj, self.split_size, dim=1
        )

        # Project the prev input and current coarse sample
        fine_input = torch.cat([prev_y, current_coarse], dim=1)
        fine_input_proj = self.I_fine(fine_input)
        I_fine_u, I_fine_r, I_fine_e = torch.split(
            fine_input_proj, self.split_size, dim=1
        )

        # concatenate for the gates
        I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
        I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
        I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)

        # Compute all gates for coarse and fine
        u = F.sigmoid(R_u + I_u + self.bias_u)
        r = F.sigmoid(R_r + I_r + self.bias_r)
        e = torch.tanh(r * R_e + I_e + self.bias_e)
        hidden = u * prev_hidden + (1.0 - u) * e

        # Split the hidden state
        hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)

        # Compute outputs
        out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
        out_fine = self.O4(F.relu(self.O3(hidden_fine)))

        return out_coarse, out_fine, hidden

    def generate(self, seq_len):
        with torch.no_grad():
            # First split up the biases for the gates
            b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
            b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
            b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)

            # Lists for the two output seqs
            c_outputs, f_outputs = [], []

            # Some initial inputs
            out_coarse = torch.LongTensor([0]).cuda()
            out_fine = torch.LongTensor([0]).cuda()

            # We'll need a hidden state
            hidden = self.init_hidden()

            # Need a clock for display
            start = time.time()

            # Loop for generation
            for i in range(seq_len):
                # Split into two hidden states
                hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)

                # Scale and concat previous predictions
                out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.0
                out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.0
                prev_outputs = torch.cat([out_coarse, out_fine], dim=1)

                # Project input
                coarse_input_proj = self.I_coarse(prev_outputs)
                I_coarse_u, I_coarse_r, I_coarse_e = torch.split(
                    coarse_input_proj, self.split_size, dim=1
                )

                # Project hidden state and split 6 ways
                R_hidden = self.R(hidden)
                (
                    R_coarse_u,
                    R_fine_u,
                    R_coarse_r,
                    R_fine_r,
                    R_coarse_e,
                    R_fine_e,
                ) = torch.split(R_hidden, self.split_size, dim=1)

                # Compute the coarse gates
                u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
                r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
                e = torch.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
                hidden_coarse = u * hidden_coarse + (1.0 - u) * e

                # Compute the coarse output
                out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
                posterior = F.softmax(out_coarse, dim=1)
                distrib = torch.distributions.Categorical(posterior)
                out_coarse = distrib.sample()
                c_outputs.append(out_coarse)

                # Project the [prev outputs and predicted coarse sample]
                coarse_pred = out_coarse.float() / 127.5 - 1.0
                fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
                fine_input_proj = self.I_fine(fine_input)
                I_fine_u, I_fine_r, I_fine_e = torch.split(
                    fine_input_proj, self.split_size, dim=1
                )

                # Compute the fine gates
                u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u)
                r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r)
                e = torch.tanh(r * R_fine_e + I_fine_e + b_fine_e)
                hidden_fine = u * hidden_fine + (1.0 - u) * e

                # Compute the fine output
                out_fine = self.O4(F.relu(self.O3(hidden_fine)))
                posterior = F.softmax(out_fine, dim=1)
                distrib = torch.distributions.Categorical(posterior)
                out_fine = distrib.sample()
                f_outputs.append(out_fine)

                # Put the hidden state back together
                hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)

            coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
            fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
            output = combine_signal(coarse, fine)

        return output, coarse, fine

    def init_hidden(self, batch_size=1):
        return torch.zeros(batch_size, self.hidden_size).cuda()

    def num_params(self):
        parameters = filter(lambda p: p.requires_grad, self.parameters())
        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
        logger.debug("Trainable Parameters: %.3f million" % parameters)
mockingbirdforuse/vocoder/wavernn/models/fatchord_version.py
ADDED
@@ -0,0 +1,445 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from torch.nn.parameter import Parameter
|
7 |
+
|
8 |
+
from ..audio import de_emphasis, decode_mu_law
|
9 |
+
from ..hparams import hparams as hp
|
10 |
+
from ...distribution import sample_from_discretized_mix_logistic
|
11 |
+
from ....log import logger
|
12 |
+
|
13 |
+
|
14 |
+
class ResBlock(nn.Module):
|
15 |
+
def __init__(self, dims):
|
16 |
+
super().__init__()
|
17 |
+
self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
|
18 |
+
self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
|
19 |
+
self.batch_norm1 = nn.BatchNorm1d(dims)
|
20 |
+
self.batch_norm2 = nn.BatchNorm1d(dims)
|
21 |
+
|
22 |
+
def forward(self, x):
|
23 |
+
residual = x
|
24 |
+
x = self.conv1(x)
|
25 |
+
x = self.batch_norm1(x)
|
26 |
+
x = F.relu(x)
|
27 |
+
x = self.conv2(x)
|
28 |
+
x = self.batch_norm2(x)
|
29 |
+
return x + residual
|
30 |
+
|
31 |
+
|
32 |
+
class MelResNet(nn.Module):
|
33 |
+
def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad):
|
34 |
+
super().__init__()
|
35 |
+
k_size = pad * 2 + 1
|
36 |
+
self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False)
|
37 |
+
self.batch_norm = nn.BatchNorm1d(compute_dims)
|
38 |
+
self.layers = nn.ModuleList()
|
39 |
+
for i in range(res_blocks):
|
40 |
+
self.layers.append(ResBlock(compute_dims))
|
41 |
+
self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)
|
42 |
+
|
43 |
+
def forward(self, x):
|
44 |
+
x = self.conv_in(x)
|
45 |
+
x = self.batch_norm(x)
|
46 |
+
x = F.relu(x)
|
47 |
+
for f in self.layers:
|
48 |
+
x = f(x)
|
49 |
+
x = self.conv_out(x)
|
50 |
+
return x
|
51 |
+
|
52 |
+
|
53 |
+
class Stretch2d(nn.Module):
|
54 |
+
def __init__(self, x_scale, y_scale):
|
55 |
+
super().__init__()
|
56 |
+
self.x_scale = x_scale
|
57 |
+
self.y_scale = y_scale
|
58 |
+
|
59 |
+
def forward(self, x):
|
60 |
+
b, c, h, w = x.size()
|
61 |
+
x = x.unsqueeze(-1).unsqueeze(3)
|
62 |
+
x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale)
|
63 |
+
return x.view(b, c, h * self.y_scale, w * self.x_scale)
|
64 |
+
|
65 |
+
|
66 |
+
class UpsampleNetwork(nn.Module):
|
67 |
+
def __init__(
|
68 |
+
self, feat_dims, upsample_scales, compute_dims, res_blocks, res_out_dims, pad
|
69 |
+
):
|
70 |
+
super().__init__()
|
71 |
+
total_scale = np.cumproduct(upsample_scales)[-1]
|
72 |
+
self.indent = pad * total_scale
|
73 |
+
self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad)
|
74 |
+
self.resnet_stretch = Stretch2d(total_scale, 1)
|
75 |
+
self.up_layers = nn.ModuleList()
|
76 |
+
for scale in upsample_scales:
|
77 |
+
k_size = (1, scale * 2 + 1)
|
78 |
+
padding = (0, scale)
|
79 |
+
stretch = Stretch2d(scale, 1)
|
80 |
+
conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False)
|
81 |
+
conv.weight.data.fill_(1.0 / k_size[1])
|
82 |
+
self.up_layers.append(stretch)
|
83 |
+
self.up_layers.append(conv)
|
84 |
+
|
85 |
+
def forward(self, m):
|
86 |
+
aux = self.resnet(m).unsqueeze(1)
|
87 |
+
aux = self.resnet_stretch(aux)
|
88 |
+
aux = aux.squeeze(1)
|
89 |
+
m = m.unsqueeze(1)
|
90 |
+
for f in self.up_layers:
|
91 |
+
m = f(m)
|
92 |
+
m = m.squeeze(1)[:, :, self.indent : -self.indent]
|
93 |
+
return m.transpose(1, 2), aux.transpose(1, 2)
|
94 |
+
|
95 |
+
|
96 |
+
class WaveRNN(nn.Module):
|
97 |
+
def __init__(
|
98 |
+
self,
|
99 |
+
rnn_dims,
|
100 |
+
fc_dims,
|
101 |
+
bits,
|
102 |
+
pad,
|
103 |
+
upsample_factors,
|
104 |
+
feat_dims,
|
105 |
+
compute_dims,
|
106 |
+
res_out_dims,
|
107 |
+
res_blocks,
|
108 |
+
hop_length,
|
109 |
+
sample_rate,
|
110 |
+
mode="RAW",
|
111 |
+
):
|
112 |
+
super().__init__()
|
113 |
+
self.mode = mode
|
114 |
+
self.pad = pad
|
115 |
+
if self.mode == "RAW":
|
116 |
+
self.n_classes = 2**bits
|
117 |
+
elif self.mode == "MOL":
|
118 |
+
self.n_classes = 30
|
119 |
+
else:
|
120 |
+
RuntimeError("Unknown model mode value - ", self.mode)
|
121 |
+
|
122 |
+
self.rnn_dims = rnn_dims
|
123 |
+
self.aux_dims = res_out_dims // 4
|
124 |
+
self.hop_length = hop_length
|
125 |
+
self.sample_rate = sample_rate
|
126 |
+
|
127 |
+
self.upsample = UpsampleNetwork(
|
128 |
+
feat_dims, upsample_factors, compute_dims, res_blocks, res_out_dims, pad
|
129 |
+
)
|
130 |
+
self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims)
|
131 |
+
self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True)
|
132 |
+
self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True)
|
133 |
+
self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
|
134 |
+
self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
|
135 |
+
self.fc3 = nn.Linear(fc_dims, self.n_classes)
|
136 |
+
|
137 |
+
self.step = Parameter(torch.zeros(1).long(), requires_grad=False)
|
138 |
+
self.num_params()
|
139 |
+
|
140 |
+
def forward(self, x, mels):
|
141 |
+
self.step += 1
|
142 |
+
bsize = x.size(0)
|
143 |
+
if torch.cuda.is_available():
|
144 |
+
h1 = torch.zeros(1, bsize, self.rnn_dims).cuda()
|
145 |
+
h2 = torch.zeros(1, bsize, self.rnn_dims).cuda()
|
146 |
+
else:
|
147 |
+
h1 = torch.zeros(1, bsize, self.rnn_dims).cpu()
|
148 |
+
h2 = torch.zeros(1, bsize, self.rnn_dims).cpu()
|
149 |
+
mels, aux = self.upsample(mels)
|
150 |
+
|
151 |
+
aux_idx = [self.aux_dims * i for i in range(5)]
|
152 |
+
a1 = aux[:, :, aux_idx[0] : aux_idx[1]]
|
153 |
+
a2 = aux[:, :, aux_idx[1] : aux_idx[2]]
|
154 |
+
a3 = aux[:, :, aux_idx[2] : aux_idx[3]]
|
155 |
+
a4 = aux[:, :, aux_idx[3] : aux_idx[4]]
|
156 |
+
|
157 |
+
x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
|
158 |
+
x = self.I(x)
|
159 |
+
res = x
|
160 |
+
x, _ = self.rnn1(x, h1)
|
161 |
+
|
162 |
+
x = x + res
|
163 |
+
res = x
|
164 |
+
x = torch.cat([x, a2], dim=2)
|
165 |
+
x, _ = self.rnn2(x, h2)
|
166 |
+
|
167 |
+
x = x + res
|
168 |
+
x = torch.cat([x, a3], dim=2)
|
169 |
+
x = F.relu(self.fc1(x))
|
170 |
+
|
171 |
+
x = torch.cat([x, a4], dim=2)
|
172 |
+
x = F.relu(self.fc2(x))
|
173 |
+
return self.fc3(x)
|
174 |
+
|
175 |
+
def generate(self, mels, batched, target, overlap, mu_law):
|
176 |
+
mu_law = mu_law if self.mode == "RAW" else False
|
177 |
+
|
178 |
+
self.eval()
|
179 |
+
output = []
|
180 |
+
start = time.time()
|
181 |
+
rnn1 = self.get_gru_cell(self.rnn1)
|
182 |
+
rnn2 = self.get_gru_cell(self.rnn2)
|
183 |
+
|
184 |
+
with torch.no_grad():
|
185 |
+
if torch.cuda.is_available():
|
186 |
+
mels = mels.cuda()
|
187 |
+
else:
|
188 |
+
mels = mels.cpu()
|
189 |
+
wave_len = (mels.size(-1) - 1) * self.hop_length
|
190 |
+
mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both")
|
191 |
+
mels, aux = self.upsample(mels.transpose(1, 2))
|
192 |
+
|
193 |
+
if batched:
|
194 |
+
mels = self.fold_with_overlap(mels, target, overlap)
|
195 |
+
aux = self.fold_with_overlap(aux, target, overlap)
|
196 |
+
|
197 |
+
b_size, seq_len, _ = mels.size()
|
198 |
+
|
199 |
+
if torch.cuda.is_available():
|
200 |
+
h1 = torch.zeros(b_size, self.rnn_dims).cuda()
|
201 |
+
h2 = torch.zeros(b_size, self.rnn_dims).cuda()
|
202 |
+
x = torch.zeros(b_size, 1).cuda()
|
203 |
+
else:
|
204 |
+
h1 = torch.zeros(b_size, self.rnn_dims).cpu()
|
205 |
+
h2 = torch.zeros(b_size, self.rnn_dims).cpu()
|
206 |
+
x = torch.zeros(b_size, 1).cpu()
|
207 |
+
|
208 |
+
d = self.aux_dims
|
209 |
+
aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)]
|
210 |
+
|
211 |
+
for i in range(seq_len):
|
212 |
+
|
213 |
+
m_t = mels[:, i, :]
|
214 |
+
|
215 |
+
a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
|
216 |
+
|
217 |
+
x = torch.cat([x, m_t, a1_t], dim=1)
|
218 |
+
x = self.I(x)
|
219 |
+
h1 = rnn1(x, h1)
|
220 |
+
|
221 |
+
x = x + h1
|
222 |
+
inp = torch.cat([x, a2_t], dim=1)
|
223 |
+
h2 = rnn2(inp, h2)
|
224 |
+
|
225 |
+
x = x + h2
|
226 |
+
x = torch.cat([x, a3_t], dim=1)
|
227 |
+
x = F.relu(self.fc1(x))
|
228 |
+
|
229 |
+
x = torch.cat([x, a4_t], dim=1)
|
230 |
+
x = F.relu(self.fc2(x))
|
231 |
+
|
232 |
+
logits = self.fc3(x)
|
233 |
+
|
234 |
+
if self.mode == "MOL":
|
235 |
+
sample = sample_from_discretized_mix_logistic(
|
236 |
+
logits.unsqueeze(0).transpose(1, 2)
|
237 |
+
)
|
238 |
+
output.append(sample.view(-1))
|
239 |
+
if torch.cuda.is_available():
|
240 |
+
# x = torch.FloatTensor([[sample]]).cuda()
|
241 |
+
x = sample.transpose(0, 1).cuda()
|
242 |
+
else:
|
243 |
+
x = sample.transpose(0, 1)
|
244 |
+
|
245 |
+
elif self.mode == "RAW":
|
246 |
+
posterior = F.softmax(logits, dim=1)
|
247 |
+
distrib = torch.distributions.Categorical(posterior)
|
248 |
+
|
249 |
+
+                    sample = 2 * distrib.sample().float() / (self.n_classes - 1.0) - 1.0
+                    output.append(sample)
+                    x = sample.unsqueeze(-1)
+                else:
+                    raise RuntimeError("Unknown model mode value - ", self.mode)
+
+        output = torch.stack(output).transpose(0, 1)
+        output = output.cpu().numpy()
+        output = output.astype(np.float64)
+
+        if batched:
+            output = self.xfade_and_unfold(output, target, overlap)
+        else:
+            output = output[0]
+
+        if mu_law:
+            output = decode_mu_law(output, self.n_classes, False)
+        if hp.apply_preemphasis:
+            output = de_emphasis(output)
+
+        # Fade-out at the end to avoid signal cutting out suddenly
+        fade_out = np.linspace(1, 0, 20 * self.hop_length)
+        output = output[:wave_len]
+        output[-20 * self.hop_length :] *= fade_out
+
+        self.train()
+
+        return output
+
+    def get_gru_cell(self, gru):
+        gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size)
+        gru_cell.weight_hh.data = gru.weight_hh_l0.data
+        gru_cell.weight_ih.data = gru.weight_ih_l0.data
+        gru_cell.bias_hh.data = gru.bias_hh_l0.data
+        gru_cell.bias_ih.data = gru.bias_ih_l0.data
+        return gru_cell
+
+    def pad_tensor(self, x, pad, side="both"):
+        # NB - this is just a quick method i need right now
+        # i.e., it won't generalise to other shapes/dims
+        b, t, c = x.size()
+        total = t + 2 * pad if side == "both" else t + pad
+        if torch.cuda.is_available():
+            padded = torch.zeros(b, total, c).cuda()
+        else:
+            padded = torch.zeros(b, total, c).cpu()
+        if side == "before" or side == "both":
+            padded[:, pad : pad + t, :] = x
+        elif side == "after":
+            padded[:, :t, :] = x
+        return padded
+
+    def fold_with_overlap(self, x, target, overlap):
+
+        """Fold the tensor with overlap for quick batched inference.
+        Overlap will be used for crossfading in xfade_and_unfold()
+
+        Args:
+            x (tensor)    : Upsampled conditioning features.
+                            shape=(1, timesteps, features)
+            target (int)  : Target timesteps for each index of batch
+            overlap (int) : Timesteps for both xfade and rnn warmup
+
+        Return:
+            (tensor) : shape=(num_folds, target + 2 * overlap, features)
+
+        Details:
+            x = [[h1, h2, ... hn]]
+
+            Where each h is a vector of conditioning features
+
+            Eg: target=2, overlap=1 with x.size(1)=10
+
+            folded = [[h1, h2, h3, h4],
+                      [h4, h5, h6, h7],
+                      [h7, h8, h9, h10]]
+        """
+
+        _, total_len, features = x.size()
+
+        # Calculate variables needed
+        num_folds = (total_len - overlap) // (target + overlap)
+        extended_len = num_folds * (overlap + target) + overlap
+        remaining = total_len - extended_len
+
+        # Pad if some time steps poking out
+        if remaining != 0:
+            num_folds += 1
+            padding = target + 2 * overlap - remaining
+            x = self.pad_tensor(x, padding, side="after")
+
+        if torch.cuda.is_available():
+            folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
+        else:
+            folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu()
+
+        # Get the values for the folded tensor
+        for i in range(num_folds):
+            start = i * (target + overlap)
+            end = start + target + 2 * overlap
+            folded[i] = x[:, start:end, :]
+
+        return folded
+
+    def xfade_and_unfold(self, y, target, overlap):
+
+        """Applies a crossfade and unfolds into a 1d array.
+
+        Args:
+            y (ndarray)   : Batched sequences of audio samples
+                            shape=(num_folds, target + 2 * overlap)
+                            dtype=np.float64
+            overlap (int) : Timesteps for both xfade and rnn warmup
+
+        Return:
+            (ndarray) : audio samples in a 1d array
+                        shape=(total_len)
+                        dtype=np.float64
+
+        Details:
+            y = [[seq1],
+                 [seq2],
+                 [seq3]]
+
+            Apply a gain envelope at both ends of the sequences
+
+            y = [[seq1_in, seq1_target, seq1_out],
+                 [seq2_in, seq2_target, seq2_out],
+                 [seq3_in, seq3_target, seq3_out]]
+
+            Stagger and add up the groups of samples:
+
+            [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
+
+        """
+
+        num_folds, length = y.shape
+        target = length - 2 * overlap
+        total_len = num_folds * (target + overlap) + overlap
+
+        # Need some silence for the rnn warmup
+        silence_len = overlap // 2
+        fade_len = overlap - silence_len
+        silence = np.zeros((silence_len), dtype=np.float64)
+
+        # Equal power crossfade
+        t = np.linspace(-1, 1, fade_len, dtype=np.float64)
+        fade_in = np.sqrt(0.5 * (1 + t))
+        fade_out = np.sqrt(0.5 * (1 - t))
+
+        # Concat the silence to the fades
+        fade_in = np.concatenate([silence, fade_in])
+        fade_out = np.concatenate([fade_out, silence])
+
+        # Apply the gain to the overlap samples
+        y[:, :overlap] *= fade_in
+        y[:, -overlap:] *= fade_out
+
+        unfolded = np.zeros((total_len), dtype=np.float64)
+
+        # Loop to add up all the samples
+        for i in range(num_folds):
+            start = i * (target + overlap)
+            end = start + target + 2 * overlap
+            unfolded[start:end] += y[i]
+
+        return unfolded
+
+    def get_step(self):
+        return self.step.data.item()
+
+    def checkpoint(self, model_dir, optimizer):
+        k_steps = self.get_step() // 1000
+        self.save(model_dir.joinpath("checkpoint_%dk_steps.pt" % k_steps), optimizer)
+
+    def load(self, path, optimizer):
+        checkpoint = torch.load(path)
+        if "optimizer_state" in checkpoint:
+            self.load_state_dict(checkpoint["model_state"])
+            optimizer.load_state_dict(checkpoint["optimizer_state"])
+        else:
+            # Backwards compatibility
+            self.load_state_dict(checkpoint)
+
+    def save(self, path, optimizer):
+        torch.save(
+            {
+                "model_state": self.state_dict(),
+                "optimizer_state": optimizer.state_dict(),
+            },
+            path,
+        )
+
+    def num_params(self):
+        parameters = filter(lambda p: p.requires_grad, self.parameters())
+        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
+        logger.debug("Trainable Parameters: %.3fM" % parameters)
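The batched-inference scheme added above (fold the upsampled conditioning features into overlapping segments, generate each segment in parallel, then crossfade the overlaps back together) can be tried in isolation. The snippet below is a minimal NumPy-only sketch of that fold/unfold round trip; the standalone function names and the toy input are illustrative and are not part of this commit.

import numpy as np

def fold_with_overlap(x, target, overlap):
    # Split a (timesteps, features) array into overlapping segments,
    # zero-padding the tail so the last segment is full length.
    total_len, features = x.shape
    num_folds = (total_len - overlap) // (target + overlap)
    remaining = total_len - (num_folds * (overlap + target) + overlap)
    if remaining != 0:
        num_folds += 1
        x = np.pad(x, ((0, target + 2 * overlap - remaining), (0, 0)))
    folded = np.zeros((num_folds, target + 2 * overlap, features))
    for i in range(num_folds):
        start = i * (target + overlap)
        folded[i] = x[start : start + target + 2 * overlap]
    return folded

def xfade_and_unfold(y, overlap):
    # Reassemble overlapping segments with an equal-power crossfade.
    num_folds, length = y.shape
    target = length - 2 * overlap
    silence_len = overlap // 2
    t = np.linspace(-1, 1, overlap - silence_len)
    fade_in = np.concatenate([np.zeros(silence_len), np.sqrt(0.5 * (1 + t))])
    fade_out = np.concatenate([np.sqrt(0.5 * (1 - t)), np.zeros(silence_len)])
    y = y.copy()
    y[:, :overlap] *= fade_in
    y[:, -overlap:] *= fade_out
    unfolded = np.zeros(num_folds * (target + overlap) + overlap)
    for i in range(num_folds):
        start = i * (target + overlap)
        unfolded[start : start + length] += y[i]
    return unfolded

# The docstring example: 10 timesteps, target=2, overlap=1 -> 3 folds of length 4.
x = np.arange(10, dtype=np.float64).reshape(10, 1)
folded = fold_with_overlap(x, target=2, overlap=1)
print(folded[..., 0])                       # rows [0..3], [3..6], [6..9]
print(xfade_and_unfold(folded[..., 0], 1))  # back to a single 10-sample array

The fade_in and fade_out curves are chosen so that their squared sum is constant across the overlap, keeping the summed region at roughly constant energy, while the leading half-overlap of silence leaves the RNN a short warm-up stretch in each fold.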
packages.txt
ADDED
@@ -0,0 +1,3 @@
+ffmpeg
+libsm6
+libxext6
requirements.txt
ADDED
@@ -0,0 +1,13 @@
+torch
+torchvision
+numpy
+numba
+opencv-python-headless
+scipy
+pypinyin
+librosa
+webrtcvad
+Unidecode
+inflect
+loguru
+gradio