Xuan2060320350 Plachta committed on
Commit cd6614b · 0 Parent(s):

Duplicate from Plachta/VALL-E-X


Co-authored-by: ElderFrog <[email protected]>

This view is limited to 50 files because it contains too many changes.
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Songting
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: VALL E X
3
+ emoji: 🎙
4
+ colorFrom: green
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.39.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: Plachta/VALL-E-X
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py ADDED
@@ -0,0 +1 @@
1
+ from . import data, models, modules, utils
app.py ADDED
@@ -0,0 +1,574 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ import time
6
+ import tempfile
7
+ import platform
8
+ if platform.system().lower() == 'windows':
9
+ temp = pathlib.PosixPath
10
+ pathlib.PosixPath = pathlib.WindowsPath
11
+ elif platform.system().lower() == 'linux':
12
+ temp = pathlib.WindowsPath
13
+ pathlib.WindowsPath = pathlib.PosixPath
14
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
15
+
16
+ import langid
17
+ langid.set_languages(['en', 'zh', 'ja'])
18
+
19
+ import torch
20
+ import torchaudio
21
+ import random
22
+
23
+ import numpy as np
24
+
25
+ from data.tokenizer import (
26
+ AudioTokenizer,
27
+ tokenize_audio,
28
+ )
29
+ from data.collation import get_text_token_collater
30
+ from models.vallex import VALLE
31
+ from utils.g2p import PhonemeBpeTokenizer
32
+ from descriptions import *
33
+ from macros import *
34
+
35
+ import gradio as gr
36
+ import whisper
37
+ import multiprocessing
38
+
39
+ thread_count = multiprocessing.cpu_count()
40
+
41
+ print("Using", thread_count, "CPU cores for computing")
42
+
43
+ torch.set_num_threads(thread_count)
44
+ torch.set_num_interop_threads(thread_count)
45
+ torch._C._jit_set_profiling_executor(False)
46
+ torch._C._jit_set_profiling_mode(False)
47
+ torch._C._set_graph_executor_optimize(False)
48
+
49
+ text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
50
+ text_collater = get_text_token_collater()
51
+
52
+ device = torch.device("cpu")
53
+ if torch.cuda.is_available():
54
+ device = torch.device("cuda", 0)
55
+
56
+ # VALL-E-X model
57
+ model = VALLE(
58
+ N_DIM,
59
+ NUM_HEAD,
60
+ NUM_LAYERS,
61
+ norm_first=True,
62
+ add_prenet=False,
63
+ prefix_mode=PREFIX_MODE,
64
+ share_embedding=True,
65
+ nar_scale_factor=1.0,
66
+ prepend_bos=True,
67
+ num_quantizers=NUM_QUANTIZERS,
68
+ )
69
+ checkpoint = torch.load("./epoch-10.pt", map_location='cpu')
70
+ missing_keys, unexpected_keys = model.load_state_dict(
71
+ checkpoint["model"], strict=True
72
+ )
73
+ assert not missing_keys
74
+ model.eval()
75
+
76
+ # Encodec model
77
+ audio_tokenizer = AudioTokenizer(device)
78
+
79
+ # ASR
80
+ whisper_model = whisper.load_model("medium").cpu()
81
+
82
+ # Voice Presets
83
+ preset_list = os.walk("./presets/").__next__()[2]
84
+ preset_list = [preset[:-4] for preset in preset_list if preset.endswith(".npz")]
85
+
86
+ def clear_prompts():
87
+ try:
88
+ path = tempfile.gettempdir()
89
+ for eachfile in os.listdir(path):
90
+ filename = os.path.join(path, eachfile)
91
+ if os.path.isfile(filename) and filename.endswith(".npz"):
92
+ lastmodifytime = os.stat(filename).st_mtime
93
+ endfiletime = time.time() - 60
94
+ if endfiletime > lastmodifytime:
95
+ os.remove(filename)
96
+ except:
97
+ return
98
+
99
+ def transcribe_one(model, audio_path):
100
+ # load audio and pad/trim it to fit 30 seconds
101
+ audio = whisper.load_audio(audio_path)
102
+ audio = whisper.pad_or_trim(audio)
103
+
104
+ # make log-Mel spectrogram and move to the same device as the model
105
+ mel = whisper.log_mel_spectrogram(audio).to(model.device)
106
+
107
+ # detect the spoken language
108
+ _, probs = model.detect_language(mel)
109
+ print(f"Detected language: {max(probs, key=probs.get)}")
110
+ lang = max(probs, key=probs.get)
111
+ # decode the audio
112
+ options = whisper.DecodingOptions(temperature=1.0, best_of=5, fp16=False if device == torch.device("cpu") else True, sample_len=150)
113
+ result = whisper.decode(model, mel, options)
114
+
115
+ # print the recognized text
116
+ print(result.text)
117
+
118
+ text_pr = result.text
119
+ if text_pr.strip(" ")[-1] not in "?!.,。,?!。、":
120
+ text_pr += "."
121
+ return lang, text_pr
122
+
123
+ def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
124
+ global model, text_collater, text_tokenizer, audio_tokenizer
125
+ clear_prompts()
126
+ audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
127
+ sr, wav_pr = audio_prompt
128
+ if len(wav_pr) / sr > 15:
129
+ return "Rejected, Audio too long (should be less than 15 seconds)", None
130
+ if not isinstance(wav_pr, torch.FloatTensor):
131
+ wav_pr = torch.FloatTensor(wav_pr)
132
+ if wav_pr.abs().max() > 1:
133
+ wav_pr /= wav_pr.abs().max()
134
+ if wav_pr.size(-1) == 2:
135
+ wav_pr = wav_pr[:, 0]
136
+ if wav_pr.ndim == 1:
137
+ wav_pr = wav_pr.unsqueeze(0)
138
+ assert wav_pr.ndim and wav_pr.size(0) == 1
139
+
140
+ if transcript_content == "":
141
+ text_pr, lang_pr = make_prompt(name, wav_pr, sr, save=False)
142
+ else:
143
+ lang_pr = langid.classify(str(transcript_content))[0]
144
+ lang_token = lang2token[lang_pr]
145
+ text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
146
+ # tokenize audio
147
+ encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
148
+ audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
149
+
150
+ # tokenize text
151
+ phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
152
+ text_tokens, enroll_x_lens = text_collater(
153
+ [
154
+ phonemes
155
+ ]
156
+ )
157
+
158
+ message = f"Detected language: {lang_pr}\nDetected text: {text_pr}\n"
159
+
160
+ # save as npz file
161
+ np.savez(os.path.join(tempfile.gettempdir(), f"{name}.npz"),
162
+ audio_tokens=audio_tokens, text_tokens=text_tokens, lang_code=lang2code[lang_pr])
163
+ return message, os.path.join(tempfile.gettempdir(), f"{name}.npz")
164
+
165
+
166
+ def make_prompt(name, wav, sr, save=True):
167
+ global whisper_model
168
+ whisper_model.to(device)
169
+ if not isinstance(wav, torch.FloatTensor):
170
+ wav = torch.tensor(wav)
171
+ if wav.abs().max() > 1:
172
+ wav /= wav.abs().max()
173
+ if wav.size(-1) == 2:
174
+ wav = wav.mean(-1, keepdim=False)
175
+ if wav.ndim == 1:
176
+ wav = wav.unsqueeze(0)
177
+ assert wav.ndim and wav.size(0) == 1
178
+ torchaudio.save(f"./prompts/{name}.wav", wav, sr)
179
+ lang, text = transcribe_one(whisper_model, f"./prompts/{name}.wav")
180
+ lang_token = lang2token[lang]
181
+ text = lang_token + text + lang_token
182
+ with open(f"./prompts/{name}.txt", 'w') as f:
183
+ f.write(text)
184
+ if not save:
185
+ os.remove(f"./prompts/{name}.wav")
186
+ os.remove(f"./prompts/{name}.txt")
187
+
188
+ whisper_model.cpu()
189
+ torch.cuda.empty_cache()
190
+ return text, lang
191
+
192
+ @torch.no_grad()
193
+ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt, transcript_content):
194
+ if len(text) > 150:
195
+ return "Rejected, Text too long (should be less than 150 characters)", None
196
+ global model, text_collater, text_tokenizer, audio_tokenizer
197
+ model.to(device)
198
+ audio_prompt = audio_prompt if audio_prompt is not None else record_audio_prompt
199
+ sr, wav_pr = audio_prompt
200
+ if len(wav_pr) / sr > 15:
201
+ return "Rejected, Audio too long (should be less than 15 seconds)", None
202
+ if not isinstance(wav_pr, torch.FloatTensor):
203
+ wav_pr = torch.FloatTensor(wav_pr)
204
+ if wav_pr.abs().max() > 1:
205
+ wav_pr /= wav_pr.abs().max()
206
+ if wav_pr.size(-1) == 2:
207
+ wav_pr = wav_pr[:, 0]
208
+ if wav_pr.ndim == 1:
209
+ wav_pr = wav_pr.unsqueeze(0)
210
+ assert wav_pr.ndim and wav_pr.size(0) == 1
211
+
212
+ if transcript_content == "":
213
+ text_pr, lang_pr = make_prompt('dummy', wav_pr, sr, save=False)
214
+ else:
215
+ lang_pr = langid.classify(str(transcript_content))[0]
216
+ lang_token = lang2token[lang_pr]
217
+ text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
218
+
219
+ if language == 'auto-detect':
220
+ lang_token = lang2token[langid.classify(text)[0]]
221
+ else:
222
+ lang_token = langdropdown2token[language]
223
+ lang = token2lang[lang_token]
224
+ text = lang_token + text + lang_token
225
+
226
+ # move the model onto the inference device
227
+ model.to(device)
228
+
229
+ # tokenize audio
230
+ encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
231
+ audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)
232
+
233
+ # tokenize text
234
+ logging.info(f"synthesize text: {text}")
235
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
236
+ text_tokens, text_tokens_lens = text_collater(
237
+ [
238
+ phone_tokens
239
+ ]
240
+ )
241
+
242
+ enroll_x_lens = None
243
+ if text_pr:
244
+ text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
245
+ text_prompts, enroll_x_lens = text_collater(
246
+ [
247
+ text_prompts
248
+ ]
249
+ )
250
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
251
+ text_tokens_lens += enroll_x_lens
252
+ lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
253
+ encoded_frames = model.inference(
254
+ text_tokens.to(device),
255
+ text_tokens_lens.to(device),
256
+ audio_prompts,
257
+ enroll_x_lens=enroll_x_lens,
258
+ top_k=-100,
259
+ temperature=1,
260
+ prompt_language=lang_pr,
261
+ text_language=langs if accent == "no-accent" else lang,
262
+ )
263
+ samples = audio_tokenizer.decode(
264
+ [(encoded_frames.transpose(2, 1), None)]
265
+ )
266
+
267
+ # offload model
268
+ model.to('cpu')
269
+ torch.cuda.empty_cache()
270
+
271
+ message = f"text prompt: {text_pr}\nsynthesized text: {text}"
272
+ return message, (24000, samples[0][0].cpu().numpy())
273
+
274
+ @torch.no_grad()
275
+ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
276
+ if len(text) > 150:
277
+ return "Rejected, Text too long (should be less than 150 characters)", None
278
+ clear_prompts()
279
+ model.to(device)
280
+ # text to synthesize
281
+ if language == 'auto-detect':
282
+ lang_token = lang2token[langid.classify(text)[0]]
283
+ else:
284
+ lang_token = langdropdown2token[language]
285
+ lang = token2lang[lang_token]
286
+ text = lang_token + text + lang_token
287
+
288
+ # load prompt
289
+ if prompt_file is not None:
290
+ prompt_data = np.load(prompt_file.name)
291
+ else:
292
+ prompt_data = np.load(os.path.join("./presets/", f"{preset_prompt}.npz"))
293
+ audio_prompts = prompt_data['audio_tokens']
294
+ text_prompts = prompt_data['text_tokens']
295
+ lang_pr = prompt_data['lang_code']
296
+ lang_pr = code2lang[int(lang_pr)]
297
+
298
+ # numpy to tensor
299
+ audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
300
+ text_prompts = torch.tensor(text_prompts).type(torch.int32)
301
+
302
+ enroll_x_lens = text_prompts.shape[-1]
303
+ logging.info(f"synthesize text: {text}")
304
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
305
+ text_tokens, text_tokens_lens = text_collater(
306
+ [
307
+ phone_tokens
308
+ ]
309
+ )
310
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
311
+ text_tokens_lens += enroll_x_lens
312
+ # accent control
313
+ lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
314
+ encoded_frames = model.inference(
315
+ text_tokens.to(device),
316
+ text_tokens_lens.to(device),
317
+ audio_prompts,
318
+ enroll_x_lens=enroll_x_lens,
319
+ top_k=-100,
320
+ temperature=1,
321
+ prompt_language=lang_pr,
322
+ text_language=langs if accent == "no-accent" else lang,
323
+ )
324
+ samples = audio_tokenizer.decode(
325
+ [(encoded_frames.transpose(2, 1), None)]
326
+ )
327
+ model.to('cpu')
328
+ torch.cuda.empty_cache()
329
+
330
+ message = f"synthesized text: {text}"
331
+ return message, (24000, samples[0][0].cpu().numpy())
332
+
333
+
334
+ from utils.sentence_cutter import split_text_into_sentences
335
+ @torch.no_grad()
336
+ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
337
+ """
338
+ For long audio generation, two modes are available.
339
+ fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
340
+ sliding-window: This mode will use the last generated sentence as the prompt for the next one, but speaker identity may drift over long passages.
341
+ """
342
+ if len(text) > 1000:
343
+ return "Rejected, Text too long (should be less than 1000 characters)", None
344
+ mode = 'fixed-prompt'
345
+ global model, audio_tokenizer, text_tokenizer, text_collater
346
+ model.to(device)
347
+ if (prompt is None or prompt == "") and preset_prompt == "":
348
+ mode = 'sliding-window' # If no prompt is given, use sliding-window mode
349
+ sentences = split_text_into_sentences(text)
350
+ # detect language
351
+ if language == "auto-detect":
352
+ language = langid.classify(text)[0]
353
+ else:
354
+ language = token2lang[langdropdown2token[language]]
355
+
356
+ # if initial prompt is given, encode it
357
+ if prompt is not None and prompt != "":
358
+ # load prompt
359
+ prompt_data = np.load(prompt.name)
360
+ audio_prompts = prompt_data['audio_tokens']
361
+ text_prompts = prompt_data['text_tokens']
362
+ lang_pr = prompt_data['lang_code']
363
+ lang_pr = code2lang[int(lang_pr)]
364
+
365
+ # numpy to tensor
366
+ audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
367
+ text_prompts = torch.tensor(text_prompts).type(torch.int32)
368
+ elif preset_prompt is not None and preset_prompt != "":
369
+ prompt_data = np.load(os.path.join("./presets/", f"{preset_prompt}.npz"))
370
+ audio_prompts = prompt_data['audio_tokens']
371
+ text_prompts = prompt_data['text_tokens']
372
+ lang_pr = prompt_data['lang_code']
373
+ lang_pr = code2lang[int(lang_pr)]
374
+
375
+ # numpy to tensor
376
+ audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
377
+ text_prompts = torch.tensor(text_prompts).type(torch.int32)
378
+ else:
379
+ audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
380
+ text_prompts = torch.zeros([1, 0]).type(torch.int32)
381
+ lang_pr = language if language != 'mix' else 'en'
382
+ if mode == 'fixed-prompt':
383
+ complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
384
+ for text in sentences:
385
+ text = text.replace("\n", "").strip(" ")
386
+ if text == "":
387
+ continue
388
+ lang_token = lang2token[language]
389
+ lang = token2lang[lang_token]
390
+ text = lang_token + text + lang_token
391
+
392
+ enroll_x_lens = text_prompts.shape[-1]
393
+ logging.info(f"synthesize text: {text}")
394
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
395
+ text_tokens, text_tokens_lens = text_collater(
396
+ [
397
+ phone_tokens
398
+ ]
399
+ )
400
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
401
+ text_tokens_lens += enroll_x_lens
402
+ # accent control
403
+ lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
404
+ encoded_frames = model.inference(
405
+ text_tokens.to(device),
406
+ text_tokens_lens.to(device),
407
+ audio_prompts,
408
+ enroll_x_lens=enroll_x_lens,
409
+ top_k=-100,
410
+ temperature=1,
411
+ prompt_language=lang_pr,
412
+ text_language=langs if accent == "no-accent" else lang,
413
+ )
414
+ complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
415
+ samples = audio_tokenizer.decode(
416
+ [(complete_tokens, None)]
417
+ )
418
+ model.to('cpu')
419
+ message = f"Cut into {len(sentences)} sentences"
420
+ return message, (24000, samples[0][0].cpu().numpy())
421
+ elif mode == "sliding-window":
422
+ complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
423
+ original_audio_prompts = audio_prompts
424
+ original_text_prompts = text_prompts
425
+ for text in sentences:
426
+ text = text.replace("\n", "").strip(" ")
427
+ if text == "":
428
+ continue
429
+ lang_token = lang2token[language]
430
+ lang = token2lang[lang_token]
431
+ text = lang_token + text + lang_token
432
+
433
+ enroll_x_lens = text_prompts.shape[-1]
434
+ logging.info(f"synthesize text: {text}")
435
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
436
+ text_tokens, text_tokens_lens = text_collater(
437
+ [
438
+ phone_tokens
439
+ ]
440
+ )
441
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
442
+ text_tokens_lens += enroll_x_lens
443
+ # accent control
444
+ lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
445
+ encoded_frames = model.inference(
446
+ text_tokens.to(device),
447
+ text_tokens_lens.to(device),
448
+ audio_prompts,
449
+ enroll_x_lens=enroll_x_lens,
450
+ top_k=-100,
451
+ temperature=1,
452
+ prompt_language=lang_pr,
453
+ text_language=langs if accent == "no-accent" else lang,
454
+ )
455
+ complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
456
+ if torch.rand(1) < 1.0:
457
+ audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
458
+ text_prompts = text_tokens[:, enroll_x_lens:]
459
+ else:
460
+ audio_prompts = original_audio_prompts
461
+ text_prompts = original_text_prompts
462
+ samples = audio_tokenizer.decode(
463
+ [(complete_tokens, None)]
464
+ )
465
+ model.to('cpu')
466
+ message = f"Cut into {len(sentences)} sentences"
467
+ return message, (24000, samples[0][0].cpu().numpy())
468
+ else:
469
+ raise ValueError(f"No such mode {mode}")
470
+
471
+
472
+ def main():
473
+ app = gr.Blocks()
474
+ with app:
475
+ gr.Markdown(top_md)
476
+ with gr.Tab("Infer from audio"):
477
+ gr.Markdown(infer_from_audio_md)
478
+ with gr.Row():
479
+ with gr.Column():
480
+
481
+ textbox = gr.TextArea(label="Text",
482
+ placeholder="Type your sentence here",
483
+ value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
484
+ language_dropdown = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='English', label='language')
485
+ accent_dropdown = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', label='accent')
486
+ textbox_transcript = gr.TextArea(label="Transcript",
487
+ placeholder="Write transcript here. (leave empty to use whisper)",
488
+ value="", elem_id=f"prompt-name")
489
+ upload_audio_prompt = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
490
+ record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
491
+ with gr.Column():
492
+ text_output = gr.Textbox(label="Message")
493
+ audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
494
+ btn = gr.Button("Generate!")
495
+ btn.click(infer_from_audio,
496
+ inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
497
+ outputs=[text_output, audio_output])
498
+ textbox_mp = gr.TextArea(label="Prompt name",
499
+ placeholder="Name your prompt here",
500
+ value="prompt_1", elem_id=f"prompt-name")
501
+ btn_mp = gr.Button("Make prompt!")
502
+ prompt_output = gr.File(interactive=False)
503
+ btn_mp.click(make_npz_prompt,
504
+ inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
505
+ outputs=[text_output, prompt_output])
506
+ with gr.Tab("Make prompt"):
507
+ gr.Markdown(make_prompt_md)
508
+ with gr.Row():
509
+ with gr.Column():
510
+ textbox2 = gr.TextArea(label="Prompt name",
511
+ placeholder="Name your prompt here",
512
+ value="prompt_1", elem_id=f"prompt-name")
513
+ # Area for choosing the language and entering the transcript
514
+ textbox_transcript2 = gr.TextArea(label="Transcript",
515
+ placeholder="Write transcript here. (leave empty to use whisper)",
516
+ value="", elem_id=f"prompt-name")
517
+ upload_audio_prompt_2 = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
518
+ record_audio_prompt_2 = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
519
+ with gr.Column():
520
+ text_output_2 = gr.Textbox(label="Message")
521
+ prompt_output_2 = gr.File(interactive=False)
522
+ btn_2 = gr.Button("Make!")
523
+ btn_2.click(make_npz_prompt,
524
+ inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
525
+ outputs=[text_output_2, prompt_output_2])
526
+ with gr.Tab("Infer from prompt"):
527
+ gr.Markdown(infer_from_prompt_md)
528
+ with gr.Row():
529
+ with gr.Column():
530
+ textbox_3 = gr.TextArea(label="Text",
531
+ placeholder="Type your sentence here",
532
+ value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
533
+ language_dropdown_3 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語', 'Mix'], value='auto-detect',
534
+ label='language')
535
+ accent_dropdown_3 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
536
+ label='accent')
537
+ preset_dropdown_3 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
538
+ prompt_file = gr.File(file_count='single', file_types=['.npz'], interactive=True)
539
+ with gr.Column():
540
+ text_output_3 = gr.Textbox(label="Message")
541
+ audio_output_3 = gr.Audio(label="Output Audio", elem_id="tts-audio")
542
+ btn_3 = gr.Button("Generate!")
543
+ btn_3.click(infer_from_prompt,
544
+ inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
545
+ outputs=[text_output_3, audio_output_3])
546
+ with gr.Tab("Infer long text"):
547
+ gr.Markdown("This is a long-text generation demo. Use it to synthesize long audio from longer passages of text.")
548
+ with gr.Row():
549
+ with gr.Column():
550
+ textbox_4 = gr.TextArea(label="Text",
551
+ placeholder="Type your sentence here",
552
+ value=long_text_example, elem_id=f"tts-input")
553
+ language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
554
+ label='language')
555
+ accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
556
+ label='accent')
557
+ preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
558
+ prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True)
559
+ with gr.Column():
560
+ text_output_4 = gr.TextArea(label="Message")
561
+ audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio")
562
+ btn_4 = gr.Button("Generate!")
563
+ btn_4.click(infer_long_text,
564
+ inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
565
+ outputs=[text_output_4, audio_output_4])
566
+
567
+ app.launch()
568
+
569
+ if __name__ == "__main__":
570
+ formatter = (
571
+ "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
572
+ )
573
+ logging.basicConfig(format=formatter, level=logging.INFO)
574
+ main()
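
For readers following the two-step workflow above (encode a voice prompt once with make_npz_prompt, then reuse it with infer_from_prompt), here is a minimal sketch of driving those functions outside the Gradio UI. It assumes the checkpoint, Whisper model and presets load as they do at the top of app.py; the file names and transcript are illustrative, and the (sample_rate, samples) tuple mimics what gr.Audio passes in.

import torch, torchaudio
from types import SimpleNamespace
from app import make_npz_prompt, infer_from_prompt  # importing app loads the models

wav, sr = torchaudio.load("my_voice_sample.wav")   # illustrative 3-10 s mono clip
audio_tuple = (sr, wav.squeeze(0).numpy())         # gr.Audio-style (sample_rate, samples)

# Step 1: encode the voice prompt once; giving a transcript skips the Whisper pass.
message, npz_path = make_npz_prompt("my_voice", audio_tuple, None,
                                    "An illustrative transcript of the clip.")
print(message)

# Step 2: reuse the cached prompt for fast synthesis; .name mimics a gr.File object.
message, (out_sr, samples) = infer_from_prompt(
    "Hello, this is a test.", "English", "no-accent", None, SimpleNamespace(name=npz_path))
torchaudio.save("output.wav", torch.from_numpy(samples).unsqueeze(0), out_sr)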
data/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .collation import *
data/collation.py ADDED
@@ -0,0 +1,118 @@
1
+ from pathlib import Path
2
+ from typing import List, Tuple
3
+
4
+ import numpy as np
5
+ import torch
6
+
7
+
8
+ class TextTokenCollater:
9
+ """Collate list of text tokens
10
+
11
+ Map sentences to integers. Sentences are padded to equal length.
12
+ Beginning and end-of-sequence symbols can be added.
13
+
14
+ Example:
15
+ >>> token_collater = TextTokenCollater(text_tokens)
16
+ >>> tokens_batch, tokens_lens = token_collater(text)
17
+
18
+ Returns:
19
+ tokens_batch: IntTensor of shape (B, L)
20
+ B: batch dimension, number of input sentences
21
+ L: length of the longest sentence
22
+ tokens_lens: IntTensor of shape (B,)
23
+ Length of each sentence after adding <eos> and <bos>
24
+ but before padding.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ text_tokens: List[str],
30
+ add_eos: bool = True,
31
+ add_bos: bool = True,
32
+ pad_symbol: str = "<pad>",
33
+ bos_symbol: str = "<bos>",
34
+ eos_symbol: str = "<eos>",
35
+ ):
36
+ self.pad_symbol = pad_symbol
37
+
38
+ self.add_eos = add_eos
39
+ self.add_bos = add_bos
40
+
41
+ self.bos_symbol = bos_symbol
42
+ self.eos_symbol = eos_symbol
43
+
44
+ unique_tokens = (
45
+ [pad_symbol]
46
+ + ([bos_symbol] if add_bos else [])
47
+ + ([eos_symbol] if add_eos else [])
48
+ + sorted(text_tokens)
49
+ )
50
+
51
+ self.token2idx = {token: idx for idx, token in enumerate(unique_tokens)}
52
+ self.idx2token = [token for token in unique_tokens]
53
+
54
+ def index(
55
+ self, tokens_list: List[str]
56
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
57
+ seqs, seq_lens = [], []
58
+ for tokens in tokens_list:
59
+ assert all(s in self.token2idx for s in tokens)
63
+ seq = (
64
+ ([self.bos_symbol] if self.add_bos else [])
65
+ + list(tokens)
66
+ + ([self.eos_symbol] if self.add_eos else [])
67
+ )
68
+ seqs.append(seq)
69
+ seq_lens.append(len(seq))
70
+
71
+ max_len = max(seq_lens)
72
+ for k, (seq, seq_len) in enumerate(zip(seqs, seq_lens)):
73
+ seq.extend([self.pad_symbol] * (max_len - seq_len))
74
+
75
+ tokens = torch.from_numpy(
76
+ np.array(
77
+ [[self.token2idx[token] for token in seq] for seq in seqs],
78
+ dtype=np.int64,
79
+ )
80
+ )
81
+ tokens_lens = torch.IntTensor(seq_lens)
82
+
83
+ return tokens, tokens_lens
84
+
85
+ def __call__(self, texts: List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
86
+ tokens_seqs = [[p for p in text] for text in texts]
87
+ max_len = len(max(tokens_seqs, key=len))
88
+
89
+ seqs = [
90
+ ([self.bos_symbol] if self.add_bos else [])
91
+ + list(seq)
92
+ + ([self.eos_symbol] if self.add_eos else [])
93
+ + [self.pad_symbol] * (max_len - len(seq))
94
+ for seq in tokens_seqs
95
+ ]
96
+
97
+ tokens_batch = torch.from_numpy(
98
+ np.array(
99
+ [seq for seq in seqs],
100
+ dtype=np.int64,
101
+ )
102
+ )
103
+
104
+ tokens_lens = torch.IntTensor(
105
+ [
106
+ len(seq) + int(self.add_eos) + int(self.add_bos)
107
+ for seq in tokens_seqs
108
+ ]
109
+ )
110
+
111
+ return tokens_batch, tokens_lens
112
+
113
+
114
+ def get_text_token_collater() -> TextTokenCollater:
115
+ collater = TextTokenCollater(
116
+ ['0'], add_bos=False, add_eos=False
117
+ )
118
+ return collater
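
As used in app.py, the collater above is always called with a single phoneme sequence, and the int64 cast in __call__ implies that the tokenizer emits numeric id strings. A short sketch of the expected shapes under those assumptions (the ids below are made up):

collater = get_text_token_collater()           # no <bos>/<eos>, '<pad>' only
phoneme_ids = ["12", "7", "133", "4"]          # illustrative output of PhonemeBpeTokenizer.tokenize
tokens_batch, tokens_lens = collater([phoneme_ids])
print(tokens_batch.shape, tokens_batch.dtype)  # torch.Size([1, 4]) torch.int64
print(tokens_lens)                             # tensor([4], dtype=torch.int32)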
data/tokenizer.py ADDED
@@ -0,0 +1,117 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2023 (authors: Feiteng Li)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import re
17
+ from dataclasses import asdict, dataclass
18
+ from typing import Any, Dict, List, Optional, Pattern, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+ import torchaudio
23
+ from encodec import EncodecModel
24
+ from encodec.utils import convert_audio
25
+
26
+ def remove_encodec_weight_norm(model):
27
+ from encodec.modules import SConv1d
28
+ from encodec.modules.seanet import SConvTranspose1d, SEANetResnetBlock
29
+ from torch.nn.utils import remove_weight_norm
30
+
31
+ encoder = model.encoder.model
32
+ for key in encoder._modules:
33
+ if isinstance(encoder._modules[key], SEANetResnetBlock):
34
+ remove_weight_norm(encoder._modules[key].shortcut.conv.conv)
35
+ block_modules = encoder._modules[key].block._modules
36
+ for skey in block_modules:
37
+ if isinstance(block_modules[skey], SConv1d):
38
+ remove_weight_norm(block_modules[skey].conv.conv)
39
+ elif isinstance(encoder._modules[key], SConv1d):
40
+ remove_weight_norm(encoder._modules[key].conv.conv)
41
+
42
+ decoder = model.decoder.model
43
+ for key in decoder._modules:
44
+ if isinstance(decoder._modules[key], SEANetResnetBlock):
45
+ remove_weight_norm(decoder._modules[key].shortcut.conv.conv)
46
+ block_modules = decoder._modules[key].block._modules
47
+ for skey in block_modules:
48
+ if isinstance(block_modules[skey], SConv1d):
49
+ remove_weight_norm(block_modules[skey].conv.conv)
50
+ elif isinstance(decoder._modules[key], SConvTranspose1d):
51
+ remove_weight_norm(decoder._modules[key].convtr.convtr)
52
+ elif isinstance(decoder._modules[key], SConv1d):
53
+ remove_weight_norm(decoder._modules[key].conv.conv)
54
+
55
+
56
+ class AudioTokenizer:
57
+ """EnCodec audio tokenizer."""
58
+
59
+ def __init__(
60
+ self,
61
+ device: Any = None,
62
+ ) -> None:
63
+ # Instantiate a pretrained EnCodec model
64
+ model = EncodecModel.encodec_model_24khz()
65
+ model.set_target_bandwidth(6.0)
66
+ remove_encodec_weight_norm(model)
67
+
68
+ if not device:
69
+ device = torch.device("cpu")
70
+ if torch.cuda.is_available():
71
+ device = torch.device("cuda:0")
72
+
73
+ self._device = device
74
+
75
+ self.codec = model.to(device)
76
+ self.sample_rate = model.sample_rate
77
+ self.channels = model.channels
78
+
79
+ @property
80
+ def device(self):
81
+ return self._device
82
+
83
+ def encode(self, wav: torch.Tensor) -> torch.Tensor:
84
+ return self.codec.encode(wav.to(self.device))
85
+
86
+ def decode(self, frames: torch.Tensor) -> torch.Tensor:
87
+ return self.codec.decode(frames)
88
+
89
+
90
+ def tokenize_audio(tokenizer: AudioTokenizer, audio):
91
+ # Load and pre-process the audio waveform
92
+ if isinstance(audio, str):
93
+ wav, sr = torchaudio.load(audio)
94
+ else:
95
+ wav, sr = audio
96
+ wav = convert_audio(wav, sr, tokenizer.sample_rate, tokenizer.channels)
97
+ wav = wav.unsqueeze(0)
98
+
99
+ # Extract discrete codes from EnCodec
100
+ with torch.no_grad():
101
+ encoded_frames = tokenizer.encode(wav)
102
+ return encoded_frames
103
+
104
+
105
+ if __name__ == "__main__":
106
+ model = EncodecModel.encodec_model_24khz()
107
+ model.set_target_bandwidth(6.0)
108
+
109
+ samples = torch.from_numpy(np.random.random([4, 1, 1600])).type(
110
+ torch.float32
111
+ )
112
+ codes_raw = model.encode(samples)
113
+
114
+ remove_encodec_weight_norm(model)
115
+ codes_norm = model.encode(samples)
116
+
117
+ assert torch.allclose(codes_raw[0][0], codes_norm[0][0])
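
The __main__ block above only checks that stripping weight norm leaves the codes unchanged; for context, a sketch of the encode/decode round trip that AudioTokenizer and tokenize_audio expose (the file name is illustrative):

import torch
import torchaudio

tokenizer = AudioTokenizer()                       # 24 kHz EnCodec at 6 kbps -> 8 codebooks
frames = tokenize_audio(tokenizer, "sample.wav")   # list of (codes, scale) tuples
codes = frames[0][0]                               # LongTensor of shape (1, 8, T_frames)
with torch.no_grad():
    wav_hat = tokenizer.decode(frames)             # (1, channels, T_samples) reconstruction
torchaudio.save("sample_reconstructed.wav", wav_hat[0].cpu(), tokenizer.sample_rate)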
descriptions.py ADDED
@@ -0,0 +1,27 @@
1
+ top_md = """
2
+ # VALL-E X
3
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1yyD_sz531QntLKowMHo-XxorsFBCfKul?usp=sharing)
4
+ VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
5
+ an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
6
+ This implementation supports zero-shot, monolingual and cross-lingual text-to-speech in three languages (English, Chinese, Japanese).<br>
7
+ See this [demo](https://plachtaa.github.io/) page for more details.
8
+ """
9
+
10
+ infer_from_audio_md = """
11
+ Upload a 3–10 second speech clip as the audio prompt and type in the text you'd like to synthesize.<br>
12
+ The model will synthesize the given text in the same voice as your audio prompt.<br>
13
+ It also tends to preserve the emotion and acoustic environment of the prompt speech.<br>
14
+ For faster inference, use **"Make prompt"** to get a `.npz` file as the encoded audio prompt, then use it with **"Infer from prompt"**
15
+ """
16
+
17
+ make_prompt_md = """
18
+ Upload a 3–10 second speech clip as the audio prompt.<br>
19
+ You will get a `.npz` file as the encoded audio prompt. Use it with **"Infer from prompt"**
20
+ """
21
+
22
+ infer_from_prompt_md = """
23
+ Faster than **"Infer from audio"**.<br>
24
+ You need to run **"Make prompt"** first, then upload the encoded prompt (a `.npz` file).
25
+ """
26
+
27
+ long_text_example = "Just a few years ago, there were no legions of deep learning scientists developing intelligent products and services at major companies and startups. When we entered the field, machine learning did not command headlines in daily newspapers. Our parents had no idea what machine learning was, let alone why we might prefer it to a career in medicine or law. Machine learning was a blue skies academic discipline whose industrial significance was limited to a narrow set of real-world applications, including speech recognition and computer vision. Moreover, many of these applications required so much domain knowledge that they were often regarded as entirely separate areas for which machine learning was one small component. At that time, neural networks—the predecessors of the deep learning methods that we focus on in this book—were generally regarded as outmoded."
epoch-10.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5fcd05ee0c9c84a16a7b44495c46262177e66d5d454c20ca5f1da9832dbd5ac
3
+ size 1482302113
images/vallex_framework.jpg ADDED
macros.py ADDED
@@ -0,0 +1,39 @@
1
+ NUM_LAYERS = 12
2
+ NUM_HEAD = 16
3
+ N_DIM = 1024
4
+ PREFIX_MODE = 1
5
+ NUM_QUANTIZERS = 8
6
+ SAMPLE_RATE = 24000
7
+
8
+ lang2token = {
9
+ 'zh': "[ZH]",
10
+ 'ja': "[JA]",
11
+ "en": "[EN]",
12
+ 'mix': "",
13
+ }
14
+
15
+ lang2code = {
16
+ 'zh': 0,
17
+ 'ja': 1,
18
+ "en": 2,
19
+ }
20
+
21
+ token2lang = {
22
+ '[ZH]': "zh",
23
+ '[JA]': "ja",
24
+ "[EN]": "en",
25
+ "": "mix"
26
+ }
27
+
28
+ code2lang = {
29
+ 0: 'zh',
30
+ 1: 'ja',
31
+ 2: "en",
32
+ }
33
+
34
+ langdropdown2token = {
35
+ 'English': "[EN]",
36
+ '中文': "[ZH]",
37
+ '日本語': "[JA]",
38
+ 'Mix': "",
39
+ }
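
These tables are the glue between the Gradio dropdowns and the model: a dropdown choice maps to a language token, the token brackets the text fed to the phoneme tokenizer, and the ISO code is what gets stored in an .npz prompt. A quick illustration of the round trip as app.py uses it:

choice = "中文"                                # value picked in a language dropdown
lang_token = langdropdown2token[choice]        # "[ZH]"
lang = token2lang[lang_token]                  # "zh"
tagged_text = lang_token + "你好" + lang_token  # "[ZH]你好[ZH]", fed to the phoneme tokenizer
assert code2lang[lang2code[lang]] == lang      # 0 -> "zh": code stored as lang_code in .npz prompts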
models/__init__.py ADDED
@@ -0,0 +1,126 @@
1
+ import argparse
2
+
3
+ import torch.nn as nn
4
+ # from icefall.utils import AttributeDict, str2bool
5
+
6
+ from .macros import (
7
+ NUM_AUDIO_TOKENS,
8
+ NUM_MEL_BINS,
9
+ NUM_SPEAKER_CLASSES,
10
+ NUM_TEXT_TOKENS,
11
+ SPEAKER_EMBEDDING_DIM,
12
+ )
13
+ from .vallex import VALLE, VALLF
14
+
15
+
16
+ def add_model_arguments(parser: argparse.ArgumentParser):
17
+ parser.add_argument(
18
+ "--model-name",
19
+ type=str,
20
+ default="VALL-E",
21
+ help="VALL-E, VALL-F, Transformer.",
22
+ )
23
+ parser.add_argument(
24
+ "--decoder-dim",
25
+ type=int,
26
+ default=1024,
27
+ help="Embedding dimension in the decoder model.",
28
+ )
29
+ parser.add_argument(
30
+ "--nhead",
31
+ type=int,
32
+ default=16,
33
+ help="Number of attention heads in the Decoder layers.",
34
+ )
35
+ parser.add_argument(
36
+ "--num-decoder-layers",
37
+ type=int,
38
+ default=12,
39
+ help="Number of Decoder layers.",
40
+ )
41
+ parser.add_argument(
42
+ "--scale-factor",
43
+ type=float,
44
+ default=1.0,
45
+ help="Model scale factor which will be assigned different meanings in different models.",
46
+ )
47
+ parser.add_argument(
48
+ "--norm-first",
49
+ type=bool,
50
+ default=True,
51
+ help="Pre or Post Normalization.",
52
+ )
53
+ parser.add_argument(
54
+ "--add-prenet",
55
+ type=bool,
56
+ default=False,
57
+ help="Whether add PreNet after Inputs.",
58
+ )
59
+
60
+ # VALL-E & F
61
+ parser.add_argument(
62
+ "--prefix-mode",
63
+ type=int,
64
+ default=1,
65
+ help="The mode for how to prefix VALL-E NAR Decoder, "
66
+ "0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance.",
67
+ )
68
+ parser.add_argument(
69
+ "--share-embedding",
70
+ type=bool,
71
+ default=True,
72
+ help="Share the parameters of the output projection layer with the parameters of the acoustic embedding.",
73
+ )
74
+ parser.add_argument(
75
+ "--prepend-bos",
76
+ type=bool,
77
+ default=False,
78
+ help="Whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs.",
79
+ )
80
+ parser.add_argument(
81
+ "--num-quantizers",
82
+ type=int,
83
+ default=8,
84
+ help="Number of Audio/Semantic quantization layers.",
85
+ )
86
+
87
+ # Transformer
88
+ parser.add_argument(
89
+ "--scaling-xformers",
90
+ type=bool,
91
+ default=False,
92
+ help="Apply Reworked Conformer scaling on Transformers.",
93
+ )
94
+
95
+
96
+ def get_model(params) -> nn.Module:
97
+ if params.model_name.lower() in ["vall-f", "vallf"]:
98
+ model = VALLF(
99
+ params.decoder_dim,
100
+ params.nhead,
101
+ params.num_decoder_layers,
102
+ norm_first=params.norm_first,
103
+ add_prenet=params.add_prenet,
104
+ prefix_mode=params.prefix_mode,
105
+ share_embedding=params.share_embedding,
106
+ nar_scale_factor=params.scale_factor,
107
+ prepend_bos=params.prepend_bos,
108
+ num_quantizers=params.num_quantizers,
109
+ )
110
+ elif params.model_name.lower() in ["vall-e", "valle"]:
111
+ model = VALLE(
112
+ params.decoder_dim,
113
+ params.nhead,
114
+ params.num_decoder_layers,
115
+ norm_first=params.norm_first,
116
+ add_prenet=params.add_prenet,
117
+ prefix_mode=params.prefix_mode,
118
+ share_embedding=params.share_embedding,
119
+ nar_scale_factor=params.scale_factor,
120
+ prepend_bos=params.prepend_bos,
121
+ num_quantizers=params.num_quantizers,
122
+ )
123
+ else:
124
+ raise ValueError(f"No such model: {params.model_name}")
125
+
126
+ return model
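
get_model expects a params object carrying the fields registered by add_model_arguments; the defaults here match N_DIM, NUM_HEAD and NUM_LAYERS in macros.py. A minimal sketch, assuming the package is importable as `models`:

import argparse
from models import add_model_arguments, get_model

parser = argparse.ArgumentParser()
add_model_arguments(parser)
params = parser.parse_args(["--model-name", "VALL-E"])   # 1024-dim, 16 heads, 12 layers by default
model = get_model(params)                                # returns a VALLE instance
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")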
models/macros.py ADDED
@@ -0,0 +1,11 @@
1
+ # Text
2
+ NUM_TEXT_TOKENS = 2048
3
+
4
+ # Audio
5
+ NUM_AUDIO_TOKENS = 1024 # EnCodec RVQ bins
6
+ NUM_MEL_BINS = 100 # BigVGAN bigvgan_24khz_100band
7
+
8
+
9
+ # Speaker
10
+ NUM_SPEAKER_CLASSES = 4096
11
+ SPEAKER_EMBEDDING_DIM = 64
models/vallex.py ADDED
@@ -0,0 +1,830 @@
1
+ # Copyright 2023 (authors: Feiteng Li)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import random
16
+ from typing import Dict, Iterator, List, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ # from icefall.utils import make_pad_mask
23
+ # from torchmetrics.classification import MulticlassAccuracy
24
+
25
+ from modules.embedding import SinePositionalEmbedding, TokenEmbedding
26
+ from modules.transformer import (
27
+ AdaptiveLayerNorm,
28
+ LayerNorm,
29
+ TransformerDecoderLayer,
30
+ TransformerEncoder,
31
+ TransformerEncoderLayer,
32
+ )
33
+
34
+ from .macros import NUM_AUDIO_TOKENS, NUM_TEXT_TOKENS
35
+
36
+
37
+ class Transpose(nn.Identity):
38
+ """(N, T, D) -> (N, D, T)"""
39
+
40
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
41
+ return input.transpose(1, 2)
42
+
43
+
44
+ # NOTE: There are two ways to implement the model
45
+ # 1) [VALL-F] standard TransformerDecoder, use x as memory
46
+ # 2) [VALL-E] modified TransformerDecoder like GPT-x(e.g. causal TransformerEncoder),
47
+ # use x as the prefix of decoder inputs
48
+ class VALLF(nn.Module):
49
+ """It implements https://arxiv.org/abs/2301.02111
50
+ "Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers"
51
+ """
52
+
53
+ def __init__(
54
+ self,
55
+ d_model: int,
56
+ nhead: int,
57
+ num_layers: int,
58
+ norm_first: bool = True,
59
+ add_prenet: bool = False,
60
+ decoder_cls: Union[
61
+ nn.TransformerDecoder, nn.TransformerEncoder
62
+ ] = nn.TransformerDecoder,
63
+ decoder_layer_cls: Union[
64
+ TransformerDecoderLayer, TransformerEncoderLayer
65
+ ] = TransformerDecoderLayer,
66
+ prefix_mode: int = 0,
67
+ share_embedding: bool = True,
68
+ nar_scale_factor: float = 1.0,
69
+ prepend_bos: bool = True,
70
+ num_quantizers: int = 8,
71
+ ):
72
+ """
73
+ Args:
74
+ d_model:
75
+ The number of expected features in the input (required).
76
+ nhead:
77
+ The number of heads in the multiheadattention models (required).
78
+ num_layers:
79
+ The number of sub-decoder-layers in the decoder (required).
80
+ """
81
+ super().__init__()
82
+ nar_d_model = int(d_model * nar_scale_factor)
83
+
84
+ self.ar_text_embedding = TokenEmbedding(d_model, NUM_TEXT_TOKENS) # W_x
85
+ self.nar_text_embedding = TokenEmbedding(nar_d_model, NUM_TEXT_TOKENS)
86
+
87
+ # ID NUM_AUDIO_TOKENS -> PAD
88
+ # ID NUM_AUDIO_TOKENS + 1 -> BOS
89
+ self.ar_audio_prepend_bos = prepend_bos
90
+ self.ar_audio_embedding = TokenEmbedding(
91
+ d_model, NUM_AUDIO_TOKENS + 1 + int(prepend_bos)
92
+ )
93
+
94
+ # PreNet
95
+ if add_prenet:
96
+ self.ar_text_prenet = nn.Sequential(
97
+ Transpose(),
98
+ nn.Conv1d(d_model, d_model, kernel_size=5, padding="same"),
99
+ nn.BatchNorm1d(d_model),
100
+ nn.ReLU(),
101
+ nn.Dropout(0.5),
102
+ nn.Conv1d(d_model, d_model, kernel_size=5, padding="same"),
103
+ nn.BatchNorm1d(d_model),
104
+ nn.ReLU(),
105
+ nn.Dropout(0.5),
106
+ nn.Conv1d(d_model, d_model, kernel_size=5, padding="same"),
107
+ nn.BatchNorm1d(d_model),
108
+ nn.ReLU(),
109
+ nn.Dropout(0.5),
110
+ Transpose(),
111
+ nn.Linear(d_model, d_model),
112
+ )
113
+
114
+ self.ar_audio_prenet = nn.Sequential(
115
+ nn.Linear(d_model, 256),
116
+ nn.ReLU(),
117
+ nn.Dropout(0.25),
118
+ nn.Linear(256, 256),
119
+ nn.ReLU(),
120
+ nn.Dropout(0.25),
121
+ nn.Linear(256, d_model),
122
+ )
123
+ else:
124
+ self.ar_text_prenet = nn.Identity()
125
+ self.ar_audio_prenet = nn.Identity()
126
+
127
+ self.ar_text_position = SinePositionalEmbedding(
128
+ d_model,
129
+ dropout=0.1,
130
+ scale=False,
131
+ alpha=True,
132
+ )
133
+ self.ar_audio_position = SinePositionalEmbedding(
134
+ d_model,
135
+ dropout=0.1,
136
+ scale=False,
137
+ alpha=True,
138
+ )
139
+
140
+ self.ar_decoder = decoder_cls(
141
+ decoder_layer_cls(
142
+ d_model,
143
+ nhead,
144
+ dim_feedforward=d_model * 4,
145
+ dropout=0.1,
146
+ batch_first=True,
147
+ norm_first=norm_first,
148
+ ),
149
+ num_layers=num_layers,
150
+ norm=LayerNorm(d_model) if norm_first else None,
151
+ )
152
+ self.ar_predict_layer = nn.Linear(
153
+ d_model, NUM_AUDIO_TOKENS + 1, bias=False
154
+ )
155
+
156
+ self.rng = random.Random(0)
157
+ self.num_heads = nhead
158
+ self.prefix_mode = prefix_mode
159
+ self.num_quantizers = num_quantizers
160
+
161
+ assert num_quantizers >= 1
162
+ if num_quantizers > 1:
163
+ self.nar_audio_embeddings = nn.ModuleList(
164
+ [TokenEmbedding(nar_d_model, NUM_AUDIO_TOKENS + 1)]
165
+ + [
166
+ TokenEmbedding(nar_d_model, NUM_AUDIO_TOKENS)
167
+ for i in range(num_quantizers - 1)
168
+ ]
169
+ ) # W_a
170
+
171
+ # PreNet
172
+ if add_prenet:
173
+ self.nar_text_prenet = nn.Sequential(
174
+ Transpose(),
175
+ nn.Conv1d(
176
+ nar_d_model, nar_d_model, kernel_size=5, padding="same"
177
+ ),
178
+ nn.BatchNorm1d(nar_d_model),
179
+ nn.ReLU(),
180
+ nn.Dropout(0.5),
181
+ nn.Conv1d(
182
+ nar_d_model, nar_d_model, kernel_size=5, padding="same"
183
+ ),
184
+ nn.BatchNorm1d(nar_d_model),
185
+ nn.ReLU(),
186
+ nn.Dropout(0.5),
187
+ nn.Conv1d(
188
+ nar_d_model, nar_d_model, kernel_size=5, padding="same"
189
+ ),
190
+ nn.BatchNorm1d(nar_d_model),
191
+ nn.ReLU(),
192
+ nn.Dropout(0.5),
193
+ Transpose(),
194
+ nn.Linear(nar_d_model, nar_d_model),
195
+ )
196
+ self.nar_audio_prenet = nn.Sequential(
197
+ nn.Linear(nar_d_model, 256),
198
+ nn.ReLU(),
199
+ nn.Dropout(0.25),
200
+ nn.Linear(256, 256),
201
+ nn.ReLU(),
202
+ nn.Dropout(0.25),
203
+ nn.Linear(256, nar_d_model),
204
+ )
205
+ else:
206
+ self.nar_text_prenet = nn.Identity()
207
+ self.nar_audio_prenet = nn.Identity()
208
+
209
+ self.nar_text_position = SinePositionalEmbedding(
210
+ nar_d_model,
211
+ dropout=0.0,
212
+ scale=False,
213
+ alpha=False,
214
+ )
215
+ self.nar_audio_position = SinePositionalEmbedding(
216
+ nar_d_model,
217
+ dropout=0.1,
218
+ scale=False,
219
+ alpha=False,
220
+ )
221
+
222
+ self.nar_decoder = decoder_cls(
223
+ decoder_layer_cls(
224
+ nar_d_model,
225
+ int(nhead * nar_scale_factor),
226
+ dim_feedforward=nar_d_model * 4,
227
+ dropout=0.1,
228
+ batch_first=True,
229
+ norm_first=norm_first,
230
+ adaptive_layer_norm=True,
231
+ ),
232
+ num_layers=int(num_layers * nar_scale_factor),
233
+ norm=AdaptiveLayerNorm(
234
+ nar_d_model, norm=nn.LayerNorm(nar_d_model)
235
+ )
236
+ if norm_first
237
+ else None,
238
+ )
239
+ self.nar_predict_layers = nn.ModuleList(
240
+ [
241
+ nn.Linear(nar_d_model, NUM_AUDIO_TOKENS, bias=False)
242
+ for i in range(num_quantizers - 1)
243
+ ]
244
+ )
245
+ self.nar_stage_embeddings = nn.ModuleList(
246
+ [
247
+ TokenEmbedding(nar_d_model, 1)
248
+ for i in range(num_quantizers - 1)
249
+ ]
250
+ )
251
+
252
+ if share_embedding:
253
+ # We share the parameters of the output projection layer with the parameters of the acoustic embedding Wa
254
+ # NOTE(Feiteng): In the experiment, this undermines accuracy
255
+ # self.ar_predict_layer.weight = self.ar_audio_embedding.weight
256
+
257
+ # We also share the parameters of the acoustic embedding layer and the output prediction layer,
258
+ # which means the weights of the j-th prediction layer are the same as the (j + 1)-th acoustic embedding layer.
259
+ for j in range(0, num_quantizers - 2):
260
+ self.nar_predict_layers[
261
+ j
262
+ ].weight = self.nar_audio_embeddings[j + 2].weight
263
+
264
+ def stage_parameters(self, stage: int = 1) -> Iterator[nn.Parameter]:
265
+ assert stage > 0
266
+ if stage == 1:
267
+ for name, param in self.named_parameters():
268
+ if name.startswith("ar_"):
269
+ print(f" AR parameter: {name}")
270
+ yield param
271
+
272
+ if stage == 2:
273
+ for name, param in self.named_parameters():
274
+ if name.startswith("nar_"):
275
+ print(f"NAR parameter: {name}")
276
+ yield param
277
+
278
+ def stage_named_parameters(
279
+ self, stage: int = 1
280
+ ) -> Iterator[Tuple[str, nn.Parameter]]:
281
+ assert stage > 0
282
+ if stage == 1:
283
+ for pair in self.named_parameters():
284
+ if pair[0].startswith("ar_"):
285
+ yield pair
286
+
287
+ if stage == 2:
288
+ for pair in self.named_parameters():
289
+ if pair[0].startswith("nar_"):
290
+ yield pair
291
+
292
+ def pad_y_eos(self, y, y_mask_int, eos_id):
293
+ targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad(
294
+ y_mask_int, (0, 1), value=1
295
+ )
296
+ # inputs, targets
297
+ if self.ar_audio_prepend_bos:
298
+ return (
299
+ F.pad(targets[:, :-1], (1, 0), value=NUM_AUDIO_TOKENS + 1),
300
+ targets,
301
+ )
302
+
303
+ return targets[:, :-1], targets[:, 1:]
304
+
305
+ def _prepare_prompts(self, y, y_lens, codes, nar_stage, y_prompts_codes, prefix_mode):
306
+ # 5.1 For the NAR acoustic prompt tokens, we select a random segment waveform of 3 seconds
307
+ # from the same utterance.
308
+ # We implement this differently.
309
+ if prefix_mode == 0:
310
+ # no prefix
311
+ prefix_len = 0
312
+ y_emb = self.nar_audio_embeddings[0](y)
313
+ for j in range(1, nar_stage):
314
+ # Formula (4) (5)
315
+ y_emb = y_emb + self.nar_audio_embeddings[j](codes[..., j])
316
+ elif prefix_mode == 1:
317
+ # prefix at the beginning
318
+ int_low = (0.25 * y_lens.min()).type(torch.int64).item()
319
+ prefix_len = torch.randint(0, int_low * 2, size=()).item()
320
+ prefix_len = min(prefix_len, 225) # 24000/320 * 3s = 225 frames
321
+
322
+ y_prompts = self.nar_audio_embeddings[0](y[:, :prefix_len])
323
+ y_emb = self.nar_audio_embeddings[0](y[:, prefix_len:])
324
+ for j in range(1, self.num_quantizers):
325
+ y_prompts += self.nar_audio_embeddings[j](
326
+ codes[:, :prefix_len, j]
327
+ )
328
+ if j < nar_stage:
329
+ y_emb += self.nar_audio_embeddings[j](
330
+ codes[:, prefix_len:, j]
331
+ )
332
+ y_emb = torch.concat([y_prompts, y_emb], axis=1)
333
+ elif prefix_mode in [2, 4]:
334
+ if prefix_mode == 2:
335
+ # random prefix
336
+ prefix_len = min(225, int(0.25 * y_lens.min().item()))
337
+
338
+ y_prompts_codes = []
339
+ for b in range(codes.shape[0]):
340
+ start = self.rng.randint(0, y_lens[b].item() - prefix_len)
341
+ y_prompts_codes.append(
342
+ torch.clone(codes[b, start : start + prefix_len])
343
+ )
344
+ codes[
345
+ b, start : start + prefix_len, nar_stage
346
+ ] = NUM_AUDIO_TOKENS
347
+ y_prompts_codes = torch.stack(y_prompts_codes, dim=0)
348
+ else:
349
+ prefix_len = y_prompts_codes.shape[1]
350
+
351
+ y_prompts = self.nar_audio_embeddings[0](y_prompts_codes[..., 0])
352
+ y_emb = self.nar_audio_embeddings[0](y)
353
+ for j in range(1, self.num_quantizers):
354
+ y_prompts += self.nar_audio_embeddings[j](
355
+ y_prompts_codes[..., j]
356
+ )
357
+ if j < nar_stage:
358
+ y_emb += self.nar_audio_embeddings[j](codes[..., j])
359
+ y_emb = torch.concat([y_prompts, y_emb], axis=1)
360
+ else:
361
+ raise ValueError
362
+
363
+ return y_emb, prefix_len
364
+
365
+ def forward(
366
+ self,
367
+ x: torch.Tensor,
368
+ x_lens: torch.Tensor,
369
+ y: Union[torch.Tensor],
370
+ y_lens: Union[torch.Tensor],
371
+ reduction: str = "sum",
372
+ train_stage: int = 0,
373
+ **kwargs,
374
+ ) -> Tuple[torch.Tensor, Union[torch.Tensor, None]]:
375
+ raise NotImplementedError
376
+
377
+ def inference(
378
+ self,
379
+ x: torch.Tensor,
380
+ x_lens: torch.Tensor,
381
+ y: torch.Tensor,
382
+ enroll_x_lens: Union[torch.Tensor, None] = None,
383
+ top_k: int = -100,
384
+ temperature: float = 1.0,
385
+ ) -> torch.Tensor:
386
+ raise NotImplementedError
387
+
388
+ def visualize(
389
+ self,
390
+ predicts: Tuple[torch.Tensor],
391
+ batch: Dict[str, Union[List, torch.Tensor]],
392
+ output_dir: str,
393
+ limit: int = 4,
394
+ ) -> None:
395
+ raise NotImplementedError
396
+
397
+
398
+ class VALLE(VALLF):
399
+ """It implements https://arxiv.org/abs/2301.02111
400
+ "Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers"
401
+ """
402
+
403
+ def __init__(
404
+ self,
405
+ d_model: int,
406
+ nhead: int,
407
+ num_layers: int,
408
+ norm_first: bool = True,
409
+ add_prenet: bool = False,
410
+ prefix_mode: int = 0,
411
+ share_embedding: bool = True,
412
+ nar_scale_factor: float = 1.0,
413
+ **kwargs,
414
+ ):
415
+ """
416
+ Args:
417
+ d_model:
418
+ The number of expected features in the input (required).
419
+ nhead:
420
+ The number of heads in the multiheadattention models (required).
421
+ num_layers:
422
+ The number of sub-decoder-layers in the decoder (required).
423
+ """
424
+ super(VALLE, self).__init__(
425
+ d_model,
426
+ nhead,
427
+ num_layers,
428
+ norm_first=norm_first,
429
+ add_prenet=add_prenet,
430
+ decoder_cls=TransformerEncoder,
431
+ decoder_layer_cls=TransformerEncoderLayer,
432
+ prefix_mode=prefix_mode,
433
+ share_embedding=share_embedding,
434
+ nar_scale_factor=nar_scale_factor,
435
+ **kwargs,
436
+ )
437
+ self.language_ID = {
438
+ 'en': 0,
439
+ 'zh': 1,
440
+ 'ja': 2,
441
+ }
442
+ self.ar_language_embedding = TokenEmbedding(d_model, len(self.language_ID))
443
+ self.nar_language_embedding = TokenEmbedding(d_model, len(self.language_ID))
444
+
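A minimal illustration of what the language-ID embeddings above provide (the nn.Embedding stand-in and the d_model value are assumptions for this sketch, not taken from the commit): each utterance's language tag is mapped to a learned vector that is added to the text embeddings.

import torch
import torch.nn as nn

d_model = 1024                                       # illustrative model width
language_ID = {'en': 0, 'zh': 1, 'ja': 2}
lang_emb = nn.Embedding(len(language_ID), d_model)   # stand-in for TokenEmbedding

lang_id = torch.tensor([language_ID['ja']])          # shape (1,)
lang_vec = lang_emb(lang_id)                         # shape (1, d_model), added to the text embedding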
445
+ def forward(
446
+ self,
447
+ x: torch.Tensor,
448
+ x_lens: torch.Tensor,
449
+ y: torch.Tensor,
450
+ y_lens: torch.Tensor,
451
+ reduction: str = "sum",
452
+ train_stage: int = 0,
453
+ **kwargs,
454
+ ):
455
+ raise NotImplementedError
456
+ def inference(
457
+ self,
458
+ x: torch.Tensor,
459
+ x_lens: torch.Tensor,
460
+ y: torch.Tensor,
461
+ enroll_x_lens: torch.Tensor,
462
+ top_k: int = -100,
463
+ temperature: float = 1.0,
464
+ prompt_language: str = None,
465
+ text_language: str = None,
466
+ ) -> torch.Tensor:
467
+ """
468
+ Args:
469
+ x:
470
+ A 2-D tensor of shape (1, S).
471
+ x_lens:
472
+ A 1-D tensor of shape (1,). It contains the number of tokens in `x`
473
+ before padding.
474
+ y:
475
+ A 3-D tensor of shape (1, T, 8).
476
+ top_k: (`optional`) int
477
+ The number of highest probability tokens to keep for top-k filtering. Defaults to -100, which disables top-k filtering.
478
+ temperature: (`optional`) float
479
+ The value used to modulate the next token probabilities. Must be strictly positive. Defaults to 1.0.
480
+ Returns:
481
+ Return the predicted audio code matrix.
482
+ """
483
+ assert x.ndim == 2, x.shape
484
+ assert x_lens.ndim == 1, x_lens.shape
485
+ assert y.ndim == 3, y.shape
486
+ assert y.shape[0] == 1, y.shape
487
+
488
+ assert torch.all(x_lens > 0)
489
+
490
+ # NOTE: x has been padded in TextTokenCollater
491
+ text = x
492
+ x = self.ar_text_embedding(text)
493
+ # Add language embedding
494
+ prompt_language_id = torch.LongTensor(np.array([self.language_ID[prompt_language]])).to(x.device)
495
+ if isinstance(text_language, str):
496
+ text_language_id = torch.LongTensor(np.array([self.language_ID[text_language]])).to(x.device)
497
+ elif isinstance(text_language, List):
498
+ text_language_id = torch.LongTensor(np.array([self.language_ID[tl] for tl in text_language])).to(x.device)
499
+ x[:, :enroll_x_lens, :] += self.ar_language_embedding(prompt_language_id)
500
+ x[:, enroll_x_lens:, :] += self.ar_language_embedding(text_language_id)
501
+ x = self.ar_text_prenet(x)
502
+ x = self.ar_text_position(x)
503
+
504
+ text_len = x_lens.max()
505
+ prompts = y
506
+ prefix_len = y.shape[1]
507
+
508
+ # AR Decoder
509
+ # TODO: manage decoder steps to avoid repetitive computation
510
+ y = prompts[..., 0]
511
+ if self.ar_audio_prepend_bos:
512
+ y = F.pad(y, (1, 0), value=NUM_AUDIO_TOKENS + 1)
513
+
514
+ x_len = x_lens.max()
515
+ x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
516
+
517
+ kv_cache = None
518
+ use_kv_caching = True
519
+ while True:
520
+ y_emb = self.ar_audio_embedding(y)
521
+ y_emb = self.ar_audio_prenet(y_emb)
522
+ y_pos = self.ar_audio_position(y_emb)
523
+ xy_pos = torch.concat([x, y_pos], dim=1)
524
+
525
+ y_len = y.shape[1]
526
+ x_attn_mask_pad = F.pad(
527
+ x_attn_mask,
528
+ (0, y_len),
529
+ value=True,
530
+ )
531
+ y_attn_mask = F.pad(
532
+ torch.triu(
533
+ torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1
534
+ ),
535
+ (x_len, 0),
536
+ value=False,
537
+ )
538
+ xy_attn_mask = torch.concat(
539
+ [x_attn_mask_pad, y_attn_mask], dim=0
540
+ ).to(y.device)
541
+
542
+
543
+ if use_kv_caching and kv_cache is not None:
544
+ xy_pos = xy_pos[:, [-1]]
545
+ else:
546
+ pass
547
+
548
+ xy_dec, kv_cache = self.ar_decoder.infer(
549
+ xy_pos,
550
+ mask=xy_attn_mask,
551
+ past_kv=kv_cache,
552
+ use_cache=use_kv_caching,
553
+ )
554
+ # xy_dec, _ = self.ar_decoder(
555
+ # (xy_pos, None),
556
+ # mask=xy_attn_mask,
557
+ # )
558
+
559
+ logits = self.ar_predict_layer(xy_dec[:, -1])
560
+ samples = topk_sampling(
561
+ logits, top_k=top_k, top_p=1, temperature=temperature
562
+ )
563
+
564
+ if (
565
+ torch.argmax(logits, dim=-1)[0] == NUM_AUDIO_TOKENS
566
+ or samples[0, 0] == NUM_AUDIO_TOKENS
567
+ or (y.shape[1] - prompts.shape[1]) > x_lens.max() * 16
568
+ ):
569
+ if prompts.shape[1] == y.shape[1]:
570
+ raise SyntaxError(
571
+ "well trained model shouldn't reach here."
572
+ )
573
+
574
+ print(f"VALL-E EOS [{prompts.shape[1]} -> {y.shape[1]}]")
575
+ break
576
+
577
+ y = torch.concat([y, samples], dim=1)
578
+
579
+ codes = [y[:, prefix_len + int(self.ar_audio_prepend_bos) :]]
580
+ if self.num_quantizers == 1:
581
+ return torch.stack(codes, dim=-1)
582
+
583
+ # Non-AR Decoders
584
+ y_emb = self.nar_audio_embeddings[0](
585
+ y[:, int(self.ar_audio_prepend_bos) :]
586
+ )
587
+
588
+ if self.prefix_mode in [2, 4]: # Exclude enrolled_phonemes
589
+ enrolled_len = enroll_x_lens.max().item()
590
+ # SOS + Synthesis Text + EOS
591
+ text = torch.concat(
592
+ [
593
+ text[:, :1],
594
+ text[:, enrolled_len - 1 :],
595
+ ],
596
+ dim=1,
597
+ )
598
+ text_len = text_len - (enrolled_len - 2)
599
+ assert text.shape[0] == 1
600
+
601
+ x = self.nar_text_embedding(text)
602
+ # Add language embedding
603
+ prompt_language_id = torch.LongTensor(np.array([self.language_ID[prompt_language]])).to(x.device)
604
+ if isinstance(text_language, str):
605
+ text_language_id = torch.LongTensor(np.array([self.language_ID[text_language]])).to(x.device)
606
+ elif isinstance(text_language, List):
607
+ text_language_id = torch.LongTensor(np.array([self.language_ID[tl] for tl in text_language])).to(x.device)
608
+ x[:, :enroll_x_lens, :] += self.nar_language_embedding(prompt_language_id)
609
+ x[:, enroll_x_lens:, :] += self.nar_language_embedding(text_language_id)
610
+ x = self.nar_text_prenet(x)
611
+ x = self.nar_text_position(x)
612
+
613
+ if self.prefix_mode == 0:
614
+ for i, (predict_layer, embedding_layer) in enumerate(
615
+ zip(
616
+ self.nar_predict_layers,
617
+ self.nar_audio_embeddings[1:],
618
+ )
619
+ ):
620
+ y_pos = self.nar_audio_prenet(y_emb)
621
+ y_pos = self.nar_audio_position(y_pos)
622
+ xy_pos = torch.concat([x, y_pos], dim=1)
623
+
624
+ xy_dec, _ = self.nar_decoder(
625
+ (xy_pos, self.nar_stage_embeddings[i].weight)
626
+ )
627
+ logits = predict_layer(xy_dec[:, text_len + prefix_len :])
628
+
629
+ samples = torch.argmax(logits, dim=-1)
630
+ codes.append(samples)
631
+
632
+ if i < self.num_quantizers - 2:
633
+ y_emb[:, :prefix_len] += embedding_layer(
634
+ prompts[..., i + 1]
635
+ )
636
+ y_emb[:, prefix_len:] += embedding_layer(samples)
637
+ else:
638
+ for j in range(1, self.num_quantizers):
639
+ y_emb[:, :prefix_len] += self.nar_audio_embeddings[j](
640
+ prompts[..., j]
641
+ )
642
+
643
+ for i, (predict_layer, embedding_layer) in enumerate(
644
+ zip(
645
+ self.nar_predict_layers,
646
+ self.nar_audio_embeddings[1:],
647
+ )
648
+ ):
649
+ y_pos = self.nar_audio_prenet(y_emb)
650
+ y_pos = self.nar_audio_position(y_pos)
651
+ xy_pos = torch.concat([x, y_pos], dim=1)
652
+
653
+ xy_dec, _ = self.nar_decoder(
654
+ (xy_pos, self.nar_stage_embeddings[i].weight)
655
+ )
656
+ logits = predict_layer(xy_dec[:, text_len + prefix_len :])
657
+
658
+ samples = torch.argmax(logits, dim=-1)
659
+ codes.append(samples)
660
+
661
+ if i < self.num_quantizers - 2:
662
+ y_emb[:, prefix_len:] += embedding_layer(samples)
663
+
664
+ assert len(codes) == self.num_quantizers
665
+ return torch.stack(codes, dim=-1)
666
+
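A hedged usage sketch for the inference path above. Shapes follow the docstring; the model construction, checkpoint loading, text tokenization, and codec encoding/decoding steps are assumed and not shown, and the variable names below are placeholders.

import torch

# Assumptions: `model` is a trained VALLE instance, `text_tokens` holds the phoneme IDs of
# prompt text + synthesis text with shape (1, S), `prompt_codes` holds the enrolled audio's
# codec codes with shape (1, T, 8), and `enroll_len` is the prompt-text token count.
with torch.no_grad():
    codes = model.inference(
        text_tokens,
        torch.tensor([text_tokens.shape[1]]),      # x_lens
        prompt_codes,
        enroll_x_lens=torch.tensor([enroll_len]),
        top_k=-100,
        temperature=1.0,
        prompt_language="en",
        text_language="en",
    )
# `codes` has shape (1, T', 8) and would be decoded back to a waveform by the codec.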
667
+ def continual(
668
+ self,
669
+ x: torch.Tensor,
670
+ x_lens: torch.Tensor,
671
+ y: torch.Tensor,
672
+ ) -> torch.Tensor:
673
+ """
674
+ Args:
675
+ x:
676
+ A 2-D tensor of shape (1, S).
677
+ x_lens:
678
+ A 1-D tensor of shape (1,). It contains the number of tokens in `x`
679
+ before padding.
680
+ y:
681
+ A 3-D tensor of shape (1, T, 8).
682
+ Returns:
683
+ Return the predicted audio code matrix.
684
+ """
685
+ assert x.ndim == 2, x.shape
686
+ assert x_lens.ndim == 1, x_lens.shape
687
+ assert y.ndim == 3, y.shape
688
+ assert y.shape[0] == 1, y.shape
689
+
690
+ assert torch.all(x_lens > 0)
691
+ assert self.num_quantizers == 8
692
+
693
+ # NOTE: x has been padded in TextTokenCollater
694
+ text = x
695
+ x = self.ar_text_embedding(text)
696
+ x = self.ar_text_prenet(x)
697
+ x = self.ar_text_position(x)
698
+
699
+ text_len = x_lens.max()
700
+
701
+ prefix_len = min(int(y.shape[1] * 0.5), 3 * 75)
702
+
703
+ # AR Decoder
704
+ prompts = y[:, :prefix_len]
705
+
706
+ codes = [y[:, prefix_len:, 0]]
707
+ # Non-AR Decoders
708
+ x = self.nar_text_embedding(text)
709
+ x = self.nar_text_prenet(x)
710
+ x = self.nar_text_position(x)
711
+
712
+ y_emb = self.nar_audio_embeddings[0](y[..., 0])
713
+
714
+ if self.prefix_mode == 0:
715
+ for i, (predict_layer, embedding_layer) in enumerate(
716
+ zip(
717
+ self.nar_predict_layers,
718
+ self.nar_audio_embeddings[1:],
719
+ )
720
+ ):
721
+ y_pos = self.nar_audio_position(y_emb)
722
+ y_pos = self.nar_audio_prenet(y_pos)
723
+ xy_pos = torch.concat([x, y_pos], dim=1)
724
+
725
+ xy_dec, _ = self.nar_decoder(
726
+ (xy_pos, self.nar_stage_embeddings[i].weight)
727
+ )
728
+ logits = predict_layer(xy_dec[:, text_len + prefix_len :])
729
+
730
+ samples = torch.argmax(logits, dim=-1)
731
+ codes.append(samples)
732
+
733
+ if i < 6:
734
+ y_emb[:, :prefix_len] += embedding_layer(
735
+ prompts[..., i + 1]
736
+ )
737
+ y_emb[:, prefix_len:] += embedding_layer(samples)
738
+ else:
739
+ for j in range(1, 8):
740
+ y_emb[:, :prefix_len] += self.nar_audio_embeddings[j](
741
+ prompts[..., j]
742
+ )
743
+
744
+ for i, (predict_layer, embedding_layer) in enumerate(
745
+ zip(
746
+ self.nar_predict_layers,
747
+ self.nar_audio_embeddings[1:],
748
+ )
749
+ ):
750
+ y_pos = self.nar_audio_prenet(y_emb)
751
+ y_pos = self.nar_audio_position(y_pos)
752
+ xy_pos = torch.concat([x, y_pos], dim=1)
753
+
754
+ xy_dec, _ = self.nar_decoder(
755
+ (xy_pos, self.nar_stage_embeddings[i].weight)
756
+ )
757
+ logits = predict_layer(xy_dec[:, text_len + prefix_len :])
758
+
759
+ samples = torch.argmax(logits, dim=-1)
760
+ codes.append(samples)
761
+
762
+ if i < 6:
763
+ y_emb[:, prefix_len:] += embedding_layer(samples)
764
+
765
+ assert len(codes) == 8
766
+ return torch.stack(codes, dim=-1)
767
+
768
+
769
+ # https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
770
+ def top_k_top_p_filtering(
771
+ logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1
772
+ ):
773
+ """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
774
+ Args:
775
+ logits: logits distribution shape (batch size, vocabulary size)
776
+ if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
777
+ if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
778
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
779
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
780
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
781
+ """
782
+ if top_k > 0:
783
+ top_k = min(
784
+ max(top_k, min_tokens_to_keep), logits.size(-1)
785
+ ) # Safety check
786
+ # Remove all tokens with a probability less than the last token of the top-k
787
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
788
+ logits[indices_to_remove] = filter_value
789
+
790
+ if top_p < 1.0:
791
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
792
+ cumulative_probs = torch.cumsum(
793
+ F.softmax(sorted_logits, dim=-1), dim=-1
794
+ )
795
+
796
+ # Remove tokens with cumulative probability above the threshold (the highest-probability token is always kept)
797
+ sorted_indices_to_remove = cumulative_probs > top_p
798
+ if min_tokens_to_keep > 1:
799
+ # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
800
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
801
+ # Shift the indices to the right to keep also the first token above the threshold
802
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
803
+ ..., :-1
804
+ ].clone()
805
+ sorted_indices_to_remove[..., 0] = 0
806
+
807
+ # scatter sorted tensors to original indexing
808
+ indices_to_remove = sorted_indices_to_remove.scatter(
809
+ 1, sorted_indices, sorted_indices_to_remove
810
+ )
811
+ logits[indices_to_remove] = filter_value
812
+ return logits
813
+
814
+
815
+ def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0):
816
+ # temperature: (`optional`) float
817
+ # The value used to modulate the next token probabilities. Must be strictly positive. Defaults to 1.0.
818
+ # top_k: (`optional`) int
819
+ # The number of highest probability vocabulary tokens to keep for top-k filtering. Between 1 and infinity. Defaults to 10.
820
+ # top_p: (`optional`) float
821
+ # The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1.
822
+
823
+ # Temperature (higher temperature => more likely to sample low probability tokens)
824
+ if temperature != 1.0:
825
+ logits = logits / temperature
826
+ # Top-p/top-k filtering
827
+ logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
828
+ # Sample
829
+ token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
830
+ return token
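A small standalone exercise of the two sampling helpers above, using synthetic logits (a sketch only; the batch size and vocabulary size are arbitrary):

import torch

torch.manual_seed(0)
logits = torch.randn(1, 1024)                    # (batch, vocab)
# Clone because top_k_top_p_filtering writes the filter value into the tensor in place.
token = topk_sampling(logits.clone(), top_k=10, top_p=0.9, temperature=0.8)
print(token.shape)                               # torch.Size([1, 1])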
modules/__init__.py ADDED
File without changes
modules/activation.py ADDED
@@ -0,0 +1,612 @@
1
+ from typing import Optional, Tuple, List
2
+ import math
3
+
4
+ import torch
5
+ from torch import Tensor
6
+ from torch.nn import Linear, Module
7
+ from torch.nn import functional as F
8
+ from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
9
+ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
10
+ from torch.nn.parameter import Parameter
11
+
12
+ def _in_projection_packed(
13
+ q: Tensor,
14
+ k: Tensor,
15
+ v: Tensor,
16
+ w: Tensor,
17
+ b: Optional[Tensor] = None,
18
+ ) -> List[Tensor]:
19
+ r"""
20
+ Performs the in-projection step of the attention operation, using packed weights.
21
+ Output is a triple containing projection tensors for query, key and value.
22
+
23
+ Args:
24
+ q, k, v: query, key and value tensors to be projected. For self-attention,
25
+ these are typically the same tensor; for encoder-decoder attention,
26
+ k and v are typically the same tensor. (We take advantage of these
27
+ identities for performance if they are present.) Regardless, q, k and v
28
+ must share a common embedding dimension; otherwise their shapes may vary.
29
+ w: projection weights for q, k and v, packed into a single tensor. Weights
30
+ are packed along dimension 0, in q, k, v order.
31
+ b: optional projection biases for q, k and v, packed into a single tensor
32
+ in q, k, v order.
33
+
34
+ Shape:
35
+ Inputs:
36
+ - q: :math:`(..., E)` where E is the embedding dimension
37
+ - k: :math:`(..., E)` where E is the embedding dimension
38
+ - v: :math:`(..., E)` where E is the embedding dimension
39
+ - w: :math:`(E * 3, E)` where E is the embedding dimension
40
+ - b: :math:`E * 3` where E is the embedding dimension
41
+
42
+ Output:
43
+ - in output list :math:`[q', k', v']`, each output tensor will have the
44
+ same shape as the corresponding input tensor.
45
+ """
46
+ E = q.size(-1)
47
+ if k is v:
48
+ if q is k:
49
+ # self-attention
50
+ return F.linear(q, w, b).chunk(3, dim=-1)
51
+ else:
52
+ # encoder-decoder attention
53
+ w_q, w_kv = w.split([E, E * 2])
54
+ if b is None:
55
+ b_q = b_kv = None
56
+ else:
57
+ b_q, b_kv = b.split([E, E * 2])
58
+ return (F.linear(q, w_q, b_q),) + F.linear(k, w_kv, b_kv).chunk(2, dim=-1)
59
+ else:
60
+ w_q, w_k, w_v = w.chunk(3)
61
+ if b is None:
62
+ b_q = b_k = b_v = None
63
+ else:
64
+ b_q, b_k, b_v = b.chunk(3)
65
+ return F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v)
66
+
67
+ def _scaled_dot_product_attention(
68
+ q: Tensor,
69
+ k: Tensor,
70
+ v: Tensor,
71
+ attn_mask: Optional[Tensor] = None,
72
+ dropout_p: float = 0.0,
73
+ ) -> Tuple[Tensor, Tensor]:
74
+ r"""
75
+ Computes scaled dot product attention on query, key and value tensors, using
76
+ an optional attention mask if passed, and applying dropout if a probability
77
+ greater than 0.0 is specified.
78
+ Returns a tensor pair containing attended values and attention weights.
79
+
80
+ Args:
81
+ q, k, v: query, key and value tensors. See Shape section for shape details.
82
+ attn_mask: optional tensor containing mask values to be added to calculated
83
+ attention. May be 2D or 3D; see Shape section for details.
84
+ dropout_p: dropout probability. If greater than 0.0, dropout is applied.
85
+
86
+ Shape:
87
+ - q: :math:`(B, Nt, E)` where B is batch size, Nt is the target sequence length,
88
+ and E is embedding dimension.
89
+ - key: :math:`(B, Ns, E)` where B is batch size, Ns is the source sequence length,
90
+ and E is embedding dimension.
91
+ - value: :math:`(B, Ns, E)` where B is batch size, Ns is the source sequence length,
92
+ and E is embedding dimension.
93
+ - attn_mask: either a 3D tensor of shape :math:`(B, Nt, Ns)` or a 2D tensor of
94
+ shape :math:`(Nt, Ns)`.
95
+
96
+ - Output: attention values have shape :math:`(B, Nt, E)`; attention weights
97
+ have shape :math:`(B, Nt, Ns)`
98
+ """
99
+ B, Nt, E = q.shape
100
+ q = q / math.sqrt(E)
101
+ # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
102
+ if attn_mask is not None:
103
+ attn = torch.baddbmm(attn_mask, q, k.transpose(-2, -1))
104
+ else:
105
+ attn = torch.bmm(q, k.transpose(-2, -1))
106
+
107
+ attn = F.softmax(attn, dim=-1)
108
+ if dropout_p > 0.0:
109
+ attn = F.dropout(attn, p=dropout_p)
110
+ # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
111
+ output = torch.bmm(attn, v)
112
+ return output, attn
113
+
114
+ def multi_head_attention_forward(
115
+ x,
116
+ ipw,
117
+ ipb,
118
+ opw,
119
+ opb,
120
+ n_head,
121
+ attn_mask,
122
+ past_kv=None,
123
+ use_cache=False,
124
+ ):
125
+ # x = x.transpose(1, 0)
126
+ # tgt_len, bsz, embed_dim = x.shape
127
+ # head_dim = embed_dim // n_head
128
+ # q, k, v = _in_projection_packed(x, x, x, ipw, ipb)
129
+ # q = q.contiguous().view(tgt_len, bsz * n_head, head_dim).transpose(0, 1)
130
+ # k = k.contiguous().view(k.shape[0], bsz * n_head, head_dim).transpose(0, 1)
131
+ # v = v.contiguous().view(v.shape[0], bsz * n_head, head_dim).transpose(0, 1)
132
+
133
+ # new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
134
+ # new_attn_mask.masked_fill_(attn_mask, float("-inf"))
135
+ # attn_mask = new_attn_mask
136
+ #
137
+ # attn_output, attn_output_weights = _scaled_dot_product_attention(q, k, v, attn_mask, 0.0)
138
+ # attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
139
+ # attn_output = torch._C._nn.linear(attn_output, opw, opb)
140
+ # attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
141
+
142
+ B, T, C = x.size()
143
+
144
+ q, k, v = torch._C._nn.linear(x, ipw, ipb).chunk(3, dim=-1)
145
+ k = k.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
146
+ q = q.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
147
+ v = v.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
148
+ if past_kv is not None:
149
+ past_key = past_kv[0]
150
+ past_value = past_kv[1]
151
+ k = torch.cat((past_key, k), dim=-2)
152
+ v = torch.cat((past_value, v), dim=-2)
153
+
154
+ FULL_T = k.shape[-2]
155
+
156
+ if use_cache is True:
157
+ present = (k, v)
158
+ else:
159
+ present = None
160
+
161
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
162
+ att = att.masked_fill(attn_mask[FULL_T - T:FULL_T, :FULL_T], float('-inf'))
163
+ att = F.softmax(att, dim=-1)
164
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
165
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
166
+ y = torch._C._nn.linear(y, opw, opb)
167
+ return (y, present)
168
+
169
+
170
+ class MultiheadAttention(Module):
171
+ r"""Allows the model to jointly attend to information
172
+ from different representation subspaces as described in the paper:
173
+ `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
174
+
175
+ Multi-Head Attention is defined as:
176
+
177
+ .. math::
178
+ \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
179
+
180
+ where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
181
+
182
+ ``forward()`` will use a special optimized implementation if all of the following
183
+ conditions are met:
184
+
185
+ - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This
186
+ restriction will be loosened in the future.)
187
+ - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
188
+ - training is disabled (using ``.eval()``)
189
+ - dropout is 0
190
+ - ``add_bias_kv`` is ``False``
191
+ - ``add_zero_attn`` is ``False``
192
+ - ``batch_first`` is ``True`` and the input is batched
193
+ - ``kdim`` and ``vdim`` are equal to ``embed_dim``
194
+ - at most one of ``key_padding_mask`` or ``attn_mask`` is passed
195
+ - if a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ is passed, neither ``key_padding_mask``
196
+ nor ``attn_mask`` is passed
197
+
198
+ If the optimized implementation is in use, a
199
+ `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be passed for
200
+ ``query``/``key``/``value`` to represent padding more efficiently than using a
201
+ padding mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_
202
+ will be returned, and an additional speedup proportional to the fraction of the input
203
+ that is padding can be expected.
204
+
205
+ Args:
206
+ embed_dim: Total dimension of the model.
207
+ num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split
208
+ across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``).
209
+ dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout).
210
+ bias: If specified, adds bias to input / output projection layers. Default: ``True``.
211
+ add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
212
+ add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
213
+ Default: ``False``.
214
+ kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
215
+ vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
216
+ batch_first: If ``True``, then the input and output tensors are provided
217
+ as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
218
+
219
+ Examples::
220
+
221
+ >>> # xdoctest: +SKIP
222
+ >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
223
+ >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
224
+
225
+ """
226
+ __constants__ = ["batch_first"]
227
+ bias_k: Optional[torch.Tensor]
228
+ bias_v: Optional[torch.Tensor]
229
+
230
+ def __init__(
231
+ self,
232
+ embed_dim,
233
+ num_heads,
234
+ dropout=0.0,
235
+ bias=True,
236
+ add_bias_kv=False,
237
+ add_zero_attn=False,
238
+ kdim=None,
239
+ vdim=None,
240
+ batch_first=False,
241
+ linear1_cls=Linear,
242
+ linear2_cls=Linear,
243
+ device=None,
244
+ dtype=None,
245
+ ) -> None:
246
+ factory_kwargs = {"device": device, "dtype": dtype}
247
+ super(MultiheadAttention, self).__init__()
248
+ self.embed_dim = embed_dim
249
+ self.kdim = kdim if kdim is not None else embed_dim
250
+ self.vdim = vdim if vdim is not None else embed_dim
251
+ self._qkv_same_embed_dim = (
252
+ self.kdim == embed_dim and self.vdim == embed_dim
253
+ )
254
+
255
+ self.num_heads = num_heads
256
+ self.dropout = dropout
257
+ self.batch_first = batch_first
258
+ self.head_dim = embed_dim // num_heads
259
+ assert (
260
+ self.head_dim * num_heads == self.embed_dim
261
+ ), "embed_dim must be divisible by num_heads"
262
+
263
+ if add_bias_kv:
264
+ self.bias_k = Parameter(
265
+ torch.empty((1, 1, embed_dim), **factory_kwargs)
266
+ )
267
+ self.bias_v = Parameter(
268
+ torch.empty((1, 1, embed_dim), **factory_kwargs)
269
+ )
270
+ else:
271
+ self.bias_k = self.bias_v = None
272
+
273
+ if linear1_cls == Linear:
274
+ if not self._qkv_same_embed_dim:
275
+ self.q_proj_weight = Parameter(
276
+ torch.empty((embed_dim, embed_dim), **factory_kwargs)
277
+ )
278
+ self.k_proj_weight = Parameter(
279
+ torch.empty((embed_dim, self.kdim), **factory_kwargs)
280
+ )
281
+ self.v_proj_weight = Parameter(
282
+ torch.empty((embed_dim, self.vdim), **factory_kwargs)
283
+ )
284
+ self.register_parameter("in_proj_weight", None)
285
+ else:
286
+ self.in_proj_weight = Parameter(
287
+ torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)
288
+ )
289
+ self.register_parameter("q_proj_weight", None)
290
+ self.register_parameter("k_proj_weight", None)
291
+ self.register_parameter("v_proj_weight", None)
292
+
293
+ if bias:
294
+ self.in_proj_bias = Parameter(
295
+ torch.empty(3 * embed_dim, **factory_kwargs)
296
+ )
297
+ else:
298
+ self.register_parameter("in_proj_bias", None)
299
+ self.out_proj = NonDynamicallyQuantizableLinear(
300
+ embed_dim, embed_dim, bias=bias, **factory_kwargs
301
+ )
302
+
303
+ self._reset_parameters()
304
+ else:
305
+ if not self._qkv_same_embed_dim:
306
+ raise NotImplementedError
307
+ else:
308
+ self.in_proj_linear = linear1_cls(
309
+ embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs
310
+ )
311
+ self.in_proj_weight = self.in_proj_linear.weight
312
+
313
+ self.register_parameter("q_proj_weight", None)
314
+ self.register_parameter("k_proj_weight", None)
315
+ self.register_parameter("v_proj_weight", None)
316
+
317
+ if bias:
318
+ self.in_proj_bias = self.in_proj_linear.bias
319
+ else:
320
+ self.register_parameter("in_proj_bias", None)
321
+
322
+ self.out_proj = linear2_cls(
323
+ embed_dim, embed_dim, bias=bias, **factory_kwargs
324
+ )
325
+
326
+ if self.bias_k is not None:
327
+ xavier_normal_(self.bias_k)
328
+ if self.bias_v is not None:
329
+ xavier_normal_(self.bias_v)
330
+
331
+ self.add_zero_attn = add_zero_attn
332
+
333
+ def _reset_parameters(self):
334
+ if self._qkv_same_embed_dim:
335
+ xavier_uniform_(self.in_proj_weight)
336
+ else:
337
+ xavier_uniform_(self.q_proj_weight)
338
+ xavier_uniform_(self.k_proj_weight)
339
+ xavier_uniform_(self.v_proj_weight)
340
+
341
+ if self.in_proj_bias is not None:
342
+ constant_(self.in_proj_bias, 0.0)
343
+ constant_(self.out_proj.bias, 0.0)
344
+
345
+ if self.bias_k is not None:
346
+ xavier_normal_(self.bias_k)
347
+ if self.bias_v is not None:
348
+ xavier_normal_(self.bias_v)
349
+
350
+ def __setstate__(self, state):
351
+ # Support loading old MultiheadAttention checkpoints generated by v1.1.0
352
+ if "_qkv_same_embed_dim" not in state:
353
+ state["_qkv_same_embed_dim"] = True
354
+
355
+ super(MultiheadAttention, self).__setstate__(state)
356
+
357
+ def forward(
358
+ self,
359
+ query: Tensor,
360
+ key: Tensor,
361
+ value: Tensor,
362
+ key_padding_mask: Optional[Tensor] = None,
363
+ need_weights: bool = True,
364
+ attn_mask: Optional[Tensor] = None,
365
+ average_attn_weights: bool = True,
366
+ ) -> Tuple[Tensor, Optional[Tensor]]:
367
+ r"""
368
+ Args:
369
+ query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False``
370
+ or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length,
371
+ :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``.
372
+ Queries are compared against key-value pairs to produce the output.
373
+ See "Attention Is All You Need" for more details.
374
+ key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False``
375
+ or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length,
376
+ :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``.
377
+ See "Attention Is All You Need" for more details.
378
+ value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when
379
+ ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source
380
+ sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``.
381
+ See "Attention Is All You Need" for more details.
382
+ key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
383
+ to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`.
384
+ Binary and byte masks are supported.
385
+ For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
386
+ the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value.
387
+ need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
388
+ Default: ``True``.
389
+ attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape
390
+ :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
391
+ :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
392
+ broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
393
+ Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the
394
+ corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the
395
+ corresponding position is not allowed to attend. For a float mask, the mask values will be added to
396
+ the attention weight.
397
+ average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
398
+ heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
399
+ effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads)
400
+
401
+ Outputs:
402
+ - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched,
403
+ :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``,
404
+ where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the
405
+ embedding dimension ``embed_dim``.
406
+ - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``,
407
+ returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
408
+ :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
409
+ :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
410
+ head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`.
411
+
412
+ .. note::
413
+ `batch_first` argument is ignored for unbatched inputs.
414
+ """
415
+ is_batched = query.dim() == 3
416
+ if key_padding_mask is not None:
417
+ _kpm_dtype = key_padding_mask.dtype
418
+ if _kpm_dtype != torch.bool and not torch.is_floating_point(
419
+ key_padding_mask
420
+ ):
421
+ raise AssertionError(
422
+ "only bool and floating types of key_padding_mask are supported"
423
+ )
424
+ why_not_fast_path = ""
425
+ if not is_batched:
426
+ why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
427
+ elif query is not key or key is not value:
428
+ # When lifting this restriction, don't forget to either
429
+ # enforce that the dtypes all match or test cases where
430
+ # they don't!
431
+ why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
432
+ elif (
433
+ self.in_proj_bias is not None
434
+ and query.dtype != self.in_proj_bias.dtype
435
+ ):
436
+ why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
437
+ elif (
438
+ self.in_proj_weight is not None
439
+ and query.dtype != self.in_proj_weight.dtype
440
+ ):
441
+ # this case will fail anyway, but at least they'll get a useful error message.
442
+ why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
443
+ elif self.training:
444
+ why_not_fast_path = "training is enabled"
445
+ elif not self.batch_first:
446
+ why_not_fast_path = "batch_first was not True"
447
+ elif self.bias_k is not None:
448
+ why_not_fast_path = "self.bias_k was not None"
449
+ elif self.bias_v is not None:
450
+ why_not_fast_path = "self.bias_v was not None"
451
+ elif self.dropout:
452
+ why_not_fast_path = f"dropout was {self.dropout}, required zero"
453
+ elif self.add_zero_attn:
454
+ why_not_fast_path = "add_zero_attn was enabled"
455
+ elif not self._qkv_same_embed_dim:
456
+ why_not_fast_path = "_qkv_same_embed_dim was not True"
457
+ elif attn_mask is not None:
458
+ why_not_fast_path = "attn_mask was not None"
459
+ elif query.is_nested and key_padding_mask is not None:
460
+ why_not_fast_path = (
461
+ "key_padding_mask is not supported with NestedTensor input"
462
+ )
463
+ elif self.num_heads % 2 == 1:
464
+ why_not_fast_path = "num_heads is odd"
465
+ elif torch.is_autocast_enabled():
466
+ why_not_fast_path = "autocast is enabled"
467
+
468
+ if not why_not_fast_path:
469
+ tensor_args = (
470
+ query,
471
+ key,
472
+ value,
473
+ self.in_proj_weight,
474
+ self.in_proj_bias,
475
+ self.out_proj.weight,
476
+ self.out_proj.bias,
477
+ )
478
+ # We have to use list comprehensions below because TorchScript does not support
479
+ # generator expressions.
480
+ if torch.overrides.has_torch_function(tensor_args):
481
+ why_not_fast_path = "some Tensor argument has_torch_function"
482
+ elif not all(
483
+ [
484
+ (x is None or x.is_cuda or "cpu" in str(x.device))
485
+ for x in tensor_args
486
+ ]
487
+ ):
488
+ why_not_fast_path = (
489
+ "some Tensor argument is neither CUDA nor CPU"
490
+ )
491
+ elif torch.is_grad_enabled() and any(
492
+ [x is not None and x.requires_grad for x in tensor_args]
493
+ ):
494
+ why_not_fast_path = (
495
+ "grad is enabled and at least one of query or the "
496
+ "input/output projection weights or biases requires_grad"
497
+ )
498
+ if not why_not_fast_path:
499
+ return torch._native_multi_head_attention(
500
+ query,
501
+ key,
502
+ value,
503
+ self.embed_dim,
504
+ self.num_heads,
505
+ self.in_proj_weight,
506
+ self.in_proj_bias,
507
+ self.out_proj.weight,
508
+ self.out_proj.bias,
509
+ key_padding_mask
510
+ if key_padding_mask is not None
511
+ else attn_mask,
512
+ need_weights,
513
+ average_attn_weights,
514
+ 1
515
+ if key_padding_mask is not None
516
+ else 0
517
+ if attn_mask is not None
518
+ else None,
519
+ )
520
+
521
+ any_nested = query.is_nested or key.is_nested or value.is_nested
522
+ assert not any_nested, (
523
+ "MultiheadAttention does not support NestedTensor outside of its fast path. "
524
+ + f"The fast path was not hit because {why_not_fast_path}"
525
+ )
526
+
527
+ if self.batch_first and is_batched:
528
+ # make sure that the transpose op does not affect the "is" property
529
+ if key is value:
530
+ if query is key:
531
+ query = key = value = query.transpose(1, 0)
532
+ else:
533
+ query, key = [x.transpose(1, 0) for x in (query, key)]
534
+ value = key
535
+ else:
536
+ query, key, value = [
537
+ x.transpose(1, 0) for x in (query, key, value)
538
+ ]
539
+
540
+ if not self._qkv_same_embed_dim:
541
+ attn_output, attn_output_weights = F.multi_head_attention_forward(
542
+ query,
543
+ key,
544
+ value,
545
+ self.embed_dim,
546
+ self.num_heads,
547
+ self.in_proj_weight,
548
+ self.in_proj_bias,
549
+ self.bias_k,
550
+ self.bias_v,
551
+ self.add_zero_attn,
552
+ self.dropout,
553
+ self.out_proj.weight,
554
+ self.out_proj.bias,
555
+ training=self.training,
556
+ key_padding_mask=key_padding_mask,
557
+ need_weights=need_weights,
558
+ attn_mask=attn_mask,
559
+ use_separate_proj_weight=True,
560
+ q_proj_weight=self.q_proj_weight,
561
+ k_proj_weight=self.k_proj_weight,
562
+ v_proj_weight=self.v_proj_weight,
563
+ average_attn_weights=average_attn_weights,
564
+ )
565
+ else:
566
+ attn_output, attn_output_weights = F.multi_head_attention_forward(
567
+ query,
568
+ key,
569
+ value,
570
+ self.embed_dim,
571
+ self.num_heads,
572
+ self.in_proj_weight,
573
+ self.in_proj_bias,
574
+ self.bias_k,
575
+ self.bias_v,
576
+ self.add_zero_attn,
577
+ self.dropout,
578
+ self.out_proj.weight,
579
+ self.out_proj.bias,
580
+ training=self.training,
581
+ key_padding_mask=key_padding_mask,
582
+ need_weights=need_weights,
583
+ attn_mask=attn_mask,
584
+ average_attn_weights=average_attn_weights,
585
+ )
586
+ if self.batch_first and is_batched:
587
+ return attn_output.transpose(1, 0), attn_output_weights
588
+ else:
589
+ return attn_output, attn_output_weights
590
+
591
+ def infer(self,
592
+ x: Tensor,
593
+ key_padding_mask: Optional[Tensor] = None,
594
+ need_weights: bool = True,
595
+ attn_mask: Optional[Tensor] = None,
596
+ average_attn_weights: bool = True,
597
+ past_kv = None,
598
+ use_cache = False
599
+ ):
600
+ # x = x.transpose(1, 0)
601
+ y, kv = multi_head_attention_forward(
602
+ x=x,
603
+ ipw=self.in_proj_weight,
604
+ ipb=self.in_proj_bias,
605
+ opw=self.out_proj.weight,
606
+ opb=self.out_proj.bias,
607
+ n_head=self.num_heads,
608
+ attn_mask=attn_mask,
609
+ past_kv=past_kv,
610
+ use_cache=use_cache,
611
+ )
612
+ return (y, kv)
modules/embedding.py ADDED
@@ -0,0 +1,97 @@
1
+ # Copyright 2023 (authors: Feiteng Li)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+
21
+ class TokenEmbedding(nn.Module):
22
+ def __init__(
23
+ self,
24
+ dim_model: int,
25
+ vocab_size: int,
26
+ dropout: float = 0.0,
27
+ ):
28
+ super().__init__()
29
+
30
+ self.vocab_size = vocab_size
31
+ self.dim_model = dim_model
32
+
33
+ self.dropout = torch.nn.Dropout(p=dropout)
34
+ self.word_embeddings = nn.Embedding(self.vocab_size, self.dim_model)
35
+
36
+ @property
37
+ def weight(self) -> torch.Tensor:
38
+ return self.word_embeddings.weight
39
+
40
+ def embedding(self, index: int) -> torch.Tensor:
41
+ return self.word_embeddings.weight[index : index + 1]
42
+
43
+ def forward(self, x: torch.Tensor):
44
+ X = self.word_embeddings(x)
45
+ X = self.dropout(X)
46
+
47
+ return X
48
+
49
+
50
+ class SinePositionalEmbedding(nn.Module):
51
+ def __init__(
52
+ self,
53
+ dim_model: int,
54
+ dropout: float = 0.0,
55
+ scale: bool = False,
56
+ alpha: bool = False,
57
+ ):
58
+ super().__init__()
59
+ self.dim_model = dim_model
60
+ self.x_scale = math.sqrt(dim_model) if scale else 1.0
61
+ self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
62
+ self.dropout = torch.nn.Dropout(p=dropout)
63
+
64
+ self.reverse = False
65
+ self.pe = None
66
+ self.extend_pe(torch.tensor(0.0).expand(1, 4000))
67
+
68
+ def extend_pe(self, x):
69
+ """Reset the positional encodings."""
70
+ if self.pe is not None:
71
+ if self.pe.size(1) >= x.size(1):
72
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
73
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
74
+ return
75
+ pe = torch.zeros(x.size(1), self.dim_model)
76
+ if self.reverse:
77
+ position = torch.arange(
78
+ x.size(1) - 1, -1, -1.0, dtype=torch.float32
79
+ ).unsqueeze(1)
80
+ else:
81
+ position = torch.arange(
82
+ 0, x.size(1), dtype=torch.float32
83
+ ).unsqueeze(1)
84
+ div_term = torch.exp(
85
+ torch.arange(0, self.dim_model, 2, dtype=torch.float32)
86
+ * -(math.log(10000.0) / self.dim_model)
87
+ )
88
+ pe[:, 0::2] = torch.sin(position * div_term)
89
+ pe[:, 1::2] = torch.cos(position * div_term)
90
+ pe = pe.unsqueeze(0)
91
+ self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
92
+
93
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
94
+ self.extend_pe(x)
95
+ output = x.unsqueeze(-1) if x.ndim == 2 else x
96
+ output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)]
97
+ return self.dropout(output)
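A quick shape check of the positional embedding above (assuming the class is imported from modules.embedding):

import torch

pos_emb = SinePositionalEmbedding(dim_model=32, dropout=0.0, scale=False, alpha=False)
x = torch.randn(2, 10, 32)                       # (batch, time, dim)
out = pos_emb(x)                                 # sinusoidal table added elementwise
print(out.shape)                                 # torch.Size([2, 10, 32])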
modules/scaling.py ADDED
@@ -0,0 +1,1401 @@
1
+ # Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
2
+ #
3
+ # See ../../../../LICENSE for clarification regarding multiple authors
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ import collections
19
+ import logging
20
+ import random
21
+ import math
22
+ from functools import reduce
23
+ from itertools import repeat
24
+ from typing import Optional, Tuple, Union
25
+
26
+ import torch
27
+ import torch.nn as nn
28
+ import torch.nn.functional as F
29
+ from torch import Tensor
30
+ from torch.nn import Embedding as ScaledEmbedding
31
+
32
+ from utils import Transpose
33
+
34
+
35
+ class ActivationBalancerFunction(torch.autograd.Function):
36
+ @staticmethod
37
+ def forward(
38
+ ctx,
39
+ x: Tensor,
40
+ scale_factor: Tensor,
41
+ sign_factor: Optional[Tensor],
42
+ channel_dim: int,
43
+ ) -> Tensor:
44
+ if channel_dim < 0:
45
+ channel_dim += x.ndim
46
+ ctx.channel_dim = channel_dim
47
+ xgt0 = x > 0
48
+ if sign_factor is None:
49
+ ctx.save_for_backward(xgt0, scale_factor)
50
+ else:
51
+ ctx.save_for_backward(xgt0, scale_factor, sign_factor)
52
+ return x
53
+
54
+ @staticmethod
55
+ def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]:
56
+ if len(ctx.saved_tensors) == 3:
57
+ xgt0, scale_factor, sign_factor = ctx.saved_tensors
58
+ for _ in range(ctx.channel_dim, x_grad.ndim - 1):
59
+ scale_factor = scale_factor.unsqueeze(-1)
60
+ sign_factor = sign_factor.unsqueeze(-1)
61
+ factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5)
62
+ else:
63
+ xgt0, scale_factor = ctx.saved_tensors
64
+ for _ in range(ctx.channel_dim, x_grad.ndim - 1):
65
+ scale_factor = scale_factor.unsqueeze(-1)
66
+ factor = scale_factor * (xgt0.to(x_grad.dtype) - 0.5)
67
+ neg_delta_grad = x_grad.abs() * factor
68
+ return (
69
+ x_grad - neg_delta_grad,
70
+ None,
71
+ None,
72
+ None,
73
+ )
74
+
75
+
76
+ def _compute_scale_factor(
77
+ x: Tensor,
78
+ channel_dim: int,
79
+ min_abs: float,
80
+ max_abs: float,
81
+ gain_factor: float,
82
+ max_factor: float,
83
+ ) -> Tensor:
84
+ if channel_dim < 0:
85
+ channel_dim += x.ndim
86
+ sum_dims = [d for d in range(x.ndim) if d != channel_dim]
87
+ x_abs_mean = torch.mean(x.abs(), dim=sum_dims).to(torch.float32)
88
+
89
+ if min_abs == 0.0:
90
+ below_threshold = 0.0
91
+ else:
92
+ # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if
93
+ # x_abs)_mean , min_abs.
94
+ below_threshold = (
95
+ (min_abs - x_abs_mean) * (gain_factor / min_abs)
96
+ ).clamp(min=0, max=max_factor)
97
+
98
+ above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(
99
+ min=0, max=max_factor
100
+ )
101
+
102
+ return below_threshold - above_threshold
103
+
104
+
105
+ def _compute_sign_factor(
106
+ x: Tensor,
107
+ channel_dim: int,
108
+ min_positive: float,
109
+ max_positive: float,
110
+ gain_factor: float,
111
+ max_factor: float,
112
+ ) -> Tensor:
113
+ if channel_dim < 0:
114
+ channel_dim += x.ndim
115
+ sum_dims = [d for d in range(x.ndim) if d != channel_dim]
116
+ proportion_positive = torch.mean((x > 0).to(torch.float32), dim=sum_dims)
117
+ if min_positive == 0.0:
118
+ factor1 = 0.0
119
+ else:
120
+ # 0 if proportion_positive >= min_positive, else can be
121
+ # as large as max_factor.
122
+ factor1 = (
123
+ (min_positive - proportion_positive) * (gain_factor / min_positive)
124
+ ).clamp_(min=0, max=max_factor)
125
+
126
+ if max_positive == 1.0:
127
+ factor2 = 0.0
128
+ else:
129
+ # 0 if self.proportion_positive <= max_positive, else can be
130
+ # as large as -max_factor.
131
+ factor2 = (
132
+ (proportion_positive - max_positive)
133
+ * (gain_factor / (1.0 - max_positive))
134
+ ).clamp_(min=0, max=max_factor)
135
+ sign_factor = factor1 - factor2
136
+ # require min_positive != 0 or max_positive != 1:
137
+ assert not isinstance(sign_factor, float)
138
+ return sign_factor
139
+
140
+
141
+ class ActivationScaleBalancerFunction(torch.autograd.Function):
142
+ """
143
+ This object is used in class ActivationBalancer when the user specified
144
+ min_positive=0, max_positive=1, so there are no constraints on the signs
145
+ of the activations and only the absolute value has a constraint.
146
+ """
147
+
148
+ @staticmethod
149
+ def forward(
150
+ ctx,
151
+ x: Tensor,
152
+ sign_factor: Tensor,
153
+ scale_factor: Tensor,
154
+ channel_dim: int,
155
+ ) -> Tensor:
156
+ if channel_dim < 0:
157
+ channel_dim += x.ndim
158
+ ctx.channel_dim = channel_dim
159
+ xgt0 = x > 0
160
+ ctx.save_for_backward(xgt0, sign_factor, scale_factor)
161
+ return x
162
+
163
+ @staticmethod
164
+ def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]:
165
+ xgt0, sign_factor, scale_factor = ctx.saved_tensors
166
+ for _ in range(ctx.channel_dim, x_grad.ndim - 1):
167
+ sign_factor = sign_factor.unsqueeze(-1)
168
+ scale_factor = scale_factor.unsqueeze(-1)
169
+
170
+ factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5)
171
+ neg_delta_grad = x_grad.abs() * factor
172
+ return (
173
+ x_grad - neg_delta_grad,
174
+ None,
175
+ None,
176
+ None,
177
+ )
178
+
179
+
180
+ class RandomClampFunction(torch.autograd.Function):
181
+ @staticmethod
182
+ def forward(
183
+ ctx,
184
+ x: Tensor,
185
+ min: Optional[float],
186
+ max: Optional[float],
187
+ prob: float,
188
+ reflect: float,
189
+ ) -> Tensor:
190
+ x_clamped = torch.clamp(x, min=min, max=max)
191
+ mask = torch.rand_like(x) < prob
192
+ ans = torch.where(mask, x_clamped, x)
193
+ if x.requires_grad:
194
+ ctx.save_for_backward(ans == x)
195
+ ctx.reflect = reflect
196
+ if reflect != 0.0:
197
+ ans = ans * (1.0 + reflect) - (x * reflect)
198
+ return ans
199
+
200
+ @staticmethod
201
+ def backward(
202
+ ctx, ans_grad: Tensor
203
+ ) -> Tuple[Tensor, None, None, None, None]:
204
+ (is_same,) = ctx.saved_tensors
205
+ x_grad = ans_grad * is_same.to(ans_grad.dtype)
206
+ reflect = ctx.reflect
207
+ if reflect != 0.0:
208
+ x_grad = x_grad * (1.0 + reflect) - (ans_grad * reflect)
209
+ return x_grad, None, None, None, None
210
+
211
+
212
+ def random_clamp(
213
+ x: Tensor,
214
+ min: Optional[float] = None,
215
+ max: Optional[float] = None,
216
+ prob: float = 0.5,
217
+ reflect: float = 0.0,
218
+ ):
219
+ return RandomClampFunction.apply(x, min, max, prob, reflect)
220
+
221
+
222
+ def random_cast_to_half(x: Tensor, min_abs: float = 5.0e-06) -> Tensor:
223
+ """
224
+ A randomized way of casting a floating point value to half precision.
225
+ """
226
+ if x.dtype == torch.float16:
227
+ return x
228
+ x_abs = x.abs()
229
+ is_too_small = x_abs < min_abs
230
+ # for elements where is_too_small is true, random_val will contain +-min_abs with
231
+ # probability (x.abs() / min_abs), and 0.0 otherwise. [so this preserves expectations,
232
+ # for those elements].
233
+ random_val = min_abs * x.sign() * (torch.rand_like(x) * min_abs < x_abs)
234
+ return torch.where(is_too_small, random_val, x).to(torch.float16)
235
+
236
+
237
+ class RandomGradFunction(torch.autograd.Function):
238
+ """
239
+ Does nothing in forward pass; in backward pass, gets rid of very small grads using
240
+ randomized approach that preserves expectations (intended to reduce roundoff).
241
+ """
242
+
243
+ @staticmethod
244
+ def forward(ctx, x: Tensor, min_abs: float) -> Tensor:
245
+ ctx.min_abs = min_abs
246
+ return x
247
+
248
+ @staticmethod
249
+ def backward(ctx, ans_grad: Tensor) -> Tuple[Tensor, None]:
250
+ if ans_grad.dtype == torch.float16:
251
+ return (
252
+ random_cast_to_half(
253
+ ans_grad.to(torch.float32), min_abs=ctx.min_abs
254
+ ),
255
+ None,
256
+ )
257
+ else:
258
+ return ans_grad, None
259
+
260
+
261
+ class RandomGrad(torch.nn.Module):
262
+ """
263
+ Gets rid of very small gradients using an expectation-preserving method, intended to increase
264
+ accuracy of training when using amp (automatic mixed precision)
265
+ """
266
+
267
+ def __init__(self, min_abs: float = 5.0e-06):
268
+ super(RandomGrad, self).__init__()
269
+ self.min_abs = min_abs
270
+
271
+ def forward(self, x: Tensor):
272
+ if (
273
+ torch.jit.is_scripting()
274
+ or not self.training
275
+ or torch.jit.is_tracing()
276
+ ):
277
+ return x
278
+ else:
279
+ return RandomGradFunction.apply(x, self.min_abs)
280
+
281
+
282
+ class SoftmaxFunction(torch.autograd.Function):
283
+ """
284
+ Tries to handle half-precision derivatives in a randomized way that should
285
+ be more accurate for training than the default behavior.
286
+ """
287
+
288
+ @staticmethod
289
+ def forward(ctx, x: Tensor, dim: int):
290
+ ans = x.softmax(dim=dim)
291
+ # if x dtype is float16, x.softmax() returns a float32 because
292
+ # (presumably) that op does not support float16, and autocast
293
+ # is enabled.
294
+ if torch.is_autocast_enabled():
295
+ ans = ans.to(torch.float16)
296
+ ctx.save_for_backward(ans)
297
+ ctx.x_dtype = x.dtype
298
+ ctx.dim = dim
299
+ return ans
300
+
301
+ @staticmethod
302
+ def backward(ctx, ans_grad: Tensor):
303
+ (ans,) = ctx.saved_tensors
304
+ with torch.cuda.amp.autocast(enabled=False):
305
+ ans_grad = ans_grad.to(torch.float32)
306
+ ans = ans.to(torch.float32)
307
+ x_grad = ans_grad * ans
308
+ x_grad = x_grad - ans * x_grad.sum(dim=ctx.dim, keepdim=True)
309
+ return x_grad, None
310
+
311
+
312
+ def softmax(x: Tensor, dim: int):
313
+ if torch.jit.is_scripting() or torch.jit.is_tracing():
314
+ return x.softmax(dim)
315
+
316
+ return SoftmaxFunction.apply(x, dim)
317
+
318
+
319
+ class MaxEigLimiterFunction(torch.autograd.Function):
320
+ @staticmethod
321
+ def forward(
322
+ ctx,
323
+ x: Tensor,
324
+ coeffs: Tensor,
325
+ direction: Tensor,
326
+ channel_dim: int,
327
+ grad_scale: float,
328
+ ) -> Tensor:
329
+ ctx.channel_dim = channel_dim
330
+ ctx.grad_scale = grad_scale
331
+ ctx.save_for_backward(x.detach(), coeffs.detach(), direction.detach())
332
+ return x
333
+
334
+ @staticmethod
335
+ def backward(ctx, x_grad, *args):
336
+ with torch.enable_grad():
337
+ (x_orig, coeffs, new_direction) = ctx.saved_tensors
338
+ x_orig.requires_grad = True
339
+ num_channels = x_orig.shape[ctx.channel_dim]
340
+ x = x_orig.transpose(ctx.channel_dim, -1).reshape(-1, num_channels)
341
+ new_direction.requires_grad = False
342
+ x = x - x.mean(dim=0)
343
+ x_var = (x ** 2).mean()
344
+ x_residual = x - coeffs * new_direction
345
+ x_residual_var = (x_residual ** 2).mean()
346
+ # `variance_proportion` is the proportion of the variance accounted for
347
+ # by the top eigen-direction. This is to be minimized.
348
+ variance_proportion = (x_var - x_residual_var) / (x_var + 1.0e-20)
349
+ variance_proportion.backward()
350
+ x_orig_grad = x_orig.grad
351
+ x_extra_grad = (
352
+ x_orig.grad
353
+ * ctx.grad_scale
354
+ * x_grad.norm()
355
+ / (x_orig_grad.norm() + 1.0e-20)
356
+ )
357
+ return x_grad + x_extra_grad.detach(), None, None, None, None
358
+
359
+
360
+ class BasicNorm(torch.nn.Module):
361
+ """
362
+ This is intended to be a simpler, and hopefully cheaper, replacement for
363
+ LayerNorm. The observation this is based on, is that Transformer-type
364
+ networks, especially with pre-norm, sometimes seem to set one of the
365
+ feature dimensions to a large constant value (e.g. 50), which "defeats"
366
+ the LayerNorm because the output magnitude is then not strongly dependent
367
+ on the other (useful) features. Presumably the weight and bias of the
368
+ LayerNorm are required to allow it to do this.
369
+
370
+ So the idea is to introduce this large constant value as an explicit
371
+ parameter, that takes the role of the "eps" in LayerNorm, so the network
372
+ doesn't have to do this trick. We make the "eps" learnable.
373
+
374
+ Args:
375
+ num_channels: the number of channels, e.g. 512.
376
+ channel_dim: the axis/dimension corresponding to the channel,
377
+ interpreted as an offset from the input's ndim if negative.
378
+ This is NOT the num_channels; it should typically be one of
379
+ {-2, -1, 0, 1, 2, 3}.
380
+ eps: the initial "epsilon" that we add as ballast in:
381
+ scale = ((input_vec**2).mean() + epsilon)**-0.5
382
+ Note: our epsilon is actually large, but we keep the name
383
+ to indicate the connection with conventional LayerNorm.
384
+ learn_eps: if true, we learn epsilon; if false, we keep it
385
+ at the initial value.
386
+ eps_min: float
387
+ eps_max: float
388
+ """
389
+
390
+ def __init__(
391
+ self,
392
+ num_channels: int,
393
+ channel_dim: int = -1, # CAUTION: see documentation.
394
+ eps: float = 0.25,
395
+ learn_eps: bool = True,
396
+ eps_min: float = -3.0,
397
+ eps_max: float = 3.0,
398
+ ) -> None:
399
+ super(BasicNorm, self).__init__()
400
+ self.num_channels = num_channels
401
+ self.channel_dim = channel_dim
402
+ if learn_eps:
403
+ self.eps = nn.Parameter(torch.tensor(eps).log().detach())
404
+ else:
405
+ self.register_buffer("eps", torch.tensor(eps).log().detach())
406
+ self.eps_min = eps_min
407
+ self.eps_max = eps_max
408
+
409
+ def forward(self, x: Tensor) -> Tensor:
410
+ assert x.shape[self.channel_dim] == self.num_channels
411
+ eps = self.eps
412
+ if self.training and random.random() < 0.25:
413
+ # with probability 0.25, in training mode, clamp eps between the min
414
+ # and max; this will encourage it to learn parameters within the
415
+ # allowed range by making parameters that are outside the allowed
416
+ # range noisy. Using the clamped value in the forward pass also gives
417
+ # gradients to allow the parameter to get back into the allowed
418
+ # region if it happens to exit it.
419
+
420
+ eps = eps.clamp(min=self.eps_min, max=self.eps_max)
421
+ scales = (
422
+ torch.mean(x ** 2, dim=self.channel_dim, keepdim=True) + eps.exp()
423
+ ) ** -0.5
424
+ return x * scales
425
+
426
+
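A minimal usage sketch of BasicNorm (illustrative only; the shapes and sizes below are arbitrary):

    norm = BasicNorm(num_channels=512, channel_dim=-1)
    x = torch.randn(8, 100, 512)
    y = norm(x)  # same shape as x; each frame is scaled by (mean(x**2) + exp(eps)) ** -0.5
    assert y.shape == x.shape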
427
+ def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear:
428
+ """
429
+ Behaves like a constructor of a modified version of nn.Linear
430
+ that gives an easy way to set the default initial parameter scale.
431
+
432
+ Args:
433
+ Accepts the standard args and kwargs that nn.Linear accepts
434
+ e.g. in_features, out_features, bias=False.
435
+
436
+ initial_scale: you can override this if you want to increase
437
+ or decrease the initial magnitude of the module's output
438
+ (affects the initialization of weight_scale and bias_scale).
439
+ Another option, if you want to do something like this, is
440
+ to re-initialize the parameters.
441
+ """
442
+ ans = nn.Linear(*args, **kwargs)
443
+ with torch.no_grad():
444
+ ans.weight[:] *= initial_scale
445
+ if ans.bias is not None:
446
+ torch.nn.init.uniform_(
447
+ ans.bias, -0.1 * initial_scale, 0.1 * initial_scale
448
+ )
449
+ return ans
450
+
451
+
452
+ def ScaledConv1d(
453
+ *args,
454
+ initial_scale: float = 1.0,
455
+ kernel_size: int = 3,
456
+ padding: str = "same",
457
+ **kwargs,
458
+ ) -> nn.Conv1d:
459
+ """
460
+ Behaves like a constructor of a modified version of nn.Conv1d
461
+ that gives an easy way to set the default initial parameter scale.
462
+
463
+ Args:
464
+ Accepts the standard args and kwargs that nn.Conv1d accepts,
465
+ e.g. in_channels, out_channels, bias=False.
466
+
467
+ initial_scale: you can override this if you want to increase
468
+ or decrease the initial magnitude of the module's output
469
+ (affects the initialization of weight_scale and bias_scale).
470
+ Another option, if you want to do something like this, is
471
+ to re-initialize the parameters.
472
+ """
473
+ ans = nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs)
474
+ with torch.no_grad():
475
+ ans.weight[:] *= initial_scale
476
+ if ans.bias is not None:
477
+ torch.nn.init.uniform_(
478
+ ans.bias, -0.1 * initial_scale, 0.1 * initial_scale
479
+ )
480
+ return ans
481
+
482
+
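For illustration (arbitrary sizes): these factory functions return ordinary nn.Linear / nn.Conv1d modules whose initial output magnitude is reduced by initial_scale:

    proj = ScaledLinear(512, 1024, bias=True, initial_scale=0.25)
    conv = ScaledConv1d(512, 512, kernel_size=3, padding="same", initial_scale=0.25)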
483
+ def TransposeScaledConv1d(
484
+ *args,
485
+ initial_scale: float = 1.0,
486
+ kernel_size: int = 3,
487
+ padding: str = "same",
488
+ **kwargs,
489
+ ) -> nn.Sequential:
490
+ """
491
+ Transpose -> ScaledConv1d
492
+ """
493
+ return nn.Sequential(
494
+ Transpose(),
495
+ ScaledConv1d(
496
+ *args,
497
+ initial_scale=initial_scale,
498
+ kernel_size=kernel_size,
499
+ padding=padding,
500
+ **kwargs,
501
+ ),
502
+ )
503
+
504
+
505
+ def ScaledConv1dTranspose(
506
+ *args,
507
+ initial_scale: float = 1.0,
508
+ kernel_size: int = 3,
509
+ padding: str = "same",
510
+ **kwargs,
511
+ ) -> nn.Sequential:
512
+ """
513
+ ScaledConv1d -> Transpose
514
+ """
515
+ return nn.Sequential(
516
+ ScaledConv1d(
517
+ *args,
518
+ initial_scale=initial_scale,
519
+ kernel_size=kernel_size,
520
+ padding=padding,
521
+ **kwargs,
522
+ ),
523
+ Transpose(),
524
+ )
525
+
526
+
527
+ def TransposeConv1d(
528
+ *args, kernel_size: int = 3, padding: str = "same", **kwargs
529
+ ) -> nn.Sequential:
530
+ """
531
+ Transpose -> Conv1d
532
+ """
533
+ return nn.Sequential(
534
+ Transpose(),
535
+ nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs),
536
+ )
537
+
538
+
539
+ def Conv1dTranspose(
540
+ *args, kernel_size: int = 3, padding: str = "same", **kwargs
541
+ ) -> nn.Sequential:
542
+ """
543
+ Conv1d -> Transpose
544
+ """
545
+ return nn.Sequential(
546
+ nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs),
547
+ Transpose(),
548
+ )
549
+
550
+
551
+ class SRLinear(nn.Linear):
552
+ """https://arxiv.org/abs/2303.06296
553
+ Stabilizing Transformer Training by Preventing Attention Entropy Collapse
554
+ """
555
+
556
+ def __init__(self, in_features, out_features, bias=True, **kwargs):
557
+ super().__init__(in_features, out_features, bias=bias, **kwargs)
558
+ self.register_buffer(
559
+ "u", nn.functional.normalize(torch.randn(in_features), dim=0)
560
+ )
561
+ with torch.no_grad():
562
+ sigma = self.get_sigma()
563
+ self.register_buffer("spectral_norm", sigma)
564
+ self.sigma = nn.Parameter(torch.ones(1))
565
+
566
+ def get_sigma(self):
567
+ with torch.no_grad():
568
+ u = self.u
569
+ v = self.weight.mv(u)
570
+ v = nn.functional.normalize(v, dim=0)
571
+ u = self.weight.T.mv(v)
572
+ u = nn.functional.normalize(u, dim=0)
573
+ self.u.data.copy_(u)
574
+ return torch.einsum("c,cd,d->", v, self.weight, u)
575
+
576
+ def get_weight(self):
577
+ sigma = self.get_sigma()
578
+ if self.training:
579
+ self.spectral_norm.data.copy_(sigma)
580
+ weight = (self.sigma / sigma) * self.weight
581
+ return weight
582
+
583
+ def forward(self, x):
584
+ return nn.functional.linear(x, self.get_weight(), self.bias)
585
+
586
+
587
+ class SRConv1d(SRLinear):
588
+ def __init__(
589
+ self,
590
+ in_features,
591
+ out_features,
592
+ kernel_size,
593
+ stride: int = 1,
594
+ padding: str = "same",
595
+ bias: bool = True,
596
+ **kwargs,
597
+ ):
598
+ in_features = in_features * kernel_size
599
+ super().__init__(in_features, out_features, bias=bias, **kwargs)
600
+ nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
601
+ self.kernel_size = kernel_size
602
+ self.stride = stride
603
+ self.padding = padding
604
+
605
+ def forward(self, x):
606
+ in_features = self.in_features // self.kernel_size
607
+ weight = self.get_weight().view(
608
+ self.out_features, in_features, self.kernel_size
609
+ )
610
+ return nn.functional.conv1d(
611
+ x, weight, bias=self.bias, stride=self.stride, padding=self.padding
612
+ )
613
+
614
+
615
+ def TransposeSRConv1d(
616
+ *args, kernel_size: int = 3, padding: str = "same", **kwargs
617
+ ) -> nn.Sequential:
618
+ """
619
+ Transpose -> SRConv1d
620
+ """
621
+ return nn.Sequential(
622
+ Transpose(),
623
+ SRConv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs),
624
+ )
625
+
626
+
627
+ def SRConv1dTranspose(
628
+ *args, kernel_size: int = 3, padding: str = "same", **kwargs
629
+ ) -> nn.Sequential:
630
+ """
631
+ SRConv1d -> Transpose
632
+ """
633
+ return nn.Sequential(
634
+ SRConv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs),
635
+ Transpose(),
636
+ )
637
+
638
+
639
+ class ActivationBalancer(torch.nn.Module):
640
+ """
641
+ Modifies the backpropped derivatives of a function to try to encourage, for
642
+ each channel, that it is positive at least a proportion `threshold` of the
643
+ time. It does this by multiplying negative derivative values by up to
644
+ (1+max_factor), and positive derivative values by up to (1-max_factor),
645
+ interpolated from 1 at the threshold to those extremal values when none
646
+ of the inputs are positive.
647
+
648
+ Args:
649
+ num_channels: the number of channels
650
+ channel_dim: the dimension/axis corresponding to the channel, e.g.
651
+ -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
652
+ min_positive: the minimum, per channel, of the proportion of the time
653
+ that (x > 0), below which we start to modify the derivatives.
654
+ max_positive: the maximum, per channel, of the proportion of the time
655
+ that (x > 0), above which we start to modify the derivatives.
656
+ max_factor: the maximum factor by which we modify the derivatives for
657
+ either the sign constraint or the magnitude constraint;
658
+ e.g. with max_factor=0.02, the derivatives would be multiplied by
659
+ values in the range [0.98..1.02].
660
+ sign_gain_factor: determines the 'gain' with which we increase the
661
+ change in gradient once the constraints on min_positive and max_positive
662
+ are violated.
663
+ scale_gain_factor: determines the 'gain' with which we increase the
664
+ change in gradient once the constraints on min_abs and max_abs
665
+ are violated.
666
+ min_abs: the minimum average-absolute-value difference from the mean
667
+ value per channel, which we allow, before we start to modify
668
+ the derivatives to prevent this.
669
+ max_abs: the maximum average-absolute-value difference from the mean
670
+ value per channel, which we allow, before we start to modify
671
+ the derivatives to prevent this.
672
+ min_prob: determines the minimum probability with which we modify the
673
+ gradients for the {min,max}_positive and {min,max}_abs constraints,
674
+ on each forward(). This is done randomly to prevent all layers
675
+ from doing it at the same time. Early in training we may use
676
+ higher probabilities than this; it will decay to this value.
677
+ """
678
+
679
+ def __init__(
680
+ self,
681
+ num_channels: int,
682
+ channel_dim: int,
683
+ min_positive: float = 0.05,
684
+ max_positive: float = 0.95,
685
+ max_factor: float = 0.04,
686
+ sign_gain_factor: float = 0.01,
687
+ scale_gain_factor: float = 0.02,
688
+ min_abs: float = 0.2,
689
+ max_abs: float = 100.0,
690
+ min_prob: float = 0.1,
691
+ ):
692
+ super(ActivationBalancer, self).__init__()
693
+ self.num_channels = num_channels
694
+ self.channel_dim = channel_dim
695
+ self.min_positive = min_positive
696
+ self.max_positive = max_positive
697
+ self.max_factor = max_factor
698
+ self.min_abs = min_abs
699
+ self.max_abs = max_abs
700
+ self.min_prob = min_prob
701
+ self.sign_gain_factor = sign_gain_factor
702
+ self.scale_gain_factor = scale_gain_factor
703
+
704
+ # count measures how many times the forward() function has been called.
705
+ # We occasionally sync this to a buffer called `count`, which exists so
706
+ # that the value is saved and restored along with the model.
707
+ self.cpu_count = 0
708
+ self.register_buffer("count", torch.tensor(0, dtype=torch.int64))
709
+
710
+ def forward(self, x: Tensor) -> Tensor:
711
+ if (
712
+ torch.jit.is_scripting()
713
+ or not x.requires_grad
714
+ or torch.jit.is_tracing()
715
+ ):
716
+ return _no_op(x)
717
+
718
+ count = self.cpu_count
719
+ self.cpu_count += 1
720
+
721
+ if random.random() < 0.01:
722
+ # Occasionally sync self.cpu_count with self.count.
723
+ # count affects the decay of 'prob'. don't do this on every iter,
724
+ # because syncing with the GPU is slow.
725
+ self.cpu_count = max(self.cpu_count, self.count.item())
726
+ self.count.fill_(self.cpu_count)
727
+
728
+ # the prob of doing some work exponentially decreases from 0.5 till it hits
729
+ # a floor at min_prob (==0.1, by default)
730
+ prob = max(self.min_prob, 0.5 ** (1 + (count / 4000.0)))
731
+
732
+ if random.random() < prob:
733
+ sign_gain_factor = 0.5
734
+ if self.min_positive != 0.0 or self.max_positive != 1.0:
735
+ sign_factor = _compute_sign_factor(
736
+ x,
737
+ self.channel_dim,
738
+ self.min_positive,
739
+ self.max_positive,
740
+ gain_factor=self.sign_gain_factor / prob,
741
+ max_factor=self.max_factor,
742
+ )
743
+ else:
744
+ sign_factor = None
745
+
746
+ scale_factor = _compute_scale_factor(
747
+ x.detach(),
748
+ self.channel_dim,
749
+ min_abs=self.min_abs,
750
+ max_abs=self.max_abs,
751
+ gain_factor=self.scale_gain_factor / prob,
752
+ max_factor=self.max_factor,
753
+ )
754
+ return ActivationBalancerFunction.apply(
755
+ x,
756
+ scale_factor,
757
+ sign_factor,
758
+ self.channel_dim,
759
+ )
760
+ else:
761
+ return _no_op(x)
762
+
763
+
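A hedged usage sketch of ActivationBalancer (arbitrary shapes): the forward pass is an identity, and during training, with some probability, the backward pass nudges per-channel gradients towards the sign and magnitude constraints (the helpers _compute_sign_factor / _compute_scale_factor are defined earlier in this file):

    balancer = ActivationBalancer(num_channels=256, channel_dim=-1,
                                  min_positive=0.05, max_positive=0.95)
    x = torch.randn(8, 50, 256, requires_grad=True)
    y = balancer(x)      # y equals x in the forward pass
    y.sum().backward()   # x.grad may be gently modified, per channel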
764
+ def penalize_abs_values_gt(x: Tensor, limit: float, penalty: float) -> Tensor:
765
+ """
766
+ Returns x unmodified, but in backprop will put a penalty for the excess of
767
+ the absolute values of elements of x over the limit "limit". E.g. if
768
+ limit == 10.0, then if x has any values over 10 it will get a penalty.
769
+
770
+ Caution: the value of this penalty will be affected by grad scaling used
771
+ in automatic mixed precision training. In the places we use this,
772
+ that shouldn't really matter, and may even be helpful; we just use it
773
+ to disallow really implausible score values from being given to softmax.
774
+ """
775
+ x_sign = x.sign()
776
+ over_limit = (x.abs() - limit) > 0
777
+ # The following is a memory efficient way to penalize the absolute values of
778
+ # x that's over the limit. (The memory efficiency comes when you think
779
+ # about which items torch needs to cache for the autograd, and which ones it
780
+ # can throw away). The numerical value of aux_loss as computed here will
781
+ # actually be larger than it should be, by limit * over_limit.sum(), but it
782
+ # has the same derivative as the real aux_loss which is penalty * (x.abs() -
783
+ # limit).relu().
784
+ aux_loss = penalty * ((x_sign * over_limit).to(torch.int8) * x)
785
+ # note: we don't do sum() here on aux_loss, but it's as if we had done
786
+ # sum() due to how with_loss() works.
787
+ x = with_loss(x, aux_loss)
788
+ # you must use x for something, or this will be ineffective.
789
+ return x
790
+
791
+
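A small illustrative example (arbitrary numbers): the forward value is unchanged, but elements whose magnitude exceeds `limit` pick up an extra gradient of `penalty` times their sign:

    scores = 20.0 * torch.randn(4, 50)
    scores.requires_grad_(True)
    limited = penalize_abs_values_gt(scores, limit=10.0, penalty=1.0e-04)
    limited.sum().backward()
    # scores.grad is 1.0 everywhere, plus +/- 1e-4 where |scores| > 10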
792
+ def _diag(x: Tensor): # like .diag(), but works for tensors with 3 dims.
793
+ if x.ndim == 2:
794
+ return x.diag()
795
+ else:
796
+ (batch, dim, dim) = x.shape
797
+ x = x.reshape(batch, dim * dim)
798
+ x = x[:, :: dim + 1]
799
+ assert x.shape == (batch, dim)
800
+ return x
801
+
802
+
803
+ def _whitening_metric(x: Tensor, num_groups: int):
804
+ """
805
+ Computes the "whitening metric", a value which will be 1.0 if all the eigenvalues of
806
+ the centered feature covariance are the same within each group's covariance matrix
807
+ and also between groups.
808
+ Args:
809
+ x: a Tensor of shape (*, num_channels)
810
+ num_groups: the number of groups of channels, a number >=1 that divides num_channels
811
+ Returns:
812
+ Returns a scalar Tensor that will be 1.0 if the data is "perfectly white" and
813
+ greater than 1.0 otherwise.
814
+ """
815
+ assert x.dtype != torch.float16
816
+ x = x.reshape(-1, x.shape[-1])
817
+ (num_frames, num_channels) = x.shape
818
+ assert num_channels % num_groups == 0
819
+ channels_per_group = num_channels // num_groups
820
+ x = x.reshape(num_frames, num_groups, channels_per_group).transpose(0, 1)
821
+ # x now has shape (num_groups, num_frames, channels_per_group)
822
+ # subtract the mean so we use the centered, not uncentered, covariance.
823
+ # My experience has been that when we "mess with the gradients" like this,
824
+ # it's better not do anything that tries to move the mean around, because
825
+ # that can easily cause instability.
826
+ x = x - x.mean(dim=1, keepdim=True)
827
+ # x_covar: (num_groups, channels_per_group, channels_per_group)
828
+ x_covar = torch.matmul(x.transpose(1, 2), x)
829
+ x_covar_mean_diag = _diag(x_covar).mean()
830
+ # the following expression is what we'd get if we took the matrix product
831
+ # of each covariance and measured the mean of its trace, i.e.
832
+ # the same as _diag(torch.matmul(x_covar, x_covar)).mean().
833
+ x_covarsq_mean_diag = (x_covar ** 2).sum() / (
834
+ num_groups * channels_per_group
835
+ )
836
+ # this metric will be >= 1.0; the larger it is, the less 'white' the data was.
837
+ metric = x_covarsq_mean_diag / (x_covar_mean_diag ** 2 + 1.0e-20)
838
+ return metric
839
+
840
+
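An illustrative check (arbitrary shapes): roughly white data gives a metric close to 1.0, while data with very unequal per-channel variances gives a clearly larger value:

    x_white = torch.randn(2000, 64)
    x_skewed = x_white * torch.linspace(0.1, 3.0, 64)
    print(_whitening_metric(x_white, num_groups=1))    # close to 1.0
    print(_whitening_metric(x_skewed, num_groups=1))   # clearly greater than 1.0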
841
+ class WhiteningPenaltyFunction(torch.autograd.Function):
842
+ @staticmethod
843
+ def forward(
844
+ ctx,
845
+ x: Tensor,
846
+ num_groups: int,
847
+ whitening_limit: float,
848
+ grad_scale: float,
849
+ ) -> Tensor:
850
+ ctx.save_for_backward(x)
851
+ ctx.num_groups = num_groups
852
+ ctx.whitening_limit = whitening_limit
853
+ ctx.grad_scale = grad_scale
854
+ return x
855
+
856
+ @staticmethod
857
+ def backward(ctx, x_grad: Tensor):
858
+ (x_orig,) = ctx.saved_tensors
859
+ with torch.enable_grad():
860
+ with torch.cuda.amp.autocast(enabled=False):
861
+ x_detached = x_orig.to(torch.float32).detach()
862
+ x_detached.requires_grad = True
863
+
864
+ metric = _whitening_metric(x_detached, ctx.num_groups)
865
+
866
+ if random.random() < 0.005 or __name__ == "__main__":
867
+ logging.info(
868
+ f"Whitening: num_groups={ctx.num_groups}, num_channels={x_orig.shape[-1]}, "
869
+ f"metric={metric.item():.2f} vs. limit={ctx.whitening_limit}"
870
+ )
871
+
872
+ (metric - ctx.whitening_limit).relu().backward()
873
+ penalty_grad = x_detached.grad
874
+ scale = ctx.grad_scale * (
875
+ x_grad.to(torch.float32).norm()
876
+ / (penalty_grad.norm() + 1.0e-20)
877
+ )
878
+ penalty_grad = penalty_grad * scale
879
+ return x_grad + penalty_grad.to(x_grad.dtype), None, None, None
880
+
881
+
882
+ class Whiten(nn.Module):
883
+ def __init__(
884
+ self,
885
+ num_groups: int,
886
+ whitening_limit: float,
887
+ prob: Union[float, Tuple[float, float]],
888
+ grad_scale: float,
889
+ ):
890
+ """
891
+ Args:
892
+ num_groups: the number of groups to divide the channel dim into before
893
+ whitening. We will attempt to make the feature covariance
894
+ within each group, after mean subtraction, as "white" as possible,
895
+ while having the same trace across all groups.
896
+ whitening_limit: a value greater than 1.0, that dictates how much
897
+ freedom we have to violate the constraints. 1.0 would mean perfectly
898
+ white, with exactly the same trace across groups; larger values
899
+ give more freedom. E.g. 2.0.
900
+ prob: the probability with which we apply the gradient modification
901
+ (also affects the grad scale). May be supplied as a float,
902
+ or as a pair (min_prob, max_prob)
903
+
904
+ grad_scale: determines the scale on the gradient term from this object,
905
+ relative to the rest of the gradient on the attention weights.
906
+ E.g. 0.02 (you may want to use smaller values than this if prob is large)
907
+ """
908
+ super(Whiten, self).__init__()
909
+ assert num_groups >= 1
910
+ assert whitening_limit >= 1
911
+ assert grad_scale >= 0
912
+ self.num_groups = num_groups
913
+ self.whitening_limit = whitening_limit
914
+ if isinstance(prob, float):
915
+ assert 0 < prob <= 1
916
+ self.prob = prob
917
+ else:
918
+ (self.min_prob, self.max_prob) = prob
919
+ assert 0 < self.min_prob < self.max_prob <= 1
920
+ self.prob = self.max_prob
921
+
922
+ self.grad_scale = grad_scale
923
+
924
+ def forward(self, x: Tensor) -> Tensor:
925
+ """
926
+ In the forward pass, this function just returns the input unmodified.
927
+ In the backward pass, it will modify the gradients to ensure that the
928
+ distribution in each group has close to (lambda times I) as the covariance
929
+ after mean subtraction, with the same lambda across groups.
930
+ For whitening_limit > 1, there will be more freedom to violate this
931
+ constraint.
932
+
933
+ Args:
934
+ x: the input of shape (*, num_channels)
935
+
936
+ Returns:
937
+ x, unmodified. You should make sure
938
+ you use the returned value, or the graph will be freed
939
+ and nothing will happen in backprop.
940
+ """
941
+ if (
942
+ not x.requires_grad
943
+ or random.random() > self.prob
944
+ or self.grad_scale == 0
945
+ ):
946
+ return _no_op(x)
947
+ else:
948
+ if hasattr(self, "min_prob") and random.random() < 0.25:
949
+ # occasionally switch between min_prob and max_prob, based on whether
950
+ # we are above or below the threshold.
951
+ if (
952
+ _whitening_metric(x.to(torch.float32), self.num_groups)
953
+ > self.whitening_limit
954
+ ):
955
+ # there would be a change to the grad.
956
+ self.prob = self.max_prob
957
+ else:
958
+ self.prob = self.min_prob
959
+
960
+ return WhiteningPenaltyFunction.apply(
961
+ x, self.num_groups, self.whitening_limit, self.grad_scale
962
+ )
963
+
964
+
965
+ class WithLoss(torch.autograd.Function):
966
+ @staticmethod
967
+ def forward(ctx, x: Tensor, y: Tensor):
968
+ ctx.y_shape = y.shape
969
+ return x
970
+
971
+ @staticmethod
972
+ def backward(ctx, ans_grad: Tensor):
973
+ return ans_grad, torch.ones(
974
+ ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device
975
+ )
976
+
977
+
978
+ def with_loss(x, y):
979
+ if torch.jit.is_scripting() or torch.jit.is_tracing():
980
+ return x
981
+ # returns x but adds y.sum() to the loss function.
982
+ return WithLoss.apply(x, y)
983
+
984
+
985
+ def _no_op(x: Tensor) -> Tensor:
986
+ if torch.jit.is_scripting() or torch.jit.is_tracing():
987
+ return x
988
+ else:
989
+ # a no-op function that will have a node in the autograd graph,
990
+ # to avoid certain bugs relating to backward hooks
991
+ return x.chunk(1, dim=-1)[0]
992
+
993
+
994
+ class Identity(torch.nn.Module):
995
+ def __init__(self):
996
+ super(Identity, self).__init__()
997
+
998
+ def forward(self, x):
999
+ return _no_op(x)
1000
+
1001
+
1002
+ class MaxEig(torch.nn.Module):
1003
+ """
1004
+ Modifies the backpropped derivatives of a function to try to discourage
1005
+ that any given direction in activation space accounts for more than
1006
+ a specified proportion of the covariance (e.g. 0.2).
1007
+
1008
+
1009
+ Args:
1010
+ num_channels: the number of channels
1011
+ channel_dim: the dimension/axis corresponding to the channel, e.g.
1012
+ -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
1013
+ max_var_per_eig: the maximum proportion of the variance of the
1014
+ features/channels, after mean subtraction, that can come from
1015
+ any given eigenvalue.
1016
+ min_prob: the minimum probability with which we apply this during any invocation
1017
+ of forward(), assuming last time we applied the constraint it was
1018
+ not active; supplied for speed.
1019
+ scale: determines the scale with which we modify the gradients, relative
1020
+ to the existing / unmodified gradients
1021
+ """
1022
+
1023
+ def __init__(
1024
+ self,
1025
+ num_channels: int,
1026
+ channel_dim: int,
1027
+ max_var_per_eig: float = 0.2,
1028
+ min_prob: float = 0.01,
1029
+ scale: float = 0.01,
1030
+ ):
1031
+ super(MaxEig, self).__init__()
1032
+ self.num_channels = num_channels
1033
+ self.channel_dim = channel_dim
1034
+ self.scale = scale
1035
+ assert max_var_per_eig == 0.0 or max_var_per_eig > 1.0 / num_channels
1036
+ self.max_var_per_eig = max_var_per_eig
1037
+
1038
+ # we figure out the dominant direction using the power method: starting with
1039
+ # a random vector, keep multiplying by the covariance and renormalizing.
1040
+ with torch.no_grad():
1041
+ # arbitrary.. would use randn() but want to leave the rest of the model's
1042
+ # random parameters unchanged for comparison
1043
+ direction = torch.arange(num_channels).to(torch.float)
1044
+ direction = direction / direction.norm()
1045
+ self.register_buffer("max_eig_direction", direction)
1046
+
1047
+ self.min_prob = min_prob
1048
+ # cur_prob is the current probability we'll use to apply the MaxEig constraint.
1049
+ # We'll regress this towards min_prob each time we try to apply it and it is not
1050
+ # active.
1051
+ self.cur_prob = 1.0
1052
+
1053
+ def forward(self, x: Tensor) -> Tensor:
1054
+ if (
1055
+ torch.jit.is_scripting()
1056
+ or self.max_var_per_eig <= 0
1057
+ or random.random() > self.cur_prob
1058
+ or torch.jit.is_tracing()
1059
+ ):
1060
+ return _no_op(x)
1061
+
1062
+ with torch.cuda.amp.autocast(enabled=False):
1063
+ eps = 1.0e-20
1064
+ orig_x = x
1065
+ x = x.to(torch.float32)
1066
+ with torch.no_grad():
1067
+ x = x.transpose(self.channel_dim, -1).reshape(
1068
+ -1, self.num_channels
1069
+ )
1070
+ x = x - x.mean(dim=0)
1071
+ new_direction, coeffs = self._find_direction_coeffs(
1072
+ x, self.max_eig_direction
1073
+ )
1074
+ x_var = (x ** 2).mean()
1075
+ x_residual = x - coeffs * new_direction
1076
+ x_residual_var = (x_residual ** 2).mean()
1077
+
1078
+ # `variance_proportion` is the proportion of the variance accounted for
1079
+ # by the top eigen-direction.
1080
+ variance_proportion = (x_var - x_residual_var) / (
1081
+ x_var + 1.0e-20
1082
+ )
1083
+
1084
+ # ensure new direction is nonzero even if x == 0, by including `direction`.
1085
+ self._set_direction(
1086
+ 0.1 * self.max_eig_direction + new_direction
1087
+ )
1088
+
1089
+ if random.random() < 0.01 or __name__ == "__main__":
1090
+ logging.info(
1091
+ f"variance_proportion = {variance_proportion.item()}, shape={tuple(orig_x.shape)}, cur_prob={self.cur_prob}"
1092
+ )
1093
+
1094
+ if variance_proportion >= self.max_var_per_eig:
1095
+ # The constraint is active. Note: we should quite rarely
1096
+ # reach here; only near the beginning of training, if we are
1097
+ # starting to diverge, should this constraint be active.
1098
+ cur_prob = self.cur_prob
1099
+ self.cur_prob = (
1100
+ 1.0 # next time, do the update with probability 1.0.
1101
+ )
1102
+ return MaxEigLimiterFunction.apply(
1103
+ orig_x, coeffs, new_direction, self.channel_dim, self.scale
1104
+ )
1105
+ else:
1106
+ # let self.cur_prob exponentially approach self.min_prob, as
1107
+ # long as the constraint is inactive.
1108
+ self.cur_prob = 0.75 * self.cur_prob + 0.25 * self.min_prob
1109
+ return orig_x
1110
+
1111
+ def _set_direction(self, direction: Tensor):
1112
+ """
1113
+ Sets self.max_eig_direction to a normalized version of `direction`
1114
+ """
1115
+ direction = direction.detach()
1116
+ direction = direction / direction.norm()
1117
+ direction_sum = direction.sum().item()
1118
+ if direction_sum - direction_sum == 0: # no inf/nan
1119
+ self.max_eig_direction[:] = direction
1120
+ else:
1121
+ logging.info(
1122
+ f"Warning: sum of direction in MaxEig is {direction_sum}, "
1123
+ f"num_channels={self.num_channels}, channel_dim={self.channel_dim}"
1124
+ )
1125
+
1126
+ def _find_direction_coeffs(
1127
+ self, x: Tensor, prev_direction: Tensor
1128
+ ) -> Tuple[Tensor, Tensor]:
1129
+ """
1130
+ Figure out (an approximation to) the proportion of the variance of a set of
1131
+ feature vectors that can be attributed to the top eigen-direction.
1132
+ Args:
1133
+ x: a Tensor of shape (num_frames, num_channels), with num_frames > 1.
1134
+ prev_direction: a Tensor of shape (num_channels,), that is our previous estimate
1135
+ of the top eigen-direction, or a random direction if this is the first
1136
+ iteration. Does not have to be normalized, but should be nonzero.
1137
+
1138
+ Returns: (cur_direction, coeffs), where:
1139
+ cur_direction: a Tensor of shape (num_channels,) that is the current
1140
+ estimate of the top eigen-direction.
1141
+ coeffs: a Tensor of shape (num_frames, 1) that minimizes, or
1142
+ approximately minimizes, (x - coeffs * cur_direction).norm()
1143
+ """
1144
+ (num_frames, num_channels) = x.shape
1145
+ assert num_channels > 1 and num_frames > 1
1146
+ assert prev_direction.shape == (num_channels,)
1147
+ # `coeffs` are the coefficients of `prev_direction` in x.
1148
+ # actually represent the coeffs up to a constant positive factor.
1149
+ coeffs = (x * prev_direction).sum(dim=1, keepdim=True) + 1.0e-10
1150
+ cur_direction = (x * coeffs).sum(dim=0) / (
1151
+ (coeffs ** 2).sum() + 1.0e-20
1152
+ )
1153
+ return cur_direction, coeffs
1154
+
1155
+
1156
+ class DoubleSwishFunction(torch.autograd.Function):
1157
+ """
1158
+ double_swish(x) = x * torch.sigmoid(x-1)
1159
+ This is a definition, originally motivated by its close numerical
1160
+ similarity to swish(swish(x)), where swish(x) = x * sigmoid(x).
1161
+
1162
+ Memory-efficient derivative computation:
1163
+ double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1)
1164
+ double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x).
1165
+ Now, s'(x) = s(x) * (1-s(x)).
1166
+ double_swish'(x) = x * s'(x) + s(x).
1167
+ = x * s(x) * (1-s(x)) + s(x).
1168
+ = double_swish(x) * (1-s(x)) + s(x)
1169
+ ... so we just need to remember s(x) but not x itself.
1170
+ """
1171
+
1172
+ @staticmethod
1173
+ def forward(ctx, x: Tensor) -> Tensor:
1174
+ requires_grad = x.requires_grad
1175
+ x_dtype = x.dtype
1176
+ if x.dtype == torch.float16:
1177
+ x = x.to(torch.float32)
1178
+
1179
+ s = torch.sigmoid(x - 1.0)
1180
+ y = x * s
1181
+
1182
+ if requires_grad:
1183
+ deriv = y * (1 - s) + s
1184
+ # notes on derivative of x * sigmoid(x - 1):
1185
+ # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29
1186
+ # min \simeq -0.043638. Take floor as -0.043637 so it's a lower bound.
1187
+ # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound.
1188
+ # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which
1189
+ # floors), should be expectation-preserving.
1190
+ floor = -0.043637
1191
+ ceil = 1.2
1192
+ d_scaled = (deriv - floor) * (
1193
+ 255.0 / (ceil - floor)
1194
+ ) + torch.rand_like(deriv)
1195
+ if __name__ == "__main__":
1196
+ # for self-testing only.
1197
+ assert d_scaled.min() >= 0.0
1198
+ assert d_scaled.max() < 256.0
1199
+ d_int = d_scaled.to(torch.uint8)
1200
+ ctx.save_for_backward(d_int)
1201
+ if x.dtype == torch.float16 or torch.is_autocast_enabled():
1202
+ y = y.to(torch.float16)
1203
+ return y
1204
+
1205
+ @staticmethod
1206
+ def backward(ctx, y_grad: Tensor) -> Tensor:
1207
+ (d,) = ctx.saved_tensors
1208
+ # the same constants as used in forward pass.
1209
+ floor = -0.043637
1210
+ ceil = 1.2
1211
+ d = d * ((ceil - floor) / 255.0) + floor
1212
+ return y_grad * d
1213
+
1214
+
1215
+ class DoubleSwish(torch.nn.Module):
1216
+ def forward(self, x: Tensor) -> Tensor:
1217
+ """Return double-swish activation function which is an approximation to Swish(Swish(x)),
1218
+ that we approximate closely with x * sigmoid(x-1).
1219
+ """
1220
+ if torch.jit.is_scripting() or torch.jit.is_tracing():
1221
+ return x * torch.sigmoid(x - 1.0)
1222
+ return DoubleSwishFunction.apply(x)
1223
+
1224
+
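As a quick illustrative sanity check, the module matches the closed form x * sigmoid(x - 1) in the forward pass:

    x = torch.linspace(-4.0, 4.0, steps=9)
    y = DoubleSwish()(x)
    assert torch.allclose(y, x * torch.sigmoid(x - 1.0))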
1225
+ def BalancedDoubleSwish(
1226
+ d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25
1227
+ ) -> nn.Sequential:
1228
+ """
1229
+ ActivationBalancer -> DoubleSwish
1230
+ """
1231
+ balancer = ActivationBalancer(
1232
+ d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob
1233
+ )
1234
+ return nn.Sequential(
1235
+ balancer,
1236
+ DoubleSwish(),
1237
+ )
1238
+
1239
+
1240
+ def _test_max_eig():
1241
+ for proportion in [0.1, 0.5, 10.0]:
1242
+ logging.info(f"proportion = {proportion}")
1243
+ x = torch.randn(100, 128)
1244
+ direction = torch.randn(128)
1245
+ coeffs = torch.randn(100, 1)
1246
+ x += proportion * direction * coeffs
1247
+
1248
+ x.requires_grad = True
1249
+
1250
+ num_channels = 128
1251
+ m = MaxEig(
1252
+ num_channels, 1, 0.5, scale=0.1 # channel_dim # max_var_per_eig
1253
+ ) # grad_scale
1254
+
1255
+ for _ in range(4):
1256
+ y = m(x)
1257
+
1258
+ y_grad = torch.randn_like(x)
1259
+ y.backward(gradient=y_grad)
1260
+
1261
+ if proportion < 0.2:
1262
+ assert torch.allclose(x.grad, y_grad, atol=1.0e-02)
1263
+ elif proportion > 1.0:
1264
+ assert not torch.allclose(x.grad, y_grad)
1265
+
1266
+
1267
+ def _test_whiten():
1268
+ for proportion in [0.1, 0.5, 10.0]:
1269
+ logging.info(f"_test_whiten(): proportion = {proportion}")
1270
+ x = torch.randn(100, 128)
1271
+ direction = torch.randn(128)
1272
+ coeffs = torch.randn(100, 1)
1273
+ x += proportion * direction * coeffs
1274
+
1275
+ x.requires_grad = True
1276
+
1277
+ num_channels = 128
1278
+ m = Whiten(
1279
+ 1, 5.0, prob=1.0, grad_scale=0.1 # num_groups # whitening_limit,
1280
+ ) # grad_scale
1281
+
1282
+ for _ in range(4):
1283
+ y = m(x)
1284
+
1285
+ y_grad = torch.randn_like(x)
1286
+ y.backward(gradient=y_grad)
1287
+
1288
+ if proportion < 0.2:
1289
+ assert torch.allclose(x.grad, y_grad)
1290
+ elif proportion > 1.0:
1291
+ assert not torch.allclose(x.grad, y_grad)
1292
+
1293
+
1294
+ def _test_activation_balancer_sign():
1295
+ probs = torch.arange(0, 1, 0.01)
1296
+ N = 1000
1297
+ x = 1.0 * (
1298
+ (2.0 * (torch.rand(probs.numel(), N) < probs.unsqueeze(-1))) - 1.0
1299
+ )
1300
+ x = x.detach()
1301
+ x.requires_grad = True
1302
+ m = ActivationBalancer(
1303
+ probs.numel(),
1304
+ channel_dim=0,
1305
+ min_positive=0.05,
1306
+ max_positive=0.95,
1307
+ max_factor=0.2,
1308
+ min_abs=0.0,
1309
+ )
1310
+
1311
+ y_grad = torch.sign(torch.randn(probs.numel(), N))
1312
+
1313
+ y = m(x)
1314
+ y.backward(gradient=y_grad)
1315
+ print("_test_activation_balancer_sign: x = ", x)
1316
+ print("_test_activation_balancer_sign: y grad = ", y_grad)
1317
+ print("_test_activation_balancer_sign: x grad = ", x.grad)
1318
+
1319
+
1320
+ def _test_activation_balancer_magnitude():
1321
+ magnitudes = torch.arange(0, 1, 0.01)
1322
+ N = 1000
1323
+ x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze(
1324
+ -1
1325
+ )
1326
+ x = x.detach()
1327
+ x.requires_grad = True
1328
+ m = ActivationBalancer(
1329
+ magnitudes.numel(),
1330
+ channel_dim=0,
1331
+ min_positive=0.0,
1332
+ max_positive=1.0,
1333
+ max_factor=0.2,
1334
+ min_abs=0.2,
1335
+ max_abs=0.8,
1336
+ min_prob=1.0,
1337
+ )
1338
+
1339
+ y_grad = torch.sign(torch.randn(magnitudes.numel(), N))
1340
+
1341
+ y = m(x)
1342
+ y.backward(gradient=y_grad)
1343
+ print("_test_activation_balancer_magnitude: x = ", x)
1344
+ print("_test_activation_balancer_magnitude: y grad = ", y_grad)
1345
+ print("_test_activation_balancer_magnitude: x grad = ", x.grad)
1346
+
1347
+
1348
+ def _test_basic_norm():
1349
+ num_channels = 128
1350
+ m = BasicNorm(num_channels=num_channels, channel_dim=1)
1351
+
1352
+ x = torch.randn(500, num_channels)
1353
+
1354
+ y = m(x)
1355
+
1356
+ assert y.shape == x.shape
1357
+ x_rms = (x ** 2).mean().sqrt()
1358
+ y_rms = (y ** 2).mean().sqrt()
1359
+ print("x rms = ", x_rms)
1360
+ print("y rms = ", y_rms)
1361
+ assert y_rms < x_rms
1362
+ assert y_rms > 0.5 * x_rms
1363
+
1364
+
1365
+ def _test_double_swish_deriv():
1366
+ x = torch.randn(10, 12, dtype=torch.double) * 3.0
1367
+ x.requires_grad = True
1368
+ m = DoubleSwish()
1369
+
1370
+ tol = (1.2 - (-0.043637)) / 255.0
1371
+ torch.autograd.gradcheck(m, x, atol=tol)
1372
+
1373
+ # for self-test.
1374
+ x = torch.randn(1000, 1000, dtype=torch.double) * 3.0
1375
+ x.requires_grad = True
1376
+ y = m(x)
1377
+
1378
+
1379
+ def _test_softmax():
1380
+ a = torch.randn(2, 10, dtype=torch.float64)
1381
+ b = a.clone()
1382
+ a.requires_grad = True
1383
+ b.requires_grad = True
1384
+ a.softmax(dim=1)[:, 0].sum().backward()
1385
+ print("a grad = ", a.grad)
1386
+ softmax(b, dim=1)[:, 0].sum().backward()
1387
+ print("b grad = ", b.grad)
1388
+ assert torch.allclose(a.grad, b.grad)
1389
+
1390
+
1391
+ if __name__ == "__main__":
1392
+ logging.getLogger().setLevel(logging.INFO)
1393
+ torch.set_num_threads(1)
1394
+ torch.set_num_interop_threads(1)
1395
+ _test_softmax()
1396
+ _test_whiten()
1397
+ _test_max_eig()
1398
+ _test_activation_balancer_sign()
1399
+ _test_activation_balancer_magnitude()
1400
+ _test_basic_norm()
1401
+ _test_double_swish_deriv()
modules/transformer.py ADDED
@@ -0,0 +1,683 @@
1
+ import copy
2
+ import numbers
3
+ from functools import partial
4
+ from typing import Any, Callable, List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ from torch import Tensor, nn
8
+ from torch.nn import functional as F
9
+
10
+ from .activation import MultiheadAttention
11
+ from .scaling import ActivationBalancer, BalancedDoubleSwish
12
+ from .scaling import BasicNorm as _BasicNorm
13
+
14
+ _shape_t = Union[int, List[int], torch.Size]
15
+
16
+
17
+ class LayerNorm(nn.Module):
18
+ __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
19
+ normalized_shape: Tuple[int, ...]
20
+ eps: float
21
+ elementwise_affine: bool
22
+
23
+ def __init__(
24
+ self,
25
+ normalized_shape: _shape_t,
26
+ eps: float = 1e-5,
27
+ elementwise_affine: bool = True,
28
+ device=None,
29
+ dtype=None,
30
+ ) -> None:
31
+ factory_kwargs = {"device": device, "dtype": dtype}
32
+ super(LayerNorm, self).__init__()
33
+ if isinstance(normalized_shape, numbers.Integral):
34
+ # mypy error: incompatible types in assignment
35
+ normalized_shape = (normalized_shape,) # type: ignore[assignment]
36
+ self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type]
37
+ self.eps = eps
38
+ self.elementwise_affine = elementwise_affine
39
+ if self.elementwise_affine:
40
+ self.weight = nn.Parameter(
41
+ torch.empty(self.normalized_shape, **factory_kwargs)
42
+ )
43
+ self.bias = nn.Parameter(
44
+ torch.empty(self.normalized_shape, **factory_kwargs)
45
+ )
46
+ else:
47
+ self.register_parameter("weight", None)
48
+ self.register_parameter("bias", None)
49
+
50
+ self.reset_parameters()
51
+
52
+ def reset_parameters(self) -> None:
53
+ if self.elementwise_affine:
54
+ nn.init.ones_(self.weight)
55
+ nn.init.zeros_(self.bias)
56
+
57
+ def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
58
+ if isinstance(input, tuple):
59
+ input, embedding = input
60
+ return (
61
+ F.layer_norm(
62
+ input,
63
+ self.normalized_shape,
64
+ self.weight,
65
+ self.bias,
66
+ self.eps,
67
+ ),
68
+ embedding,
69
+ )
70
+
71
+ assert embedding is None
72
+ return F.layer_norm(
73
+ input, self.normalized_shape, self.weight, self.bias, self.eps
74
+ )
75
+
76
+ def extra_repr(self) -> str:
77
+ return (
78
+ "{normalized_shape}, eps={eps}, "
79
+ "elementwise_affine={elementwise_affine}".format(**self.__dict__)
80
+ )
81
+
82
+
83
+ class AdaptiveLayerNorm(nn.Module):
84
+ r"""Adaptive Layer Normalization"""
85
+
86
+ def __init__(self, d_model, norm) -> None:
87
+ super(AdaptiveLayerNorm, self).__init__()
88
+ self.project_layer = nn.Linear(d_model, 2 * d_model)
89
+ self.norm = norm
90
+ self.d_model = d_model
91
+ self.eps = self.norm.eps
92
+
93
+ def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor:
94
+ if isinstance(input, tuple):
95
+ input, embedding = input
96
+ weight, bias = torch.split(
97
+ self.project_layer(embedding),
98
+ split_size_or_sections=self.d_model,
99
+ dim=-1,
100
+ )
101
+ return (weight * self.norm(input) + bias, embedding)
102
+
103
+ weight, bias = torch.split(
104
+ self.project_layer(embedding),
105
+ split_size_or_sections=self.d_model,
106
+ dim=-1,
107
+ )
108
+ return weight * self.norm(input) + bias
109
+
110
+
111
+ class BasicNorm(_BasicNorm):
112
+ def __init__(
113
+ self,
114
+ d_model: int,
115
+ eps: float = 1e-5,
116
+ device=None,
117
+ dtype=None,
118
+ ):
119
+ super(BasicNorm, self).__init__(d_model, eps=eps)
120
+
121
+ def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
122
+ if isinstance(input, tuple):
123
+ input, embedding = input
124
+ return (
125
+ super(BasicNorm, self).forward(input),
126
+ embedding,
127
+ )
128
+
129
+ assert embedding is None
130
+ return super(BasicNorm, self).forward(input)
131
+
132
+
133
+ class BalancedBasicNorm(nn.Module):
134
+ def __init__(
135
+ self,
136
+ d_model: int,
137
+ eps: float = 1e-5,
138
+ device=None,
139
+ dtype=None,
140
+ ):
141
+ super(BalancedBasicNorm, self).__init__()
142
+ self.balancer = ActivationBalancer(
143
+ d_model,
144
+ channel_dim=-1,
145
+ min_positive=0.45,
146
+ max_positive=0.55,
147
+ max_abs=6.0,
148
+ )
149
+ self.norm = BasicNorm(d_model, eps, device=device, dtype=dtype)
150
+
151
+ def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
152
+ if isinstance(input, tuple):
153
+ input, embedding = input
154
+ return self.norm((self.balancer(input), embedding))
155
+
156
+ assert embedding is None
157
+ return self.norm(self.balancer(input))
158
+
159
+
160
+ class IdentityNorm(nn.Module):
161
+ def __init__(
162
+ self,
163
+ d_model: int,
164
+ eps: float = 1e-5,
165
+ device=None,
166
+ dtype=None,
167
+ ) -> None:
168
+ super(IdentityNorm, self).__init__()
169
+
170
+ def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
171
+ if isinstance(input, tuple):
172
+ return input
173
+
174
+ assert embedding is None
175
+ return input
176
+
177
+
178
+ class TransformerEncoderLayer(nn.Module):
179
+ __constants__ = ["batch_first", "norm_first"]
180
+
181
+ def __init__(
182
+ self,
183
+ d_model: int,
184
+ nhead: int,
185
+ dim_feedforward: int = 2048,
186
+ dropout: float = 0.1,
187
+ activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
188
+ batch_first: bool = False,
189
+ norm_first: bool = False,
190
+ device=None,
191
+ dtype=None,
192
+ linear1_self_attention_cls: nn.Module = nn.Linear,
193
+ linear2_self_attention_cls: nn.Module = nn.Linear,
194
+ linear1_feedforward_cls: nn.Module = nn.Linear,
195
+ linear2_feedforward_cls: nn.Module = nn.Linear,
196
+ layer_norm_cls: nn.Module = LayerNorm,
197
+ layer_norm_eps: float = 1e-5,
198
+ adaptive_layer_norm=False,
199
+ ) -> None:
200
+ factory_kwargs = {"device": device, "dtype": dtype}
201
+ super(TransformerEncoderLayer, self).__init__()
202
+ self.self_attn = MultiheadAttention(
203
+ d_model,
204
+ nhead,
205
+ dropout=dropout,
206
+ batch_first=batch_first,
207
+ linear1_cls=linear1_self_attention_cls,
208
+ linear2_cls=linear2_self_attention_cls,
209
+ **factory_kwargs,
210
+ )
211
+
212
+ # Implementation of Feedforward model
213
+ self.linear1 = linear1_feedforward_cls(
214
+ d_model, dim_feedforward, **factory_kwargs
215
+ )
216
+ self.dropout = nn.Dropout(dropout)
217
+ self.linear2 = linear2_feedforward_cls(
218
+ dim_feedforward, d_model, **factory_kwargs
219
+ )
220
+
221
+ self.norm_first = norm_first
222
+ self.dropout1 = nn.Dropout(dropout)
223
+ self.dropout2 = nn.Dropout(dropout)
224
+
225
+ # Legacy string support for activation function.
226
+ if isinstance(activation, str):
227
+ activation = _get_activation_fn(activation)
228
+ elif isinstance(activation, partial):
229
+ activation = activation(d_model)
230
+ elif activation == BalancedDoubleSwish:
231
+ activation = BalancedDoubleSwish(d_model)
232
+
233
+ # # We can't test self.activation in forward() in TorchScript,
234
+ # # so stash some information about it instead.
235
+ # if activation is F.relu or isinstance(activation, torch.nn.ReLU):
236
+ # self.activation_relu_or_gelu = 1
237
+ # elif activation is F.gelu or isinstance(activation, torch.nn.GELU):
238
+ # self.activation_relu_or_gelu = 2
239
+ # else:
240
+ # self.activation_relu_or_gelu = 0
241
+ self.activation = activation
242
+
243
+ norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
244
+ if layer_norm_cls == IdentityNorm:
245
+ norm2 = BalancedBasicNorm(
246
+ d_model, eps=layer_norm_eps, **factory_kwargs
247
+ )
248
+ else:
249
+ norm2 = layer_norm_cls(
250
+ d_model, eps=layer_norm_eps, **factory_kwargs
251
+ )
252
+
253
+ if adaptive_layer_norm:
254
+ self.norm1 = AdaptiveLayerNorm(d_model, norm1)
255
+ self.norm2 = AdaptiveLayerNorm(d_model, norm2)
256
+ else:
257
+ self.norm1 = norm1
258
+ self.norm2 = norm2
259
+
260
+ def __setstate__(self, state):
261
+ super(TransformerEncoderLayer, self).__setstate__(state)
262
+ if not hasattr(self, "activation"):
263
+ self.activation = F.relu
264
+
265
+ def forward(
266
+ self,
267
+ src: Tensor,
268
+ src_mask: Optional[Tensor] = None,
269
+ src_key_padding_mask: Optional[Tensor] = None,
270
+ ) -> Tensor:
271
+ r"""Pass the input through the encoder layer.
272
+
273
+ Args:
274
+ src: the sequence to the encoder layer (required).
275
+ src_mask: the mask for the src sequence (optional).
276
+ src_key_padding_mask: the mask for the src keys per batch (optional).
277
+
278
+ Shape:
279
+ see the docs in Transformer class.
280
+ """
281
+ x, stage_embedding = src, None
282
+ is_src_tuple = False
283
+ if isinstance(src, tuple):
284
+ x, stage_embedding = src
285
+ is_src_tuple = True
286
+
287
+ if src_key_padding_mask is not None:
288
+ _skpm_dtype = src_key_padding_mask.dtype
289
+ if _skpm_dtype != torch.bool and not torch.is_floating_point(
290
+ src_key_padding_mask
291
+ ):
292
+ raise AssertionError(
293
+ "only bool and floating types of key_padding_mask are supported"
294
+ )
295
+
296
+ if self.norm_first:
297
+ x = x + self._sa_block(
298
+ self.norm1(x, stage_embedding),
299
+ src_mask,
300
+ src_key_padding_mask,
301
+ )
302
+ x = x + self._ff_block(self.norm2(x, stage_embedding))
303
+ else:
304
+ x = self.norm1(
305
+ x + self._sa_block(x, src_mask, src_key_padding_mask),
306
+ stage_embedding,
307
+ )
308
+ x = self.norm2(x + self._ff_block(x), stage_embedding)
309
+
310
+ if is_src_tuple:
311
+ return (x, stage_embedding)
312
+ return x
313
+
314
+ def infer(
315
+ self,
316
+ src: Tensor,
317
+ src_mask: Optional[Tensor] = None,
318
+ src_key_padding_mask: Optional[Tensor] = None,
319
+ past_kv: Optional[Tensor] = None,
320
+ use_cache: bool = False,
321
+ ):
322
+ x, stage_embedding = src, None
323
+ is_src_tuple = False
324
+ if isinstance(src, tuple):
325
+ x, stage_embedding = src
326
+ is_src_tuple = True
327
+
328
+ if src_key_padding_mask is not None:
329
+ _skpm_dtype = src_key_padding_mask.dtype
330
+ if _skpm_dtype != torch.bool and not torch.is_floating_point(
331
+ src_key_padding_mask
332
+ ):
333
+ raise AssertionError(
334
+ "only bool and floating types of key_padding_mask are supported"
335
+ )
336
+
337
+ if self.norm_first:
338
+ x_attn_out, kv = self.self_attn.infer(
339
+ self.norm1(x, stage_embedding),
340
+ attn_mask=src_mask,
341
+ key_padding_mask=src_key_padding_mask,
342
+ need_weights=False,
343
+ past_kv=past_kv,
344
+ use_cache=use_cache,
345
+ )
346
+ x = x + x_attn_out
347
+ x = x + self._ff_block(self.norm2(x, stage_embedding))
348
+
349
+ if is_src_tuple:
350
+ return (x, stage_embedding)
351
+ return (x, kv)
352
+
353
+ # self-attention block
354
+ def _sa_block(
355
+ self,
356
+ x: Tensor,
357
+ attn_mask: Optional[Tensor],
358
+ key_padding_mask: Optional[Tensor],
359
+ ) -> Tensor:
360
+ x = self.self_attn(
361
+ x,
362
+ x,
363
+ x,
364
+ attn_mask=attn_mask,
365
+ key_padding_mask=key_padding_mask,
366
+ need_weights=False,
367
+ )[0]
368
+ return self.dropout1(x)
369
+
370
+ # feed forward block
371
+ def _ff_block(self, x: Tensor) -> Tensor:
372
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
373
+ return self.dropout2(x)
374
+
375
+
376
+ class TransformerEncoder(nn.Module):
377
+ r"""TransformerEncoder is a stack of N encoder layers. Users can build the
378
+ BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.
379
+
380
+ Args:
381
+ encoder_layer: an instance of the TransformerEncoderLayer() class (required).
382
+ num_layers: the number of sub-encoder-layers in the encoder (required).
383
+ norm: the layer normalization component (optional).
384
+ enable_nested_tensor: if True, input will automatically convert to nested tensor
385
+ (and convert back on output). This will improve the overall performance of
386
+ TransformerEncoder when padding rate is high. Default: ``True`` (enabled).
387
+
388
+ Examples::
389
+ >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
390
+ >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6)
391
+ >>> src = torch.rand(10, 32, 512)
392
+ >>> out = transformer_encoder(src)
393
+ """
394
+ __constants__ = ["norm"]
395
+
396
+ def __init__(self, encoder_layer, num_layers, norm=None):
397
+ super(TransformerEncoder, self).__init__()
398
+ self.layers = _get_clones(encoder_layer, num_layers)
399
+ self.num_layers = num_layers
400
+ self.norm = norm
401
+
402
+ def forward(
403
+ self,
404
+ src: Tensor,
405
+ mask: Optional[Tensor] = None,
406
+ src_key_padding_mask: Optional[Tensor] = None,
407
+ return_layer_states: bool = False,
408
+ ) -> Tensor:
409
+ r"""Pass the input through the encoder layers in turn.
410
+
411
+ Args:
412
+ src: the sequence to the encoder (required).
413
+ mask: the mask for the src sequence (optional).
414
+ src_key_padding_mask: the mask for the src keys per batch (optional).
415
+ return_layer_states: return layers' state (optional).
416
+
417
+ Shape:
418
+ see the docs in Transformer class.
419
+ """
420
+ if return_layer_states:
421
+ layer_states = [] # layers' output
422
+ output = src
423
+ for mod in self.layers:
424
+ output = mod(
425
+ output,
426
+ src_mask=mask,
427
+ src_key_padding_mask=src_key_padding_mask,
428
+ )
429
+ layer_states.append(output[0])
430
+
431
+ if self.norm is not None:
432
+ output = self.norm(output)
433
+
434
+ return layer_states, output
435
+
436
+ output = src
437
+ for mod in self.layers:
438
+ output = mod(
439
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
440
+ )
441
+
442
+ if self.norm is not None:
443
+ output = self.norm(output)
444
+
445
+ return output
446
+
447
+ def infer(
448
+ self,
449
+ src: Tensor,
450
+ mask: Optional[Tensor] = None,
451
+ src_key_padding_mask: Optional[Tensor] = None,
452
+ return_layer_states: bool = False,
453
+ past_kv: Optional[Tensor] = None,
454
+ use_cache: bool = False,
455
+ ):
456
+ if past_kv is None:
457
+ past_length = 0
458
+ past_kv = tuple([None] * self.num_layers)
459
+ else:
460
+ past_length = past_kv[0][0].size(-2)
461
+ new_kv = () if use_cache else None
462
+ output = src
463
+ for mod, past_layer_kv in zip(self.layers, past_kv):
464
+ output, kv = mod.infer(
465
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, past_kv=past_layer_kv, use_cache=use_cache
466
+ )
467
+ if use_cache:
468
+ new_kv = new_kv + (kv,)
469
+
470
+ if self.norm is not None:
471
+ output = self.norm(output)
472
+
473
+ return output, new_kv
474
+
475
+
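A hypothetical incremental-inference sketch, using the (seq, batch, d_model) layout of the docstring example above; it assumes the MultiheadAttention.infer implementation in modules/activation.py (not shown here) supports this calling pattern. Note that infer() only implements the pre-norm path, so norm_first=True is required:

    layer = TransformerEncoderLayer(d_model=512, nhead=8, norm_first=True)
    enc = TransformerEncoder(layer, num_layers=6)
    prompt = torch.rand(10, 1, 512)
    out, kv = enc.infer(prompt, use_cache=True)            # prime the per-layer KV cache
    step = torch.rand(1, 1, 512)
    out, kv = enc.infer(step, past_kv=kv, use_cache=True)  # feed one new frame, reuse the cache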
476
+ class TransformerDecoderLayer(nn.Module):
477
+ __constants__ = ["batch_first", "norm_first"]
478
+
479
+ def __init__(
480
+ self,
481
+         d_model: int,
+         nhead: int,
+         dim_feedforward: int = 2048,
+         dropout: float = 0.1,
+         activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
+         linear1_self_attention_cls: nn.Module = nn.Linear,
+         linear2_self_attention_cls: nn.Module = nn.Linear,
+         linear1_feedforward_cls: nn.Module = nn.Linear,
+         linear2_feedforward_cls: nn.Module = nn.Linear,
+         batch_first: bool = False,
+         norm_first: bool = False,
+         device=None,
+         dtype=None,
+         layer_norm_cls: nn.Module = LayerNorm,
+         layer_norm_eps: float = 1e-5,
+         adaptive_layer_norm=False,
+     ) -> None:
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super(TransformerDecoderLayer, self).__init__()
+         self.self_attn = MultiheadAttention(
+             d_model,
+             nhead,
+             dropout=dropout,
+             batch_first=batch_first,
+             linear1_cls=linear1_self_attention_cls,
+             linear2_cls=linear2_self_attention_cls,
+             **factory_kwargs,
+         )
+         self.multihead_attn = MultiheadAttention(
+             d_model,
+             nhead,
+             dropout=dropout,
+             batch_first=batch_first,
+             linear1_cls=linear1_self_attention_cls,
+             linear2_cls=linear2_self_attention_cls,
+             **factory_kwargs,
+         )
+         # Implementation of Feedforward model
+         self.linear1 = linear1_feedforward_cls(
+             d_model, dim_feedforward, **factory_kwargs
+         )
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = linear2_feedforward_cls(
+             dim_feedforward, d_model, **factory_kwargs
+         )
+
+         self.norm_first = norm_first
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+         self.dropout3 = nn.Dropout(dropout)
+
+         # Legacy string support for activation function.
+         if isinstance(activation, str):
+             self.activation = _get_activation_fn(activation)
+         elif isinstance(activation, partial):
+             self.activation = activation(d_model)
+         elif activation == BalancedDoubleSwish:
+             self.activation = BalancedDoubleSwish(d_model)
+         else:
+             self.activation = activation
+
+         if adaptive_layer_norm:
+             norm1 = layer_norm_cls(
+                 d_model, eps=layer_norm_eps, **factory_kwargs
+             )
+             norm2 = layer_norm_cls(
+                 d_model, eps=layer_norm_eps, **factory_kwargs
+             )
+             norm3 = layer_norm_cls(
+                 d_model, eps=layer_norm_eps, **factory_kwargs
+             )
+
+             self.norm1 = AdaptiveLayerNorm(d_model, norm1)
+             self.norm2 = AdaptiveLayerNorm(d_model, norm2)
+             self.norm3 = AdaptiveLayerNorm(d_model, norm3)
+         else:
+             self.norm1 = layer_norm_cls(
+                 d_model, eps=layer_norm_eps, **factory_kwargs
+             )
+             self.norm2 = layer_norm_cls(
+                 d_model, eps=layer_norm_eps, **factory_kwargs
+             )
+             if layer_norm_cls == IdentityNorm:
+                 self.norm3 = BalancedBasicNorm(
+                     d_model, eps=layer_norm_eps, **factory_kwargs
+                 )
+             else:
+                 self.norm3 = layer_norm_cls(
+                     d_model, eps=layer_norm_eps, **factory_kwargs
+                 )
+
+     def forward(
+         self,
+         tgt: Tensor,
+         memory: Tensor,
+         tgt_mask: Optional[Tensor] = None,
+         memory_mask: Optional[Tensor] = None,
+         tgt_key_padding_mask: Optional[Tensor] = None,
+         memory_key_padding_mask: Optional[Tensor] = None,
+     ) -> Tensor:
+         r"""Pass the inputs (and mask) through the decoder layer.
+
+         Args:
+             tgt: the sequence to the decoder layer (required).
+             memory: the sequence from the last layer of the encoder (required).
+             tgt_mask: the mask for the tgt sequence (optional).
+             memory_mask: the mask for the memory sequence (optional).
+             tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
+             memory_key_padding_mask: the mask for the memory keys per batch (optional).
+
+         Shape:
+             see the docs in Transformer class.
+         """
+         tgt_is_tuple = False
+         if isinstance(tgt, tuple):
+             x, stage_embedding = tgt
+             tgt_is_tuple = True
+         else:
+             x, stage_embedding = tgt, None
+
+         if self.norm_first:
+             x = x + self._sa_block(
+                 self.norm1(x, stage_embedding), tgt_mask, tgt_key_padding_mask
+             )
+             x = x + self._mha_block(
+                 self.norm2(x, stage_embedding),
+                 memory,
+                 memory_mask,
+                 memory_key_padding_mask,
+             )
+             x = x + self._ff_block(self.norm3(x, stage_embedding))
+         else:
+             x = self.norm1(
+                 x + self._sa_block(x, tgt_mask, tgt_key_padding_mask),
+                 stage_embedding,
+             )
+             x = self.norm2(
+                 x
+                 + self._mha_block(
+                     x, memory, memory_mask, memory_key_padding_mask
+                 ),
+                 stage_embedding,
+             )
+             x = self.norm3(x + self._ff_block(x), stage_embedding)
+
+         if tgt_is_tuple:
+             return (x, stage_embedding)
+         return x
+
+     # self-attention block
+     def _sa_block(
+         self,
+         x: Tensor,
+         attn_mask: Optional[Tensor],
+         key_padding_mask: Optional[Tensor],
+     ) -> Tensor:
+         x = self.self_attn(
+             x,
+             x,
+             x,
+             attn_mask=attn_mask,
+             key_padding_mask=key_padding_mask,
+             need_weights=False,
+         )[0]
+         return self.dropout1(x)
+
+     # multihead attention block
+     def _mha_block(
+         self,
+         x: Tensor,
+         mem: Tensor,
+         attn_mask: Optional[Tensor],
+         key_padding_mask: Optional[Tensor],
+     ) -> Tensor:
+         x = self.multihead_attn(
+             x,
+             mem,
+             mem,
+             attn_mask=attn_mask,
+             key_padding_mask=key_padding_mask,
+             need_weights=False,
+         )[0]
+         return self.dropout2(x)
+
+     # feed forward block
+     def _ff_block(self, x: Tensor) -> Tensor:
+         x = self.linear2(self.dropout(self.activation(self.linear1(x))))
+         return self.dropout3(x)
+
+
+ def _get_clones(module, N):
+     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+ def _get_activation_fn(activation: str) -> Callable[[Tensor], Tensor]:
+     if activation == "relu":
+         return F.relu
+     elif activation == "gelu":
+         return F.gelu
+
+     raise RuntimeError(
+         "activation should be relu/gelu, not {}".format(activation)
+     )
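
Editorial note, not part of the commit above: the class assembled in this file is a decoder layer with self-attention, cross-attention over an encoder memory, and a feed-forward block, wired either pre-norm (norm_first=True) or post-norm, with optional adaptive layer norm driven by a stage embedding. A minimal usage sketch follows; it assumes the custom MultiheadAttention and LayerNorm defined earlier in this same module, and the shapes and hyperparameters are purely illustrative.

import torch

# Hypothetical configuration; batch_first=True means tensors are (batch, seq, d_model).
layer = TransformerDecoderLayer(
    d_model=512,
    nhead=8,
    dim_feedforward=2048,
    batch_first=True,
    norm_first=True,            # take the pre-norm branch in forward()
    adaptive_layer_norm=False,  # norm1/norm2/norm3 are plain (non-adaptive) norms
)

tgt = torch.randn(2, 10, 512)     # decoder input: (batch=2, tgt_len=10, d_model=512)
memory = torch.randn(2, 20, 512)  # encoder output: (batch=2, src_len=20, d_model=512)
out = layer(tgt, memory)          # same shape as tgt: (2, 10, 512)

When adaptive_layer_norm=True, forward() also accepts tgt as a (tensor, stage_embedding) tuple and returns a tuple of the same form; that is how the stage embedding reaches AdaptiveLayerNorm.
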
presets/acou_1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:470ce66fc24a2d14e162343381f7d93ef0a3af51edf5fd37240c21f492b4e769
+ size 15650
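
Editorial note, not part of the commit: this and the following presets/*.npz entries are Git LFS pointer files, so only the object hash and size live in the repository while the actual NumPy archives holding the voice-prompt data are stored in LFS. As a hedged sketch, assuming the archives have been fetched (for example with git lfs pull), their contents can be listed with NumPy; the array names inside depend on how each preset was saved and are not specified by this diff.

import numpy as np

archive = np.load("presets/acou_1.npz")  # path relative to the repository root
print(archive.files)                     # names of the arrays stored in this preset
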
presets/acou_2.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ec1c5328751cadeed5356d4264759799ad96d33ea8dd4f8a3d0a80dd8ddb0e74
+ size 15426
presets/acou_3.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03f241b094a32b3f542e74374183c6d15e8b70ae73ceeafb11bfd4ee6b8b4a3a
+ size 15410
presets/acou_4.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52b96f32863f13f84cf7ac4a27d2bc95cea70c350a037f4d1890b20b8da9501e
+ size 15506
presets/alan.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28838c3f0b2f9f315b34e9b940f30641306f0cadc5c527857cd1cc408547ed1c
+ size 50002
presets/amused.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df3e882f3a62805b9aaf300d81822cd4eddeafee480503b7b78e32be2085fb11
+ size 20882
presets/anger.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:959cec6dc0b30219db0d70cdd165fe00bbdc098165cf9d67ccdd1ecf7a5da5be
+ size 22090
presets/babara.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8106b2a98c3f70587f23ab46ed5bf73b1c9a770481c3620ab140bd3256010376
+ size 11526
presets/bronya_1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02eaada2c3d58866c813887ed9f871587ef5a7e976abc23382ce46a17b208001
+ size 18106
presets/cafe.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d78d96f5829da8f69c327ff25958da5b451305fdc9c308f7e67f13cf8d640fea
+ size 22442
presets/dingzhen.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d19167c65eefef5e42dfaa1919ff5149ca0a93cb052396a47d1f42f9865f5f8
+ size 18154
presets/dingzhen_1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d19167c65eefef5e42dfaa1919ff5149ca0a93cb052396a47d1f42f9865f5f8
+ size 18154
presets/disgust.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4443f0a395072700f2ec6101dbf2ad9d28968aa3e5809e384ea131832f894d7f
+ size 39386
presets/emo_amused.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38be2ea16dc79beae68b6c885d99d4dad516acbd88ed5ed6991dd97301f2f30b
+ size 15378
presets/emo_anger.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3261c3bdd5b7b4be9783d9293ee3d871be9d9d791f2b3a8bf62a1a0ee0ed93e6
+ size 15434
presets/emo_neutral.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2188c4154692316ed7c0edee3aa3dd8678be36f355ee2b8c8a3a6412c3673ba9
+ size 15578
presets/emo_sleepy.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a53255890beaf4ed339e1967f0837fdb87c34c9f7e18bf77cd4b08eba176963
+ size 15370
presets/emotion_sleepiness.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0f866a278a10c7b6b494fb62589a9d8fef778ccf272df3b0d5510f45b243b5c
+ size 33218
presets/en2zh_tts_1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d4de4ed055448ea54f7b40091afae565197f960d954279035ac537ea5a01bc4
+ size 44354
presets/en2zh_tts_2.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcc066ea104daa27d1552fe76574d09359d56fa892241581cc19e931a696eca9
+ size 24178
presets/en2zh_tts_3.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7468944e6d0ed7f2da033e8037be07dbafc76bd1ed7c0f5996d85ff45aacda11
+ size 21410
presets/en2zh_tts_4.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fd8d0914e74769114310e9504d68d6b7b0c6aacd46763478cbfd4f9631ad54a
+ size 43826
presets/esta.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f944e135d901a00e74e7affe6757334e9a2679c10ad7ae4bcb5b33569d77eba
+ size 40250
presets/fuxuan_2.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:17b90388d179ae309e1f577c28c3f10d9bed73c6ccbffdd829c00568eb3941e6
+ size 50330
presets/librispeech_1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:415b244e43b45291fd651d71f15bb7a31c244e2054988c436f6bbc04465c6099
+ size 15650
presets/librispeech_2.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd74e77370248b025321b9dbae25b1572f13f98da63255e384d382d2b0c78227
+ size 15418
presets/librispeech_3.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1eceb3f4cc0f3a8856b5e3b5f1ca28c428d75305b1452da1ecf4013bc358ccaa
+ size 15634
presets/librispeech_4.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3939dde39f5e65bc01f5eba9acb7b8329465aaca3c38edf1b240aa714e687960
+ size 15594
presets/neutral.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8a63993526ffdc788a711b512d07a8b1c816151a1edb63913d0bfb48c2ea380
+ size 21050
presets/paimon_1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:452d5e0cd3a060db521bd65a16af818a6177f357801402aa5581eceb2c24039a
+ size 13762