jaskaran Singh committed on Nov 16, 2023

Commit

22a7887

1 Parent(s): 92172e2

init

Files changed (46) hide show

.gitattributes +4 -0
LICENSE +201 -0
README.md +1 -3
maha_tts/__init__.py +1 -0
maha_tts/__pycache__/__init__.cpython-311.pyc +0 -0
maha_tts/__pycache__/config.cpython-311.pyc +0 -0
maha_tts/__pycache__/inference.cpython-311.pyc +0 -0
maha_tts/config.py +23 -0
maha_tts/dataloaders/__init__.py +0 -0
maha_tts/inference.py +254 -0
maha_tts/models/__init__.py +0 -0
maha_tts/models/__pycache__/__init__.cpython-311.pyc +0 -0
maha_tts/models/__pycache__/autoregressive.cpython-311.pyc +0 -0
maha_tts/models/__pycache__/diff_model.cpython-311.pyc +0 -0
maha_tts/models/__pycache__/modules.cpython-311.pyc +0 -0
maha_tts/models/__pycache__/vocoder.cpython-311.pyc +0 -0
maha_tts/models/autoregressive.py +135 -0
maha_tts/models/diff_model.py +303 -0
maha_tts/models/modules.py +406 -0
maha_tts/models/vocoder.py +342 -0
maha_tts/pretrained_models/.DS_Store +0 -0
maha_tts/pretrained_models/hifigan/config.json +3 -0
maha_tts/pretrained_models/hifigan/g_02500000 +3 -0
maha_tts/pretrained_models/smolie/S2A/s2a_latest.pt +3 -0
maha_tts/pretrained_models/smolie/T2S/t2s_best.pt +3 -0
maha_tts/text/__init__.py +0 -0
maha_tts/text/__pycache__/__init__.cpython-311.pyc +0 -0
maha_tts/text/__pycache__/cleaners.cpython-311.pyc +0 -0
maha_tts/text/__pycache__/symbols.cpython-311.pyc +0 -0
maha_tts/text/cleaners.py +143 -0
maha_tts/text/symbols.py +28 -0
maha_tts/utils/__init__.py +0 -0
maha_tts/utils/__pycache__/__init__.cpython-311.pyc +0 -0
maha_tts/utils/__pycache__/audio.cpython-311.pyc +0 -0
maha_tts/utils/__pycache__/diffusion.cpython-311.pyc +0 -0
maha_tts/utils/__pycache__/stft.cpython-311.pyc +0 -0
maha_tts/utils/audio.py +109 -0
maha_tts/utils/diffusion.py +1283 -0
maha_tts/utils/stft.py +109 -0
ref_clips/2971_4275_000003_000007.wav +0 -0
ref_clips/2971_4275_000020_000001.wav +0 -0
ref_clips/2971_4275_000023_000010.wav +0 -0
ref_clips/2971_4275_000049_000000.wav +0 -0
ref_clips/2971_4275_000049_000004.wav +0 -0
ref_clips/2971_4275_000050_000000.wav +0 -0
tts.py +14 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+maha_tts/pretrained_models/smolie/T2S/t2s_best.pt filter=lfs diff=lfs merge=lfs -text
+maha_tts/pretrained_models/smolie/S2A/s2a_latest.pt filter=lfs diff=lfs merge=lfs -text
+maha_tts/pretrained_models/hifigan/config.json filter=lfs diff=lfs merge=lfs -text
+maha_tts/pretrained_models/hifigan/g_02500000 filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,3 +1 @@
----
-license: apache-2.0
----


1	+ # MahaTTS

maha_tts/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .inference import load_models,load_diffuser,infer_tts

maha_tts/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (285 Bytes). View file

maha_tts/__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (817 Bytes). View file

maha_tts/__pycache__/inference.cpython-311.pyc ADDED Viewed

Binary file (17.1 kB). View file

maha_tts/config.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import os
+class config:
+    semantic_model_centroids = 10000 + 1
+    seed_value = 3407
+    # Text to Semantic
+    t2s_position = 2048
+    # Semantic to acoustic
+    sa_timesteps_max = 1000
+    #Acoustic Properties
+    CLIP_LENGTH = 500
+    MAX_WAV_VALUE=32768.0
+    filter_length=1024
+    hop_length=256 #256
+    window = 'hann'
+    win_length=1024
+    n_mel_channels=80
+    sampling_rate=22050
+    mel_fmin=0.0
+    mel_fmax=8000.0

maha_tts/dataloaders/__init__.py ADDED Viewed

File without changes

maha_tts/inference.py ADDED Viewed

	@@ -0,0 +1,254 @@

+import torch,glob,os
+import numpy as np
+import torch.nn.functional as F
+from librosa.filters import mel as librosa_mel_fn
+from scipy.io.wavfile import write
+from scipy.special import softmax
+from maha_tts.models.diff_model import load_diff_model
+from maha_tts.models.autoregressive import load_TS_model
+from maha_tts.models.vocoder import load_vocoder_model,infer_wav
+from maha_tts.utils.audio import denormalize_tacotron_mel,normalize_tacotron_mel,load_wav_to_torch,dynamic_range_compression
+from maha_tts.utils.stft import STFT
+from maha_tts.utils.diffusion import SpacedDiffusion,get_named_beta_schedule,space_timesteps
+from maha_tts.text.symbols import labels,text_labels,code_labels,text_enc,text_dec,code_enc,code_dec
+from maha_tts.text.cleaners import  english_cleaners
+from maha_tts.config import config
+# Module-level audio front-end, built once at import time from the shared config.
+# STFT helper used by get_mel to obtain magnitudes and phases.
+stft_fn = STFT(config.filter_length, config.hop_length, config.win_length)
+# Mel filterbank from librosa, converted to a float torch tensor so it can be
+# matrix-multiplied with the STFT magnitudes in get_mel.
+mel_basis = librosa_mel_fn(
+        sr=config.sampling_rate, n_fft=config.filter_length, n_mels=config.n_mel_channels, fmin=config.mel_fmin, fmax=config.mel_fmax)
+mel_basis = torch.from_numpy(mel_basis).float()
+# Known model names. load_models only checks membership of the key; the
+# 'asdf' values are placeholders and are not read anywhere in this file.
+model_dirs= {
+    'Smolie':'asdf',
+    'hifigan':'asdf'
+}
+def download_model(name):
+    pass
+def load_models(name,device=torch.device('cpu')):
+    '''
+    Load pre-trained models for different components of a text-to-speech system.
+    Args:
+    device (str): The target device for model loading (e.g., 'cpu' or 'cuda').
+    checkpoint_diff (str): File path to the pre-trained model checkpoint for the diffusion model.
+    checkpoint_ts (str): File path to the pre-trained model checkpoint for the text-to-semantic model.
+    checkpoint_voco (str): File path to the pre-trained model checkpoint for the vocoder model.
+    voco_config_path (str): File path to the configuration file for the vocoder model.
+    Returns:
+    diff_model (object): Loaded diffusion model for semantic-to-acoustic tokens.
+    ts_model (object): Loaded text-to-semantic model for converting text-to-semantic tokens.
+    vocoder (object): Loaded vocoder model for generating waveform from acoustic tokens.
+    diffuser (object): Configured diffuser object for use in the diffusion model.
+    '''
+    assert name in model_dirs, "no model name "+name
+    checkpoint_diff = 'maha_tts/pretrained_models/'+str(name)+'/S2A/s2a_latest.pt'
+    checkpoint_ts = 'maha_tts/pretrained_models/'+str(name)+'/T2S/t2s_best.pt'
+    checkpoint_voco = 'maha_tts/pretrained_models/hifigan/g_02500000'
+    voco_config_path = 'maha_tts/pretrained_models/hifigan/config.json'
+    # for i in [checkpoint_diff,checkpoint_ts,checkpoint_voco,voco_config_path]:
+    if not os.path.exists(checkpoint_diff) or not os.path.exists(checkpoint_ts):
+        download_model(name)
+    if not os.path.exists(checkpoint_voco) or not os.path.exists(voco_config_path):
+        download_model('hifigan')
+    diff_model = load_diff_model(checkpoint_diff,device)
+    ts_model = load_TS_model(checkpoint_ts,device)
+    vocoder = load_vocoder_model(voco_config_path,checkpoint_voco,device)
+    diffuser = load_diffuser()
+    return diff_model,ts_model,vocoder,diffuser
+def infer_mel(model,timeshape,code,ref_mel,diffuser,temperature=0.1):
+    device = next(model.parameters()).device
+    code = code.to(device)
+    output_shape = (1,80,timeshape)
+    noise = torch.randn(output_shape, device=code.device) * temperature
+    mel = diffuser.p_sample_loop(model, output_shape, noise=noise,
+                                      model_kwargs={'code_emb': code,'ref_clips':ref_mel},
+                                     progress=True)
+    return denormalize_tacotron_mel(mel)
+def generate_semantic_tokens(
+    text,
+    model,
+    ref_mels,
+    temp = 0.7,
+    top_p= None,
+    top_k= None,
+    n_tot_steps = 1000,
+    device = None
+    ):
+    semb = []
+    with torch.no_grad():
+        for n in range(n_tot_steps):
+            x = get_inputs(text,semb,ref_mels,device)
+            _,result = model(**x)
+            relevant_logits = result[0,:,-1]
+            if top_p is not None:
+                # faster to convert to numpy
+                original_device = relevant_logits.device
+                relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+                sorted_indices = np.argsort(relevant_logits)[::-1]
+                sorted_logits = relevant_logits[sorted_indices]
+                cumulative_probs = np.cumsum(softmax(sorted_logits))
+                sorted_indices_to_remove = cumulative_probs > top_p
+                sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+                sorted_indices_to_remove[0] = False
+                relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+                relevant_logits = torch.from_numpy(relevant_logits)
+                relevant_logits = relevant_logits.to(original_device)
+            if top_k is not None:
+                v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+                relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+            probs = F.softmax(relevant_logits / temp, dim=-1)
+            item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
+            semb.append(str(code_dec[item_next.item()]))
+            if semb[-1] == '<EST>' or semb[-1] == '<PAD>':
+                break
+            del relevant_logits, probs, item_next
+    semb = torch.tensor([int(i) for i in semb[:-1]])
+    return semb,result
+def get_inputs(text,semb=[],ref_mels=[],device=torch.device('cpu')):
+  text = text.lower()
+  text_ids=[text_enc['<S>']]+[text_enc[i] for i in text.strip()]+[text_enc['<E>']]
+  semb_ids=[code_enc['<SST>']]+[code_enc[i] for i in semb]#+[tok_enc['<EST>']]
+  input_ids = text_ids+semb_ids
+  # pad_length = config.t2s_position-(len(text_ids)+len(semb_ids))
+  token_type_ids = [0]*len(text_ids)+[1]*len(semb_ids)#+[0]*pad_length
+  positional_ids = [i for i in range(len(text_ids))]+[i for i in range(len(semb_ids))]#+[0]*pad_length
+  # labels = [-100]*len(text_ids)+semb_ids+[-100]*pad_length
+  attention_mask = [1]*len(input_ids)#+[0]*pad_length
+  # input_ids += [tok_enc['<PAD>']]*pad_length
+  return {'text_ids':torch.tensor(text_ids).unsqueeze(0).to(device),'codes_ids':torch.tensor(semb_ids).unsqueeze(0).to(device),'ref_clips':normalize_tacotron_mel(ref_mels).to(device)}
+def get_ref_mels(ref_clips):
+    ref_mels = []
+    for i in ref_clips:
+        ref_mels.append(get_mel(i)[0][:,:500])
+    ref_mels_padded = (torch.randn((len(ref_mels), 80, 500)))*1e-8
+    for i,mel in enumerate(ref_mels):
+        ref_mels_padded[i, :, :mel.size(1)] = mel
+    return ref_mels_padded.unsqueeze(0)
+def get_mel(filepath):
+    audio, sampling_rate = load_wav_to_torch(filepath)
+    audio_norm = audio / config.MAX_WAV_VALUE
+    audio_norm = audio_norm.unsqueeze(0)
+    y = torch.autograd.Variable(audio_norm, requires_grad=False)
+    assert(torch.min(y.data) >= -1)
+    assert(torch.max(y.data) <= 1)
+    magnitudes, phases = stft_fn.transform(y)
+    magnitudes = magnitudes.data
+    mel_output = torch.matmul(mel_basis, magnitudes)
+    mel_output = dynamic_range_compression(mel_output)
+    melspec = torch.squeeze(mel_output, 0)
+    energy = torch.norm(magnitudes, dim=1).squeeze(0)
+    return melspec,list(energy)
+def infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder):
+    '''
+    Generate audio from the given text using a text-to-speech (TTS) pipeline.
+    Args:
+    text (str): The input text to be synthesized into speech.
+    ref_clips (list): A list of paths to reference audio clips, preferably more than 3 clips.
+    diffuser (object): A diffusion object used for denoising and guidance in the diffusion model. It should be obtained using load_diffuser.
+    diff_model: diffusion model for semantic-to-acoustic tokens.
+    ts_model: text-to-semantic model for converting text-to-semantic tokens.
+    vocoder: vocoder model for generating waveform from acoustic tokens.
+    Returns:
+    audio (numpy.ndarray): Generated audio waveform.
+    sampling_rate (int): Sampling rate of the generated audio.
+    Description:
+    The `infer_tts` function takes input text and reference audio clips, and processes them through a TTS pipeline.
+    It first performs text preprocessing and generates semantic tokens using the specified text synthesis model.
+    Then, it infers mel-spectrogram features using the diffusion model and the provided diffuser.
+    Finally, it generates audio from the mel-spectrogram using the vocoder.
+    Note: The function requires properly configured diff_model, ts_model, and vocoder objects for successful TTS.
+    Example usage:
+    audio, sampling_rate = infer_tts("Hello, how are you?", ref_clips, diffuser, diff_model, ts_model, vocoder)
+    '''
+    text = english_cleaners(text)
+    ref_mels = get_ref_mels(ref_clips)
+    with torch.no_grad():
+        sem_tok,_ = generate_semantic_tokens(
+                        text,
+                        ts_model,
+                        ref_mels,
+                        temp = 0.7,
+                        top_p= 0.8,
+                        top_k= 5,
+                        n_tot_steps = 1000,
+                        device = None
+                    )
+        mel = infer_mel(diff_model,int(((sem_tok.shape[-1] * 320 / 16000) * 22050/256)+1),sem_tok.unsqueeze(0) + 1,
+                        ref_mels,diffuser,temperature=1.0)
+        audio = infer_wav(mel,vocoder)
+    return audio,config.sampling_rate
+def load_diffuser(timesteps = 100, gudiance=3):
+    '''
+    Load and configure a diffuser for denoising and guidance in the diffusion model.
+    Args:
+    timesteps (int): Number of denoising steps out of 1000. Default is 100.
+    guidance (int): Conditioning-free guidance parameter. Default is 3.
+    Returns:
+    diffuser (object): Configured diffuser object for use in the diffusion model.
+    Description:
+    The `load_diffuser` function initializes a diffuser with specific settings for denoising and guidance.
+    '''
+    betas = get_named_beta_schedule('cosine',config.sa_timesteps_max)
+    diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [timesteps]), model_mean_type='epsilon',
+                        model_var_type='learned_range', loss_type='rescaled_mse', betas=betas,
+                        conditioning_free=True, conditioning_free_k=gudiance)
+    diffuser.training=False
+    return diffuser
+if __name__ == '__main__':
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(device)
+    text = 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition.'
+    ref_clips = glob.glob('/Users/jaskaransingh/Desktop/maha_tts/ref_clips/*.wav')
+    checkpoint_diff = 'maha_tts/pretrained_models/S2A/s2a_latest.pt'
+    checkpoint_ts = 'maha_tts/pretrained_models/T2S/t2s_best.pt'
+    checkpoint_voco = 'maha_tts/pretrained_models/hifigan/g_02500000'
+    voco_config_path = 'maha_tts/pretrained_models/hifigan/config.json'
+    diffuser = load_diffuser()
+    diff_model,ts_model,vocoder = load_models(device,checkpoint_diff,checkpoint_ts,checkpoint_voco,voco_config_path)
+    audio,sr = infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder)
+    write('test.wav',sr,audio)

maha_tts/models/__init__.py ADDED Viewed

File without changes

maha_tts/models/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (177 Bytes). View file

maha_tts/models/__pycache__/autoregressive.cpython-311.pyc ADDED Viewed

Binary file (9.89 kB). View file

maha_tts/models/__pycache__/diff_model.cpython-311.pyc ADDED Viewed

Binary file (18.9 kB). View file

maha_tts/models/__pycache__/modules.cpython-311.pyc ADDED Viewed

Binary file (28.6 kB). View file

maha_tts/models/__pycache__/vocoder.cpython-311.pyc ADDED Viewed

Binary file (22.8 kB). View file

maha_tts/models/autoregressive.py ADDED Viewed

	@@ -0,0 +1,135 @@

+'''
+Inspiration taken from https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/models/autoregressive.py
+'''
+import os,sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import functools
+from typing import Any
+from torch.utils.data import Dataset,DataLoader
+from transformers import GPT2Tokenizer,GPT2Config, GPT2Model, GPT2LMHeadModel
+from tqdm import tqdm
+from maha_tts.config import config
+from maha_tts.text.symbols import labels,code_labels,text_labels
+from maha_tts.models.modules import GST
+def null_position_embeddings(range, dim):
+    return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device)
+class TS_model(nn.Module):
+    def __init__(self,n_embed = 512, n_layer = 16, n_head = 8):
+        super(TS_model,self).__init__()
+        self.vocab_size=len(labels)
+        self.n_positions=config.t2s_position
+        self.n_embed=n_embed
+        self.n_layer=n_layer
+        self.n_head=n_head
+        self.config = GPT2Config(vocab_size=self.vocab_size,n_positions=self.n_positions,n_embd=self.n_embed,n_layer=self.n_layer,n_head=self.n_head)
+        self.gpt = GPT2Model(self.config)
+        del self.gpt.wpe
+        self.gpt.wpe = functools.partial(null_position_embeddings, dim=self.n_embed)
+        # Built-in token embeddings are unused.
+        del self.gpt.wte
+        self.GST = GST(model_channels=self.n_embed,num_heads=self.n_head,in_channels=config.n_mel_channels,k=1)
+        self.text_head = nn.Linear(self.n_embed,len(text_labels))
+        self.code_head = nn.Linear(self.n_embed,len(code_labels))
+        self.text_positional_embed = LearnedPositionEmbeddings(self.n_positions,self.n_embed)
+        self.code_positional_embed = LearnedPositionEmbeddings(self.n_positions,self.n_embed)
+        self.text_embed = nn.Embedding(len(text_labels),self.n_embed)
+        self.code_embed = nn.Embedding(len(code_labels),self.n_embed)
+        self.final_norm = nn.LayerNorm(self.n_embed)
+    def get_speaker_latent(self, ref_mels):
+        ref_mels = ref_mels.unsqueeze(1) if len(
+            ref_mels.shape) == 3 else ref_mels
+        conds = []
+        for j in range(ref_mels.shape[1]):
+            conds.append(self.GST(ref_mels[:, j,:,:]))
+        conds = torch.cat(conds, dim=-1)
+        conds = conds.mean(dim=-1)
+        return conds.unsqueeze(1)
+    def forward(self,text_ids,codes_ids = None,speaker_embed=None,ref_clips=None,return_loss = False):
+        assert speaker_embed is not None or ref_clips is not None
+        text_embed = self.text_embed(text_ids)
+        text_embed += self.text_positional_embed(text_embed)
+        code_embed = None
+        code_probs= None
+        if codes_ids is not None:
+            code_embed = self.code_embed(codes_ids)
+            code_embed+= self.code_positional_embed(code_embed)
+        if ref_clips is not None:
+            speaker_embed = self.get_speaker_latent(ref_clips)
+        text_embed,code_embed = self.get_logits(speaker_embed=speaker_embed,text_embed=text_embed,code_embed=code_embed)
+        text_probs = self.text_head(text_embed).permute(0,2,1)
+        if codes_ids is not None:
+            code_probs = self.code_head(code_embed).permute(0,2,1)
+        if return_loss:
+            loss_text = F.cross_entropy(text_probs[:,:,:-1], text_ids[:,1:].long(), reduce=False)
+            loss_mel = F.cross_entropy(code_probs[:,:,:-1], codes_ids[:,1:].long(), reduce=False)
+            return loss_text,loss_mel,code_probs
+        return text_probs,code_probs
+    def get_logits(self,speaker_embed,text_embed,code_embed=None):
+        if code_embed is not None:
+            embed = torch.cat([speaker_embed,text_embed,code_embed],dim=1)
+        else:
+            embed = torch.cat([speaker_embed,text_embed],dim=1)
+        gpt_output = self.gpt(inputs_embeds=embed, return_dict=True)
+        enc = gpt_output.last_hidden_state[:, 1:]
+        enc = self.final_norm(enc)
+        if code_embed is not None:
+            return enc[:,:text_embed.shape[1]],enc[:,-code_embed.shape[1]:]
+        return enc[:,:text_embed.shape[1]],None
+class LearnedPositionEmbeddings(nn.Module):
+    '''Trainable table of position vectors, one row per position index below seq_len.'''
+    def __init__(self, seq_len, model_dim, init=.02):
+        super().__init__()
+        self.emb = nn.Embedding(seq_len, model_dim)
+        # Normal(0, init) initialisation, the usual GPT-2 choice.
+        with torch.no_grad():
+            self.emb.weight.normal_(mean=0.0, std=init)
+    def forward(self, x):
+        '''Return the embeddings for positions 0..x.shape[1]-1 (no batch dimension; broadcast by the caller).'''
+        positions = torch.arange(0, x.shape[1], device=x.device)
+        return self.emb(positions)
+    def get_fixed_embedding(self, ind, dev):
+        '''Return the embedding of the single position `ind` with two leading singleton dimensions.'''
+        index = torch.tensor([ind], device=dev)
+        return self.emb(index).unsqueeze(0)
+def load_TS_model(checkpoint,device):
+    '''
+    Build a TS_model (n_embed=512, n_layer=16, n_head=8), load weights from `checkpoint`,
+    switch it to eval mode and move it to `device`.
+    The state dict is loaded on CPU first and with strict=False, so missing or unexpected
+    keys are not reported as errors.
+    '''
+    sem_model = TS_model(n_embed = 512, n_layer = 16, n_head = 8)
+    weights = torch.load(checkpoint,map_location=torch.device('cpu'))
+    sem_model.load_state_dict(weights,strict=False)
+    sem_model.eval()
+    sem_model.to(device)
+    return sem_model
+if __name__ == '__main__':
+    # Smoke test: push one random batch through a small TS_model and request the losses.
+    batch,width = 5,256
+    model=TS_model(n_embed = width, n_layer = 6, n_head = 4)
+    text_ids = torch.randint(0,100,(batch,20))
+    code_ids = torch.randint(0,100,(batch,200))
+    speaker_embed = torch.randn((batch,1,width))
+    output=model(text_ids=text_ids,speaker_embed=speaker_embed,codes_ids=code_ids,return_loss=True)

maha_tts/models/diff_model.py ADDED Viewed

	@@ -0,0 +1,303 @@

+'''
+inspiration taken from https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/models/diffusion_decoder.py
+'''
+import sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+from maha_tts.config import config
+from torch import autocast
+from maha_tts.models.modules import QuartzNetBlock,AttentionBlock,mySequential,normalization,SCBD,SqueezeExcite,GST
+def timestep_embedding(timesteps, dim, max_period=10000):
+    """
+    Create sinusoidal timestep embeddings.
+    :param timesteps: a Tensor of indices whose first axis is the batch, e.g. shape [N] or [N, 1].
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: a Tensor of positional embeddings whose last axis has size dim:
+             [N x dim] for 1-D input, [N x 1 x dim] for [N, 1] input.
+    """
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+    ).to(device=timesteps.device)
+    args = timesteps[:, None].float() * freqs[None]
+    # cos block first, then sin block, along the feature axis.
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        # Odd dim: pad one zero feature. Slice the LAST axis so inputs with extra axes
+        # (e.g. [N, 1]) are padded by one column instead of a whole copy of the embedding.
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[..., :1])], dim=-1)
+    return embedding
+class TimestepBlock(nn.Module):
+    """
+    Base class for modules whose forward() takes a timestep embedding as a second argument.
+    TimestepEmbedSequential uses isinstance checks against this class to decide which
+    children receive the embedding.
+    """
+    def forward(self, x, emb):
+        """
+        Apply the module to `x` given `emb` timestep embeddings.
+        The base implementation has no body and returns None; subclasses override it.
+        """
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    """
+    Sequential container that forwards the timestep embedding only to children
+    that are TimestepBlock instances; every other child is called with x alone.
+    """
+    def forward(self, x, emb):
+        out = x
+        for child in self:
+            out = child(out, emb) if isinstance(child, TimestepBlock) else child(out)
+        return out
+class QuartzNetBlock(TimestepBlock):
+    '''Residual convolution block conditioned on a timestep embedding.
+    Layout: 1x1 conv + normalization (in_layers), timestep conditioning, R SCBD blocks
+    (the last one without activation/dropout), optional squeeze-excite, a 1x1 conv + normalization
+    residual branch taken from the block input, then SiLU + dropout.
+    if its the last layer,set se = False and separable = False, and use a projection layer on top of this.
+    nin/nout: input/output channels. emb_channels: width of the timestep embedding.
+    use_scale_shift_norm: True -> embedding is projected to 2*nout and applied as x*(1+scale)+shift;
+    False -> embedding is projected to nout and added to the features.
+    NOTE: this class shadows the QuartzNetBlock name imported from maha_tts.models.modules at the top of the file.
+    '''
+    def __init__(self,nin,nout,emb_channels,kernel_size=3,dropout=0.1,R=1,se=True,ratio=8,separable=False,bias=True,use_scale_shift_norm=True):
+        super(QuartzNetBlock,self).__init__()
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.se=se
+        self.in_layers = mySequential(
+            nn.Conv1d(nin,nout,kernel_size=1,padding='same',bias=bias),
+            normalization(nout) #nn.BatchNorm1d(nout,eps)
+        )
+        self.residual=mySequential(
+            nn.Conv1d(nin,nout,kernel_size=1,padding='same',bias=bias),
+            normalization(nout) #nn.BatchNorm1d(nout,eps)
+        )
+        # in_layers already mapped nin -> nout, so everything below works on nout channels.
+        nin=nout
+        model=[]
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(
+                emb_channels,
+                2 * nout if use_scale_shift_norm else nout,
+            ),
+        )
+        for i in range(R-1):
+            model.append(SCBD(nin,nout,kernel_size,dropout,bias=bias))
+            nin=nout
+        # Final SCBD has rd=False: normalization only, activation/dropout come from self.mout.
+        if separable:
+            model.append(SCBD(nin,nout,kernel_size,dropout,rd=False,bias=bias))
+        else:
+            model.append(SCBD(nin,nout,kernel_size,dropout,rd=False,separable=False,bias=bias))
+        self.model=mySequential(*model)
+        if self.se:
+            self.se_layer=SqueezeExcite(nin,ratio)
+        self.mout= mySequential(nn.SiLU(),nn.Dropout(dropout))
+    def forward(self,x,emb,mask=None):
+        '''
+        x: features with channels on dim 1. emb: timestep embedding. mask: only forwarded to the
+        squeeze-excite layer; the convolutions are not masked.
+        Returns the block output (a tensor, not a (tensor, mask) pair).
+        '''
+        x_new=self.in_layers(x)
+        emb = self.emb_layers(emb)
+        # Append trailing singleton axes so the embedding broadcasts over time.
+        while len(emb.shape) < len(x_new.shape):
+            emb = emb[..., None]
+        if self.use_scale_shift_norm:
+            scale, shift = torch.chunk(emb, 2, dim=1)
+            x_new = x_new * (1 + scale) + shift
+        else:
+            # emb_layers emits nout channels in this mode, so chunking would halve the
+            # channel count and fail to broadcast; add the embedding instead.
+            x_new = x_new + emb
+        y,_=self.model(x_new)
+        if self.se:
+            y,_=self.se_layer(y,mask)
+        y+=self.residual(x)
+        y=self.mout(y)
+        return y
+class QuartzAttn(TimestepBlock):
+    '''A timestep-conditioned QuartzNetBlock followed by an AttentionBlock, both at model_channels width.'''
+    def __init__(self, model_channels, dropout, num_heads):
+        super().__init__()
+        width = model_channels
+        self.resblk = QuartzNetBlock(width, width, width,dropout=dropout,use_scale_shift_norm=True)
+        self.attn = AttentionBlock(width, num_heads, relative_pos_embeddings=True)
+    def forward(self, x, time_emb):
+        conditioned = self.resblk(x, time_emb)
+        return self.attn(conditioned)
+class QuartzNet9x5(nn.Module):
+    '''
+    Main trunk: one R=3 QuartzNetBlock, then nine (QuartzNetBlock with R=5, AttentionBlock) pairs
+    with growing kernel sizes, then a 1x1 convolution. All layers keep model_channels channels.
+    enable_fp16 is stored but not used by forward.
+    '''
+    def __init__(self,model_channels,num_heads,enable_fp16=False):
+        super(QuartzNet9x5,self).__init__()
+        self.enable_fp16 = enable_fp16
+        self.conv1=QuartzNetBlock(model_channels,model_channels,model_channels,kernel_size=3,dropout=0.1,R=3)
+        blocks,attentions=[],[]
+        # Each block is created right before its attention layer, one pair per kernel size.
+        for width in (5,7,9,13,15,17,21,23,25):
+            blocks.append(QuartzNetBlock(model_channels,model_channels,model_channels,kernel_size=width,dropout=0.1,R=5,se=True))
+            attentions.append(AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True))
+        self.quartznet= nn.ModuleList(blocks)
+        self.attn = nn.ModuleList(attentions)
+        self.conv3=nn.Conv1d(model_channels, model_channels, 1, padding='same')
+    def forward(self, x, time_emb):
+        '''Apply the trunk to x, feeding time_emb to every QuartzNetBlock.'''
+        x = self.conv1(x,time_emb)
+        for block,attention in zip(self.quartznet,self.attn):
+            x = attention(block(x,time_emb))
+        # Cast to float32 before the output projection.
+        return self.conv3(x.float())
+class DiffModel(nn.Module):
+    '''
+    Denoising network: takes a noisy input x, a timestep t and conditioning codes, and
+    returns a tensor with output_channels channels.
+    Conditioning path: codes -> code_converter -> code_norm -> optional speaker scale/shift ->
+    interpolation to the length of x -> code_time (timestep-conditioned blocks) -> code_time_norm.
+    The result is added to the projected, normalised x and run through the QuartzNet9x5 trunk.
+    ar_active=False: code_emb holds integer ids looked up in an embedding table.
+    ar_active=True: code_emb is a tensor with in_latent_channels channels fed to a Conv1d.
+    '''
+    def __init__(
+        self,
+        input_channels=80,
+        output_channels=160,
+        model_channels=512,
+        num_heads=8,
+        dropout=0.0,
+        multispeaker = True,
+        condition_free_per=0.1,
+        training = False,
+        ar_active = False,
+        in_latent_channels = 10004
+    ):
+        super().__init__()
+        self.input_channels = input_channels
+        self.model_channels = model_channels
+        self.output_channels = output_channels
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.condition_free_per = condition_free_per
+        # NOTE(review): `training` is the same attribute nn.Module.train()/eval() toggle, so any
+        # later .train()/.eval() call overwrites this value. Submodules created below keep their
+        # own (default) training flag until one of those calls is made.
+        self.training = training
+        self.multispeaker = multispeaker
+        self.ar_active = ar_active
+        self.in_latent_channels = in_latent_channels
+        if not self.ar_active:
+            # Discrete codes: embedding table with one extra entry beyond config.semantic_model_centroids.
+            self.code_emb = nn.Embedding(config.semantic_model_centroids+1,model_channels)
+            self.code_converter = mySequential(
+                AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+                AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+                AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+            )
+        else:
+            # Continuous latents: project in_latent_channels -> model_channels first.
+            self.code_converter = mySequential(
+                nn.Conv1d(self.in_latent_channels, model_channels, 3, padding=1),
+                AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+                AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+                AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+                AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+            )
+        if self.multispeaker:
+            # Reference encoder used by get_speaker_latent.
+            self.GST = GST(model_channels,num_heads)
+        self.code_norm = normalization(model_channels)
+        self.time_norm = normalization(model_channels)
+        self.noise_norm = normalization(model_channels)
+        self.code_time_norm = normalization(model_channels)
+        # self.code_latent = []
+        self.time_embed = mySequential(
+            nn.Linear(model_channels, model_channels),
+            nn.SiLU(),
+            nn.Linear(model_channels, model_channels),)
+        self.input_block = nn.Conv1d(input_channels,model_channels,3,1,1)
+        # Learned stand-in for the code embedding when conditioning is dropped.
+        self.unconditioned_embedding = nn.Parameter(torch.randn(1,model_channels,1))
+        self.code_time = TimestepEmbedSequential(QuartzAttn(model_channels, dropout, num_heads),QuartzAttn(model_channels, dropout, num_heads),QuartzAttn(model_channels, dropout, num_heads))
+        self.layers = QuartzNet9x5(model_channels,num_heads)
+        self.out = nn.Sequential(
+            normalization(model_channels),
+            nn.SiLU(),
+            nn.Conv1d(model_channels, output_channels, 3, padding=1),
+        )
+    def get_speaker_latent(self, ref_mels):
+        '''
+        Encode reference clips with self.GST and average the result over time and over clips.
+        A 3-D input is treated as a single clip per sample (a clip axis is inserted at dim 1);
+        otherwise dim 1 indexes the clips. Returns a tensor with a trailing singleton axis.
+        '''
+        ref_mels = ref_mels.unsqueeze(1) if len(
+            ref_mels.shape) == 3 else ref_mels
+        conds = []
+        for j in range(ref_mels.shape[1]):
+            conds.append(self.GST(ref_mels[:, j,:,:]))
+        # Clips are concatenated along the last axis, so the mean weights every frame equally
+        # (longer encoded clips contribute more than shorter ones).
+        conds = torch.cat(conds, dim=-1)
+        conds = conds.mean(dim=-1)
+        return conds.unsqueeze(2)
+    def forward(self ,x,t,code_emb,ref_clips=None,speaker_latents=None,conditioning_free=False):
+        '''
+        x: noisy input with input_channels channels on dim 1.
+        t: timesteps, one per batch element.
+        code_emb: conditioning codes (ids or latents, see the class docstring).
+        ref_clips / speaker_latents: speaker conditioning; at least one is required when multispeaker.
+            ref_clips takes precedence and is encoded with get_speaker_latent.
+        conditioning_free: use the learned unconditioned embedding instead of code_emb.
+        Returns self.out applied to the trunk output (output_channels channels).
+        '''
+        # t is given a trailing axis, embedded, projected, normalised with channels on dim 1 and squeezed again.
+        time_embed = self.time_norm(self.time_embed(timestep_embedding(t.unsqueeze(-1),self.model_channels)).permute(0,2,1)).squeeze(2)
+        if conditioning_free:
+            code_embed = self.unconditioned_embedding.repeat(x.shape[0], 1, x.shape[-1])
+        else:
+            if not self.ar_active:
+                code_embed = self.code_norm(self.code_converter(self.code_emb(code_emb).permute(0,2,1)))
+            else:
+                code_embed = self.code_norm(self.code_converter(code_emb))
+        if self.multispeaker:
+            assert speaker_latents is not None or ref_clips is not None
+            if ref_clips is not None:
+                speaker_latents = self.get_speaker_latent(ref_clips)
+            # The speaker latent is split in half on the channel axis into a scale and a shift.
+            # NOTE(review): this is applied on the conditioning_free path as well, whereas the
+            # training-time dropout below replaces code_embed AFTER the speaker modulation - confirm intended.
+            cond_scale, cond_shift = torch.chunk(speaker_latents, 2, dim=1)
+            code_embed = code_embed * (1 + cond_scale) + cond_shift
+        if self.training and self.condition_free_per > 0:
+            # Randomly drop the conditioning for a fraction condition_free_per of the batch.
+            unconditioned_batches = torch.rand((code_embed.shape[0], 1, 1),
+                                               device=code_embed.device) < self.condition_free_per
+            code_embed = torch.where(unconditioned_batches, self.unconditioned_embedding.repeat(code_embed.shape[0], 1, 1),
+                                   code_embed)
+        # Stretch the conditioning to the time length of x.
+        expanded_code_emb = F.interpolate(code_embed, size=x.shape[-1], mode='nearest') #try different modes
+        x_cond = self.code_time_norm(self.code_time(expanded_code_emb,time_embed))
+        x = self.noise_norm(self.input_block(x))
+        x += x_cond
+        x = self.layers(x, time_embed)
+        out = self.out(x)
+        return out
+def load_diff_model(checkpoint,device,model_channels=512,ar_active=False,len_code_labels=10004):
+    '''
+    Build a DiffModel, load its weights from `checkpoint` and return it in eval mode on `device`.
+    checkpoint: path (or file-like object) accepted by torch.load.
+    model_channels: width of the model; must match the checkpoint (default 512).
+    ar_active / len_code_labels: forwarded to DiffModel as ar_active / in_latent_channels.
+    The state dict is loaded on CPU first and with strict=True, so any key or shape
+    mismatch raises.
+    '''
+    diff_model = DiffModel(input_channels=80,
+                 output_channels=160,
+                 # Previously hard-coded to 512, which silently ignored the argument.
+                 model_channels=model_channels,
+                 num_heads=8,
+                 dropout=0.15,
+                 condition_free_per=0.15,
+                 multispeaker=True,
+                 training=False,
+                 ar_active=ar_active,
+                 in_latent_channels = len_code_labels)
+    diff_model.load_state_dict(torch.load(checkpoint,map_location=torch.device('cpu')),strict=True)
+    diff_model=diff_model.eval().to(device)
+    return diff_model
+if __name__ == '__main__':
+    # Smoke test: print a torchinfo summary of DiffModel for one random batch.
+    device = torch.device('cpu')
+    # DiffModel.__init__ has no num_layers / enable_fp16 parameters; passing them raised TypeError.
+    diff_model = DiffModel(input_channels=80,
+                 output_channels=160,
+                 model_channels=1024,
+                 num_heads=8,
+                 dropout=0.1,
+                 condition_free_per=0.1,
+                 multispeaker=True,
+                 training=True).to(device)
+    batch_Size = 32
+    timeseries = 800
+    from torchinfo import summary
+    summary(diff_model, input_data={'x': torch.randn(batch_Size, 80, timeseries).to(device),
+    'ref_clips': torch.randn(batch_Size,3, 80, timeseries).to(device),
+    # Random timesteps instead of an uninitialised LongTensor.
+    't':torch.randint(0,1000,(batch_Size,)).to(device),
+    'code_emb':torch.randint(0,201,(batch_Size,timeseries)).to(device)})

maha_tts/models/modules.py ADDED Viewed

	@@ -0,0 +1,406 @@

+import torch,math
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.nn.init as init
+from einops import rearrange, repeat
+def zero_module(module):
+    """
+    Set every parameter of `module` to zero in place and return the same module.
+    Used for zero-initialised convolutions.
+    """
+    with torch.no_grad():
+        for param in module.parameters():
+            param.zero_()
+    return module
+class GroupNorm32(nn.GroupNorm):
+    '''GroupNorm that always computes in float32 and casts the result back to the input dtype.'''
+    def forward(self, x):
+        source_dtype = x.dtype
+        normed = super().forward(x.float())
+        return normed.type(source_dtype)
+def normalization(channels):
+    """
+    Make a standard normalization layer.
+    Starts from 8 groups (channels <= 16), 16 groups (channels <= 64) or 32 groups, and halves
+    the group count until it divides `channels`. The final count must be greater than 2.
+    :param channels: number of input channels.
+    :return: a GroupNorm32 module for normalization.
+    """
+    if channels <= 16:
+        groups = 8
+    elif channels <= 64:
+        groups = 16
+    else:
+        groups = 32
+    while channels % groups:
+        groups //= 2
+    assert groups > 2
+    return GroupNorm32(groups, channels)
+class mySequential(nn.Sequential):
+    '''Sequential container whose layers may take and return several values (e.g. x and a mask).
+    A plain tuple result is unpacked into the next layer's arguments; anything else is passed as one argument.
+    '''
+    def forward(self, *inputs):
+        current = inputs
+        for layer in self._modules.values():
+            # Exact tuple check on purpose: tuple subclasses are passed through as a single argument.
+            current = layer(*current) if type(current) is tuple else layer(current)
+        return current
+class SepConv1D(nn.Module):
+    '''Depthwise-separable 1-D convolution with an optional input mask.
+    forward returns (output, mask) so it can be chained inside mySequential.
+    '''
+    def __init__(self,nin,nout,kernel_size,stride=1,dilation=1,padding_mode='same',bias=True):
+        super(SepConv1D,self).__init__()
+        # Depthwise stage: one filter per input channel (groups=nin).
+        self.conv1=nn.Conv1d(nin, nin, kernel_size=kernel_size, stride=stride,groups=nin,dilation=dilation,padding=padding_mode,bias=bias)
+        # Pointwise stage: 1x1 convolution mixing channels nin -> nout.
+        self.conv2=nn.Conv1d(nin,nout,kernel_size=1,stride=1,padding=padding_mode,bias=bias)
+    def forward(self,x,mask=None):
+        if mask is not None:
+            # Mask is broadcast over the channel axis.
+            x = x * mask.unsqueeze(1).to(device=x.device)
+        return self.conv2(self.conv1(x)),mask
+class Conv1DBN(nn.Module):
+    '''Conv1d -> BatchNorm1d -> ReLU -> Dropout with an optional input mask; returns (output, mask).'''
+    def __init__(self,nin,nout,kernel_size,stride=1,dilation=1,dropout=0.1,padding_mode='same',bias=False):
+        super(Conv1DBN,self).__init__()
+        self.conv1=nn.Conv1d(nin, nout, kernel_size=kernel_size, stride=stride,padding=padding_mode,dilation=dilation,bias=bias)
+        self.bn=nn.BatchNorm1d(nout)
+        self.drop=nn.Dropout(dropout)
+    def forward(self,x,mask=None):
+        if mask is not None:
+            # Mask is broadcast over the channel axis.
+            x = x * mask.unsqueeze(1).to(device=x.device)
+        activated = F.relu(self.bn(self.conv1(x)))
+        return self.drop(activated),mask
+class Conv1d(nn.Module):
+    '''Plain nn.Conv1d wrapped to accept a mask and return (output, mask).
+    Unlike the sibling layers, `mask` has no default and must be passed (None is allowed).
+    '''
+    def __init__(self,nin,nout,kernel_size,padding,bias=True):
+        super(Conv1d,self).__init__()
+        self.l=nn.Conv1d(nin,nout,kernel_size,padding=padding,bias=bias)
+    def forward(self,x,mask):
+        if mask is not None:
+            # Mask is broadcast over the channel axis.
+            x = x * mask.unsqueeze(1).to(device=x.device)
+        return self.l(x),mask
+class SqueezeExcite(nn.Module):
+    '''Let the CNN decide how to add across channels.
+    The input is averaged over the unmasked time steps, passed through a two-layer bottleneck
+    (nin -> nin//ratio -> nin) and a sigmoid, and the resulting per-channel gates rescale the input.
+    '''
+    def __init__(self,nin,ratio=8):
+        super(SqueezeExcite,self).__init__()
+        self.nin=nin
+        self.ratio=ratio
+        self.fc=mySequential(
+            nn.Linear(nin,nin//ratio,bias=True),nn.SiLU(inplace=True),nn.Linear(nin//ratio,nin,bias=True)
+        )
+    def forward(self,x,mask=None):
+        '''
+        x: features with channels on dim 1 and time on the last axis.
+        mask: optional boolean tensor of shape (batch, time), True for valid steps; defaults to all True.
+        Returns (gated features as float32 with masked steps zeroed, mask).
+        '''
+        if mask is None:
+            mask = torch.ones((x.shape[0],x.shape[-1]),dtype=torch.bool).to(x.device)
+        # Out-of-place fill: x.float() returns the caller's own tensor when it is already
+        # float32, so the former in-place masked_fill_ silently modified the input.
+        x=x.float().masked_fill(~mask.unsqueeze(1), 0.0)
+        # Mean over valid steps only: sum of the zero-filled features / number of valid steps.
+        y = (torch.sum(x, dim=-1, keepdim=True) / mask.unsqueeze(1).sum(dim=-1, keepdim=True)).type(x.dtype)
+        # y=torch.mean(x,-1,keepdim=True)
+        y=y.transpose(1, -1)
+        y=self.fc(y)
+        y=torch.sigmoid(y)
+        y=y.transpose(1, -1)
+        y= x * y
+        return y,mask
+class SCBD(nn.Module):
+    '''Convolution (separable or plain) followed by normalization and, when rd is True, SiLU + dropout.
+    Generally used for middle layers and residual blocks. forward returns (output, mask).
+    '''
+    def __init__(self,nin,nout,kernel_size,p=0.1,rd=True,separable=True,bias=True):
+        super(SCBD,self).__init__()
+        self.SC = SepConv1D(nin,nout,kernel_size,bias=bias) if separable else Conv1d(nin,nout,kernel_size,padding='same',bias=bias)
+        norm = normalization(nout) # nn.BatchNorm1d(nout,eps)
+        # rd selects whether the activation and dropout follow the normalization.
+        self.mout = mySequential(norm,nn.SiLU(),nn.Dropout(p)) if rd else norm
+    def forward(self,x,mask=None):
+        if mask is not None:
+            # Mask is broadcast over the channel axis.
+            x = x * mask.unsqueeze(1).to(device=x.device)
+        conv_out,_ = self.SC(x,mask)
+        return self.mout(conv_out),mask
+class QuartzNetBlock(nn.Module):
+    '''Similar to a Resnet block with normalization and dropout: R SCBD layers (the last one without
+    activation/dropout), optional squeeze-excite, plus a 1x1 conv + normalization residual branch,
+    followed by SiLU + dropout.
+    if its the last layer,set se = False and separable = False, and use a projection layer on top of this.
+    Only the final SCBD honours `separable`; the first R-1 use SCBD's default (separable) convolution.
+    forward returns (output, mask).
+    '''
+    def __init__(self,nin,nout,kernel_size,dropout=0.1,R=5,se=False,ratio=8,separable=False,bias=True):
+        super(QuartzNetBlock,self).__init__()
+        self.se=se
+        self.residual=mySequential(
+            nn.Conv1d(nin,nout,kernel_size=1,padding='same',bias=bias),
+            normalization(nout) #nn.BatchNorm1d(nout,eps)
+        )
+        model=[]
+        # SCBD takes no `eps` argument; passing eps=0.001 made construction raise TypeError.
+        for i in range(R-1):
+            model.append(SCBD(nin,nout,kernel_size,dropout,bias=bias))
+            nin=nout
+        if separable:
+            model.append(SCBD(nin,nout,kernel_size,dropout,rd=False,bias=bias))
+        else:
+            model.append(SCBD(nin,nout,kernel_size,dropout,rd=False,separable=False,bias=bias))
+        self.model=mySequential(*model)
+        if self.se:
+            self.se_layer=SqueezeExcite(nin,ratio)
+        self.mout= mySequential(nn.SiLU(),nn.Dropout(dropout))
+    def forward(self,x,mask=None):
+        if mask is not None:
+            # Mask is broadcast over the channel axis.
+            x = x * mask.unsqueeze(1).to(device=x.device)
+        y,_=self.model(x,mask)
+        if self.se:
+            y,_=self.se_layer(y,mask)
+        y+=self.residual(x)
+        y=self.mout(y)
+        return y,mask
+class QKVAttentionLegacy(nn.Module):
+    """
+    A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
+    """
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv, mask=None, rel_pos=None):
+        """
+        Apply QKV attention.
+        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
+        :param mask: optional [N x T] tensor multiplied onto the attention weights over the key axis
+                     (0 removes a key position).
+        :param rel_pos: optional callable applied to the [N x H x T x T] attention logits
+                        before the softmax (e.g. a relative position bias).
+        :return: an [N x (H * C) x T] tensor after attention.
+        """
+        bs, width, length = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        ch = width // (3 * self.n_heads)
+        # Heads are folded into the batch axis batch-major: row index = sample * n_heads + head.
+        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = torch.einsum(
+            "bct,bcs->bts", q * scale, k * scale
+        )  # More stable with f16 than dividing afterwards
+        if rel_pos is not None:
+            weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape(bs * self.n_heads, weight.shape[-2], weight.shape[-1])
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        if mask is not None:
+            # The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs.
+            # repeat_interleave keeps each sample's mask next to that sample's heads; a plain
+            # repeat() tiles the batch cyclically and pairs heads with other samples' masks.
+            mask = mask.repeat_interleave(self.n_heads, dim=0).unsqueeze(1)
+            weight = weight * mask
+        a = torch.einsum("bts,bcs->bct", weight, v)
+        return a.reshape(bs, -1, length)
+class AttentionBlock(nn.Module):
+    """
+    An attention block that allows spatial positions to attend to each other.
+    Originally ported from here, but adapted to the N-d case.
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+    Residual self-attention: normalization -> 1x1 qkv conv -> QKVAttentionLegacy -> 1x1 output conv,
+    added back onto the input. The output conv is zero-initialised, so a fresh block is the identity.
+    `do_checkpoint` is stored but not used by forward.
+    """
+    def __init__(
+        self,
+        channels,
+        num_heads=1,
+        num_head_channels=-1,
+        do_checkpoint=True,
+        relative_pos_embeddings=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.do_checkpoint = do_checkpoint
+        # Either take the head count directly or derive it from a per-head channel width.
+        if num_head_channels == -1:
+            self.num_heads = num_heads
+        else:
+            assert (
+                channels % num_head_channels == 0
+            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+            self.num_heads = channels // num_head_channels
+        self.norm = normalization(channels)
+        self.qkv = nn.Conv1d(channels, channels * 3, 1)
+        # split heads before split qkv
+        self.attention = QKVAttentionLegacy(self.num_heads)
+        self.proj_out = zero_module(nn.Conv1d(channels, channels, 1)) # no effect of attention in the inital stages.
+        # NOTE(review): the relative_pos_embeddings argument is ignored - the bias below is always created.
+        # NOTE(review): heads=num_heads uses the raw argument, not self.num_heads; the two differ when
+        # num_head_channels is given. Changing either would alter the parameters stored in checkpoints.
+        # if relative_pos_embeddings:
+        self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64) #need to read about this, vit and swin transformers
+        # self.relative_pos_embeddings = FixedPositionalEmbedding(dim=channels)
+        # else:
+        # self.relative_pos_embeddings = None
+    def forward(self, x, mask=None):
+        # Flatten all trailing axes into one sequence axis, attend, then restore the shape.
+        b, c, *spatial = x.shape
+        x = x.reshape(b, c, -1)
+        qkv = self.qkv(self.norm(x))
+        h = self.attention(qkv, mask, self.relative_pos_embeddings)
+        h = self.proj_out(h)
+        return (x + h).reshape(b, c, *spatial)
+class AbsolutePositionalEmbedding(nn.Module):
+    '''Learned position table whose rows are scaled by dim ** -0.5 on lookup.'''
+    def __init__(self, dim, max_seq_len):
+        super().__init__()
+        self.scale = dim ** -0.5
+        self.emb = nn.Embedding(max_seq_len, dim)
+    def forward(self, x):
+        '''Return the scaled embeddings for positions 0..x.shape[1]-1 with a leading singleton axis.'''
+        positions = torch.arange(x.shape[1], device=x.device)
+        table = rearrange(self.emb(positions), 'n d -> () n d')
+        return table * self.scale
+class FixedPositionalEmbedding(nn.Module):
+    '''Non-trainable sinusoidal position embedding; the inverse frequencies are kept in a buffer.'''
+    def __init__(self, dim):
+        super().__init__()
+        exponents = torch.arange(0, dim, 2).float() / dim
+        self.register_buffer('inv_freq', 1. / (10000 ** exponents))
+    def forward(self, x, seq_dim=1, offset=0):
+        '''Return sin/cos features for positions offset..offset+x.shape[seq_dim]-1 with a leading singleton axis.'''
+        positions = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
+        angles = torch.einsum('i , j -> i j', positions, self.inv_freq)
+        # sin block first, then cos block, along the feature axis.
+        features = torch.cat((angles.sin(), angles.cos()), dim=-1)
+        return rearrange(features, 'n d -> () n d')
+class RelativePositionBias(nn.Module):
+    # Learned, bucketed relative-position bias added to attention logits: each (query, key)
+    # offset is mapped to one of num_buckets buckets and each bucket has one learned value per head.
+    def __init__(self, scale, causal=False, num_buckets=32, max_distance=128, heads=8):
+        super().__init__()
+        self.scale = scale
+        self.causal = causal
+        self.num_buckets = num_buckets
+        self.max_distance = max_distance
+        self.relative_attention_bias = nn.Embedding(num_buckets, heads)
+    @staticmethod
+    def _relative_position_bucket(relative_position, causal=True, num_buckets=32, max_distance=128):
+        # Map signed offsets (key index - query index) to bucket ids in [0, num_buckets).
+        ret = 0
+        n = -relative_position
+        if not causal:
+            # Non-causal: half of the buckets for each sign of the offset, then work with |offset|.
+            num_buckets //= 2
+            ret += (n < 0).long() * num_buckets
+            n = torch.abs(n)
+        else:
+            # Causal: negative n is clamped to 0.
+            n = torch.max(n, torch.zeros_like(n))
+        # The first half of the (remaining) buckets holds exact distances 0..max_exact-1.
+        max_exact = num_buckets // 2
+        is_small = n < max_exact
+        # Larger distances are spaced logarithmically up to max_distance and clamped to the last bucket.
+        # For n == 0 the log is -inf, but that entry is discarded by the torch.where below.
+        val_if_large = max_exact + (
+                torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
+        ).long()
+        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
+        ret += torch.where(is_small, n, val_if_large)
+        return ret
+    def forward(self, qk_dots):
+        # qk_dots: attention logits whose last two axes are (query position, key position).
+        i, j, device = *qk_dots.shape[-2:], qk_dots.device
+        q_pos = torch.arange(i, dtype=torch.long, device=device)
+        k_pos = torch.arange(j, dtype=torch.long, device=device)
+        rel_pos = k_pos[None, :] - q_pos[:, None]
+        rp_bucket = self._relative_position_bucket(rel_pos, causal=self.causal, num_buckets=self.num_buckets,
+                                                   max_distance=self.max_distance)
+        values = self.relative_attention_bias(rp_bucket)
+        # (i, j, heads) -> (1, heads, i, j) so the bias broadcasts over the batch.
+        bias = rearrange(values, 'i j h -> () h i j')
+        return qk_dots + (bias * self.scale)
+class MultiHeadAttention(nn.Module):
+    '''
+    only for GST
+    Multi-head dot-product attention where keys and values are both projected from `key`.
+    input:
+        query --- [N, T_q, query_dim]
+        key --- [N, T_k, key_dim]
+    output:
+        out --- [N, T_q, num_units]
+    '''
+    def __init__(self, query_dim, key_dim, num_units, num_heads):
+        super().__init__()
+        self.num_units = num_units
+        self.num_heads = num_heads
+        self.key_dim = key_dim
+        self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
+        self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+        self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+    def forward(self, query, key):
+        head_width = self.num_units // self.num_heads
+        def split_heads(projected):
+            # [N, T, num_units] -> [h, N, T, num_units/h]
+            return torch.stack(torch.split(projected, head_width, dim=2), dim=0)
+        q_heads = split_heads(self.W_query(query))
+        k_heads = split_heads(self.W_key(key))
+        v_heads = split_heads(self.W_value(key))
+        # softmax(Q K^T / sqrt(key_dim)) over the key axis; note the scale uses key_dim, not the head width.
+        attn = torch.matmul(q_heads, k_heads.transpose(2, 3)) / (self.key_dim ** 0.5)  # [h, N, T_q, T_k]
+        attn = F.softmax(attn, dim=3)
+        mixed = torch.matmul(attn, v_heads)  # [h, N, T_q, num_units/h]
+        # Put the heads back side by side on the feature axis.
+        return torch.cat(torch.split(mixed, 1, dim=0), dim=3).squeeze(0)  # [N, T_q, num_units]
+class GST(nn.Module):
+    '''Reference encoder: two stride-2 convolutions (in_channels -> model_channels -> model_channels*k)
+    followed by five AttentionBlocks at the widened channel count.
+    '''
+    def __init__(self,model_channels=512,num_heads=8,in_channels=80,k=2):
+        super(GST,self).__init__()
+        self.model_channels=model_channels
+        self.num_heads=num_heads
+        wide = model_channels*k
+        stages = [
+            nn.Conv1d(in_channels,model_channels,3,padding=1,stride=2),
+            nn.Conv1d(model_channels, wide,3,padding=1,stride=2),
+        ]
+        for _ in range(5):
+            stages.append(AttentionBlock(wide, num_heads, relative_pos_embeddings=True, do_checkpoint=False))
+        self.reference_encoder=nn.Sequential(*stages)
+    def forward(self,x):
+        return self.reference_encoder(x)
+if __name__ == '__main__':
+    # Smoke test: run GST on random input and print a torchinfo summary.
+    device = torch.device('cpu')
+    # GST attends over 512*2 = 1024 channels (qkv width 3072); QKVAttentionLegacy asserts that the
+    # qkv width is divisible by 3 * heads, which 10 heads violates, so use 8.
+    m = GST(512,8).to(device)
+    mels = torch.rand((16,80,1000)).to(device)
+    o = m(mels)
+    print(o.shape,'final output')
+    from torchinfo import summary
+    summary(m, input_data={'x': torch.randn(16,80,500).to(device)})

maha_tts/models/vocoder.py ADDED Viewed

	@@ -0,0 +1,342 @@

+'''
+code from https://github.com/jik876/hifi-gan/blob/master/models.py
+'''
+import json,os
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+# from utils import init_weights, get_padding
+LRELU_SLOPE = 0.1
+class AttrDict(dict):
+    '''dict whose items are also reachable as attributes (d.key is d['key']).'''
+    def __init__(self, *args, **kwargs):
+        dict.__init__(self, *args, **kwargs)
+        # Use the dict itself as the attribute namespace so both views share one storage.
+        self.__dict__ = self
+def init_weights(m, mean=0.0, std=0.01):
+    '''Fill m.weight with Normal(mean, std) samples when the class name of m contains "Conv"; otherwise do nothing.'''
+    if "Conv" in m.__class__.__name__:
+        m.weight.data.normal_(mean, std)
+def apply_weight_norm(m):
+    '''Wrap m with weight_norm when the class name of m contains "Conv"; otherwise do nothing.'''
+    if "Conv" in m.__class__.__name__:
+        weight_norm(m)
+def get_padding(kernel_size, dilation=1):
+    '''Padding per side that keeps the length unchanged for a stride-1 convolution with an odd kernel size.'''
+    span = dilation * (kernel_size - 1)
+    return int(span / 2)
+class ResBlock1(torch.nn.Module):
+    '''Three residual units; each is leaky-ReLU -> dilated conv -> leaky-ReLU -> undilated conv,
+    added back onto its input. All convolutions are weight-normalised and keep `channels` channels.
+    Exactly the first three entries of `dilation` are used.
+    '''
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.h = h
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[idx],
+                               padding=get_padding(kernel_size, dilation[idx])))
+            for idx in range(3)
+        ])
+        self.convs1.apply(init_weights)
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1)))
+            for _ in range(3)
+        ])
+        self.convs2.apply(init_weights)
+    def forward(self, x):
+        for dilated, plain in zip(self.convs1, self.convs2):
+            branch = dilated(F.leaky_relu(x, LRELU_SLOPE))
+            branch = plain(F.leaky_relu(branch, LRELU_SLOPE))
+            x = branch + x
+        return x
+    def remove_weight_norm(self):
+        '''Strip the weight-norm reparametrisation from every convolution.'''
+        for group in (self.convs1, self.convs2):
+            for conv in group:
+                remove_weight_norm(conv)
+class ResBlock2(torch.nn.Module):
+    '''Two residual units; each is leaky-ReLU -> dilated weight-normalised conv, added back onto its input.
+    Exactly the first two entries of `dilation` are used.
+    '''
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.h = h
+        self.convs = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[idx],
+                               padding=get_padding(kernel_size, dilation[idx])))
+            for idx in range(2)
+        ])
+        self.convs.apply(init_weights)
+    def forward(self, x):
+        for conv in self.convs:
+            x = conv(F.leaky_relu(x, LRELU_SLOPE)) + x
+        return x
+    def remove_weight_norm(self):
+        '''Strip the weight-norm reparametrisation from every convolution.'''
+        for conv in self.convs:
+            remove_weight_norm(conv)
class Generator(torch.nn.Module):
    """Upsampling generator: 80-channel input -> single-channel output in [-1, 1].

    The network is ``conv_pre`` -> N x (LeakyReLU, transposed-conv upsample,
    averaged bank of residual blocks) -> LeakyReLU -> ``conv_post`` -> tanh.

    :param h: configuration object providing ``resblock``,
        ``resblock_kernel_sizes``, ``resblock_dilation_sizes``,
        ``upsample_rates``, ``upsample_kernel_sizes`` and
        ``upsample_initial_channel``.
    """

    def __init__(self, h):
        super().__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        base_channels = h.upsample_initial_channel
        # Input channel count is fixed at 80 (presumably mel bins - TODO confirm).
        self.conv_pre = weight_norm(Conv1d(80, base_channels, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        # Each upsampling stage halves the channel count.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            in_ch = base_channels // (2 ** stage)
            out_ch = base_channels // (2 ** (stage + 1))
            self.ups.append(weight_norm(
                ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)))

        # One residual block per (stage, kernel size) pair, stored stage-major.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = base_channels // (2 ** (stage + 1))
            for kernel, dilations in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
                self.resblocks.append(resblock(h, ch, kernel, dilations))

        # `ch` is the channel count of the last upsampling stage.
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        """Map the input feature tensor to the tanh-bounded output signal."""
        x = self.conv_pre(x)
        for stage in range(self.num_upsamples):
            x = self.ups[stage](F.leaky_relu(x, LRELU_SLOPE))
            offset = stage * self.num_kernels
            xs = None
            for k in range(self.num_kernels):
                out = self.resblocks[offset + k](x)
                if xs is None:
                    xs = out
                else:
                    xs += out
            # Average the outputs of this stage's residual blocks.
            x = xs / self.num_kernels
        # NOTE: no slope is passed here, so the framework default is used
        # rather than LRELU_SLOPE; kept as-is because trained weights depend on it.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Remove the weight-norm wrapper from every layer of the generator."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds a 1-D signal into 2-D by ``period`` and
    scores it with a stack of (k x 1) 2-D convolutions.

    :param period: fold width; the time axis is reshaped to (t // period, period).
    :param kernel_size: kernel height of the convolution stack.
    :param stride: stride along the folded time axis for the first four convs.
    :param use_spectral_norm: use spectral norm instead of weight norm.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # Padding follows `kernel_size` instead of being hard-coded for 5.
        # Assumes get_padding(5, 1) == 2 so the default stays unchanged - TODO confirm.
        pad = (get_padding(kernel_size, 1), 0)
        widths = [1, 32, 128, 512, 1024]
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in zip(widths, widths[1:])
        ]
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for a (b, c, t) input.

        ``feature_maps`` holds the activation after every conv layer followed
        by the raw ``conv_post`` output.
        """
        fmap = []
        # 1d to 2d: right-pad (reflect) so that t is a multiple of the period.
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)
        for layer in self.convs:
            x = F.leaky_relu(layer(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bank of ``DiscriminatorP`` instances with periods 2, 3, 5, 7 and 11."""

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)]
        )

    def forward(self, y, y_hat):
        """Score the real signal ``y`` and the generated signal ``y_hat``.

        :return: ``(real_scores, generated_scores, real_fmaps, generated_fmaps)``,
            each a list with one entry per sub-discriminator.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for disc in self.discriminators:
            real_score, real_fmap = disc(y)
            gen_score, gen_fmap = disc(y_hat)
            y_d_rs.append(real_score)
            fmap_rs.append(real_fmap)
            y_d_gs.append(gen_score)
            fmap_gs.append(gen_fmap)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
    """Discriminator made of a stack of (mostly grouped) 1-D convolutions.

    :param use_spectral_norm: use spectral norm instead of weight norm.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        norm_f = spectral_norm if use_spectral_norm != False else weight_norm
        conv_specs = (
            # (in_ch, out_ch, kernel, stride, groups, padding)
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
            for c_in, c_out, kernel, stride, groups, pad in conv_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for the input signal.

        ``feature_maps`` holds the activation after every conv layer followed
        by the raw ``conv_post`` output.
        """
        fmap = []
        for layer in self.convs:
            x = F.leaky_relu(layer(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
class MultiScaleDiscriminator(torch.nn.Module):
    """Three ``DiscriminatorS`` instances applied to progressively pooled signals.

    The first discriminator uses spectral norm and sees the raw input; each
    later one sees the previous input after one more average-pooling step.
    """

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ])
        self.meanpools = nn.ModuleList(
            [AvgPool1d(4, 2, padding=2) for _ in range(2)]
        )

    def forward(self, y, y_hat):
        """Score the real signal ``y`` and the generated signal ``y_hat``.

        :return: ``(real_scores, generated_scores, real_fmaps, generated_fmaps)``,
            each a list with one entry per sub-discriminator.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for idx, disc in enumerate(self.discriminators):
            if idx != 0:
                # Pooling is cumulative: it is applied to the already-pooled signals.
                pool = self.meanpools[idx - 1]
                y = pool(y)
                y_hat = pool(y_hat)
            real_score, real_fmap = disc(y)
            gen_score, gen_fmap = disc(y_hat)
            y_d_rs.append(real_score)
            fmap_rs.append(real_fmap)
            y_d_gs.append(gen_score)
            fmap_gs.append(gen_fmap)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference between feature maps.

    :param fmap_r: per-discriminator lists of feature maps for the real signal.
    :param fmap_g: matching lists of feature maps for the generated signal.
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real, gen in zip(real_maps, gen_maps):
            total += (real - gen).abs().mean()
    return total * 2
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss: real outputs pushed to 1, generated to 0.

    :return: ``(total_loss, real_losses, generated_losses)`` where the two
        lists hold one Python float per discriminator.
    """
    total = 0
    r_losses, g_losses = [], []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out) ** 2)
        gen_term = torch.mean(gen_out ** 2)
        total += (real_term + gen_term)
        r_losses.append(real_term.item())
        g_losses.append(gen_term.item())
    return total, r_losses, g_losses
def generator_loss(disc_outputs):
    """Least-squares generator loss: discriminator outputs are pushed to 1.

    :return: ``(total_loss, per_discriminator_losses)``; the list holds the
        individual loss tensors (not floats).
    """
    total = 0
    gen_losses = []
    for gen_out in disc_outputs:
        term = torch.mean((1 - gen_out) ** 2)
        gen_losses.append(term)
        total += term
    return total, gen_losses
def load_checkpoint(filepath, device):
    """Load a checkpoint dictionary from ``filepath`` onto ``device``.

    :param filepath: path of the checkpoint file.
    :param device: passed to ``torch.load`` as ``map_location``.
    :return: whatever object ``torch.load`` deserialises from the file.
    :raises FileNotFoundError: if ``filepath`` is not an existing file.
    """
    # An explicit raise instead of `assert`, which is stripped under `python -O`.
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f"checkpoint file not found: {filepath}")
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint_dict = torch.load(filepath, map_location=device)
    return checkpoint_dict
def load_vocoder_model(config_path,checkpoint_path,device):
    """Build a ``Generator`` from a JSON config and load its trained weights.

    The parsed config is also published as the module-level global ``h``.
    The RNG is seeded with ``h.seed`` before the generator is constructed.

    :param config_path: path of the JSON configuration file.
    :param checkpoint_path: path of a checkpoint containing a ``'generator'`` entry.
    :param device: device the generator and checkpoint are placed on.
    :return: the generator in eval mode with weight norm removed.
    """
    global h
    with open(config_path) as config_file:
        raw_config = config_file.read()
    h = AttrDict(json.loads(raw_config))
    torch.manual_seed(h.seed)
    generator = Generator(h).to(device)
    checkpoint = load_checkpoint(checkpoint_path, device)
    generator.load_state_dict(checkpoint['generator'])
    generator.eval()
    generator.remove_weight_norm()
    return generator
def infer_wav(mel,generator):
    """Run ``generator`` on ``mel`` and return the result as int16 samples.

    :param mel: input passed unchanged to ``generator``.
    :param generator: callable returning a tensor of samples; values are
        scaled by 32768 (so the expected range is [-1, 1]).
    :return: NumPy ``int16`` array of the squeezed generator output.
    """
    MAX_WAV_VALUE = 32768.0
    with torch.no_grad():
        y_g_hat = generator(mel)
        audio = y_g_hat.squeeze() * MAX_WAV_VALUE
        # A sample of exactly +1.0 scales to 32768, which does not fit in
        # int16; clamp so it saturates instead of overflowing in the cast.
        audio = torch.clamp(audio, min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.cpu().numpy().astype('int16')
    return audio

maha_tts/pretrained_models/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

maha_tts/pretrained_models/hifigan/config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1bd98e99062ddbced38729a5252dc2aa772328d16d70097ac139dab2f269dc9
+size 799

maha_tts/pretrained_models/hifigan/g_02500000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:771eaf4876485a35e25577563d390c262e23c2421e4a8c929eacfde34a5b7a60
+size 55788858

maha_tts/pretrained_models/smolie/S2A/s2a_latest.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bf359fab98b047ef89d79a99a78fee9c38880e307630d3b3af7bc9cb170f366b
+size 432971673

maha_tts/pretrained_models/smolie/T2S/t2s_best.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:67a10c3bf12a8bca3dd67075ccbfbd79887b244109bd9c96013b0f348d9e2570
+size 276146627

maha_tts/text/__init__.py ADDED Viewed

File without changes

maha_tts/text/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (178 Bytes). View file

maha_tts/text/__pycache__/cleaners.cpython-311.pyc ADDED Viewed

Binary file (7.03 kB). View file

maha_tts/text/__pycache__/symbols.cpython-311.pyc ADDED Viewed

Binary file (2.37 kB). View file

maha_tts/text/cleaners.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import re
+from unidecode import unidecode
+import inflect
+import re
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+def _remove_commas(m):
+  return m.group(1).replace(',', '')
+def _expand_decimal_point(m):
+  return m.group(1).replace('.', ' point ')
+def _expand_dollars(m):
+  match = m.group(1)
+  parts = match.split('.')
+  if len(parts) > 2:
+    return match + ' dollars'  # Unexpected format
+  dollars = int(parts[0]) if parts[0] else 0
+  cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+  if dollars and cents:
+    dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+    cent_unit = 'cent' if cents == 1 else 'cents'
+    return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+  elif dollars:
+    dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+    return '%s %s' % (dollars, dollar_unit)
+  elif cents:
+    cent_unit = 'cent' if cents == 1 else 'cents'
+    return '%s %s' % (cents, cent_unit)
+  else:
+    return 'zero dollars'
def _expand_ordinal(m):
  """Spell out the whole matched ordinal (e.g. the text '1st') via ``_inflect``."""
  ordinal_text = m.group(0)
  return _inflect.number_to_words(ordinal_text)
def _expand_number(m):
  """Spell out the matched integer, reading 1001-2999 in a year-like style."""
  num = int(m.group(0))
  # Everything outside the open interval (1000, 3000) is spelled out plainly.
  if not (1000 < num < 3000):
    return _inflect.number_to_words(num, andword='')
  if num == 2000:
    return 'two thousand'
  if 2000 < num < 2010:
    return 'two thousand ' + _inflect.number_to_words(num % 100)
  if num % 100 == 0:
    return _inflect.number_to_words(num // 100) + ' hundred'
  # Remaining values are read as pairs of digits, with 'oh' for zero.
  grouped = _inflect.number_to_words(num, andword='', zero='oh', group=2)
  return grouped.replace(', ', ' ')
def normalize_numbers(text):
  """Rewrite the numeric expressions in ``text`` as words.

  The substitutions run in a fixed order: thousands separators, pounds,
  dollars, decimals, ordinals and finally any remaining digit runs.
  """
  substitutions = (
    (_comma_number_re, _remove_commas),
    (_pounds_re, r'\1 pounds'),
    (_dollars_re, _expand_dollars),
    (_decimal_number_re, _expand_decimal_point),
    (_ordinal_re, _expand_ordinal),
    (_number_re, _expand_number),
  )
  for pattern, replacement in substitutions:
    text = pattern.sub(replacement, text)
  return text
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r'\s+')
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+  ('mrs', 'misess'),
+  ('mr', 'mister'),
+  ('dr', 'doctor'),
+  ('st', 'saint'),
+  ('co', 'company'),
+  ('jr', 'junior'),
+  ('maj', 'major'),
+  ('gen', 'general'),
+  ('drs', 'doctors'),
+  ('rev', 'reverend'),
+  ('lt', 'lieutenant'),
+  ('hon', 'honorable'),
+  ('sgt', 'sergeant'),
+  ('capt', 'captain'),
+  ('esq', 'esquire'),
+  ('ltd', 'limited'),
+  ('col', 'colonel'),
+  ('ft', 'fort'),
+]]
def expand_abbreviations(text):
  """Apply every ``_abbreviations`` substitution to ``text``, in list order."""
  for pattern, expansion in _abbreviations:
    text = pattern.sub(expansion, text)
  return text
def expand_numbers(text):
  """Return ``text`` with numbers spelled out; delegates to ``normalize_numbers``."""
  normalized = normalize_numbers(text)
  return normalized
def lowercase(text):
  """Return ``text`` converted to lower case."""
  lowered = text.lower()
  return lowered
def collapse_whitespace(text):
  """Replace every run of whitespace in ``text`` with a single space."""
  return _whitespace_re.sub(' ', text)
def convert_to_ascii(text):
  """Return the result of passing ``text`` through ``unidecode``."""
  ascii_text = unidecode(text)
  return ascii_text
def basic_cleaners(text):
  """Lowercase ``text`` and collapse whitespace runs; no transliteration."""
  return collapse_whitespace(lowercase(text))
def transliteration_cleaners(text):
  """Transliterate ``text`` to ASCII, lowercase it and collapse whitespace."""
  for step in (convert_to_ascii, lowercase, collapse_whitespace):
    text = step(text)
  return text
def english_cleaners(text):
  """Full English pipeline.

  Steps, in order: ASCII transliteration, lowercasing, number expansion,
  abbreviation expansion, whitespace collapsing.
  """
  pipeline = (
    convert_to_ascii,
    lowercase,
    expand_numbers,
    expand_abbreviations,
    collapse_whitespace,
  )
  for step in pipeline:
    text = step(text)
  return text

maha_tts/text/symbols.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import sys
+from maha_tts.config import config
# Character inventory for text. (An earlier ASCII-only definition that was
# immediately overwritten has been dropped; it never took effect.)
labels = list(" !\"'(),-.:;?[]abcdefghijklmnopqrstuvwxyzàâèéêü’“”")

# Text vocabulary: the characters plus start / end / padding markers.
text_labels = list(labels)
text_labels += ['<S>', '<E>', '<PAD>']

# Code vocabulary: one string label per index below config.semantic_model_centroids.
code_labels = [str(i) for i in range(config.semantic_model_centroids)]

# Joint vocabulary: characters, then codes, then all special markers.
# `labels` must be extended BEFORE the code markers are appended to
# `code_labels`, otherwise those markers would be duplicated in `labels`.
labels += code_labels
code_labels += ['<SST>', '<EST>', '<PAD>']
labels += ['<S>', '<E>', '<SST>', '<EST>', '<PAD>']

# Joint token <-> index maps.
tok_enc = {token: index for index, token in enumerate(labels)}
tok_dec = {index: token for index, token in enumerate(labels)}

# Text-only token <-> index maps.
text_enc = {token: index for index, token in enumerate(text_labels)}
text_dec = {index: token for index, token in enumerate(text_labels)}

# Code-only token <-> index maps.
code_enc = {token: index for index, token in enumerate(code_labels)}
code_dec = {index: token for index, token in enumerate(code_labels)}
+# print('length of the labels: ',len(labels))

maha_tts/utils/__init__.py ADDED Viewed

File without changes

maha_tts/utils/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (179 Bytes). View file

maha_tts/utils/__pycache__/audio.cpython-311.pyc ADDED Viewed

Binary file (5.3 kB). View file

maha_tts/utils/__pycache__/diffusion.cpython-311.pyc ADDED Viewed

Binary file (58.7 kB). View file

maha_tts/utils/__pycache__/stft.cpython-311.pyc ADDED Viewed

Binary file (6.9 kB). View file

maha_tts/utils/audio.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import torch
+import numpy as np
+import librosa.util as librosa_util
+from scipy.signal import get_window
+from scipy.io.wavfile import read
+from maha_tts.config import config
# Bounds used by the mel (de)normalisation helpers to map to / from [-1, 1].
TACOTRON_MEL_MAX = 2.3143386840820312
TACOTRON_MEL_MIN = -11.512925148010254
def denormalize_tacotron_mel(norm_mel):
    """Map values from [-1, 1] back to [TACOTRON_MEL_MIN, TACOTRON_MEL_MAX]."""
    unit_scaled = (norm_mel+1)/2
    return unit_scaled*(TACOTRON_MEL_MAX-TACOTRON_MEL_MIN)+TACOTRON_MEL_MIN
def normalize_tacotron_mel(mel):
    """Map values from [TACOTRON_MEL_MIN, TACOTRON_MEL_MAX] to [-1, 1]."""
    unit_scaled = (mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)
    return 2 * unit_scaled - 1
def get_mask_from_lengths(lengths, max_len=None):
    """Build a boolean mask that is True for positions below each length.

    :param lengths: 1-D integer tensor of sequence lengths.
    :param max_len: mask width; when falsy, the largest length is used.
    :return: bool tensor of shape (len(lengths), max_len) on ``lengths``' device.
    """
    if not max_len:
        max_len = torch.max(lengths).item()
    positions = torch.arange(0, max_len, device=lengths.device, dtype=torch.long)
    below_length = positions < lengths.unsqueeze(1)
    return below_length.bool()
def get_mask(lengths, max_len=None):
    """Build a boolean mask that is True for positions below each length.

    :param lengths: 1-D integer tensor of sequence lengths.
    :param max_len: mask width; when falsy, the largest length is used.
    :return: bool tensor of shape (len(lengths), max_len).
    """
    if not max_len:
        max_len = torch.max(lengths).item()
    # Create the position index on the same device as `lengths`; the previous
    # default-device arange broke the comparison for non-CPU inputs.
    positions = torch.arange(max_len, device=lengths.device)
    return positions.unsqueeze(0) < lengths.unsqueeze(1)
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress ``x``: ``log(max(x, clip_val) * C)``.

    :param x: input tensor.
    :param C: compression factor multiplied in before the logarithm.
    :param clip_val: lower bound applied to ``x`` so the log stays finite.
    """
    floored = torch.clamp(x, min=clip_val)
    return torch.log(floored * C)
def dynamic_range_decompression(x, C=1):
    """Invert the log compression: ``exp(x) / C``.

    :param x: log-compressed tensor.
    :param C: compression factor that was used when compressing.
    """
    expanded = torch.exp(x)
    return expanded / C
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """Compute the sum-of-squares envelope of a window at a given hop length.

    The squared window is overlap-added once per frame into a buffer of
    length ``n_fft + hop_length * (n_frames - 1)``.

    Parameters
    ----------
    window : window specification passed to ``get_window``
    n_frames : number of frames to overlap-add
    hop_length : number of samples between consecutive frames
    win_length : window length; falls back to ``n_fft`` when ``None``
    n_fft : frame length; the window is centre-padded to this size
    dtype : data type of the returned array
    norm : normalisation passed to ``librosa_util.normalize``

    Returns
    -------
    np.ndarray of shape ``(n_fft + hop_length * (n_frames - 1),)``
    """
    if win_length is None:
        win_length = n_fft
    total_len = n_fft + hop_length * (n_frames - 1)
    envelope = np.zeros(total_len, dtype=dtype)

    # Squared (optionally normalised) window, centre-padded to the frame length.
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Overlap-add the squared window at every frame offset.
    for frame in range(n_frames):
        start = frame * hop_length
        stop = min(total_len, start + n_fft)
        usable = max(0, min(n_fft, total_len - start))
        envelope[start:stop] += win_sq[:usable]
    return envelope
def load_wav_to_torch(full_path):
    """Read a WAV file and return ``(samples, sampling_rate)``.

    The samples are returned as a ``torch.FloatTensor`` built from the raw
    array produced by the WAV reader; no rescaling is applied.
    """
    rate, samples = read(full_path)
    waveform = torch.FloatTensor(samples)
    return waveform, rate
if __name__ == "__main__":
    # Manual smoke test: print the mask for a handful of lengths and its shape.
    lens = torch.tensor([2, 3, 7, 5, 4])
    mask = get_mask(lens)
    for item in (mask, mask.shape):
        print(item)

maha_tts/utils/diffusion.py ADDED Viewed

	@@ -0,0 +1,1283 @@

+"""
+Copied from Tortoise-tts
+########################################
+This is an almost carbon copy of gaussian_diffusion.py from OpenAI's ImprovedDiffusion repo, which itself:
+This code started out as a PyTorch port of Ho et al's diffusion models:
+https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py
+Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules.
+########################################
+"""
+import enum
+import math
+import torch
+import torch as th
+import torch.nn.functional as F
+import numpy as np
+from tqdm import tqdm
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    KL divergence KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2))).

    Arguments broadcast against each other, so tensors may be mixed with
    scalars; at least one argument has to be a Tensor.
    """
    reference = next(
        (arg for arg in (mean1, logvar1, mean2, logvar2) if isinstance(arg, th.Tensor)),
        None,
    )
    assert reference is not None, "at least one argument must be a Tensor"

    # th.exp() needs Tensor inputs, so promote scalar log-variances to Tensors
    # matching the reference tensor's dtype and device.
    if not isinstance(logvar1, th.Tensor):
        logvar1 = th.tensor(logvar1).to(reference)
    if not isinstance(logvar2, th.Tensor):
        logvar2 = th.tensor(logvar2).to(reference)

    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + th.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
    )
def approx_standard_normal_cdf(x):
    """
    Fast tanh-based approximation of the standard normal CDF, evaluated
    element-wise on the tensor ``x``.
    """
    cubic = x + 0.044715 * th.pow(x, 3)
    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * cubic))
def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Log-likelihood of ``x`` under a Gaussian discretised into bins of
    half-width 1/255.

    :param x: target tensor; values below -0.999 / above 0.999 are treated as
              the open-ended edge bins.
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor shaped like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    half_bin = 1.0 / 255.0
    centered = x - means
    inv_std = th.exp(-log_scales)

    # CDF at the upper and lower edge of the bin around x.
    cdf_upper = approx_standard_normal_cdf(inv_std * (centered + half_bin))
    cdf_lower = approx_standard_normal_cdf(inv_std * (centered - half_bin))

    log_cdf_upper = th.log(cdf_upper.clamp(min=1e-12))
    log_one_minus_cdf_lower = th.log((1.0 - cdf_lower).clamp(min=1e-12))
    log_bin_mass = th.log((cdf_upper - cdf_lower).clamp(min=1e-12))

    # Edge bins absorb the whole tail on their side.
    interior_or_top = th.where(x > 0.999, log_one_minus_cdf_lower, log_bin_mass)
    log_probs = th.where(x < -0.999, log_cdf_upper, interior_or_top)
    assert log_probs.shape == x.shape
    return log_probs
def mean_flat(tensor):
    """
    Average ``tensor`` over every dimension except the first (batch) one.
    """
    non_batch_dims = list(range(1, tensor.dim()))
    return tensor.mean(dim=non_batch_dims)
def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Return the beta schedule registered under ``schedule_name``.

    Supported names are "linear" and "cosine"; any other name raises
    NotImplementedError. Existing schedules should not be altered once
    committed, to keep backwards compatibility.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, rescaled so it works for any number
        # of diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        return np.linspace(
            scale * 0.0001, scale * 0.02, num_diffusion_timesteps, dtype=np.float64
        )

    if schedule_name == "cosine":
        def cosine_alpha_bar(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

        return betas_for_alpha_bar(num_diffusion_timesteps, cosine_alpha_bar)

    raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretise a cumulative-product function ``alpha_bar(t)``, t in [0, 1],
    into ``num_diffusion_timesteps`` betas.

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: callable giving the cumulative product of (1 - beta)
                      up to time t.
    :param max_beta: upper cap for each beta; values below 1 avoid
                     singularities.
    """
    steps = num_diffusion_timesteps
    return np.array([
        min(1 - alpha_bar((i + 1) / steps) / alpha_bar(i / steps), max_beta)
        for i in range(steps)
    ])
class ModelMeanType(enum.Enum):
    """
    The quantity that the model's output is interpreted as.
    """

    # the model predicts x_{t-1}
    PREVIOUS_X = 'previous_x'
    # the model predicts x_0
    START_X = 'start_x'
    # the model predicts epsilon
    EPSILON = 'epsilon'
class ModelVarType(enum.Enum):
    """
    How the variance of the model's output distribution is obtained.

    LEARNED_RANGE lets the model predict a value between FIXED_SMALL and
    FIXED_LARGE, which makes its job easier.
    """

    LEARNED = 'learned'
    FIXED_SMALL = 'fixed_small'
    FIXED_LARGE = 'fixed_large'
    LEARNED_RANGE = 'learned_range'
class LossType(enum.Enum):
    """Training objective selector."""

    # use raw MSE loss (and KL when learning variances)
    MSE = 'mse'
    # use raw MSE loss (with RESCALED_KL when learning variances)
    RESCALED_MSE = 'rescaled_mse'
    # use the variational lower-bound
    KL = 'kl'
    # like KL, but rescale to estimate the full VLB
    RESCALED_KL = 'rescaled_kl'

    def is_vb(self):
        """Return True for the two variational-bound objectives (KL, RESCALED_KL)."""
        return self in (LossType.KL, LossType.RESCALED_KL)
+class GaussianDiffusion:
+    """
+    Utilities for training and sampling diffusion models.
+    Ported directly from here, and then adapted over time to further experimentation.
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
+    :param betas: a 1-D numpy array of betas for each diffusion timestep,
+                  starting at T and going to 1.
+    :param model_mean_type: a ModelMeanType determining what the model outputs.
+    :param model_var_type: a ModelVarType determining how variance is output.
+    :param loss_type: a LossType determining the loss function to use.
+    :param rescale_timesteps: if True, pass floating point timesteps into the
+                              model so that they are always scaled like in the
+                              original paper (0 to 1000).
+    """
+    def __init__(
+        self,
+        *,
+        betas,
+        model_mean_type,
+        model_var_type,
+        loss_type,
+        rescale_timesteps=False,
+        conditioning_free=False,
+        conditioning_free_k=1,
+        ramp_conditioning_free=True,
+    ):
+        self.model_mean_type = ModelMeanType(model_mean_type)
+        self.model_var_type = ModelVarType(model_var_type)
+        self.loss_type = LossType(loss_type)
+        self.rescale_timesteps = rescale_timesteps
+        self.conditioning_free = conditioning_free
+        self.conditioning_free_k = conditioning_free_k
+        self.ramp_conditioning_free = ramp_conditioning_free
+        # Use float64 for accuracy.
+        betas = np.array(betas, dtype=np.float64)
+        self.betas = betas
+        assert len(betas.shape) == 1, "betas must be 1-D"
+        assert (betas > 0).all() and (betas <= 1).all()
+        self.num_timesteps = int(betas.shape[0])
+        alphas = 1.0 - betas
+        self.alphas_cumprod = np.cumprod(alphas, axis=0)
+        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
+        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
+        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
+        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
+        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
+        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
+        # calculations for posterior q(x_{t-1} | x_t, x_0)
+        self.posterior_variance = (
+            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        # log calculation clipped because the posterior variance is 0 at the
+        # beginning of the diffusion chain.
+        self.posterior_log_variance_clipped = np.log(
+            np.append(self.posterior_variance[1], self.posterior_variance[1:])
+        )
+        self.posterior_mean_coef1 = (
+            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        self.posterior_mean_coef2 = (
+            (1.0 - self.alphas_cumprod_prev)
+            * np.sqrt(alphas)
+            / (1.0 - self.alphas_cumprod)
+        )
+    def q_mean_variance(self, x_start, t):
+        """
+        Get the distribution q(x_t | x_0).
+        :param x_start: the [N x C x ...] tensor of noiseless inputs.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :return: A tuple (mean, variance, log_variance), all of x_start's shape. of the sample at timestep t
+        """
+        mean = (
+            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+        )
+        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
+        log_variance = _extract_into_tensor(
+            self.log_one_minus_alphas_cumprod, t, x_start.shape
+        )
+        return mean, variance, log_variance
+    def q_sample(self, x_start, t, noise=None):
+        """
+        Diffuse the data for a given number of diffusion steps.
+        In other words, sample from q(x_t | x_0).
+        :param x_start: the initial data batch.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :param noise: if specified, the split-out normal noise.
+        :return: A noisy version of x_start.
+        """
+        if noise is None:
+            noise = th.randn_like(x_start)
+        assert noise.shape == x_start.shape
+        return (
+            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
+            * noise
+        )
+    def q_posterior_mean_variance(self, x_start, x_t, t):
+        """
+        Compute the mean and variance of the diffusion posterior:
+            q(x_{t-1} | x_t, x_0)
+        """
+        assert x_start.shape == x_t.shape
+        posterior_mean = (
+            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
+            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
+        )
+        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
+        posterior_log_variance_clipped = _extract_into_tensor(
+            self.posterior_log_variance_clipped, t, x_t.shape
+        )
+        assert (
+            posterior_mean.shape[0]
+            == posterior_variance.shape[0]
+            == posterior_log_variance_clipped.shape[0]
+            == x_start.shape[0]
+        )
+        return posterior_mean, posterior_variance, posterior_log_variance_clipped
+    def p_mean_variance(
+        self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None
+    ):
+        """
+        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
+        the initial x, x_0.
+        When self.conditioning_free is set, the model is evaluated a second
+        time with conditioning_free=True and the two outputs are blended as
+        (1 + k) * conditioned - k * unconditioned before the mean is derived.
+        :param model: the model, which takes a signal and a batch of timesteps
+                      as input.
+        :param x: the [N x C x ...] tensor at time t.
+        :param t: a 1-D Tensor of timesteps.
+        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample. Applies before
+            clip_denoised.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict with the following keys:
+                 - 'mean': the model mean output.
+                 - 'variance': the model variance output.
+                 - 'log_variance': the log of 'variance'.
+                 - 'pred_xstart': the prediction for x_0.
+        :raises NotImplementedError: if self.model_mean_type is not one of
+            PREVIOUS_X, START_X or EPSILON.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        B, C = x.shape[:2]
+        assert t.shape == (B,)
+        # Conditioned forward pass.
+        model_output = model(x, self._scale_timesteps(t), **model_kwargs)
+        if self.conditioning_free:
+            # Second forward pass with the model's conditioning_free flag set;
+            # its output is only used for the blend further down.
+            model_output_no_conditioning = model(x, self._scale_timesteps(t), conditioning_free=True, **model_kwargs)
+        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
+            # The model emits 2*C channels: the first C feed the mean, the
+            # last C parameterize the variance.
+            assert model_output.shape == (B, C * 2, *x.shape[2:])
+            model_output, model_var_values = th.split(model_output, C, dim=1)
+            if self.conditioning_free:
+                # Variance channels of the unconditioned pass are discarded.
+                model_output_no_conditioning, _ = th.split(model_output_no_conditioning, C, dim=1)
+            if self.model_var_type == ModelVarType.LEARNED:
+                # The variance channels are used directly as a log-variance.
+                model_log_variance = model_var_values
+                model_variance = th.exp(model_log_variance)
+            else:
+                # LEARNED_RANGE: interpolate in log space between the clipped
+                # posterior log-variance (min) and log(beta_t) (max).
+                min_log = _extract_into_tensor(
+                    self.posterior_log_variance_clipped, t, x.shape
+                )
+                max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
+                # The model_var_values is [-1, 1] for [min_var, max_var].
+                frac = (model_var_values + 1) / 2
+                model_log_variance = frac * max_log + (1 - frac) * min_log
+                model_variance = th.exp(model_log_variance)
+        else:
+            # Fixed variance schedules: pick the (variance, log-variance)
+            # tables by type, then look up the entries for timestep t.
+            model_variance, model_log_variance = {
+                # for fixedlarge, we set the initial (log-)variance like so
+                # to get a better decoder log likelihood.
+                ModelVarType.FIXED_LARGE: (
+                    np.append(self.posterior_variance[1], self.betas[1:]),
+                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
+                ),
+                ModelVarType.FIXED_SMALL: (
+                    self.posterior_variance,
+                    self.posterior_log_variance_clipped,
+                ),
+            }[self.model_var_type]
+            model_variance = _extract_into_tensor(model_variance, t, x.shape)
+            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
+        if self.conditioning_free:
+            if self.ramp_conditioning_free:
+                assert t.shape[0] == 1  # This should only be used in inference.
+                # Scale the guidance weight by (1 - t / num_timesteps), so the
+                # weight grows as t approaches 0.
+                cfk = self.conditioning_free_k * (1 - self._scale_timesteps(t)[0].item() / self.num_timesteps)
+            else:
+                cfk = self.conditioning_free_k
+            # Push the conditioned output away from the unconditioned one.
+            model_output = (1 + cfk) * model_output - cfk * model_output_no_conditioning
+        def process_xstart(x):
+            # Post-process an x_0 prediction: optional denoised_fn first,
+            # then the optional clamp to [-1, 1].
+            if denoised_fn is not None:
+                x = denoised_fn(x)
+            if clip_denoised:
+                return x.clamp(-1, 1)
+            return x
+        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+            # The model output is the mean itself; x_0 is back-derived from it.
+            pred_xstart = process_xstart(
+                self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
+            )
+            model_mean = model_output
+        elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
+            if self.model_mean_type == ModelMeanType.START_X:
+                pred_xstart = process_xstart(model_output)
+            else:
+                pred_xstart = process_xstart(
+                    self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
+                )
+            # The mean is the posterior mean evaluated at the (processed)
+            # x_0 prediction.
+            model_mean, _, _ = self.q_posterior_mean_variance(
+                x_start=pred_xstart, x_t=x, t=t
+            )
+        else:
+            raise NotImplementedError(self.model_mean_type)
+        assert (
+            model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
+        )
+        return {
+            "mean": model_mean,
+            "variance": model_variance,
+            "log_variance": model_log_variance,
+            "pred_xstart": pred_xstart,
+        }
+    def _predict_xstart_from_eps(self, x_t, t, eps):
+        assert x_t.shape == eps.shape
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
+            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
+        )
+    def _predict_xstart_from_xprev(self, x_t, t, xprev):
+        assert x_t.shape == xprev.shape
+        return (  # (xprev - coef2*x_t) / coef1
+            _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
+            - _extract_into_tensor(
+                self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
+            )
+            * x_t
+        )
+    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
+            - pred_xstart
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+    def _scale_timesteps(self, t):
+        if self.rescale_timesteps:
+            return t.float() * (1000.0 / self.num_timesteps)
+        return t
+    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
+        """
+        Compute the mean for the previous step, given a function cond_fn that
+        computes the gradient of a conditional log probability with respect to
+        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
+        condition on y.
+        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
+        """
+        gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
+        new_mean = (
+            p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
+        )
+        return new_mean
+    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
+        """
+        Compute what the p_mean_variance output would have been, should the
+        model's score function be conditioned by cond_fn.
+        See condition_mean() for details on cond_fn.
+        Unlike condition_mean(), this instead uses the conditioning strategy
+        from Song et al (2020).
+        """
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
+        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(
+            x, self._scale_timesteps(t), **model_kwargs
+        )
+        out = p_mean_var.copy()
+        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
+        out["mean"], _, _ = self.q_posterior_mean_variance(
+            x_start=out["pred_xstart"], x_t=x, t=t
+        )
+        return out
+    def p_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+    ):
+        """
+        Sample x_{t-1} from the model at the given timestep.
+        :param model: the model to sample from.
+        :param x: the current tensor at x_{t-1}.
+        :param t: the value of t, starting at 0 for the first diffusion step.
+        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - 'sample': a random sample from the model.
+                 - 'pred_xstart': a prediction of x_0.
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        noise = th.randn_like(x)
+        nonzero_mask = (
+            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        )  # no noise when t == 0
+        if cond_fn is not None:
+            out["mean"] = self.condition_mean(
+                cond_fn, out, x, t, model_kwargs=model_kwargs
+            )
+        sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def p_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+    ):
+        """
+        Generate samples from the model.
+        :param model: the model module.
+        :param shape: the shape of the samples, (N, C, H, W).
+        :param noise: if specified, the noise from the encoder to sample.
+                      Should be of the same shape as `shape`.
+        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param device: if specified, the device to create the samples on.
+                       If not specified, use a model parameter's device.
+        :param progress: if True, show a tqdm progress bar.
+        :return: a non-differentiable batch of samples.
+        """
+        final = None
+        for sample in self.p_sample_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            cond_fn=cond_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+        ):
+            final = sample
+        return final["sample"]
+    def p_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+    ):
+        """
+        Generate samples from the model and yield intermediate samples from
+        each timestep of diffusion.
+        Arguments are the same as p_sample_loop().
+        Returns a generator over dicts, where each dict is the return value of
+        p_sample().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = th.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        for i in tqdm(indices, disable=not progress):
+            t = th.tensor([i] * shape[0], device=device)
+            with th.no_grad():
+                out = self.p_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                )
+                yield out
+                img = out["sample"]
+    def ddim_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+    ):
+        """
+        Sample x_{t-1} from the model using DDIM.
+        Same usage as p_sample().
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        if cond_fn is not None:
+            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
+        sigma = (
+            eta
+            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
+            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
+        )
+        # Equation 12.
+        noise = th.randn_like(x)
+        mean_pred = (
+            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
+            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
+        )
+        nonzero_mask = (
+            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        )  # no noise when t == 0
+        sample = mean_pred + nonzero_mask * sigma * noise
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def ddim_reverse_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+    ):
+        """
+        Sample x_{t+1} from the model using DDIM reverse ODE.
+        """
+        assert eta == 0.0, "Reverse ODE only for deterministic path"
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
+            - out["pred_xstart"]
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
+        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
+        # Equation 12. reversed
+        mean_pred = (
+            out["pred_xstart"] * th.sqrt(alpha_bar_next)
+            + th.sqrt(1 - alpha_bar_next) * eps
+        )
+        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
+    def ddim_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+    ):
+        """
+        Generate samples from the model using DDIM.
+        Same usage as p_sample_loop().
+        """
+        final = None
+        for sample in self.ddim_sample_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            cond_fn=cond_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+            eta=eta,
+        ):
+            final = sample
+        return final["sample"]
+    def ddim_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+    ):
+        """
+        Use DDIM to sample from the model and yield intermediate samples from
+        each timestep of DDIM.
+        Same usage as p_sample_loop_progressive().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = th.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices, disable=not progress)
+        for i in indices:
+            t = th.tensor([i] * shape[0], device=device)
+            with th.no_grad():
+                out = self.ddim_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                    eta=eta,
+                )
+                yield out
+                img = out["sample"]
+    def _vb_terms_bpd(
+        self, model, x_start, x_t, t, mask,clip_denoised=True, model_kwargs=None
+    ):
+        """
+        Get a term for the variational lower-bound.
+        The resulting units are bits (rather than nats, as one might expect).
+        This allows for comparison to other papers.
+        :return: a dict with the following keys:
+                 - 'output': a shape [N] tensor of NLLs or KLs.
+                 - 'pred_xstart': the x_0 predictions.
+        """
+        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
+            x_start=x_start, x_t=x_t, t=t
+        )
+        out = self.p_mean_variance(
+            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
+        )
+        kl = normal_kl(
+            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
+        )
+        mask = mask.squeeze(1).float()
+        kl= kl.mean(dim=-2)
+        kl *= mask
+        kl = kl.sum(-1) / mask.sum(-1)
+        kl = kl/np.log(2.0)
+        # kl = mean_flat(kl) / np.log(2.0)
+        # print(kl)
+        decoder_nll = -discretized_gaussian_log_likelihood(
+            x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
+        )
+        assert decoder_nll.shape == x_start.shape
+        # print(decoder_nll.shape)
+        decoder_nll = decoder_nll.mean(dim=-2)
+        decoder_nll *= mask
+        decoder_nll = decoder_nll.sum(-1) / mask.sum(-1)
+        decoder_nll = decoder_nll/np.log(2.0)
+        # decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
+        # print(decoder_nll)
+        # At the first timestep return the decoder NLL,
+        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
+        output = th.where((t == 0), decoder_nll, kl)
+        return {"output": output, "pred_xstart": out["pred_xstart"]}
+    def training_losses(self, model, x_start, t, mask,model_kwargs=None, noise=None):
+        """
+        Compute training losses for a single timestep.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param t: a batch of timestep indices.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param noise: if specified, the specific Gaussian noise to try to remove.
+        :return: a dict with the key "loss" containing a tensor of shape [N].
+                 Some mean or variance settings may also have other keys.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        if noise is None:
+            noise = th.randn_like(x_start)
+        x_t = self.q_sample(x_start, t, noise=noise)
+        # print(x_t.shape,mask.shape)
+        terms = {}
+        # mask = torch.ones(mask.shape).to(mask.device)
+        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
+            # TODO: support multiple model outputs for this mode.
+            terms["loss"] = self._vb_terms_bpd(
+                model=model,
+                x_start=x_start,
+                x_t=x_t,
+                t=t,
+                clip_denoised=False,
+                model_kwargs=model_kwargs,
+            )["output"]
+            if self.loss_type == LossType.RESCALED_KL:
+                terms["loss"] *= self.num_timesteps
+        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
+            # print('timestep : ',self._scale_timesteps(t))
+            model_outputs = model(x_t, self._scale_timesteps(t), **model_kwargs)
+            if isinstance(model_outputs, tuple):
+                model_output = model_outputs[0]
+                terms['extra_outputs'] = model_outputs[1:]
+            else:
+                model_output = model_outputs
+            if self.model_var_type in [
+                ModelVarType.LEARNED,
+                ModelVarType.LEARNED_RANGE,
+            ]:
+                B, C = x_t.shape[:2]
+                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
+                model_output, model_var_values = th.split(model_output, C, dim=1)
+                # Learn the variance using the variational bound, but don't let
+                # it affect our mean prediction.
+                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
+                terms["vb"] = self._vb_terms_bpd(
+                    model=lambda *args, r=frozen_out: r,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t,
+                    mask=mask,
+                    clip_denoised=False,
+                )["output"]
+                if self.loss_type == LossType.RESCALED_MSE:
+                    # Divide by 1000 for equivalence with initial implementation.
+                    # Without a factor of 1/1000, the VB term hurts the MSE term.
+                    # terms["vb"] *= self.num_timesteps / 1000.0
+                    terms["vb"] *= 1/1000
+                    # print('scaling vb :',self.num_timesteps / 1000.0)
+            if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+                target = self.q_posterior_mean_variance(
+                    x_start=x_start, x_t=x_t, t=t
+                )[0]
+                x_start_pred = torch.zeros(x_start)  # Not supported.
+            elif self.model_mean_type == ModelMeanType.START_X:
+                target = x_start
+                x_start_pred = model_output
+            elif self.model_mean_type == ModelMeanType.EPSILON:
+                target = noise
+                x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output)
+            else:
+                raise NotImplementedError(self.model_mean_type)
+            assert model_output.shape == target.shape == x_start.shape
+            mask = mask.squeeze(1).float()
+            loss = F.mse_loss(target, model_output, reduction='none').mean(dim=-2)
+            loss *= mask
+            loss = loss.sum(-1) / mask.sum(-1)
+            terms["mse"] = loss
+            # terms["mse"] = mean_flat((target - model_output) ** 2)
+            terms["x_start_predicted"] = x_start_pred
+            # print(terms['vb'],terms['mse'])
+            if "vb" in terms:
+                terms["loss"] = terms["mse"] + terms["vb"]
+            else:
+                terms["loss"] = terms["mse"]
+        else:
+            raise NotImplementedError(self.loss_type)
+        # print(terms['loss'])
+        # terms["loss"]=terms['loss'].sum()/terms['loss'].shape[0]
+        terms["mse"]=terms['mse'].sum()/terms['mse'].shape[0]
+        terms["vb"]=terms['vb'].sum()/terms['vb'].shape[0]
+        # print(terms['loss'],terms['mse'],terms['vb'])
+        return terms
+    def autoregressive_training_losses(self, model, x_start, t, model_output_keys, gd_out_key, model_kwargs=None, noise=None):
+        """
+        Compute training losses for a single timestep.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param t: a batch of timestep indices.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param noise: if specified, the specific Gaussian noise to try to remove.
+        :return: a dict with the key "loss" containing a tensor of shape [N].
+                 Some mean or variance settings may also have other keys.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        if noise is None:
+            noise = th.randn_like(x_start)
+        x_t = self.q_sample(x_start, t, noise=noise)
+        terms = {}
+        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
+            assert False  # not currently supported for this type of diffusion.
+        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
+            model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs)
+            terms.update({k: o for k, o in zip(model_output_keys, model_outputs)})
+            model_output = terms[gd_out_key]
+            if self.model_var_type in [
+                ModelVarType.LEARNED,
+                ModelVarType.LEARNED_RANGE,
+            ]:
+                B, C = x_t.shape[:2]
+                assert model_output.shape == (B, C, 2, *x_t.shape[2:])
+                model_output, model_var_values = model_output[:, :, 0], model_output[:, :, 1]
+                # Learn the variance using the variational bound, but don't let
+                # it affect our mean prediction.
+                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
+                terms["vb"] = self._vb_terms_bpd(
+                    model=lambda *args, r=frozen_out: r,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t,
+                    clip_denoised=False,
+                )["output"]
+                if self.loss_type == LossType.RESCALED_MSE:
+                    # Divide by 1000 for equivalence with initial implementation.
+                    # Without a factor of 1/1000, the VB term hurts the MSE term.
+                    terms["vb"] *= self.num_timesteps / 1000.0
+            if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+                target = self.q_posterior_mean_variance(
+                    x_start=x_start, x_t=x_t, t=t
+                )[0]
+                x_start_pred = torch.zeros(x_start)  # Not supported.
+            elif self.model_mean_type == ModelMeanType.START_X:
+                target = x_start
+                x_start_pred = model_output
+            elif self.model_mean_type == ModelMeanType.EPSILON:
+                target = noise
+                x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output)
+            else:
+                raise NotImplementedError(self.model_mean_type)
+            assert model_output.shape == target.shape == x_start.shape
+            terms["mse"] = mean_flat((target - model_output) ** 2)
+            terms["x_start_predicted"] = x_start_pred
+            if "vb" in terms:
+                terms["loss"] = terms["mse"] + terms["vb"]
+            else:
+                terms["loss"] = terms["mse"]
+        else:
+            raise NotImplementedError(self.loss_type)
+        return terms
+    def _prior_bpd(self, x_start):
+        """
+        Get the prior KL term for the variational lower-bound, measured in
+        bits-per-dim.
+        This term can't be optimized, as it only depends on the encoder.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :return: a batch of [N] KL values (in bits), one per batch element.
+        """
+        batch_size = x_start.shape[0]
+        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
+        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
+        kl_prior = normal_kl(
+            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
+        )
+        return mean_flat(kl_prior) / np.log(2.0)
+    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
+        """
+        Compute the entire variational lower-bound, measured in bits-per-dim,
+        as well as other related quantities.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param clip_denoised: if True, clip denoised samples.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - total_bpd: the total variational lower-bound, per batch element.
+                 - prior_bpd: the prior term in the lower-bound.
+                 - vb: an [N x T] tensor of terms in the lower-bound.
+                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
+                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
+        """
+        device = x_start.device
+        batch_size = x_start.shape[0]
+        vb = []
+        xstart_mse = []
+        mse = []
+        for t in list(range(self.num_timesteps))[::-1]:
+            t_batch = th.tensor([t] * batch_size, device=device)
+            noise = th.randn_like(x_start)
+            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
+            # Calculate VLB term at the current timestep
+            with th.no_grad():
+                out = self._vb_terms_bpd(
+                    model,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t_batch,
+                    clip_denoised=clip_denoised,
+                    model_kwargs=model_kwargs,
+                )
+            vb.append(out["output"])
+            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
+            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
+            mse.append(mean_flat((eps - noise) ** 2))
+        vb = th.stack(vb, dim=1)
+        xstart_mse = th.stack(xstart_mse, dim=1)
+        mse = th.stack(mse, dim=1)
+        prior_bpd = self._prior_bpd(x_start)
+        total_bpd = vb.sum(dim=1) + prior_bpd
+        return {
+            "total_bpd": total_bpd,
+            "prior_bpd": prior_bpd,
+            "vb": vb,
+            "xstart_mse": xstart_mse,
+            "mse": mse,
+        }
+def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
+    """
+    Get a pre-defined beta schedule for the given name.
+    The beta schedule library consists of beta schedules which remain similar
+    in the limit of num_diffusion_timesteps.
+    Beta schedules may be added, but should not be removed or changed once
+    they are committed to maintain backwards compatibility.
+    """
+    if schedule_name == "linear":
+        # Linear schedule from Ho et al, extended to work for any number of
+        # diffusion steps.
+        scale = 1000 / num_diffusion_timesteps
+        beta_start = scale * 0.0001
+        beta_end = scale * 0.02
+        return np.linspace(
+            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
+        )
+    elif schedule_name == "cosine":
+        return betas_for_alpha_bar(
+            num_diffusion_timesteps,
+            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
+        )
+    else:
+        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
+class SpacedDiffusion(GaussianDiffusion):
+    """
+    A diffusion process which can skip steps in a base diffusion process.
+    :param use_timesteps: a collection (sequence or set) of timesteps from the
+                          original diffusion process to retain.
+    :param kwargs: the kwargs to create the base diffusion process.
+    """
+    def __init__(self, use_timesteps, **kwargs):
+        self.use_timesteps = set(use_timesteps)
+        self.timestep_map = []
+        self.original_num_steps = len(kwargs["betas"])
+        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
+        last_alpha_cumprod = 1.0
+        new_betas = []
+        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
+            if i in self.use_timesteps:
+                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
+                last_alpha_cumprod = alpha_cumprod
+                self.timestep_map.append(i)
+        kwargs["betas"] = np.array(new_betas)
+        super().__init__(**kwargs)
+    def p_mean_variance(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
+    def training_losses(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().training_losses(self._wrap_model(model), *args, **kwargs)
+    def autoregressive_training_losses(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().autoregressive_training_losses(self._wrap_model(model, True), *args, **kwargs)
+    def condition_mean(self, cond_fn, *args, **kwargs):
+        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
+    def condition_score(self, cond_fn, *args, **kwargs):
+        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
+    def _wrap_model(self, model, autoregressive=False):
+        if isinstance(model, _WrappedModel) or isinstance(model, _WrappedAutoregressiveModel):
+            return model
+        mod = _WrappedAutoregressiveModel if autoregressive else _WrappedModel
+        return mod(
+            model, self.timestep_map, self.rescale_timesteps, self.original_num_steps
+        )
+    def _scale_timesteps(self, t):
+        # Scaling is done by the wrapped model.
+        return t
+def space_timesteps(num_timesteps, section_counts):
+    """
+    Create a list of timesteps to use from an original diffusion process,
+    given the number of timesteps we want to take from equally-sized portions
+    of the original process.
+    For example, if there's 300 timesteps and the section counts are [10,15,20]
+    then the first 100 timesteps are strided to be 10 timesteps, the second 100
+    are strided to be 15 timesteps, and the final 100 are strided to be 20.
+    If the stride is a string starting with "ddim", then the fixed striding
+    from the DDIM paper is used, and only one section is allowed.
+    :param num_timesteps: the number of diffusion steps in the original
+                          process to divide up.
+    :param section_counts: either a list of numbers, or a string containing
+                           comma-separated numbers, indicating the step count
+                           per section. As a special case, use "ddimN" where N
+                           is a number of steps to use the striding from the
+                           DDIM paper.
+    :return: a set of diffusion steps from the original process to use.
+    """
+    if isinstance(section_counts, str):
+        if section_counts.startswith("ddim"):
+            desired_count = int(section_counts[len("ddim") :])
+            for i in range(1, num_timesteps):
+                if len(range(0, num_timesteps, i)) == desired_count:
+                    return set(range(0, num_timesteps, i))
+            raise ValueError(
+                f"cannot create exactly {num_timesteps} steps with an integer stride"
+            )
+        section_counts = [int(x) for x in section_counts.split(",")]
+    size_per = num_timesteps // len(section_counts)
+    extra = num_timesteps % len(section_counts)
+    start_idx = 0
+    all_steps = []
+    for i, section_count in enumerate(section_counts):
+        size = size_per + (1 if i < extra else 0)
+        if size < section_count:
+            raise ValueError(
+                f"cannot divide section of {size} steps into {section_count}"
+            )
+        if section_count <= 1:
+            frac_stride = 1
+        else:
+            frac_stride = (size - 1) / (section_count - 1)
+        cur_idx = 0.0
+        taken_steps = []
+        for _ in range(section_count):
+            taken_steps.append(start_idx + round(cur_idx))
+            cur_idx += frac_stride
+        all_steps += taken_steps
+        start_idx += size
+    return set(all_steps)
+class _WrappedModel:
+    def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
+        self.model = model
+        self.timestep_map = timestep_map
+        self.rescale_timesteps = rescale_timesteps
+        self.original_num_steps = original_num_steps
+    def __call__(self, x, ts, **kwargs):
+        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
+        new_ts = map_tensor[ts]
+        if self.rescale_timesteps:
+            new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+        return self.model(x, new_ts, **kwargs)
+class _WrappedAutoregressiveModel:
+    def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
+        self.model = model
+        self.timestep_map = timestep_map
+        self.rescale_timesteps = rescale_timesteps
+        self.original_num_steps = original_num_steps
+    def __call__(self, x, x0, ts, **kwargs):
+        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
+        new_ts = map_tensor[ts]
+        if self.rescale_timesteps:
+            new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+        return self.model(x, x0, new_ts, **kwargs)
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res.expand(broadcast_shape)

maha_tts/utils/stft.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import torch
+import numpy as np
+import torch.nn.functional as F
+from torch.autograd import Variable
+from scipy.signal import get_window
+from librosa.util import pad_center, tiny
+from maha_tts.utils.audio import window_sumsquare
+class STFT(torch.nn.Module):
+    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
+    def __init__(self, filter_length=800, hop_length=200, win_length=800,
+                 window='hann'):
+        super(STFT, self).__init__()
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.window = window
+        self.forward_transform = None
+        scale = self.filter_length / self.hop_length
+        fourier_basis = np.fft.fft(np.eye(self.filter_length))
+        cutoff = int((self.filter_length / 2 + 1))
+        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
+                                   np.imag(fourier_basis[:cutoff, :])])
+        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
+        inverse_basis = torch.FloatTensor(
+            np.linalg.pinv(scale * fourier_basis).T[:, None, :])
+        if window is not None:
+            assert(filter_length >= win_length)
+            # get window and zero center pad it to filter_length
+            fft_window = get_window(window, win_length, fftbins=True)
+            fft_window = pad_center(fft_window, size = filter_length)
+            fft_window = torch.from_numpy(fft_window).float()
+            # window the bases
+            forward_basis *= fft_window
+            inverse_basis *= fft_window
+        self.register_buffer('forward_basis', forward_basis.float())
+        self.register_buffer('inverse_basis', inverse_basis.float())
+    def transform(self, input_data):
+        num_batches = input_data.size(0)
+        num_samples = input_data.size(1)
+        self.num_samples = num_samples
+        # similar to librosa, reflect-pad the input
+        input_data = input_data.view(num_batches, 1, num_samples)
+        input_data = F.pad(
+            input_data.unsqueeze(1),
+            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
+            mode='reflect')
+        input_data = input_data.squeeze(1)
+        forward_transform = F.conv1d(
+            input_data,
+            Variable(self.forward_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0)
+        cutoff = int((self.filter_length / 2) + 1)
+        real_part = forward_transform[:, :cutoff, :]
+        imag_part = forward_transform[:, cutoff:, :]
+        magnitude = torch.sqrt(real_part**2 + imag_part**2)
+        phase = torch.autograd.Variable(
+            torch.atan2(imag_part.data, real_part.data))
+        return magnitude, phase
+    def inverse(self, magnitude, phase):
+        recombine_magnitude_phase = torch.cat(
+            [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
+        inverse_transform = F.conv_transpose1d(
+            recombine_magnitude_phase,
+            Variable(self.inverse_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0)
+        if self.window is not None:
+            window_sum = window_sumsquare(
+                self.window, magnitude.size(-1), hop_length=self.hop_length,
+                win_length=self.win_length, n_fft=self.filter_length,
+                dtype=np.float32)
+            # remove modulation effects
+            approx_nonzero_indices = torch.from_numpy(
+                np.where(window_sum > tiny(window_sum))[0])
+            window_sum = torch.autograd.Variable(
+                torch.from_numpy(window_sum), requires_grad=False)
+            window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
+            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
+            # scale by hop ratio
+            inverse_transform *= float(self.filter_length) / self.hop_length
+        inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
+        inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):]
+        return inverse_transform
+    def forward(self, input_data):
+        self.magnitude, self.phase = self.transform(input_data)
+        reconstruction = self.inverse(self.magnitude, self.phase)
+        return reconstruction

ref_clips/2971_4275_000003_000007.wav ADDED Viewed

Binary file (392 kB). View file

ref_clips/2971_4275_000020_000001.wav ADDED Viewed

Binary file (386 kB). View file

ref_clips/2971_4275_000023_000010.wav ADDED Viewed

Binary file (435 kB). View file

ref_clips/2971_4275_000049_000000.wav ADDED Viewed

Binary file (366 kB). View file

ref_clips/2971_4275_000049_000004.wav ADDED Viewed

Binary file (321 kB). View file

ref_clips/2971_4275_000050_000000.wav ADDED Viewed

Binary file (385 kB). View file

tts.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import torch,glob
+from maha_tts import load_diffuser,load_models,infer_tts
+from scipy.io.wavfile import write
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print('Using:',device)
+text = 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition.'
+ref_clips = glob.glob('/Users/jaskaransingh/Desktop/NeuralSpeak/ref_clips/*.wav')
+# print(len(ref_clips))
+# diffuser = load_diffuser()
+diff_model,ts_model,vocoder,diffuser = load_models('Smolie',device)
+audio,sr = infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder)
+write('test.wav',sr,audio)