import glob
import os

import numpy as np
import requests
import torch
import torch.nn.functional as F

from tqdm import tqdm
from librosa.filters import mel as librosa_mel_fn
from scipy.io.wavfile import write
from scipy.special import softmax

from maha_tts.models.diff_model import load_diff_model
from maha_tts.models.autoregressive import load_TS_model
from maha_tts.models.vocoder import load_vocoder_model, infer_wav
from maha_tts.utils.audio import denormalize_tacotron_mel, normalize_tacotron_mel, load_wav_to_torch, dynamic_range_compression
from maha_tts.utils.stft import STFT
from maha_tts.utils.diffusion import SpacedDiffusion, get_named_beta_schedule, space_timesteps
from maha_tts.text.symbols import labels, text_labels, text_labels_en, code_labels, text_enc, text_dec, code_enc, code_dec, text_enc_en, text_dec_en
from maha_tts.text.cleaners import english_cleaners
from maha_tts.config import config

DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'maha_tts', 'models')

stft_fn = STFT(config.filter_length, config.hop_length, config.win_length)

# Mel filterbank, built once at import time and reused by get_mel() below.
mel_basis = librosa_mel_fn(
    sr=config.sampling_rate, n_fft=config.filter_length, n_mels=config.n_mel_channels,
    fmin=config.mel_fmin, fmax=config.mel_fmax)
mel_basis = torch.from_numpy(mel_basis).float()

model_dirs = {
    'Smolie': ['https://huggingface.co/Dubverse/MahaTTS/resolve/main/maha_tts/pretrained_models/smolie/S2A/s2a_latest.pt',
               'https://huggingface.co/Dubverse/MahaTTS/resolve/main/maha_tts/pretrained_models/smolie/T2S/t2s_best.pt'],
    'Smolie-en': [''],
    'Smolie-in': [''],
    'hifigan': ['https://huggingface.co/Dubverse/MahaTTS/resolve/main/maha_tts/pretrained_models/hifigan/g_02500000',
                'https://huggingface.co/Dubverse/MahaTTS/resolve/main/maha_tts/pretrained_models/hifigan/config.json']
}
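# NOTE: the URL lists for 'Smolie-en' and 'Smolie-in' are left empty above, so
# download_model() cannot fetch those checkpoints until URLs are provided.
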
def download_file(url, filename):
    '''Stream a file from `url` to `filename`, showing a progress bar.'''
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    response.raise_for_status()

    with open(filename, 'wb') as file, tqdm(
        desc=filename,
        total=total_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in response.iter_content(chunk_size=1024):
            file.write(data)
            bar.update(len(data))

    print(f"Download complete: {filename}\n")
def download_model(name):
    '''Download the checkpoints for `name` (a key of model_dirs) into DEFAULT_MODELS_DIR.'''
    print(f'Downloading {name} ....')
    checkpoint_diff = os.path.join(DEFAULT_MODELS_DIR, name, 's2a_latest.pt')
    checkpoint_ts = os.path.join(DEFAULT_MODELS_DIR, name, 't2s_best.pt')
    checkpoint_voco = os.path.join(DEFAULT_MODELS_DIR, 'hifigan', 'g_02500000')
    voco_config_path = os.path.join(DEFAULT_MODELS_DIR, 'hifigan', 'config.json')

    os.makedirs(os.path.join(DEFAULT_MODELS_DIR, name), exist_ok=True)

    if name == 'hifigan':
        download_file(model_dirs[name][0], checkpoint_voco)
        download_file(model_dirs[name][1], voco_config_path)
    else:
        download_file(model_dirs[name][0], checkpoint_diff)
        download_file(model_dirs[name][1], checkpoint_ts)

def load_models(name, device=torch.device('cpu')):
    '''
    Load pre-trained models for the different components of the text-to-speech
    system, downloading any missing checkpoints first.

    Args:
        name (str): Model name, one of the keys of model_dirs (e.g. 'Smolie').
        device (torch.device): The target device for model loading (e.g., CPU or CUDA).

    Returns:
        diff_model (object): Loaded diffusion model for semantic-to-acoustic tokens.
        ts_model (object): Loaded text-to-semantic model for converting text to semantic tokens.
        vocoder (object): Loaded vocoder model for generating a waveform from acoustic tokens.
        diffuser (object): Configured diffuser object for use with the diffusion model.
    '''
    assert name in model_dirs, 'Unknown model name: ' + name

    checkpoint_diff = os.path.join(DEFAULT_MODELS_DIR, name, 's2a_latest.pt')
    checkpoint_ts = os.path.join(DEFAULT_MODELS_DIR, name, 't2s_best.pt')
    checkpoint_voco = os.path.join(DEFAULT_MODELS_DIR, 'hifigan', 'g_02500000')
    voco_config_path = os.path.join(DEFAULT_MODELS_DIR, 'hifigan', 'config.json')

    if not os.path.exists(checkpoint_diff) or not os.path.exists(checkpoint_ts):
        download_model(name)

    if not os.path.exists(checkpoint_voco) or not os.path.exists(voco_config_path):
        download_model('hifigan')

    diff_model = load_diff_model(checkpoint_diff, device)
    ts_model = load_TS_model(checkpoint_ts, device, name)
    vocoder = load_vocoder_model(voco_config_path, checkpoint_voco, device)
    diffuser = load_diffuser()

    return diff_model, ts_model, vocoder, diffuser
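
# A minimal usage sketch (assumes the 'Smolie' checkpoint URLs above are
# reachable and there is space for them under DEFAULT_MODELS_DIR):
#
#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#   diff_model, ts_model, vocoder, diffuser = load_models('Smolie', device)
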
def infer_mel(model, timeshape, code, ref_mel, diffuser, temperature=1.0):
    '''Run the reverse-diffusion loop to produce a (1, 80, timeshape) mel spectrogram.'''
    device = next(model.parameters()).device
    code = code.to(device)
    ref_mel = ref_mel.to(device)
    output_shape = (1, 80, timeshape)
    # Start from Gaussian noise scaled by `temperature`; lower values give more
    # conservative samples.
    noise = torch.randn(output_shape, device=code.device) * temperature
    mel = diffuser.p_sample_loop(model, output_shape, noise=noise,
                                 model_kwargs={'code_emb': code, 'ref_clips': ref_mel},
                                 progress=True)
    return denormalize_tacotron_mel(mel)

def generate_semantic_tokens(
        text,
        model,
        ref_mels,
        language=None,
        temp=0.7,
        top_p=None,
        top_k=1,
        n_tot_steps=1000,
        device=None):
    '''Autoregressively sample semantic tokens for `text`, conditioned on the
    reference mels, stopping at an <EST>/<PAD> token or after n_tot_steps.'''
    semb = []
    with torch.no_grad():
        for n in tqdm(range(n_tot_steps)):
            x = get_inputs(text, semb, ref_mels, device, model.name)
            _, result = model(**x, language=language)
            relevant_logits = result[0, :, -1]

            if top_p is not None:
                # Nucleus (top-p) filtering: keep the smallest set of tokens whose
                # cumulative probability exceeds top_p; mask out the rest.
                original_device = relevant_logits.device
                relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
                sorted_indices = np.argsort(relevant_logits)[::-1]
                sorted_logits = relevant_logits[sorted_indices]
                cumulative_probs = np.cumsum(softmax(sorted_logits))
                sorted_indices_to_remove = cumulative_probs > top_p
                # Shift the mask right by one so the first token that crosses
                # top_p is still kept.
                sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
                sorted_indices_to_remove[0] = False
                relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
                relevant_logits = torch.from_numpy(relevant_logits)
                relevant_logits = relevant_logits.to(original_device)

            if top_k is not None:
                # Top-k filtering: mask out everything below the k-th largest logit.
                v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
                relevant_logits[relevant_logits < v[-1]] = -float('Inf')

            probs = F.softmax(relevant_logits / temp, dim=-1)
            item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
            semb.append(str(code_dec[item_next.item()]))
            if semb[-1] == '<EST>' or semb[-1] == '<PAD>':
                break

            del relevant_logits, probs, item_next

    # Drop the trailing stop token before converting the codes to a tensor.
    semb = torch.tensor([int(i) for i in semb[:-1]])
    return semb, result
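
# Worked example of the top-p mask above: with sorted probabilities
# [0.5, 0.3, 0.1, 0.1] and top_p=0.8, the cumulative sums are
# [0.5, 0.8, 0.9, 1.0]; after the one-position shift only the last token is
# masked, so sampling draws from {0.5, 0.3, 0.1} renormalised.
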
def get_inputs(text, semb=[], ref_mels=[], device=torch.device('cpu'), name='Smolie-in'):
    '''Encode text and already-generated semantic tokens into model inputs.'''
    text = text.lower()
    if name == 'Smolie-en':
        text_ids = [text_enc_en['<S>']] + [text_enc_en[i] for i in text.strip()] + [text_enc_en['<E>']]
    else:
        text_ids = [text_enc['<S>']] + [text_enc[i] for i in text.strip()] + [text_enc['<E>']]

    semb_ids = [code_enc['<SST>']] + [code_enc[i] for i in semb]

    return {'text_ids': torch.tensor(text_ids).unsqueeze(0).to(device),
            'codes_ids': torch.tensor(semb_ids).unsqueeze(0).to(device),
            'ref_clips': normalize_tacotron_mel(ref_mels).to(device)}

def get_ref_mels(ref_clips):
    '''Stack reference-clip mels into a padded (1, n_clips, 80, 500) tensor.'''
    ref_mels = []
    for i in ref_clips:
        ref_mels.append(get_mel(i)[0][:, :500])

    # Pad every clip to 500 frames with near-zero noise.
    ref_mels_padded = torch.randn((len(ref_mels), 80, 500)) * 1e-8
    for i, mel in enumerate(ref_mels):
        ref_mels_padded[i, :, :mel.size(1)] = mel
    return ref_mels_padded.unsqueeze(0)
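
# For example, with a hypothetical directory of reference wavs:
#
#   ref_mels = get_ref_mels(glob.glob('ref_clips/*.wav'))  # shape (1, n_clips, 80, 500)
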
def get_mel(filepath):
    '''Compute a log-mel spectrogram and per-frame energy for a wav file.'''
    audio, sampling_rate = load_wav_to_torch(filepath)
    audio_norm = audio / config.MAX_WAV_VALUE
    y = audio_norm.unsqueeze(0)

    assert torch.min(y.data) >= -1
    assert torch.max(y.data) <= 1
    magnitudes, phases = stft_fn.transform(y)
    magnitudes = magnitudes.data
    mel_output = torch.matmul(mel_basis, magnitudes)
    mel_output = dynamic_range_compression(mel_output)
    melspec = torch.squeeze(mel_output, 0)
    energy = torch.norm(magnitudes, dim=1).squeeze(0)
    return melspec, list(energy)
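
# Frame counts follow the usual STFT relation: a clip of N samples yields
# roughly N / config.hop_length mel frames, which is what the length
# conversion in infer_tts() below relies on (assuming config.hop_length
# matches the hard-coded 256 there).
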
def infer_tts(text, ref_clips, diffuser, diff_model, ts_model, vocoder, language=None):
    '''
    Generate audio from the given text using a text-to-speech (TTS) pipeline.

    Args:
        text (str): The input text to be synthesized into speech.
        ref_clips (list): A list of paths to reference audio clips, preferably more than 3 clips.
        diffuser (object): A diffusion object used for denoising and guidance in the diffusion model; obtain it via load_diffuser.
        diff_model: Diffusion model for semantic-to-acoustic tokens.
        ts_model: Text-to-semantic model for converting text to semantic tokens.
        vocoder: Vocoder model for generating a waveform from acoustic tokens.

    Returns:
        audio (numpy.ndarray): Generated audio waveform.
        sampling_rate (int): Sampling rate of the generated audio.

    Description:
        The `infer_tts` function takes input text and reference audio clips and processes them through a TTS pipeline.
        It first performs text preprocessing and generates semantic tokens using the text-to-semantic model.
        Then it infers mel-spectrogram features using the diffusion model and the provided diffuser.
        Finally, it generates audio from the mel spectrogram using the vocoder.

    Note: The function requires properly configured diff_model, ts_model, and vocoder objects for successful TTS.

    Example usage:
        audio, sampling_rate = infer_tts("Hello, how are you?", ref_clips, diffuser, diff_model, ts_model, vocoder)
    '''
    device = next(ts_model.parameters()).device
    text = english_cleaners(text)
    ref_mels = get_ref_mels(ref_clips)
    with torch.no_grad():
        sem_tok, _ = generate_semantic_tokens(
            text,
            ts_model,
            ref_mels,
            language,
            temp=0.7,
            top_p=0.8,
            top_k=5,
            n_tot_steps=1000,
            device=device)
        # Map the semantic-token count to a mel frame count: tokens are spaced
        # 320 samples apart at 16 kHz, mel frames 256 samples apart at 22.05 kHz.
        mel = infer_mel(diff_model,
                        int(((sem_tok.shape[-1] * 320 / 16000) * 22050 / 256) + 1),
                        sem_tok.unsqueeze(0) + 1,
                        normalize_tacotron_mel(ref_mels), diffuser, temperature=0.5)

    audio = infer_wav(mel, vocoder)

    return audio, config.sampling_rate

def load_diffuser(timesteps=100, guidance=3):
    '''
    Load and configure a diffuser for denoising and guidance in the diffusion model.

    Args:
        timesteps (int): Number of denoising steps out of 1000. Default is 100.
        guidance (int): Conditioning-free guidance parameter. Default is 3.

    Returns:
        diffuser (object): Configured diffuser object for use with the diffusion model.

    Description:
        The `load_diffuser` function initializes a diffuser with specific settings for denoising and guidance.
    '''
    betas = get_named_beta_schedule('linear', config.sa_timesteps_max)
    diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [timesteps]), model_mean_type='epsilon',
                               model_var_type='learned_range', loss_type='rescaled_mse', betas=betas,
                               conditioning_free=True, conditioning_free_k=guidance)
    diffuser.training = False
    return diffuser
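
# Fewer denoising steps typically trade output quality for speed, e.g.:
#
#   diffuser = load_diffuser(timesteps=50, guidance=3)
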
if __name__ == '__main__':

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    text = 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition.'
    ref_clips = glob.glob('/Users/jaskaransingh/Desktop/maha_tts/ref_clips/*.wav')

    # 'Smolie' is the only entry in model_dirs with checkpoint URLs filled in;
    # load_models downloads any missing checkpoints and also returns a diffuser.
    diff_model, ts_model, vocoder, diffuser = load_models('Smolie', device)
    audio, sr = infer_tts(text, ref_clips, diffuser, diff_model, ts_model, vocoder)
    write('test.wav', sr, audio)