Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,769 Bytes
9f21f94 d789727 a801789 d789727 7fef4b1 cbfb85b d789727 7d0e0a6 d789727 7d0e0a6 d789727 40b1908 d789727 54547cf fabec3c 54547cf d789727 54547cf 1ea390e d789727 54547cf 1f1eee1 d789727 54547cf d789727 5554841 d789727 54547cf d789727 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer,VitsModel
import os
import numpy as np
# Access token passed as `token=` to every `from_pretrained` call in this file;
# read from the "key_" environment variable (None when the variable is unset).
token=os.environ.get("key_")
#tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk",token=token)
# Cache of loaded models keyed by model name; populated by get_model().
models= {}
import noisereduce as nr
import torch
from typing import Any, Callable, Optional, Tuple, Union,Iterator
import torch.nn as nn # Import the missing module
def remove_noise_nr(audio_data,sr=16000):
    """Denoise a waveform with ``noisereduce``.

    Args:
        audio_data: Audio samples, forwarded to ``nr.reduce_noise`` as ``y``.
        sr: Sampling rate forwarded to ``nr.reduce_noise`` (default 16000).

    Returns:
        The result of ``nr.reduce_noise`` for the given input.
    """
    # The hop length is fixed at 256; only the signal and its rate vary.
    denoise_kwargs = {"y": audio_data, "hop_length": 256, "sr": sr}
    return nr.reduce_noise(**denoise_kwargs)
def _inference_forward_stream(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    speaker_embeddings: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    padding_mask: Optional[torch.Tensor] = None,
    chunk_size: int = 32,  # Chunk size for streaming output
    is_streaming: bool = True,
) -> Iterator[np.ndarray]:
    """Generate speech waveforms, optionally decoding the spectrogram in chunks.

    Runs ``self.text_encoder``, ``self.duration_predictor`` and ``self.flow``
    (in reverse) to build a latent spectrogram, then feeds it to
    ``self.decoder`` either as a whole or in slices of ``chunk_size`` frames.

    Args:
        self: Model object providing ``text_encoder``, ``duration_predictor``,
            ``flow``, ``decoder``, ``config``, ``speaking_rate``,
            ``noise_scale`` and ``noise_scale_duration``.
        input_ids: Token ids; required.
        attention_mask: Optional mask over ``input_ids``; when omitted, every
            position is treated as valid.
        speaker_embeddings: Passed through unchanged to the duration
            predictor, the flow and the decoder.
        output_attentions: Forwarded to the text encoder.
        output_hidden_states: Forwarded to the text encoder.
        return_dict: Forwarded to the text encoder. When falsy (including
            ``None``) the encoder output is indexed as a tuple, otherwise its
            named attributes are read.
        padding_mask: Ignored. The mask is always rebuilt from
            ``attention_mask``; the parameter is kept only so existing callers
            keep working.
        chunk_size: Number of spectrogram frames decoded per yielded chunk
            when ``is_streaming`` is true.
        is_streaming: Yield one waveform per chunk when true, otherwise a
            single waveform for the whole spectrogram.

    Yields:
        The decoder output, squeezed, moved to CPU and converted to a NumPy
        array. Each streamed chunk is decoded independently of its neighbours.

    Raises:
        ValueError: If ``input_ids`` is ``None``, or if streaming is requested
            with ``chunk_size`` smaller than 1.
    """
    if input_ids is None:
        raise ValueError("input_ids must be provided")
    if is_streaming and chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # The incoming `padding_mask` argument is intentionally overwritten here.
    if attention_mask is not None:
        padding_mask = attention_mask.unsqueeze(-1).float()
    else:
        padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()

    text_encoder_output = self.text_encoder(
        input_ids=input_ids,
        padding_mask=padding_mask,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
    hidden_states = hidden_states.transpose(1, 2)
    input_padding_mask = padding_mask.transpose(1, 2)
    prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
    prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances

    if self.config.use_stochastic_duration_prediction:
        log_duration = self.duration_predictor(
            hidden_states,
            input_padding_mask,
            speaker_embeddings,
            reverse=True,
            noise_scale=self.noise_scale_duration,
        )
    else:
        log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)

    # A higher speaking rate shortens every predicted duration.
    length_scale = 1.0 / self.speaking_rate
    duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
    # Clamp so that at least one output frame is always produced.
    predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()

    # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
    indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
    output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
    output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)

    # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
    attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
    batch_size, _, output_length, input_length = attn_mask.shape
    cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
    indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
    valid_indices = indices.unsqueeze(0) < cum_duration
    valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
    # Subtracting the row-shifted copy leaves, for each input token, ones only
    # on the output frames that belong to that token.
    padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
    attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask

    # Expand prior distribution
    prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
    prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)
    prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
    latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
    spectrogram = latents * output_padding_mask

    # Decoding always runs under no_grad so `.numpy()` works whatever the
    # caller's grad mode is. The `yield` stays OUTSIDE the `with` block:
    # suspending a generator inside `torch.no_grad()` would leave gradients
    # disabled in the consumer until the generator is resumed.
    if is_streaming:
        for start in range(0, spectrogram.size(-1), chunk_size):
            with torch.no_grad():
                wav = self.decoder(spectrogram[:, :, start : start + chunk_size], speaker_embeddings)
            yield wav.squeeze().cpu().numpy()
    else:
        with torch.no_grad():
            wav = self.decoder(spectrogram, speaker_embeddings)
        yield wav.squeeze().cpu().numpy()
# Tokenizers already loaded by get_model(), keyed by tokenizer repo id, so a
# repeated request does not call AutoTokenizer.from_pretrained again.
_tokenizers = {}


def _tokenizer_repo_for(name_model):
    """Return the tokenizer repo id to load for the model *name_model*."""
    if name_model == 'wasmdashai/vits-en-v1':
        return "wasmdashai/vits-en-v1"
    return "wasmdashai/vtk"


@spaces.GPU
def get_model(name_model):
    """Return the ``(model, tokenizer)`` pair for *name_model*, loading on first use.

    Args:
        name_model: Model repo id passed to ``VitsModel.from_pretrained``.

    Returns:
        A tuple ``(model, tokenizer)``. The model is cached in the module-level
        ``models`` dict; the tokenizer is cached per tokenizer repo.
    """
    global models
    if name_model not in models:
        model = VitsModel.from_pretrained(name_model, token=token).cuda()
        model.decoder.apply_weight_norm()
        for flow in model.flow.flows:
            torch.nn.utils.weight_norm(flow.conv_pre)
            torch.nn.utils.weight_norm(flow.conv_post)
        # Cache only once setup has finished, so a failure above cannot leave
        # a half-configured model behind for later calls.
        models[name_model] = model

    tokenizer_repo = _tokenizer_repo_for(name_model)
    if tokenizer_repo not in _tokenizers:
        _tokenizers[tokenizer_repo] = AutoTokenizer.from_pretrained(tokenizer_repo, token=token)
    return models[name_model], _tokenizers[tokenizer_repo]
# Create a one-element tensor, move it to CUDA at import time, and print the
# device it ends up on.
# NOTE(review): the existing trailing comment says this prints 'cpu' —
# presumably a quirk of the hosting runtime; confirm before relying on it.
zero = torch.Tensor([0]).cuda()
print(zero.device) # <-- 'cpu' 🤔
import torch
# Default input text for modelspeech() (an Arabic sample sentence).
TXT="""السلام عليكم ورحمة الله وبركاتة يا هلا وسهلا ومراحب بالغالي اخباركم طيبين ان شاء الله ارحبوا على العين والراس """
@spaces.GPU
def modelspeech(text=TXT,name_model="wasmdashai/vits-ar-sa-huba-v2",speaking_rate=16000):
    """Synthesize *text* with the selected model and return denoised audio.

    Args:
        text: Text to speak; defaults to the module-level ``TXT`` sample.
        name_model: Model repo id handed to ``get_model``.
        speaking_rate: Value assigned to ``model.speaking_rate`` before
            synthesis.
            NOTE(review): the default of 16000 looks like a sampling rate
            rather than a speaking rate (the UI slider in this file sends
            0.1-1); it is left unchanged to keep the signature stable.

    Returns:
        A tuple ``(sampling_rate, waveform)`` where ``sampling_rate`` is
        ``model.config.sampling_rate`` and ``waveform`` is the noise-reduced
        audio.
    """
    model, tokenizer = get_model(name_model)
    inputs = tokenizer(text, return_tensors="pt")
    model.speaking_rate = speaking_rate
    with torch.no_grad():
        stream = _inference_forward_stream(
            model,
            input_ids=inputs.input_ids.cuda(),
            attention_mask=inputs.attention_mask.cuda(),
            speaker_embeddings=None,
            is_streaming=False,
        )
        # Non-streaming mode yields exactly one waveform.
        wav = next(stream)
    sampling_rate = model.config.sampling_rate
    # Denoise at the rate the audio was generated at, not the helper's
    # hard-coded 16000 default, so both returned values agree.
    return (sampling_rate, remove_noise_nr(wav, sr=sampling_rate))
# Model repositories selectable in the UI, in display order.
_model_repo_ids = [
    "wasmdashai/vits-ar-sa-huba-v1",
    "wasmdashai/vits-ar-sa-huba-v2",
    "wasmdashai/vits-ar-sa-A",
    "wasmdashai/vits-ar-ye-sa",
    "wasmdashai/vits-ar-sa-M-v1",
    "wasmdashai/vits-en-v1",
]

# Dropdown for picking the model; pre-selects the huba-v2 voice.
model_choices = gr.Dropdown(
    label="اختر النموذج",
    value="wasmdashai/vits-ar-sa-huba-v2",
    choices=_model_repo_ids,
)

# Slider feeding modelspeech's third argument (speaking_rate).
_speaking_rate_slider = gr.Slider(0.1, 1, step=0.1, value=0.8)

# Wire text, model choice and slider into modelspeech; the output is audio.
demo = gr.Interface(
    fn=modelspeech,
    inputs=["text", model_choices, _speaking_rate_slider],
    outputs=["audio"],
)
demo.queue()
demo.launch()
|