Spaces: Running on Zero
import os
from typing import Any, Callable, Iterator, Optional, Tuple, Union

import gradio as gr
import noisereduce as nr
import numpy as np
import spaces
import torch
import torch.nn as nn
from transformers import AutoTokenizer, VitsModel

token = os.environ.get("key_")
tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk", token=token)
models = {}
def remove_noise_nr(audio_data, sr=16000):
    """Remove background noise from a waveform using the noisereduce library."""
    reduced_noise = nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
    return reduced_noise
def _inference_forward_stream(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    speaker_embeddings: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    padding_mask: Optional[torch.Tensor] = None,
    chunk_size: int = 32,  # Chunk size for streaming output
    is_streaming: bool = True,
) -> Iterator[torch.Tensor]:
    """Generates speech waveforms in a streaming fashion."""
    if attention_mask is not None:
        padding_mask = attention_mask.unsqueeze(-1).float()
    else:
        padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()

    text_encoder_output = self.text_encoder(
        input_ids=input_ids,
        padding_mask=padding_mask,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
    hidden_states = hidden_states.transpose(1, 2)
    input_padding_mask = padding_mask.transpose(1, 2)
    prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
    prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances

    if self.config.use_stochastic_duration_prediction:
        log_duration = self.duration_predictor(
            hidden_states,
            input_padding_mask,
            speaker_embeddings,
            reverse=True,
            noise_scale=self.noise_scale_duration,
        )
    else:
        log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)

    length_scale = 1.0 / self.speaking_rate
    duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
    predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()

    # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
    indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
    output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
    output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)

    # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
    attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
    batch_size, _, output_length, input_length = attn_mask.shape
    cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
    indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
    valid_indices = indices.unsqueeze(0) < cum_duration
    valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
    padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
    attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask

    # Expand prior distribution
    prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
    prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)

    prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
    latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
    spectrogram = latents * output_padding_mask

    if is_streaming:
        for i in range(0, spectrogram.size(-1), chunk_size):
            with torch.no_grad():
                wav = self.decoder(spectrogram[:, :, i : i + chunk_size], speaker_embeddings)
                yield wav.squeeze().cpu().numpy()
    else:
        wav = self.decoder(spectrogram, speaker_embeddings)
        yield wav.squeeze().cpu().numpy()
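

# Illustrative sketch, not part of the original Space: one way the streaming
# generator above could be consumed, collecting the yielded waveform chunks
# into a single array. The helper name and its use of np.concatenate are an
# assumption for demonstration; the app itself calls the generator directly
# in `modelspeech` below.
def _collect_stream_chunks(model, input_ids, attention_mask, chunk_size=32):
    """Concatenate the waveform chunks yielded by _inference_forward_stream."""
    chunks = [
        np.atleast_1d(chunk)
        for chunk in _inference_forward_stream(
            model,
            input_ids=input_ids,
            attention_mask=attention_mask,
            speaker_embeddings=None,
            chunk_size=chunk_size,
            is_streaming=True,
        )
    ]
    return np.concatenate(chunks)
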
@spaces.GPU
def get_model(name_model):
    # Cache loaded models so repeated requests reuse the same weights.
    global models
    if name_model in models:
        return models[name_model]
    models[name_model] = VitsModel.from_pretrained(name_model, token=token).cuda()
    models[name_model].decoder.apply_weight_norm()
    # torch.nn.utils.weight_norm(self.decoder.conv_pre)
    # torch.nn.utils.weight_norm(self.decoder.conv_post)
    for flow in models[name_model].flow.flows:
        torch.nn.utils.weight_norm(flow.conv_pre)
        torch.nn.utils.weight_norm(flow.conv_post)
    return models[name_model]
# On ZeroGPU, CUDA is only attached inside functions decorated with @spaces.GPU,
# so at import time this tensor still reports being on the CPU.
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' 🤔

TXT = """السلام عليكم ورحمة الله وبركاتة يا هلا وسهلا ومراحب بالغالي اخباركم طيبين ان شاء الله ارحبوا على العين والراس """
@spaces.GPU
def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=0.8):
    inputs = tokenizer(text, return_tensors="pt")
    model = get_model(name_model)
    model.speaking_rate = speaking_rate  # the UI slider below supplies a value in [0, 1]
    with torch.no_grad():
        wav = list(
            _inference_forward_stream(
                model,
                input_ids=inputs.input_ids.cuda(),
                attention_mask=inputs.attention_mask.cuda(),
                speaker_embeddings=None,
                is_streaming=False,
            )
        )[0]
    # with torch.no_grad():
    #     wav = model(input_ids=inputs["input_ids"].cuda()).waveform.cpu().numpy().reshape(-1)  # .detach()
    return (model.config.sampling_rate, wav), (model.config.sampling_rate, remove_noise_nr(wav))
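
# Note: modelspeech returns two (sampling_rate, waveform) tuples -- the raw decoder
# output and a noise-reduced copy -- matching the two "audio" outputs of the
# Gradio interface below.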
model_choices = gr.Dropdown(
    choices=[
        "wasmdashai/vits-ar-sa-huba-v1",
        "wasmdashai/vits-ar-sa-huba-v2",
        "wasmdashai/vits-ar-sa-A",
        "wasmdashai/vits-ar-ye-sa",
        "wasmdashai/vits-ar-sa-M-v1",
        "wasmdashai/vits-ar-sa-M-v2",
    ],
    label="اختر النموذج",
    value="wasmdashai/vits-ar-sa-huba-v2",
)

demo = gr.Interface(
    fn=modelspeech,
    inputs=["text", model_choices, gr.Slider(0, 1, step=0.1, value=0.8)],
    outputs=["audio", "audio"],
)
demo.queue()
demo.launch()