import os
from typing import Any, Callable, Iterator, Optional, Tuple, Union

import gradio as gr
import noisereduce as nr
import numpy as np
import spaces
import torch
import torch.nn as nn
from transformers import AutoTokenizer, VitsModel

# Hugging Face access token for the private checkpoints, read from the Space secrets.
token = os.environ.get("key_")

tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk", token=token)

# Cache of loaded VitsModel instances, keyed by checkpoint name.
models = {}


def remove_noise_nr(audio_data, sr=16000):
    """Removes noise from the waveform using the noisereduce library."""
    reduced_noise = nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
    return reduced_noise
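
# Usage sketch: the denoiser is applied to the raw model output below. Note that the
# 16 kHz default may differ from the model's actual sampling rate, so pass sr
# explicitly when the rate is known, e.g.
#   clean = remove_noise_nr(wav, sr=model.config.sampling_rate)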


def _inference_forward_stream(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    speaker_embeddings: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    padding_mask: Optional[torch.Tensor] = None,
    chunk_size: int = 32,  # Chunk size for streaming output
    is_streaming: bool = True,
) -> Iterator[torch.Tensor]:
    """Generates speech waveforms in a streaming fashion, yielding decoded audio chunks."""
    if attention_mask is not None:
        padding_mask = attention_mask.unsqueeze(-1).float()
    else:
        padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()

    text_encoder_output = self.text_encoder(
        input_ids=input_ids,
        padding_mask=padding_mask,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
    hidden_states = hidden_states.transpose(1, 2)
    input_padding_mask = padding_mask.transpose(1, 2)

    prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
    prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances
    if self.config.use_stochastic_duration_prediction:
        log_duration = self.duration_predictor(
            hidden_states,
            input_padding_mask,
            speaker_embeddings,
            reverse=True,
            noise_scale=self.noise_scale_duration,
        )
    else:
        log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)

    length_scale = 1.0 / self.speaking_rate
    duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
    predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()

    # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
    indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
    output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
    output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)

    # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
    attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
    batch_size, _, output_length, input_length = attn_mask.shape
    cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
    indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
    valid_indices = indices.unsqueeze(0) < cum_duration
    valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
    padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
    attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
    # Expand prior distribution
    prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
    prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)

    prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
    latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
    spectrogram = latents * output_padding_mask

    if is_streaming:
        # Decode the spectrogram chunk by chunk so audio can be emitted incrementally.
        for i in range(0, spectrogram.size(-1), chunk_size):
            with torch.no_grad():
                wav = self.decoder(spectrogram[:, :, i : i + chunk_size], speaker_embeddings)
                yield wav.squeeze().cpu().numpy()
    else:
        wav = self.decoder(spectrogram, speaker_embeddings)
        yield wav.squeeze().cpu().numpy()
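
# Consumption sketch (assuming `model` is a loaded VitsModel and `inputs` comes from
# the tokenizer): each iteration yields a numpy audio chunk; with is_streaming=False
# the full waveform is yielded in a single step.
#   for chunk in _inference_forward_stream(model, input_ids=inputs.input_ids,
#                                           attention_mask=inputs.attention_mask):
#       buffer.append(chunk)  # hypothetical consumer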

@spaces.GPU
def get_model(name_model):
    global models
    if name_model in models:
        return models[name_model]
    models[name_model] = VitsModel.from_pretrained(name_model, token=token).cuda()
    models[name_model].decoder.apply_weight_norm()
    # torch.nn.utils.weight_norm(self.decoder.conv_pre)
    # torch.nn.utils.weight_norm(self.decoder.conv_post)
    for flow in models[name_model].flow.flows:
        torch.nn.utils.weight_norm(flow.conv_pre)
        torch.nn.utils.weight_norm(flow.conv_post)
    return models[name_model]
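
# get_model caches each checkpoint in the module-level `models` dict, so a given
# model is downloaded and moved to the GPU only once per process; later calls with
# the same name reuse the cached instance.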
# ZeroGPU sanity check: outside a @spaces.GPU function the tensor still reports 'cpu'.
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' 🤔

@spaces.GPU
def modelspeech(text, name_model):
    inputs = tokenizer(text, return_tensors="pt")
    model = get_model(name_model)
    with torch.no_grad():
        # is_streaming=False makes the generator yield the full waveform in one chunk.
        wav = list(_inference_forward_stream(
            model,
            input_ids=inputs.input_ids.cuda(),
            attention_mask=inputs.attention_mask.cuda(),
            speaker_embeddings=None,
            is_streaming=False,
        ))[0]
    # Alternative non-streaming call, kept for reference:
    # wav = model(input_ids=inputs["input_ids"].cuda()).waveform.cpu().numpy().reshape(-1)
    sampling_rate = model.config.sampling_rate
    # Two audio outputs: the raw synthesis and a denoised copy.
    return (sampling_rate, wav), (sampling_rate, remove_noise_nr(wav))
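
# Quick local check (hypothetical input text; needs a GPU and a valid `key_` token):
#   (sr, raw), (sr, clean) = modelspeech("مرحبا", "wasmdashai/vits-ar-sa-huba-v2")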

model_choices = gr.Dropdown(
    choices=[
        "wasmdashai/vits-ar",
        "wasmdashai/vits-ar-v1",
        "wasmdashai/vits-ar-sa-huba",
        "wasmdashai/vits-ar-sa-ms",
        "wasmdashai/vits-ar-sa-magd",
        "wasmdashai/vtk",
        "wasmdashai/mak",
        "wasmdashai/vits-ar-sa-huba-v1",
        "wasmdashai/vits-ar-sa-huba-v2",
        "wasmdashai/vits-ar-z1",
        "wasmdashai/vits-ar-sa-A",
        "wasmdashai/vits-ar-ye",
    ],
    label="اختر النموذج",  # "Choose the model"
    value="wasmdashai/vits-ar-sa-huba-v2",
)
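
# The Interface maps a text box plus the model dropdown to modelspeech, which feeds
# two audio components: the raw synthesis and its noise-reduced version.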
demo = gr.Interface(fn=modelspeech, inputs=["text", model_choices], outputs=["audio", "audio"])
demo.queue()
demo.launch()