import os
from typing import Iterator, Optional

import spaces  # ZeroGPU helper; keep this import near the top of the script
import gradio as gr
import noisereduce as nr
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, VitsModel

# Access token for the model repositories, read from the `key_` environment variable.
token = os.environ.get("key_")
tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk", token=token)

# Cache of loaded VITS models, keyed by checkpoint name.
models = {}
def remove_noise_nr(audio_data, sr=16000):
    """Remove background noise from the waveform using the noisereduce library."""
    reduced_noise = nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
    return reduced_noise

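# Re-implementation of the VITS inference pipeline (text encoder -> duration
# predictor -> alignment -> flow -> decoder) that yields decoded waveform chunks
# instead of returning a single tensor, so audio can be streamed while later
# chunks are still being generated.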
def _inference_forward_stream(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        speaker_embeddings: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        padding_mask: Optional[torch.Tensor] = None,
        chunk_size: int = 32,  # Chunk size for streaming output
        is_streaming: bool = True,
    ) -> Iterator[torch.Tensor]:
        """Generates speech waveforms in a streaming fashion."""
        if attention_mask is not None:
            padding_mask = attention_mask.unsqueeze(-1).float()
        else:
            padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()



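        # Encode the token sequence; besides the hidden states, the text encoder
        # returns the prior means and log-variances that parameterize the latents.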
        text_encoder_output = self.text_encoder(
            input_ids=input_ids,
            padding_mask=padding_mask,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
        hidden_states = hidden_states.transpose(1, 2)
        input_padding_mask = padding_mask.transpose(1, 2)

        prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
        prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances

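        # Predict a log-duration for every input token, using either the stochastic
        # duration predictor (run in reverse) or the deterministic one.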
        if self.config.use_stochastic_duration_prediction:
            log_duration = self.duration_predictor(
                hidden_states,
                input_padding_mask,
                speaker_embeddings,
                reverse=True,
                noise_scale=self.noise_scale_duration,
            )
        else:
            log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)

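        # Turn log-durations into integer frame counts; a higher speaking_rate gives
        # a smaller length_scale and therefore shorter, faster speech.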
        length_scale = 1.0 / self.speaking_rate
        duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
        predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()


        # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
        indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
        output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
        output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)

        # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
        attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
        batch_size, _, output_length, input_length = attn_mask.shape
        cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
        indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
        valid_indices = indices.unsqueeze(0) < cum_duration
        valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
        padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
        attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
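        # `attn` is now a hard monotonic alignment: every output frame is assigned to
        # exactly one input token, with `duration` frames per token.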

        # Expand prior distribution
        prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
        prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)

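        # Sample latents from the expanded prior and run the flow in reverse to map
        # them into the latent space expected by the decoder.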
        prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
        latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)

        spectrogram = latents * output_padding_mask
        if is_streaming:
            # Decode the latent "spectrogram" chunk by chunk so audio can be
            # yielded (and played) before the whole utterance is finished.
            for i in range(0, spectrogram.size(-1), chunk_size):
                with torch.no_grad():
                    wav = self.decoder(spectrogram[:, :, i : i + chunk_size], speaker_embeddings)
                yield wav.squeeze().cpu().numpy()
        else:
            # Decode the full sequence in a single pass.
            with torch.no_grad():
                wav = self.decoder(spectrogram, speaker_embeddings)
            yield wav.squeeze().cpu().numpy()
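
# The per-checkpoint cache lets later requests for the same model skip the download
# and initialization step.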
@spaces.GPU
def get_model(name_model):
    """Return a cached VITS model, loading and preparing it on first use."""
    global models
    if name_model in models:
        return models[name_model]
    model = VitsModel.from_pretrained(name_model, token=token).cuda()
    # Re-apply weight norm to the decoder and flow convolutions before inference.
    model.decoder.apply_weight_norm()
    for flow in model.flow.flows:
        torch.nn.utils.weight_norm(flow.conv_pre)
        torch.nn.utils.weight_norm(flow.conv_post)
    models[name_model] = model
    return model


# ZeroGPU check: outside a @spaces.GPU function this tensor still reports 'cpu';
# it only lands on the GPU inside decorated calls.
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' 🤔

# Default demo text: an informal Arabic greeting (roughly "Peace be upon you,
# hello and a warm welcome, how are you? Well, God willing. You are most welcome.").
TXT = """السلام  عليكم  ورحمة الله وبركاتة  يا هلا وسهلا ومراحب بالغالي  اخباركم  طيبين ان شاء الله     ارحبوا  على العين والراس     """
@spaces.GPU
def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=0.8):
    """Synthesize `text` with the selected model; return raw and noise-reduced audio."""
    inputs = tokenizer(text, return_tensors="pt")
    model = get_model(name_model)
    model.speaking_rate = speaking_rate  # the UI slider supplies values in [0, 1]
    with torch.no_grad():
        wav = list(
            _inference_forward_stream(
                model,
                input_ids=inputs.input_ids.cuda(),
                attention_mask=inputs.attention_mask.cuda(),
                speaker_embeddings=None,
                is_streaming=False,
            )
        )[0]
    return (model.config.sampling_rate, wav), (model.config.sampling_rate, remove_noise_nr(wav))

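# Gradio front end: a text box, a model selector, and a speaking-rate slider (0-1).
# The interface returns both the raw synthesized audio and a noise-reduced copy.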
model_choices = gr.Dropdown(
    choices=[
        "wasmdashai/vits-ar-sa-huba-v1",
        "wasmdashai/vits-ar-sa-huba-v2",
        "wasmdashai/vits-ar-sa-A",
        "wasmdashai/vits-ar-ye-sa",
        "wasmdashai/vits-ar-sa-M-v1",
        "wasmdashai/vits-ar-sa-M-v2",
    ],
    label="اختر النموذج",  # "Choose the model"
    value="wasmdashai/vits-ar-sa-huba-v2",
)

demo = gr.Interface(
    fn=modelspeech,
    inputs=["text", model_choices, gr.Slider(0, 1, step=0.1, value=0.8)],
    outputs=["audio", "audio"],
)
demo.queue()
demo.launch()