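"""Gradio demo for wasmdashai VITS text-to-speech models (Arabic and English).

Loads a selected VITS checkpoint, generates speech (optionally in streamed
chunks), applies noise reduction, and serves the result through a Gradio
interface on a ZeroGPU Space.
"""
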
import os
from typing import Any, Callable, Iterator, Optional, Tuple, Union

import gradio as gr
import noisereduce as nr
import numpy as np
import spaces
import torch
import torch.nn as nn
from transformers import AutoTokenizer, VitsModel

# Access token for the private model repositories on the Hugging Face Hub.
token = os.environ.get("key_")

# Cache of loaded VITS models, keyed by repository name.
models = {}


def remove_noise_nr(audio_data, sr=16000):
    """Apply spectral noise reduction to the generated waveform."""
    reduced_noise = nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
    return reduced_noise


def _inference_forward_stream(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        speaker_embeddings: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        padding_mask: Optional[torch.Tensor] = None,
        chunk_size: int = 32,  # Chunk size for streaming output
        is_streaming: bool = True,
    ) -> Iterator[torch.Tensor]:
        """Generates speech waveforms in a streaming fashion."""
        if attention_mask is not None:
            padding_mask = attention_mask.unsqueeze(-1).float()
        else:
            padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()

        text_encoder_output = self.text_encoder(
            input_ids=input_ids,
            padding_mask=padding_mask,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
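        # Unpack encoder outputs: hidden states plus the prior distribution
        # parameters (means and log-variances) used to sample latent frames.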
        hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
        hidden_states = hidden_states.transpose(1, 2)
        input_padding_mask = padding_mask.transpose(1, 2)

        prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
        prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances

        if self.config.use_stochastic_duration_prediction:
            log_duration = self.duration_predictor(
                hidden_states,
                input_padding_mask,
                speaker_embeddings,
                reverse=True,
                noise_scale=self.noise_scale_duration,
            )
        else:
            log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)

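        # Scale the predicted durations by the inverse speaking rate, round up to
        # whole frames, and sum them to get each sequence's output length.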
        length_scale = 1.0 / self.speaking_rate
        duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
        predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()

        # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
        indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
        output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
        output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)

        # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
        attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
        batch_size, _, output_length, input_length = attn_mask.shape
        cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
        indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
        valid_indices = indices.unsqueeze(0) < cum_duration
        valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
        padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
        attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
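        # `attn` is a hard monotonic alignment that repeats each input token for
        # its predicted number of output frames.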

        # Expand prior distribution
        prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
        prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)

        prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
        latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)

        spectrogram = latents * output_padding_mask
        if is_streaming:
            # Decode the spectrogram chunk by chunk so audio can be yielded
            # before the whole utterance has been vocoded.
            for i in range(0, spectrogram.size(-1), chunk_size):
                with torch.no_grad():
                    wav = self.decoder(spectrogram[:, :, i : i + chunk_size], speaker_embeddings)
                yield wav.squeeze().cpu().numpy()
        else:
            wav = self.decoder(spectrogram, speaker_embeddings)
            yield wav.squeeze().cpu().numpy()

@spaces.GPU
def get_model(name_model):
    """Load (and cache) the requested VITS model together with its tokenizer."""
    global models

    # The English checkpoint ships its own tokenizer; the other (Arabic) models
    # share the "wasmdashai/vtk" tokenizer.
    if name_model == 'wasmdashai/vits-en-v1':
        tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vits-en-v1", token=token)
    else:
        tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk", token=token)

    if name_model in models:
        return models[name_model], tokenizer

    models[name_model] = VitsModel.from_pretrained(name_model, token=token).cuda()

    # Apply weight normalization to the decoder and the flow convolution layers.
    models[name_model].decoder.apply_weight_norm()
    for flow in models[name_model].flow.flows:
        torch.nn.utils.weight_norm(flow.conv_pre)
        torch.nn.utils.weight_norm(flow.conv_post)

    return models[name_model], tokenizer


# On ZeroGPU, CUDA tensors created at import time still report "cpu" until a
# @spaces.GPU-decorated function is running.
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' 🤔

# Default demo text: an Arabic greeting and welcome message.
TXT = """السلام  عليكم  ورحمة الله وبركاتة  يا هلا وسهلا ومراحب بالغالي  اخباركم  طيبين ان شاء الله     ارحبوا  على العين والراس     """

@spaces.GPU
def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=0.8):
    """Synthesize `text` with the selected model and return (sampling_rate, waveform)."""
    model, tokenizer = get_model(name_model)

    inputs = tokenizer(text, return_tensors="pt")

    # The UI slider supplies a speaking rate in (0.1, 1.0]; the default matches it.
    model.speaking_rate = speaking_rate
    with torch.no_grad():
        wav = list(
            _inference_forward_stream(
                model,
                input_ids=inputs.input_ids.cuda(),
                attention_mask=inputs.attention_mask.cuda(),
                speaker_embeddings=None,
                is_streaming=False,
            )
        )[0]

    return model.config.sampling_rate, remove_noise_nr(wav)

model_choices = gr.Dropdown(
    choices=[
        "wasmdashai/vits-ar-sa-huba-v1",
        "wasmdashai/vits-ar-sa-huba-v2",
        "wasmdashai/vits-ar-sa-A",
        "wasmdashai/vits-ar-ye-sa",
        "wasmdashai/vits-ar-sa-M-v1",
        "wasmdashai/vits-en-v1",
    ],
    label="اختر النموذج",  # "Choose the model"
    value="wasmdashai/vits-ar-sa-huba-v2",
)

demo = gr.Interface(
    fn=modelspeech,
    inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)],
    outputs=["audio"],
)
demo.queue()
demo.launch()
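
# A minimal sketch of calling the synthesis function directly, outside the UI,
# assuming the Space's GPU context and the "key_" access token are available:
#
#     sr, audio = modelspeech(TXT, "wasmdashai/vits-ar-sa-huba-v2", speaking_rate=0.8)
#     # `sr` is the model's sampling rate, `audio` a denoised NumPy waveform.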