wasmdashai committed on
Commit bbff27f · verified · 1 Parent(s): 214d206

Update app.py

Files changed (1)
  1. app.py +102 -4
app.py CHANGED
@@ -8,6 +8,102 @@ import numpy as np
 token=os.environ.get("key_")
 tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk",token=token)
 models= {}
+
+import noisereduce as nr
+
+import torch
+from typing import Any, Callable, Optional, Tuple, Union, Iterator
+
+import torch.nn as nn  # Import the missing module
+def remove_noise_nr(audio_data, sr=16000):
+    """Removes noise using the noisereduce library."""
+    reduced_noise = nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
+    return reduced_noise
+
+def _inference_forward_stream(
+    self,
+    input_ids: Optional[torch.Tensor] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    speaker_embeddings: Optional[torch.Tensor] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    padding_mask: Optional[torch.Tensor] = None,
+    chunk_size: int = 32,  # Chunk size for streaming output
+    is_streaming: bool = True,
+) -> Iterator[torch.Tensor]:
+    """Generates speech waveforms in a streaming fashion."""
+    if attention_mask is not None:
+        padding_mask = attention_mask.unsqueeze(-1).float()
+    else:
+        padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()
+
+    text_encoder_output = self.text_encoder(
+        input_ids=input_ids,
+        padding_mask=padding_mask,
+        attention_mask=attention_mask,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+    )
+    hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
+    hidden_states = hidden_states.transpose(1, 2)
+    input_padding_mask = padding_mask.transpose(1, 2)
+
+    prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
+    prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances
+
+    if self.config.use_stochastic_duration_prediction:
+        log_duration = self.duration_predictor(
+            hidden_states,
+            input_padding_mask,
+            speaker_embeddings,
+            reverse=True,
+            noise_scale=self.noise_scale_duration,
+        )
+    else:
+        log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
+
+    length_scale = 1.0 / self.speaking_rate
+    duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
+    predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
+
+    # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
+    indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
+    output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
+    output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
+
+    # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
+    attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
+    batch_size, _, output_length, input_length = attn_mask.shape
+    cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
+    indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
+    valid_indices = indices.unsqueeze(0) < cum_duration
+    valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
+    padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
+    attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
+
+    # Expand prior distribution
+    prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
+    prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)
+
+    prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
+    latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
+
+    spectrogram = latents * output_padding_mask
+    if is_streaming:
+        # Decode and yield the waveform chunk by chunk
+        for i in range(0, spectrogram.size(-1), chunk_size):
+            with torch.no_grad():
+                wav = self.decoder(spectrogram[:, :, i : i + chunk_size], speaker_embeddings)
+            yield wav.squeeze().cpu().numpy()
+    else:
+        # Decode the full spectrogram in one pass and yield a single waveform
+        wav = self.decoder(spectrogram, speaker_embeddings)
+        yield wav.squeeze().cpu().numpy()
 @spaces.GPU
 def get_model(name_model):
     global models
@@ -32,10 +128,12 @@ def modelspeech(text,name_model):
 
     inputs = tokenizer(text, return_tensors="pt")
     model=get_model(name_model)
-    with torch.no_grad():
-        wav = model(input_ids=inputs["input_ids"].cuda()).waveform.cpu().numpy().reshape(-1)#.detach()
+    with torch.no_grad():
+        wav = list(_inference_forward_stream(model, input_ids=inputs.input_ids.cuda(), attention_mask=inputs.attention_mask.cuda(), speaker_embeddings=None, is_streaming=False))[0]
+    # with torch.no_grad():
+    #     wav = model(input_ids=inputs["input_ids"].cuda()).waveform.cpu().numpy().reshape(-1)#.detach()
 
-    return model.config.sampling_rate,wav#remove_noise_nr(wav)
+    return model.config.sampling_rate,wav,remove_noise_nr(wav)
 
 model_choices = gr.Dropdown(
     choices=[
@@ -56,6 +154,6 @@ model_choices = gr.Dropdown(
     label="اختر النموذج",
    value="wasmdashai/vits-ar-sa-huba-v2",
 )
-demo = gr.Interface(fn=modelspeech, inputs=["text",model_choices], outputs=["audio"])
+demo = gr.Interface(fn=modelspeech, inputs=["text",model_choices], outputs=["audio","audio"])
 demo.queue()
 demo.launch()
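
The added _inference_forward_stream defaults to is_streaming=True, but modelspeech calls it with is_streaming=False and takes the single yielded waveform. A minimal sketch of how the streaming path could be consumed chunk by chunk instead; this is an illustration, not part of the commit, and it assumes the tokenizer and get_model from app.py plus a CUDA device:

# Sketch (not in this commit): consume the streaming generator chunk by chunk.
# Assumes app.py's tokenizer and get_model, and that the model lives on the GPU.
import numpy as np
import torch

model = get_model("wasmdashai/vits-ar-sa-huba-v2")
inputs = tokenizer("مرحبا بكم", return_tensors="pt")

chunks = []
with torch.no_grad():
    for chunk in _inference_forward_stream(
        model,
        input_ids=inputs.input_ids.cuda(),
        attention_mask=inputs.attention_mask.cuda(),
        speaker_embeddings=None,
        chunk_size=32,
        is_streaming=True,
    ):
        chunks.append(chunk)  # each chunk is a NumPy waveform segment

wav = np.concatenate(chunks)
sr = model.config.sampling_rate  # sampling rate for playback or saving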
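
With outputs=["audio","audio"], each Gradio Audio component expects its own (sampling_rate, waveform) pair. The following sketch shows a return value shaped that way, raw waveform first and denoised second; the function name is hypothetical and this is an adaptation for illustration, not the return statement this commit ships:

# Sketch (not in this commit): one (sampling_rate, waveform) pair per Audio output.
def modelspeech_two_outputs(text, name_model):
    inputs = tokenizer(text, return_tensors="pt")
    model = get_model(name_model)
    with torch.no_grad():
        wav = list(_inference_forward_stream(
            model,
            input_ids=inputs.input_ids.cuda(),
            attention_mask=inputs.attention_mask.cuda(),
            speaker_embeddings=None,
            is_streaming=False,
        ))[0]
    sr = model.config.sampling_rate
    # Pass the model's actual rate to noisereduce instead of the 16000 default.
    return (sr, wav), (sr, remove_noise_nr(wav, sr=sr))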