Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -107,15 +107,32 @@ def _inference_forward_stream(
 def get_model(name_model):
     global models
     if name_model in models:
-
-
+        if name_model=='wasmdashai/vits-en-v1':
+            tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vits-en-v1",token=token)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk",token=token)
+
+
+
+
+        return models[name_model],tokenizer
+    models[name_model]=VitsModel.from_pretrained(name_model,token=token)
+
+
+
     models[name_model].decoder.apply_weight_norm()
     # torch.nn.utils.weight_norm(self.decoder.conv_pre)
     # torch.nn.utils.weight_norm(self.decoder.conv_post)
     for flow in models[name_model].flow.flows:
         torch.nn.utils.weight_norm(flow.conv_pre)
         torch.nn.utils.weight_norm(flow.conv_post)
-
+
+    if name_model=='wasmdashai/vits-en-v1':
+        tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vits-en-v1",token=token)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk",token=token)
+
+    return models[name_model],tokenizer
 
 
 zero = torch.Tensor([0]).cuda()
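This hunk changes get_model to return a (model, tokenizer) pair: each VitsModel is cached in the global models dict, and the tokenizer is picked by checkpoint name, with the English checkpoint using its own tokenizer and everything else falling back to wasmdashai/vtk. A minimal sketch of the equivalent logic with the duplicated tokenizer branch pulled into a helper; _get_tokenizer is a hypothetical name, and token, models, AutoTokenizer, VitsModel, and torch are assumed in scope from app.py:

# Sketch only: equivalent caching logic, not the literal app.py code.
def _get_tokenizer(name_model):
    # Hypothetical helper: the English checkpoint ships its own
    # tokenizer; the Arabic checkpoints share wasmdashai/vtk.
    if name_model == 'wasmdashai/vits-en-v1':
        return AutoTokenizer.from_pretrained('wasmdashai/vits-en-v1', token=token)
    return AutoTokenizer.from_pretrained('wasmdashai/vtk', token=token)

def get_model(name_model):
    global models
    if name_model not in models:
        model = VitsModel.from_pretrained(name_model, token=token)
        # Re-register weight norm on the decoder and on each flow's
        # pre/post convolutions, mirroring the hunk above.
        model.decoder.apply_weight_norm()
        for flow in model.flow.flows:
            torch.nn.utils.weight_norm(flow.conv_pre)
            torch.nn.utils.weight_norm(flow.conv_post)
        models[name_model] = model
    return models[name_model], _get_tokenizer(name_model)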
@@ -124,10 +141,10 @@ import torch
 TXT="""السلام عليكم ورحمة الله وبركاتة يا هلا وسهلا ومراحب بالغالي اخباركم طيبين ان شاء الله ارحبوا على العين والراس """
 @spaces.GPU
 def modelspeech(text=TXT,name_model="wasmdashai/vits-ar-sa-huba-v2",speaking_rate=16000):
-
+    model,tokenizer=get_model(name_model)
 
     inputs = tokenizer(text, return_tensors="pt")
-
+
     model.speaking_rate=speaking_rate
     with torch.no_grad():
         wav=list(_inference_forward_stream(model,input_ids=inputs.input_ids.cuda(),attention_mask=inputs.attention_mask.cuda(),speaker_embeddings= None,is_streaming=False))[0]
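With this hunk, modelspeech resolves the requested checkpoint through get_model on every call, so the first request for a model pays the load cost and later requests hit the cache. A hedged usage sketch, assuming modelspeech from app.py is in scope and using one of the dropdown checkpoints:

# Illustrative call, not part of app.py: the first call for a given
# name_model loads and caches it; later calls reuse the cached model.
wav = modelspeech(
    text="السلام عليكم",                    # input text
    name_model="wasmdashai/vits-ar-sa-A",   # any entry from the dropdown
    speaking_rate=16000,                    # app's default, passed through
)

Note that VitsModel's speaking_rate is normally a tempo multiplier around 1.0, so the default of 16000 here reads like a sample rate; the sketch simply forwards the app's own default.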
@@ -144,7 +161,8 @@ model_choices = gr.Dropdown(
 
         "wasmdashai/vits-ar-sa-A",
         "wasmdashai/vits-ar-ye-sa",
-        "wasmdashai/vits-ar-sa-M-v1"
+        "wasmdashai/vits-ar-sa-M-v1",
+        'wasmdashai/vits-en-v1'
 
 
     ],
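The last hunk adds the new English checkpoint to the model dropdown alongside the Arabic ones. For context, a sketch of how such a gr.Dropdown typically feeds modelspeech in a Gradio Interface; the widget labels and wiring besides the checkpoint names are illustrative, and modelspeech and TXT are assumed from app.py:

# Illustrative wiring: the dropdown value becomes the name_model
# argument of modelspeech.
import gradio as gr

model_choices = gr.Dropdown(
    choices=[
        # ... earlier entries from app.py elided ...
        "wasmdashai/vits-ar-sa-A",
        "wasmdashai/vits-ar-ye-sa",
        "wasmdashai/vits-ar-sa-M-v1",
        "wasmdashai/vits-en-v1",
    ],
    label="Model",
)

demo = gr.Interface(
    fn=modelspeech,  # defined above in app.py
    inputs=[gr.Textbox(value=TXT, label="Text"),
            model_choices,
            gr.Number(value=16000, label="speaking_rate")],
    outputs=gr.Audio(label="Output"),
)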