Commit 310916c · add metrics text
Parent(s): 42db371

app.py CHANGED
@@ -5,6 +5,7 @@ import random
 from zipfile import ZipFile
 import uuid
 
+import time
 import torch
 import torchaudio
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
@@ -59,6 +60,7 @@ DEVICE_ASSERT_DETECTED=0
 DEVICE_ASSERT_PROMPT=None
 DEVICE_ASSERT_LANG=None
 
+
 def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
     if agree == True:
         supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
@@ -165,9 +167,18 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
             global DEVICE_ASSERT_LANG
             #It will likely never come here as we restart space on first unrecoverable error now
             print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
+
+
+        metrics_text= ""
 
+        t_latent=time.time()
         gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+        latent_calculation_time = time.time() - t_latent
+        metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
+
         wav_chunks = []
+
+        t_inference=time.time()
 
         chunks = model.inference_stream(
             prompt,
@@ -175,14 +186,24 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
             gpt_cond_latent,
             speaker_embedding,)
         try:
+            first_chunk=True
             for i, chunk in enumerate(chunks):
+                if first_chunk:
+                    first_chunk_time = time.time() - t_inference
+                    metrics_text+=f"Streaming: First chunk actual latency: {first_chunk_time:.2f} seconds\n"
+                    first_chunk=False
+
+
                 wav_chunks.append(chunk)
                 print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
                 out_file = f'{i}.wav'
                 write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
                 audio = AudioSegment.from_file(out_file)
                 audio.export(out_file, format='wav')
-
+
+                yield (None, out_file, metrics_text, None)
+
         except RuntimeError as e :
             if "device-side assert" in str(e):
                 # cannot do anything on cuda device side error, need tor estart
@@ -212,6 +233,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
                     audio="output.wav",
                 ),
                 "sil.wav",
+                metrics_text,
                 speaker_wav,
             )
         else:
@@ -220,6 +242,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
                 None,
                 None,
                 None,
+                None,
             )
 
 
@@ -439,10 +462,11 @@ gr.Interface(
     outputs=[
         gr.Video(label="Waveform Visual"),
         gr.Audio(label="Synthesised Audio", streaming=True, autoplay=True),
+        gr.Text(label="Metrics"),
         gr.Audio(label="Reference Audio Used"),
    ],
    title=title,
    description=description,
    article=article,
-
-).queue().launch(debug=True,show_api=
+    examples=examples,
+).queue().launch(debug=True,show_api=True)
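
The additions follow one pattern: time the embedding (conditioning-latent) step, time the latency to the first streamed chunk, accumulate the measurements in a metrics_text string, and yield that string alongside each audio chunk so the new gr.Text(label="Metrics") output updates while audio is still streaming. Below is a minimal, self-contained sketch of that pattern with the model and Gradio plumbing stubbed out; fake_stream and synthesize are illustrative names, not part of app.py.

import time

def fake_stream(n_chunks=3, delay=0.05):
    # Stand-in for model.inference_stream(...): yields "audio" chunks with some latency.
    for _ in range(n_chunks):
        time.sleep(delay)
        yield [0.0] * 1024  # placeholder chunk

def synthesize():
    metrics_text = ""

    # Time the conditioning/embedding step (get_conditioning_latents in app.py).
    t_latent = time.time()
    time.sleep(0.02)  # placeholder for the latent computation
    metrics_text = f"Embedding calculation time: {time.time() - t_latent:.2f} seconds\n"

    # Time-to-first-chunk is measured from just before the stream is consumed.
    t_inference = time.time()
    first_chunk = True
    for i, chunk in enumerate(fake_stream()):
        if first_chunk:
            metrics_text += f"Streaming: First chunk actual latency: {time.time() - t_inference:.2f} seconds\n"
            first_chunk = False
        # app.py writes each chunk to '{i}.wav' and yields a 4-tuple that maps, in order,
        # onto the interface outputs: (waveform video, streamed audio, metrics text, reference audio).
        yield (None, f"{i}.wav", metrics_text, None)

if __name__ == "__main__":
    for _, out_file, metrics, _ in synthesize():
        print(out_file)
        print(metrics)

Because predict is now a generator, every yielded or returned tuple must have one element per declared output; that is why the else branch gains an extra None and why gr.Text(label="Metrics") is inserted between the streaming Audio and the reference Audio in the outputs list.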