Update app.py
Browse files
Update GUI and support segmentation by both punctuation and timestamp
app.py
CHANGED
@@ -24,7 +24,8 @@ TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
|
|
24 |
TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
|
25 |
|
26 |
|
27 |
-
|
|
|
28 |
|
29 |
LANGUAGE_MAP = {
|
30 |
"en": "English",
|
@@ -47,8 +48,8 @@ AVAILABLE_SPEAKERS = {
|
|
47 |
audio_update_event = asyncio.Event()
|
48 |
acc_cosy_audio = None
|
49 |
# cosy voice tts related;
|
50 |
-
|
51 |
-
TTS_SOCKET_SERVER = "http://astarwiz.com:9444"
|
52 |
|
53 |
sio = socketio.AsyncClient()
|
54 |
|
@@ -346,8 +347,10 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
|
|
346 |
data.add_field('file', open(audio, 'rb'))
|
347 |
data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
|
348 |
data.add_field('model_name', 'whisper-large-v2-local-cs')
|
349 |
-
|
350 |
-
|
|
|
|
|
351 |
|
352 |
async with aiohttp.ClientSession() as session:
|
353 |
async with session.post(ASR_API, data=data) as asr_response:
|
@@ -395,15 +398,20 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
|
|
395 |
|
396 |
|
397 |
|
398 |
-
|
399 |
-
|
|
|
|
|
|
|
400 |
translate_segments = []
|
401 |
accumulated_audio = None
|
402 |
sample_rate = 22050
|
403 |
global is_playing
|
404 |
for i, segment in enumerate(split_result):
|
405 |
-
|
406 |
-
|
|
|
|
|
407 |
translated_seg_txt = await inference_via_llm_api(translation_prompt)
|
408 |
translate_segments.append(translated_seg_txt)
|
409 |
print(f"Translation: {translated_seg_txt}")
|
@@ -431,8 +439,8 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
|
|
431 |
content = await response.read()
|
432 |
audio_chunk, sr = sf.read(BytesIO(content))
|
433 |
#print ('audio_chunk:', type(audio_chunk),audio_chunk)
|
434 |
-
print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
|
435 |
-
|
436 |
|
437 |
|
438 |
if accumulated_audio is None:
|
@@ -490,6 +498,10 @@ async def update_audio():
|
|
490 |
return content
|
491 |
return gr.update()
|
492 |
|
|
|
|
|
|
|
|
|
493 |
with gr.Blocks() as demo:
|
494 |
gr.Markdown("# Speech Translation")
|
495 |
|
@@ -510,15 +522,14 @@ with gr.Blocks() as demo:
|
|
510 |
with gr.Row():
|
511 |
user_transcription_output = gr.Textbox(label="Transcription")
|
512 |
user_translation_output = gr.Textbox(label="Translation")
|
513 |
-
user_audio_output = gr.Audio(label="Translated Speech")
|
514 |
user_audio_final = gr.Audio(label="Final total Speech")
|
515 |
-
progress_bar = gr.Textbox(label="progress", interactive=False)
|
516 |
status_message = gr.Textbox(label="Status", interactive=False)
|
517 |
|
518 |
user_video_output = gr.HTML(label="YouTube Video")
|
519 |
|
520 |
-
replace_audio_button = gr.Button("Replace Audio", interactive=False)
|
521 |
-
final_video_output = gr.Video(label="Video with Replaced Audio")
|
522 |
|
523 |
temp_video_path = gr.State()
|
524 |
translation_progress = gr.State(0.0)
|
@@ -526,6 +537,7 @@ with gr.Blocks() as demo:
|
|
526 |
async def update_button_state(audio, youtube_url, progress):
|
527 |
print(audio, youtube_url, progress)
|
528 |
# Button is interactive if there's input and progress is 0 or 1 (not in progress)
|
|
|
529 |
return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
|
530 |
|
531 |
user_audio_input.change(
|
@@ -539,31 +551,23 @@ with gr.Blocks() as demo:
|
|
539 |
outputs=user_button
|
540 |
)
|
541 |
|
542 |
-
async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker):
|
543 |
-
|
544 |
-
|
545 |
-
#audio_data, sample_rate = sf.read(audio)
|
546 |
-
#print ("user_audio_input:", audio, audio_data, sample_rate)
|
547 |
-
|
548 |
|
549 |
-
|
550 |
-
gr.update(interactive=False),
|
551 |
-
gr.update(), gr.update(), gr.update(), gr.update(),
|
552 |
-
"Translation in progress...",None)
|
553 |
|
554 |
-
|
555 |
temp_video_path = None
|
556 |
transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
|
557 |
-
|
558 |
-
|
559 |
-
gr.update(interactive=True),
|
560 |
-
transcription, translated_text, audio_chunksr, temp_video_path,
|
561 |
-
"Translation complete", accumulated_aud_buf)
|
562 |
|
563 |
user_button.click(
|
|
|
|
|
|
|
|
|
564 |
fn=run_speech_translation_wrapper,
|
565 |
-
inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
|
566 |
-
outputs=[
|
567 |
)
|
568 |
|
569 |
async def update_replace_audio_button(audio_url, video_path):
|
@@ -578,8 +582,8 @@ with gr.Blocks() as demo:
|
|
578 |
|
579 |
replace_audio_button.click(
|
580 |
fn=replace_audio_and_generate_video,
|
581 |
-
inputs=[temp_video_path,
|
582 |
-
outputs=[
|
583 |
)
|
584 |
|
585 |
async def update_video_embed(youtube_url):
|
|
|
24 |
TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
|
25 |
|
26 |
|
27 |
+
#bSegByPunct = True
|
28 |
+
bSegByPunct = False
|
29 |
|
30 |
LANGUAGE_MAP = {
|
31 |
"en": "English",
|
|
|
48 |
audio_update_event = asyncio.Event()
|
49 |
acc_cosy_audio = None
|
50 |
# cosy voice tts related;
|
51 |
+
TTS_SOCKET_SERVER = "http://localhost:9444"
|
52 |
+
#TTS_SOCKET_SERVER = "http://astarwiz.com:9444"
|
53 |
|
54 |
sio = socketio.AsyncClient()
|
55 |
|
|
|
347 |
data.add_field('file', open(audio, 'rb'))
|
348 |
data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
|
349 |
data.add_field('model_name', 'whisper-large-v2-local-cs')
|
350 |
+
if bSegByPunct:
|
351 |
+
data.add_field('with_timestamp', 'false')
|
352 |
+
else:
|
353 |
+
data.add_field('with_timestamp', 'true')
|
354 |
|
355 |
async with aiohttp.ClientSession() as session:
|
356 |
async with session.post(ASR_API, data=data) as asr_response:
|
|
|
398 |
|
399 |
|
400 |
|
401 |
+
if bSegByPunct:
|
402 |
+
split_result = split_text_with_punctuation(transcription)
|
403 |
+
else:
|
404 |
+
split_result = extract_segments(transcription);
|
405 |
+
|
406 |
translate_segments = []
|
407 |
accumulated_audio = None
|
408 |
sample_rate = 22050
|
409 |
global is_playing
|
410 |
for i, segment in enumerate(split_result):
|
411 |
+
if bSegByPunct:
|
412 |
+
translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
|
413 |
+
else:
|
414 |
+
translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
|
415 |
translated_seg_txt = await inference_via_llm_api(translation_prompt)
|
416 |
translate_segments.append(translated_seg_txt)
|
417 |
print(f"Translation: {translated_seg_txt}")
|
|
|
439 |
content = await response.read()
|
440 |
audio_chunk, sr = sf.read(BytesIO(content))
|
441 |
#print ('audio_chunk:', type(audio_chunk),audio_chunk)
|
442 |
+
#print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
|
443 |
+
# _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
|
444 |
|
445 |
|
446 |
if accumulated_audio is None:
|
|
|
498 |
return content
|
499 |
return gr.update()
|
500 |
|
501 |
+
def disable_button():
|
502 |
+
# Disable the button during processing
|
503 |
+
return gr.update(interactive=False)
|
504 |
+
|
505 |
with gr.Blocks() as demo:
|
506 |
gr.Markdown("# Speech Translation")
|
507 |
|
|
|
522 |
with gr.Row():
|
523 |
user_transcription_output = gr.Textbox(label="Transcription")
|
524 |
user_translation_output = gr.Textbox(label="Translation")
|
525 |
+
user_audio_output = gr.Audio(label="Translated Speech", visible =False)
|
526 |
user_audio_final = gr.Audio(label="Final total Speech")
|
|
|
527 |
status_message = gr.Textbox(label="Status", interactive=False)
|
528 |
|
529 |
user_video_output = gr.HTML(label="YouTube Video")
|
530 |
|
531 |
+
replace_audio_button = gr.Button("Replace Audio", interactive=False, visible =False)
|
532 |
+
final_video_output = gr.Video(label="Video with Replaced Audio",visible=False)
|
533 |
|
534 |
temp_video_path = gr.State()
|
535 |
translation_progress = gr.State(0.0)
|
|
|
537 |
async def update_button_state(audio, youtube_url, progress):
|
538 |
print(audio, youtube_url, progress)
|
539 |
# Button is interactive if there's input and progress is 0 or 1 (not in progress)
|
540 |
+
print ("progress:", audio, youtube_url,bool(audio) , bool(youtube_url), progress == 0 or progress == 1)
|
541 |
return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
|
542 |
|
543 |
user_audio_input.change(
|
|
|
551 |
outputs=user_button
|
552 |
)
|
553 |
|
|
|
|
|
|
|
|
|
|
|
|
|
554 |
|
555 |
+
async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker,progress):
|
|
|
|
|
|
|
556 |
|
557 |
+
progress = 0.1
|
558 |
temp_video_path = None
|
559 |
transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
|
560 |
+
progress = 1
|
561 |
+
return transcription, translated_text, audio_chunksr, temp_video_path, "Translation complete", accumulated_aud_buf, gr.update(interactive=True)
|
|
|
|
|
|
|
562 |
|
563 |
user_button.click(
|
564 |
+
fn=disable_button,
|
565 |
+
inputs=[],
|
566 |
+
outputs=[user_button] # Disable the button during processing
|
567 |
+
).then(
|
568 |
fn=run_speech_translation_wrapper,
|
569 |
+
inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker, translation_progress],
|
570 |
+
outputs=[user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,user_button]
|
571 |
)
|
572 |
|
573 |
async def update_replace_audio_button(audio_url, video_path):
|
|
|
582 |
|
583 |
replace_audio_button.click(
|
584 |
fn=replace_audio_and_generate_video,
|
585 |
+
inputs=[temp_video_path, user_audio_final],
|
586 |
+
outputs=[status_message, final_video_output]
|
587 |
)
|
588 |
|
589 |
async def update_video_embed(youtube_url):
|