jerrypan7 commited on
Commit
c0a79f6
·
verified ·
1 Parent(s): e10aa56

Update app.py

Browse files

Update the GUI and support both segmentation modes: by punctuation and by ASR timestamps (controlled by the new bSegByPunct flag).

Files changed (1) hide show
  1. app.py +39 -35
app.py CHANGED
@@ -24,7 +24,8 @@ TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
24
  TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
25
 
26
 
27
-
 
28
 
29
  LANGUAGE_MAP = {
30
  "en": "English",
@@ -47,8 +48,8 @@ AVAILABLE_SPEAKERS = {
47
  audio_update_event = asyncio.Event()
48
  acc_cosy_audio = None
49
  # cosy voice tts related;
50
- #TTS_SOCKET_SERVER = "http://localhost:9244"
51
- TTS_SOCKET_SERVER = "http://astarwiz.com:9444"
52
 
53
  sio = socketio.AsyncClient()
54
 
@@ -346,8 +347,10 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
346
  data.add_field('file', open(audio, 'rb'))
347
  data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
348
  data.add_field('model_name', 'whisper-large-v2-local-cs')
349
- #data.add_field('with_timestamp', 'false')
350
- data.add_field('with_timestamp', 'true')
 
 
351
 
352
  async with aiohttp.ClientSession() as session:
353
  async with session.post(ASR_API, data=data) as asr_response:
@@ -395,15 +398,20 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
395
 
396
 
397
 
398
- #split_result = split_text_with_punctuation(transcription)
399
- split_result = extract_segments(transcription);
 
 
 
400
  translate_segments = []
401
  accumulated_audio = None
402
  sample_rate = 22050
403
  global is_playing
404
  for i, segment in enumerate(split_result):
405
- #translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
406
- translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
 
 
407
  translated_seg_txt = await inference_via_llm_api(translation_prompt)
408
  translate_segments.append(translated_seg_txt)
409
  print(f"Translation: {translated_seg_txt}")
@@ -431,8 +439,8 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
431
  content = await response.read()
432
  audio_chunk, sr = sf.read(BytesIO(content))
433
  #print ('audio_chunk:', type(audio_chunk),audio_chunk)
434
- print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
435
- # _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
436
 
437
 
438
  if accumulated_audio is None:
@@ -490,6 +498,10 @@ async def update_audio():
490
  return content
491
  return gr.update()
492
 
 
 
 
 
493
  with gr.Blocks() as demo:
494
  gr.Markdown("# Speech Translation")
495
 
@@ -510,15 +522,14 @@ with gr.Blocks() as demo:
510
  with gr.Row():
511
  user_transcription_output = gr.Textbox(label="Transcription")
512
  user_translation_output = gr.Textbox(label="Translation")
513
- user_audio_output = gr.Audio(label="Translated Speech")
514
  user_audio_final = gr.Audio(label="Final total Speech")
515
- progress_bar = gr.Textbox(label="progress", interactive=False)
516
  status_message = gr.Textbox(label="Status", interactive=False)
517
 
518
  user_video_output = gr.HTML(label="YouTube Video")
519
 
520
- replace_audio_button = gr.Button("Replace Audio", interactive=False)
521
- final_video_output = gr.Video(label="Video with Replaced Audio")
522
 
523
  temp_video_path = gr.State()
524
  translation_progress = gr.State(0.0)
@@ -526,6 +537,7 @@ with gr.Blocks() as demo:
526
  async def update_button_state(audio, youtube_url, progress):
527
  print(audio, youtube_url, progress)
528
  # Button is interactive if there's input and progress is 0 or 1 (not in progress)
 
529
  return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
530
 
531
  user_audio_input.change(
@@ -539,31 +551,23 @@ with gr.Blocks() as demo:
539
  outputs=user_button
540
  )
541
 
542
- async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker):
543
-
544
-
545
- #audio_data, sample_rate = sf.read(audio)
546
- #print ("user_audio_input:", audio, audio_data, sample_rate)
547
-
548
 
549
- yield (0.01,
550
- gr.update(interactive=False),
551
- gr.update(), gr.update(), gr.update(), gr.update(),
552
- "Translation in progress...",None)
553
 
554
-
555
  temp_video_path = None
556
  transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
557
-
558
- yield (1,
559
- gr.update(interactive=True),
560
- transcription, translated_text, audio_chunksr, temp_video_path,
561
- "Translation complete", accumulated_aud_buf)
562
 
563
  user_button.click(
 
 
 
 
564
  fn=run_speech_translation_wrapper,
565
- inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
566
- outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,]
567
  )
568
 
569
  async def update_replace_audio_button(audio_url, video_path):
@@ -578,8 +582,8 @@ with gr.Blocks() as demo:
578
 
579
  replace_audio_button.click(
580
  fn=replace_audio_and_generate_video,
581
- inputs=[temp_video_path, user_audio_output],
582
- outputs=[gr.Textbox(label="Status"), final_video_output]
583
  )
584
 
585
  async def update_video_embed(youtube_url):
 
24
  TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
25
 
26
 
27
+ #bSegByPunct = True
28
+ bSegByPunct = False
29
 
30
  LANGUAGE_MAP = {
31
  "en": "English",
 
48
  audio_update_event = asyncio.Event()
49
  acc_cosy_audio = None
50
  # cosy voice tts related;
51
+ TTS_SOCKET_SERVER = "http://localhost:9444"
52
+ #TTS_SOCKET_SERVER = "http://astarwiz.com:9444"
53
 
54
  sio = socketio.AsyncClient()
55
 
 
347
  data.add_field('file', open(audio, 'rb'))
348
  data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
349
  data.add_field('model_name', 'whisper-large-v2-local-cs')
350
+ if bSegByPunct:
351
+ data.add_field('with_timestamp', 'false')
352
+ else:
353
+ data.add_field('with_timestamp', 'true')
354
 
355
  async with aiohttp.ClientSession() as session:
356
  async with session.post(ASR_API, data=data) as asr_response:
 
398
 
399
 
400
 
401
+ if bSegByPunct:
402
+ split_result = split_text_with_punctuation(transcription)
403
+ else:
404
+ split_result = extract_segments(transcription);
405
+
406
  translate_segments = []
407
  accumulated_audio = None
408
  sample_rate = 22050
409
  global is_playing
410
  for i, segment in enumerate(split_result):
411
+ if bSegByPunct:
412
+ translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
413
+ else:
414
+ translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
415
  translated_seg_txt = await inference_via_llm_api(translation_prompt)
416
  translate_segments.append(translated_seg_txt)
417
  print(f"Translation: {translated_seg_txt}")
 
439
  content = await response.read()
440
  audio_chunk, sr = sf.read(BytesIO(content))
441
  #print ('audio_chunk:', type(audio_chunk),audio_chunk)
442
+ #print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
443
+ # _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
444
 
445
 
446
  if accumulated_audio is None:
 
498
  return content
499
  return gr.update()
500
 
501
+ def disable_button():
502
+ # Disable the button during processing
503
+ return gr.update(interactive=False)
504
+
505
  with gr.Blocks() as demo:
506
  gr.Markdown("# Speech Translation")
507
 
 
522
  with gr.Row():
523
  user_transcription_output = gr.Textbox(label="Transcription")
524
  user_translation_output = gr.Textbox(label="Translation")
525
+ user_audio_output = gr.Audio(label="Translated Speech", visible =False)
526
  user_audio_final = gr.Audio(label="Final total Speech")
 
527
  status_message = gr.Textbox(label="Status", interactive=False)
528
 
529
  user_video_output = gr.HTML(label="YouTube Video")
530
 
531
+ replace_audio_button = gr.Button("Replace Audio", interactive=False, visible =False)
532
+ final_video_output = gr.Video(label="Video with Replaced Audio",visible=False)
533
 
534
  temp_video_path = gr.State()
535
  translation_progress = gr.State(0.0)
 
537
  async def update_button_state(audio, youtube_url, progress):
538
  print(audio, youtube_url, progress)
539
  # Button is interactive if there's input and progress is 0 or 1 (not in progress)
540
+ print ("progress:", audio, youtube_url,bool(audio) , bool(youtube_url), progress == 0 or progress == 1)
541
  return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
542
 
543
  user_audio_input.change(
 
551
  outputs=user_button
552
  )
553
 
 
 
 
 
 
 
554
 
555
+ async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker,progress):
 
 
 
556
 
557
+ progress = 0.1
558
  temp_video_path = None
559
  transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
560
+ progress = 1
561
+ return transcription, translated_text, audio_chunksr, temp_video_path, "Translation complete", accumulated_aud_buf, gr.update(interactive=True)
 
 
 
562
 
563
  user_button.click(
564
+ fn=disable_button,
565
+ inputs=[],
566
+ outputs=[user_button] # Disable the button during processing
567
+ ).then(
568
  fn=run_speech_translation_wrapper,
569
+ inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker, translation_progress],
570
+ outputs=[user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,user_button]
571
  )
572
 
573
  async def update_replace_audio_button(audio_url, video_path):
 
582
 
583
  replace_audio_button.click(
584
  fn=replace_audio_and_generate_video,
585
+ inputs=[temp_video_path, user_audio_final],
586
+ outputs=[status_message, final_video_output]
587
  )
588
 
589
  async def update_video_embed(youtube_url):