speech_translation_integrate

Sleeping

App Files Files Community

jerrypan7 committed on Oct 25, 2024

Commit

c0a79f6

verified ·

1 Parent(s): e10aa56

Update app.py

Browse files

Update the GUI and support segmenting the transcription either by punctuation or by timestamp

Files changed (1) hide show

app.py +39 -35

app.py CHANGED Viewed

@@ -24,7 +24,8 @@ TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
 TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
 LANGUAGE_MAP = {
     "en": "English",
@@ -47,8 +48,8 @@ AVAILABLE_SPEAKERS = {
 audio_update_event = asyncio.Event()
 acc_cosy_audio = None
 # cosy voice tts related;
-#TTS_SOCKET_SERVER = "http://localhost:9244"
-TTS_SOCKET_SERVER = "http://astarwiz.com:9444"
 sio = socketio.AsyncClient()
@@ -346,8 +347,10 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     data.add_field('file', open(audio, 'rb'))
     data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
     data.add_field('model_name', 'whisper-large-v2-local-cs')
-    #data.add_field('with_timestamp', 'false')
-    data.add_field('with_timestamp', 'true')
     async with aiohttp.ClientSession() as session:
         async with session.post(ASR_API, data=data) as asr_response:
@@ -395,15 +398,20 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
-    #split_result = split_text_with_punctuation(transcription)
-    split_result = extract_segments(transcription);
     translate_segments = []
     accumulated_audio = None
     sample_rate = 22050
     global is_playing
     for i, segment in enumerate(split_result):
-        #translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
-        translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
         translated_seg_txt = await inference_via_llm_api(translation_prompt)
         translate_segments.append(translated_seg_txt)
         print(f"Translation: {translated_seg_txt}")
@@ -431,8 +439,8 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
                         content = await response.read()
                         audio_chunk, sr = sf.read(BytesIO(content))
                         #print ('audio_chunk:', type(audio_chunk),audio_chunk)
-                        print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
-                       # _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
                         if accumulated_audio is None:
@@ -490,6 +498,10 @@ async def update_audio():
             return content
     return gr.update()
 with gr.Blocks() as demo:
     gr.Markdown("# Speech Translation")
@@ -510,15 +522,14 @@ with gr.Blocks() as demo:
     with gr.Row():
         user_transcription_output = gr.Textbox(label="Transcription")
         user_translation_output = gr.Textbox(label="Translation")
-        user_audio_output = gr.Audio(label="Translated Speech")
         user_audio_final = gr.Audio(label="Final total Speech")
-    progress_bar = gr.Textbox(label="progress", interactive=False)
     status_message = gr.Textbox(label="Status", interactive=False)
     user_video_output = gr.HTML(label="YouTube Video")
-    replace_audio_button = gr.Button("Replace Audio", interactive=False)
-    final_video_output = gr.Video(label="Video with Replaced Audio")
     temp_video_path = gr.State()
     translation_progress = gr.State(0.0)
@@ -526,6 +537,7 @@ with gr.Blocks() as demo:
     async def update_button_state(audio, youtube_url, progress):
         # Build a gr.Button whose `interactive` flag is derived from the
         # current inputs: it is enabled only when `audio` or `youtube_url` is
         # truthy AND `progress` equals 0 or 1.
         # Debug trace of the raw values that drive the decision.
         print(audio, youtube_url, progress)
         # Button is interactive if there's input and progress is 0 or 1 (not in progress)
         return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
     user_audio_input.change(
@@ -539,31 +551,23 @@ with gr.Blocks() as demo:
         outputs=user_button
     )
-    async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker):
-        #audio_data, sample_rate = sf.read(audio)
-        #print ("user_audio_input:", audio, audio_data, sample_rate)
-        yield (0.01,
-               gr.update(interactive=False),
-               gr.update(), gr.update(), gr.update(), gr.update(),
-               "Translation in progress...",None)
         temp_video_path = None
         transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
-        yield (1,
-               gr.update(interactive=True),
-               transcription, translated_text, audio_chunksr, temp_video_path,
-               "Translation complete", accumulated_aud_buf)
     user_button.click(
         fn=run_speech_translation_wrapper,
-        inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
-        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,]
     )
     async def update_replace_audio_button(audio_url, video_path):
@@ -578,8 +582,8 @@ with gr.Blocks() as demo:
     replace_audio_button.click(
         fn=replace_audio_and_generate_video,
-        inputs=[temp_video_path, user_audio_output],
-        outputs=[gr.Textbox(label="Status"), final_video_output]
     )
     async def update_video_embed(youtube_url):

 TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
+#bSegByPunct = True
+bSegByPunct = False
 LANGUAGE_MAP = {
     "en": "English",
 audio_update_event = asyncio.Event()
 acc_cosy_audio = None
 # cosy voice tts related;
+TTS_SOCKET_SERVER = "http://localhost:9444"
+#TTS_SOCKET_SERVER = "http://astarwiz.com:9444"
 sio = socketio.AsyncClient()
     data.add_field('file', open(audio, 'rb'))
     data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
     data.add_field('model_name', 'whisper-large-v2-local-cs')
+    if bSegByPunct:
+        data.add_field('with_timestamp', 'false')
+    else:
+        data.add_field('with_timestamp', 'true')
     async with aiohttp.ClientSession() as session:
         async with session.post(ASR_API, data=data) as asr_response:
+    if bSegByPunct:
+        split_result = split_text_with_punctuation(transcription)
+    else:
+        split_result = extract_segments(transcription);
     translate_segments = []
     accumulated_audio = None
     sample_rate = 22050
     global is_playing
     for i, segment in enumerate(split_result):
+        if bSegByPunct:
+            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
+        else:
+            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
         translated_seg_txt = await inference_via_llm_api(translation_prompt)
         translate_segments.append(translated_seg_txt)
         print(f"Translation: {translated_seg_txt}")
                         content = await response.read()
                         audio_chunk, sr = sf.read(BytesIO(content))
                         #print ('audio_chunk:', type(audio_chunk),audio_chunk)
+                        #print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
+                        # _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
                         if accumulated_audio is None:
             return content
     return gr.update()
+def disable_button():
+    # Return a Gradio update with interactive=False. It is wired as the first step of user_button.click (outputs=[user_button]) so the button is disabled during processing.
+    return gr.update(interactive=False)
 with gr.Blocks() as demo:
     gr.Markdown("# Speech Translation")
     with gr.Row():
         user_transcription_output = gr.Textbox(label="Transcription")
         user_translation_output = gr.Textbox(label="Translation")
+        user_audio_output = gr.Audio(label="Translated Speech", visible =False)
         user_audio_final = gr.Audio(label="Final total Speech")
     status_message = gr.Textbox(label="Status", interactive=False)
     user_video_output = gr.HTML(label="YouTube Video")
+    replace_audio_button = gr.Button("Replace Audio", interactive=False, visible =False)
+    final_video_output = gr.Video(label="Video with Replaced Audio",visible=False)
     temp_video_path = gr.State()
     translation_progress = gr.State(0.0)
     async def update_button_state(audio, youtube_url, progress):
         # Build a gr.Button whose `interactive` flag is derived from the
         # current inputs: it is enabled only when `audio` or `youtube_url` is
         # truthy AND `progress` equals 0 or 1.
         # Debug trace of the raw values that drive the decision.
         print(audio, youtube_url, progress)
         # Button is interactive if there's input and progress is 0 or 1 (not in progress)
         # Second debug trace: prints the same inputs plus each boolean term of
         # the expression returned below.
+        print ("progress:", audio, youtube_url,bool(audio) , bool(youtube_url), progress == 0 or progress == 1)
         return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
     user_audio_input.change(
         outputs=user_button
     )
+    async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker,progress):
         # Await transcribe_and_speak() with the audio, languages, YouTube URL
         # and target speaker, then return its results together with a status
         # string and a Gradio update that sets interactive=True.
         # NOTE(review): `progress` is only rebound locally (0.1 here, 1 below)
         # and is never returned or read in this function, so these assignments
         # have no visible effect — confirm whether translation_progress was
         # meant to be one of the click outputs.
+        progress = 0.1
         temp_video_path = None
         transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
+        progress = 1
         # The tuple order must match the `outputs` list of the click chain:
         # transcription, translation, audio output, temp video path, status
         # message, final audio, and the button update.
+        return  transcription, translated_text, audio_chunksr, temp_video_path, "Translation complete", accumulated_aud_buf, gr.update(interactive=True)
     user_button.click(
+        fn=disable_button,
+        inputs=[],
+        outputs=[user_button]  # Disable the button during processing
+    ).then(
         fn=run_speech_translation_wrapper,
+        inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker, translation_progress],
+        outputs=[user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,user_button]
     )
     async def update_replace_audio_button(audio_url, video_path):
     replace_audio_button.click(
         fn=replace_audio_and_generate_video,
+        inputs=[temp_video_path, user_audio_final],
+        outputs=[status_message, final_video_output]
     )
     async def update_video_embed(youtube_url):