Update app.py
app.py
CHANGED
@@ -43,28 +43,9 @@ AVAILABLE_SPEAKERS = {
     "zh": ["childChinese2"]
 }
 
-# Global state for playback of the generated TTS audio
-audio_queue = []
-is_playing = False
-audio_update_event = asyncio.Event()
 
-def play_audio():
-    global is_playing
-    is_playing = True
-
-    # Drain the queue while TTS is still generating
-    while is_playing:
-        if audio_queue:
-            audio_chunk = audio_queue.pop(0)
-            sd.play(audio_chunk, samplerate=22050)
-            sd.wait()
-        else:
-            time.sleep(0.1)
-    print("TTS generation finished; playing the rest of the queue")
-    while audio_queue:
-        audio_chunk = audio_queue.pop(0)
-        sd.play(audio_chunk, samplerate=22050)
-        sd.wait()
+audio_update_event = asyncio.Event()
+acc_cosy_audio = None
 # cosy voice tts related
 #TTS_SOCKET_SERVER = "http://localhost:9244"
 TTS_SOCKET_SERVER = "http://astarwiz.com:9244"
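The deleted block above is the old server-side playback path: a background thread draining audio_queue through sounddevice. The replacement drops server-side playback entirely, accumulates chunks in acc_cosy_audio, and signals completion with audio_update_event. A minimal, self-contained sketch of that event handshake (the producer/consumer names and the 0.5 s delay are illustrative, not from the app):

import asyncio

audio_update_event = asyncio.Event()

async def producer():
    # Stand-in for the socket.io 'tts_complete' callback firing.
    await asyncio.sleep(0.5)
    audio_update_event.set()

async def consumer():
    # Stand-in for transcribe_and_speak awaiting the end of TTS.
    await audio_update_event.wait()
    print("cosy tts complete")

async def main():
    await asyncio.gather(producer(), consumer())

asyncio.run(main())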
@@ -81,7 +62,7 @@ def on_disconnect():
 
 @sio.on('audio_chunk')
 async def on_audio_chunk(data):
-    global translation_update, audio_update
+    global translation_update, audio_update, acc_cosy_audio
 
     translated_seg_txt = data['trans_text']
     with translation_lock:
@@ -91,26 +72,20 @@ async def on_audio_chunk(data):
     audio_base64 = data['audio']
     audio_bytes = base64.b64decode(audio_base64)
     audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
-    audio_queue.append(audio_np)
 
-    if
-
+    if acc_cosy_audio is None:
+        acc_cosy_audio = audio_np
     else:
-
-
-
+        acc_cosy_audio = np.concatenate((acc_cosy_audio, audio_np))
+
     with audio_lock:
-        audio_update["content"] = (
+        audio_update["content"] = (22050, audio_np)
         audio_update["new"] = True
 
-
     #audio_float = audio_np.astype(np.float32) / 32767.0
     #audio_queue.append(audio_float)
     #accumulated_audio.extend(audio_float)
 
-    if not is_playing:
-        playback_thread = threading.Thread(target=play_audio)
-        playback_thread.start()
 
 @sio.on('tts_complete')
 async def on_tts_complete():
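The reworked on_audio_chunk decodes each base64 payload to int16 PCM, appends it to the running acc_cosy_audio buffer, and publishes the chunk as a (sample_rate, np.ndarray) tuple, the form gr.Audio plays directly. A standalone sketch of that decode-and-accumulate pattern (the 22050 Hz rate mirrors the diff; the silent test chunks are made up):

import base64

import numpy as np

SAMPLE_RATE = 22050  # rate this app assumes for CosyVoice output
acc_cosy_audio = None

def accumulate_chunk(data):
    # Decode one base64-encoded int16 PCM chunk and append it to the buffer.
    global acc_cosy_audio
    audio_np = np.frombuffer(base64.b64decode(data["audio"]), dtype=np.int16)
    if acc_cosy_audio is None:
        acc_cosy_audio = audio_np
    else:
        acc_cosy_audio = np.concatenate((acc_cosy_audio, audio_np))
    return (SAMPLE_RATE, audio_np)  # the tuple form gr.Audio accepts

# Feed two fake 10 ms chunks of silence through the handler.
for _ in range(2):
    chunk = base64.b64encode(np.zeros(220, dtype=np.int16).tobytes()).decode()
    sr, latest = accumulate_chunk({"audio": chunk})
print("accumulated samples:", len(acc_cosy_audio))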
@@ -118,10 +93,7 @@ async def on_tts_complete():
     print("Disconnected from server after TTS completion")
 
     audio_update_event.set()
-
-    while audio_queue:
-        await asyncio.sleep(0.1)
-    is_playing = False
+
 
 
 # Global variables for storing update information
@@ -349,10 +321,11 @@ async def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64)
     return "The system got some error during vLLM generation. Please try it again."
 
 async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None, progress_tracker=None):
-    global transcription_update, translation_update, audio_update
+    global transcription_update, translation_update, audio_update, acc_cosy_audio
     transcription_update = {"content": "", "new": False}
     translation_update = {"content": "", "new": False}
     audio_update = {"content": None, "new": False}
+    acc_cosy_audio = None
     video_path = None
 
     #progress = gr.Progress();
@@ -414,7 +387,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
             await audio_update_event.wait()
             print('cosy tts complete,', audio_update)
 
-            return transcription, translation_update["content"], audio_update["content"], video_path
+            return transcription, translation_update["content"], audio_update["content"], video_path, (22050, acc_cosy_audio)
 
         except Exception as e:
             print(f"Failed to process request: {str(e)}")
@@ -426,7 +399,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     split_result = extract_segments(transcription)
     translate_segments = []
     accumulated_audio = None
-    sample_rate =
+    sample_rate = 22050
     global is_playing
     for i, segment in enumerate(split_result):
         #translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
@@ -460,10 +433,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
             #print('audio_chunk:', type(audio_chunk), audio_chunk)
             print('audio_chunk: src:', segment['end'] - segment['start'], ' tts:', len(audio_chunk)/sr)
             # _, audio_chunk = adjust_tempo_pysox_array((sr, audio_chunk), segment['end'] - segment['start'])
-
-            if not is_playing:
-                playback_thread = threading.Thread(target=play_audio)
-                playback_thread.start()
+
 
             if accumulated_audio is None:
                 accumulated_audio = audio_chunk
@@ -472,7 +442,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
                 accumulated_audio = np.concatenate((accumulated_audio, audio_chunk))
 
             with audio_lock:
-                audio_update["content"] = (sample_rate,
+                audio_update["content"] = (sample_rate, audio_chunk)
                 audio_update["new"] = True
         else:
             print(f"TTS failed for segment: {translated_seg_txt}")
@@ -483,9 +453,9 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     print("Signal that playback can stop now; all TTS generated")
     is_playing = False
     if accumulated_audio is not None:
-        return transcription, translated_text, (sample_rate,
+        return transcription, translated_text, audio_update["content"], video_path, (sample_rate, accumulated_audio)
     else:
-        return transcription, translated_text, "TTS failed", video_path
+        return transcription, translated_text, "TTS failed", video_path, accumulated_audio
 
 """
 async def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
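Both return paths now carry a fifth element so the wrapper can fill the new user_audio_final component. Gradio's gr.Audio renders a (sample_rate, numpy_array) tuple directly, which is what (sample_rate, accumulated_audio) relies on. A hedged sketch of that contract in isolation (the tone generator and component names here are illustrative, not from the app):

import numpy as np
import gradio as gr

def fake_pipeline():
    # Stand-in for transcribe_and_speak's accumulated-audio return:
    # one second of a 440 Hz tone as int16 at 22050 Hz.
    sr = 22050
    t = np.linspace(0, 1, sr, endpoint=False)
    tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    return (sr, tone)

with gr.Blocks() as demo:
    final_audio = gr.Audio(label="Final total Speech")
    gr.Button("Generate").click(fn=fake_pipeline, inputs=None, outputs=final_audio)

# demo.launch()  # uncomment to serve locally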
@@ -541,6 +511,7 @@ with gr.Blocks() as demo:
             user_transcription_output = gr.Textbox(label="Transcription")
             user_translation_output = gr.Textbox(label="Translation")
             user_audio_output = gr.Audio(label="Translated Speech")
+            user_audio_final = gr.Audio(label="Final total Speech")
             progress_bar = gr.Textbox(label="progress", interactive=False)
             status_message = gr.Textbox(label="Status", interactive=False)
 
@@ -578,21 +549,21 @@ with gr.Blocks() as demo:
         yield (0.01,
                gr.update(interactive=False),
                gr.update(), gr.update(), gr.update(), gr.update(),
-               "Translation in progress...")
+               "Translation in progress...", gr.update())
 
 
         temp_video_path = None
-        transcription, translated_text, audio_chunksr, temp_video_path = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
+        transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
 
         yield (1,
                gr.update(interactive=True),
                transcription, translated_text, audio_chunksr, temp_video_path,
-               "Translation complete")
+               "Translation complete", accumulated_aud_buf)
 
     user_button.click(
         fn=run_speech_translation_wrapper,
         inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
-        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message]
+        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message, user_audio_final]
     )
 
     async def update_replace_audio_button(audio_url, video_path):
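run_speech_translation_wrapper is a generator-style handler: each yield updates the whole output list at once, which is why the yielded tuple grows by one slot (accumulated_aud_buf) to match the extra entry in outputs=[...]. A minimal sketch of that yield-per-stage pattern (two outputs instead of the app's eight; the sleep stands in for the real pipeline):

import asyncio

import gradio as gr

async def stepped_job():
    # Each yield must supply one value per component in outputs=[...].
    yield 0.01, "Translation in progress..."
    await asyncio.sleep(1)  # stand-in for transcribe_and_speak
    yield 1.0, "Translation complete"

with gr.Blocks() as demo:
    prog = gr.Textbox(label="progress")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(stepped_job, inputs=None, outputs=[prog, status])

# demo.launch()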
@@ -653,10 +624,106 @@ with gr.Blocks() as demo:
             user_translation_output,
             user_audio_output,
         ],
-        every=0.
+        every=0.1
+    )
+
+    # JavaScript for client-side queueing and playback of each new clip
+    user_audio_output.change(
+        None,  # no backend change needed; we only handle frontend actions
+        inputs=user_audio_output,  # capture audio changes from user_audio_output
+        outputs=None,
+        js="""
+        async (audioFilePath) => {
+            // Debug: log the received audio file path
+            console.log("Received audio file path:", audioFilePath);
+
+            if (!window.audioQueue) {
+                window.audioQueue = [];
+                window.isPlaying = false;
+            }
+
+            // Ensure the correct URL for the audio file is available
+            if (audioFilePath && audioFilePath.url) {
+                console.log("Processing audio file...");
+
+                try {
+                    // Fetch and decode the audio file
+                    const response = await fetch(audioFilePath.url);
+                    if (!response.ok) {
+                        console.error("Failed to fetch audio file:", response.statusText);
+                        return;
+                    }
+
+                    const audioData = await response.arrayBuffer();
+                    const audioContext = new AudioContext();
+                    const decodedData = await audioContext.decodeAudioData(audioData);
+
+                    // Split the decoded audio buffer into two chunks
+                    const totalDuration = decodedData.duration;
+                    const midPoint = Math.floor(decodedData.length / 2); // midpoint for splitting
+                    const sampleRate = decodedData.sampleRate;
+
+                    // Create a separate AudioBuffer for each chunk
+                    const firstHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, midPoint, sampleRate);
+                    const secondHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, decodedData.length - midPoint, sampleRate);
+
+                    // Copy data from the original buffer into the two new buffers
+                    for (let channel = 0; channel < decodedData.numberOfChannels; channel++) {
+                        firstHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(0, midPoint), channel, 0);
+                        secondHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(midPoint), channel, 0);
+                    }
+
+                    // Add both chunks to the queue
+                    window.audioQueue.push(firstHalfBuffer);
+                    window.audioQueue.push(secondHalfBuffer);
+                    console.log("Two audio chunks added to queue. Queue length:", window.audioQueue.length);
+
+                    // Play the next audio chunk from the queue
+                    const playNextChunk = async () => {
+                        console.log("Attempting to play next chunk. isPlaying:", window.isPlaying);
+
+                        if (!window.isPlaying && window.audioQueue.length > 0) {
+                            console.log("Starting playback...");
+                            window.isPlaying = true;
+
+                            // Get the next audio buffer from the queue
+                            const audioBuffer = window.audioQueue.shift();
+                            console.log("Playing audio chunk from buffer.");
+
+                            const source = audioContext.createBufferSource();
+                            source.buffer = audioBuffer;
+                            source.connect(audioContext.destination);
+
+                            // When this chunk finishes, play the next one
+                            source.onended = () => {
+                                console.log("Audio chunk finished playing.");
+                                window.isPlaying = false;
+                                playNextChunk();
+                            };
+
+                            source.start(0); // start playing the current chunk
+                            console.log("Audio chunk started.");
+                        } else {
+                            console.log("Already playing or queue is empty.");
+                        }
+                    };
+
+                    // Start playback if not already playing
+                    playNextChunk();
+
+                } catch (error) {
+                    console.error("Error during audio playback:", error);
+                    window.isPlaying = false;
+                }
+            } else {
+                console.log("No valid audio file path received.");
+            }
+        }
+        """
     )
 
 demo.queue()
+
 demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
-#asyncio.run(demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD"))
+#asyncio.run(demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD"))))
 
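Two things to note in the last hunk: every=0.1 makes the preceding event re-poll the shared update dictionaries ten times per second, and the new js= handler moves playback into the browser, with a window-level queue replacing the deleted sounddevice thread (splitting each clip in half there mainly exercises the queue). A small sketch of the every= polling pattern on its own (the callback is illustrative; newer Gradio releases express the interval with gr.Timer instead of a float):

import time

import gradio as gr

def poll_updates():
    # Stand-in for the app's update checker that reads the shared dicts.
    return f"updated at {time.strftime('%H:%M:%S')}"

with gr.Blocks() as demo:
    live_box = gr.Textbox(label="live value")
    # Re-run poll_updates every 0.1 s while the page is open.
    demo.load(poll_updates, inputs=None, outputs=live_box, every=0.1)

# demo.launch()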