jerrypan7 committed
Commit ef89cb1 · verified · Parent: 412e2b2

Update app.py


Adaptation of TTS to use timestamped speech segments for voice cloning.

Files changed (1): app.py (+67 / -39)
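The commit switches the default path from punctuation-based splitting of the transcription to timestamped ASR segments (bSegByPunct = False), translates each segment via the LLM API, and sends each translated segment to the cosy-voice TTS service, with the uploaded source audio available as a cloning reference. A minimal sketch of that per-segment flow, with the translate/synthesize callables standing in for inference_via_llm_api and the socket.io TTS request (their exact signatures are not shown in this diff):

    from typing import Awaitable, Callable, Dict, List, Tuple

    async def translate_timestamped_segments(
        segments: List[Dict],                           # [{'start': float, 'end': float, 'text': str}, ...]
        translate: Callable[[str], Awaitable[str]],     # stand-in for inference_via_llm_api
        synthesize: Callable[[str], Awaitable[bytes]],  # stand-in for the cosy-voice TTS request
    ) -> List[Tuple[Dict, str, bytes]]:
        """Translate and synthesize each timestamped ASR segment in order."""
        results = []
        for seg in segments:
            translated = await translate(seg['text'])   # per-segment translation prompt
            audio = await synthesize(translated)        # per-segment TTS in the cloned voice
            results.append((seg, translated, audio))
        return results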
app.py CHANGED
@@ -23,6 +23,10 @@ ASR_API = "http://astarwiz.com:9998/asr"
 TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
 TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
 
+
+#bSegByPunct = True
+bSegByPunct = False
+
 LANGUAGE_MAP = {
     "en": "English",
     "ma": "Malay",
@@ -40,10 +44,12 @@ AVAILABLE_SPEAKERS = {
     "zh": ["childChinese2"]
 }
 
+
 audio_update_event = asyncio.Event()
 acc_cosy_audio = None
 # cosy voice tts related;
-TTS_SOCKET_SERVER = "http://astarwiz.com:9123"
+#TTS_SOCKET_SERVER = "http://localhost:9244"
+TTS_SOCKET_SERVER = "http://astarwiz.com:9244"
 
 sio = socketio.AsyncClient()
 
@@ -209,7 +215,9 @@ async def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = N
         if url.get('isBundle'):
             audio_url = url['url']
             extension = url['extension']
+            print ("audio_url :", audio_url)
             async with session.get(audio_url) as audio_response:
+                print ("audio_response:", audio_response)
                 if audio_response.status == 200:
                     content = await audio_response.read()
                     temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
@@ -320,18 +328,17 @@ async def upload_file(file_path, upload_url):
         with open(file_path, 'rb') as f:
             form_data = aiohttp.FormData()
             form_data.add_field('file', f, filename=os.path.basename(file_path))
-
+
             async with session.post(upload_url, data=form_data) as response:
                 print(f"5. Client receives headers: {time.time()}")
                 print(f"Status: {response.status}")
-
+
                 result = await response.json()
                 print(f"7. Client fully received and parsed response: {time.time()}")
                 if response.status == 200:
                     return result
                 else:
                     return {"file_id",""}
-
 async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None, progress_tracker=None):
     global transcription_update, translation_update, audio_update, acc_cosy_audio,audio_update_event
     transcription_update = {"content": "", "new": True}
@@ -357,9 +364,12 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     data = aiohttp.FormData()
     data.add_field('file', open(audio, 'rb'))
     data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
-    data.add_field('model_name', 'whisper-large-v2-local-cs')
-    #data.add_field('with_timestamp', 'false')
-    data.add_field('with_timestamp', 'true')
+    if bSegByPunct:
+        data.add_field('model_name', 'whisper-large-v2-local-cs')
+        data.add_field('with_timestamp', 'false')
+    else:
+        data.add_field('model_name', 'official-v3')
+        data.add_field('with_timestamp', 'true')
 
     async with aiohttp.ClientSession() as session:
         async with session.post(ASR_API, data=data) as asr_response:
@@ -382,8 +392,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     server_url = TTS_SOCKET_SERVER
     await sio.connect(server_url)
     print(f"Connected to {server_url}")
-
-
+
     # Handle the audio file
     file_id=""
     if audio and os.path.exists(audio):
@@ -395,6 +404,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
         print ("upload_result:", upload_result)
         file_id = upload_result['file_id']
 
+
     # use defualt voice
     tts_request = {
         'text': transcription,
@@ -418,15 +428,20 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
 
 
 
-    #split_result = split_text_with_punctuation(transcription)
-    split_result = extract_segments(transcription);
+    if bSegByPunct:
+        split_result = split_text_with_punctuation(transcription)
+    else:
+        split_result = extract_segments(transcription);
+
     translate_segments = []
     accumulated_audio = None
     sample_rate = 22050
     global is_playing
     for i, segment in enumerate(split_result):
-        #translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
-        translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
+        if bSegByPunct:
+            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
+        else:
+            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
         translated_seg_txt = await inference_via_llm_api(translation_prompt)
         translate_segments.append(translated_seg_txt)
         print(f"Translation: {translated_seg_txt}")
@@ -454,8 +469,8 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
                 content = await response.read()
                 audio_chunk, sr = sf.read(BytesIO(content))
                 #print ('audio_chunk:', type(audio_chunk),audio_chunk)
-                print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
-                # _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
+                #print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
+                # _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
 
 
                 if accumulated_audio is None:
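The commented-out adjust_tempo_pysox_array call compares the source segment duration (segment['end'] - segment['start']) against the TTS chunk duration and would time-stretch the synthesized audio to match. One way such a helper could look, assuming the pysox package (the real implementation is not included in this commit):

    import numpy as np
    import sox  # pysox

    def adjust_tempo_pysox_array(audio, target_duration):
        """Stretch or compress a (sample_rate, samples) pair to roughly target_duration seconds."""
        sr, samples = audio
        current_duration = len(samples) / sr
        if target_duration <= 0 or current_duration <= 0:
            return sr, samples
        factor = current_duration / target_duration  # >1 speeds playback up, <1 slows it down
        tfm = sox.Transformer()
        tfm.tempo(factor)
        stretched = tfm.build_array(input_array=np.asarray(samples), sample_rate_in=sr)
        return sr, stretched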
@@ -513,6 +528,10 @@ async def update_audio():
         return content
     return gr.update()
 
+def disable_button():
+    # Disable the button during processing
+    return gr.update(interactive=False)
+
 with gr.Blocks() as demo:
     gr.Markdown("# Speech Translation")
 
@@ -533,15 +552,14 @@ with gr.Blocks() as demo:
         with gr.Row():
             user_transcription_output = gr.Textbox(label="Transcription")
             user_translation_output = gr.Textbox(label="Translation")
-            user_audio_output = gr.Audio(label="Translated Speech")
+            user_audio_output = gr.Audio(label="Translated Speech", visible =False)
             user_audio_final = gr.Audio(label="Final total Speech")
-            progress_bar = gr.Textbox(label="progress", interactive=False)
             status_message = gr.Textbox(label="Status", interactive=False)
 
             user_video_output = gr.HTML(label="YouTube Video")
 
-            replace_audio_button = gr.Button("Replace Audio", interactive=False)
-            final_video_output = gr.Video(label="Video with Replaced Audio")
+            replace_audio_button = gr.Button("Replace Audio", interactive=False, visible =False)
+            final_video_output = gr.Video(label="Video with Replaced Audio",visible=False)
 
     temp_video_path = gr.State()
     translation_progress = gr.State(0.0)
@@ -549,6 +567,7 @@ with gr.Blocks() as demo:
     async def update_button_state(audio, youtube_url, progress):
         print(audio, youtube_url, progress)
         # Button is interactive if there's input and progress is 0 or 1 (not in progress)
+        print ("progress:", audio, youtube_url,bool(audio) , bool(youtube_url), progress == 0 or progress == 1)
         return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
 
     user_audio_input.change(
@@ -562,31 +581,23 @@ with gr.Blocks() as demo:
         outputs=user_button
     )
 
-    async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker):
-
-
-        #audio_data, sample_rate = sf.read(audio)
-        #print ("user_audio_input:", audio, audio_data, sample_rate)
-
 
-        yield (0.01,
-               gr.update(interactive=False),
-               gr.update(), gr.update(), gr.update(), gr.update(),
-               "Translation in progress...",None)
+    async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker,progress):
 
-
+        progress = 0.1
         temp_video_path = None
         transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
-
-        yield (1,
-               gr.update(interactive=True),
-               transcription, translated_text, audio_chunksr, temp_video_path,
-               "Translation complete", accumulated_aud_buf)
+        progress = 1
+        return transcription, translated_text, audio_chunksr, temp_video_path, "Translation complete", accumulated_aud_buf, gr.update(interactive=True)
 
     user_button.click(
+        fn=disable_button,
+        inputs=[],
+        outputs=[user_button]  # Disable the button during processing
+    ).then(
         fn=run_speech_translation_wrapper,
-        inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
-        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,]
+        inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker, translation_progress],
+        outputs=[user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,user_button]
     )
 
     async def update_replace_audio_button(audio_url, video_path):
@@ -601,8 +612,8 @@ with gr.Blocks() as demo:
 
     replace_audio_button.click(
         fn=replace_audio_and_generate_video,
-        inputs=[temp_video_path, user_audio_output],
-        outputs=[gr.Textbox(label="Status"), final_video_output]
+        inputs=[temp_video_path, user_audio_final],
+        outputs=[status_message, final_video_output]
     )
 
     async def update_video_embed(youtube_url):
@@ -659,13 +670,16 @@ with gr.Blocks() as demo:
         async (audioFilePath) => {
             // Debug: Log received audio file path
            console.log("Received audio file path:", audioFilePath);
+
            if (!window.audioQueue) {
                window.audioQueue = [];
                window.isPlaying = false;
            }
+
            // Ensure the correct URL for the audio file is available
            if (audioFilePath && audioFilePath.url) {
                console.log("Processing audio file...");
+
                try {
                    // Fetch and decode the audio file
                    const response = await fetch(audioFilePath.url);
@@ -673,51 +687,64 @@ with gr.Blocks() as demo:
                        console.error("Failed to fetch audio file:", response.statusText);
                        return;
                    }
+
                    const audioData = await response.arrayBuffer();
                    const audioContext = new AudioContext();
                    const decodedData = await audioContext.decodeAudioData(audioData);
+
                    // Split the decoded audio buffer into two chunks
                    const totalDuration = decodedData.duration;
                    const midPoint = Math.floor(decodedData.length / 2); // Midpoint for splitting
                    const sampleRate = decodedData.sampleRate;
+
                    // Create two separate AudioBuffers for each chunk
                    const firstHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, midPoint, sampleRate);
                    const secondHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, decodedData.length - midPoint, sampleRate);
+
                    // Copy data from original buffer to the two new buffers
                    for (let channel = 0; channel < decodedData.numberOfChannels; channel++) {
                        firstHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(0, midPoint), channel, 0);
                        secondHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(midPoint), channel, 0);
                    }
+
                    // Add both chunks to the queue
                    window.audioQueue.push(firstHalfBuffer);
                    window.audioQueue.push(secondHalfBuffer);
                    console.log("Two audio chunks added to queue. Queue length:", window.audioQueue.length);
+
                    // Function to play the next audio chunk from the queue
                    const playNextChunk = async () => {
                        console.log("Attempting to play next chunk. isPlaying:", window.isPlaying);
+
                        if (!window.isPlaying && window.audioQueue.length > 0) {
                            console.log("Starting playback...");
                            window.isPlaying = true;
+
                            // Get the next audio buffer from the queue
                            const audioBuffer = window.audioQueue.shift();
                            console.log("Playing audio chunk from buffer.");
+
                            const source = audioContext.createBufferSource();
                            source.buffer = audioBuffer;
                            source.connect(audioContext.destination);
+
                            // When the audio finishes playing, play the next chunk
                            source.onended = () => {
                                console.log("Audio chunk finished playing.");
                                window.isPlaying = false;
                                playNextChunk(); // Play the next audio chunk in the queue
                            };
+
                            source.start(0); // Start playing the current chunk
                            console.log("Audio chunk started.");
                        } else {
                            console.log("Already playing or queue is empty.");
                        }
                    };
+
                    // Start playing the next chunk if not already playing
                    playNextChunk();
+
                } catch (error) {
                    console.error("Error during audio playback:", error);
                    window.isPlaying = false;
@@ -733,3 +760,4 @@ demo.queue()
 
 #demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
 asyncio.run(demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD"))))
+