speech_translation_final

Runtime error

App Files Files Community

jerrypan7 committed on Oct 9, 2024

Commit

57e4840

verified ·

1 Parent(s): a590991

Update app.py

Browse files

test ok on my local machine

Files changed (1) hide show

app.py +115 -20

app.py CHANGED Viewed

@@ -6,6 +6,9 @@ from typing import Optional
 import tempfile
 from pydub import AudioSegment
 import re
 ASR_API = "http://astarwiz.com:9998/asr"
 TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
@@ -32,7 +35,70 @@ AVAILABLE_SPEAKERS = {
     "ta": ["ta_female1"],
     "zh": ["childChinese2"]
 }
 def fetch_youtube_id(youtube_url: str) -> str:
     if 'v=' in youtube_url:
         return youtube_url.split("v=")[1].split("&")[0]
@@ -43,7 +109,7 @@ def fetch_youtube_id(youtube_url: str) -> str:
     else:
         raise Exception("Unsupported URL format")
-def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[str]:
     video_id = fetch_youtube_id(youtube_url)
     if not video_id:
@@ -53,9 +119,9 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
         output_dir = tempfile.gettempdir()
     output_filename = os.path.join(output_dir, f"{video_id}.mp3")
-    if os.path.exists(output_filename):
-        return output_filename  # Return if the file already exists
     url = "https://youtube86.p.rapidapi.com/api/youtube/links"
     headers = {
@@ -78,7 +144,7 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
                 extension = url['extension']
                 audio_response = requests.get(audio_url)
-                if audio_response.status_code == 200:
                     temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
                     with open(temp_filename, 'wb') as audio_file:
                         audio_file.write(audio_response.content)
@@ -87,9 +153,9 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
                     audio = AudioSegment.from_file(temp_filename, format=extension)
                     audio = audio.set_frame_rate(16000)
                     audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
-                    os.remove(temp_filename)  # Remove the temporary file
-                    return output_filename  # Return the final MP3 filename
         return None  # Return None if no successful download occurs
     else:
@@ -161,13 +227,14 @@ def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
         return "The system got some error during vLLM generation. Please try it again."
 def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None):
     if youtube_url:
         audio = download_youtube_audio(youtube_url)
-        if not audio:
-            return "Failed to download YouTube audio.", None, None
     if not audio:
-        return "Please provide an audio input or a valid YouTube URL.", None, None
     # ASR
     file_id = str(uuid.uuid4())
@@ -183,7 +250,7 @@ def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, targ
     if asr_response.status_code == 200:
         transcription = asr_response.json()['text']
     else:
-        return "ASR failed", None, None
     split_result = split_text_with_punctuation(transcription)
@@ -206,17 +273,18 @@ def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, targ
     if tts_response.status_code == 200:
         audio_file = tts_response.text.strip()
         audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
-        return transcription, translated_text, audio_url
     else:
-        return transcription, translated_text, "TTS failed"
 def check_password(password):
     return password == DEVELOPER_PASSWORD
 def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
-    transcription, translated_text, audio_url = transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
-    return transcription, translated_text, audio_url
 with gr.Blocks() as demo:
     gr.Markdown("# Speech Translation")
@@ -236,6 +304,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         user_button = gr.Button("Translate and Speak", interactive=False)
     with gr.Row():
         user_transcription_output = gr.Textbox(label="Transcription")
         user_translation_output = gr.Textbox(label="Translation")
@@ -258,12 +327,38 @@ with gr.Blocks() as demo:
         outputs=user_button
     )
     user_button.click(
         fn=run_speech_translation,
         inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
-        outputs=[user_transcription_output, user_translation_output, user_audio_output]
     )
     def update_video_embed(youtube_url):
         if youtube_url:
             try:
@@ -288,4 +383,4 @@ with gr.Blocks() as demo:
         outputs=[user_target_speaker]
     )
-demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))

 import tempfile
 from pydub import AudioSegment
 import re
+import subprocess
+import numpy as np
+import soundfile as sf
 ASR_API = "http://astarwiz.com:9998/asr"
 TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
     "ta": ["ta_female1"],
     "zh": ["childChinese2"]
 }
+def replace_audio_in_video(video_path, audio_path, output_path):
+    command = [
+        'ffmpeg',
+        '-i', video_path,
+        '-i', audio_path,
+        '-c:v', 'copy',
+        '-map', '0:v:0',
+        '-map', '1:a:0',
+        '-shortest',
+        output_path
+    ]
+    subprocess.run(command, check=True)
+    return output_path
+def replace_audio_and_generate_video(temp_video_path, gradio_audio):
+    print (type(temp_video_path), type(gradio_audio))
+    if not temp_video_path or gradio_audio is None:
+        return "Both video and audio are required to replace audio.", None
+    if not os.path.exists(temp_video_path):
+        return "Video file not found.", None
+    # Unpack the Gradio audio output
+    sample_rate, audio_data = gradio_audio
+    # Ensure audio_data is a numpy array
+    if not isinstance(audio_data, np.ndarray):
+        audio_data = np.array(audio_data)
+    # Create a temporary WAV file
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio_file:
+        temp_audio_path = temp_audio_file.name
+        sf.write(temp_audio_path, audio_data, sample_rate)
+    # Generate output video path
+    output_video_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4()}.mp4")
+    try:
+        replace_audio_in_video(temp_video_path, temp_audio_path, output_video_path)
+        return "Audio replaced successfully.", output_video_path
+    except subprocess.CalledProcessError as e:
+        return f"Error replacing audio: {str(e)}", None
+    finally:
+        os.unlink(temp_audio_path)  # Clean up the temporary audio file
+"""
+def replace_audio_and_generate_video(temp_video_path, audio_path):
+    if not temp_video_path or not audio_path:
+        return "Both video and audio are required to replace audio.", None
+    if not os.path.exists(temp_video_path) or not os.path.exists(audio_path):
+        return "Video or audio file not found.", None
+    # Generate output video path
+    output_video_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4()}.mp4")
+    try:
+        replace_audio_in_video(temp_video_path, audio_path, output_video_path)
+        return "Audio replaced successfully.", output_video_path
+    except subprocess.CalledProcessError as e:
+        return f"Error replacing audio: {str(e)}", None
+"""
 def fetch_youtube_id(youtube_url: str) -> str:
     if 'v=' in youtube_url:
         return youtube_url.split("v=")[1].split("&")[0]
     else:
         raise Exception("Unsupported URL format")
+def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[tuple[str, str]]:
     video_id = fetch_youtube_id(youtube_url)
     if not video_id:
         output_dir = tempfile.gettempdir()
     output_filename = os.path.join(output_dir, f"{video_id}.mp3")
+    temp_filename = os.path.join(output_dir, f"{video_id}.mp4")
+    if os.path.exists(output_filename) and os.path.exists(temp_filename):
+        return (output_filename, temp_filename)  # Return if the file already exists
     url = "https://youtube86.p.rapidapi.com/api/youtube/links"
     headers = {
                 extension = url['extension']
                 audio_response = requests.get(audio_url)
+                if audio_response.status_code == 200:
                     temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
                     with open(temp_filename, 'wb') as audio_file:
                         audio_file.write(audio_response.content)
                     audio = AudioSegment.from_file(temp_filename, format=extension)
                     audio = audio.set_frame_rate(16000)
                     audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
+                    print ("audio video", output_filename,temp_filename)
+                    #os.remove(temp_filename)  # Remove the temporary file
+                    return (output_filename, temp_filename)   # Return the final MP3 filename
         return None  # Return None if no successful download occurs
     else:
         return "The system got some error during vLLM generation. Please try it again."
 def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None):
+    video_path =None
     if youtube_url:
         audio = download_youtube_audio(youtube_url)
+        if audio is None:
+            return "Failed to download YouTube audio.", None, None, video_path
+        audio, video_path =audio
     if not audio:
+        return "Please provide an audio input or a valid YouTube URL.", None, None, video_path
     # ASR
     file_id = str(uuid.uuid4())
     if asr_response.status_code == 200:
         transcription = asr_response.json()['text']
     else:
+        return "ASR failed", None, None, video_path
     split_result = split_text_with_punctuation(transcription)
     if tts_response.status_code == 200:
         audio_file = tts_response.text.strip()
         audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
+        return transcription, translated_text, audio_url,video_path
     else:
+        return transcription, translated_text, "TTS failed",video_path
 def check_password(password):
     return password == DEVELOPER_PASSWORD
 def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
+    temp_video_path =None;
+    transcription, translated_text, audio_url,temp_video_path = transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
+    return transcription, translated_text, audio_url,temp_video_path
 with gr.Blocks() as demo:
     gr.Markdown("# Speech Translation")
     with gr.Row():
         user_button = gr.Button("Translate and Speak", interactive=False)
     with gr.Row():
         user_transcription_output = gr.Textbox(label="Transcription")
         user_translation_output = gr.Textbox(label="Translation")
         outputs=user_button
     )
+    # New components
+    replace_audio_button = gr.Button("Replace Audio", interactive=False)
+    final_video_output = gr.Video(label="Video with Replaced Audio")
+    # Add a state to store temporary file paths
+    temp_video_path = gr.State()
     user_button.click(
         fn=run_speech_translation,
         inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
+        outputs=[user_transcription_output, user_translation_output, user_audio_output,temp_video_path]
+    )
+    # Enable the Replace Audio button when both video and audio are available
+    def update_replace_audio_button(audio_url, video_path):
+        print ("update replace:", audio_url, video_path)
+        return gr.Button(interactive=bool(audio_url) and bool(video_path))
+    user_audio_output.change(
+        fn=update_replace_audio_button,
+        inputs=[user_audio_output, temp_video_path],
+        outputs=[replace_audio_button]
     )
+    # Handle Replace Audio button click
+    replace_audio_button.click(
+        fn=replace_audio_and_generate_video,
+        inputs=[temp_video_path, user_audio_output],
+        outputs=[gr.Textbox(label="Status"), final_video_output]
+    )
     def update_video_embed(youtube_url):
         if youtube_url:
             try:
         outputs=[user_target_speaker]
     )
+demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))