jerrypan7 committed on
Commit
57e4840
·
verified ·
1 Parent(s): a590991

Update app.py

Browse files

Tested OK on my local machine.

Files changed (1) hide show
  1. app.py +115 -20
app.py CHANGED
@@ -6,6 +6,9 @@ from typing import Optional
6
  import tempfile
7
  from pydub import AudioSegment
8
  import re
 
 
 
9
 
10
  ASR_API = "http://astarwiz.com:9998/asr"
11
  TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
@@ -32,7 +35,70 @@ AVAILABLE_SPEAKERS = {
32
  "ta": ["ta_female1"],
33
  "zh": ["childChinese2"]
34
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def fetch_youtube_id(youtube_url: str) -> str:
37
  if 'v=' in youtube_url:
38
  return youtube_url.split("v=")[1].split("&")[0]
@@ -43,7 +109,7 @@ def fetch_youtube_id(youtube_url: str) -> str:
43
  else:
44
  raise Exception("Unsupported URL format")
45
 
46
- def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[str]:
47
  video_id = fetch_youtube_id(youtube_url)
48
 
49
  if not video_id:
@@ -53,9 +119,9 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
53
  output_dir = tempfile.gettempdir()
54
 
55
  output_filename = os.path.join(output_dir, f"{video_id}.mp3")
56
-
57
- if os.path.exists(output_filename):
58
- return output_filename # Return if the file already exists
59
 
60
  url = "https://youtube86.p.rapidapi.com/api/youtube/links"
61
  headers = {
@@ -78,7 +144,7 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
78
  extension = url['extension']
79
  audio_response = requests.get(audio_url)
80
 
81
- if audio_response.status_code == 200:
82
  temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
83
  with open(temp_filename, 'wb') as audio_file:
84
  audio_file.write(audio_response.content)
@@ -87,9 +153,9 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
87
  audio = AudioSegment.from_file(temp_filename, format=extension)
88
  audio = audio.set_frame_rate(16000)
89
  audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
90
-
91
- os.remove(temp_filename) # Remove the temporary file
92
- return output_filename # Return the final MP3 filename
93
 
94
  return None # Return None if no successful download occurs
95
  else:
@@ -161,13 +227,14 @@ def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
161
  return "The system got some error during vLLM generation. Please try it again."
162
 
163
  def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None):
 
164
  if youtube_url:
165
  audio = download_youtube_audio(youtube_url)
166
- if not audio:
167
- return "Failed to download YouTube audio.", None, None
168
-
169
  if not audio:
170
- return "Please provide an audio input or a valid YouTube URL.", None, None
171
 
172
  # ASR
173
  file_id = str(uuid.uuid4())
@@ -183,7 +250,7 @@ def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, targ
183
  if asr_response.status_code == 200:
184
  transcription = asr_response.json()['text']
185
  else:
186
- return "ASR failed", None, None
187
 
188
 
189
  split_result = split_text_with_punctuation(transcription)
@@ -206,17 +273,18 @@ def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, targ
206
  if tts_response.status_code == 200:
207
  audio_file = tts_response.text.strip()
208
  audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
209
- return transcription, translated_text, audio_url
210
  else:
211
- return transcription, translated_text, "TTS failed"
212
 
213
  def check_password(password):
214
  return password == DEVELOPER_PASSWORD
215
-
216
  def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
217
- transcription, translated_text, audio_url = transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
 
218
 
219
- return transcription, translated_text, audio_url
220
 
221
  with gr.Blocks() as demo:
222
  gr.Markdown("# Speech Translation")
@@ -236,6 +304,7 @@ with gr.Blocks() as demo:
236
  with gr.Row():
237
  user_button = gr.Button("Translate and Speak", interactive=False)
238
 
 
239
  with gr.Row():
240
  user_transcription_output = gr.Textbox(label="Transcription")
241
  user_translation_output = gr.Textbox(label="Translation")
@@ -258,12 +327,38 @@ with gr.Blocks() as demo:
258
  outputs=user_button
259
  )
260
 
 
 
 
 
 
 
 
261
  user_button.click(
262
  fn=run_speech_translation,
263
  inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
264
- outputs=[user_transcription_output, user_translation_output, user_audio_output]
 
 
 
 
 
 
 
 
 
 
 
 
265
  )
266
 
 
 
 
 
 
 
 
267
  def update_video_embed(youtube_url):
268
  if youtube_url:
269
  try:
@@ -288,4 +383,4 @@ with gr.Blocks() as demo:
288
  outputs=[user_target_speaker]
289
  )
290
 
291
- demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
 
6
  import tempfile
7
  from pydub import AudioSegment
8
  import re
9
+ import subprocess
10
+ import numpy as np
11
+ import soundfile as sf
12
 
13
  ASR_API = "http://astarwiz.com:9998/asr"
14
  TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
 
35
  "ta": ["ta_female1"],
36
  "zh": ["childChinese2"]
37
  }
38
def replace_audio_in_video(video_path, audio_path, output_path):
    """Combine the video stream of one file with the audio stream of another.

    Invokes ``ffmpeg`` so that the first video stream of ``video_path`` is
    copied without re-encoding (``-c:v copy``) and paired with the first
    audio stream of ``audio_path``. ``-shortest`` ends the output when the
    shorter of the two streams ends.

    Args:
        video_path: Path of the file that supplies the video stream.
        audio_path: Path of the file that supplies the replacement audio.
        output_path: Path of the file ffmpeg should write.

    Returns:
        ``output_path``, once ffmpeg has exited successfully.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    command = [
        'ffmpeg',
        # This runs inside a web app: never let ffmpeg read the server's
        # stdin, and never let it stop to ask whether to overwrite a file.
        '-nostdin',
        '-y',
        '-i', video_path,
        '-i', audio_path,
        '-c:v', 'copy',
        '-map', '0:v:0',
        '-map', '1:a:0',
        '-shortest',
        output_path,
    ]
    # A list argv (no shell) keeps the paths from being shell-interpreted.
    subprocess.run(command, check=True)
    return output_path
51
 
52
def replace_audio_and_generate_video(temp_video_path, gradio_audio):
    """Write the given audio to a WAV file and mux it into the given video.

    Args:
        temp_video_path: Path of the video whose audio track is replaced.
        gradio_audio: A ``(sample_rate, audio_data)`` pair; ``audio_data`` is
            converted to a numpy array if it is not one already.

    Returns:
        A ``(status_message, output_video_path)`` tuple. ``output_video_path``
        is ``None`` whenever the replacement did not succeed.
    """
    if not temp_video_path or gradio_audio is None:
        return "Both video and audio are required to replace audio.", None

    if not os.path.exists(temp_video_path):
        return "Video file not found.", None

    # Unpack the Gradio audio output
    sample_rate, audio_data = gradio_audio

    # Ensure audio_data is a numpy array
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data)

    # Only reserve a unique path here; the handle is closed before anything
    # is written so the file is not held open twice while it is filled.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio_file:
        temp_audio_path = temp_audio_file.name

    # Generate output video path
    output_video_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4()}.mp4")

    try:
        # The write happens inside the try block so the temporary WAV is
        # removed even when encoding the audio fails.
        sf.write(temp_audio_path, audio_data, sample_rate)
        replace_audio_in_video(temp_video_path, temp_audio_path, output_video_path)
        return "Audio replaced successfully.", output_video_path
    except subprocess.CalledProcessError as e:
        return f"Error replacing audio: {str(e)}", None
    except FileNotFoundError as e:
        # Raised by subprocess when the ffmpeg executable cannot be launched;
        # report it through the status message like other ffmpeg failures.
        return f"Error replacing audio: {str(e)}", None
    finally:
        # Clean up the temporary audio file
        if os.path.exists(temp_audio_path):
            os.unlink(temp_audio_path)
82
+ """
83
+ def replace_audio_and_generate_video(temp_video_path, audio_path):
84
+ if not temp_video_path or not audio_path:
85
+ return "Both video and audio are required to replace audio.", None
86
+
87
+ if not os.path.exists(temp_video_path) or not os.path.exists(audio_path):
88
+ return "Video or audio file not found.", None
89
+
90
+ # Generate output video path
91
+ output_video_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4()}.mp4")
92
+
93
+ try:
94
+ replace_audio_in_video(temp_video_path, audio_path, output_video_path)
95
+ return "Audio replaced successfully.", output_video_path
96
+ except subprocess.CalledProcessError as e:
97
+ return f"Error replacing audio: {str(e)}", None
98
+
99
+ """
100
+
101
+
102
  def fetch_youtube_id(youtube_url: str) -> str:
103
  if 'v=' in youtube_url:
104
  return youtube_url.split("v=")[1].split("&")[0]
 
109
  else:
110
  raise Exception("Unsupported URL format")
111
 
112
+ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[tuple[str, str]]:
113
  video_id = fetch_youtube_id(youtube_url)
114
 
115
  if not video_id:
 
119
  output_dir = tempfile.gettempdir()
120
 
121
  output_filename = os.path.join(output_dir, f"{video_id}.mp3")
122
+ temp_filename = os.path.join(output_dir, f"{video_id}.mp4")
123
+ if os.path.exists(output_filename) and os.path.exists(temp_filename):
124
+ return (output_filename, temp_filename) # Return if the file already exists
125
 
126
  url = "https://youtube86.p.rapidapi.com/api/youtube/links"
127
  headers = {
 
144
  extension = url['extension']
145
  audio_response = requests.get(audio_url)
146
 
147
+ if audio_response.status_code == 200:
148
  temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
149
  with open(temp_filename, 'wb') as audio_file:
150
  audio_file.write(audio_response.content)
 
153
  audio = AudioSegment.from_file(temp_filename, format=extension)
154
  audio = audio.set_frame_rate(16000)
155
  audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
156
+ print ("audio video", output_filename,temp_filename)
157
+ #os.remove(temp_filename) # Remove the temporary file
158
+ return (output_filename, temp_filename) # Return the final MP3 filename
159
 
160
  return None # Return None if no successful download occurs
161
  else:
 
227
  return "The system got some error during vLLM generation. Please try it again."
228
 
229
  def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None):
230
+ video_path =None
231
  if youtube_url:
232
  audio = download_youtube_audio(youtube_url)
233
+ if audio is None:
234
+ return "Failed to download YouTube audio.", None, None, video_path
235
+ audio, video_path =audio
236
  if not audio:
237
+ return "Please provide an audio input or a valid YouTube URL.", None, None, video_path
238
 
239
  # ASR
240
  file_id = str(uuid.uuid4())
 
250
  if asr_response.status_code == 200:
251
  transcription = asr_response.json()['text']
252
  else:
253
+ return "ASR failed", None, None, video_path
254
 
255
 
256
  split_result = split_text_with_punctuation(transcription)
 
273
  if tts_response.status_code == 200:
274
  audio_file = tts_response.text.strip()
275
  audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
276
+ return transcription, translated_text, audio_url,video_path
277
  else:
278
+ return transcription, translated_text, "TTS failed",video_path
279
 
280
def check_password(password):
    """Return whether ``password`` equals the module's ``DEVELOPER_PASSWORD``."""
    is_match = password == DEVELOPER_PASSWORD
    return is_match
282
+
283
def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
    """Run the transcribe/translate/speak pipeline for the UI button.

    Forwards all arguments to ``transcribe_and_speak`` and returns its four
    results in the same order.

    Returns:
        A ``(transcription, translated_text, audio_url, temp_video_path)``
        tuple, exactly as produced by ``transcribe_and_speak``.
    """
    # Unpacking (rather than returning the call result directly) keeps the
    # requirement that the callee yields exactly four values.
    transcription, translated_text, audio_url, temp_video_path = transcribe_and_speak(
        audio, source_lang, target_lang, youtube_url, target_speaker
    )
    return transcription, translated_text, audio_url, temp_video_path
288
 
289
  with gr.Blocks() as demo:
290
  gr.Markdown("# Speech Translation")
 
304
  with gr.Row():
305
  user_button = gr.Button("Translate and Speak", interactive=False)
306
 
307
+
308
  with gr.Row():
309
  user_transcription_output = gr.Textbox(label="Transcription")
310
  user_translation_output = gr.Textbox(label="Translation")
 
327
  outputs=user_button
328
  )
329
 
330
+ # New components
331
+ replace_audio_button = gr.Button("Replace Audio", interactive=False)
332
+ final_video_output = gr.Video(label="Video with Replaced Audio")
333
+
334
+ # Add a state to store temporary file paths
335
+ temp_video_path = gr.State()
336
+
337
  user_button.click(
338
  fn=run_speech_translation,
339
  inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
340
+ outputs=[user_transcription_output, user_translation_output, user_audio_output,temp_video_path]
341
+ )
342
+
343
+
344
+ # Enable the Replace Audio button when both video and audio are available
345
+ def update_replace_audio_button(audio_url, video_path):
346
+ print ("update replace:", audio_url, video_path)
347
+ return gr.Button(interactive=bool(audio_url) and bool(video_path))
348
+
349
+ user_audio_output.change(
350
+ fn=update_replace_audio_button,
351
+ inputs=[user_audio_output, temp_video_path],
352
+ outputs=[replace_audio_button]
353
  )
354
 
355
+ # Handle Replace Audio button click
356
+ replace_audio_button.click(
357
+ fn=replace_audio_and_generate_video,
358
+ inputs=[temp_video_path, user_audio_output],
359
+ outputs=[gr.Textbox(label="Status"), final_video_output]
360
+ )
361
+
362
  def update_video_embed(youtube_url):
363
  if youtube_url:
364
  try:
 
383
  outputs=[user_target_speaker]
384
  )
385
 
386
+ demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))