Spaces:
Runtime error
Runtime error
Update app.py
Browse files · test ok on my local machine
app.py
CHANGED
@@ -6,6 +6,9 @@ from typing import Optional
|
|
6 |
import tempfile
|
7 |
from pydub import AudioSegment
|
8 |
import re
|
|
|
|
|
|
|
9 |
|
10 |
ASR_API = "http://astarwiz.com:9998/asr"
|
11 |
TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
|
@@ -32,7 +35,70 @@ AVAILABLE_SPEAKERS = {
|
|
32 |
"ta": ["ta_female1"],
|
33 |
"zh": ["childChinese2"]
|
34 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
def fetch_youtube_id(youtube_url: str) -> str:
|
37 |
if 'v=' in youtube_url:
|
38 |
return youtube_url.split("v=")[1].split("&")[0]
|
@@ -43,7 +109,7 @@ def fetch_youtube_id(youtube_url: str) -> str:
|
|
43 |
else:
|
44 |
raise Exception("Unsupported URL format")
|
45 |
|
46 |
-
def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[str]:
|
47 |
video_id = fetch_youtube_id(youtube_url)
|
48 |
|
49 |
if not video_id:
|
@@ -53,9 +119,9 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
|
|
53 |
output_dir = tempfile.gettempdir()
|
54 |
|
55 |
output_filename = os.path.join(output_dir, f"{video_id}.mp3")
|
56 |
-
|
57 |
-
if os.path.exists(output_filename):
|
58 |
-
return output_filename # Return if the file already exists
|
59 |
|
60 |
url = "https://youtube86.p.rapidapi.com/api/youtube/links"
|
61 |
headers = {
|
@@ -78,7 +144,7 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
|
|
78 |
extension = url['extension']
|
79 |
audio_response = requests.get(audio_url)
|
80 |
|
81 |
-
if audio_response.status_code == 200:
|
82 |
temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
|
83 |
with open(temp_filename, 'wb') as audio_file:
|
84 |
audio_file.write(audio_response.content)
|
@@ -87,9 +153,9 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
|
|
87 |
audio = AudioSegment.from_file(temp_filename, format=extension)
|
88 |
audio = audio.set_frame_rate(16000)
|
89 |
audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
|
90 |
-
|
91 |
-
os.remove(temp_filename) # Remove the temporary file
|
92 |
-
return output_filename
|
93 |
|
94 |
return None # Return None if no successful download occurs
|
95 |
else:
|
@@ -161,13 +227,14 @@ def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
|
|
161 |
return "The system got some error during vLLM generation. Please try it again."
|
162 |
|
163 |
def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None):
|
|
|
164 |
if youtube_url:
|
165 |
audio = download_youtube_audio(youtube_url)
|
166 |
-
if
|
167 |
-
return "Failed to download YouTube audio.", None, None
|
168 |
-
|
169 |
if not audio:
|
170 |
-
return "Please provide an audio input or a valid YouTube URL.", None, None
|
171 |
|
172 |
# ASR
|
173 |
file_id = str(uuid.uuid4())
|
@@ -183,7 +250,7 @@ def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, targ
|
|
183 |
if asr_response.status_code == 200:
|
184 |
transcription = asr_response.json()['text']
|
185 |
else:
|
186 |
-
return "ASR failed", None, None
|
187 |
|
188 |
|
189 |
split_result = split_text_with_punctuation(transcription)
|
@@ -206,17 +273,18 @@ def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, targ
|
|
206 |
if tts_response.status_code == 200:
|
207 |
audio_file = tts_response.text.strip()
|
208 |
audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
|
209 |
-
return transcription, translated_text, audio_url
|
210 |
else:
|
211 |
-
return transcription, translated_text, "TTS failed"
|
212 |
|
213 |
def check_password(password):
    """Report whether ``password`` equals the module-level DEVELOPER_PASSWORD."""
    is_match = password == DEVELOPER_PASSWORD
    return is_match
|
215 |
-
|
216 |
def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
|
217 |
-
|
|
|
218 |
|
219 |
-
return transcription, translated_text, audio_url
|
220 |
|
221 |
with gr.Blocks() as demo:
|
222 |
gr.Markdown("# Speech Translation")
|
@@ -236,6 +304,7 @@ with gr.Blocks() as demo:
|
|
236 |
with gr.Row():
|
237 |
user_button = gr.Button("Translate and Speak", interactive=False)
|
238 |
|
|
|
239 |
with gr.Row():
|
240 |
user_transcription_output = gr.Textbox(label="Transcription")
|
241 |
user_translation_output = gr.Textbox(label="Translation")
|
@@ -258,12 +327,38 @@ with gr.Blocks() as demo:
|
|
258 |
outputs=user_button
|
259 |
)
|
260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
user_button.click(
|
262 |
fn=run_speech_translation,
|
263 |
inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
|
264 |
-
outputs=[user_transcription_output, user_translation_output, user_audio_output]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
)
|
266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
def update_video_embed(youtube_url):
|
268 |
if youtube_url:
|
269 |
try:
|
@@ -288,4 +383,4 @@ with gr.Blocks() as demo:
|
|
288 |
outputs=[user_target_speaker]
|
289 |
)
|
290 |
|
291 |
-
demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
|
|
|
6 |
import tempfile
|
7 |
from pydub import AudioSegment
|
8 |
import re
|
9 |
+
import subprocess
|
10 |
+
import numpy as np
|
11 |
+
import soundfile as sf
|
12 |
|
13 |
ASR_API = "http://astarwiz.com:9998/asr"
|
14 |
TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
|
|
|
35 |
"ta": ["ta_female1"],
|
36 |
"zh": ["childChinese2"]
|
37 |
}
|
38 |
+
def replace_audio_in_video(video_path, audio_path, output_path):
    """Run ffmpeg to combine the video of one file with the audio of another.

    Args:
        video_path: First ffmpeg input; its first video stream is mapped.
        audio_path: Second ffmpeg input; its first audio stream is mapped.
        output_path: Destination file passed to ffmpeg.

    Returns:
        ``output_path``, unchanged.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status
            (``check=True``).
    """
    input_args = ['-i', video_path, '-i', audio_path]
    # Video is stream-copied ('-c:v copy'); '-shortest' is passed so the
    # output stops with the shorter of the two mapped streams.
    stream_args = ['-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0', '-shortest']
    subprocess.run(['ffmpeg', *input_args, *stream_args, output_path], check=True)
    return output_path
|
51 |
|
52 |
+
def replace_audio_and_generate_video(temp_video_path, gradio_audio):
    """Swap a video's audio track for audio supplied as a Gradio value.

    Args:
        temp_video_path: Path to the source video file on disk.
        gradio_audio: A ``(sample_rate, audio_data)`` pair; ``audio_data`` is
            converted to a numpy array when it is not one already.

    Returns:
        A ``(status_message, output_video_path)`` tuple. ``output_video_path``
        is ``None`` whenever the inputs are missing, the video file does not
        exist, or ffmpeg fails.
    """
    if not temp_video_path or gradio_audio is None:
        return "Both video and audio are required to replace audio.", None

    if not os.path.exists(temp_video_path):
        return "Video file not found.", None

    # Unpack the Gradio audio value.
    sample_rate, audio_data = gradio_audio
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data)

    # Reserve a temporary WAV path; delete=False because the file is reopened
    # by name (by soundfile, then ffmpeg) after this handle is closed.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio_file:
        temp_audio_path = temp_audio_file.name

    output_video_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4()}.mp4")

    try:
        # The write happens inside the try so that a failure while writing
        # the WAV still reaches the cleanup below instead of leaking the file.
        sf.write(temp_audio_path, audio_data, sample_rate)
        replace_audio_in_video(temp_video_path, temp_audio_path, output_video_path)
        return "Audio replaced successfully.", output_video_path
    except subprocess.CalledProcessError as e:
        return f"Error replacing audio: {str(e)}", None
    finally:
        os.unlink(temp_audio_path)  # Clean up the temporary audio file
|
82 |
+
"""
|
83 |
+
def replace_audio_and_generate_video(temp_video_path, audio_path):
|
84 |
+
if not temp_video_path or not audio_path:
|
85 |
+
return "Both video and audio are required to replace audio.", None
|
86 |
+
|
87 |
+
if not os.path.exists(temp_video_path) or not os.path.exists(audio_path):
|
88 |
+
return "Video or audio file not found.", None
|
89 |
+
|
90 |
+
# Generate output video path
|
91 |
+
output_video_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4()}.mp4")
|
92 |
+
|
93 |
+
try:
|
94 |
+
replace_audio_in_video(temp_video_path, audio_path, output_video_path)
|
95 |
+
return "Audio replaced successfully.", output_video_path
|
96 |
+
except subprocess.CalledProcessError as e:
|
97 |
+
return f"Error replacing audio: {str(e)}", None
|
98 |
+
|
99 |
+
"""
|
100 |
+
|
101 |
+
|
102 |
def fetch_youtube_id(youtube_url: str) -> str:
|
103 |
if 'v=' in youtube_url:
|
104 |
return youtube_url.split("v=")[1].split("&")[0]
|
|
|
109 |
else:
|
110 |
raise Exception("Unsupported URL format")
|
111 |
|
112 |
+
def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[tuple[str, str]]:
|
113 |
video_id = fetch_youtube_id(youtube_url)
|
114 |
|
115 |
if not video_id:
|
|
|
119 |
output_dir = tempfile.gettempdir()
|
120 |
|
121 |
output_filename = os.path.join(output_dir, f"{video_id}.mp3")
|
122 |
+
temp_filename = os.path.join(output_dir, f"{video_id}.mp4")
|
123 |
+
if os.path.exists(output_filename) and os.path.exists(temp_filename):
|
124 |
+
return (output_filename, temp_filename) # Return if the file already exists
|
125 |
|
126 |
url = "https://youtube86.p.rapidapi.com/api/youtube/links"
|
127 |
headers = {
|
|
|
144 |
extension = url['extension']
|
145 |
audio_response = requests.get(audio_url)
|
146 |
|
147 |
+
if audio_response.status_code == 200:
|
148 |
temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
|
149 |
with open(temp_filename, 'wb') as audio_file:
|
150 |
audio_file.write(audio_response.content)
|
|
|
153 |
audio = AudioSegment.from_file(temp_filename, format=extension)
|
154 |
audio = audio.set_frame_rate(16000)
|
155 |
audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
|
156 |
+
print ("audio video", output_filename,temp_filename)
|
157 |
+
#os.remove(temp_filename) # Remove the temporary file
|
158 |
+
return (output_filename, temp_filename) # Return the final MP3 filename
|
159 |
|
160 |
return None # Return None if no successful download occurs
|
161 |
else:
|
|
|
227 |
return "The system got some error during vLLM generation. Please try it again."
|
228 |
|
229 |
def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None):
|
230 |
+
video_path =None
|
231 |
if youtube_url:
|
232 |
audio = download_youtube_audio(youtube_url)
|
233 |
+
if audio is None:
|
234 |
+
return "Failed to download YouTube audio.", None, None, video_path
|
235 |
+
audio, video_path =audio
|
236 |
if not audio:
|
237 |
+
return "Please provide an audio input or a valid YouTube URL.", None, None, video_path
|
238 |
|
239 |
# ASR
|
240 |
file_id = str(uuid.uuid4())
|
|
|
250 |
if asr_response.status_code == 200:
|
251 |
transcription = asr_response.json()['text']
|
252 |
else:
|
253 |
+
return "ASR failed", None, None, video_path
|
254 |
|
255 |
|
256 |
split_result = split_text_with_punctuation(transcription)
|
|
|
273 |
if tts_response.status_code == 200:
|
274 |
audio_file = tts_response.text.strip()
|
275 |
audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
|
276 |
+
return transcription, translated_text, audio_url,video_path
|
277 |
else:
|
278 |
+
return transcription, translated_text, "TTS failed",video_path
|
279 |
|
280 |
def check_password(password):
    """Report whether ``password`` equals the module-level DEVELOPER_PASSWORD."""
    is_match = password == DEVELOPER_PASSWORD
    return is_match
|
282 |
+
|
283 |
def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
    """Delegate to ``transcribe_and_speak`` and relay its four results.

    Returns:
        A 4-tuple of (transcription, translated text, audio URL or status,
        video path), exactly as unpacked from ``transcribe_and_speak``.
    """
    text, translation, speech, video = transcribe_and_speak(
        audio, source_lang, target_lang, youtube_url, target_speaker
    )
    return text, translation, speech, video
|
288 |
|
289 |
with gr.Blocks() as demo:
|
290 |
gr.Markdown("# Speech Translation")
|
|
|
304 |
with gr.Row():
|
305 |
user_button = gr.Button("Translate and Speak", interactive=False)
|
306 |
|
307 |
+
|
308 |
with gr.Row():
|
309 |
user_transcription_output = gr.Textbox(label="Transcription")
|
310 |
user_translation_output = gr.Textbox(label="Translation")
|
|
|
327 |
outputs=user_button
|
328 |
)
|
329 |
|
330 |
+
# New components
|
331 |
+
replace_audio_button = gr.Button("Replace Audio", interactive=False)
|
332 |
+
final_video_output = gr.Video(label="Video with Replaced Audio")
|
333 |
+
|
334 |
+
# Add a state to store temporary file paths
|
335 |
+
temp_video_path = gr.State()
|
336 |
+
|
337 |
user_button.click(
|
338 |
fn=run_speech_translation,
|
339 |
inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
|
340 |
+
outputs=[user_transcription_output, user_translation_output, user_audio_output,temp_video_path]
|
341 |
+
)
|
342 |
+
|
343 |
+
|
344 |
+
# Enable the Replace Audio button when both video and audio are available
|
345 |
+
def update_replace_audio_button(audio_url, video_path):
    """Build a Button that is interactive only when both arguments are truthy."""
    print ("update replace:", audio_url, video_path)
    ready = False
    if audio_url:
        ready = bool(video_path)
    return gr.Button(interactive=ready)
|
348 |
+
|
349 |
+
user_audio_output.change(
|
350 |
+
fn=update_replace_audio_button,
|
351 |
+
inputs=[user_audio_output, temp_video_path],
|
352 |
+
outputs=[replace_audio_button]
|
353 |
)
|
354 |
|
355 |
+
# Handle Replace Audio button click
|
356 |
+
replace_audio_button.click(
|
357 |
+
fn=replace_audio_and_generate_video,
|
358 |
+
inputs=[temp_video_path, user_audio_output],
|
359 |
+
outputs=[gr.Textbox(label="Status"), final_video_output]
|
360 |
+
)
|
361 |
+
|
362 |
def update_video_embed(youtube_url):
|
363 |
if youtube_url:
|
364 |
try:
|
|
|
383 |
outputs=[user_target_speaker]
|
384 |
)
|
385 |
|
386 |
+
demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
|