Spaces:
Runtime error
Runtime error
File size: 14,790 Bytes
970d7ce c54a536 57e4840 dc7cfc8 c54a536 90bd7c5 a590991 c54a536 34db584 c54a536 57e4840 c54a536 57e4840 dc7cfc8 57e4840 dc7cfc8 57e4840 dc7cfc8 57e4840 dc7cfc8 57e4840 dc7cfc8 57e4840 dc7cfc8 57e4840 dc7cfc8 57e4840 c54a536 57e4840 c54a536 57e4840 c54a536 57e4840 c54a536 57e4840 c54a536 df13cc5 542e87b df13cc5 000d398 df13cc5 542e87b c54a536 57e4840 c54a536 57e4840 c54a536 57e4840 c54a536 57e4840 c54a536 df13cc5 c54a536 57e4840 c54a536 57e4840 c54a536 57e4840 c54a536 57e4840 c54a536 57e4840 c54a536 57e4840 c54a536 970d7ce c54a536 970d7ce c54a536 57e4840 c54a536 57e4840 970d7ce 57e4840 c54a536 970d7ce c54a536 970d7ce c54a536 57e4840 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 |
import gradio as gr
import requests
import uuid
import os
from typing import Optional
import tempfile
from pydub import AudioSegment
import re
import subprocess
import numpy as np
import soundfile as sf
import sox
from moviepy.editor import VideoFileClip
# Endpoint of the speech-recognition (ASR) service; receives the audio file as a multipart upload.
ASR_API = "http://astarwiz.com:9998/asr"
# TTS endpoints: the "speak" call answers with a file name, which is then
# passed as the `file` query parameter of the "wave" URL to fetch the audio.
TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
# Language codes used by the UI mapped to the English names inserted into the translation prompt.
LANGUAGE_MAP = {
"en": "English",
"ma": "Malay",
"ta": "Tamil",
"zh": "Chinese"
}
# Developer-mode password, read from the DEV_PWD environment variable (None when unset).
DEVELOPER_PASSWORD = os.getenv("DEV_PWD")
# RapidAPI key, read from the RAPID_API_KEY environment variable; sent as the
# `x-rapidapi-key` header when requesting YouTube download links.
RAPID_API_KEY = os.getenv("RAPID_API_KEY")
# TTS speakers available per target language; the first entry of each list is used as the default.
AVAILABLE_SPEAKERS = {
"en": ["MS"],
"ma": ["ChildMs_100049"],
"ta": ["ta_female1"],
"zh": ["childChinese2"]
}
def replace_audio_in_video(video_path, audio_path, output_path):
    """Write a copy of a video whose audio track is replaced by another file's audio.

    Runs ``ffmpeg`` as a subprocess. The video stream is copied without
    re-encoding and the output ends with the shorter of the two streams.

    Args:
        video_path: Path of the file providing the video stream.
        audio_path: Path of the file providing the audio stream.
        output_path: Path of the file to create (overwritten if it exists).

    Returns:
        ``output_path``, unchanged.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    command = [
        'ffmpeg',
        # Overwrite without asking: this runs headless, so an interactive
        # "Overwrite? [y/N]" prompt could never be answered.
        '-y',
        '-i', video_path,
        '-i', audio_path,
        '-c:v', 'copy',      # copy the video stream as-is
        '-map', '0:v:0',     # first video stream of the first input
        '-map', '1:a:0',     # first audio stream of the second input
        '-shortest',         # stop when the shorter stream ends
        output_path
    ]
    subprocess.run(command, check=True)
    return output_path
def replace_audio_and_generate_video(temp_video_path, gradio_audio):
    """Stretch the given audio to the video's length and mux it into a new video.

    Args:
        temp_video_path: Path of the source video file on disk.
        gradio_audio: ``(sample_rate, samples)`` pair as delivered by the
            Gradio audio component wired to this handler.

    Returns:
        A ``(status_message, output_video_path)`` tuple. ``output_video_path``
        is ``None`` whenever the replacement could not be performed.
    """
    if not temp_video_path or gradio_audio is None:
        return "Both video and audio are required to replace audio.", None
    if not os.path.exists(temp_video_path):
        return "Video file not found.", None

    # Unpack the Gradio audio output and make sure the samples are an ndarray.
    sample_rate, audio_data = gradio_audio
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data)
    if not sample_rate or len(audio_data) == 0:
        return "Audio is empty; cannot replace audio.", None

    # Read the video duration; always release the clip, even if reading fails.
    video_clip = VideoFileClip(temp_video_path)
    try:
        video_duration = video_clip.duration
    finally:
        video_clip.close()
    if not video_duration:
        # Guards the division below against a zero or unknown duration.
        return "Video has zero duration; cannot replace audio.", None

    # A factor > 1 speeds the audio up, < 1 slows it down, so that it lasts
    # as long as the video.
    audio_duration = len(audio_data) / sample_rate
    tempo_factor = audio_duration / video_duration

    temp_paths = []  # every temp file created below, removed in `finally`
    try:
        # Temporary WAV file holding the unmodified audio.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio_file:
            original_audio_path = temp_audio_file.name
        temp_paths.append(original_audio_path)
        sf.write(original_audio_path, audio_data, sample_rate)

        # Temporary WAV file receiving the tempo-adjusted audio.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio_file:
            adjusted_audio_path = temp_audio_file.name
        temp_paths.append(adjusted_audio_path)

        tfm = sox.Transformer()
        tfm.tempo(tempo_factor, 's')
        tfm.build(original_audio_path, adjusted_audio_path)

        output_video_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4()}.mp4")
        replace_audio_in_video(temp_video_path, adjusted_audio_path, output_video_path)
        return "Audio replaced successfully.", output_video_path
    except subprocess.CalledProcessError as e:
        return f"Error replacing audio: {str(e)}", None
    finally:
        # Best-effort cleanup that must not mask the real outcome.
        for path in temp_paths:
            try:
                os.unlink(path)
            except OSError:
                pass
def fetch_youtube_id(youtube_url: str) -> str:
    """Extract the video ID from a YouTube URL.

    Supported forms are ``...watch?v=<id>``, ``youtu.be/<id>`` and
    ``.../shorts/<id>``. Any query string, fragment or trailing path that
    follows the ID (for example ``?t=30`` or ``?feature=share``) is dropped.

    Args:
        youtube_url: The URL as entered by the user.

    Returns:
        The video ID.

    Raises:
        Exception: If the URL matches none of the supported forms.
    """
    if 'v=' in youtube_url:
        video_id = youtube_url.split("v=")[1].split("&")[0]
    elif 'youtu.be/' in youtube_url:
        video_id = youtube_url.split("youtu.be/")[1]
    elif 'shorts' in youtube_url:
        # Drop the query/fragment and a trailing slash first so the last path
        # component really is the ID.
        without_query = re.split(r'[?#]', youtube_url, maxsplit=1)[0]
        video_id = without_query.rstrip("/").split("/")[-1]
    else:
        raise Exception("Unsupported URL format")
    # Cut anything that trails the ID (share links append e.g. "?t=30").
    return re.split(r'[?&#/]', video_id, maxsplit=1)[0]
def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[tuple[str, str]]:
    """Download a YouTube video and derive a 16 kHz MP3 from it.

    Download links are requested from the youtube86 RapidAPI service; the
    first link flagged ``isBundle`` that downloads successfully is saved and
    converted. Results are reused when both ``<id>.mp3`` and ``<id>.mp4``
    already exist in ``output_dir``.

    Args:
        youtube_url: URL of the video.
        output_dir: Directory for the downloaded files; defaults to the
            system temporary directory.

    Returns:
        ``(mp3_path, downloaded_media_path)`` on success, ``None`` on any
        failure (unsupported URL, HTTP error, unexpected response, no
        usable link).
    """
    try:
        video_id = fetch_youtube_id(youtube_url)
    except Exception as e:  # fetch_youtube_id raises a plain Exception
        print("Error:", e)
        return None
    if not video_id:
        return None
    if output_dir is None:
        output_dir = tempfile.gettempdir()
    output_filename = os.path.join(output_dir, f"{video_id}.mp3")
    temp_filename = os.path.join(output_dir, f"{video_id}.mp4")
    if os.path.exists(output_filename) and os.path.exists(temp_filename):
        return (output_filename, temp_filename)  # Reuse the earlier download

    api_url = "https://youtube86.p.rapidapi.com/api/youtube/links"
    headers = {
        'Content-Type': 'application/json',
        'x-rapidapi-host': 'youtube86.p.rapidapi.com',
        'x-rapidapi-key': RAPID_API_KEY
    }
    data = {
        "url": youtube_url
    }
    try:
        # Without a timeout a stalled server would block this handler forever.
        response = requests.post(api_url, headers=headers, json=data, timeout=60)
    except requests.RequestException as e:
        print("Error:", e)
        return None
    print('Fetched audio links')
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return None  # Return None on failure

    try:
        links = response.json()[0]['urls']
    except (ValueError, LookupError, TypeError) as e:
        print("Error: unexpected response format:", e)
        return None

    for link in links:
        if not link.get('isBundle'):
            continue
        media_url = link.get('url')
        extension = link.get('extension')
        if not media_url or not extension:
            continue
        try:
            media_response = requests.get(media_url, timeout=600)
        except requests.RequestException as e:
            print("Error:", e)
            continue
        if media_response.status_code != 200:
            continue
        temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
        with open(temp_filename, 'wb') as media_file:
            media_file.write(media_response.content)
        # Convert to MP3 and downsample to 16000 Hz
        audio = AudioSegment.from_file(temp_filename, format=extension)
        audio = audio.set_frame_rate(16000)
        audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
        print("audio video", output_filename, temp_filename)
        # The downloaded media file is kept on purpose: it is returned to the caller.
        return (output_filename, temp_filename)
    return None  # Return None if no successful download occurs
# Sentence-ending punctuation (ASCII plus the CJK full stop). The capturing
# group makes re.split keep each delimiter in its result.
# NOTE(review): "!?" appears twice in the class; the second pair may originally
# have been the full-width forms "！？" — confirm against the upstream file.
punctuation_marks = r'([\.!?!?。])'


def split_text_with_punctuation(text, max_words=50):
    """Split text into sentence-like segments for translation.

    The text is cut after every punctuation mark in ``punctuation_marks``,
    with each mark kept at the end of its segment. Segments longer than
    ``max_words`` whitespace-separated words are further cut into chunks of
    at most ``max_words`` words. Segments are otherwise returned unmodified,
    so leading spaces are preserved.

    Args:
        text: The text to split; ``None`` or an empty string yields ``[]``.
        max_words: Maximum number of words per returned segment.

    Returns:
        The segments in order, excluding empty and whitespace-only ones.
    """
    if not text:
        return []

    # With one capturing group re.split alternates
    # [text, mark, text, mark, ..., text]; glue each mark to the text before it.
    split_text = re.split(punctuation_marks, text)
    combined_segments = [
        body + mark for body, mark in zip(split_text[0::2], split_text[1::2])
    ]
    # Handle any remaining text that doesn't have a punctuation following it
    if len(split_text) % 2 != 0 and split_text[-1]:
        combined_segments.append(split_text[-1])

    final_segments = []
    for segment in combined_segments:
        words = segment.split()
        if len(words) > max_words:
            final_segments.extend(
                ' '.join(words[start:start + max_words])
                for start in range(0, len(words), max_words)
            )
        else:
            final_segments.append(segment)

    # Drop whitespace-only segments too: they carry nothing to translate.
    return [segment for segment in final_segments if segment.strip()]
def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
    """Send a prompt to the vLLM completions endpoint and return the generated text.

    The prompt is wrapped in a ChatML-style template with a fixed
    "translation expert" system message.

    Args:
        input_text: The user message (here: the translation instruction).
        min_new_tokens: Sent as ``min_tokens``.
        max_new_tokens: Sent as ``max_tokens``.
            NOTE(review): callers pass segments of up to 50 words with the
            default of 64 — long translations may be cut off; confirm.

    Returns:
        The stripped text of the first choice, or a fixed error message when
        the request fails or the response carries no choices.
    """
    error_message = "The system got some error during vLLM generation. Please try it again."
    print(input_text)
    one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
    vllm_api = 'http://astarwiz.com:2333/' + "v1/completions"
    data = {
        "prompt": one_vllm_input,
        'model': "./Edu-4B-NewTok-V2-20240904/",
        'min_tokens': min_new_tokens,
        'max_tokens': max_new_tokens,
        'temperature': 0.1,
        'top_p': 0.75,
        'repetition_penalty': 1.1,
        "stop_token_ids": [151645, ],
    }
    try:
        # A timeout keeps a stalled server from blocking the request forever;
        # a non-JSON body raises ValueError from .json().
        response = requests.post(
            vllm_api,
            headers={"Content-Type": "application/json"},
            json=data,
            timeout=120,
        ).json()
    except (requests.RequestException, ValueError) as e:
        print(f"vLLM request failed: {e}")
        return error_message
    print(response)
    choices = response.get("choices") if isinstance(response, dict) else None
    if choices:
        return choices[0]['text'].strip()
    return error_message
def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None):
    """Transcribe audio, translate the transcript, and synthesize the translation.

    Steps: optional YouTube download, ASR, per-segment translation through
    the LLM API, then TTS.

    Args:
        audio: Path of the audio file to process; ignored when
            ``youtube_url`` is given.
        source_lang: Source language code (a key of ``LANGUAGE_MAP``).
        target_lang: Target language code (a key of ``LANGUAGE_MAP``).
        youtube_url: Optional YouTube URL to fetch the audio from.
        target_speaker: TTS speaker; defaults to the first speaker listed
            for ``target_lang``.

    Returns:
        ``(transcription, translated_text, audio_url, video_path)``. On an
        early failure the first element is an error message and the next two
        are ``None``. ``audio_url`` is ``None`` when TTS fails. ``video_path``
        is the downloaded video file, or ``None`` without a YouTube URL.
    """
    video_path = None
    if youtube_url:
        downloaded = download_youtube_audio(youtube_url)
        if downloaded is None:
            return "Failed to download YouTube audio.", None, None, video_path
        audio, video_path = downloaded
    if not audio:
        return "Please provide an audio input or a valid YouTube URL.", None, None, video_path

    # ASR
    data = {
        # The app uses "ma" for Malay; the ASR request sends "ms" instead.
        'language': 'ms' if source_lang == 'ma' else source_lang,
        'model_name': 'whisper-large-v2-local-cs',
        'with_timestamp': False
    }
    # Open the upload in a context manager so the handle is always closed.
    with open(audio, 'rb') as audio_handle:
        asr_response = requests.post(
            ASR_API, files={'file': audio_handle}, data=data, timeout=(10, 600)
        )
    if asr_response.status_code != 200:
        # Check the status before parsing: an error body may not be JSON.
        print("ASR failed:", asr_response.status_code, asr_response.text)
        return "ASR failed", None, None, video_path
    asr_result = asr_response.json()
    print(asr_result)
    transcription = asr_result['text']

    # Translation, one segment at a time
    translate_segments = []
    for segment in split_text_with_punctuation(transcription):
        translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
        translated_seg_txt = inference_via_llm_api(translation_prompt)
        translate_segments.append(translated_seg_txt)
        print(f"Translation: {translated_seg_txt}")
    translated_text = " ".join(translate_segments)

    # TTS
    tts_params = {
        'language': target_lang,
        'speed': 1.1,
        'speaker': target_speaker or AVAILABLE_SPEAKERS[target_lang][0],  # Use the first speaker as default
        'text': translated_text
    }
    tts_response = requests.get(TTS_SPEAK_SERVICE, params=tts_params, timeout=(10, 600))
    if tts_response.status_code == 200:
        # The speak endpoint answers with a file name served by the wave endpoint.
        audio_file = tts_response.text.strip()
        audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
        return transcription, translated_text, audio_url, video_path

    # The third slot feeds an audio component, so return no audio rather than
    # an error string that is neither a file path nor a URL.
    print("TTS failed:", tts_response.status_code)
    return transcription, translated_text, None, video_path
def check_password(password):
    """Return True when ``password`` matches the configured developer password.

    Always returns False when no developer password is configured
    (``DEVELOPER_PASSWORD`` is ``None`` or empty) or when ``password`` is not
    a string, so a missing environment variable can never grant access.
    """
    import hmac  # local import: only needed here

    if not DEVELOPER_PASSWORD or not isinstance(password, str):
        return False
    # Constant-time comparison; bytes are used because compare_digest rejects
    # non-ASCII str arguments.
    return hmac.compare_digest(
        password.encode("utf-8"), DEVELOPER_PASSWORD.encode("utf-8")
    )
def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
    """Click handler: run the full pipeline and pass its four results through.

    Returns the ``(transcription, translated_text, audio_url, video_path)``
    tuple produced by ``transcribe_and_speak``.
    """
    pipeline_result = transcribe_and_speak(
        audio, source_lang, target_lang, youtube_url, target_speaker
    )
    transcript, translation, speech_url, video_file = pipeline_result
    return transcript, translation, speech_url, video_file
# Gradio UI definition and launch.
# NOTE(review): indentation was lost in this copy of the file, so which
# components sit inside which gr.Row() cannot be determined from here —
# restore the nesting from the upstream file before running.
with gr.Blocks() as demo:
gr.Markdown("# Speech Translation")
# with gr.Tab("User Mode"):
gr.Markdown("Speak into the microphone, upload an audio file, or provide a YouTube URL. The app will translate and speak it back to you.")
# Inputs: recorded/uploaded audio (delivered as a file path) and an optional YouTube URL.
with gr.Row():
user_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
user_youtube_url = gr.Textbox(label="YouTube URL (optional)")
# Language and speaker selection; the speaker list is refreshed when the target language changes.
with gr.Row():
user_source_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Source Language", value="en")
user_target_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Target Language", value="zh")
user_target_speaker = gr.Dropdown(choices=AVAILABLE_SPEAKERS['zh'], label="Target Speaker", value="childChinese2")
# The button starts disabled and is enabled by update_button_state.
with gr.Row():
user_button = gr.Button("Translate and Speak", interactive=False)
# Outputs of the translation pipeline.
with gr.Row():
user_transcription_output = gr.Textbox(label="Transcription")
user_translation_output = gr.Textbox(label="Translation")
user_audio_output = gr.Audio(label="Translated Speech")
# Receives the <iframe> embed produced by update_video_embed.
user_video_output = gr.HTML(label="YouTube Video")
# Enable the main button as soon as either an audio input or a YouTube URL is present.
def update_button_state(audio, youtube_url):
print(audio, youtube_url)
return gr.Button(interactive=bool(audio) or bool(youtube_url))
user_audio_input.change(
fn=update_button_state,
inputs=[user_audio_input, user_youtube_url],
outputs=user_button
)
user_youtube_url.change(
fn=update_button_state,
inputs=[user_audio_input, user_youtube_url],
outputs=user_button
)
# Components for muxing the translated speech back into the downloaded video.
replace_audio_button = gr.Button("Replace Audio", interactive=False)
final_video_output = gr.Video(label="Video with Replaced Audio")
# State holding the video path returned by run_speech_translation, for use by later handlers.
temp_video_path = gr.State()
# Run the pipeline; its four return values fill the three outputs and the state.
user_button.click(
fn=run_speech_translation,
inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
outputs=[user_transcription_output, user_translation_output, user_audio_output,temp_video_path]
)
# Enable the Replace Audio button when both video and audio are available
def update_replace_audio_button(audio_url, video_path):
print ("update replace:", audio_url, video_path)
return gr.Button(interactive=bool(audio_url) and bool(video_path))
user_audio_output.change(
fn=update_replace_audio_button,
inputs=[user_audio_output, temp_video_path],
outputs=[replace_audio_button]
)
# Handle Replace Audio button click; the "Status" textbox is created inline in the outputs list.
replace_audio_button.click(
fn=replace_audio_and_generate_video,
inputs=[temp_video_path, user_audio_output],
outputs=[gr.Textbox(label="Status"), final_video_output]
)
# Build an embedded YouTube player for the entered URL; returns "" when the
# URL is empty or its ID cannot be extracted.
# NOTE(review): video_id is interpolated into the HTML unescaped — consider escaping it.
def update_video_embed(youtube_url):
if youtube_url:
try:
video_id = fetch_youtube_id(youtube_url)
return f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{video_id}" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>'
except Exception as e:
print(f"Error embedding video: {e}")
return ""
user_youtube_url.change(
fn=update_video_embed,
inputs=[user_youtube_url],
outputs=[user_video_output]
)
# Replace the speaker choices with those of the selected target language and select the first one.
def update_target_speakers(target_lang):
return gr.Dropdown(choices=AVAILABLE_SPEAKERS[target_lang], value=AVAILABLE_SPEAKERS[target_lang][0])
user_target_lang.change(
fn=update_target_speakers,
inputs=[user_target_lang],
outputs=[user_target_speaker]
)
# Start the app with login credentials taken from the DEV_USER / DEV_PWD environment variables.
demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
|