jerrypan7 committed on
Commit 0c4f039 · verified · 1 Parent(s): 9eb78b7

Update app.py


Integrate CosyVoice with the old TTS backend for this streaming demo.

Files changed (1):
  1. app.py +405 -134
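The heart of this change is a streaming TTS path: instead of one blocking `/speak` request, the app now connects to a CosyVoice Socket.IO server, receives translated segments as base64-encoded 16-bit PCM `audio_chunk` events, queues them for immediate playback, and accumulates them for the final Gradio output. A minimal sketch of that receive loop, using the same event names and payload fields as the diff below (the server URL and payload schema are this app's own, not a documented public API):

```python
# Minimal sketch of the streaming receive path added in this commit.
# Assumes a Socket.IO server that emits 'audio_chunk' events carrying
# 'trans_text' (translated text) and 'audio' (base64 16-bit PCM),
# followed by a 'tts_complete' event.
import asyncio
import base64

import numpy as np
import socketio

sio = socketio.AsyncClient()
audio_queue = []            # chunks pending playback
done = asyncio.Event()      # set when the server signals completion

@sio.on('audio_chunk')
async def on_audio_chunk(data):
    pcm = base64.b64decode(data['audio'])
    audio_queue.append(np.frombuffer(pcm, dtype=np.int16))
    print('segment:', data['trans_text'])

@sio.on('tts_complete')
async def on_tts_complete():
    done.set()

async def main():
    await sio.connect("http://astarwiz.com:9244")
    await sio.emit('tts_request', {'text': "Hello.",
                                   'sourceLang': 'en', 'targetLang': 'zh'})
    await done.wait()
    await sio.disconnect()

asyncio.run(main())
```

When this socket path fails, the diff falls back to the old per-segment VITS pipeline inside `transcribe_and_speak`.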
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-import requests
 import uuid
 import os
 from typing import Optional
@@ -9,13 +8,24 @@ import re
 import subprocess
 import numpy as np
 import soundfile as sf
+import sounddevice as sd
+import time
 import sox
+from io import BytesIO
+import asyncio
+import aiohttp
 from moviepy.editor import VideoFileClip
+import threading
+import socketio
+import base64
 
 ASR_API = "http://astarwiz.com:9998/asr"
 TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
 TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
 
+
+
+
 LANGUAGE_MAP = {
     "en": "English",
     "ma": "Malay",
@@ -23,20 +33,107 @@ LANGUAGE_MAP = {
     "zh": "Chinese"
 }
 
-# Add a password for developer mode
 DEVELOPER_PASSWORD = os.getenv("DEV_PWD")
-
-# Add this constant for the RapidAPI key
-#RAPID_API_KEY = os.getenv("RAPID_API_KEY")
 RAPID_API_KEY = os.getenv("RAPID_API_KEY")
 
-# Add this constant for available speakers
 AVAILABLE_SPEAKERS = {
     "en": ["MS"],
-    "ma": ["ChildMs_100049"],
+    "ma": ["msFemale"],
     "ta": ["ta_female1"],
     "zh": ["childChinese2"]
 }
+
+# Global state for playback of generated TTS audio
+audio_queue = []
+is_playing = False
+audio_update_event = asyncio.Event()
+
+def play_audio():
+    global is_playing
+    is_playing = True
+
+    # Drain the queue while TTS segments are still being generated
+    while is_playing:
+        if audio_queue:
+            audio_chunk = audio_queue.pop(0)
+            sd.play(audio_chunk, samplerate=22050)
+            sd.wait()
+        else:
+            time.sleep(0.1)
+    print("TTS generation finished; playing the remaining queued audio")
+    while audio_queue:
+        audio_chunk = audio_queue.pop(0)
+        sd.play(audio_chunk, samplerate=22050)
+        sd.wait()
+
+# CosyVoice TTS socket client
+#TTS_SOCKET_SERVER = "http://localhost:9244"
+TTS_SOCKET_SERVER = "http://astarwiz.com:9244"
+
+sio = socketio.AsyncClient()
+
+@sio.on('connect')
+def on_connect():
+    print('Connected to server')
+
+@sio.on('disconnect')
+def on_disconnect():
+    print('Disconnected from server')
+
+@sio.on('audio_chunk')
+async def on_audio_chunk(data):
+    global translation_update, audio_update
+
+    translated_seg_txt = data['trans_text']
+    with translation_lock:
+        translation_update["content"] = translation_update["content"] + " " + translated_seg_txt
+        translation_update["new"] = True
+
+    audio_base64 = data['audio']
+    audio_bytes = base64.b64decode(audio_base64)
+    audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
+    audio_queue.append(audio_np)
+
+    if audio_update["content"] is None:
+        sr, accumulated_audio = 22050, audio_np
+    else:
+        sr, accumulated_audio = audio_update["content"]
+        accumulated_audio = np.concatenate((accumulated_audio, audio_np))
+
+    with audio_lock:
+        audio_update["content"] = (sr, accumulated_audio)
+        audio_update["new"] = True
+
+    #audio_float = audio_np.astype(np.float32) / 32767.0
+    #audio_queue.append(audio_float)
+    #accumulated_audio.extend(audio_float)
+
+    if not is_playing:
+        playback_thread = threading.Thread(target=play_audio)
+        playback_thread.start()
+
+@sio.on('tts_complete')
+async def on_tts_complete():
+    await sio.disconnect()
+    print("Disconnected from server after TTS completion")
+
+    audio_update_event.set()
+    global is_playing
+    while audio_queue:
+        await asyncio.sleep(0.1)
+    is_playing = False
+
+# Global variables for storing update information
+transcription_update = {"content": "", "new": False}
+translation_update = {"content": "", "new": False}
+audio_update = {"content": None, "new": False}
+
+# Locks for thread-safe operations
+transcription_lock = threading.Lock()
+translation_lock = threading.Lock()
+audio_lock = threading.Lock()
+
 def replace_audio_in_video(video_path, audio_path, output_path):
     command = [
         'ffmpeg',
@@ -51,7 +148,8 @@ def replace_audio_in_video(video_path, audio_path, output_path):
     subprocess.run(command, check=True)
     return output_path
 
-def replace_audio_and_generate_video(temp_video_path, gradio_audio):
+async def replace_audio_and_generate_video(temp_video_path, gradio_audio):
+    print("gradio_audio:", gradio_audio)
     if not temp_video_path or gradio_audio is None:
         return "Both video and audio are required to replace audio.", None
 
@@ -101,10 +199,8 @@ def replace_audio_and_generate_video(temp_video_path, gradio_audio):
     finally:
         os.unlink(original_audio_path)  # Clean up the original audio file
         os.unlink(adjusted_audio_path)  # Clean up the adjusted audio file
-
-
 
-def fetch_youtube_id(youtube_url: str) -> str:
+async def fetch_youtube_id(youtube_url: str) -> str:
     if 'v=' in youtube_url:
         return youtube_url.split("v=")[1].split("&")[0]
     elif 'youtu.be/' in youtube_url:
@@ -114,8 +210,8 @@ def fetch_youtube_id(youtube_url: str) -> str:
     else:
         raise Exception("Unsupported URL format")
 
-def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[tuple[str, str]]:
-    video_id = fetch_youtube_id(youtube_url)
+async def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[tuple[str, str]]:
+    video_id = await fetch_youtube_id(youtube_url)
 
     if not video_id:
         return None
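`fetch_youtube_id` is declared `async` only so the download path can `await` it uniformly; it does no I/O itself. Its `v=` branch can be sanity-checked against the function above (made-up video id):

```python
import asyncio

# Uses fetch_youtube_id as defined in the hunk above.
print(asyncio.run(fetch_youtube_id("https://www.youtube.com/watch?v=abc123XYZ_0&t=42")))
# abc123XYZ_0  (everything after the first '&' is discarded)
```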
@@ -126,8 +222,8 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
     output_filename = os.path.join(output_dir, f"{video_id}.mp3")
     temp_filename = os.path.join(output_dir, f"{video_id}.mp4")
     if os.path.exists(output_filename) and os.path.exists(temp_filename):
-        return (output_filename, temp_filename)  # Return if the file already exists
-
+        return (output_filename, temp_filename)
+
     url = "https://youtube86.p.rapidapi.com/api/youtube/links"
     headers = {
         'Content-Type': 'application/json',
@@ -138,51 +234,29 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
         "url": youtube_url
     }
 
-    response = requests.post(url, headers=headers, json=data)
-    print('Fetched audio links')
-
-    if response.status_code == 200:
-        result = response.json()
-        for url in result[0]['urls']:
-            if url.get('isBundle'):
-                audio_url = url['url']
-                extension = url['extension']
-                audio_response = requests.get(audio_url)
-
-                if audio_response.status_code == 200:
-                    temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
-                    with open(temp_filename, 'wb') as audio_file:
-                        audio_file.write(audio_response.content)
-
-                    # Convert to MP3 and downsample to 16000 Hz
-                    audio = AudioSegment.from_file(temp_filename, format=extension)
-                    audio = audio.set_frame_rate(16000)
-                    audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
-                    print ("audio video", output_filename,temp_filename)
-                    #os.remove(temp_filename)  # Remove the temporary file
-                    return (output_filename, temp_filename)  # Return the final MP3 filename
-
-        return None  # Return None if no successful download occurs
-    else:
-        print("Error:", response.status_code, response.text)
-        return None  # Return None on failure
-
-
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, headers=headers, json=data) as response:
+            if response.status == 200:
+                result = await response.json()
+                for url in result[0]['urls']:
+                    if url.get('isBundle'):
+                        audio_url = url['url']
+                        extension = url['extension']
+                        async with session.get(audio_url) as audio_response:
+                            if audio_response.status == 200:
+                                content = await audio_response.read()
+                                temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
+                                with open(temp_filename, 'wb') as audio_file:
+                                    audio_file.write(content)
+
+                                audio = AudioSegment.from_file(temp_filename, format=extension)
+                                audio = audio.set_frame_rate(16000)
+                                audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
+                                return (output_filename, temp_filename)
+            else:
+                print("Error:", response.status, await response.text())
+    return None
 punctuation_marks = r'([\.!?!?。])'
-"""
-def split_text_with_punctuation(text):
-    # Split the text using the punctuation marks, keeping the punctuation marks
-    split_text = re.split(punctuation_marks, text)
-    # Combine each punctuation mark with the preceding segment
-    combined_segments = []
-    for i in range(0, len(split_text) - 1, 2):
-        combined_segments.append(split_text[i] + split_text[i + 1])
-    # If there's any remaining text after the last punctuation, append it as well
-    if len(split_text) % 2 != 0 and split_text[-1]:
-        combined_segments.append(split_text[-1])
-
-    return combined_segments
-"""
 def split_text_with_punctuation(text):
     # Split the text using the punctuation marks, keeping the punctuation marks
     split_text = re.split(punctuation_marks, text)
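For reference, `split_text_with_punctuation` splits on both ASCII and full-width sentence punctuation while keeping each mark attached to the preceding segment. A simplified sketch of the same splitting idea (the in-repo version additionally merges short segments, which this hunk only shows in part):

```python
import re

punctuation_marks = r'([\.!?!?。])'  # ASCII and full-width sentence enders

def split_with_punctuation(text):
    # re.split with a capturing group keeps the punctuation tokens;
    # re-attach each mark to the text that precedes it.
    parts = re.split(punctuation_marks, text)
    segments = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]
    if len(parts) % 2 != 0 and parts[-1]:
        segments.append(parts[-1])  # trailing text without a final mark
    return [s for s in segments if s]

print(split_with_punctuation("Hello. How are you? 你好。"))
# ['Hello.', ' How are you?', ' 你好。']
```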
@@ -209,8 +283,50 @@ def split_text_with_punctuation(text):
             final_segments.append(segment)
 
     return [segment for segment in final_segments if segment]  # Filter out empty strings
+
+def extract_segments(text):
+    pattern = r'\[(\d+\.\d+)s\s*->\s*(\d+\.\d+)s\]\s*(.*?)(?=\[\d+\.\d+s|\Z)'
+    matches = re.findall(pattern, text, re.DOTALL)
+
+    if not matches:
+        return []
+
+    segments = []
+    for start, end, content in matches:
+        segments.append({
+            'start': float(start),
+            'end': float(end),
+            'text': content.strip()
+        })
+
+    return segments
+
+def adjust_tempo_pysox_array(gradio_audio, duration):
+    # Unpack the Gradio audio output
+    sample_rate, audio_data = gradio_audio
+    # Ensure audio_data is a numpy array
+    if not isinstance(audio_data, np.ndarray):
+        audio_data = np.array(audio_data)
+    # Calculate the current duration of the audio in seconds
+    current_duration = len(audio_data) / sample_rate
+    # Calculate the necessary tempo factor to match the desired duration
+    tempo_factor = current_duration / duration
+    # Create a pysox Transformer
+    tfm = sox.Transformer()
+    tfm.tempo(tempo_factor)
+    # Use pysox to transform the audio directly in memory
+    adjusted_audio = tfm.build_array(input_array=audio_data, sample_rate_in=sample_rate)
+    # Trim or pad the audio to exactly match the desired duration
+    target_length = int(sample_rate * duration)
+    if len(adjusted_audio) > target_length:
+        adjusted_audio = adjusted_audio[:target_length]  # Trim if too long
+    else:
+        # Pad with zeros if too short
+        adjusted_audio = np.pad(adjusted_audio, (0, target_length - len(adjusted_audio)), mode='constant')
+    # Return the processed audio in the Gradio format (sample_rate, adjusted_audio)
+    return sample_rate, adjusted_audio
+
-def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
+async def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
     print(input_text)
     one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
     vllm_api = 'http://astarwiz.com:2333/' + "v1/completions"
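`extract_segments` exists because the ASR call below now requests timestamps (`with_timestamp` flips from `'false'` to `'true'`), so the transcription arrives as bracketed `[start -> end]` segments. A quick check of that regex on a made-up Whisper-style transcript:

```python
import re

# Same pattern as extract_segments above; the transcript string is an
# illustrative example of the timestamped format the ASR returns.
pattern = r'\[(\d+\.\d+)s\s*->\s*(\d+\.\d+)s\]\s*(.*?)(?=\[\d+\.\d+s|\Z)'
transcript = "[0.00s -> 2.40s] Hello there. [2.40s -> 5.10s] How are you today?"

for start, end, text in re.findall(pattern, transcript, re.DOTALL):
    print(float(start), float(end), text.strip())
# 0.0 2.4 Hello there.
# 2.4 5.1 How are you today?
```

The per-segment durations are what `adjust_tempo_pysox_array` is meant to match TTS output against, though its call site is still commented out below.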
@@ -224,77 +340,189 @@ def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
         'repetition_penalty': 1.1,
         "stop_token_ids": [151645, ],
     }
-    response = requests.post(vllm_api, headers={"Content-Type": "application/json"}, json=data).json()
-    print(response)
-    if "choices" in response.keys():
-        return response["choices"][0]['text'].strip()
-    else:
-        return "The system got some error during vLLM generation. Please try it again."
+    async with aiohttp.ClientSession() as session:
+        async with session.post(vllm_api, headers={"Content-Type": "application/json"}, json=data) as response:
+            if response.status == 200:
+                result = await response.json()
+                if "choices" in result:
+                    return result["choices"][0]['text'].strip()
+    return "The system got some error during vLLM generation. Please try it again."
 
-def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None):
-    video_path =None
+async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None, progress_tracker=None):
+    global transcription_update, translation_update, audio_update
+    transcription_update = {"content": "", "new": False}
+    translation_update = {"content": "", "new": False}
+    audio_update = {"content": None, "new": False}
+    video_path = None
 
+    #progress = gr.Progress()
+
+    #progress(0.1, "started:")
     if youtube_url:
-        audio = download_youtube_audio(youtube_url)
+        audio = await download_youtube_audio(youtube_url)
         if audio is None:
             return "Failed to download YouTube audio.", None, None, video_path
-        audio, video_path =audio
+        audio, video_path = audio
     if not audio:
         return "Please provide an audio input or a valid YouTube URL.", None, None, video_path
 
     # ASR
+    #progress(0.2, "ASR started:")
     file_id = str(uuid.uuid4())
-    files = {'file': open(audio, 'rb')}
-    data = {
-        'language': 'ms' if source_lang == 'ma' else source_lang,
-        'model_name': 'whisper-large-v2-local-cs',
-        'with_timestamp': False
-    }
-
-    asr_response = requests.post(ASR_API, files=files, data=data)
-    print(asr_response.json())
-    if asr_response.status_code == 200:
-        transcription = asr_response.json()['text']
-    else:
-        return "ASR failed", None, None, video_path
+    data = aiohttp.FormData()
+    data.add_field('file', open(audio, 'rb'))
+    data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
+    data.add_field('model_name', 'whisper-large-v2-local-cs')
+    #data.add_field('with_timestamp', 'false')
+    data.add_field('with_timestamp', 'true')
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(ASR_API, data=data) as asr_response:
+            if asr_response.status == 200:
+                result = await asr_response.json()
+                transcription = result['text']
+                with transcription_lock:
+                    transcription_update["content"] = transcription
+                    transcription_update["new"] = True
+            else:
+                return "ASR failed", None, None, video_path
+    #progress(0.4, "ASR done:")
 
+    # Use CosyVoice when target_lang is 'en' or 'zh'
 
+    if target_lang == 'en' or target_lang == 'zh':
+        try:
+            if not sio.connected:
+                server_url = TTS_SOCKET_SERVER
+                await sio.connect(server_url)
+                print(f"Connected to {server_url}")
+
+            # Use the default voice
+            tts_request = {
+                'text': transcription,
+                'overwrite_prompt': False,
+                'promptText': "",
+                'promptAudio': "",
+                'sourceLang': source_lang,
+                'targetLang': target_lang
+            }
+            await sio.emit('tts_request', tts_request)
+
+            # Wait until all CosyVoice TTS is done
+            await audio_update_event.wait()
+            print('cosy tts complete,', audio_update)
+
+            return transcription, translation_update["content"], audio_update["content"], video_path
+
+        except Exception as e:
+            print(f"Failed to process request: {str(e)}")
+            print("Falling back to VITS")
+
+    #split_result = split_text_with_punctuation(transcription)
+    split_result = extract_segments(transcription)
+    translate_segments = []
+    accumulated_audio = None
+    sample_rate = None
+    global is_playing
+    for i, segment in enumerate(split_result):
+        #translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
+        translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
+        translated_seg_txt = await inference_via_llm_api(translation_prompt)
         translate_segments.append(translated_seg_txt)
         print(f"Translation: {translated_seg_txt}")
+        with translation_lock:
+            translation_update["content"] = " ".join(translate_segments)
+            translation_update["new"] = True
+
+        # Generate TTS for each translated segment
+        #progress(0.4 + (0.5 * (i + 1) / len(split_result)), "translation and tts in progress:")
+
+        tts_params = {
+            'language': target_lang,
+            'speed': 1.1,
+            'speaker': target_speaker or AVAILABLE_SPEAKERS[target_lang][0],
+            'text': translated_seg_txt
+        }
+
+        async with aiohttp.ClientSession() as session:
+            async with session.get(TTS_SPEAK_SERVICE, params=tts_params) as tts_response:
+                if tts_response.status == 200:
+                    audio_file = await tts_response.text()
+                    audio_file = audio_file.strip()
+                    audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
+                    async with session.get(audio_url) as response:
+                        content = await response.read()
+                        audio_chunk, sr = sf.read(BytesIO(content))
+                        #print('audio_chunk:', type(audio_chunk), audio_chunk)
+                        print('audio_chunk: src:', segment['end'] - segment['start'], ' tts:', len(audio_chunk) / sr)
+                        # _, audio_chunk = adjust_tempo_pysox_array((sr, audio_chunk), segment['end'] - segment['start'])
+                        audio_queue.append(audio_chunk)
+                        if not is_playing:
+                            playback_thread = threading.Thread(target=play_audio)
+                            playback_thread.start()
+
+                        if accumulated_audio is None:
+                            accumulated_audio = audio_chunk
+                            sample_rate = sr
+                        else:
+                            accumulated_audio = np.concatenate((accumulated_audio, audio_chunk))
+
+                        with audio_lock:
+                            audio_update["content"] = (sample_rate, accumulated_audio)
+                            audio_update["new"] = True
+                else:
+                    print(f"TTS failed for segment: {translated_seg_txt}")
+
     translated_text = " ".join(translate_segments)
-    # TTS
-    tts_params = {
-        'language': target_lang,
-        'speed': 1.1,
-        'speaker': target_speaker or AVAILABLE_SPEAKERS[target_lang][0],  # Use the first speaker as default
-        'text': translated_text
-    }
 
-    tts_response = requests.get(TTS_SPEAK_SERVICE, params=tts_params)
-    if tts_response.status_code == 200:
-        audio_file = tts_response.text.strip()
-        audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
-        return transcription, translated_text, audio_url,video_path
+    #progress(1, "all done.")
+    print("Signal that playback can stop now; all TTS generated")
+    is_playing = False
+    if accumulated_audio is not None:
+        return transcription, translated_text, (sample_rate, accumulated_audio), video_path
     else:
-        return transcription, translated_text, "TTS failed",video_path
+        return transcription, translated_text, "TTS failed", video_path
 
-def check_password(password):
-    return password == DEVELOPER_PASSWORD
-
-def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
-    temp_video_path =None;
-    transcription, translated_text, audio_url,temp_video_path = transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
-
-    return transcription, translated_text, audio_url,temp_video_path
+"""
+async def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
+    temp_video_path = None
+    transcription, translated_text, audio_chunksr, temp_video_path = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
+    return transcription, translated_text, audio_chunksr, temp_video_path
+"""
+async def update_transcription():
+    global transcription_update
+    with transcription_lock:
+        if transcription_update["new"]:
+            content = transcription_update["content"]
+            transcription_update["new"] = False
+            return content
+    return gr.update()
+
+async def update_translation():
+    global translation_update
+    with translation_lock:
+        if translation_update["new"]:
+            content = translation_update["content"]
+            translation_update["new"] = False
+            return content
+    return gr.update()
+
+async def update_audio():
+    global audio_update
+    with audio_lock:
+        if audio_update["new"]:
+            content = audio_update["content"]
+            audio_update["new"] = False
+            return content
+    return gr.update()
 
 with gr.Blocks() as demo:
     gr.Markdown("# Speech Translation")
 
-    # with gr.Tab("User Mode"):
     gr.Markdown("Speak into the microphone, upload an audio file, or provide a YouTube URL. The app will translate and speak it back to you.")
 
     with gr.Row():
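Because the Socket.IO callbacks and the playback thread run outside any Gradio event, results are handed to the UI through the module-level `*_update` dicts guarded by locks; `update_transcription` / `update_translation` / `update_audio` consume them and return `gr.update()` when nothing changed. The handshake reduces to this producer/poller pattern, sketched with a hypothetical producer:

```python
import threading

import gradio as gr

translation_update = {"content": "", "new": False}
translation_lock = threading.Lock()

def producer(text):
    # Called from a background thread or socket callback:
    # publish the new value and mark it dirty.
    with translation_lock:
        translation_update["content"] += " " + text
        translation_update["new"] = True

def poll_translation():
    # Called by the UI timer: consume only if something changed;
    # otherwise return gr.update() so the textbox is left untouched.
    with translation_lock:
        if translation_update["new"]:
            translation_update["new"] = False
            return translation_update["content"]
    return gr.update()
```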
@@ -309,46 +537,66 @@ with gr.Blocks() as demo:
     with gr.Row():
         user_button = gr.Button("Translate and Speak", interactive=False)
 
-
     with gr.Row():
         user_transcription_output = gr.Textbox(label="Transcription")
         user_translation_output = gr.Textbox(label="Translation")
         user_audio_output = gr.Audio(label="Translated Speech")
-
+        progress_bar = gr.Textbox(label="progress", interactive=False)
+        status_message = gr.Textbox(label="Status", interactive=False)
+
     user_video_output = gr.HTML(label="YouTube Video")
 
-    def update_button_state(audio, youtube_url):
-        print(audio, youtube_url)
-        return gr.Button(interactive=bool(audio) or bool(youtube_url))
+    replace_audio_button = gr.Button("Replace Audio", interactive=False)
+    final_video_output = gr.Video(label="Video with Replaced Audio")
+
+    temp_video_path = gr.State()
+    translation_progress = gr.State(0.0)
+
+    async def update_button_state(audio, youtube_url, progress):
+        print(audio, youtube_url, progress)
+        # Button is interactive if there's input and progress is 0 or 1 (not in progress)
+        return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
 
     user_audio_input.change(
         fn=update_button_state,
-        inputs=[user_audio_input, user_youtube_url],
+        inputs=[user_audio_input, user_youtube_url, translation_progress],
         outputs=user_button
     )
     user_youtube_url.change(
         fn=update_button_state,
-        inputs=[user_audio_input, user_youtube_url],
+        inputs=[user_audio_input, user_youtube_url, translation_progress],
         outputs=user_button
     )
-
-    # New components
-    replace_audio_button = gr.Button("Replace Audio", interactive=False)
-    final_video_output = gr.Video(label="Video with Replaced Audio")
 
-    # Add a state to store temporary file paths
-    temp_video_path = gr.State()
+    async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker):
+        #audio_data, sample_rate = sf.read(audio)
+        #print("user_audio_input:", audio, audio_data, sample_rate)
+
+        yield (0.01,
+               gr.update(interactive=False),
+               gr.update(), gr.update(), gr.update(), gr.update(),
+               "Translation in progress...")
+
+        temp_video_path = None
+        transcription, translated_text, audio_chunksr, temp_video_path = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
+
+        yield (1,
+               gr.update(interactive=True),
+               transcription, translated_text, audio_chunksr, temp_video_path,
+               "Translation complete")
 
     user_button.click(
-        fn=run_speech_translation,
+        fn=run_speech_translation_wrapper,
         inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
-        outputs=[user_transcription_output, user_translation_output, user_audio_output,temp_video_path]
+        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message]
     )
-
 
-    # Enable the Replace Audio button when both video and audio are available
-    def update_replace_audio_button(audio_url, video_path):
-        print ("update replace:", audio_url, video_path)
+    async def update_replace_audio_button(audio_url, video_path):
+        print("update replace:", audio_url, video_path)
         return gr.Button(interactive=bool(audio_url) and bool(video_path))
 
     user_audio_output.change(
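`run_speech_translation_wrapper` is an async generator, which is how a single Gradio handler pushes intermediate UI states: each `yield` must supply one value per declared output, with `gr.update()` as the leave-unchanged placeholder. The same shape, stripped down, with a hypothetical `do_work` coroutine standing in for `transcribe_and_speak`:

```python
import asyncio

import gradio as gr

async def do_work(x):
    # Hypothetical stand-in for the real long-running pipeline.
    await asyncio.sleep(1)
    return x.upper()

async def long_task_wrapper(x):
    # First yield: disable the button and announce progress.
    # Outputs here: (button, result textbox, status textbox).
    yield gr.update(interactive=False), gr.update(), "Translation in progress..."
    result = await do_work(x)
    # Final yield: re-enable the button and publish the result.
    yield gr.update(interactive=True), result, "Translation complete"
```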
@@ -357,17 +605,16 @@ with gr.Blocks() as demo:
         outputs=[replace_audio_button]
     )
 
-    # Handle Replace Audio button click
     replace_audio_button.click(
         fn=replace_audio_and_generate_video,
         inputs=[temp_video_path, user_audio_output],
         outputs=[gr.Textbox(label="Status"), final_video_output]
     )
 
-    def update_video_embed(youtube_url):
+    async def update_video_embed(youtube_url):
         if youtube_url:
             try:
-                video_id = fetch_youtube_id(youtube_url)
+                video_id = await fetch_youtube_id(youtube_url)
                 return f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{video_id}" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>'
             except Exception as e:
                 print(f"Error embedding video: {e}")
@@ -379,7 +626,7 @@ with gr.Blocks() as demo:
         outputs=[user_video_output]
     )
 
-    def update_target_speakers(target_lang):
+    async def update_target_speakers(target_lang):
         return gr.Dropdown(choices=AVAILABLE_SPEAKERS[target_lang], value=AVAILABLE_SPEAKERS[target_lang][0])
 
     user_target_lang.change(
@@ -388,4 +635,28 @@ with gr.Blocks() as demo:
         outputs=[user_target_speaker]
     )
 
-demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
+    async def periodic_update():
+        transcription = await update_transcription()
+        translation = await update_translation()
+        audio = await update_audio()
+        return (
+            transcription,
+            translation,
+            audio
+        )
+
+    demo.load(
+        periodic_update,
+        inputs=[],
+        outputs=[
+            user_transcription_output,
+            user_translation_output,
+            user_audio_output,
+        ],
+        every=0.3
+    )
+
+demo.queue()
+
+asyncio.run(demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD"))))
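The UI polls the shared state roughly three times a second via `demo.load(..., every=0.3)`, which only schedules re-runs when `demo.queue()` is enabled. Note also that `demo.launch()` is a plain blocking call rather than a coroutine, so the `asyncio.run(...)` wrapper in the final hunk is suspect; a direct `demo.launch(...)` is the conventional form. A self-contained sketch of the polling wiring (newer Gradio releases prefer `gr.Timer`, but `every=` is what this app uses):

```python
import time

import gradio as gr

def poll():
    # Stand-in for periodic_update: return whatever the shared state holds now.
    return time.strftime("%H:%M:%S")

with gr.Blocks() as mini:
    box = gr.Textbox(label="Polled value")
    # Re-runs poll() every 0.3 s after the page loads; needs the queue below.
    mini.load(poll, inputs=[], outputs=[box], every=0.3)

mini.queue()
mini.launch()
```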