jerrypan7 committed
Commit ef89cb1 · verified · Parent: 412e2b2

Update app.py


Adaptation of TTS to use timestamped speech segments for voice cloning.

Files changed (1): app.py (+67 / -39)
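The commit switches the default path from punctuation-based splitting of the transcription to timestamped ASR segments (bSegByPunct = False), translates each segment via the LLM API, and sends each translated segment to the cosy-voice TTS service, with the uploaded source audio available as a cloning reference. A minimal sketch of that per-segment flow, with the translate/synthesize callables standing in for inference_via_llm_api and the socket.io TTS request (their exact signatures are not shown in this diff):

    from typing import Awaitable, Callable, Dict, List, Tuple

    async def translate_timestamped_segments(
        segments: List[Dict],                           # [{'start': float, 'end': float, 'text': str}, ...]
        translate: Callable[[str], Awaitable[str]],     # stand-in for inference_via_llm_api
        synthesize: Callable[[str], Awaitable[bytes]],  # stand-in for the cosy-voice TTS request
    ) -> List[Tuple[Dict, str, bytes]]:
        """Translate and synthesize each timestamped ASR segment in order."""
        results = []
        for seg in segments:
            translated = await translate(seg['text'])   # per-segment translation prompt
            audio = await synthesize(translated)        # per-segment TTS in the cloned voice
            results.append((seg, translated, audio))
        return results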
app.py CHANGED
@@ -23,6 +23,10 @@ ASR_API = "http://astarwiz.com:9998/asr"
 TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
 TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
 
+
+#bSegByPunct = True
+bSegByPunct = False
+
 LANGUAGE_MAP = {
     "en": "English",
     "ma": "Malay",
@@ -40,10 +44,12 @@ AVAILABLE_SPEAKERS = {
     "zh": ["childChinese2"]
 }
 
+
 audio_update_event = asyncio.Event()
 acc_cosy_audio = None
 # cosy voice tts related;
-TTS_SOCKET_SERVER = "http://astarwiz.com:9123"
+#TTS_SOCKET_SERVER = "http://localhost:9244"
+TTS_SOCKET_SERVER = "http://astarwiz.com:9244"
 
 sio = socketio.AsyncClient()
 
@@ -209,7 +215,9 @@ async def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = N
         if url.get('isBundle'):
             audio_url = url['url']
             extension = url['extension']
+            print ("audio_url :", audio_url)
             async with session.get(audio_url) as audio_response:
+                print ("audio_response:", audio_response)
                 if audio_response.status == 200:
                     content = await audio_response.read()
                     temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
@@ -320,18 +328,17 @@ async def upload_file(file_path, upload_url):
         with open(file_path, 'rb') as f:
             form_data = aiohttp.FormData()
             form_data.add_field('file', f, filename=os.path.basename(file_path))
-
+
             async with session.post(upload_url, data=form_data) as response:
                 print(f"5. Client receives headers: {time.time()}")
                 print(f"Status: {response.status}")
-
+
                 result = await response.json()
                 print(f"7. Client fully received and parsed response: {time.time()}")
                 if response.status == 200:
                     return result
                 else:
                     return {"file_id",""}
-
 async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None, progress_tracker=None):
     global transcription_update, translation_update, audio_update, acc_cosy_audio,audio_update_event
     transcription_update = {"content": "", "new": True}
@@ -357,9 +364,12 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     data = aiohttp.FormData()
     data.add_field('file', open(audio, 'rb'))
     data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
-    data.add_field('model_name', 'whisper-large-v2-local-cs')
-    #data.add_field('with_timestamp', 'false')
-    data.add_field('with_timestamp', 'true')
+    if bSegByPunct:
+        data.add_field('model_name', 'whisper-large-v2-local-cs')
+        data.add_field('with_timestamp', 'false')
+    else:
+        data.add_field('model_name', 'official-v3')
+        data.add_field('with_timestamp', 'true')
 
     async with aiohttp.ClientSession() as session:
         async with session.post(ASR_API, data=data) as asr_response:
@@ -382,8 +392,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     server_url = TTS_SOCKET_SERVER
     await sio.connect(server_url)
     print(f"Connected to {server_url}")
-
-
+
     # Handle the audio file
     file_id=""
     if audio and os.path.exists(audio):
@@ -395,6 +404,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
         print ("upload_result:", upload_result)
         file_id = upload_result['file_id']
 
+
     # use defualt voice
     tts_request = {
         'text': transcription,
@@ -418,15 +428,20 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
 
 
 
-    #split_result = split_text_with_punctuation(transcription)
-    split_result = extract_segments(transcription);
+    if bSegByPunct:
+        split_result = split_text_with_punctuation(transcription)
+    else:
+        split_result = extract_segments(transcription);
+
     translate_segments = []
     accumulated_audio = None
     sample_rate = 22050
     global is_playing
     for i, segment in enumerate(split_result):
-        #translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
-        translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
+        if bSegByPunct:
+            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
+        else:
+            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
         translated_seg_txt = await inference_via_llm_api(translation_prompt)
         translate_segments.append(translated_seg_txt)
         print(f"Translation: {translated_seg_txt}")
@@ -454,8 +469,8 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
                 content = await response.read()
                 audio_chunk, sr = sf.read(BytesIO(content))
                 #print ('audio_chunk:', type(audio_chunk),audio_chunk)
-                print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
-                # _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
+                #print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
+                # _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
 
 
                 if accumulated_audio is None:
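The commented-out adjust_tempo_pysox_array call compares the source segment duration (segment['end'] - segment['start']) against the TTS chunk duration and would time-stretch the synthesized audio to match. One way such a helper could look, assuming the pysox package (the real implementation is not included in this commit):

    import numpy as np
    import sox  # pysox

    def adjust_tempo_pysox_array(audio, target_duration):
        """Stretch or compress a (sample_rate, samples) pair to roughly target_duration seconds."""
        sr, samples = audio
        current_duration = len(samples) / sr
        if target_duration <= 0 or current_duration <= 0:
            return sr, samples
        factor = current_duration / target_duration  # >1 speeds playback up, <1 slows it down
        tfm = sox.Transformer()
        tfm.tempo(factor)
        stretched = tfm.build_array(input_array=np.asarray(samples), sample_rate_in=sr)
        return sr, stretched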
@@ -513,6 +528,10 @@ async def update_audio():
         return content
     return gr.update()
 
+def disable_button():
+    # Disable the button during processing
+    return gr.update(interactive=False)
+
 with gr.Blocks() as demo:
     gr.Markdown("# Speech Translation")
 
@@ -533,15 +552,14 @@ with gr.Blocks() as demo:
         with gr.Row():
             user_transcription_output = gr.Textbox(label="Transcription")
             user_translation_output = gr.Textbox(label="Translation")
-            user_audio_output = gr.Audio(label="Translated Speech")
+            user_audio_output = gr.Audio(label="Translated Speech", visible =False)
             user_audio_final = gr.Audio(label="Final total Speech")
-            progress_bar = gr.Textbox(label="progress", interactive=False)
             status_message = gr.Textbox(label="Status", interactive=False)
 
             user_video_output = gr.HTML(label="YouTube Video")
 
-            replace_audio_button = gr.Button("Replace Audio", interactive=False)
-            final_video_output = gr.Video(label="Video with Replaced Audio")
+            replace_audio_button = gr.Button("Replace Audio", interactive=False, visible =False)
+            final_video_output = gr.Video(label="Video with Replaced Audio",visible=False)
 
     temp_video_path = gr.State()
     translation_progress = gr.State(0.0)
@@ -549,6 +567,7 @@ with gr.Blocks() as demo:
     async def update_button_state(audio, youtube_url, progress):
         print(audio, youtube_url, progress)
         # Button is interactive if there's input and progress is 0 or 1 (not in progress)
+        print ("progress:", audio, youtube_url,bool(audio) , bool(youtube_url), progress == 0 or progress == 1)
         return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
 
     user_audio_input.change(
@@ -562,31 +581,23 @@ with gr.Blocks() as demo:
         outputs=user_button
     )
 
-    async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker):
-
-
-        #audio_data, sample_rate = sf.read(audio)
-        #print ("user_audio_input:", audio, audio_data, sample_rate)
-
 
-        yield (0.01,
-               gr.update(interactive=False),
-               gr.update(), gr.update(), gr.update(), gr.update(),
-               "Translation in progress...",None)
+    async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker,progress):
 
-
+        progress = 0.1
         temp_video_path = None
         transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
-
-        yield (1,
-               gr.update(interactive=True),
-               transcription, translated_text, audio_chunksr, temp_video_path,
-               "Translation complete", accumulated_aud_buf)
+        progress = 1
+        return transcription, translated_text, audio_chunksr, temp_video_path, "Translation complete", accumulated_aud_buf, gr.update(interactive=True)
 
     user_button.click(
+        fn=disable_button,
+        inputs=[],
+        outputs=[user_button]  # Disable the button during processing
+    ).then(
         fn=run_speech_translation_wrapper,
-        inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
-        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,]
+        inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker, translation_progress],
+        outputs=[user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,user_button]
     )
 
     async def update_replace_audio_button(audio_url, video_path):
@@ -601,8 +612,8 @@ with gr.Blocks() as demo:
 
     replace_audio_button.click(
         fn=replace_audio_and_generate_video,
-        inputs=[temp_video_path, user_audio_output],
-        outputs=[gr.Textbox(label="Status"), final_video_output]
+        inputs=[temp_video_path, user_audio_final],
+        outputs=[status_message, final_video_output]
     )
 
     async def update_video_embed(youtube_url):
@@ -659,13 +670,16 @@ with gr.Blocks() as demo:
         async (audioFilePath) => {
             // Debug: Log received audio file path
            console.log("Received audio file path:", audioFilePath);
+
            if (!window.audioQueue) {
                window.audioQueue = [];
                window.isPlaying = false;
            }
+
            // Ensure the correct URL for the audio file is available
            if (audioFilePath && audioFilePath.url) {
                console.log("Processing audio file...");
+
                try {
                    // Fetch and decode the audio file
                    const response = await fetch(audioFilePath.url);
@@ -673,51 +687,64 @@ with gr.Blocks() as demo:
                        console.error("Failed to fetch audio file:", response.statusText);
                        return;
                    }
+
                    const audioData = await response.arrayBuffer();
                    const audioContext = new AudioContext();
                    const decodedData = await audioContext.decodeAudioData(audioData);
+
                    // Split the decoded audio buffer into two chunks
                    const totalDuration = decodedData.duration;
                    const midPoint = Math.floor(decodedData.length / 2); // Midpoint for splitting
                    const sampleRate = decodedData.sampleRate;
+
                    // Create two separate AudioBuffers for each chunk
                    const firstHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, midPoint, sampleRate);
                    const secondHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, decodedData.length - midPoint, sampleRate);
+
                    // Copy data from original buffer to the two new buffers
                    for (let channel = 0; channel < decodedData.numberOfChannels; channel++) {
                        firstHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(0, midPoint), channel, 0);
                        secondHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(midPoint), channel, 0);
                    }
+
                    // Add both chunks to the queue
                    window.audioQueue.push(firstHalfBuffer);
                    window.audioQueue.push(secondHalfBuffer);
                    console.log("Two audio chunks added to queue. Queue length:", window.audioQueue.length);
+
                    // Function to play the next audio chunk from the queue
                    const playNextChunk = async () => {
                        console.log("Attempting to play next chunk. isPlaying:", window.isPlaying);
+
                        if (!window.isPlaying && window.audioQueue.length > 0) {
                            console.log("Starting playback...");
                            window.isPlaying = true;
+
                            // Get the next audio buffer from the queue
                            const audioBuffer = window.audioQueue.shift();
                            console.log("Playing audio chunk from buffer.");
+
                            const source = audioContext.createBufferSource();
                            source.buffer = audioBuffer;
                            source.connect(audioContext.destination);
+
                            // When the audio finishes playing, play the next chunk
                            source.onended = () => {
                                console.log("Audio chunk finished playing.");
                                window.isPlaying = false;
                                playNextChunk(); // Play the next audio chunk in the queue
                            };
+
                            source.start(0); // Start playing the current chunk
                            console.log("Audio chunk started.");
                        } else {
                            console.log("Already playing or queue is empty.");
                        }
                    };
+
                    // Start playing the next chunk if not already playing
                    playNextChunk();
+
                } catch (error) {
                    console.error("Error during audio playback:", error);
                    window.isPlaying = false;
@@ -733,3 +760,4 @@ demo.queue()
 
 #demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
 asyncio.run(demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD"))))
+