jerrypan7 committed
Commit e2c6728 · verified · 1 Parent(s): d171a54

Update app.py

Files changed (1): app.py +120 -53
app.py CHANGED
@@ -43,28 +43,9 @@ AVAILABLE_SPEAKERS = {
     "zh": ["childChinese2"]
 }
 
-# global variable to playing of tts generated
-audio_queue = []
-is_playing = False
-audio_update_event = asyncio.Event()
 
-def play_audio():
-    global is_playing
-    is_playing = True
-
-    #
-    while is_playing:
-        if audio_queue:
-            audio_chunk = audio_queue.pop(0)
-            sd.play(audio_chunk, samplerate=22050)
-            sd.wait()
-        else:
-            time.sleep(0.1)
-    print(" tts generating finished. play all the rest to finish playing")
-    while audio_queue:
-        audio_chunk = audio_queue.pop(0)
-        sd.play(audio_chunk, samplerate=22050)
-        sd.wait()
+audio_update_event = asyncio.Event()
+acc_cosy_audio = None
 # cosy voice tts related;
 #TTS_SOCKET_SERVER = "http://localhost:9244"
 TTS_SOCKET_SERVER = "http://astarwiz.com:9244"
@@ -81,7 +62,7 @@ def on_disconnect():
 
 @sio.on('audio_chunk')
 async def on_audio_chunk(data):
-    global translation_update, audio_update
+    global translation_update, audio_update, acc_cosy_audio
 
     translated_seg_txt = data['trans_text']
     with translation_lock:
@@ -91,26 +72,20 @@ async def on_audio_chunk(data):
     audio_base64 = data['audio']
     audio_bytes = base64.b64decode(audio_base64)
     audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
-    audio_queue.append(audio_np)
 
-    if audio_update["content"] is None:
-        sr,accumulated_audio= 22050 ,audio_np
+    if (acc_cosy_audio is None):
+        acc_cosy_audio = audio_np
     else:
-        sr, accumulated_audio = audio_update["content"]
-        accumulated_audio = np.concatenate((accumulated_audio, audio_np))
-
+        acc_cosy_audio = np.concatenate((acc_cosy_audio, audio_np))
+
     with audio_lock:
-        audio_update["content"] = (sr, accumulated_audio)
+        audio_update["content"] = (22050, audio_np)
         audio_update["new"] = True
 
-
     #audio_float = audio_np.astype(np.float32) / 32767.0
     #audio_queue.append(audio_float)
     #accumulated_audio.extend(audio_float)
 
-    if not is_playing:
-        playback_thread = threading.Thread(target=play_audio)
-        playback_thread.start()
 
 @sio.on('tts_complete')
 async def on_tts_complete():
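
Note: the reworked on_audio_chunk above keeps two pieces of state in place of the deleted playback thread: acc_cosy_audio accumulates every chunk for the final output, while audio_update holds only the newest chunk for the polling UI. A minimal sketch of that decode-and-accumulate path, assuming (as the diff does) base64-encoded 16-bit mono PCM at 22050 Hz:

    import base64
    import numpy as np

    SAMPLE_RATE = 22050  # fixed TTS output rate used throughout the diff

    def decode_chunk(audio_base64):
        # One socket.io 'audio_chunk' payload -> int16 sample array
        audio_bytes = base64.b64decode(audio_base64)
        return np.frombuffer(audio_bytes, dtype=np.int16)

    def accumulate(acc, chunk):
        # Mirrors how acc_cosy_audio grows chunk by chunk
        return chunk if acc is None else np.concatenate((acc, chunk))
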
@@ -118,10 +93,7 @@ async def on_tts_complete():
     print("Disconnected from server after TTS completion")
 
     audio_update_event.set()
-    global is_playing
-    while audio_queue:
-        await asyncio.sleep(0.1)
-    is_playing = False
+
 
 
 # Global variables for storing update information
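
With the busy-wait on audio_queue gone, on_tts_complete only has to signal completion: transcribe_and_speak blocks on audio_update_event.wait() and resumes once this handler calls set(). A minimal sketch of that handshake (the sleep is a stand-in for the streaming round trip; only these hunks are shown, so resetting the event between requests is presumably handled elsewhere in the file):

    import asyncio

    async def waiter(done: asyncio.Event):
        # transcribe_and_speak blocks here until every chunk has arrived
        await done.wait()
        print("cosy tts complete")

    async def signaler(done: asyncio.Event):
        await asyncio.sleep(0.5)   # stand-in for the streaming TTS round trip
        done.set()                 # what on_tts_complete now does

    async def main():
        audio_update_event = asyncio.Event()
        await asyncio.gather(waiter(audio_update_event),
                             signaler(audio_update_event))

    asyncio.run(main())
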
@@ -349,10 +321,11 @@ async def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64)
     return "The system got some error during vLLM generation. Please try it again."
 
 async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None, progress_tracker=None):
-    global transcription_update, translation_update, audio_update
+    global transcription_update, translation_update, audio_update, acc_cosy_audio
     transcription_update = {"content": "", "new": False}
     translation_update = {"content": "", "new": False}
     audio_update = {"content": None, "new": False}
+    acc_cosy_audio =None
     video_path = None
 
     #progress = gr.Progress();
@@ -414,7 +387,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
         await audio_update_event.wait()
         print('cosy tts complete,',audio_update)
 
-        return transcription, translation_update["content"], audio_update["content"], video_path
+        return transcription, translation_update["content"], audio_update["content"], video_path, (22050, acc_cosy_audio)
 
     except Exception as e:
         print(f"Failed to process request: {str(e)}")
@@ -426,7 +399,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     split_result = extract_segments(transcription);
     translate_segments = []
     accumulated_audio = None
-    sample_rate = None
+    sample_rate = 22050
     global is_playing
     for i, segment in enumerate(split_result):
         #translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
@@ -460,10 +433,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
             #print ('audio_chunk:', type(audio_chunk),audio_chunk)
             print ('audio_chunk:, src:', segment['end'] -segment['start'], ' tts:', len(audio_chunk)/sr)
             # _, audio_chunk = adjust_tempo_pysox_array( (sr, audio_chunk), segment['end'] -segment['start'])
-            audio_queue.append(audio_chunk)
-            if not is_playing:
-                playback_thread = threading.Thread(target=play_audio)
-                playback_thread.start()
+
 
             if accumulated_audio is None:
                 accumulated_audio = audio_chunk
@@ -472,7 +442,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
                 accumulated_audio = np.concatenate((accumulated_audio, audio_chunk))
 
             with audio_lock:
-                audio_update["content"] = (sample_rate, accumulated_audio)
+                audio_update["content"] = (sample_rate, audio_chunk)
                 audio_update["new"] = True
         else:
             print(f"TTS failed for segment: {translated_seg_txt}")
@@ -483,9 +453,9 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     print("sigal the playing could stop now. all tts generated")
     is_playing =False;
     if accumulated_audio is not None:
-        return transcription, translated_text, (sample_rate, accumulated_audio), video_path
+        return transcription, translated_text, audio_update["content"], video_path, (sample_rate,accumulated_audio)
     else:
-        return transcription, translated_text, "TTS failed", video_path
+        return transcription, translated_text, "TTS failed", video_path, accumulated_audio
 
 """
 async def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
@@ -541,6 +511,7 @@ with gr.Blocks() as demo:
             user_transcription_output = gr.Textbox(label="Transcription")
             user_translation_output = gr.Textbox(label="Translation")
             user_audio_output = gr.Audio(label="Translated Speech")
+            user_audio_final = gr.Audio(label="Final total Speech")
             progress_bar = gr.Textbox(label="progress", interactive=False)
             status_message = gr.Textbox(label="Status", interactive=False)
 
@@ -578,21 +549,21 @@ with gr.Blocks() as demo:
         yield (0.01,
                gr.update(interactive=False),
                gr.update(), gr.update(), gr.update(), gr.update(),
-               "Translation in progress...")
+               "Translation in progress...",gr.update())
 
 
         temp_video_path = None
-        transcription, translated_text, audio_chunksr, temp_video_path = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
+        transcription, translated_text, audio_chunksr, temp_video_path,accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
 
         yield (1,
                gr.update(interactive=True),
                transcription, translated_text, audio_chunksr, temp_video_path,
-               "Translation complete")
+               "Translation complete", accumulated_aud_buf)
 
     user_button.click(
         fn=run_speech_translation_wrapper,
         inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
-        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message]
+        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message,user_audio_final,]
     )
 
     async def update_replace_audio_button(audio_url, video_path):
@@ -653,10 +624,106 @@ with gr.Blocks() as demo:
             user_translation_output,
             user_audio_output,
         ],
-        every=0.3
+        every=0.1
+    )
+
+    # JavaScript for client-side queue and playback handling
+    user_audio_output.change(
+        None,  # No backend change needed, we only handle frontend actions
+        inputs=user_audio_output,  # Set the user_audio_output as input to capture its audio changes
+        outputs=None,
+        js="""
+        async (audioFilePath) => {
+            // Debug: Log received audio file path
+            console.log("Received audio file path:", audioFilePath);
+
+            if (!window.audioQueue) {
+                window.audioQueue = [];
+                window.isPlaying = false;
+            }
+
+            // Ensure the correct URL for the audio file is available
+            if (audioFilePath && audioFilePath.url) {
+                console.log("Processing audio file...");
+
+                try {
+                    // Fetch and decode the audio file
+                    const response = await fetch(audioFilePath.url);
+                    if (!response.ok) {
+                        console.error("Failed to fetch audio file:", response.statusText);
+                        return;
+                    }
+
+                    const audioData = await response.arrayBuffer();
+                    const audioContext = new AudioContext();
+                    const decodedData = await audioContext.decodeAudioData(audioData);
+
+                    // Split the decoded audio buffer into two chunks
+                    const totalDuration = decodedData.duration;
+                    const midPoint = Math.floor(decodedData.length / 2); // Midpoint for splitting
+                    const sampleRate = decodedData.sampleRate;
+
+                    // Create two separate AudioBuffers for each chunk
+                    const firstHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, midPoint, sampleRate);
+                    const secondHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, decodedData.length - midPoint, sampleRate);
+
+                    // Copy data from original buffer to the two new buffers
+                    for (let channel = 0; channel < decodedData.numberOfChannels; channel++) {
+                        firstHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(0, midPoint), channel, 0);
+                        secondHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(midPoint), channel, 0);
+                    }
+
+                    // Add both chunks to the queue
+                    window.audioQueue.push(firstHalfBuffer);
+                    window.audioQueue.push(secondHalfBuffer);
+                    console.log("Two audio chunks added to queue. Queue length:", window.audioQueue.length);
+
+                    // Function to play the next audio chunk from the queue
+                    const playNextChunk = async () => {
+                        console.log("Attempting to play next chunk. isPlaying:", window.isPlaying);
+
+                        if (!window.isPlaying && window.audioQueue.length > 0) {
+                            console.log("Starting playback...");
+                            window.isPlaying = true;
+
+                            // Get the next audio buffer from the queue
+                            const audioBuffer = window.audioQueue.shift();
+                            console.log("Playing audio chunk from buffer.");
+
+                            const source = audioContext.createBufferSource();
+                            source.buffer = audioBuffer;
+                            source.connect(audioContext.destination);
+
+                            // When the audio finishes playing, play the next chunk
+                            source.onended = () => {
+                                console.log("Audio chunk finished playing.");
+                                window.isPlaying = false;
+                                playNextChunk(); // Play the next audio chunk in the queue
+                            };
+
+                            source.start(0); // Start playing the current chunk
+                            console.log("Audio chunk started.");
+                        } else {
+                            console.log("Already playing or queue is empty.");
+                        }
+                    };
+
+                    // Start playing the next chunk if not already playing
+                    playNextChunk();
+
+                } catch (error) {
+                    console.error("Error during audio playback:", error);
+                    window.isPlaying = false;
+                }
+            } else {
+                console.log("No valid audio file path received.");
+            }
+        }
+        """
     )
 
 demo.queue()
+
 demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
-#asyncio.run(demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")), share=True))
+#asyncio.run(demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD"))))
 
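Taken together, the last hunk moves playback fully client-side: the every=0.1 poll pushes whatever on_audio_chunk last published into user_audio_output, and the new js callback fetches that file in the browser, splits it, and plays it through window.audioQueue. On the Python side the pattern is a lock-guarded "latest value" mailbox. A minimal sketch of its two halves (names follow the diff; poll() is a hypothetical stand-in for what the every=0.1 Gradio tick reads):

    import threading

    audio_lock = threading.Lock()
    audio_update = {"content": None, "new": False}

    def publish(chunk, sr=22050):
        # producer side: on_audio_chunk overwrites the mailbox per chunk
        with audio_lock:
            audio_update["content"] = (sr, chunk)
            audio_update["new"] = True

    def poll():
        # consumer side: hand out each chunk at most once
        with audio_lock:
            if audio_update["new"]:
                audio_update["new"] = False
                return audio_update["content"]
        return None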