Update app.py
app.py
CHANGED
@@ -43,28 +43,9 @@ AVAILABLE_SPEAKERS = {
     "zh": ["childChinese2"]
 }
 
-# Global state for playback of the generated TTS audio
-audio_queue = []
-is_playing = False
-audio_update_event = asyncio.Event()
 
-def play_audio():
-    global is_playing
-    is_playing = True
-
-    # Drain the queue while TTS is still generating
-    while is_playing:
-        if audio_queue:
-            audio_chunk = audio_queue.pop(0)
-            sd.play(audio_chunk, samplerate=22050)
-            sd.wait()
-        else:
-            time.sleep(0.1)
-    print("TTS generation finished; playing the rest of the queue")
-    while audio_queue:
-        audio_chunk = audio_queue.pop(0)
-        sd.play(audio_chunk, samplerate=22050)
-        sd.wait()
+audio_update_event = asyncio.Event()
+acc_cosy_audio = None
 # cosy voice tts related
 #TTS_SOCKET_SERVER = "http://localhost:9244"
 TTS_SOCKET_SERVER = "http://astarwiz.com:9244"
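The deleted block above is the old server-side playback path: a background thread draining audio_queue through sounddevice. The replacement drops server-side playback entirely, accumulates chunks in acc_cosy_audio, and signals completion with audio_update_event. A minimal, self-contained sketch of that event handshake (the producer/consumer names and the 0.5 s delay are illustrative, not from the app):

import asyncio

audio_update_event = asyncio.Event()

async def producer():
    # Stand-in for the socket.io 'tts_complete' callback firing.
    await asyncio.sleep(0.5)
    audio_update_event.set()

async def consumer():
    # Stand-in for transcribe_and_speak awaiting the end of TTS.
    await audio_update_event.wait()
    print("cosy tts complete")

async def main():
    await asyncio.gather(producer(), consumer())

asyncio.run(main())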
@@ -81,7 +62,7 @@ def on_disconnect():
 
 @sio.on('audio_chunk')
 async def on_audio_chunk(data):
-    global translation_update, audio_update
+    global translation_update, audio_update, acc_cosy_audio
 
     translated_seg_txt = data['trans_text']
     with translation_lock:
@@ -91,26 +72,20 @@ async def on_audio_chunk(data):
     audio_base64 = data['audio']
     audio_bytes = base64.b64decode(audio_base64)
     audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
-    audio_queue.append(audio_np)
 
-    if
-
+    if acc_cosy_audio is None:
+        acc_cosy_audio = audio_np
     else:
-
-
-
+        acc_cosy_audio = np.concatenate((acc_cosy_audio, audio_np))
+
     with audio_lock:
-        audio_update["content"] = (
+        audio_update["content"] = (22050, audio_np)
         audio_update["new"] = True
 
-
     #audio_float = audio_np.astype(np.float32) / 32767.0
     #audio_queue.append(audio_float)
     #accumulated_audio.extend(audio_float)
 
-    if not is_playing:
-        playback_thread = threading.Thread(target=play_audio)
-        playback_thread.start()
 
 @sio.on('tts_complete')
 async def on_tts_complete():
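The reworked on_audio_chunk decodes each base64 payload to int16 PCM, appends it to the running acc_cosy_audio buffer, and publishes the chunk as a (sample_rate, np.ndarray) tuple, the form gr.Audio plays directly. A standalone sketch of that decode-and-accumulate pattern (the 22050 Hz rate mirrors the diff; the silent test chunks are made up):

import base64

import numpy as np

SAMPLE_RATE = 22050  # rate this app assumes for CosyVoice output
acc_cosy_audio = None

def accumulate_chunk(data):
    # Decode one base64-encoded int16 PCM chunk and append it to the buffer.
    global acc_cosy_audio
    audio_np = np.frombuffer(base64.b64decode(data["audio"]), dtype=np.int16)
    if acc_cosy_audio is None:
        acc_cosy_audio = audio_np
    else:
        acc_cosy_audio = np.concatenate((acc_cosy_audio, audio_np))
    return (SAMPLE_RATE, audio_np)  # the tuple form gr.Audio accepts

# Feed two fake 10 ms chunks of silence through the handler.
for _ in range(2):
    chunk = base64.b64encode(np.zeros(220, dtype=np.int16).tobytes()).decode()
    sr, latest = accumulate_chunk({"audio": chunk})
print("accumulated samples:", len(acc_cosy_audio))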
@@ -118,10 +93,7 @@ async def on_tts_complete():
     print("Disconnected from server after TTS completion")
 
     audio_update_event.set()
-
-    while audio_queue:
-        await asyncio.sleep(0.1)
-    is_playing = False
+
 
 
 # Global variables for storing update information
@@ -349,10 +321,11 @@ async def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64)
     return "The system got some error during vLLM generation. Please try it again."
 
 async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None, progress_tracker=None):
-    global transcription_update, translation_update, audio_update
+    global transcription_update, translation_update, audio_update, acc_cosy_audio
     transcription_update = {"content": "", "new": False}
     translation_update = {"content": "", "new": False}
     audio_update = {"content": None, "new": False}
+    acc_cosy_audio = None
     video_path = None
 
     #progress = gr.Progress();
@@ -414,7 +387,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
             await audio_update_event.wait()
             print('cosy tts complete,', audio_update)
 
-            return transcription, translation_update["content"], audio_update["content"], video_path
+            return transcription, translation_update["content"], audio_update["content"], video_path, (22050, acc_cosy_audio)
 
         except Exception as e:
             print(f"Failed to process request: {str(e)}")
@@ -426,7 +399,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     split_result = extract_segments(transcription)
     translate_segments = []
     accumulated_audio = None
-    sample_rate =
+    sample_rate = 22050
     global is_playing
     for i, segment in enumerate(split_result):
         #translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
@@ -460,10 +433,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
             #print('audio_chunk:', type(audio_chunk), audio_chunk)
             print('audio_chunk: src:', segment['end'] - segment['start'], ' tts:', len(audio_chunk)/sr)
             # _, audio_chunk = adjust_tempo_pysox_array((sr, audio_chunk), segment['end'] - segment['start'])
-
-            if not is_playing:
-                playback_thread = threading.Thread(target=play_audio)
-                playback_thread.start()
+
 
             if accumulated_audio is None:
                 accumulated_audio = audio_chunk
@@ -472,7 +442,7 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
                 accumulated_audio = np.concatenate((accumulated_audio, audio_chunk))
 
             with audio_lock:
-                audio_update["content"] = (sample_rate,
+                audio_update["content"] = (sample_rate, audio_chunk)
                 audio_update["new"] = True
         else:
             print(f"TTS failed for segment: {translated_seg_txt}")
@@ -483,9 +453,9 @@ async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None
     print("Signal that playback can stop now; all TTS generated")
     is_playing = False
     if accumulated_audio is not None:
-        return transcription, translated_text, (sample_rate,
+        return transcription, translated_text, audio_update["content"], video_path, (sample_rate, accumulated_audio)
     else:
-        return transcription, translated_text, "TTS failed", video_path
+        return transcription, translated_text, "TTS failed", video_path, accumulated_audio
 
 """
 async def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker):
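Both return paths now carry a fifth element so the wrapper can fill the new user_audio_final component. Gradio's gr.Audio renders a (sample_rate, numpy_array) tuple directly, which is what (sample_rate, accumulated_audio) relies on. A hedged sketch of that contract in isolation (the tone generator and component names here are illustrative, not from the app):

import numpy as np
import gradio as gr

def fake_pipeline():
    # Stand-in for transcribe_and_speak's accumulated-audio return:
    # one second of a 440 Hz tone as int16 at 22050 Hz.
    sr = 22050
    t = np.linspace(0, 1, sr, endpoint=False)
    tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    return (sr, tone)

with gr.Blocks() as demo:
    final_audio = gr.Audio(label="Final total Speech")
    gr.Button("Generate").click(fn=fake_pipeline, inputs=None, outputs=final_audio)

# demo.launch()  # uncomment to serve locally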
@@ -541,6 +511,7 @@ with gr.Blocks() as demo:
             user_transcription_output = gr.Textbox(label="Transcription")
             user_translation_output = gr.Textbox(label="Translation")
             user_audio_output = gr.Audio(label="Translated Speech")
+            user_audio_final = gr.Audio(label="Final total Speech")
             progress_bar = gr.Textbox(label="progress", interactive=False)
             status_message = gr.Textbox(label="Status", interactive=False)
 
@@ -578,21 +549,21 @@ with gr.Blocks() as demo:
         yield (0.01,
                gr.update(interactive=False),
                gr.update(), gr.update(), gr.update(), gr.update(),
-               "Translation in progress...")
+               "Translation in progress...", gr.update())
 
 
         temp_video_path = None
-        transcription, translated_text, audio_chunksr, temp_video_path = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
+        transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
 
         yield (1,
                gr.update(interactive=True),
                transcription, translated_text, audio_chunksr, temp_video_path,
-               "Translation complete")
+               "Translation complete", accumulated_aud_buf)
 
     user_button.click(
         fn=run_speech_translation_wrapper,
         inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker],
-        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message]
+        outputs=[translation_progress, user_button, user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message, user_audio_final]
     )
 
     async def update_replace_audio_button(audio_url, video_path):
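run_speech_translation_wrapper is a generator-style handler: each yield updates the whole output list at once, which is why the yielded tuple grows by one slot (accumulated_aud_buf) to match the extra entry in outputs=[...]. A minimal sketch of that yield-per-stage pattern (two outputs instead of the app's eight; the sleep stands in for the real pipeline):

import asyncio

import gradio as gr

async def stepped_job():
    # Each yield must supply one value per component in outputs=[...].
    yield 0.01, "Translation in progress..."
    await asyncio.sleep(1)  # stand-in for transcribe_and_speak
    yield 1.0, "Translation complete"

with gr.Blocks() as demo:
    prog = gr.Textbox(label="progress")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(stepped_job, inputs=None, outputs=[prog, status])

# demo.launch()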
@@ -653,10 +624,106 @@ with gr.Blocks() as demo:
             user_translation_output,
             user_audio_output,
         ],
-        every=0.
+        every=0.1
+    )
+
+    # JavaScript for client-side queueing and playback of each new clip
+    user_audio_output.change(
+        None,  # no backend change needed; we only handle frontend actions
+        inputs=user_audio_output,  # capture audio changes from user_audio_output
+        outputs=None,
+        js="""
+        async (audioFilePath) => {
+            // Debug: log the received audio file path
+            console.log("Received audio file path:", audioFilePath);
+
+            if (!window.audioQueue) {
+                window.audioQueue = [];
+                window.isPlaying = false;
+            }
+
+            // Ensure the correct URL for the audio file is available
+            if (audioFilePath && audioFilePath.url) {
+                console.log("Processing audio file...");
+
+                try {
+                    // Fetch and decode the audio file
+                    const response = await fetch(audioFilePath.url);
+                    if (!response.ok) {
+                        console.error("Failed to fetch audio file:", response.statusText);
+                        return;
+                    }
+
+                    const audioData = await response.arrayBuffer();
+                    const audioContext = new AudioContext();
+                    const decodedData = await audioContext.decodeAudioData(audioData);
+
+                    // Split the decoded audio buffer into two chunks
+                    const totalDuration = decodedData.duration;
+                    const midPoint = Math.floor(decodedData.length / 2); // midpoint for splitting
+                    const sampleRate = decodedData.sampleRate;
+
+                    // Create a separate AudioBuffer for each chunk
+                    const firstHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, midPoint, sampleRate);
+                    const secondHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, decodedData.length - midPoint, sampleRate);
+
+                    // Copy data from the original buffer into the two new buffers
+                    for (let channel = 0; channel < decodedData.numberOfChannels; channel++) {
+                        firstHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(0, midPoint), channel, 0);
+                        secondHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(midPoint), channel, 0);
+                    }
+
+                    // Add both chunks to the queue
+                    window.audioQueue.push(firstHalfBuffer);
+                    window.audioQueue.push(secondHalfBuffer);
+                    console.log("Two audio chunks added to queue. Queue length:", window.audioQueue.length);
+
+                    // Play the next audio chunk from the queue
+                    const playNextChunk = async () => {
+                        console.log("Attempting to play next chunk. isPlaying:", window.isPlaying);
+
+                        if (!window.isPlaying && window.audioQueue.length > 0) {
+                            console.log("Starting playback...");
+                            window.isPlaying = true;
+
+                            // Get the next audio buffer from the queue
+                            const audioBuffer = window.audioQueue.shift();
+                            console.log("Playing audio chunk from buffer.");
+
+                            const source = audioContext.createBufferSource();
+                            source.buffer = audioBuffer;
+                            source.connect(audioContext.destination);
+
+                            // When this chunk finishes, play the next one
+                            source.onended = () => {
+                                console.log("Audio chunk finished playing.");
+                                window.isPlaying = false;
+                                playNextChunk();
+                            };
+
+                            source.start(0); // start playing the current chunk
+                            console.log("Audio chunk started.");
+                        } else {
+                            console.log("Already playing or queue is empty.");
+                        }
+                    };
+
+                    // Start playback if not already playing
+                    playNextChunk();
+
+                } catch (error) {
+                    console.error("Error during audio playback:", error);
+                    window.isPlaying = false;
+                }
+            } else {
+                console.log("No valid audio file path received.");
+            }
+        }
+        """
     )
 
 demo.queue()
+
 demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))
-#asyncio.run(demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD"))
+#asyncio.run(demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD"))))
 
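Two things to note in the last hunk: every=0.1 makes the preceding event re-poll the shared update dictionaries ten times per second, and the new js= handler moves playback into the browser, with a window-level queue replacing the deleted sounddevice thread (splitting each clip in half there mainly exercises the queue). A small sketch of the every= polling pattern on its own (the callback is illustrative; newer Gradio releases express the interval with gr.Timer instead of a float):

import time

import gradio as gr

def poll_updates():
    # Stand-in for the app's update checker that reads the shared dicts.
    return f"updated at {time.strftime('%H:%M:%S')}"

with gr.Blocks() as demo:
    live_box = gr.Textbox(label="live value")
    # Re-run poll_updates every 0.1 s while the page is open.
    demo.load(poll_updates, inputs=None, outputs=live_box, every=0.1)

# demo.launch()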