import gradio as gr
import uuid
import os
from typing import Optional
import tempfile
from pydub import AudioSegment
import re
import subprocess
import numpy as np
import soundfile as sf
import sounddevice as sd  # currently unused; kept for local playback experiments
import time
import sox
from io import BytesIO
import asyncio
import aiohttp
from moviepy.editor import VideoFileClip
import threading
import socketio
import base64
ASR_API = "http://astarwiz.com:9998/asr"
TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'

# Segment transcripts by punctuation (True) or by ASR timestamps (False)
#bSegByPunct = True
bSegByPunct = False

LANGUAGE_MAP = {
    "en": "English",
    "ma": "Malay",
    "ta": "Tamil",
    "zh": "Chinese"
}

DEVELOPER_PASSWORD = os.getenv("DEV_PWD")
RAPID_API_KEY = os.getenv("RAPID_API_KEY")

AVAILABLE_SPEAKERS = {
    "en": ["MS"],
    "ma": ["msFemale"],
    "ta": ["ta_female1"],
    "zh": ["childChinese2"]
}

audio_update_event = asyncio.Event()
acc_cosy_audio = None

# CosyVoice TTS socket server
#TTS_SOCKET_SERVER = "http://localhost:9244"
TTS_SOCKET_SERVER = "http://astarwiz.com:9244"
sio = socketio.AsyncClient()
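
# --- CosyVoice streaming TTS handlers ---
# The server streams results back over Socket.IO: each audio-chunk event
# carries a translated text segment plus base64-encoded 16-bit PCM audio
# (played back at 22050 Hz), and a completion event signals that synthesis
# has finished.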
def on_connect():
    print('Connected to server')

def on_disconnect():
    print('Disconnected from server')

async def on_audio_chunk(data):
    global translation_update, audio_update, acc_cosy_audio
    translated_seg_txt = data['trans_text']
    with translation_lock:
        translation_update["content"] = translation_update["content"] + " " + translated_seg_txt
        translation_update["new"] = True
    audio_base64 = data['audio']
    audio_bytes = base64.b64decode(audio_base64)
    audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
    if acc_cosy_audio is None:
        acc_cosy_audio = audio_np
    else:
        acc_cosy_audio = np.concatenate((acc_cosy_audio, audio_np))
    with audio_lock:
        audio_update["content"] = (22050, audio_np)
        audio_update["new"] = True
    #audio_float = audio_np.astype(np.float32) / 32767.0
    #audio_queue.append(audio_float)
    #accumulated_audio.extend(audio_float)

async def on_tts_complete():
    await sio.disconnect()
    print("Disconnected from server after TTS completion")
    audio_update_event.set()
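
# Attach the handlers to the client. The built-in 'connect'/'disconnect'
# events are standard python-socketio names; the custom event names
# 'audio_chunk' and 'tts_complete' are assumptions inferred from the handler
# names above and must match what the TTS server actually emits.
sio.on('connect', on_connect)
sio.on('disconnect', on_disconnect)
sio.on('audio_chunk', on_audio_chunk)
sio.on('tts_complete', on_tts_complete)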
# Global variables for storing update information
transcription_update = {"content": "", "new": False}
translation_update = {"content": "", "new": False}
audio_update = {"content": None, "new": False}

# Locks for thread-safe operations
transcription_lock = threading.Lock()
translation_lock = threading.Lock()
audio_lock = threading.Lock()
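
# Mux a replacement audio track into a video with ffmpeg: '-c:v copy' keeps
# the video stream untouched, '-map 0:v:0' takes video from the first input,
# '-map 1:a:0' takes audio from the second, and '-shortest' trims the output
# to the shorter of the two streams.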
def replace_audio_in_video(video_path, audio_path, output_path):
    command = [
        'ffmpeg',
        '-i', video_path,
        '-i', audio_path,
        '-c:v', 'copy',
        '-map', '0:v:0',
        '-map', '1:a:0',
        '-shortest',
        output_path
    ]
    subprocess.run(command, check=True)
    return output_path
async def replace_audio_and_generate_video(temp_video_path, gradio_audio):
    print("gradio_audio:", gradio_audio)
    if not temp_video_path or gradio_audio is None:
        return "Both video and audio are required to replace audio.", None
    if not os.path.exists(temp_video_path):
        return "Video file not found.", None

    # Unpack the Gradio audio output
    sample_rate, audio_data = gradio_audio
    # Ensure audio_data is a numpy array
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data)

    # Create a temporary WAV file for the original audio
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio_file:
        original_audio_path = temp_audio_file.name
        sf.write(original_audio_path, audio_data, sample_rate)

    # Get video duration
    video_clip = VideoFileClip(temp_video_path)
    video_duration = video_clip.duration
    video_clip.close()

    # Get audio duration
    audio_duration = len(audio_data) / sample_rate

    # Tempo factor > 1 speeds the audio up so it fits the video duration
    tempo_factor = audio_duration / video_duration

    # Create a temporary WAV file for the tempo-adjusted audio
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio_file:
        adjusted_audio_path = temp_audio_file.name

    # Adjust audio tempo ('s' tunes the algorithm for speech)
    tfm = sox.Transformer()
    tfm.tempo(tempo_factor, 's')
    tfm.build(original_audio_path, adjusted_audio_path)

    # Generate output video path
    output_video_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4()}.mp4")
    try:
        replace_audio_in_video(temp_video_path, adjusted_audio_path, output_video_path)
        return "Audio replaced successfully.", output_video_path
    except subprocess.CalledProcessError as e:
        return f"Error replacing audio: {str(e)}", None
    finally:
        os.unlink(original_audio_path)  # Clean up the original audio file
        os.unlink(adjusted_audio_path)  # Clean up the adjusted audio file
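
# Extract the video id from the common YouTube URL shapes, e.g.
# 'https://www.youtube.com/watch?v=ID', 'https://youtu.be/ID', or
# 'https://www.youtube.com/shorts/ID'.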
async def fetch_youtube_id(youtube_url: str) -> str:
    if 'v=' in youtube_url:
        return youtube_url.split("v=")[1].split("&")[0]
    elif 'youtu.be/' in youtube_url:
        return youtube_url.split("youtu.be/")[1].split("?")[0]
    elif 'shorts' in youtube_url:
        return youtube_url.split("/")[-1].split("?")[0]
    else:
        raise Exception("Unsupported URL format")
async def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[tuple[str, str]]:
    video_id = await fetch_youtube_id(youtube_url)
    if not video_id:
        return None
    if output_dir is None:
        output_dir = tempfile.gettempdir()
    output_filename = os.path.join(output_dir, f"{video_id}.mp3")
    temp_filename = os.path.join(output_dir, f"{video_id}.mp4")
    # Reuse previously downloaded files as a simple cache
    if os.path.exists(output_filename) and os.path.exists(temp_filename):
        return (output_filename, temp_filename)

    url = "https://youtube86.p.rapidapi.com/api/youtube/links"
    headers = {
        'Content-Type': 'application/json',
        'x-rapidapi-host': 'youtube86.p.rapidapi.com',
        'x-rapidapi-key': RAPID_API_KEY
    }
    data = {
        "url": youtube_url
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=data) as response:
            if response.status == 200:
                result = await response.json()
                for link in result[0]['urls']:
                    if link.get('isBundle'):
                        audio_url = link['url']
                        extension = link['extension']
                        print("audio_url:", audio_url)
                        async with session.get(audio_url) as audio_response:
                            print("audio_response:", audio_response)
                            if audio_response.status == 200:
                                content = await audio_response.read()
                                temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
                                with open(temp_filename, 'wb') as audio_file:
                                    audio_file.write(content)
                                # Resample to 16 kHz mp3 for the ASR service
                                audio = AudioSegment.from_file(temp_filename, format=extension)
                                audio = audio.set_frame_rate(16000)
                                audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
                                return (output_filename, temp_filename)
            else:
                print("Error:", response.status, await response.text())
    return None
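
# Split free-running text into sentence-like segments on ., !, ?, their
# fullwidth variants, and '。', keeping the punctuation attached, then cap
# each segment at 50 words. For example, "Hi there! How are you?" becomes
# ["Hi there!", " How are you?"].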
punctuation_marks = r'([\.!?!?。])'

def split_text_with_punctuation(text):
    # Split the text using the punctuation marks, keeping the punctuation marks
    split_text = re.split(punctuation_marks, text)
    # Combine each punctuation mark with the preceding segment
    combined_segments = []
    # Loop through the split text in steps of 2
    for i in range(0, len(split_text) - 1, 2):
        combined_segments.append(split_text[i] + split_text[i + 1])
    # Handle any remaining text that doesn't have a punctuation following it
    if len(split_text) % 2 != 0 and split_text[-1]:
        combined_segments.append(split_text[-1])
    # Split any segment that exceeds 50 words
    final_segments = []
    for segment in combined_segments:
        words = segment.split()  # Split each segment into words
        if len(words) > 50:
            # Split the segment into chunks of no more than 50 words
            for j in range(0, len(words), 50):
                final_segments.append(' '.join(words[j:j+50]))
        else:
            final_segments.append(segment)
    return [segment for segment in final_segments if segment]  # Filter out empty strings
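
# Parse Whisper-style timestamped output of the form
# "[0.00s -> 4.32s] some text [4.32s -> 9.10s] more text"
# into a list of {'start', 'end', 'text'} dicts.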
def extract_segments(text):
    pattern = r'\[(\d+\.\d+)s\s*->\s*(\d+\.\d+)s\]\s*(.*?)(?=\[\d+\.\d+s|\Z)'
    matches = re.findall(pattern, text, re.DOTALL)
    if not matches:
        return []
    segments = []
    for start, end, content in matches:
        segments.append({
            'start': float(start),
            'end': float(end),
            'text': content.strip()
        })
    return segments
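
# Stretch or squeeze a TTS chunk so it fits the source segment's time slot:
# e.g. 10 s of synthesized audio destined for an 8 s slot gets tempo factor
# 10 / 8 = 1.25 (played 25% faster), then is trimmed or zero-padded to the
# exact target length.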
def adjust_tempo_pysox_array(gradio_audio, duration):
    # Unpack the Gradio audio output
    sample_rate, audio_data = gradio_audio
    # Ensure audio_data is a numpy array
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data)
    # Calculate the current duration of the audio in seconds
    current_duration = len(audio_data) / sample_rate
    # Calculate the necessary tempo factor to match the desired duration
    tempo_factor = current_duration / duration
    # Create a pysox Transformer
    tfm = sox.Transformer()
    tfm.tempo(tempo_factor)
    # Use pysox to transform the audio directly in memory
    adjusted_audio = tfm.build_array(input_array=audio_data, sample_rate_in=sample_rate)
    # Trim or pad the audio to exactly match the desired duration
    target_length = int(sample_rate * duration)
    if len(adjusted_audio) > target_length:
        adjusted_audio = adjusted_audio[:target_length]  # Trim if too long
    else:
        # Pad with zeros if too short
        adjusted_audio = np.pad(adjusted_audio, (0, target_length - len(adjusted_audio)), mode='constant')
    # Return the processed audio in the Gradio format (sample_rate, adjusted_audio)
    return sample_rate, adjusted_audio
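
# Call a vLLM OpenAI-compatible /v1/completions endpoint with a ChatML-style
# prompt. The stop token id 151645 appears to be Qwen's <|im_end|>, which
# matches the <|im_start|>/<|im_end|> markers used in the prompt.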
async def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
    print(input_text)
    one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
    vllm_api = 'http://astarwiz.com:2333/' + "v1/completions"
    data = {
        "prompt": one_vllm_input,
        'model': "./Edu-4B-NewTok-V2-20240904/",
        'min_tokens': min_new_tokens,
        'max_tokens': max_new_tokens,
        'temperature': 0.1,
        'top_p': 0.75,
        'repetition_penalty': 1.1,
        "stop_token_ids": [151645],
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(vllm_api, headers={"Content-Type": "application/json"}, json=data) as response:
            if response.status == 200:
                result = await response.json()
                if "choices" in result:
                    return result["choices"][0]['text'].strip()
    return "The system got an error during vLLM generation. Please try again."
async def upload_file(file_path, upload_url):
    print(f"1. Client sends request: {time.time()}")
    async with aiohttp.ClientSession() as session:
        with open(file_path, 'rb') as f:
            form_data = aiohttp.FormData()
            form_data.add_field('file', f, filename=os.path.basename(file_path))
            async with session.post(upload_url, data=form_data) as response:
                print(f"5. Client receives headers: {time.time()}")
                print(f"Status: {response.status}")
                result = await response.json()
                print(f"7. Client fully received and parsed response: {time.time()}")
                if response.status == 200:
                    return result
                else:
                    return {"file_id": ""}
async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None, progress_tracker=None):
    global transcription_update, translation_update, audio_update, acc_cosy_audio, audio_update_event
    transcription_update = {"content": "", "new": True}
    translation_update = {"content": "", "new": True}
    audio_update = {"content": None, "new": True}
    acc_cosy_audio = None
    video_path = None
    audio_update_event.clear()
    #progress = gr.Progress()
    #progress(0.1, "started:")
    if youtube_url:
        audio = await download_youtube_audio(youtube_url)
        if audio is None:
            return "Failed to download YouTube audio.", None, None, video_path, None
        audio, video_path = audio
    if not audio:
        return "Please provide an audio input or a valid YouTube URL.", None, None, video_path, None

    # ASR
    #progress(0.2, "ASR started:")
    file_id = str(uuid.uuid4())
    data = aiohttp.FormData()
    data.add_field('file', open(audio, 'rb'))
    data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
    if bSegByPunct:
        data.add_field('model_name', 'whisper-large-v2-local-cs')
        data.add_field('with_timestamp', 'false')
    else:
        data.add_field('model_name', 'official-v3')
        data.add_field('with_timestamp', 'true')
    async with aiohttp.ClientSession() as session:
        async with session.post(ASR_API, data=data) as asr_response:
            if asr_response.status == 200:
                result = await asr_response.json()
                transcription = result['text']
                with transcription_lock:
                    transcription_update["content"] = transcription
                    transcription_update["new"] = True
            else:
                return "ASR failed", None, None, video_path, None
    #progress(0.4, "ASR done:")
    # Use CosyVoice if the target language is English or Chinese
    if target_lang in ('en', 'zh'):
        try:
            server_url = TTS_SOCKET_SERVER
            if not sio.connected:
                await sio.connect(server_url)
                print(f"Connected to {server_url}")
            # Upload the source audio so the server can clone its voice
            file_id = ""
            if audio and os.path.exists(audio):
                upload_url = f"{server_url}/upload"  # Adjust this URL as needed
                upload_result = await upload_file(audio, upload_url)
                print("upload_result:", upload_result)
                file_id = upload_result['file_id']
            # Use the default voice when no prompt audio was uploaded
            tts_request = {
                'text': transcription,
                'overwrite_prompt': False,
                'promptText': "",
                'promptAudio': file_id,
                'sourceLang': source_lang,
                'targetLang': target_lang
            }
            await sio.emit('tts_request', tts_request)
            # Wait until all CosyVoice TTS chunks have arrived
            await audio_update_event.wait()
            print('cosy tts complete,', audio_update)
            return transcription, translation_update["content"], audio_update["content"], video_path, (22050, acc_cosy_audio)
        except Exception as e:
            print(f"Failed to process request: {str(e)}")
            print("Falling back to VITS.")
    # VITS fallback: translate segment by segment, then synthesize each segment
    if bSegByPunct:
        split_result = split_text_with_punctuation(transcription)
    else:
        split_result = extract_segments(transcription)
    translate_segments = []
    accumulated_audio = None
    sample_rate = 22050
    global is_playing
    for i, segment in enumerate(split_result):
        if bSegByPunct:
            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
        else:
            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
        translated_seg_txt = await inference_via_llm_api(translation_prompt)
        translate_segments.append(translated_seg_txt)
        print(f"Translation: {translated_seg_txt}")
        with translation_lock:
            translation_update["content"] = " ".join(translate_segments)
            translation_update["new"] = True
        # Generate TTS for each translated segment
        #progress(0.4 + (0.5 * (i + 1) / len(split_result)), "translation and tts in progress:")
        tts_params = {
            'language': target_lang,
            'speed': 1.1,
            'speaker': target_speaker or AVAILABLE_SPEAKERS[target_lang][0],
            'text': translated_seg_txt
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(TTS_SPEAK_SERVICE, params=tts_params) as tts_response:
                if tts_response.status == 200:
                    audio_file = await tts_response.text()
                    audio_file = audio_file.strip()
                    audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
                    async with session.get(audio_url) as response:
                        content = await response.read()
                        audio_chunk, sr = sf.read(BytesIO(content))
                        #print('audio_chunk:', type(audio_chunk), audio_chunk)
                        #print('audio_chunk:, src:', segment['end'] - segment['start'], ' tts:', len(audio_chunk)/sr)
                        #_, audio_chunk = adjust_tempo_pysox_array((sr, audio_chunk), segment['end'] - segment['start'])
                        if accumulated_audio is None:
                            accumulated_audio = audio_chunk
                            sample_rate = sr
                        else:
                            accumulated_audio = np.concatenate((accumulated_audio, audio_chunk))
                        with audio_lock:
                            audio_update["content"] = (sample_rate, audio_chunk)
                            audio_update["new"] = True
                else:
                    print(f"TTS failed for segment: {translated_seg_txt}")
    translated_text = " ".join(translate_segments)
    #progress(1, "all done.")
    print("Signal that playback can stop now; all TTS have been generated.")
    is_playing = False
    if accumulated_audio is not None:
        return transcription, translated_text, audio_update["content"], video_path, (sample_rate, accumulated_audio)
    else:
        print("TTS failed for all segments.")
        return transcription, translated_text, None, video_path, None
""" | |
async def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker): | |
temp_video_path = None | |
transcription, translated_text, audio_chunksr, temp_video_path = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker) | |
return transcription, translated_text, audio_chunksr, temp_video_path | |
""" | |
async def update_transcription():
    global transcription_update
    with transcription_lock:
        if transcription_update["new"]:
            content = transcription_update["content"]
            transcription_update["new"] = False
            return content
    return gr.update()

async def update_translation():
    global translation_update
    with translation_lock:
        if translation_update["new"]:
            content = translation_update["content"]
            translation_update["new"] = False
            return content
    return gr.update()

async def update_audio():
    global audio_update
    with audio_lock:
        if audio_update["new"]:
            content = audio_update["content"]
            audio_update["new"] = False
            return content
    return gr.update()

def disable_button():
    # Disable the button during processing
    return gr.update(interactive=False)
with gr.Blocks() as demo:
    gr.Markdown("# Speech Translation")
    gr.Markdown("Speak into the microphone, upload an audio file, or provide a YouTube URL. The app will translate and speak it back to you.")
    with gr.Row():
        user_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
        user_youtube_url = gr.Textbox(label="YouTube URL (optional)")
    with gr.Row():
        user_source_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Source Language", value="en")
        user_target_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Target Language", value="zh")
        user_target_speaker = gr.Dropdown(choices=AVAILABLE_SPEAKERS['zh'], label="Target Speaker", value="childChinese2")
    with gr.Row():
        user_button = gr.Button("Translate and Speak", interactive=False)
    with gr.Row():
        user_transcription_output = gr.Textbox(label="Transcription")
        user_translation_output = gr.Textbox(label="Translation")
    user_audio_output = gr.Audio(label="Translated Speech", visible=False)
    user_audio_final = gr.Audio(label="Final Translated Speech")
    status_message = gr.Textbox(label="Status", interactive=False)
    user_video_output = gr.HTML(label="YouTube Video")
    replace_audio_button = gr.Button("Replace Audio", interactive=False, visible=False)
    final_video_output = gr.Video(label="Video with Replaced Audio", visible=False)
    temp_video_path = gr.State()
    translation_progress = gr.State(0.0)
    async def update_button_state(audio, youtube_url, progress):
        # Button is interactive if there's input and progress is 0 or 1 (not in progress)
        print("update_button_state:", audio, youtube_url, bool(audio), bool(youtube_url), progress)
        return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))

    user_audio_input.change(
        fn=update_button_state,
        inputs=[user_audio_input, user_youtube_url, translation_progress],
        outputs=user_button
    )
    user_youtube_url.change(
        fn=update_button_state,
        inputs=[user_audio_input, user_youtube_url, translation_progress],
        outputs=user_button
    )
    async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker, progress):
        # Note: reassigning 'progress' only changes the local variable; the
        # translation_progress gr.State is not written back because it is not
        # among this event's outputs.
        progress = 0.1
        temp_video_path = None
        transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
        progress = 1
        return transcription, translated_text, audio_chunksr, temp_video_path, "Translation complete", accumulated_aud_buf, gr.update(interactive=True)

    user_button.click(
        fn=disable_button,
        inputs=[],
        outputs=[user_button]  # Disable the button during processing
    ).then(
        fn=run_speech_translation_wrapper,
        inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker, translation_progress],
        outputs=[user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message, user_audio_final, user_button]
    )
    async def update_replace_audio_button(audio_url, video_path):
        print("update replace:", audio_url, video_path)
        return gr.Button(interactive=bool(audio_url) and bool(video_path))

    user_audio_output.change(
        fn=update_replace_audio_button,
        inputs=[user_audio_output, temp_video_path],
        outputs=[replace_audio_button]
    )
    replace_audio_button.click(
        fn=replace_audio_and_generate_video,
        inputs=[temp_video_path, user_audio_final],
        outputs=[status_message, final_video_output]
    )
    async def update_video_embed(youtube_url):
        if youtube_url:
            try:
                video_id = await fetch_youtube_id(youtube_url)
                return f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{video_id}" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>'
            except Exception as e:
                print(f"Error embedding video: {e}")
        return ""

    user_youtube_url.change(
        fn=update_video_embed,
        inputs=[user_youtube_url],
        outputs=[user_video_output]
    )
    async def update_target_speakers(target_lang):
        return gr.Dropdown(choices=AVAILABLE_SPEAKERS[target_lang], value=AVAILABLE_SPEAKERS[target_lang][0])

    user_target_lang.change(
        fn=update_target_speakers,
        inputs=[user_target_lang],
        outputs=[user_target_speaker]
    )
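
    # demo.load(..., every=0.1) re-runs periodic_update roughly every 100 ms,
    # pushing any fresh transcription, translation, or audio chunk into the
    # corresponding components as soon as it is produced.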
    async def periodic_update():
        transcription = await update_transcription()
        translation = await update_translation()
        audio = await update_audio()
        return (
            transcription,
            translation,
            audio
        )

    demo.load(
        periodic_update,
        inputs=[],
        outputs=[
            user_transcription_output,
            user_translation_output,
            user_audio_output,
        ],
        every=0.1
    )
    # JavaScript for client-side queue and playback handling
    user_audio_output.change(
        None,  # No backend change needed, we only handle frontend actions
        inputs=user_audio_output,  # Capture audio changes from user_audio_output
        outputs=None,
        js="""
        async (audioFilePath) => {
            // Debug: Log received audio file path
            console.log("Received audio file path:", audioFilePath);
            if (!window.audioQueue) {
                window.audioQueue = [];
                window.isPlaying = false;
            }
            // Ensure the correct URL for the audio file is available
            if (audioFilePath && audioFilePath.url) {
                console.log("Processing audio file...");
                try {
                    // Fetch and decode the audio file
                    const response = await fetch(audioFilePath.url);
                    if (!response.ok) {
                        console.error("Failed to fetch audio file:", response.statusText);
                        return;
                    }
                    const audioData = await response.arrayBuffer();
                    const audioContext = new AudioContext();
                    const decodedData = await audioContext.decodeAudioData(audioData);
                    // Split the decoded audio buffer into two chunks
                    const totalDuration = decodedData.duration;
                    const midPoint = Math.floor(decodedData.length / 2); // Midpoint for splitting
                    const sampleRate = decodedData.sampleRate;
                    // Create two separate AudioBuffers for each chunk
                    const firstHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, midPoint, sampleRate);
                    const secondHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, decodedData.length - midPoint, sampleRate);
                    // Copy data from the original buffer into the two new buffers
                    for (let channel = 0; channel < decodedData.numberOfChannels; channel++) {
                        firstHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(0, midPoint), channel, 0);
                        secondHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(midPoint), channel, 0);
                    }
                    // Add both chunks to the queue
                    window.audioQueue.push(firstHalfBuffer);
                    window.audioQueue.push(secondHalfBuffer);
                    console.log("Two audio chunks added to queue. Queue length:", window.audioQueue.length);
                    // Function to play the next audio chunk from the queue
                    const playNextChunk = async () => {
                        console.log("Attempting to play next chunk. isPlaying:", window.isPlaying);
                        if (!window.isPlaying && window.audioQueue.length > 0) {
                            console.log("Starting playback...");
                            window.isPlaying = true;
                            // Get the next audio buffer from the queue
                            const audioBuffer = window.audioQueue.shift();
                            console.log("Playing audio chunk from buffer.");
                            const source = audioContext.createBufferSource();
                            source.buffer = audioBuffer;
                            source.connect(audioContext.destination);
                            // When the audio finishes playing, play the next chunk
                            source.onended = () => {
                                console.log("Audio chunk finished playing.");
                                window.isPlaying = false;
                                playNextChunk(); // Play the next audio chunk in the queue
                            };
                            source.start(0); // Start playing the current chunk
                            console.log("Audio chunk started.");
                        } else {
                            console.log("Already playing or queue is empty.");
                        }
                    };
                    // Start playing the next chunk if not already playing
                    playNextChunk();
                } catch (error) {
                    console.error("Error during audio playback:", error);
                    window.isPlaying = false;
                }
            } else {
                console.log("No valid audio file path received.");
            }
        }
        """
    )
demo.queue()
# demo.launch() is synchronous, not a coroutine, so wrapping it in
# asyncio.run() raises "a coroutine was expected" at startup.
demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))