import os

# Select the target GPU before torch initializes CUDA; physical GPU 6 then
# appears to this process as cuda:0.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

import gc
import atexit
import shutil

import torch
import torchaudio
import gradio as gr

from pipeline.morph_pipeline_successed_ver1 import AudioLDM2MorphPipeline

# Initialize AudioLDM2 Pipeline
dtype = torch.float32
if torch.cuda.is_available():
    torch.cuda.set_device(0)
pipeline = AudioLDM2MorphPipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=dtype)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)


def morph_audio(audio_file1, audio_file2, num_inference_steps, prompt1='', prompt2='',
                negative_prompt1="Low quality", negative_prompt2="Low quality"):
    """Morph between two audio files and return a fixed number of intermediate frames."""
    # Start from a clean output directory
    save_lora_dir = "output"
    if os.path.exists(save_lora_dir):
        shutil.rmtree(save_lora_dir)
    os.makedirs(save_lora_dir, exist_ok=True)

    # Load both files and compute their durations in seconds
    waveform1, sample_rate1 = torchaudio.load(audio_file1)
    duration1 = waveform1.shape[1] / sample_rate1
    waveform2, sample_rate2 = torchaudio.load(audio_file2)
    duration2 = waveform2.shape[1] / sample_rate2

    # Use the shorter of the two durations, truncated to whole seconds
    duration = int(min(duration1, duration2))

    # Perform morphing using the pipeline
    _ = pipeline(
        dtype=dtype,
        audio_file=audio_file1,
        audio_file2=audio_file2,
        audio_length_in_s=duration,
        time_pooling=2,
        freq_pooling=2,
        prompt_1=prompt1,
        prompt_2=prompt2,
        negative_prompt_1=negative_prompt1,
        negative_prompt_2=negative_prompt2,
        save_lora_dir=save_lora_dir,
        use_adain=True,
        use_reschedule=False,
        num_inference_steps=num_inference_steps,
        lamd=0.6,
        output_path=save_lora_dir,
        num_frames=5,
        fix_lora=None,
        use_lora=True,
        lora_steps=2,
        noisy_latent_with_lora=True,
        morphing_with_lora=True,
        use_morph_prompt=True,
        guidance_scale=7.5,
    )

    # Collect the generated .wav files, sorted by their numeric frame index
    output_paths = sorted(
        [os.path.join(save_lora_dir, file) for file in os.listdir(save_lora_dir) if file.endswith(".wav")],
        key=lambda x: int(os.path.splitext(os.path.basename(x))[0]),
    )

    # Free GPU memory held by the loaded waveforms and pipeline outputs
    del waveform1, waveform2, _
    torch.cuda.empty_cache()
    gc.collect()
    return output_paths


def morph_audio_with_morphing_factor(audio_file1, audio_file2, alpha, num_inference_steps,
                                     prompt1='', prompt2='',
                                     negative_prompt1="Low quality", negative_prompt2="Low quality"):
    """Morph between two audio files at a single interpolation point alpha in [0, 1]."""
    # Start from a clean output directory
    save_lora_dir = "output"
    if os.path.exists(save_lora_dir):
        shutil.rmtree(save_lora_dir)
    os.makedirs(save_lora_dir, exist_ok=True)

    # Load both files and compute their durations in seconds
    waveform1, sample_rate1 = torchaudio.load(audio_file1)
    duration1 = waveform1.shape[1] / sample_rate1
    waveform2, sample_rate2 = torchaudio.load(audio_file2)
    duration2 = waveform2.shape[1] / sample_rate2

    # Use the shorter of the two durations, truncated to whole seconds
    duration = int(min(duration1, duration2))

    try:
        # Perform morphing using the pipeline
        _ = pipeline(
            dtype=dtype,
            morphing_factor=alpha,
            audio_file=audio_file1,
            audio_file2=audio_file2,
            audio_length_in_s=duration,
            time_pooling=2,
            freq_pooling=2,
            prompt_1=prompt1,
            prompt_2=prompt2,
            negative_prompt_1=negative_prompt1,
            negative_prompt_2=negative_prompt2,
            save_lora_dir=save_lora_dir,
            use_adain=True,
            use_reschedule=False,
            num_inference_steps=num_inference_steps,
            lamd=0.6,
            output_path=save_lora_dir,
            num_frames=5,
            fix_lora=None,
            use_lora=True,
            lora_steps=2,
            noisy_latent_with_lora=True,
            morphing_with_lora=True,
            use_morph_prompt=True,
            guidance_scale=7.5,
        )
        output_paths = os.path.join(save_lora_dir, 'interpolated.wav')
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            print("CUDA out of memory. Releasing unused memory...")
            torch.cuda.empty_cache()
            gc.collect()
        # Re-raise so the caller (and Gradio) can surface the error
        raise

    # Free GPU memory held by the loaded waveforms and pipeline outputs
    del waveform1, waveform2, _
    torch.cuda.empty_cache()
    gc.collect()
    return output_paths


def cleanup_output_dir():
    """Remove the output directory when the process exits."""
    save_lora_dir = "output"
    if os.path.exists(save_lora_dir):
        shutil.rmtree(save_lora_dir)
        print(f"Cleaned up directory: {save_lora_dir}")


atexit.register(cleanup_output_dir)


# Thin wrapper kept for programmatic use; the Blocks UI below calls the
# morphing functions directly.
def interface(audio1, audio2, alpha, num_inference_steps):
    output_paths = morph_audio_with_morphing_factor(audio1, audio2, alpha, num_inference_steps)
    return output_paths


with gr.Blocks() as demo:
    with gr.Tab("Sound Morphing with fixed frames."):
        gr.Markdown("### Upload two audio files for morphing")
        with gr.Row():
            audio1 = gr.Audio(label="Upload Audio File 1", type="filepath")
            audio2 = gr.Audio(label="Upload Audio File 2", type="filepath")
        num_inference_steps = gr.Slider(10, 50, step=1, label="Inference Steps", value=50)
        outputs = [
            gr.Audio(label="Morphing audio 1"),
            gr.Audio(label="Morphing audio 2"),
            gr.Audio(label="Morphing audio 3"),
            gr.Audio(label="Morphing audio 4"),
            gr.Audio(label="Morphing audio 5"),
        ]
        submit_btn1 = gr.Button("Submit")
        submit_btn1.click(morph_audio, inputs=[audio1, audio2, num_inference_steps], outputs=outputs)

    with gr.Tab("Sound Morphing with specified morphing factor."):
        gr.Markdown("### Upload two audio files for morphing")
        with gr.Row():
            audio1 = gr.Audio(label="Upload Audio File 1", type="filepath")
            audio2 = gr.Audio(label="Upload Audio File 2", type="filepath")
        alpha = gr.Slider(0, 1, step=0.01, label="Interpolation Alpha")
        num_inference_steps = gr.Slider(10, 50, step=1, label="Inference Steps", value=50)
        outputs = gr.Audio(label="Interpolated Audio")
        submit_btn2 = gr.Button("Submit")
        submit_btn2.click(morph_audio_with_morphing_factor,
                          inputs=[audio1, audio2, alpha, num_inference_steps],
                          outputs=outputs)


if __name__ == "__main__":
    demo.launch(share=True)
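
# A minimal sketch of driving the two morphing functions without the Gradio UI.
# The file names below are hypothetical placeholders, not files shipped with
# this repo; any two .wav files of similar length should work:
#
#   frames = morph_audio("dog_bark.wav", "cat_meow.wav", num_inference_steps=50)
#   # frames is the list of five generated .wav paths in "output/",
#   # sorted by numeric frame index
#
#   mix = morph_audio_with_morphing_factor("dog_bark.wav", "cat_meow.wav",
#                                          alpha=0.5, num_inference_steps=50)
#   # mix == "output/interpolated.wav", a single frame at the given alpha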