CSH-1220
File updates regarding memory-saving
117486a
import os
import gc
import torch
import shutil
import atexit
import torchaudio
import numpy as np
import gradio as gr
from pipeline.morph_pipeline_successed_ver1 import AudioLDM2MorphPipeline
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
# Initialize AudioLDM2 Pipeline
torch.cuda.set_device(0)
dtype = torch.float32
pipeline = AudioLDM2MorphPipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=dtype)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)
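# Optional memory savers (assumption: these standard diffusers DiffusionPipeline
# helpers apply only if AudioLDM2MorphPipeline inherits them; verify before enabling):
# - load with torch_dtype=torch.float16 above to roughly halve weight memory
# pipeline.enable_attention_slicing()   # lower peak VRAM at some speed cost
# pipeline.enable_model_cpu_offload()   # offload idle submodules (needs accelerate)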
def morph_audio(audio_file1, audio_file2, num_inference_steps, prompt1='', prompt2='', negative_prompt1="Low quality", negative_prompt2="Low quality"):
save_lora_dir = "output"
if os.path.exists(save_lora_dir):
shutil.rmtree(save_lora_dir)
os.makedirs(save_lora_dir, exist_ok=True)
# Load audio and compute duration
waveform1, sample_rate1 = torchaudio.load(audio_file1)
duration1 = waveform1.shape[1] / sample_rate1
waveform2, sample_rate2 = torchaudio.load(audio_file2)
duration2 = waveform2.shape[1] / sample_rate2
# Compare durations and take the shorter one
duration = int(min(duration1, duration2))
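    # Truncating to whole seconds keeps audio_length_in_s an integer (the
    # pipeline is called with whole seconds here) and ensures the morph never
    # extends past the shorter input clip.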
    # Perform morphing using the pipeline
    _ = pipeline(
        dtype=dtype,
        audio_file=audio_file1,
        audio_file2=audio_file2,
        audio_length_in_s=duration,
        time_pooling=2,
        freq_pooling=2,
        prompt_1=prompt1,
        prompt_2=prompt2,
        negative_prompt_1=negative_prompt1,
        negative_prompt_2=negative_prompt2,
        save_lora_dir=save_lora_dir,
        use_adain=True,
        use_reschedule=False,
        num_inference_steps=num_inference_steps,
        lamd=0.6,
        output_path=save_lora_dir,
        num_frames=5,
        fix_lora=None,
        use_lora=True,
        lora_steps=2,
        noisy_latent_with_lora=True,
        morphing_with_lora=True,
        use_morph_prompt=True,
        guidance_scale=7.5,
    )
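    # num_frames=5 above should yield five interpolated clips, matching the
    # five "Morphing audio" outputs wired up in the Gradio UI below.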
    # Collect the generated .wav files, sorted by their numeric filenames
    output_paths = sorted(
        [os.path.join(save_lora_dir, file) for file in os.listdir(save_lora_dir) if file.endswith(".wav")],
        key=lambda x: int(os.path.splitext(os.path.basename(x))[0])
    )

    # Release references and free GPU memory before returning
    del waveform1, waveform2, _
    torch.cuda.empty_cache()
    gc.collect()
    return output_paths
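
# Example call (hypothetical file paths):
#   paths = morph_audio("cat.wav", "dog.wav", num_inference_steps=25)
#   len(paths) == 5  # one .wav per morphing frame (num_frames=5)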
def morph_audio_with_morphing_factor(audio_file1, audio_file2, alpha, num_inference_steps, prompt1='', prompt2='', negative_prompt1="Low quality", negative_prompt2="Low quality"):
save_lora_dir = "output"
if os.path.exists(save_lora_dir):
shutil.rmtree(save_lora_dir)
os.makedirs(save_lora_dir, exist_ok=True)
# Load audio and compute duration
waveform1, sample_rate1 = torchaudio.load(audio_file1)
duration1 = waveform1.shape[1] / sample_rate1
waveform2, sample_rate2 = torchaudio.load(audio_file2)
duration2 = waveform2.shape[1] / sample_rate2
# Compare durations and take the shorter one
duration = int(min(duration1, duration2))
    try:
        # Perform morphing using the pipeline
        _ = pipeline(
            dtype=dtype,
            morphing_factor=alpha,
            audio_file=audio_file1,
            audio_file2=audio_file2,
            audio_length_in_s=duration,
            time_pooling=2,
            freq_pooling=2,
            prompt_1=prompt1,
            prompt_2=prompt2,
            negative_prompt_1=negative_prompt1,
            negative_prompt_2=negative_prompt2,
            save_lora_dir=save_lora_dir,
            use_adain=True,
            use_reschedule=False,
            num_inference_steps=num_inference_steps,
            lamd=0.6,
            output_path=save_lora_dir,
            num_frames=5,
            fix_lora=None,
            use_lora=True,
            lora_steps=2,
            noisy_latent_with_lora=True,
            morphing_with_lora=True,
            use_morph_prompt=True,
            guidance_scale=7.5,
        )
        output_paths = os.path.join(save_lora_dir, 'interpolated.wav')
    except RuntimeError as e:
        # On OOM, free what we can before re-raising so the process stays usable
        if "CUDA out of memory" in str(e):
            print("CUDA out of memory. Releasing unused memory...")
            torch.cuda.empty_cache()
            gc.collect()
        raise

    # Release references and free GPU memory before returning (mirrors morph_audio)
    del waveform1, waveform2, _
    torch.cuda.empty_cache()
    gc.collect()
    return output_paths
def cleanup_output_dir():
save_lora_dir = "output"
if os.path.exists(save_lora_dir):
shutil.rmtree(save_lora_dir)
print(f"Cleaned up directory: {save_lora_dir}")
atexit.register(cleanup_output_dir)
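# atexit runs cleanup_output_dir on normal interpreter exit, so intermediate
# .wav files and LoRA weights in "output" do not accumulate between sessions.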
# Gradio interface function
def interface(audio1, audio2, alpha, num_inference_steps):
    output_paths = morph_audio_with_morphing_factor(audio1, audio2, alpha, num_inference_steps)
    return output_paths
# Gradio Interface
with gr.Blocks() as demo:
with gr.Tab("Sound Morphing with fixed frames."):
gr.Markdown("### Upload two audio files for morphing")
with gr.Row():
audio1 = gr.Audio(label="Upload Audio File 1", type="filepath")
audio2 = gr.Audio(label="Upload Audio File 2", type="filepath")
num_inference_steps = gr.Slider(10, 50, step=1, label="Inference Steps", value=50)
outputs = [
gr.Audio(label="Morphing audio 1"),
gr.Audio(label="Morphing audio 2"),
gr.Audio(label="Morphing audio 3"),
gr.Audio(label="Morphing audio 4"),
gr.Audio(label="Morphing audio 5"),
]
submit_btn1 = gr.Button("Submit")
submit_btn1.click(morph_audio, inputs=[audio1, audio2, num_inference_steps], outputs=outputs)
with gr.Tab("Sound Morphing with specified morphing factor."):
gr.Markdown("### Upload two audio files for morphing")
with gr.Row():
audio1 = gr.Audio(label="Upload Audio File 1", type="filepath")
audio2 = gr.Audio(label="Upload Audio File 2", type="filepath")
alpha = gr.Slider(0, 1, step=0.01, label="Interpolation Alpha")
num_inference_steps = gr.Slider(10, 50, step=1, label="Inference Steps", value=50)
outputs=gr.Audio(label="Interpolated Audio")
submit_btn2 = gr.Button("Submit")
submit_btn2.click(morph_audio_with_morphing_factor, inputs=[audio1, audio2, alpha, num_inference_steps], outputs=outputs)
if __name__ == "__main__":
    demo.launch(share=True)