File size: 2,998 Bytes
d57e374
 
4cf73d6
075c9a6
4cf73d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d57e374
4cf73d6
075c9a6
d57e374
1834911
 
d57e374
075c9a6
d57e374
 
 
 
075c9a6
d57e374
 
 
 
075c9a6
d57e374
 
 
 
 
 
 
 
 
 
 
 
075c9a6
d57e374
 
 
 
 
 
 
 
 
 
 
 
 
075c9a6
d57e374
 
 
075c9a6
d57e374
 
075c9a6
d57e374
075c9a6
 
 
 
 
 
 
 
 
 
 
4cf73d6
 
 
 
 
075c9a6
 
d57e374
075c9a6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import torch
import torchaudio
import numpy as np
import gradio as gr
from huggingface_hub import hf_hub_download
# Fetch the two pre-trained AudioMAE checkpoint files into the working
# directory. The original code assigned both return paths to the same
# `model_path` variable (first value silently discarded, never used), and
# passed the deprecated `local_dir_use_symlinks` argument, which current
# huggingface_hub versions ignore with a FutureWarning.
for _checkpoint in ("pretrained.pth", "pytorch_model.bin"):
    hf_hub_download(
        repo_id="DennisHung/Pre-trained_AudioMAE_weights",
        filename=_checkpoint,
        local_dir="./",
    )

# Imported only after the checkpoints exist on disk -- the pipeline module
# presumably reads them during import/initialisation; TODO confirm.
from pipeline.morph_pipeline_successed_ver1 import AudioLDM2MorphPipeline

# Initialize the AudioLDM2 morphing pipeline, on GPU when available.
pipeline = AudioLDM2MorphPipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=torch.float32)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)

# Audio morphing function
def morph_audio(audio_file1, audio_file2, prompt1, prompt2, negative_prompt1="Low quality", negative_prompt2="Low quality"):
    """Morph between two audio clips with the AudioLDM2 morphing pipeline.

    Args:
        audio_file1: Path to the first (source) audio file.
        audio_file2: Path to the second (target) audio file.
        prompt1: Text prompt describing the first clip.
        prompt2: Text prompt describing the second clip.
        negative_prompt1: Negative prompt for the first clip.
        negative_prompt2: Negative prompt for the second clip.

    Returns:
        Sorted list of paths to the generated ``.wav`` morphing frames
        found in the output directory.
    """
    save_lora_dir = "output"
    os.makedirs(save_lora_dir, exist_ok=True)

    # Duration is derived from the FIRST clip only; the pipeline presumably
    # aligns the second clip to it -- TODO confirm against pipeline docs.
    waveform, sample_rate = torchaudio.load(audio_file1)
    # int() truncation would give 0 seconds for sub-second clips, which the
    # pipeline cannot render; clamp to at least one second.
    duration = max(1, int(waveform.shape[1] / sample_rate))

    # Run the morph; outputs are written to disk, return value unused here.
    _ = pipeline(
        audio_file=audio_file1,
        audio_file2=audio_file2,
        audio_length_in_s=duration,
        time_pooling=2,
        freq_pooling=2,
        prompt_1=prompt1,
        prompt_2=prompt2,
        negative_prompt_1=negative_prompt1,
        negative_prompt_2=negative_prompt2,
        save_lora_dir=save_lora_dir,
        use_adain=True,
        use_reschedule=False,
        num_inference_steps=50,
        lamd=0.6,
        output_path=save_lora_dir,
        num_frames=5,
        fix_lora=None,
        use_lora=True,
        lora_steps=50,
        noisy_latent_with_lora=True,
        morphing_with_lora=True,
        use_morph_prompt=True,
        guidance_scale=7.5,
    )

    # os.listdir order is filesystem-dependent; sort so the morph frames are
    # returned in a stable, reproducible order for the UI.
    output_paths = sorted(
        os.path.join(save_lora_dir, file)
        for file in os.listdir(save_lora_dir)
        if file.endswith(".wav")
    )
    return output_paths

# Gradio interface function
def interface(audio1, audio2, prompt1, prompt2):
    """Gradio adapter: run the morph and return exactly five audio paths.

    ``gr.Interface`` below declares five ``gr.Audio`` output components and
    requires exactly that many return values. ``morph_audio`` returns every
    ``.wav`` currently in the output directory, which can differ from five
    (stale files from earlier runs, or a failed partial run), so the list is
    truncated/padded with ``None`` (rendered as an empty player) to match.
    """
    num_outputs = 5  # must match the number of gr.Audio outputs in `demo`
    output_paths = morph_audio(audio1, audio2, prompt1, prompt2)
    padded = list(output_paths[:num_outputs])
    padded += [None] * (num_outputs - len(padded))
    return padded

# Gradio Interface
# Gradio UI: two input clips plus their text prompts in, five morph frames out.
_inputs = [
    gr.Audio(label="Upload Audio File 1", type="filepath"),
    gr.Audio(label="Upload Audio File 2", type="filepath"),
    gr.Textbox(label="Prompt for Audio File 1"),
    gr.Textbox(label="Prompt for Audio File 2"),
]
# One player per morph frame; count matches what `interface` returns.
_outputs = [gr.Audio(label=f"Morphing audio {i}") for i in range(1, 6)]

demo = gr.Interface(fn=interface, inputs=_inputs, outputs=_outputs)

if __name__ == "__main__":
    demo.launch()