CSH-1220 committed on
Commit · 4cf73d6
1 Parent(s): 075c9a6
Update scripts

Files changed:
- app.py +22 -8
- audio_encoder/AudioMAE.py +1 -1
- pipeline/morph_pipeline_successed_ver1.py +15 -13
app.py
CHANGED
@@ -1,10 +1,24 @@
 import os
-import gradio as gr
-import torchaudio
 import torch
+import torchaudio
 import numpy as np
-
+import gradio as gr
+from huggingface_hub import hf_hub_download
+model_path = hf_hub_download(
+    repo_id="DennisHung/Pre-trained_AudioMAE_weights",
+    filename="pretrained.pth",
+    local_dir="./",
+    local_dir_use_symlinks=False
+)
+
+model_path = hf_hub_download(
+    repo_id="DennisHung/Pre-trained_AudioMAE_weights",
+    filename="pytorch_model.bin",
+    local_dir="./",
+    local_dir_use_symlinks=False
+)
 
+from pipeline.morph_pipeline_successed_ver1 import AudioLDM2MorphPipeline
 # Initialize AudioLDM2 Pipeline
 pipeline = AudioLDM2MorphPipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=torch.float32)
 pipeline.to("cuda")
@@ -66,11 +80,11 @@ demo = gr.Interface(
         gr.Textbox(label="Prompt for Audio File 2")
     ],
     outputs=[
-        gr.Audio(label="
-        gr.Audio(label="
-        gr.Audio(label="
-        gr.Audio(label="
-        gr.Audio(label="
+        gr.Audio(label="Morphing audio 1"),
+        gr.Audio(label="Morphing audio 2"),
+        gr.Audio(label="Morphing audio 3"),
+        gr.Audio(label="Morphing audio 4"),
+        gr.Audio(label="Morphing audio 5"),
     ],
 )
 
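The two hf_hub_download calls above differ only in the filename, so the same fetch can be written as a loop. A minimal sketch of that pattern, using the same repo_id and arguments as the diff (the print line is purely illustrative), which pulls both AudioMAE weight files into the working directory so the relative paths 'pretrained.pth' and 'pytorch_model.bin' used later can resolve:

```python
from huggingface_hub import hf_hub_download

# Fetch both AudioMAE checkpoints next to the scripts, as app.py does.
for filename in ("pretrained.pth", "pytorch_model.bin"):
    local_path = hf_hub_download(
        repo_id="DennisHung/Pre-trained_AudioMAE_weights",
        filename=filename,
        local_dir="./",                # download into the working directory
        local_dir_use_symlinks=False,  # copy the real file, not a cache symlink
    )
    print(f"{filename} -> {local_path}")  # illustration only
```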
audio_encoder/AudioMAE.py
CHANGED
@@ -25,7 +25,7 @@ class Vanilla_AudioMAE(nn.Module):
             in_chans=1, audio_exp=True, img_size=(1024, 128)
         )
 
-        checkpoint_path = '
+        checkpoint_path = 'pretrained.pth'
         checkpoint = torch.load(checkpoint_path, map_location='cpu')
         msg = model.load_state_dict(checkpoint['model'], strict=False)
 
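The only change here points checkpoint_path at the freshly downloaded pretrained.pth. Because the load still uses strict=False, key mismatches are reported rather than fatal. A small self-contained illustration of that behaviour, using a hypothetical Tiny module (not the repo's AudioMAE):

```python
import torch
import torch.nn as nn

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(4, 4)

model = Tiny()
# A full MAE checkpoint may carry extra keys (e.g. decoder weights).
state = {
    "encoder.weight": torch.zeros(4, 4),
    "encoder.bias": torch.zeros(4),
    "decoder.weight": torch.zeros(4, 4),  # key the inference model does not define
}
msg = model.load_state_dict(state, strict=False)
print(msg.missing_keys, msg.unexpected_keys)  # -> [] ['decoder.weight']
```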
pipeline/morph_pipeline_successed_ver1.py
CHANGED
@@ -53,8 +53,10 @@ import matplotlib.pyplot as plt
 
 from .pipeline_audioldm2 import AudioLDM2Pipeline
 
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
 pipeline_trained = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=torch.float32)
-pipeline_trained = pipeline_trained.to(
+pipeline_trained = pipeline_trained.to(DEVICE)
 layer_num = 0
 cross = [None, None, 768, 768, 1024, 1024, None, None]
 unet = pipeline_trained.unet
@@ -85,11 +87,11 @@ for name in unet.attn_processors.keys():
             scale=0.5,
             num_tokens=8,
             do_copy=False
-        ).to(
+        ).to(DEVICE, dtype=torch.float32)
     else:
         attn_procs[name] = AttnProcessor2_0()
 
-state_dict = torch.load('
+state_dict = torch.load('pytorch_model.bin', map_location=DEVICE)
 for name, processor in attn_procs.items():
     if hasattr(processor, 'to_v_ip') or hasattr(processor, 'to_k_ip'):
         weight_name_v = name + ".to_v_ip.weight"
@@ -98,7 +100,7 @@ for name, processor in attn_procs.items():
         processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
 
 unet.set_attn_processor(attn_procs)
-unet.to(
+unet.to(DEVICE, dtype=torch.float32)
 
 
 
@@ -902,7 +904,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         ta_kaldi_fbank = extract_kaldi_fbank_feature(waveform, sr, fbank)
         # print("ta_kaldi_fbank.shape",ta_kaldi_fbank.shape)
         mel_spect_tensor = ta_kaldi_fbank.unsqueeze(0)
-        model = AudioMAEConditionCTPoolRand().
+        model = AudioMAEConditionCTPoolRand().to(next(self.unet.parameters()).device)
         model.eval()
         LOA_embed = model(mel_spect_tensor, time_pool=time_pooling, freq_pool=freq_pooling)
         uncond_LOA_embed = model(torch.zeros_like(mel_spect_tensor), time_pool=time_pooling, freq_pool=freq_pooling)
@@ -1130,7 +1132,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                     scale=0.5,
                     num_tokens=8,
                     do_copy=False
-                ).to(
+                ).to(DEVICE, dtype=torch.float32)
             else:
                 attn_procs[name] = AttnProcessor2_0()
 
@@ -1142,13 +1144,13 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                 processor.to_v_ip.weight = torch.nn.Parameter(state_dict[weight_name_v].half())
                 processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
         self.unet.set_attn_processor(attn_procs)
-        self.vae= self.vae.to(
-        self.unet = self.unet.to(
-        self.language_model = self.language_model.to(
-        self.projection_model = self.projection_model.to(
-        self.vocoder = self.vocoder.to(
-        self.text_encoder = self.text_encoder.to(
-        self.text_encoder_2 = self.text_encoder_2.to(
+        self.vae= self.vae.to(DEVICE, dtype=torch.float32)
+        self.unet = self.unet.to(DEVICE, dtype=torch.float32)
+        self.language_model = self.language_model.to(DEVICE, dtype=torch.float32)
+        self.projection_model = self.projection_model.to(DEVICE, dtype=torch.float32)
+        self.vocoder = self.vocoder.to(DEVICE, dtype=torch.float32)
+        self.text_encoder = self.text_encoder.to(DEVICE, dtype=torch.float32)
+        self.text_encoder_2 = self.text_encoder_2.to(DEVICE, dtype=torch.float32)
 
 
 
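The common thread in this file is replacing hard-coded device placement with a module-level DEVICE constant that falls back to CPU when CUDA is unavailable, while the AudioMAE conditioner follows the UNet's device via next(self.unet.parameters()).device. A minimal sketch of that fallback pattern, using a stand-in nn.Linear rather than the actual pipeline modules:

```python
import torch

# Same fallback the commit introduces: prefer CUDA, run on CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Stand-in for the vae/unet/vocoder modules moved with .to(DEVICE, dtype=...).
linear = torch.nn.Linear(8, 8).to(DEVICE, dtype=torch.float32)
x = torch.randn(1, 8, device=DEVICE)
print(linear(x).device)  # cuda:0 on a GPU machine, cpu otherwise

# Following another module's placement, as the AudioMAE conditioner does
# with next(self.unet.parameters()).device:
other = torch.nn.Linear(8, 8).to(next(linear.parameters()).device)
```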