CSH-1220 committed on
Commit 4cf73d6 · 1 Parent(s): 075c9a6

Update scripts

app.py CHANGED
@@ -1,10 +1,24 @@
 import os
-import gradio as gr
-import torchaudio
 import torch
+import torchaudio
 import numpy as np
-from pipeline.morph_pipeline_successed_ver1 import AudioLDM2MorphPipeline
+import gradio as gr
+from huggingface_hub import hf_hub_download
+model_path = hf_hub_download(
+    repo_id="DennisHung/Pre-trained_AudioMAE_weights",
+    filename="pretrained.pth",
+    local_dir="./",
+    local_dir_use_symlinks=False
+)
+
+model_path = hf_hub_download(
+    repo_id="DennisHung/Pre-trained_AudioMAE_weights",
+    filename="pytorch_model.bin",
+    local_dir="./",
+    local_dir_use_symlinks=False
+)
 
+from pipeline.morph_pipeline_successed_ver1 import AudioLDM2MorphPipeline
 # Initialize AudioLDM2 Pipeline
 pipeline = AudioLDM2MorphPipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=torch.float32)
 pipeline.to("cuda")
@@ -66,11 +80,11 @@ demo = gr.Interface(
         gr.Textbox(label="Prompt for Audio File 2")
     ],
     outputs=[
-        gr.Audio(label="Generated Tone 1"),
-        gr.Audio(label="Generated Tone 2"),
-        gr.Audio(label="Generated Tone 3"),
-        gr.Audio(label="Generated Tone 4"),
-        gr.Audio(label="Generated Tone 5"),
+        gr.Audio(label="Morphing audio 1"),
+        gr.Audio(label="Morphing audio 2"),
+        gr.Audio(label="Morphing audio 3"),
+        gr.Audio(label="Morphing audio 4"),
+        gr.Audio(label="Morphing audio 5"),
     ],
 )
 
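For reference, a minimal sketch of the startup download pattern the revised app.py relies on: both checkpoints are fetched from the Hub into the working directory so that the relative paths used later ('pretrained.pth', 'pytorch_model.bin') resolve. It assumes the DennisHung/Pre-trained_AudioMAE_weights repo stays public and the Space has network access.

```python
# Sketch only: mirrors the hf_hub_download calls added in app.py and checks that
# the files land where the rest of the code expects them (the working directory).
import os
from huggingface_hub import hf_hub_download

for filename in ("pretrained.pth", "pytorch_model.bin"):
    path = hf_hub_download(
        repo_id="DennisHung/Pre-trained_AudioMAE_weights",
        filename=filename,
        local_dir="./",
        local_dir_use_symlinks=False,  # copy into ./ instead of symlinking to the HF cache
    )
    assert os.path.isfile(path), f"{filename} was not downloaded"
```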
audio_encoder/AudioMAE.py CHANGED
@@ -25,7 +25,7 @@ class Vanilla_AudioMAE(nn.Module):
             in_chans=1, audio_exp=True, img_size=(1024, 128)
         )
 
-        checkpoint_path = '/Data/home/Dennis/DeepMIR-2024/Final_Project/AP-adapter/pretrained.pth'
+        checkpoint_path = 'pretrained.pth'
         checkpoint = torch.load(checkpoint_path, map_location='cpu')
         msg = model.load_state_dict(checkpoint['model'], strict=False)
 
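The checkpoint path is now relative, so it resolves against the process working directory. A hedged sketch of one way to make the load independent of where the app is launched from (the REPO_ROOT name is illustrative, not part of the commit):

```python
# Sketch only: resolve pretrained.pth against this module's parent directory rather
# than the current working directory; assumes the checkpoint sits in the repo root.
import os
import torch

REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
checkpoint_path = os.path.join(REPO_ROOT, "pretrained.pth")
checkpoint = torch.load(checkpoint_path, map_location="cpu")
print(list(checkpoint.keys()))  # the loader above expects a 'model' key
```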
pipeline/morph_pipeline_successed_ver1.py CHANGED
@@ -53,8 +53,10 @@ import matplotlib.pyplot as plt
 
 from .pipeline_audioldm2 import AudioLDM2Pipeline
 
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
 pipeline_trained = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=torch.float32)
-pipeline_trained = pipeline_trained.to("cuda")
+pipeline_trained = pipeline_trained.to(DEVICE)
 layer_num = 0
 cross = [None, None, 768, 768, 1024, 1024, None, None]
 unet = pipeline_trained.unet
@@ -85,11 +87,11 @@ for name in unet.attn_processors.keys():
             scale=0.5,
             num_tokens=8,
             do_copy=False
-        ).to("cuda", dtype=torch.float32)
+        ).to(DEVICE, dtype=torch.float32)
     else:
         attn_procs[name] = AttnProcessor2_0()
 
-state_dict = torch.load('/Data/home/Dennis/DeepMIR-2024/Final_Project/AP-adapter/pytorch_model.bin', map_location="cuda")
+state_dict = torch.load('pytorch_model.bin', map_location=DEVICE)
 for name, processor in attn_procs.items():
     if hasattr(processor, 'to_v_ip') or hasattr(processor, 'to_k_ip'):
         weight_name_v = name + ".to_v_ip.weight"
@@ -98,7 +100,7 @@ for name, processor in attn_procs.items():
         processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
 
 unet.set_attn_processor(attn_procs)
-unet.to("cuda", dtype=torch.float32)
+unet.to(DEVICE, dtype=torch.float32)
 
 
 
@@ -902,7 +904,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         ta_kaldi_fbank = extract_kaldi_fbank_feature(waveform, sr, fbank)
         # print("ta_kaldi_fbank.shape",ta_kaldi_fbank.shape)
         mel_spect_tensor = ta_kaldi_fbank.unsqueeze(0)
-        model = AudioMAEConditionCTPoolRand().cuda()
+        model = AudioMAEConditionCTPoolRand().to(next(self.unet.parameters()).device)
         model.eval()
         LOA_embed = model(mel_spect_tensor, time_pool=time_pooling, freq_pool=freq_pooling)
         uncond_LOA_embed = model(torch.zeros_like(mel_spect_tensor), time_pool=time_pooling, freq_pool=freq_pooling)
@@ -1130,7 +1132,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                     scale=0.5,
                     num_tokens=8,
                     do_copy=False
-                ).to("cuda", dtype=torch.float32)
+                ).to(DEVICE, dtype=torch.float32)
             else:
                 attn_procs[name] = AttnProcessor2_0()
 
@@ -1142,13 +1144,13 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                 processor.to_v_ip.weight = torch.nn.Parameter(state_dict[weight_name_v].half())
                 processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
         self.unet.set_attn_processor(attn_procs)
-        self.vae= self.vae.to("cuda", dtype=torch.float32)
-        self.unet = self.unet.to("cuda", dtype=torch.float32)
-        self.language_model = self.language_model.to("cuda", dtype=torch.float32)
-        self.projection_model = self.projection_model.to("cuda", dtype=torch.float32)
-        self.vocoder = self.vocoder.to("cuda", dtype=torch.float32)
-        self.text_encoder = self.text_encoder.to("cuda", dtype=torch.float32)
-        self.text_encoder_2 = self.text_encoder_2.to("cuda", dtype=torch.float32)
+        self.vae= self.vae.to(DEVICE, dtype=torch.float32)
+        self.unet = self.unet.to(DEVICE, dtype=torch.float32)
+        self.language_model = self.language_model.to(DEVICE, dtype=torch.float32)
+        self.projection_model = self.projection_model.to(DEVICE, dtype=torch.float32)
+        self.vocoder = self.vocoder.to(DEVICE, dtype=torch.float32)
+        self.text_encoder = self.text_encoder.to(DEVICE, dtype=torch.float32)
+        self.text_encoder_2 = self.text_encoder_2.to(DEVICE, dtype=torch.float32)
 
 
 
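The changes above replace the hard-coded "cuda" placements with a DEVICE fallback and attach the AudioMAE conditioner to whatever device the UNet is on. A minimal, self-contained sketch of that pattern (the Linear modules are stand-ins for the real pipeline components, not part of the commit):

```python
# Sketch only: CUDA-with-CPU-fallback device selection, plus keeping an auxiliary
# module on the same device as the main model so tensors never cross devices.
import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

unet = torch.nn.Linear(8, 8).to(DEVICE)   # stand-in for pipeline_trained.unet
conditioner = torch.nn.Linear(8, 8)       # stand-in for AudioMAEConditionCTPoolRand()
conditioner = conditioner.to(next(unet.parameters()).device)

x = torch.randn(1, 8, device=next(unet.parameters()).device)
print(conditioner(x).device)  # matches the UNet's device on both GPU and CPU-only hosts
```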