CSH-1220 committed on
Commit · 4cf73d6
1 Parent(s): 075c9a6
Update scripts

Files changed:
- app.py +22 -8
- audio_encoder/AudioMAE.py +1 -1
- pipeline/morph_pipeline_successed_ver1.py +15 -13
app.py
CHANGED
@@ -1,10 +1,24 @@
 import os
-import gradio as gr
-import torchaudio
 import torch
+import torchaudio
 import numpy as np
-
+import gradio as gr
+from huggingface_hub import hf_hub_download
+model_path = hf_hub_download(
+    repo_id="DennisHung/Pre-trained_AudioMAE_weights",
+    filename="pretrained.pth",
+    local_dir="./",
+    local_dir_use_symlinks=False
+)
+
+model_path = hf_hub_download(
+    repo_id="DennisHung/Pre-trained_AudioMAE_weights",
+    filename="pytorch_model.bin",
+    local_dir="./",
+    local_dir_use_symlinks=False
+)
 
+from pipeline.morph_pipeline_successed_ver1 import AudioLDM2MorphPipeline
 # Initialize AudioLDM2 Pipeline
 pipeline = AudioLDM2MorphPipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=torch.float32)
 pipeline.to("cuda")
@@ -66,11 +80,11 @@ demo = gr.Interface(
         gr.Textbox(label="Prompt for Audio File 2")
     ],
     outputs=[
-        gr.Audio(label="
-        gr.Audio(label="
-        gr.Audio(label="
-        gr.Audio(label="
-        gr.Audio(label="
+        gr.Audio(label="Morphing audio 1"),
+        gr.Audio(label="Morphing audio 2"),
+        gr.Audio(label="Morphing audio 3"),
+        gr.Audio(label="Morphing audio 4"),
+        gr.Audio(label="Morphing audio 5"),
     ],
 )
 
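The two hf_hub_download calls above differ only in the filename, so the same fetch can be written as a loop. A minimal sketch of that pattern, using the same repo_id and arguments as the diff (the print line is purely illustrative), which pulls both AudioMAE weight files into the working directory so the relative paths 'pretrained.pth' and 'pytorch_model.bin' used later can resolve:

```python
from huggingface_hub import hf_hub_download

# Fetch both AudioMAE checkpoints next to the scripts, as app.py does.
for filename in ("pretrained.pth", "pytorch_model.bin"):
    local_path = hf_hub_download(
        repo_id="DennisHung/Pre-trained_AudioMAE_weights",
        filename=filename,
        local_dir="./",                # download into the working directory
        local_dir_use_symlinks=False,  # copy the real file, not a cache symlink
    )
    print(f"{filename} -> {local_path}")  # illustration only
```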
audio_encoder/AudioMAE.py
CHANGED
@@ -25,7 +25,7 @@ class Vanilla_AudioMAE(nn.Module):
             in_chans=1, audio_exp=True, img_size=(1024, 128)
         )
 
-        checkpoint_path = '
+        checkpoint_path = 'pretrained.pth'
         checkpoint = torch.load(checkpoint_path, map_location='cpu')
         msg = model.load_state_dict(checkpoint['model'], strict=False)
 
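The only change here points checkpoint_path at the freshly downloaded pretrained.pth. Because the load still uses strict=False, key mismatches are reported rather than fatal. A small self-contained illustration of that behaviour, using a hypothetical Tiny module (not the repo's AudioMAE):

```python
import torch
import torch.nn as nn

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(4, 4)

model = Tiny()
# A full MAE checkpoint may carry extra keys (e.g. decoder weights).
state = {
    "encoder.weight": torch.zeros(4, 4),
    "encoder.bias": torch.zeros(4),
    "decoder.weight": torch.zeros(4, 4),  # key the inference model does not define
}
msg = model.load_state_dict(state, strict=False)
print(msg.missing_keys, msg.unexpected_keys)  # -> [] ['decoder.weight']
```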
pipeline/morph_pipeline_successed_ver1.py
CHANGED
@@ -53,8 +53,10 @@ import matplotlib.pyplot as plt
 
 from .pipeline_audioldm2 import AudioLDM2Pipeline
 
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
 pipeline_trained = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=torch.float32)
-pipeline_trained = pipeline_trained.to(
+pipeline_trained = pipeline_trained.to(DEVICE)
 layer_num = 0
 cross = [None, None, 768, 768, 1024, 1024, None, None]
 unet = pipeline_trained.unet
@@ -85,11 +87,11 @@ for name in unet.attn_processors.keys():
             scale=0.5,
             num_tokens=8,
             do_copy=False
-        ).to(
+        ).to(DEVICE, dtype=torch.float32)
     else:
         attn_procs[name] = AttnProcessor2_0()
 
-state_dict = torch.load('
+state_dict = torch.load('pytorch_model.bin', map_location=DEVICE)
 for name, processor in attn_procs.items():
     if hasattr(processor, 'to_v_ip') or hasattr(processor, 'to_k_ip'):
         weight_name_v = name + ".to_v_ip.weight"
@@ -98,7 +100,7 @@ for name, processor in attn_procs.items():
         processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
 
 unet.set_attn_processor(attn_procs)
-unet.to(
+unet.to(DEVICE, dtype=torch.float32)
 
 
 
@@ -902,7 +904,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         ta_kaldi_fbank = extract_kaldi_fbank_feature(waveform, sr, fbank)
         # print("ta_kaldi_fbank.shape",ta_kaldi_fbank.shape)
         mel_spect_tensor = ta_kaldi_fbank.unsqueeze(0)
-        model = AudioMAEConditionCTPoolRand().
+        model = AudioMAEConditionCTPoolRand().to(next(self.unet.parameters()).device)
         model.eval()
         LOA_embed = model(mel_spect_tensor, time_pool=time_pooling, freq_pool=freq_pooling)
         uncond_LOA_embed = model(torch.zeros_like(mel_spect_tensor), time_pool=time_pooling, freq_pool=freq_pooling)
@@ -1130,7 +1132,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                     scale=0.5,
                     num_tokens=8,
                     do_copy=False
-                ).to(
+                ).to(DEVICE, dtype=torch.float32)
             else:
                 attn_procs[name] = AttnProcessor2_0()
 
@@ -1142,13 +1144,13 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                 processor.to_v_ip.weight = torch.nn.Parameter(state_dict[weight_name_v].half())
                 processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
         self.unet.set_attn_processor(attn_procs)
-        self.vae= self.vae.to(
-        self.unet = self.unet.to(
-        self.language_model = self.language_model.to(
-        self.projection_model = self.projection_model.to(
-        self.vocoder = self.vocoder.to(
-        self.text_encoder = self.text_encoder.to(
-        self.text_encoder_2 = self.text_encoder_2.to(
+        self.vae= self.vae.to(DEVICE, dtype=torch.float32)
+        self.unet = self.unet.to(DEVICE, dtype=torch.float32)
+        self.language_model = self.language_model.to(DEVICE, dtype=torch.float32)
+        self.projection_model = self.projection_model.to(DEVICE, dtype=torch.float32)
+        self.vocoder = self.vocoder.to(DEVICE, dtype=torch.float32)
+        self.text_encoder = self.text_encoder.to(DEVICE, dtype=torch.float32)
+        self.text_encoder_2 = self.text_encoder_2.to(DEVICE, dtype=torch.float32)
 
 
 
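The common thread in this file is replacing hard-coded device placement with a module-level DEVICE constant that falls back to CPU when CUDA is unavailable, while the AudioMAE conditioner follows the UNet's device via next(self.unet.parameters()).device. A minimal sketch of that fallback pattern, using a stand-in nn.Linear rather than the actual pipeline modules:

```python
import torch

# Same fallback the commit introduces: prefer CUDA, run on CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Stand-in for the vae/unet/vocoder modules moved with .to(DEVICE, dtype=...).
linear = torch.nn.Linear(8, 8).to(DEVICE, dtype=torch.float32)
x = torch.randn(1, 8, device=DEVICE)
print(linear(x).device)  # cuda:0 on a GPU machine, cpu otherwise

# Following another module's placement, as the AudioMAE conditioner does
# with next(self.unet.parameters()).device:
other = torch.nn.Linear(8, 8).to(next(linear.parameters()).device)
```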