Spaces:
Runtime error
Runtime error
guyyariv
committed on
Commit
·
56d047b
1
Parent(s):
5aeb32a
AudioTokenDemo
Browse files
- app.py +10 -12
- assets/electric guitar.wav +0 -0
- assets/female singer.wav +0 -0
app.py
CHANGED
@@ -35,7 +35,7 @@ class AudioTokenWrapper(torch.nn.Module):
|
|
35 |
)
|
36 |
|
37 |
checkpoint = torch.load(
|
38 |
-
'
|
39 |
cfg = BEATsConfig(checkpoint['cfg'])
|
40 |
self.aud_encoder = BEATs(cfg)
|
41 |
self.aud_encoder.load_state_dict(checkpoint['model'])
|
@@ -69,12 +69,12 @@ class AudioTokenWrapper(torch.nn.Module):
|
|
69 |
self.unet.set_attn_processor(lora_attn_procs)
|
70 |
self.lora_layers = AttnProcsLayers(self.unet.attn_processors)
|
71 |
self.lora_layers.eval()
|
72 |
-
lora_layers_learned_embeds = '
|
73 |
self.lora_layers.load_state_dict(torch.load(lora_layers_learned_embeds, map_location=device))
|
74 |
self.unet.load_attn_procs(lora_layers_learned_embeds)
|
75 |
|
76 |
self.embedder.eval()
|
77 |
-
embedder_learned_embeds = '
|
78 |
self.embedder.load_state_dict(torch.load(embedder_learned_embeds, map_location=device))
|
79 |
|
80 |
self.placeholder_token = '<*>'
|
@@ -111,27 +111,25 @@ def greet(audio):
|
|
111 |
image = pipeline(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
|
112 |
return image
|
113 |
|
114 |
-
description = """
|
115 |
-
This is a demo of [AudioToken: Adaptation of Text-Conditioned Diffusion Models for Audio-to-Image Generation](https://pages.cs.huji.ac.il/adiyoss-lab/AudioToken/)
|
116 |
-
"""
|
117 |
-
|
118 |
|
119 |
if __name__ == "__main__":
|
120 |
|
121 |
lora = True
|
122 |
-
device =
|
123 |
model = AudioTokenWrapper(lora, device)
|
124 |
|
125 |
description = """<p>
|
126 |
-
This is a demo of <a href='https://pages.cs.huji.ac.il/adiyoss-lab/AudioToken' target='_blank'>AudioToken: Adaptation of Text-Conditioned Diffusion Models for Audio-to-Image Generation</a><br
|
127 |
-
|
128 |
For more information, please see the original <a href='https://arxiv.org/abs/2305.13050' target='_blank'>paper</a> and <a href='https://github.com/guyyariv/AudioToken' target='_blank'>repo</a>.
|
129 |
</p>"""
|
130 |
|
131 |
examples = [
|
132 |
["assets/train.wav"],
|
133 |
["assets/dog barking.wav"],
|
134 |
-
["assets/airplane.wav"]
|
|
|
|
|
135 |
]
|
136 |
|
137 |
demo = gr.Interface(
|
@@ -140,7 +138,7 @@ if __name__ == "__main__":
|
|
140 |
outputs="image",
|
141 |
title='AudioToken',
|
142 |
description=description,
|
143 |
-
|
144 |
)
|
145 |
demo.launch()
|
146 |
|
|
|
35 |
)
|
36 |
|
37 |
checkpoint = torch.load(
|
38 |
+
'BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt')
|
39 |
cfg = BEATsConfig(checkpoint['cfg'])
|
40 |
self.aud_encoder = BEATs(cfg)
|
41 |
self.aud_encoder.load_state_dict(checkpoint['model'])
|
|
|
69 |
self.unet.set_attn_processor(lora_attn_procs)
|
70 |
self.lora_layers = AttnProcsLayers(self.unet.attn_processors)
|
71 |
self.lora_layers.eval()
|
72 |
+
lora_layers_learned_embeds = 'sd1_lora_qi_lora_layers_learned_embeds-40000.bin'
|
73 |
self.lora_layers.load_state_dict(torch.load(lora_layers_learned_embeds, map_location=device))
|
74 |
self.unet.load_attn_procs(lora_layers_learned_embeds)
|
75 |
|
76 |
self.embedder.eval()
|
77 |
+
embedder_learned_embeds = 'sd1_lora_qi_learned_embeds-40000.bin'
|
78 |
self.embedder.load_state_dict(torch.load(embedder_learned_embeds, map_location=device))
|
79 |
|
80 |
self.placeholder_token = '<*>'
|
|
|
111 |
image = pipeline(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
|
112 |
return image
|
113 |
|
|
|
|
|
|
|
|
|
114 |
|
115 |
if __name__ == "__main__":
|
116 |
|
117 |
lora = True
|
118 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
119 |
model = AudioTokenWrapper(lora, device)
|
120 |
|
121 |
description = """<p>
|
122 |
+
This is a demo of <a href='https://pages.cs.huji.ac.il/adiyoss-lab/AudioToken' target='_blank'>AudioToken: Adaptation of Text-Conditioned Diffusion Models for Audio-to-Image Generation</a>.<br><br>
|
123 |
+
In recent years, image generation has shown a great leap in performance, where diffusion models play a central role. Although generating high-quality images, such models are mainly conditioned on textual descriptions. This begs the question: "how can we adopt such models to be conditioned on other modalities?". We propose a novel method utilizing latent diffusion models trained for text-to-image-generation to generate images conditioned on audio recordings. Using a pre-trained audio encoding model, the proposed method encodes audio into a new token, which can be considered as an adaptation layer between the audio and text representations. Such a modeling paradigm requires a small number of trainable parameters, making the proposed approach appealing for lightweight optimization.<br><br>
|
124 |
For more information, please see the original <a href='https://arxiv.org/abs/2305.13050' target='_blank'>paper</a> and <a href='https://github.com/guyyariv/AudioToken' target='_blank'>repo</a>.
|
125 |
</p>"""
|
126 |
|
127 |
examples = [
|
128 |
["assets/train.wav"],
|
129 |
["assets/dog barking.wav"],
|
130 |
+
["assets/airplane.wav"],
|
131 |
+
["assets/electric guitar.wav"],
|
132 |
+
["assets/female singer.wav"],
|
133 |
]
|
134 |
|
135 |
demo = gr.Interface(
|
|
|
138 |
outputs="image",
|
139 |
title='AudioToken',
|
140 |
description=description,
|
141 |
+
examples=examples
|
142 |
)
|
143 |
demo.launch()
|
144 |
|
assets/electric guitar.wav
ADDED
Binary file (320 kB). View file
|
|
assets/female singer.wav
ADDED
Binary file (320 kB). View file
|
|