# Anything2Image / app.py
# laizeqiang
# First model version
# c43b0d6
# raw
# history blame
# 1.04 kB
import os
import tempfile

import gradio as gr
import imagebind
import soundfile as sf
import torch
from diffusers import StableUnCLIPImg2ImgPipeline
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Stable unCLIP image-variation pipeline, loaded in half precision.
# The keyword that selects the fp16 weight files is ``variant``; the previous
# spelling ``variation`` is not a ``from_pretrained`` option and did not
# select them.
# NOTE(review): float16 is used even on the CPU fallback; half-precision
# inference on CPU may be unsupported or slow — confirm if CPU matters.
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe = pipe.to(device)

# Pretrained ImageBind encoder, switched to eval mode and moved to the same
# device as the diffusion pipeline.
model = imagebind.imagebind_huge(pretrained=True)
model.eval()
model.to(device)
@torch.no_grad()
def anything2img(prompt, audio):
    """Generate an image from a text prompt and an audio clip.

    The audio is embedded with the module-level ImageBind ``model`` and the
    resulting embedding is passed to the module-level unCLIP ``pipe`` as
    ``image_embeds`` together with ``prompt``.

    Args:
        prompt: Text prompt forwarded to the diffusion pipeline.
        audio: A ``(sample_rate, waveform)`` pair, unpacked as such below
            (presumably Gradio's numpy audio format — confirm against the UI).

    Returns:
        The first image in the pipeline's ``images`` output.

    Raises:
        gr.Error: If no audio clip was supplied.
    """
    # Without this guard a missing clip fails on the tuple unpack below with
    # an opaque TypeError.
    if audio is None:
        raise gr.Error("Please provide an audio clip.")
    sr, waveform = audio

    # The ImageBind loader takes file paths, so the clip goes through a WAV
    # file. A per-call temporary directory keeps concurrent requests from
    # overwriting each other's audio and removes the file afterwards.
    # NOTE(review): assumes the loader reads the file eagerly, before the
    # directory is cleaned up on leaving the ``with`` block — confirm.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = os.path.join(tmp_dir, "input.wav")
        sf.write(audio_path, waveform, sr)
        audio_inputs = imagebind.load_and_transform_audio_data(
            [audio_path], device
        )

    embeddings = model.forward({imagebind.ModalityType.AUDIO: audio_inputs})
    audio_embeds = embeddings[imagebind.ModalityType.AUDIO]

    # The pipeline was loaded in float16, so the embedding is cast to half.
    images = pipe(prompt=prompt, image_embeds=audio_embeds.half()).images
    return images[0]
# Gradio UI: a text box and an audio input feed ``anything2img``; the result
# is shown as an image.
demo = gr.Interface(
    fn=anything2img,
    inputs=["text", "audio"],
    outputs="image",
)

# Listen on all interfaces at port 10051 and request a public share link.
demo.launch(
    server_name="0.0.0.0",
    server_port=10051,
    share=True,
)