import streamlit as st
import torch
from diffusers import AudioLDM2Pipeline

# make the Space compatible with CPU duplicates
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32


# load the diffusers pipeline once and cache it across Streamlit reruns
@st.cache_resource
def load_pipeline():
    repo_id = "cvssp/audioldm2"
    return AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)


pipe = load_pipeline()

# set the generator for reproducibility
generator = torch.Generator(device)


def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
    # st.text_input returns an empty string (not None) when cleared
    if not text:
        st.error("Please provide a text input.")
        return

    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=200,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]

    # when several waveforms are generated per prompt, the pipeline ranks them
    # against the text, so the first waveform is the best candidate
    st.audio(waveforms[0], format="audio/wav", sample_rate=16000)


# Streamlit UI
st.title("AudioLDM 2: A General Framework for Audio, Music, and Speech Generation")
st.markdown(
    "[Paper](https://arxiv.org/abs/2308.05734) "
    "[Project Page](https://audioldm.github.io/audioldm2) "
    "[Diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2)"
)
st.markdown(
    "This is the demo for AudioLDM 2, powered by 🧨 Diffusers. For faster inference "
    "without waiting in a queue, you can duplicate the Space and upgrade to a GPU "
    "in the settings."
)

st.markdown("### Input")
text = st.text_input("Input text", "The vibrant beat of Brazilian samba drums")
negative_prompt = st.text_input("Negative prompt", "Low quality")

st.markdown("### Configuration")
duration = st.slider("Duration (seconds)", 5.0, 15.0, 10.0, step=2.5)
guidance_scale = st.slider("Guidance scale", 0.0, 7.0, 3.5, step=0.5)
n_candidates = st.slider("Number of waveforms to generate", 1, 5, 3, step=1)
random_seed = st.number_input("Seed", min_value=1, max_value=100, value=45, step=1)

if st.button("Submit"):
    text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates)
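
# Usage sketch (not part of the app): the filename app.py and the exact
# dependency list below are assumptions, not something specified by this file.
# AudioLDM 2 in diffusers additionally pulls in transformers for its text
# encoders and vocoder, so a local run would look roughly like:
#
#     pip install streamlit torch diffusers transformers accelerate
#     streamlit run app.py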