File size: 2,153 Bytes
0c6352e
0767df8
0c6352e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e95c13c
0c6352e
 
 
9f8404c
0c6352e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d89757b
7229c23
0b3533e
7229c23
0c6352e
 
eea388c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import streamlit as st
import torch
from diffusers import AudioLDM2Pipeline

# Make the Space compatible with CPU duplicates: use half precision on GPU,
# and fall back to full precision when only a CPU is available.
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Hub repository of the diffusers pipeline to load.
repo_id = "cvssp/audioldm2"


@st.cache_resource
def _load_pipeline():
    """Load the AudioLDM 2 pipeline and move it to the selected device.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the pipeline as a shared resource keeps the model
    from being re-loaded on each rerun.
    """
    return AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)


pipe = _load_pipeline()

# Generator used for reproducibility; it is re-seeded on every generation
# request, so creating a fresh one per rerun is cheap and harmless.
generator = torch.Generator(device)


def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
    """Generate audio for a text prompt and render it in the Streamlit page.

    Args:
        text: Prompt describing the audio to generate.
        negative_prompt: Text passed to the pipeline as ``negative_prompt``.
        duration: Passed to the pipeline as ``audio_length_in_s``.
        guidance_scale: Passed to the pipeline as ``guidance_scale``.
        random_seed: Seed for the shared generator; converted with ``int()``.
        n_candidates: Number of waveforms requested per prompt; converted
            with ``int()``, and treated as 1 when falsy.

    Returns:
        None. On success an audio player is added to the page; when the
        prompt is missing or blank an error message is shown instead and
        the pipeline is not run.
    """
    # st.text_input yields "" (never None) for an empty box, so a plain
    # `is None` test would let blank prompts through to the pipeline.
    if text is None or not text.strip():
        st.error("Please provide a text input.")
        return

    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=200,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
        # Re-seed the shared generator so the same seed reproduces the result.
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]

    # Only the first returned waveform is played.
    # NOTE(review): 16000 is presumably the model's output sample rate - confirm.
    st.audio(waveforms[0], format="audio/wav", sample_rate=16000)


# Streamlit UI: page header, prompt inputs, generation settings and the
# submit button that triggers audio generation.
st.title("AudioLDM 2: A General Framework for Audio, Music, and Speech Generation")

st.markdown(
    "[Paper](https://arxiv.org/abs/2308.05734) [Project Page](https://audioldm.github.io/audioldm2) [Diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2)"
)

st.markdown("This is the demo for AudioLDM 2, powered by 🧨 Diffusers. For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings.")

st.markdown("### Input")
text = st.text_input("Input text", "The vibrant beat of Brazilian samba drums")
negative_prompt = st.text_input("Negative prompt", "Low quality")

st.markdown("### Configuration")
duration = st.slider("Duration (seconds)", 5.0, 15.0, 10.0, step=2.5)
guidance_scale = st.slider("Guidance scale", 0.0, 7.0, 3.5, step=0.5)
# The waveform count and the seed are whole numbers, so use integer widgets:
# with float bounds the seed input steps by fractions that are truncated
# away by int() downstream, and both values render with decimals.
n_candidates = st.slider("Number waveforms to generate", 1, 5, 3, step=1)
random_seed = st.number_input("Seed", 1, 100, 45, step=1)

# Generation only runs when the button is pressed on this rerun.
if st.button("Submit"):
    text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates)