DanLeBossDeESGI committed on
Commit 27ab8aa · 1 Parent(s): 43dce03

Update app.py

Files changed (1)
  1. app.py +49 -37
app.py CHANGED
@@ -1,6 +1,8 @@
 import streamlit as st
 import torch
-from diffusers import AudioLDM2Pipeline
+from diffusers import AudioLDMPipeline
+from share_btn import community_icon_html, loading_icon_html, share_js
+from transformers import AutoProcessor, ClapModel
 
 # make Space compatible with CPU duplicates
 if torch.cuda.is_available():
@@ -11,49 +13,59 @@ else:
     torch_dtype = torch.float32
 
 # load the diffusers pipeline
-repo_id = "cvssp/audioldm2"
-pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
+repo_id = "cvssp/audioldm-m-full"
+pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
+pipe.unet = torch.compile(pipe.unet)
 
-# set the generator for reproducibility
-generator = torch.Generator(device)
-
-
-def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
-    if text is None:
-        st.error("Please provide a text input.")
-        return
+# CLAP model (only required for automatic scoring)
+clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
+processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
 
-    waveforms = pipe(
-        text,
-        audio_length_in_s=duration,
-        guidance_scale=guidance_scale,
-        num_inference_steps=200,
-        negative_prompt=negative_prompt,
-        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
-        generator=generator.manual_seed(int(random_seed)),
-    )["audios"]
+generator = torch.Generator(device)
 
-    st.audio(waveforms[0], format="audio/wav", sample_rate=16000)
+# Streamlit app setup
+st.set_page_config(
+    page_title="Text to Music",
+    page_icon="🎵",
+)
 
+text_input = st.text_input("Input text", "A hammer is hitting a wooden surface")
+negative_prompt = st.text_input("Negative prompt", "low quality, average quality")
 
-# Streamlit UI
-st.title("AudioLDM 2: A General Framework for Audio, Music, and Speech Generation")
+st.markdown("### Configuration")
+seed = st.number_input("Seed", value=45)
+duration = st.slider("Duration (seconds)", 2.5, 10.0, 5.0, 2.5)
+guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
+n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
 
-st.markdown(
-    "[Paper](https://arxiv.org/abs/2308.05734) [Project Page](https://audioldm.github.io/audioldm2) [Diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2)"
-)
+if st.button("Submit"):
+    if text_input is None:
+        st.error("Please provide a text input.")
+    else:
+        waveforms = pipe(
+            text_input,
+            audio_length_in_s=duration,
+            guidance_scale=guidance_scale,
+            num_inference_steps=100,
+            negative_prompt=negative_prompt,
+            num_waveforms_per_prompt=n_candidates if n_candidates else 1,
+            generator=generator.manual_seed(int(seed)),
+        )["audios"]
 
-st.markdown("This is the demo for AudioLDM 2, powered by 🧨 Diffusers. For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings.")
+        if waveforms.shape[0] > 1:
+            waveform = score_waveforms(text_input, waveforms)
+        else:
+            waveform = waveforms[0]
 
-st.markdown("### Input")
-text = st.text_input("Input text", "The vibrant beat of Brazilian samba drums")
-negative_prompt = st.text_input("Negative prompt", "Low quality")
+        st.audio(waveform, format="audio/wav")
 
-st.markdown("### Configuration")
-duration = st.slider("Duration (seconds)", 5.0, 15.0, 10.0, step=2.5)
-guidance_scale = st.slider("Guidance scale", 0.0, 7.0, 3.5, step=0.5)
-n_candidates = st.slider("Number waveforms to generate", 1.0, 5.0, 3.0, step=1.0)
-random_seed = st.number_input("Seed", 1.0, 100.0, 45.0)
 
-if st.button("Submit"):
-    text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates)
+def score_waveforms(text, waveforms):
+    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
+    inputs = {key: inputs[key].to(device) for key in inputs}
+    with torch.no_grad():
+        logits_per_text = clap_model(**inputs).logits_per_text  # this is the audio-text similarity score
+    probs = logits_per_text.softmax(dim=-1)  # we can take the softmax to get the label probabilities
+    most_probable = torch.argmax(probs)  # and now select the most likely audio waveform
    waveform = waveforms[most_probable]
    return waveform
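
The automatic scoring added in this commit ranks the candidate waveforms with CLAP: the text prompt and each generated audio are embedded, and the waveform with the highest text-audio similarity logit is kept. Below is a minimal standalone sketch of that ranking step under stated assumptions: it reuses the CLAP checkpoint named in the diff, but the prompt and the random 16 kHz, 5-second waveforms are placeholders standing in for real pipeline outputs.

import numpy as np
import torch
from transformers import AutoProcessor, ClapModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# same CLAP checkpoint the app loads for scoring
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")

prompt = "A hammer is hitting a wooden surface"  # placeholder prompt
# placeholder candidates: three random 5-second waveforms at 16 kHz
candidates = [np.random.randn(16000 * 5).astype(np.float32) for _ in range(3)]

inputs = processor(text=prompt, audios=candidates, return_tensors="pt", padding=True)
inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

with torch.no_grad():
    # logits_per_text has shape (num_texts, num_audios); higher means a closer text-audio match
    logits_per_text = clap_model(**inputs).logits_per_text

best = int(logits_per_text.softmax(dim=-1).argmax())
print(f"best candidate index: {best}")

The softmax is only there to mirror the app's score_waveforms helper; taking the argmax of the raw logits selects the same waveform.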