DanLeBossDeESGI committed on
Commit 27ab8aa · 1 Parent(s): 43dce03

Update app.py

Files changed (1)
  1. app.py +49 -37
app.py CHANGED
@@ -1,6 +1,8 @@
 import streamlit as st
 import torch
-from diffusers import AudioLDM2Pipeline
+from diffusers import AudioLDMPipeline
+from share_btn import community_icon_html, loading_icon_html, share_js
+from transformers import AutoProcessor, ClapModel
 
 # make Space compatible with CPU duplicates
 if torch.cuda.is_available():
@@ -11,49 +13,59 @@ else:
     torch_dtype = torch.float32
 
 # load the diffusers pipeline
-repo_id = "cvssp/audioldm2"
-pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
+repo_id = "cvssp/audioldm-m-full"
+pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
+pipe.unet = torch.compile(pipe.unet)
 
-# set the generator for reproducibility
-generator = torch.Generator(device)
-
-
-def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
-    if text is None:
-        st.error("Please provide a text input.")
-        return
+# CLAP model (only required for automatic scoring)
+clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
+processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
 
-    waveforms = pipe(
-        text,
-        audio_length_in_s=duration,
-        guidance_scale=guidance_scale,
-        num_inference_steps=200,
-        negative_prompt=negative_prompt,
-        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
-        generator=generator.manual_seed(int(random_seed)),
-    )["audios"]
+generator = torch.Generator(device)
 
-    st.audio(waveforms[0], format="audio/wav", sample_rate=16000)
+# Streamlit app setup
+st.set_page_config(
+    page_title="Text to Music",
+    page_icon="🎵",
+)
 
+text_input = st.text_input("Input text", "A hammer is hitting a wooden surface")
+negative_prompt = st.text_input("Negative prompt", "low quality, average quality")
 
-# Streamlit UI
-st.title("AudioLDM 2: A General Framework for Audio, Music, and Speech Generation")
+st.markdown("### Configuration")
+seed = st.number_input("Seed", value=45)
+duration = st.slider("Duration (seconds)", 2.5, 10.0, 5.0, 2.5)
+guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
+n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
 
-st.markdown(
-    "[Paper](https://arxiv.org/abs/2308.05734) [Project Page](https://audioldm.github.io/audioldm2) [Diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2)"
-)
+if st.button("Submit"):
+    if text_input is None:
+        st.error("Please provide a text input.")
+    else:
+        waveforms = pipe(
+            text_input,
+            audio_length_in_s=duration,
+            guidance_scale=guidance_scale,
+            num_inference_steps=100,
+            negative_prompt=negative_prompt,
+            num_waveforms_per_prompt=n_candidates if n_candidates else 1,
+            generator=generator.manual_seed(int(seed)),
+        )["audios"]
 
-st.markdown("This is the demo for AudioLDM 2, powered by 🧨 Diffusers. For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings.")
+        if waveforms.shape[0] > 1:
+            waveform = score_waveforms(text_input, waveforms)
+        else:
+            waveform = waveforms[0]
 
-st.markdown("### Input")
-text = st.text_input("Input text", "The vibrant beat of Brazilian samba drums")
-negative_prompt = st.text_input("Negative prompt", "Low quality")
+        st.audio(waveform, format="audio/wav")
 
-st.markdown("### Configuration")
-duration = st.slider("Duration (seconds)", 5.0, 15.0, 10.0, step=2.5)
-guidance_scale = st.slider("Guidance scale", 0.0, 7.0, 3.5, step=0.5)
-n_candidates = st.slider("Number waveforms to generate", 1.0, 5.0, 3.0, step=1.0)
-random_seed = st.number_input("Seed", 1.0, 100.0, 45.0)
 
-if st.button("Submit"):
-    text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates)
+def score_waveforms(text, waveforms):
+    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
+    inputs = {key: inputs[key].to(device) for key in inputs}
+    with torch.no_grad():
+        logits_per_text = clap_model(**inputs).logits_per_text  # this is the audio-text similarity score
+    probs = logits_per_text.softmax(dim=-1)  # we can take the softmax to get the label probabilities
+    most_probable = torch.argmax(probs)  # and now select the most likely audio waveform
    waveform = waveforms[most_probable]
    return waveform
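
The automatic scoring added in this commit ranks the candidate waveforms with CLAP: the text prompt and each generated audio are embedded, and the waveform with the highest text-audio similarity logit is kept. Below is a minimal standalone sketch of that ranking step under stated assumptions: it reuses the CLAP checkpoint named in the diff, but the prompt and the random 16 kHz, 5-second waveforms are placeholders standing in for real pipeline outputs.

import numpy as np
import torch
from transformers import AutoProcessor, ClapModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# same CLAP checkpoint the app loads for scoring
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")

prompt = "A hammer is hitting a wooden surface"  # placeholder prompt
# placeholder candidates: three random 5-second waveforms at 16 kHz
candidates = [np.random.randn(16000 * 5).astype(np.float32) for _ in range(3)]

inputs = processor(text=prompt, audios=candidates, return_tensors="pt", padding=True)
inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

with torch.no_grad():
    # logits_per_text has shape (num_texts, num_audios); higher means a closer text-audio match
    logits_per_text = clap_model(**inputs).logits_per_text

best = int(logits_per_text.softmax(dim=-1).argmax())
print(f"best candidate index: {best}")

The softmax is only there to mirror the app's score_waveforms helper; taking the argmax of the raw logits selects the same waveform.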