DanLeBossDeESGI committed on
Commit
c602888
·
1 Parent(s): 2c329a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -13
app.py CHANGED
@@ -17,9 +17,7 @@ pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(dev
17
  pipe.unet = torch.compile(pipe.unet)
18
 
19
  # CLAP model (only required for automatic scoring)
20
-
21
  clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
22
-
23
  processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
24
 
25
  generator = torch.Generator(device)
@@ -39,6 +37,16 @@ duration = st.slider("Duration (seconds)", 2.5, 10.0, 5.0, 2.5)
39
  guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
40
  n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
41
 
 
 
 
 
 
 
 
 
 
 
42
  if st.button("Submit"):
43
  if text_input is None:
44
  st.error("Please provide a text input.")
@@ -59,14 +67,3 @@ if st.button("Submit"):
59
  waveform = waveforms[0]
60
 
61
  st.audio(waveform, format="audio/wav")
62
-
63
-
64
- def score_waveforms(text, waveforms):
65
- inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
66
- inputs = {key: inputs[key].to(device) for key in inputs}
67
- with torch.no_grad():
68
- logits_per_text = clap_model(**inputs).logits_per_text # this is the audio-text similarity score
69
- probs = logits_per_text.softmax(dim=-1) # we can take the softmax to get the label probabilities
70
- most_probable = torch.argmax(probs) # and now select the most likely audio waveform
71
- waveform = waveforms[most_probable]
72
- return waveform
 
17
  pipe.unet = torch.compile(pipe.unet)
18
 
19
  # CLAP model (only required for automatic scoring)
 
20
  clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
 
21
  processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
22
 
23
  generator = torch.Generator(device)
 
37
  guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
38
  n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
39
 
40
+ def score_waveforms(text, waveforms):
41
+ inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
42
+ inputs = {key: inputs[key].to(device) for key in inputs}
43
+ with torch.no_grad():
44
+ logits_per_text = clap_model(**inputs).logits_per_text # this is the audio-text similarity score
45
+ probs = logits_per_text.softmax(dim=-1) # we can take the softmax to get the label probabilities
46
+ most_probable = torch.argmax(probs) # and now select the most likely audio waveform
47
+ waveform = waveforms[most_probable]
48
+ return waveform
49
+
50
  if st.button("Submit"):
51
  if text_input is None:
52
  st.error("Please provide a text input.")
 
67
  waveform = waveforms[0]
68
 
69
  st.audio(waveform, format="audio/wav")