Spaces:
Runtime error
Runtime error
Commit
·
c602888
1
Parent(s):
2c329a4
Update app.py
Browse files
app.py
CHANGED
@@ -17,9 +17,7 @@ pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(dev
|
|
17 |
pipe.unet = torch.compile(pipe.unet)
|
18 |
|
19 |
# CLAP model (only required for automatic scoring)
|
20 |
-
|
21 |
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
|
22 |
-
|
23 |
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
|
24 |
|
25 |
generator = torch.Generator(device)
|
@@ -39,6 +37,16 @@ duration = st.slider("Duration (seconds)", 2.5, 10.0, 5.0, 2.5)
|
|
39 |
guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
|
40 |
n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
if st.button("Submit"):
|
43 |
if text_input is None:
|
44 |
st.error("Please provide a text input.")
|
@@ -59,14 +67,3 @@ if st.button("Submit"):
|
|
59 |
waveform = waveforms[0]
|
60 |
|
61 |
st.audio(waveform, format="audio/wav")
|
62 |
-
|
63 |
-
|
64 |
-
def score_waveforms(text, waveforms):
|
65 |
-
inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
|
66 |
-
inputs = {key: inputs[key].to(device) for key in inputs}
|
67 |
-
with torch.no_grad():
|
68 |
-
logits_per_text = clap_model(**inputs).logits_per_text # this is the audio-text similarity score
|
69 |
-
probs = logits_per_text.softmax(dim=-1) # we can take the softmax to get the label probabilities
|
70 |
-
most_probable = torch.argmax(probs) # and now select the most likely audio waveform
|
71 |
-
waveform = waveforms[most_probable]
|
72 |
-
return waveform
|
|
|
17 |
pipe.unet = torch.compile(pipe.unet)
|
18 |
|
19 |
# CLAP model (only required for automatic scoring)
|
|
|
20 |
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
|
|
|
21 |
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
|
22 |
|
23 |
generator = torch.Generator(device)
|
|
|
37 |
guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
|
38 |
n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
|
39 |
|
40 |
+
def score_waveforms(text, waveforms):
    """Return the candidate waveform that the CLAP model ranks highest for ``text``.

    The prompt and every candidate are run through the module-level
    ``processor`` and ``clap_model`` in one padded batch; the candidate at the
    arg-max of the resulting text-to-audio scores is returned.

    Args:
        text: Prompt to compare the candidates against.
        waveforms: Indexable collection of candidate waveforms; it is copied
            into a list before being handed to the processor.

    Returns:
        The element of ``waveforms`` selected by the arg-max index.
    """
    # Build one padded PyTorch batch holding the prompt and all candidates.
    batch = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
    # Move every tensor in the batch onto the device the CLAP model runs on.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # Pure inference: no gradients are needed for scoring.
    with torch.no_grad():
        # Audio-text similarity scores produced by CLAP.
        similarity = clap_model(**batch).logits_per_text
        # Normalise the scores into probabilities over the last dimension.
        likelihoods = torch.softmax(similarity, dim=-1)
        # NOTE(review): argmax runs over the flattened tensor, so this presumably
        # relies on a single prompt (one row of scores) - confirm against callers.
        best_index = likelihoods.argmax()

    return waveforms[best_index]
|
49 |
+
|
50 |
if st.button("Submit"):
|
51 |
if text_input is None:
|
52 |
st.error("Please provide a text input.")
|
|
|
67 |
waveform = waveforms[0]
|
68 |
|
69 |
st.audio(waveform, format="audio/wav")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|