DanLeBossDeESGI committed on
Commit
c602888
·
1 Parent(s): 2c329a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -13
app.py CHANGED
@@ -17,9 +17,7 @@ pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(dev
17
  pipe.unet = torch.compile(pipe.unet)
18
 
19
  # CLAP model (only required for automatic scoring)
20
-
21
  clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
22
-
23
  processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
24
 
25
  generator = torch.Generator(device)
@@ -39,6 +37,16 @@ duration = st.slider("Duration (seconds)", 2.5, 10.0, 5.0, 2.5)
39
  guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
40
  n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
41
 
 
 
 
 
 
 
 
 
 
 
42
  if st.button("Submit"):
43
  if text_input is None:
44
  st.error("Please provide a text input.")
@@ -59,14 +67,3 @@ if st.button("Submit"):
59
  waveform = waveforms[0]
60
 
61
  st.audio(waveform, format="audio/wav")
62
-
63
-
64
- def score_waveforms(text, waveforms):
65
- inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
66
- inputs = {key: inputs[key].to(device) for key in inputs}
67
- with torch.no_grad():
68
- logits_per_text = clap_model(**inputs).logits_per_text # this is the audio-text similarity score
69
- probs = logits_per_text.softmax(dim=-1) # we can take the softmax to get the label probabilities
70
- most_probable = torch.argmax(probs) # and now select the most likely audio waveform
71
- waveform = waveforms[most_probable]
72
- return waveform
 
17
  pipe.unet = torch.compile(pipe.unet)
18
 
19
  # CLAP model (only required for automatic scoring)
 
20
  clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
 
21
  processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
22
 
23
  generator = torch.Generator(device)
 
37
  guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
38
  n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
39
 
40
+ def score_waveforms(text, waveforms):
41
+ inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
42
+ inputs = {key: inputs[key].to(device) for key in inputs}
43
+ with torch.no_grad():
44
+ logits_per_text = clap_model(**inputs).logits_per_text # this is the audio-text similarity score
45
+ probs = logits_per_text.softmax(dim=-1) # we can take the softmax to get the label probabilities
46
+ most_probable = torch.argmax(probs) # and now select the most likely audio waveform
47
+ waveform = waveforms[most_probable]
48
+ return waveform
49
+
50
  if st.button("Submit"):
51
  if text_input is None:
52
  st.error("Please provide a text input.")
 
67
  waveform = waveforms[0]
68
 
69
  st.audio(waveform, format="audio/wav")