Fabrice-TIERCELIN committed on
Commit
ded8ad7
·
verified ·
1 Parent(s): 9e55dff

3 output files

Browse files
Files changed (1) hide show
  1. app.py +77 -14
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import json
3
  import torch
 
4
 
5
  from tqdm import tqdm
6
  from huggingface_hub import snapshot_download
@@ -56,7 +57,7 @@ class Tango:
56
  latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress = disable_progress)
57
  mel = self.vae.decode_first_stage(latents)
58
  wave = self.vae.decode_to_waveform(mel)
59
- return wave[0]
60
 
61
  def generate_for_batch(self, prompts, steps = 200, guidance = 3, samples = 1, batch_size = 8, disable_progress = True):
62
  # Generate audio for a list of prompt strings
@@ -81,26 +82,49 @@ tango.model.to(device_type)
81
 
82
  def check(
83
  prompt,
 
84
  steps,
85
  guidance
86
  ):
87
  if prompt is None or prompt == "":
88
  raise gr.Error("Please provide a prompt input.")
 
 
 
 
 
89
 
90
  def text2audio(
91
  prompt,
 
92
  steps,
93
  guidance
94
  ):
95
- output_wave = tango.generate(prompt, steps, guidance)
96
- return gr.make_waveform((16000, output_wave))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  # Gradio interface
99
  with gr.Blocks() as interface:
100
  gr.Markdown("""
101
  <p style="text-align: center;">
102
  <b><big><big><big>Text-to-Audio</big></big></big></b>
103
- <br/>Generates an audio file, freely, without account, without watermark, that you can download.
104
  </p>
105
  <br/>
106
  <br/>
@@ -110,7 +134,7 @@ with gr.Blocks() as interface:
110
  <li>If you need to generate <b>music</b>, I recommend to use <i>MusicGen</i>,</li>
111
  </ul>
112
  <br/>
113
- 🐌 Slow process... Your computer must <b><u>not</u></b> enter into standby mode.<br/>You can duplicate this space on a free account, it works on CPU.<br/>
114
  <a href='https://huggingface.co/spaces/Fabrice-TIERCELIN/Text-to-Audio?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14'></a>
115
  <br/>
116
  βš–οΈ You can use, modify and share the generated sounds but not for commercial uses.
@@ -118,43 +142,82 @@ with gr.Blocks() as interface:
118
  )
119
  input_text = gr.Textbox(label = "Prompt", value = "Snort of a horse", lines = 2, autofocus = True)
120
  with gr.Accordion("Advanced options", open = False):
 
121
  denoising_steps = gr.Slider(label = "Steps", info = "lower=faster & variant, higher=audio quality & similar", minimum = 100, maximum = 200, value = 100, step = 1, interactive = True)
122
  guidance_scale = gr.Slider(label = "Guidance Scale", info = "lower=audio quality, higher=follow the prompt", minimum = 1, maximum = 10, value = 3, step = 0.1, interactive = True)
123
 
124
  submit = gr.Button("Generate πŸš€", variant = "primary")
125
 
126
- output_audio = gr.Audio(label = "Generated Audio")
 
 
 
127
 
128
  submit.click(fn = check, inputs = [
129
  input_text,
 
130
  denoising_steps,
131
  guidance_scale
132
- ], outputs = [], queue = False, show_progress = False).success(fn = text2audio, inputs = [
 
 
 
 
 
133
  input_text,
 
134
  denoising_steps,
135
  guidance_scale
136
  ], outputs = [
137
- output_audio
 
 
 
138
  ], scroll_to_output = True)
139
 
140
  gr.Examples(
141
  fn = text2audio,
142
  inputs = [
143
  input_text,
 
144
  denoising_steps,
145
  guidance_scale
146
  ],
147
  outputs = [
148
- output_audio
 
 
 
149
  ],
150
  examples = [
151
- ["A hammer is hitting a wooden surface", 100, 3],
152
- ["Peaceful and calming ambient music with singing bowl and other instruments.", 100, 3],
153
- ["A man is speaking in a small room.", 100, 3],
154
- ["A female is speaking followed by footstep sound", 100, 3],
155
- ["Wooden table tapping sound followed by water pouring sound.", 100, 3],
156
  ],
157
  cache_examples = "lazy",
158
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  interface.queue(10).launch()
 
1
  import gradio as gr
2
  import json
3
  import torch
4
+ import time
5
 
6
  from tqdm import tqdm
7
  from huggingface_hub import snapshot_download
 
57
  latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress = disable_progress)
58
  mel = self.vae.decode_first_stage(latents)
59
  wave = self.vae.decode_to_waveform(mel)
60
+ return wave
61
 
62
  def generate_for_batch(self, prompts, steps = 200, guidance = 3, samples = 1, batch_size = 8, disable_progress = True):
63
  # Generate audio for a list of prompt strings
 
82
 
def check(
    prompt,
    output_number,
    steps,
    guidance
):
    """Validate the form inputs before any audio generation is started.

    Args:
        prompt: Text description of the sound to generate.
        output_number: Requested number of output files; must be 1, 2 or 3.
        steps: Denoising step count. Not validated here.
        guidance: Guidance scale. Not validated here.

    Raises:
        gr.Error: If the prompt is missing, empty or made only of whitespace,
            or if ``output_number`` is not 1, 2 or 3.
    """
    # A whitespace-only prompt carries no description, so it is rejected
    # exactly like an empty one.
    if prompt is None or prompt.strip() == "":
        raise gr.Error("Please provide a prompt input.")
    if output_number not in (1, 2, 3):
        raise gr.Error("Please ask for 1, 2 or 3 output files.")
93
+
def update_display(output_number):
    """Build the visibility updates for the second and third audio outputs.

    Args:
        output_number: Requested number of output files.

    Returns:
        A two-item list of ``gr.update`` results: the first is visible when
        at least two outputs are requested, the second only when exactly
        three are requested.
    """
    show_second = output_number >= 2
    show_third = output_number == 3
    return [
        gr.update(visible = show_second),
        gr.update(visible = show_third),
    ]
96
 
def text2audio(
    prompt,
    output_number,
    steps,
    guidance
):
    """Generate up to three audio outputs for a prompt and report the duration.

    Args:
        prompt: Text description of the sound to generate.
        output_number: Number of outputs requested (1, 2 or 3).
        steps: Forwarded to ``tango.generate`` as its second argument.
        guidance: Forwarded to ``tango.generate`` as its third argument.

    Returns:
        A four-item list: the three ``gr.make_waveform`` results (``None``
        for outputs that were not requested) followed by an information
        message containing the elapsed time.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by a system clock adjustment during a long generation.
    start = time.perf_counter()

    # NOTE(review): output_number is passed in fourth position, presumably
    # the "samples" count of Tango.generate - confirm against its signature.
    output_wave = tango.generate(prompt, steps, guidance, output_number)

    # 16000 is the first element of the tuple given to make_waveform;
    # presumably the sample rate in Hz - TODO confirm.
    output_wave_1 = gr.make_waveform((16000, output_wave[0]))
    output_wave_2 = gr.make_waveform((16000, output_wave[1])) if (2 <= output_number) else None
    output_wave_3 = gr.make_waveform((16000, output_wave[2])) if (output_number == 3) else None

    # Split the whole-second duration into hours / minutes / seconds.
    elapsed = int(time.perf_counter() - start)
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)

    return [
        output_wave_1,
        output_wave_2,
        output_wave_3,
        "Start again to get a different result. The output have been generated in " + str(hours) + " h, " + str(minutes) + " min, " + str(seconds) + " sec."
    ]
121
 
122
  # Gradio interface
123
  with gr.Blocks() as interface:
124
  gr.Markdown("""
125
  <p style="text-align: center;">
126
  <b><big><big><big>Text-to-Audio</big></big></big></b>
127
+ <br/>Generates 10 seconds of sound effects from description, freely, without account, without watermark, that you can download.
128
  </p>
129
  <br/>
130
  <br/>
 
134
  <li>If you need to generate <b>music</b>, I recommend to use <i>MusicGen</i>,</li>
135
  </ul>
136
  <br/>
137
+ 🐌 Slow process... ~2 hours. Your computer must <b><u>not</u></b> enter into standby mode.<br/>You can duplicate this space on a free account, it works on CPU.<br/>
138
  <a href='https://huggingface.co/spaces/Fabrice-TIERCELIN/Text-to-Audio?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14'></a>
139
  <br/>
140
  βš–οΈ You can use, modify and share the generated sounds but not for commercial uses.
 
142
  )
143
  input_text = gr.Textbox(label = "Prompt", value = "Snort of a horse", lines = 2, autofocus = True)
144
  with gr.Accordion("Advanced options", open = False):
145
+ output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 3, step = 1, interactive = True)
146
  denoising_steps = gr.Slider(label = "Steps", info = "lower=faster & variant, higher=audio quality & similar", minimum = 100, maximum = 200, value = 100, step = 1, interactive = True)
147
  guidance_scale = gr.Slider(label = "Guidance Scale", info = "lower=audio quality, higher=follow the prompt", minimum = 1, maximum = 10, value = 3, step = 0.1, interactive = True)
148
 
149
  submit = gr.Button("Generate πŸš€", variant = "primary")
150
 
151
+ output_audio_1 = gr.Audio(label = "Generated Audio #1/3")
152
+ output_audio_2 = gr.Audio(label = "Generated Audio #2/3")
153
+ output_audio_3 = gr.Audio(label = "Generated Audio #3/3")
154
+ information = gr.Label(label = "Information")
155
 
156
  submit.click(fn = check, inputs = [
157
  input_text,
158
+ output_number,
159
  denoising_steps,
160
  guidance_scale
161
+ ], outputs = [], queue = False, show_progress = False).success(fn = update_display, inputs = [
162
+ output_number
163
+ ], outputs = [
164
+ output_audio_2,
165
+ output_audio_3
166
+ ], queue = False, show_progress = False).success(fn = text2audio, inputs = [
167
  input_text,
168
+ output_number,
169
  denoising_steps,
170
  guidance_scale
171
  ], outputs = [
172
+ output_audio_1,
173
+ output_audio_2,
174
+ output_audio_3,
175
+ information
176
  ], scroll_to_output = True)
177
 
178
  gr.Examples(
179
  fn = text2audio,
180
  inputs = [
181
  input_text,
182
+ output_number,
183
  denoising_steps,
184
  guidance_scale
185
  ],
186
  outputs = [
187
+ output_audio_1,
188
+ output_audio_2,
189
+ output_audio_3,
190
+ information
191
  ],
192
  examples = [
193
+ ["A hammer is hitting a wooden surface", 3, 100, 3],
194
+ ["Peaceful and calming ambient music with singing bowl and other instruments.", 3, 100, 3],
195
+ ["A man is speaking in a small room.", 3, 100, 3],
196
+ ["A female is speaking followed by footstep sound", 3, 100, 3],
197
+ ["Wooden table tapping sound followed by water pouring sound.", 3, 100, 3],
198
  ],
199
  cache_examples = "lazy",
200
  )
201
+
202
+ gr.Markdown(
203
+ """
204
+ ## How to prompt your sound
205
+ You can use round brackets to increase the importance of a part:
206
+ ```
207
+ Peaceful and (calming) ambient music with singing bowl and other instruments
208
+ ```
209
+ You can use several levels of round brackets to increase the importance of a part even further:
210
+ ```
211
+ (Peaceful) and ((calming)) ambient music with singing bowl and other instruments
212
+ ```
213
+ You can use a number instead of several round brackets:
214
+ ```
215
+ (Peaceful:1.5) and ((calming)) ambient music with singing bowl and other instruments
216
+ ```
217
+ You can do the same thing with square brackets to decrease the importance of a part:
218
+ ```
219
+ (Peaceful:1.5) and ((calming)) ambient music with [singing:2] bowl and other instruments
+ ```
220
+ """
221
+ )
222
 
223
  interface.queue(10).launch()