Fabrice-TIERCELIN committed on
Commit
6134818
·
verified ·
1 Parent(s): ded8ad7

mp3 or wav

Browse files
Files changed (1) hide show
  1. app.py +59 -19
app.py CHANGED
@@ -2,12 +2,14 @@ import gradio as gr
2
  import json
3
  import torch
4
  import time
 
5
 
6
  from tqdm import tqdm
7
  from huggingface_hub import snapshot_download
8
  from models import AudioDiffusion, DDPMScheduler
9
  from audioldm.audio.stft import TacotronSTFT
10
  from audioldm.variational_autoencoder import AutoencoderKL
 
11
 
12
  # Automatic device detection
13
  if torch.cuda.is_available():
@@ -82,29 +84,61 @@ tango.model.to(device_type)
82
 
83
  def check(
84
  prompt,
 
85
  output_number,
86
  steps,
87
  guidance
88
  ):
89
  if prompt is None or prompt == "":
90
  raise gr.Error("Please provide a prompt input.")
 
 
91
  if not output_number in [1, 2, 3]:
92
  raise gr.Error("Please ask for 1, 2 or 3 output files.")
93
 
94
- def update_display(output_number):
95
- return [gr.update(visible = (2 <= output_number)), gr.update(visible = (output_number == 3))]
 
 
 
 
96
 
97
  def text2audio(
98
  prompt,
 
99
  output_number,
100
  steps,
101
  guidance
102
  ):
103
  start = time.time()
104
  output_wave = tango.generate(prompt, steps, guidance, output_number)
105
- output_wave_1 = gr.make_waveform((16000, output_wave[0]))
106
- output_wave_2 = gr.make_waveform((16000, output_wave[1])) if (2 <= output_number) else None
107
- output_wave_3 = gr.make_waveform((16000, output_wave[2])) if (output_number == 3) else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  end = time.time()
110
  secondes = int(end - start)
@@ -113,10 +147,10 @@ def text2audio(
113
  hours = minutes // 60
114
  minutes = minutes - (hours * 60)
115
  return [
116
- output_wave_1,
117
- output_wave_2,
118
- output_wave_3,
119
- "Start again to get a different result. The output have been generated in " + str(hours) + " h, " + str(minutes) + " min, " + str(secondes) + " sec."
120
  ]
121
 
122
  # Gradio interface
@@ -141,30 +175,35 @@ with gr.Blocks() as interface:
141
  """
142
  )
143
  input_text = gr.Textbox(label = "Prompt", value = "Snort of a horse", lines = 2, autofocus = True)
 
144
  with gr.Accordion("Advanced options", open = False):
145
  output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 3, step = 1, interactive = True)
146
  denoising_steps = gr.Slider(label = "Steps", info = "lower=faster & variant, higher=audio quality & similar", minimum = 100, maximum = 200, value = 100, step = 1, interactive = True)
147
  guidance_scale = gr.Slider(label = "Guidance Scale", info = "lower=audio quality, higher=follow the prompt", minimum = 1, maximum = 10, value = 3, step = 0.1, interactive = True)
148
 
149
- submit = gr.Button("Generate 🚀", variant = "primary")
150
 
151
- output_audio_1 = gr.Audio(label = "Generated Audio #1/3")
152
- output_audio_2 = gr.Audio(label = "Generated Audio #2/3")
153
- output_audio_3 = gr.Audio(label = "Generated Audio #3/3")
154
  information = gr.Label(label = "Information")
155
 
156
  submit.click(fn = check, inputs = [
157
  input_text,
 
158
  output_number,
159
  denoising_steps,
160
  guidance_scale
161
- ], outputs = [], queue = False, show_progress = False).success(fn = update_display, inputs = [
 
162
  output_number
163
  ], outputs = [
 
164
  output_audio_2,
165
  output_audio_3
166
  ], queue = False, show_progress = False).success(fn = text2audio, inputs = [
167
  input_text,
 
168
  output_number,
169
  denoising_steps,
170
  guidance_scale
@@ -179,6 +218,7 @@ with gr.Blocks() as interface:
179
  fn = text2audio,
180
  inputs = [
181
  input_text,
 
182
  output_number,
183
  denoising_steps,
184
  guidance_scale
@@ -190,11 +230,11 @@ with gr.Blocks() as interface:
190
  information
191
  ],
192
  examples = [
193
- ["A hammer is hitting a wooden surface", 3, 100, 3],
194
- ["Peaceful and calming ambient music with singing bowl and other instruments.", 3, 100, 3],
195
- ["A man is speaking in a small room.", 3, 100, 3],
196
- ["A female is speaking followed by footstep sound", 3, 100, 3],
197
- ["Wooden table tapping sound followed by water pouring sound.", 3, 100, 3],
198
  ],
199
  cache_examples = "lazy",
200
  )
 
2
  import json
3
  import torch
4
  import time
5
+ import wavio
6
 
7
  from tqdm import tqdm
8
  from huggingface_hub import snapshot_download
9
  from models import AudioDiffusion, DDPMScheduler
10
  from audioldm.audio.stft import TacotronSTFT
11
  from audioldm.variational_autoencoder import AutoencoderKL
12
+ from pydub import AudioSegment
13
 
14
  # Automatic device detection
15
  if torch.cuda.is_available():
 
84
 
85
def check(
    prompt,
    output_format,
    output_number,
    steps,
    guidance
):
    """Validate the user inputs before audio generation.

    Raises gr.Error (surfaced in the Gradio UI) when the prompt is empty,
    the output format is not one of the supported file types, or the
    requested number of outputs is out of range. Returns nothing on success.
    `steps` and `guidance` are accepted but not validated here — the sliders
    already constrain their ranges.
    """
    if prompt is None or prompt == "":
        raise gr.Error("Please provide a prompt input.")
    # PEP 8 idiom: `x not in y` instead of `not x in y` (behavior identical).
    if output_format not in ["wav", "mp3"]:
        raise gr.Error("Please choose an allowed output format (.wav or .mp3).")
    if output_number not in [1, 2, 3]:
        raise gr.Error("Please ask for 1, 2 or 3 output files.")
98
 
99
def update_output(output_format, output_number):
    """Return Gradio updates for the three audio players.

    Every player is switched to the chosen file format; the second and
    third players are shown only when enough outputs were requested.
    """
    show_second = 2 <= output_number
    show_third = output_number == 3
    updates = [
        gr.update(format = output_format),
        gr.update(format = output_format, visible = show_second),
        gr.update(format = output_format, visible = show_third)
    ]
    return updates
105
 
106
  def text2audio(
107
  prompt,
108
+ output_format,
109
  output_number,
110
  steps,
111
  guidance
112
  ):
113
  start = time.time()
114
  output_wave = tango.generate(prompt, steps, guidance, output_number)
115
+
116
+ output_filename_1 = "tmp1.wav"
117
+ wavio.write(output_filename_1, output_wave[0], rate = 16000, sampwidth = 2)
118
+
119
+ if (output_format == "mp3"):
120
+ AudioSegment.from_wav("tmp1.wav").export("tmp1.mp3", format = "mp3")
121
+ output_filename_1 = "tmp1.mp3"
122
+
123
+ if (2 <= output_number):
124
+ output_filename_2 = "tmp2.wav"
125
+ wavio.write(output_filename_2, output_wave[1], rate = 16000, sampwidth = 2)
126
+
127
+ if (output_format == "mp3"):
128
+ AudioSegment.from_wav("tmp2.wav").export("tmp2.mp3", format = "mp3")
129
+ output_filename_2 = "tmp2.mp3"
130
+ else:
131
+ output_filename_2 = None
132
+
133
+ if (output_number == 3):
134
+ output_filename_3 = "tmp3.wav"
135
+ wavio.write(output_filename_3, output_wave[2], rate = 16000, sampwidth = 2)
136
+
137
+ if (output_format == "mp3"):
138
+ AudioSegment.from_wav("tmp3.wav").export("tmp3.mp3", format = "mp3")
139
+ output_filename_3 = "tmp3.mp3"
140
+ else:
141
+ output_filename_3 = None
142
 
143
  end = time.time()
144
  secondes = int(end - start)
 
147
  hours = minutes // 60
148
  minutes = minutes - (hours * 60)
149
  return [
150
+ output_filename_1,
151
+ output_filename_2,
152
+ output_filename_3,
153
+ "Start again to get a different result. The output have been generated in " + ((str(hours) + " h, ") if hours != 0 else "") + ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + str(secondes) + " sec."
154
  ]
155
 
156
  # Gradio interface
 
175
  """
176
  )
177
  input_text = gr.Textbox(label = "Prompt", value = "Snort of a horse", lines = 2, autofocus = True)
178
+ output_format = gr.Radio(label = "Output format", info = "The file you can dowload", choices = ["mp3", "wav"], value = "wav")
179
  with gr.Accordion("Advanced options", open = False):
180
  output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 3, step = 1, interactive = True)
181
  denoising_steps = gr.Slider(label = "Steps", info = "lower=faster & variant, higher=audio quality & similar", minimum = 100, maximum = 200, value = 100, step = 1, interactive = True)
182
  guidance_scale = gr.Slider(label = "Guidance Scale", info = "lower=audio quality, higher=follow the prompt", minimum = 1, maximum = 10, value = 3, step = 0.1, interactive = True)
183
 
184
+ submit = gr.Button("🚀 Generate", variant = "primary")
185
 
186
+ output_audio_1 = gr.Audio(label = "Generated Audio #1/3", format = "wav", type="filepath", autoplay = True)
187
+ output_audio_2 = gr.Audio(label = "Generated Audio #2/3", format = "wav", type="filepath")
188
+ output_audio_3 = gr.Audio(label = "Generated Audio #3/3", format = "wav", type="filepath")
189
  information = gr.Label(label = "Information")
190
 
191
  submit.click(fn = check, inputs = [
192
  input_text,
193
+ output_format,
194
  output_number,
195
  denoising_steps,
196
  guidance_scale
197
+ ], outputs = [], queue = False, show_progress = False).success(fn = update_output, inputs = [
198
+ output_format,
199
  output_number
200
  ], outputs = [
201
+ output_audio_1,
202
  output_audio_2,
203
  output_audio_3
204
  ], queue = False, show_progress = False).success(fn = text2audio, inputs = [
205
  input_text,
206
+ output_format,
207
  output_number,
208
  denoising_steps,
209
  guidance_scale
 
218
  fn = text2audio,
219
  inputs = [
220
  input_text,
221
+ output_format,
222
  output_number,
223
  denoising_steps,
224
  guidance_scale
 
230
  information
231
  ],
232
  examples = [
233
+ ["A hammer is hitting a wooden surface", "mp3", 3, 100, 3],
234
+ ["Peaceful and calming ambient music with singing bowl and other instruments.", "wav", 3, 100, 3],
235
+ ["A man is speaking in a small room.", "mp3", 2, 100, 3],
236
+ ["A female is speaking followed by footstep sound", "mp3", 1, 100, 3],
237
+ ["Wooden table tapping sound followed by water pouring sound.", "mp3", 3, 200, 3],
238
  ],
239
  cache_examples = "lazy",
240
  )