freddyaboulton HF staff committed on
Commit
05fedf7
·
1 Parent(s): 840333c
Files changed (1) hide show
  1. app.py +25 -78
app.py CHANGED
@@ -217,84 +217,32 @@ def generate_base(subject, setting):
217
  play_steps_in_s = 4.0
218
  play_steps = int(frame_rate * play_steps_in_s)
219
 
220
- gr.Info("Generating Audio")
221
  description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
222
- story_tokens = tokenizer(model_input_tokens, return_tensors="pt", padding=True).input_ids.to(device)
223
- description_tokens = tokenizer([description for _ in range(len(model_input_tokens))], return_tensors="pt").input_ids.to(device)
224
- speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
225
- speech_output = [output.cpu().numpy() for output in speech_output]
226
- gr.Info("Generated Audio")
227
- return None, None, {"audio": speech_output, "text": model_input_tokens}
228
-
229
- def stream_audio(state):
230
- speech_output = state["audio"]
231
- sentences = state["text"]
232
-
233
- gr.Info("Reading Story")
234
-
235
- story = ""
236
- for sentence, new_audio in zip(sentences, speech_output):
237
- # print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
238
- print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
239
- story += f"{sentence}\n"
240
- yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
241
-
242
- # BATCH_SIZE = 4
243
- # for i in range(0, len(model_input), BATCH_SIZE):
244
- # inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
245
- # story_tokens = tokenizer(inputs, return_tensors="pt", padding=True).input_ids.to(device)
246
- # description_tokens = tokenizer([description for _ in range(len(inputs))], return_tensors="pt").input_ids.to(device)
247
- # speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
248
-
249
- # speech_output = [output.cpu().numpy() for output in speech_output]
250
- # for j, new_audio in enumerate(speech_output):
251
- # if i + j == 0:
252
- # gr.Info("Reading story", duration=3)
253
- # print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
254
- # print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
255
- # yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
256
-
257
- # if len(inputs) != 0:
258
- # input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
259
- # story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
260
-
261
- # speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
262
-
263
- # speech_output = [output.cpu().numpy() for output in speech_output]
264
-
265
- # for i, new_audio in enumerate(speech_output):
266
- # if i == 0:
267
- # gr.Info("Reading story", duration=3)
268
- # print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
269
- # yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
270
-
271
- # print(f"{i}-th part generated")
272
- # pieces += [*speech_output, silence.copy()]
273
-
274
- # for i, sentence in enumerate(model_input):
275
- # streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
276
-
277
- # prompt = tokenizer(sentence, return_tensors="pt").to(device)
278
-
279
- # generation_kwargs = dict(
280
- # input_ids=inputs.input_ids,
281
- # prompt_input_ids=prompt.input_ids,
282
- # streamer=streamer,
283
- # do_sample=True,
284
- # temperature=1.0,
285
- # min_new_tokens=10,
286
- # )
287
-
288
- # set_seed(SEED)
289
- # thread = Thread(target=model.generate, kwargs=generation_kwargs)
290
- # thread.start()
291
-
292
- # for new_audio in streamer:
293
- # if i == 0:
294
- # gr.Info("Reading story", duration=3)
295
- # print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
296
- # yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
297
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
  with gr.Blocks() as block:
300
  gr.HTML(
@@ -319,5 +267,4 @@ with gr.Blocks() as block:
319
  state = gr.State()
320
  run_button.click(fn=generate_base, inputs=inputs, outputs=[story, audio_out, state]).success(stream_audio, inputs=state, outputs=outputs)
321
 
322
- block.queue()
323
- block.launch(share=True)
 
217
  play_steps_in_s = 4.0
218
  play_steps = int(frame_rate * play_steps_in_s)
219
 
 
220
  description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
221
+ description_tokens = tokenizer(description, return_tensors="pt").to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
+ for i, sentence in enumerate(model_input):
224
+ streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
225
+
226
+ prompt = tokenizer(sentence, return_tensors="pt").to(device)
227
+
228
+ generation_kwargs = dict(
229
+ input_ids=description_tokens.input_ids,
230
+ prompt_input_ids=prompt.input_ids,
231
+ streamer=streamer,
232
+ do_sample=True,
233
+ temperature=1.0,
234
+ min_new_tokens=10,
235
+ )
236
+
237
+ set_seed(SEED)
238
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
239
+ thread.start()
240
+
241
+ for new_audio in streamer:
242
+ if i == 0:
243
+ gr.Info("Reading story", duration=3)
244
+ print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
245
+ yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
246
 
247
  with gr.Blocks() as block:
248
  gr.HTML(
 
267
  state = gr.State()
268
  run_button.click(fn=generate_base, inputs=inputs, outputs=[story, audio_out, state]).success(stream_audio, inputs=state, outputs=outputs)
269
 
270
+ block.launch()