freddyaboulton HF Staff committed on
Commit
ee3a553
·
1 Parent(s): 5b58cc8
Files changed (1) hide show
  1. app.py +46 -26
app.py CHANGED
@@ -218,32 +218,52 @@ def generate_base(subject, setting, ):
218
  play_steps = int(frame_rate * play_steps_in_s)
219
 
220
  description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
221
- inputs = tokenizer(description, return_tensors="pt").to(device)
222
-
223
-
224
- for i, sentence in enumerate(model_input):
225
- streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
226
-
227
- prompt = tokenizer(sentence, return_tensors="pt").to(device)
228
-
229
- generation_kwargs = dict(
230
- input_ids=inputs.input_ids,
231
- prompt_input_ids=prompt.input_ids,
232
- streamer=streamer,
233
- do_sample=True,
234
- temperature=1.0,
235
- min_new_tokens=10,
236
- )
237
-
238
- set_seed(SEED)
239
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
240
- thread.start()
241
-
242
- for new_audio in streamer:
243
- if i == 0:
244
- gr.Info("Reading story", duration=3)
245
- print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
246
- yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
 
249
  with gr.Blocks() as block:
 
218
  play_steps = int(frame_rate * play_steps_in_s)
219
 
220
  description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
221
+ description = [description for _ in range(len(model_input))]
222
+ description_tokens = tokenizer(description, return_tensors="pt").input_ids.to(device)
223
+
224
+ # for i in range(0, len(model_input), BATCH_SIZE):
225
+ # inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
226
+
227
+ # if len(inputs) != 0:
228
+ # input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
229
+ story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
230
+
231
+ speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
232
+
233
+ speech_output = [output.cpu().numpy() for output in speech_output]
234
+
235
+ for i, new_audio in enumerate(speech_output):
236
+ if i == 0:
237
+ gr.Info("Reading story", duration=3)
238
+ print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
239
+ yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
240
+
241
+ # print(f"{i}-th part generated")
242
+ # pieces += [*speech_output, silence.copy()]
243
+
244
+ # for i, sentence in enumerate(model_input):
245
+ # streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
246
+
247
+ # prompt = tokenizer(sentence, return_tensors="pt").to(device)
248
+
249
+ # generation_kwargs = dict(
250
+ # input_ids=inputs.input_ids,
251
+ # prompt_input_ids=prompt.input_ids,
252
+ # streamer=streamer,
253
+ # do_sample=True,
254
+ # temperature=1.0,
255
+ # min_new_tokens=10,
256
+ # )
257
+
258
+ # set_seed(SEED)
259
+ # thread = Thread(target=model.generate, kwargs=generation_kwargs)
260
+ # thread.start()
261
+
262
+ # for new_audio in streamer:
263
+ # if i == 0:
264
+ # gr.Info("Reading story", duration=3)
265
+ # print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
266
+ # yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
267
 
268
 
269
  with gr.Blocks() as block: