Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
05fedf7
1
Parent(s):
840333c
First try
Browse files
app.py
CHANGED
@@ -217,84 +217,32 @@ def generate_base(subject, setting):
|
|
217 |
play_steps_in_s = 4.0
|
218 |
play_steps = int(frame_rate * play_steps_in_s)
|
219 |
|
220 |
-
gr.Info("Generating Audio")
|
221 |
description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
|
222 |
-
|
223 |
-
description_tokens = tokenizer([description for _ in range(len(model_input_tokens))], return_tensors="pt").input_ids.to(device)
|
224 |
-
speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
|
225 |
-
speech_output = [output.cpu().numpy() for output in speech_output]
|
226 |
-
gr.Info("Generated Audio")
|
227 |
-
return None, None, {"audio": speech_output, "text": model_input_tokens}
|
228 |
-
|
229 |
-
def stream_audio(state):
|
230 |
-
speech_output = state["audio"]
|
231 |
-
sentences = state["text"]
|
232 |
-
|
233 |
-
gr.Info("Reading Story")
|
234 |
-
|
235 |
-
story = ""
|
236 |
-
for sentence, new_audio in zip(sentences, speech_output):
|
237 |
-
# print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
|
238 |
-
print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
239 |
-
story += f"{sentence}\n"
|
240 |
-
yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
241 |
-
|
242 |
-
# BATCH_SIZE = 4
|
243 |
-
# for i in range(0, len(model_input), BATCH_SIZE):
|
244 |
-
# inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
|
245 |
-
# story_tokens = tokenizer(inputs, return_tensors="pt", padding=True).input_ids.to(device)
|
246 |
-
# description_tokens = tokenizer([description for _ in range(len(inputs))], return_tensors="pt").input_ids.to(device)
|
247 |
-
# speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
|
248 |
-
|
249 |
-
# speech_output = [output.cpu().numpy() for output in speech_output]
|
250 |
-
# for j, new_audio in enumerate(speech_output):
|
251 |
-
# if i + j == 0:
|
252 |
-
# gr.Info("Reading story", duration=3)
|
253 |
-
# print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
|
254 |
-
# print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
255 |
-
# yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
256 |
-
|
257 |
-
# if len(inputs) != 0:
|
258 |
-
# input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
259 |
-
# story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
|
260 |
-
|
261 |
-
# speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
|
262 |
-
|
263 |
-
# speech_output = [output.cpu().numpy() for output in speech_output]
|
264 |
-
|
265 |
-
# for i, new_audio in enumerate(speech_output):
|
266 |
-
# if i == 0:
|
267 |
-
# gr.Info("Reading story", duration=3)
|
268 |
-
# print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
269 |
-
# yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
270 |
-
|
271 |
-
# print(f"{i}-th part generated")
|
272 |
-
# pieces += [*speech_output, silence.copy()]
|
273 |
-
|
274 |
-
# for i, sentence in enumerate(model_input):
|
275 |
-
# streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
|
276 |
-
|
277 |
-
# prompt = tokenizer(sentence, return_tensors="pt").to(device)
|
278 |
-
|
279 |
-
# generation_kwargs = dict(
|
280 |
-
# input_ids=inputs.input_ids,
|
281 |
-
# prompt_input_ids=prompt.input_ids,
|
282 |
-
# streamer=streamer,
|
283 |
-
# do_sample=True,
|
284 |
-
# temperature=1.0,
|
285 |
-
# min_new_tokens=10,
|
286 |
-
# )
|
287 |
-
|
288 |
-
# set_seed(SEED)
|
289 |
-
# thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
290 |
-
# thread.start()
|
291 |
-
|
292 |
-
# for new_audio in streamer:
|
293 |
-
# if i == 0:
|
294 |
-
# gr.Info("Reading story", duration=3)
|
295 |
-
# print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
296 |
-
# yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
297 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
|
299 |
with gr.Blocks() as block:
|
300 |
gr.HTML(
|
@@ -319,5 +267,4 @@ with gr.Blocks() as block:
|
|
319 |
state = gr.State()
|
320 |
run_button.click(fn=generate_base, inputs=inputs, outputs=[story, audio_out, state]).success(stream_audio, inputs=state, outputs=outputs)
|
321 |
|
322 |
-
block.
|
323 |
-
block.launch(share=True)
|
|
|
217 |
play_steps_in_s = 4.0
|
218 |
play_steps = int(frame_rate * play_steps_in_s)
|
219 |
|
|
|
220 |
description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
|
221 |
+
description_tokens = tokenizer(description, return_tensors="pt").to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
|
223 |
+
for i, sentence in enumerate(model_input):
|
224 |
+
streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
|
225 |
+
|
226 |
+
prompt = tokenizer(sentence, return_tensors="pt").to(device)
|
227 |
+
|
228 |
+
generation_kwargs = dict(
|
229 |
+
input_ids=description_tokens.input_ids,
|
230 |
+
prompt_input_ids=prompt.input_ids,
|
231 |
+
streamer=streamer,
|
232 |
+
do_sample=True,
|
233 |
+
temperature=1.0,
|
234 |
+
min_new_tokens=10,
|
235 |
+
)
|
236 |
+
|
237 |
+
set_seed(SEED)
|
238 |
+
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
239 |
+
thread.start()
|
240 |
+
|
241 |
+
for new_audio in streamer:
|
242 |
+
if i == 0:
|
243 |
+
gr.Info("Reading story", duration=3)
|
244 |
+
print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
245 |
+
yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
246 |
|
247 |
with gr.Blocks() as block:
|
248 |
gr.HTML(
|
|
|
267 |
state = gr.State()
|
268 |
run_button.click(fn=generate_base, inputs=inputs, outputs=[story, audio_out, state]).success(stream_audio, inputs=state, outputs=outputs)
|
269 |
|
270 |
+
block.launch()
|
|