neuralleap committed
Commit f43268d · 1 Parent(s): 3a2be8a

update app.py

Files changed (1):
  app.py +15 -30
app.py CHANGED
@@ -42,33 +42,19 @@ VOICE_OPTIONS = [
     "random", # special option for random voice
 ]
 
-
 def inference(
     text,
-    voice,
-    seed,
+    voice
 ):
     if text is None or text.strip() == "":
-        with open(script.name) as f:
-            text = f.read()
-        if text.strip() == "":
-            raise gr.Error("Please provide either text or script file with content.")
-
-    if split_by_newline == "Yes":
-        texts = list(filter(lambda x: x.strip() != "", text.split("\n")))
-    else:
-        texts = split_and_recombine_text(text)
+        raise gr.Error("Please provide text.")
 
-    voices = [voice]
+    texts = split_and_recombine_text(text)
 
-    if len(voices) == 1:
-        voice_samples, conditioning_latents = load_voice(voice)
-    else:
-        voice_samples, conditioning_latents = load_voices(voices)
+    voice_samples, conditioning_latents = load_voice(voice)
 
     start_time = time.time()
 
-    # all_parts = []
     for j, text in enumerate(texts):
         for audio_frame in tts.tts_with_preset(
             text,
@@ -77,21 +63,21 @@ def inference(
             preset="ultra_fast",
             k=1
         ):
-            # print("Time taken: ", time.time() - start_time)
-            # all_parts.append(audio_frame)
             yield (24000, audio_frame.cpu().detach().numpy())
 
-    # wav = torch.cat(all_parts, dim=0).unsqueeze(0)
-    # print(wav.shape)
-    # torchaudio.save("output.wav", wav.cpu(), 24000)
-    # yield (None, gr.make_waveform(audio="output.wav",))
 def main():
-    title = "Tortoise TTS"
+    title = "Tortoise TTS 🐢"
     description = """
+    A text-to-speech system which powers lot of organizations in Speech synthesis domain.
+    <br/>
+    A model with strong multi-voice capabilities, highly realistic prosody and intonation.
+    <br/>
+    For faster inference, use the 'ultra_fast' preset and duplicate space if you don't want to wait in a queue.
+    <br/>
     """
     text = gr.Textbox(
         lines=4,
-        label="Text (Provide either text, or upload a newline separated text file below):",
+        label="Text:",
     )
 
     voice = gr.Dropdown(
@@ -99,12 +85,12 @@ def main():
     )
 
     output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)
-    # download_audio = gr.Audio(label="dowanload audio:")
+
     interface = gr.Interface(
         fn=inference,
         inputs=[
             text,
-            voice,
+            voice
         ],
         title=title,
         description=description,
@@ -112,7 +98,6 @@
     )
     interface.queue().launch()
 
-
 if __name__ == "__main__":
     tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
 
@@ -121,4 +106,4 @@ if __name__ == "__main__":
         f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
     )
 
-    main()
+    main()
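
For reference, the streaming path that remains after this commit reads roughly as the sketch below. It is assembled from the hunks above and is not shown in full in the diff: the two context lines hidden between the first two hunks (new lines 61-62) are assumed to forward voice_samples and conditioning_latents to tts_with_preset, tts is the module-level TextToSpeech instance created under the __main__ guard as in the diff, and iterating over tts_with_preset presupposes the streaming Tortoise build this Space uses.

# Sketch only: the post-commit inference generator, reconstructed from the diff.
# Keyword arguments marked "assumed" stand in for the context lines not shown
# between the hunks; everything else mirrors the committed code.
import time

import gradio as gr
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
from tortoise.utils.text import split_and_recombine_text


def inference(text, voice):
    if text is None or text.strip() == "":
        raise gr.Error("Please provide text.")

    texts = split_and_recombine_text(text)                # chunk long input into sentence groups
    voice_samples, conditioning_latents = load_voice(voice)
    start_time = time.time()                               # kept from the original (timing only)

    for j, text in enumerate(texts):
        for audio_frame in tts.tts_with_preset(            # streaming build yields audio frames
            text,
            voice_samples=voice_samples,                   # assumed hidden context line
            conditioning_latents=conditioning_latents,     # assumed hidden context line
            preset="ultra_fast",
            k=1,
        ):
            # Each frame is pushed straight to the streaming gr.Audio output at 24 kHz.
            yield (24000, audio_frame.cpu().detach().numpy())


if __name__ == "__main__":
    tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)  # as in the diff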