daveokpare committed on
Commit 1417ec9 · 1 Parent(s): 07a092c

Add script for tortoise

Files changed (2)
  1. app.py +199 -4
  2. requirements.txt +4 -0
app.py CHANGED
@@ -1,10 +1,205 @@
+ import os, sys
+ import tempfile
  import gradio as gr
+ import numpy as np
+ from typing import Tuple, List

+ # Setup and installation
+ os.system("git clone https://github.com/neonbjb/tortoise-tts.git")
+ sys.path.append("./tortoise-tts/")
+ os.system("pip install -r ./tortoise-tts/requirements.txt")
+ os.system("python ./tortoise-tts/setup.py install")

- def greet(name):
-     return "Hello " + name + "!!"
+ import torch
+ import torchaudio
+ import torch.nn as nn
+ import torch.nn.functional as F

+ from tortoise.api import TextToSpeech
+ from tortoise.utils.audio import load_audio, load_voice

- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+ # Download and instantiate model
+ tts = TextToSpeech()

- iface.launch()
+ # Display parameters
+ VOICES = [
+     "random",
+     "train_atkins",
+     "train_daws",
+     "train_dotrice",
+     "train_dreams",
+     "train_empire",
+     "train_grace",
+     "train_kennard",
+     "train_lescault",
+     "train_mouse",
+     "angie",
+     "applejack",
+     "daniel",
+     "deniro",
+     "emma",
+     "freeman",
+     "geralt",
+     "halle",
+     "jlaw",
+     "lj",
+     "mol",
+     "myself",
+     "pat",
+     "pat2",
+     "rainbow",
+     "snakes",
+     "tim_reynolds",
+     "tom",
+     "weaver",
+     "william",
+ ]
+ DEFAULT_VOICE = "random"
+ PRESETS = ["ultra_fast", "fast", "standard", "high_quality"]
+ DEFAULT_PRESET = "fast"
+ DEFAULT_TEXT = "Hello, world!"
+
+ README = """# TorToiSe
+ Tortoise is a text-to-speech model developed by James Betker. It is capable of zero-shot voice cloning from a small set of voice samples. GitHub repo: [neonbjb/tortoise-tts](https://github.com/neonbjb/tortoise-tts).
+ ## Usage
+ 1. Select a model preset and type the text to speak.
+ 2. Load a voice - either by choosing a preset, uploading audio files, or recording via microphone. Select the option to split audio into chunks if the clips are much longer than 10 seconds each. Follow the guidelines in the [voice customization guide](https://github.com/neonbjb/tortoise-tts#voice-customization-guide).
+ 3. Click **Generate**, and wait - it's called *tortoise* for a reason!
+ """
+
+ TORTOISE_SR_IN = 22050
+ TORTOISE_SR_OUT = 24000
+
+
+ def chunk_audio(
+     t: torch.Tensor, sample_rate: int, chunk_duration_sec: int
+ ) -> List[torch.Tensor]:
+     duration = t.shape[1] / sample_rate
+     num_chunks = 1 + int(duration / chunk_duration_sec)
+     chunks = [
+         t[
+             :,
+             (sample_rate * chunk_duration_sec * i) : (
+                 sample_rate * chunk_duration_sec * (i + 1)
+             ),
+         ]
+         for i in range(num_chunks)
+     ]
+     # remove 0-width chunks
+     chunks = [chunk for chunk in chunks if chunk.shape[1] > 0]
+     return chunks
+
+
+ def tts_main(voice_samples: List[torch.Tensor], text: str, model_preset: str) -> str:
+     gen = tts.tts_with_preset(
+         text,
+         voice_samples=voice_samples,
+         conditioning_latents=None,
+         preset=model_preset,
+     )
+     torchaudio.save("generated.wav", gen.squeeze(0).cpu(), TORTOISE_SR_OUT)
+     return "generated.wav"
+
+
+ def tts_from_preset(voice: str, text, model_preset):
+     voice_samples, _ = load_voice(voice)
+     return tts_main(voice_samples, text, model_preset)
+
+
+ def tts_from_files(
+     files: List[tempfile._TemporaryFileWrapper], do_chunk, text, model_preset
+ ):
+     voice_samples = [load_audio(f.name, TORTOISE_SR_IN) for f in files]
+     if do_chunk:
+         voice_samples = [
+             chunk for t in voice_samples for chunk in chunk_audio(t, TORTOISE_SR_IN, 10)
+         ]
+     return tts_main(voice_samples, text, model_preset)
+
+
+ def tts_from_recording(recording: Tuple[int, np.ndarray], do_chunk, text, model_preset):
+     sample_rate, audio = recording
+     # normalize- https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/utils/audio.py#L16
+     norm_fix = 1
+     if audio.dtype == np.int32:
+         norm_fix = 2**31
+     elif audio.dtype == np.int16:
+         norm_fix = 2**15
+     audio = torch.FloatTensor(audio.T) / norm_fix
+     if len(audio.shape) > 1:
+         # convert to mono
+         audio = torch.mean(audio, axis=0).unsqueeze(0)
+     audio = torchaudio.transforms.Resample(sample_rate, TORTOISE_SR_IN)(audio)
+     if do_chunk:
+         voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
+     else:
+         voice_samples = [audio]
+     return tts_main(voice_samples, text, model_preset)
+
+
+ def tts_from_url(audio_url, start_time, end_time, do_chunk, text, model_preset):
+     os.system(
+         f"yt-dlp -x --audio-format mp3 --force-overwrites {audio_url} -o audio.mp3"
+     )
+     audio = load_audio("audio.mp3", TORTOISE_SR_IN)
+     audio = audio[:, start_time * TORTOISE_SR_IN : end_time * TORTOISE_SR_IN]
+     if do_chunk:
+         voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
+     else:
+         voice_samples = [audio]
+     return tts_main(voice_samples, text, model_preset)
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(README)
+
+     preset = gr.Dropdown(PRESETS, label="Model preset", value=DEFAULT_PRESET)
+     text = gr.Textbox(label="Text to speak", value=DEFAULT_TEXT)
+     do_chunk_label = "Split audio into chunks? (for audio much longer than 10 seconds.)"
+     do_chunk_default = True
+
+     with gr.Tab("Choose preset voice"):
+         inp1 = gr.Dropdown(VOICES, value=DEFAULT_VOICE, label="Preset voice")
+         btn1 = gr.Button("Generate")
+
+     with gr.Tab("Upload audio"):
+         inp2 = gr.File(file_count="multiple")
+         do_chunk2 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
+         btn2 = gr.Button("Generate")
+
+     with gr.Tab("Record audio"):
+         inp3 = gr.Audio(source="microphone")
+         do_chunk3 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
+         btn3 = gr.Button("Generate")
+
+     # with gr.Tab("From YouTube"):
+     #     inp4 = gr.Textbox(label="URL")
+     #     do_chunk4 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
+     #     start_time = gr.Number(label="Start time (seconds)", precision=0)
+     #     end_time = gr.Number(label="End time (seconds)", precision=0)
+     #     btn4 = gr.Button("Generate")
+
+     audio_out = gr.Audio()
+
+     btn1.click(
+         tts_from_preset,
+         [inp1, text, preset],
+         [audio_out],
+     )
+     btn2.click(
+         tts_from_files,
+         [inp2, do_chunk2, text, preset],
+         [audio_out],
+     )
+     btn3.click(
+         tts_from_recording,
+         [inp3, do_chunk3, text, preset],
+         [audio_out],
+     )
+     # btn4.click(
+     #     tts_from_url,
+     #     [inp4, start_time, end_time, do_chunk4, text, preset],
+     #     [audio_out],
+     # )
+
+ demo.launch()
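
A minimal sketch of how the `chunk_audio` helper added above splits a conditioning clip. The 25-second silent tensor and the printed durations are illustrative; the helper body is repeated from this diff so the snippet runs on its own.

```python
import torch
from typing import List

# Copied from the chunk_audio helper added in app.py above.
def chunk_audio(t: torch.Tensor, sample_rate: int, chunk_duration_sec: int) -> List[torch.Tensor]:
    duration = t.shape[1] / sample_rate
    num_chunks = 1 + int(duration / chunk_duration_sec)
    chunks = [
        t[:, sample_rate * chunk_duration_sec * i : sample_rate * chunk_duration_sec * (i + 1)]
        for i in range(num_chunks)
    ]
    # drop the trailing 0-width slice when the clip length is an exact multiple
    return [c for c in chunks if c.shape[1] > 0]

# Illustrative input: 25 s of silent mono audio at Tortoise's 22050 Hz conditioning rate.
clip = torch.zeros(1, 25 * 22050)
chunks = chunk_audio(clip, 22050, 10)
print([c.shape[1] / 22050 for c in chunks])  # -> [10.0, 10.0, 5.0]
```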
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch
+ torchaudio
+ numpy==1.24.1
+ yt-dlp
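
Taken together, app.py and these requirements wrap a Gradio UI around a handful of Tortoise calls. Below is a minimal sketch of that core path without the UI, assuming tortoise-tts has been cloned and installed as in the setup block of app.py; the voice, text, and preset values are illustrative.

```python
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()                            # downloads and instantiates the model
voice_samples, _ = load_voice("train_dotrice")  # any of the preset voices listed in VOICES
gen = tts.tts_with_preset(
    "Hello, world!",
    voice_samples=voice_samples,
    conditioning_latents=None,
    preset="fast",                              # one of the PRESETS options
)
torchaudio.save("generated.wav", gen.squeeze(0).cpu(), 24000)  # Tortoise outputs 24 kHz audio
```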