Spaces:
Running
on
Zero
Running
on
Zero
mrfakename
committed on
Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
app.py
CHANGED
@@ -112,101 +112,34 @@ E2TTS_ema_model = load_model(
|
|
112 |
"E2-TTS", "E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000
|
113 |
)
|
114 |
|
115 |
-
def
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
word_batches = []
|
131 |
-
for word in words:
|
132 |
-
if len(current_word_part.encode('utf-8')) + len(word.encode('utf-8')) + 1 <= max_chars:
|
133 |
-
current_word_part += word + ' '
|
134 |
-
else:
|
135 |
-
if current_word_part:
|
136 |
-
# Try to find a suitable split word
|
137 |
-
for split_word in split_words:
|
138 |
-
split_index = current_word_part.rfind(' ' + split_word + ' ')
|
139 |
-
if split_index != -1:
|
140 |
-
word_batches.append(current_word_part[:split_index].strip())
|
141 |
-
current_word_part = current_word_part[split_index:].strip() + ' '
|
142 |
-
break
|
143 |
-
else:
|
144 |
-
# If no suitable split word found, just append the current part
|
145 |
-
word_batches.append(current_word_part.strip())
|
146 |
-
current_word_part = ""
|
147 |
-
current_word_part += word + ' '
|
148 |
-
if current_word_part:
|
149 |
-
word_batches.append(current_word_part.strip())
|
150 |
-
return word_batches
|
151 |
|
152 |
for sentence in sentences:
|
153 |
-
if len(
|
154 |
-
|
155 |
else:
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
colon_parts = sentence.split(':')
|
165 |
-
if len(colon_parts) > 1:
|
166 |
-
for part in colon_parts:
|
167 |
-
if len(part.encode('utf-8')) <= max_chars:
|
168 |
-
batches.append(part)
|
169 |
-
else:
|
170 |
-
# If colon part is still too long, split by comma
|
171 |
-
comma_parts = re.split('[,,]', part)
|
172 |
-
if len(comma_parts) > 1:
|
173 |
-
current_comma_part = ""
|
174 |
-
for comma_part in comma_parts:
|
175 |
-
if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
|
176 |
-
current_comma_part += comma_part + ','
|
177 |
-
else:
|
178 |
-
if current_comma_part:
|
179 |
-
batches.append(current_comma_part.rstrip(','))
|
180 |
-
current_comma_part = comma_part + ','
|
181 |
-
if current_comma_part:
|
182 |
-
batches.append(current_comma_part.rstrip(','))
|
183 |
-
else:
|
184 |
-
# If no comma, split by words
|
185 |
-
batches.extend(split_by_words(part))
|
186 |
-
else:
|
187 |
-
# If no colon, split by comma
|
188 |
-
comma_parts = re.split('[,,]', sentence)
|
189 |
-
if len(comma_parts) > 1:
|
190 |
-
current_comma_part = ""
|
191 |
-
for comma_part in comma_parts:
|
192 |
-
if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
|
193 |
-
current_comma_part += comma_part + ','
|
194 |
-
else:
|
195 |
-
if current_comma_part:
|
196 |
-
batches.append(current_comma_part.rstrip(','))
|
197 |
-
current_comma_part = comma_part + ','
|
198 |
-
if current_comma_part:
|
199 |
-
batches.append(current_comma_part.rstrip(','))
|
200 |
-
else:
|
201 |
-
# If no comma, split by words
|
202 |
-
batches.extend(split_by_words(sentence))
|
203 |
-
else:
|
204 |
-
current_batch = sentence
|
205 |
-
|
206 |
-
if current_batch:
|
207 |
-
batches.append(current_batch)
|
208 |
-
|
209 |
-
return batches
|
210 |
|
211 |
@gpu_decorator
|
212 |
def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
|
@@ -306,7 +239,9 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
|
|
306 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
307 |
aseg = AudioSegment.from_file(ref_audio_orig)
|
308 |
|
309 |
-
non_silent_segs = silence.split_on_silence(
|
|
|
|
|
310 |
non_silent_wave = AudioSegment.silent(duration=0)
|
311 |
for non_silent_seg in non_silent_segs:
|
312 |
non_silent_wave += non_silent_seg
|
@@ -332,13 +267,20 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
|
|
332 |
else:
|
333 |
gr.Info("Using custom reference text...")
|
334 |
|
335 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
336 |
audio, sr = torchaudio.load(ref_audio)
|
337 |
-
|
338 |
-
|
|
|
339 |
print('ref_text', ref_text)
|
340 |
-
for i,
|
341 |
-
print(f'gen_text {i}',
|
342 |
|
343 |
gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
|
344 |
return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence)
|
@@ -823,4 +765,4 @@ def main(port, host, share, api):
|
|
823 |
|
824 |
|
825 |
if __name__ == "__main__":
|
826 |
-
main()
|
|
|
112 |
"E2-TTS", "E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000
|
113 |
)
|
114 |
|
115 |
+
def chunk_text(text, max_chars=135):
    """
    Split the input text into chunks of at most ``max_chars`` characters.

    The text is first split into sentence-like pieces after any of the
    punctuation marks ``; : , . ! ?`` that is followed by whitespace. Pieces
    are then greedily packed into chunks, joined by single spaces. A piece
    that is itself longer than ``max_chars`` is broken up on whitespace so
    that its words can be packed individually.

    Args:
        text (str): The text to be split.
        max_chars (int): The maximum number of characters per chunk.

    Returns:
        List[str]: A list of non-empty text chunks. Empty or whitespace-only
        input yields an empty list. A chunk can exceed ``max_chars`` only
        when it consists of a single word that is longer than the limit
        (words are never cut in the middle).
    """
    # Nothing to synthesize: return no chunks rather than one empty chunk.
    if not text or not text.strip():
        return []

    # Split the text into sentences based on punctuation followed by whitespace
    sentences = re.split(r'(?<=[;:,.!?])\s+', text.strip())

    # Sentences that fit are kept whole; oversized ones fall back to words so
    # the length limit can still be honoured.
    pieces = []
    for sentence in sentences:
        if len(sentence) <= max_chars:
            pieces.append(sentence)
        else:
            pieces.extend(sentence.split())

    chunks = []
    current_chunk = ""
    for piece in pieces:
        if not piece:
            continue
        if not current_chunk:
            current_chunk = piece
        elif len(current_chunk) + 1 + len(piece) <= max_chars:
            # +1 accounts for the joining space.
            current_chunk += " " + piece
        else:
            chunks.append(current_chunk)
            current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk)

    return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
|
144 |
@gpu_decorator
|
145 |
def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
|
|
|
239 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
240 |
aseg = AudioSegment.from_file(ref_audio_orig)
|
241 |
|
242 |
+
non_silent_segs = silence.split_on_silence(
|
243 |
+
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500
|
244 |
+
)
|
245 |
non_silent_wave = AudioSegment.silent(duration=0)
|
246 |
for non_silent_seg in non_silent_segs:
|
247 |
non_silent_wave += non_silent_seg
|
|
|
267 |
else:
|
268 |
gr.Info("Using custom reference text...")
|
269 |
|
270 |
+
# Add the functionality to ensure it ends with ". "
|
271 |
+
if not ref_text.endswith(". "):
|
272 |
+
if ref_text.endswith("."):
|
273 |
+
ref_text += " "
|
274 |
+
else:
|
275 |
+
ref_text += ". "
|
276 |
+
|
277 |
audio, sr = torchaudio.load(ref_audio)
|
278 |
+
|
279 |
+
# Use the new chunk_text function to split gen_text
|
280 |
+
gen_text_batches = chunk_text(gen_text, max_chars=135)
|
281 |
print('ref_text', ref_text)
|
282 |
+
for i, batch_text in enumerate(gen_text_batches):
|
283 |
+
print(f'gen_text {i}', batch_text)
|
284 |
|
285 |
gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
|
286 |
return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence)
|
|
|
765 |
|
766 |
|
767 |
if __name__ == "__main__":
|
768 |
+
main()
|