NeuralFalcon committed
Commit c02a317 · verified · 1 Parent(s): 30ba20f

Create app.py

Files changed (1)
  1. app.py +641 -0
app.py ADDED
@@ -0,0 +1,641 @@
+
+ # Initialize a pipeline
+ from kokoro import KPipeline
+ # from IPython.display import display, Audio
+ # import soundfile as sf
+ import os
+ from huggingface_hub import list_repo_files
+ import uuid
+ import re
+ import gradio as gr
+
+
+ # Translate language
+ from deep_translator import GoogleTranslator
+ def bulk_translate(text, target_language, chunk_size=500):
+     language_map_local = {
+         "American English": "en",
+         "British English": "en",
+         "Hindi": "hi",
+         "Spanish": "es",
+         "French": "fr",
+         "Italian": "it",
+         "Brazilian Portuguese": "pt",
+         "Japanese": "ja",
+         "Mandarin Chinese": "zh-CN"
+     }
+     # lang_code = GoogleTranslator().get_supported_languages(as_dict=True).get(target_language.lower())
+     lang_code = language_map_local[target_language]
+     sentences = re.split(r'(?<=[.!?])\s+', text)  # Split the text into sentences
+     chunks = []
+     current_chunk = ""
+
+     # Greedily pack sentences into chunks of at most chunk_size characters
+     for sentence in sentences:
+         if len(current_chunk) + len(sentence) <= chunk_size:
+             current_chunk += " " + sentence
+         else:
+             if current_chunk.strip():  # Guard against appending an empty first chunk
+                 chunks.append(current_chunk.strip())
+             current_chunk = sentence
+
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+
+     translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
+     result = " ".join(translated_chunks)
+     return result.strip()
+
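+ # Illustrative usage of bulk_translate (kept commented out: it needs network
+ # access, and the exact Google Translate output may vary):
+ # hindi = bulk_translate("How are you?", "Hindi")
+ # print(hindi)  # expected along the lines of "आप कैसे हैं?"
+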
+ # Language mapping dictionary
+ language_map = {
+     "American English": "a",
+     "British English": "b",
+     "Hindi": "h",
+     "Spanish": "e",
+     "French": "f",
+     "Italian": "i",
+     "Brazilian Portuguese": "p",
+     "Japanese": "j",
+     "Mandarin Chinese": "z"
+ }
+
+
+ def update_pipeline(Language):
+     """Updates the pipeline only if the language has changed."""
+     global pipeline, last_used_language
+     # Get the language code, defaulting to 'a' (American English) if not found
+     new_lang = language_map.get(Language, "a")
+
+     # Only rebuild the pipeline if the language actually changed
+     if new_lang != last_used_language:
+         try:
+             pipeline = KPipeline(lang_code=new_lang)
+             last_used_language = new_lang  # Update the last used language
+         except Exception:
+             gr.Warning(f"Make sure the input text is in {Language}", duration=10)
+             gr.Warning("Falling back to English", duration=5)
+             pipeline = KPipeline(lang_code="a")  # Fall back to English
+             last_used_language = "a"
+
+
+
+ def get_voice_names(repo_id):
+     """Fetches and returns a list of voice names (without extensions) from the given Hugging Face repository."""
+     return [os.path.splitext(file.replace("voices/", ""))[0] for file in list_repo_files(repo_id) if file.startswith("voices/")]
+
+ def create_audio_dir():
+     """Creates the 'kokoro_audio' directory in the root folder if it doesn't exist."""
+     root_dir = os.getcwd()  # Use the current working directory instead of __file__
+     audio_dir = os.path.join(root_dir, "kokoro_audio")
+
+     if not os.path.exists(audio_dir):
+         os.makedirs(audio_dir)
+         print(f"Created directory: {audio_dir}")
+     else:
+         print(f"Directory already exists: {audio_dir}")
+     return audio_dir
+
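+ # Illustrative check (commented out; queries the Hugging Face Hub):
+ # voices = get_voice_names("hexgrad/Kokoro-82M")
+ # print(len(voices), voices[:3])  # the exact list depends on the repo contents
+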
+
+ def clean_text(text):
+     # Define replacement rules
+     replacements = {
+         "–": " ",   # Replace en-dash with space
+         "-": " ",   # Replace hyphen with space
+         "**": " ",  # Replace double asterisks with space
+         "*": " ",   # Replace single asterisk with space
+         "#": " ",   # Replace hash with space
+     }
+
+     # Apply replacements
+     for old, new in replacements.items():
+         text = text.replace(old, new)
+
+     # Remove emojis using regex (covering a wide range of Unicode characters)
+     emoji_pattern = re.compile(
+         r'[\U0001F600-\U0001F64F]|'  # Emoticons
+         r'[\U0001F300-\U0001F5FF]|'  # Miscellaneous symbols and pictographs
+         r'[\U0001F680-\U0001F6FF]|'  # Transport and map symbols
+         r'[\U0001F700-\U0001F77F]|'  # Alchemical symbols
+         r'[\U0001F780-\U0001F7FF]|'  # Geometric shapes extended
+         r'[\U0001F800-\U0001F8FF]|'  # Supplemental arrows-C
+         r'[\U0001F900-\U0001F9FF]|'  # Supplemental symbols and pictographs
+         r'[\U0001FA00-\U0001FA6F]|'  # Chess symbols
+         r'[\U0001FA70-\U0001FAFF]|'  # Symbols and pictographs extended-A
+         r'[\U00002702-\U000027B0]|'  # Dingbats
+         r'[\U0001F1E0-\U0001F1FF]',  # Flags (iOS)
+         flags=re.UNICODE)
+
+     text = emoji_pattern.sub(r'', text)
+
+     # Collapse multiple spaces and extra line breaks
+     text = re.sub(r'\s+', ' ', text).strip()
+
+     return text
+
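+ # Quick sanity check for clean_text (commented out; pure string processing):
+ # print(clean_text("**Hello – world!** 😀 #test"))  # -> "Hello world! test"
+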
+ def tts_file_name(text, language):
+     global temp_folder
+     # Keep only alphabetic characters and spaces, then normalize
+     text = re.sub(r'[^a-zA-Z\s]', '', text)
+     text = text.lower().strip()
+     text = text.replace(" ", "_")
+     language = language.replace(" ", "_").strip()
+     # Truncate long names; fall back to the language name for empty text
+     truncated_text = text[:20] if text else language
+
+     # Generate a random string for uniqueness
+     random_string = uuid.uuid4().hex[:8].upper()
+
+     # Construct the file name
+     file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
+     return file_name
+
+
+ # import soundfile as sf
+ import numpy as np
+ import wave
+ from pydub import AudioSegment
+ from pydub.silence import split_on_silence
+
+ def remove_silence_function(file_path, minimum_silence=50):
+     # Derive the output file name from the provided path
+     output_path = file_path.replace(".wav", "_no_silence.wav")
+     audio_format = "wav"
+     # Read and split the audio file into chunks at silent stretches
+     sound = AudioSegment.from_file(file_path, format=audio_format)
+     audio_chunks = split_on_silence(sound,
+                                     min_silence_len=100,
+                                     silence_thresh=-45,
+                                     keep_silence=minimum_silence)
+     # Put the file back together
+     combined = AudioSegment.empty()
+     for chunk in audio_chunks:
+         combined += chunk
+     combined.export(output_path, format=audio_format)
+     return output_path
+
+ def generate_and_save_audio(text, Language="American English", voice="af_bella", speed=1, remove_silence=False, keep_silence_up_to=0.05):
+     text = clean_text(text)
+     update_pipeline(Language)
+     generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
+     save_path = tts_file_name(text, Language)
+     # Open the WAV file for writing
+     timestamps = {}
+     with wave.open(save_path, 'wb') as wav_file:
+         # Set the WAV file parameters
+         wav_file.setnchannels(1)      # Mono audio
+         wav_file.setsampwidth(2)      # 2 bytes per sample (16-bit audio)
+         wav_file.setframerate(24000)  # Sample rate
+         for i, result in enumerate(generator):
+             gs = result.graphemes  # str
+             # print(f"\n{i}: {gs}")
+             ps = result.phonemes  # str
+             # audio = result.audio.cpu().numpy()
+             audio = result.audio
+             tokens = result.tokens  # List[en.MToken]
+             timestamps[i] = {"text": gs, "words": []}
+             if Language in ["American English", "British English"]:
+                 for t in tokens:
+                     # print(t.text, repr(t.whitespace), t.start_ts, t.end_ts)
+                     timestamps[i]["words"].append({"word": t.text, "start": t.start_ts, "end": t.end_ts})
+             audio_np = audio.numpy()  # Convert Tensor to NumPy array
+             audio_int16 = (audio_np * 32767).astype(np.int16)  # Scale to 16-bit range
+             audio_bytes = audio_int16.tobytes()  # Convert to bytes
+             # Write the audio chunk to the WAV file
+             duration_sec = len(audio_np) / 24000
+             timestamps[i]["duration"] = duration_sec
+             wav_file.writeframes(audio_bytes)
+     if remove_silence:
+         keep_silence = int(keep_silence_up_to * 1000)
+         new_wave_file = remove_silence_function(save_path, minimum_silence=keep_silence)
+         return new_wave_file, timestamps
+     return save_path, timestamps
+
+
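+ # Illustrative direct call (commented out; requires the Kokoro model weights):
+ # wav_path, ts = generate_and_save_audio("Hello there.", Language="American English", voice="af_bella")
+ # ts maps segment index -> {"text": ..., "words": [...], "duration": seconds}
+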
+ def adjust_timestamps(timestamp_dict):
+     adjusted_timestamps = []
+     last_global_end = 0  # Cumulative audio timeline
+
+     for segment_id in sorted(timestamp_dict.keys()):
+         segment = timestamp_dict[segment_id]
+         words = segment["words"]
+         chunk_duration = segment["duration"]
+
+         # If there are valid words, get the last word's end time
+         # (default=0 guards against chunks whose word timings are all missing)
+         last_word_end_in_chunk = (
+             max((w["end"] for w in words if w["end"] not in [None, 0]), default=0)
+             if words else 0
+         )
+
+         silence_gap = chunk_duration - last_word_end_in_chunk
+         if silence_gap < 0:  # In rare cases where end > duration (due to rounding)
+             silence_gap = 0
+
+         for word in words:
+             start = word["start"] or 0
+             end = word["end"] or start
+
+             adjusted_timestamps.append({
+                 "word": word["word"],
+                 "start": round(last_global_end + start, 3),
+                 "end": round(last_global_end + end, 3)
+             })
+
+         # Add the entire chunk duration to the global end
+         last_global_end += chunk_duration
+
+     return adjusted_timestamps
+
+
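+ # Illustrative input/output for adjust_timestamps (commented out; values made up):
+ # adjust_timestamps({
+ #     0: {"text": "Hi.", "duration": 1.0, "words": [{"word": "Hi.", "start": 0.1, "end": 0.5}]},
+ #     1: {"text": "Bye.", "duration": 1.2, "words": [{"word": "Bye.", "start": 0.2, "end": 0.6}]},
+ # })
+ # -> [{'word': 'Hi.', 'start': 0.1, 'end': 0.5}, {'word': 'Bye.', 'start': 1.2, 'end': 1.6}]
+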
+ import string
+
+ def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuation=True):
+     # Convert seconds to SRT time format (HH:MM:SS,mmm)
+     def format_srt_time(seconds):
+         hours = int(seconds // 3600)
+         minutes = int((seconds % 3600) // 60)
+         sec = int(seconds % 60)
+         millisec = int((seconds % 1) * 1000)
+         return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"
+
+     with open(output_file, "w", encoding="utf-8") as f:
+         index = 1  # Track subtitle numbering separately
+
+         for entry in word_level_timestamps:
+             word = entry["word"]
+
+             # Skip punctuation if enabled
+             if skip_punctuation and all(char in string.punctuation for char in word):
+                 continue
+
+             start_srt = format_srt_time(entry["start"])
+             end_srt = format_srt_time(entry["end"])
+
+             # Write the entry to the SRT file
+             f.write(f"{index}\n{start_srt} --> {end_srt}\n{word}\n\n")
+             index += 1  # Increment subtitle number
+
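+ # A generated word.srt entry looks like this (timing values are illustrative):
+ # 1
+ # 00:00:00,100 --> 00:00:00,450
+ # Hello
+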
+
+
+ def split_line_by_char_limit(text, max_chars=30):
+     words = text.split()
+     lines = []
+     current_line = ""
+
+     for word in words:
+         if len(current_line + " " + word) <= max_chars:
+             current_line = (current_line + " " + word).strip()
+         else:
+             lines.append(current_line)
+             current_line = word
+
+     if current_line:
+         # If the last line is a single word and there is a previous line,
+         # append the word to the previous line instead of orphaning it
+         if len(current_line.split()) == 1 and len(lines) > 0:
+             lines[-1] += " " + current_line
+         else:
+             lines.append(current_line)
+
+     return "\n".join(lines)
+
+
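+ # Example (commented out; deterministic string logic):
+ # split_line_by_char_limit("The quick brown fox jumps over the lazy dog", max_chars=30)
+ # -> "The quick brown fox jumps over\nthe lazy dog"
+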
+ def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_words=8, min_pause=0.1):
+     subtitles = []       # Stores subtitle blocks
+     subtitle_words = []  # Temporary list for words in the current subtitle
+     start_time = None    # Tracks the start time of the current subtitle
+
+     remove_punctuation = ['"', "—"]  # Add punctuation marks to remove if needed
+
+     for entry in word_level_timestamps:
+         word = entry["word"]
+         word_start = entry["start"]
+         word_end = entry["end"]
+
+         # Skip selected punctuation from the remove_punctuation list
+         if word in remove_punctuation:
+             continue
+
+         # Attach punctuation to the previous word
+         if word in string.punctuation:
+             if subtitle_words:
+                 subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1])
+             continue
+
+         # Start a new subtitle block if needed
+         if start_time is None:
+             start_time = word_start
+
+         # Calculate pause duration if this is not the first word
+         if subtitle_words:
+             last_word_end = subtitle_words[-1][1]
+             pause_duration = word_start - last_word_end
+         else:
+             pause_duration = 0
+
+         # If the sentence ends, the block is full, or the pause is too long,
+         # close the current subtitle while keeping continuity with the next one
+         if (word.endswith(('.', '!', '?')) and len(subtitle_words) >= 5) or len(subtitle_words) >= max_words or pause_duration > min_pause:
+             end_time = subtitle_words[-1][1]  # Use the last word's end time
+             subtitle_text = " ".join(w[0] for w in subtitle_words)
+             subtitles.append((start_time, end_time, subtitle_text))
+
+             # Reset for the next subtitle, carrying the current word over to avoid delay
+             subtitle_words = [(word, word_end)]
+             start_time = word_start  # Start at the current word, not None
+
+             continue  # Avoid adding the word twice
+
+         # Add the current word to the subtitle
+         subtitle_words.append((word, word_end))
+
+     # Ensure the last subtitle is added if anything remains
+     if subtitle_words:
+         end_time = subtitle_words[-1][1]
+         subtitle_text = " ".join(w[0] for w in subtitle_words)
+         subtitles.append((start_time, end_time, subtitle_text))
+
+     # Format SRT timestamps (HH:MM:SS,mmm)
+     def format_srt_time(seconds):
+         hours = int(seconds // 3600)
+         minutes = int((seconds % 3600) // 60)
+         sec = int(seconds % 60)
+         millisec = int((seconds % 1) * 1000)
+         return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"
+
+     # Write subtitles to the SRT file
+     with open(output_file, "w", encoding="utf-8") as f:
+         for i, (start, end, text) in enumerate(subtitles, start=1):
+             text = split_line_by_char_limit(text, max_chars=30)
+             f.write(f"{i}\n{format_srt_time(start)} --> {format_srt_time(end)}\n{text}\n\n")
+
+     # print(f"SRT file '{output_file}' created successfully!")
+
+
+ import json
+
+ def fix_punctuation(text):
+     # Remove spaces before punctuation marks (., ?, !, ,)
+     text = re.sub(r'\s([.,?!])', r'\1', text)
+
+     # Handle quotation marks: remove spaces before and after them
+     text = text.replace('" ', '"')
+     text = text.replace(' "', '"')
+
+     # Track quotation marks to add a space after closing quotes
+     track = 0
+     result = []
+
+     for char in text:
+         if char == '"':
+             track += 1
+             result.append(char)
+             # If it's a closing quote (even number of quotes), add a space after it
+             if track % 2 == 0:
+                 result.append(' ')
+         else:
+             result.append(char)
+     text = ''.join(result)
+     return text.strip()
+
+
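+ # Example (commented out):
+ # fix_punctuation('Hello , world !')  # -> 'Hello, world!'
+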
+ def make_json(word_timestamps, json_file_name):
+     data = {}
+     temp = []
+     inside_quote = False  # Track if we are inside a quoted sentence
+     start_time = word_timestamps[0]['start']  # Initialize with the first word's start time
+     end_time = word_timestamps[0]['end']      # Initialize with the first word's end time
+     words_in_sentence = []
+     sentence_id = 0  # Initialize sentence ID
+
+     # Process each word in word_timestamps
+     for i, word_data in enumerate(word_timestamps):
+         word = word_data['word']
+         word_start = word_data['start']
+         word_end = word_data['end']
+
+         # Collect word info for JSON
+         words_in_sentence.append({'word': word, 'start': word_start, 'end': word_end})
+
+         # Update the end_time for the sentence based on the current word
+         end_time = word_end
+
+         # Properly handle opening and closing quotation marks
+         if word == '"':
+             if inside_quote:
+                 temp[-1] += '"'  # Attach the closing quote to the last word
+             else:
+                 temp.append('"')  # Keep the opening quote as a separate entry
+             inside_quote = not inside_quote  # Toggle inside_quote state
+         else:
+             temp.append(word)
+
+         # Check if this is sentence-ending punctuation
+         if word.endswith(('.', '?', '!')) and not inside_quote:
+             # Ensure the next word is NOT a dialogue tag before finalizing the sentence
+             if i + 1 < len(word_timestamps):
+                 next_word = word_timestamps[i + 1]['word']
+                 if next_word and next_word[0].islower():  # Likely a dialogue tag like "he said"
+                     continue  # Do not break the sentence yet
+
+             # Store the full sentence for JSON and reset word collection for the next sentence
+             sentence = " ".join(temp)
+             sentence = fix_punctuation(sentence)  # Fix punctuation in the sentence
+             data[sentence_id] = {
+                 'text': sentence,
+                 'duration': end_time - start_time,
+                 'start': start_time,
+                 'end': end_time,
+                 'words': words_in_sentence
+             }
+
+             # Reset for the next sentence
+             temp = []
+             words_in_sentence = []
+             start_time = word_data['start']  # Update the start time for the next sentence
+             sentence_id += 1  # Increment sentence ID
+
+     # Handle any remaining words if necessary
+     if temp:
+         sentence = " ".join(temp)
+         sentence = fix_punctuation(sentence)  # Fix punctuation in the sentence
+         data[sentence_id] = {
+             'text': sentence,
+             'duration': end_time - start_time,
+             'start': start_time,
+             'end': end_time,
+             'words': words_in_sentence
+         }
+
+     # Write data to the JSON file
+     with open(json_file_name, 'w') as json_file:
+         json.dump(data, json_file, indent=4)
+     return json_file_name
+
+
+
+ def modify_filename(save_path: str, prefix: str = ""):
+     directory, filename = os.path.split(save_path)
+     name, ext = os.path.splitext(filename)
+     new_filename = f"{prefix}{name}{ext}"
+     return os.path.join(directory, new_filename)
+
+ import shutil
+ def save_current_data():
+     if os.path.exists("./last"):
+         shutil.rmtree("./last")
+     os.makedirs("./last", exist_ok=True)
+
+
+ def KOKORO_TTS_API(text, Language="American English", voice="af_bella", speed=1, translate_text=False, remove_silence=False, keep_silence_up_to=0.05):
+     if translate_text:
+         text = bulk_translate(text, Language, chunk_size=500)
+     save_path, timestamps = generate_and_save_audio(text=text, Language=Language, voice=voice, speed=speed, remove_silence=remove_silence, keep_silence_up_to=keep_silence_up_to)
+     # Subtitle and timestamp files are only produced when silence removal is off
+     # and the language is English (word-level timings come from the English pipelines)
+     if not remove_silence:
+         if Language in ["American English", "British English"]:
+             word_level_timestamps = adjust_timestamps(timestamps)
+             word_level_srt = modify_filename(save_path.replace(".wav", ".srt"), prefix="word_level_")
+             normal_srt = modify_filename(save_path.replace(".wav", ".srt"), prefix="sentence_")
+             json_file = modify_filename(save_path.replace(".wav", ".json"), prefix="duration_")
+             write_word_srt(word_level_timestamps, output_file=word_level_srt, skip_punctuation=True)
+             write_sentence_srt(word_level_timestamps, output_file=normal_srt, min_pause=0.01)
+             make_json(word_level_timestamps, json_file)
+             save_current_data()
+             shutil.copy(save_path, "./last/")
+             shutil.copy(word_level_srt, "./last/")
+             shutil.copy(normal_srt, "./last/")
+             shutil.copy(json_file, "./last/")
+             return save_path, save_path, word_level_srt, normal_srt, json_file
+     return save_path, save_path, None, None, None
+
+
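+ # Illustrative end-to-end call (commented out; requires the model and voice packs):
+ # audio_path, dl_path, word_srt, sent_srt, ts_json = KOKORO_TTS_API(
+ #     "Hello! This is a test.", Language="American English", voice="af_bella", speed=1
+ # )
+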
+ def ui():
+     def toggle_autoplay(autoplay):
+         return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
+
+     # Example inputs for the interface
+     dummy_examples = [
+         ["Hey, y'all, let’s grab some coffee and catch up!", "American English", "af_bella"],
+         ["I'd like a large coffee, please.", "British English", "bf_isabella"],
+         ["नमस्ते, कैसे हो?", "Hindi", "hf_alpha"],
+         ["Hola, ¿cómo estás?", "Spanish", "ef_dora"],
+         ["Bonjour, comment ça va?", "French", "ff_siwis"],
+         ["Ciao, come stai?", "Italian", "if_sara"],
+         ["Olá, como você está?", "Brazilian Portuguese", "pf_dora"],
+         ["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
+         ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"]
+     ]
+
+     with gr.Blocks() as demo:
+         # gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>")
+         gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
+         lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese']
+         voice_names = get_voice_names("hexgrad/Kokoro-82M")
+
+         with gr.Row():
+             with gr.Column():
+                 text = gr.Textbox(label='📝 Enter Text', lines=3)
+
+                 with gr.Row():
+                     language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
+
+                 with gr.Row():
+                     voice_name = gr.Dropdown(voice_names, label="🎙️ Choose VoicePack", value='af_heart')
+
+                 with gr.Row():
+                     generate_btn = gr.Button('🚀 Generate', variant='primary')
+
+                 with gr.Accordion('🎛️ Audio Settings', open=False):
+                     speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1, label='⚡️Speed', info='Adjust the speaking speed')
+                     translate_text = gr.Checkbox(value=False, label='🌐 Translate Text to Selected Language')
+                     remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+
+             with gr.Column():
+                 audio = gr.Audio(interactive=False, label='🔊 Output Audio', autoplay=True)
+                 audio_file = gr.File(label='📥 Download Audio')
+                 with gr.Accordion('🎬 Autoplay, Subtitle, Timestamp', open=False):
+                     autoplay = gr.Checkbox(value=True, label='▶️ Autoplay')
+                     autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+                     word_level_srt_file = gr.File(label='📝 Download Word-Level SRT')
+                     srt_file = gr.File(label='📜 Download Sentence-Level SRT')
+                     sentence_duration_file = gr.File(label='⏳ Download Sentence Timestamp JSON')
+
+         text.submit(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed, translate_text, remove_silence], outputs=[audio, audio_file, word_level_srt_file, srt_file, sentence_duration_file])
+         generate_btn.click(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed, translate_text, remove_silence], outputs=[audio, audio_file, word_level_srt_file, srt_file, sentence_duration_file])
+
+         # Add examples to the interface
+         gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name])
+
+     return demo
+
+ def tutorial():
+     # Markdown explanation for the voice-name convention
+     explanation = """
+ ## Language Code Explanation:
+ Example: `'af_bella'`
+ - **'a'** stands for **American English**.
+ - **'f_'** stands for **Female** (if it were 'm_', it would mean Male).
+ - **'bella'** refers to the specific voice.
+
+ The first character in the voice code stands for the language:
+ - **"a"**: American English
+ - **"b"**: British English
+ - **"h"**: Hindi
+ - **"e"**: Spanish
+ - **"f"**: French
+ - **"i"**: Italian
+ - **"p"**: Brazilian Portuguese
+ - **"j"**: Japanese
+ - **"z"**: Mandarin Chinese
+
+ The second character stands for gender:
+ - **"f_"**: Female
+ - **"m_"**: Male
+ """
+     with gr.Blocks() as demo2:
+         gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
+         gr.Markdown(explanation)  # Display the explanation
+     return demo2
+
+
+ import click
+ @click.command()
+ @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+ @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
+ def main(debug, share):
+     demo1 = ui()
+     demo2 = tutorial()
+     demo = gr.TabbedInterface([demo1, demo2], ["Multilingual TTS", "VoicePack Explanation"], title="Kokoro TTS")  # , theme='JohnSmith9982/small_and_pretty'
+     demo.queue().launch(debug=debug, share=share)
+     # demo.queue().launch(debug=debug, share=share, server_port=9000)
+     # To run on the local network:
+     # laptop_ip = "192.168.0.30"
+     # port = 8080
+     # demo.queue().launch(debug=debug, share=share, server_name=laptop_ip, server_port=port)
+
+
+ # Initialize the default pipeline
+ last_used_language = "a"
+ pipeline = KPipeline(lang_code=last_used_language)
+ temp_folder = create_audio_dir()
+
+ if __name__ == "__main__":
+     main()
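+ # Typical invocations from a shell:
+ #   python app.py            # run locally
+ #   python app.py --share    # also create a public Gradio link
+ #   python app.py --debug    # enable Gradio debug mode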