|
|
|
from kokoro import KPipeline |
|
|
|
|
|
import os |
|
from huggingface_hub import list_repo_files |
|
import uuid |
|
import re |
|
import gradio as gr |
|
|
|
|
|
|
|
from deep_translator import GoogleTranslator |
|
language_map_local = { |
|
"American English": "en", |
|
"British English": "en", |
|
"Hindi": "hi", |
|
"Spanish": "es", |
|
"French": "fr", |
|
"Italian": "it", |
|
"Brazilian Portuguese": "pt", |
|
"Japanese": "ja", |
|
"Mandarin Chinese": "zh-CN" |
|
} |
|
def bulk_translate(text, target_language, chunk_size=500, max_allowed_characters=10000):
    """Translates text into the target language in sentence-aligned chunks; returns the text unchanged if it is too long."""
    if len(text) >= max_allowed_characters:
        gr.Warning("[WARNING] Text too long; skipping translation to prevent Google Translate abuse.")
        return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    lang_code = language_map_local.get(target_language, "en")
|
sentences = re.split(r'(?<=[.!?])\s+', text) |
|
chunks = [] |
|
current_chunk = "" |
|
|
|
for sentence in sentences: |
|
if len(current_chunk) + len(sentence) <= chunk_size: |
|
current_chunk += " " + sentence |
|
else: |
|
chunks.append(current_chunk.strip()) |
|
current_chunk = sentence |
|
|
|
    if current_chunk:
        chunks.append(current_chunk.strip())

    chunks = [chunk for chunk in chunks if chunk]  # drop empty chunks so the translator is never handed blank input
    translator = GoogleTranslator(target=lang_code)
    translated_chunks = [translator.translate(chunk) for chunk in chunks]
    result = " ".join(translated_chunks)
    return result.strip()
|
|
|
|
|
language_map = { |
|
"American English": "a", |
|
"British English": "b", |
|
"Hindi": "h", |
|
"Spanish": "e", |
|
"French": "f", |
|
"Italian": "i", |
|
"Brazilian Portuguese": "p", |
|
"Japanese": "j", |
|
"Mandarin Chinese": "z" |
|
} |
|
|
|
|
|
def update_pipeline(Language): |
|
""" Updates the pipeline only if the language has changed. """ |
|
global pipeline, last_used_language |
|
|
|
new_lang = language_map.get(Language, "a") |
|
|
|
|
|
    if new_lang != last_used_language:
        try:
            pipeline = KPipeline(lang_code=new_lang)
            last_used_language = new_lang
        except Exception:
            gr.Warning(f"Make sure the input text is in {Language}", duration=10)
            gr.Warning("Falling back to English.", duration=5)
            pipeline = KPipeline(lang_code="a")
            last_used_language = "a"
|
|
|
|
|
|
|
def get_voice_names(repo_id): |
|
"""Fetches and returns a list of voice names (without extensions) from the given Hugging Face repository.""" |
|
return [os.path.splitext(file.replace("voices/", ""))[0] for file in list_repo_files(repo_id) if file.startswith("voices/")] |
|
|
|
def create_audio_dir(): |
|
"""Creates the 'kokoro_audio' directory in the root folder if it doesn't exist.""" |
|
root_dir = os.getcwd() |
|
audio_dir = os.path.join(root_dir, "kokoro_audio") |
|
|
|
if not os.path.exists(audio_dir): |
|
os.makedirs(audio_dir) |
|
print(f"Created directory: {audio_dir}") |
|
else: |
|
print(f"Directory already exists: {audio_dir}") |
|
return audio_dir |
|
|
|
|
|
|
def clean_text(text): |
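    """Replaces dashes and markdown markers with spaces, strips emoji, and collapses whitespace."""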
|
|
|
replacements = { |
|
"–": " ", |
|
"-": " ", |
|
"**": " ", |
|
"*": " ", |
|
"#": " ", |
|
} |
|
|
|
|
|
for old, new in replacements.items(): |
|
text = text.replace(old, new) |
|
|
|
|
|
emoji_pattern = re.compile( |
|
r'[\U0001F600-\U0001F64F]|' |
|
r'[\U0001F300-\U0001F5FF]|' |
|
r'[\U0001F680-\U0001F6FF]|' |
|
r'[\U0001F700-\U0001F77F]|' |
|
r'[\U0001F780-\U0001F7FF]|' |
|
r'[\U0001F800-\U0001F8FF]|' |
|
r'[\U0001F900-\U0001F9FF]|' |
|
r'[\U0001FA00-\U0001FA6F]|' |
|
r'[\U0001FA70-\U0001FAFF]|' |
|
r'[\U00002702-\U000027B0]|' |
|
r'[\U0001F1E0-\U0001F1FF]' |
|
r'', flags=re.UNICODE) |
|
|
|
text = emoji_pattern.sub(r'', text) |
|
|
|
|
|
text = re.sub(r'\s+', ' ', text).strip() |
|
|
|
return text |
|
|
|
def tts_file_name(text,language): |
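    """Builds a unique .wav path in the global temp_folder from a short slug of the text (or the language name) plus a random hex suffix."""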
|
global temp_folder |
|
|
|
text = re.sub(r'[^a-zA-Z\s]', '', text) |
|
text = text.lower().strip() |
|
text = text.replace(" ", "_") |
|
language=language.replace(" ", "_").strip() |
|
|
|
    truncated_text = text[:20] if text else language
|
|
|
|
|
random_string = uuid.uuid4().hex[:8].upper() |
|
|
|
|
|
file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav" |
|
return file_name |
|
|
|
|
|
|
|
import numpy as np |
|
import wave |
|
from pydub import AudioSegment |
|
from pydub.silence import split_on_silence |
|
|
|
def remove_silence_function(file_path,minimum_silence=50): |
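    """Cuts silent stretches out of a wav file with pydub, keeping up to minimum_silence ms at each cut, and returns the new *_no_silence.wav path."""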
|
|
|
output_path = file_path.replace(".wav", "_no_silence.wav") |
|
audio_format = "wav" |
|
|
|
sound = AudioSegment.from_file(file_path, format=audio_format) |
|
audio_chunks = split_on_silence(sound, |
|
min_silence_len=100, |
|
silence_thresh=-45, |
|
keep_silence=minimum_silence) |
|
|
|
combined = AudioSegment.empty() |
|
for chunk in audio_chunks: |
|
combined += chunk |
|
combined.export(output_path, format=audio_format) |
|
return output_path |
|
|
|
def generate_and_save_audio(text, Language="American English",voice="af_bella", speed=1,remove_silence=False,keep_silence_up_to=0.05): |
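    """Runs Kokoro over the text and streams each generated chunk into a mono 16-bit 24 kHz wav.
    Returns (wav_path, timestamps); per-word timings are only collected for English voices."""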
|
text=clean_text(text) |
|
update_pipeline(Language) |
|
generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+') |
|
save_path=tts_file_name(text,Language) |
|
|
|
timestamps={} |
|
with wave.open(save_path, 'wb') as wav_file: |
|
|
|
wav_file.setnchannels(1) |
|
wav_file.setsampwidth(2) |
|
wav_file.setframerate(24000) |
|
for i, result in enumerate(generator): |
|
gs = result.graphemes |
|
|
|
ps = result.phonemes |
|
|
|
audio = result.audio |
|
tokens = result.tokens |
|
timestamps[i]={"text":gs,"words":[]} |
|
if Language in ["American English", "British English"]: |
|
for t in tokens: |
|
|
|
timestamps[i]["words"].append({"word":t.text,"start":t.start_ts,"end":t.end_ts}) |
|
            audio_np = audio.numpy()
            # clamp to [-1, 1] before scaling so out-of-range samples cannot wrap around when cast to int16
            audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)
            audio_bytes = audio_int16.tobytes()
|
|
|
duration_sec = len(audio_np) / 24000 |
|
timestamps[i]["duration"] = duration_sec |
|
wav_file.writeframes(audio_bytes) |
|
if remove_silence: |
|
keep_silence = int(keep_silence_up_to * 1000) |
|
new_wave_file=remove_silence_function(save_path,minimum_silence=keep_silence) |
|
return new_wave_file,timestamps |
|
return save_path,timestamps |
|
|
|
|
|
|
|
def adjust_timestamps(timestamp_dict): |
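    """Flattens per-chunk word timestamps into one list on the global timeline by offsetting
    each chunk with the accumulated duration of the chunks before it."""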
|
adjusted_timestamps = [] |
|
last_global_end = 0 |
|
|
|
for segment_id in sorted(timestamp_dict.keys()): |
|
segment = timestamp_dict[segment_id] |
|
words = segment["words"] |
|
chunk_duration = segment["duration"] |
|
|
|
|
|
        # Trailing-silence bookkeeping for this chunk (currently unused by the adjustment below).
        last_word_end_in_chunk = (
            max((w["end"] for w in words if w["end"] not in (None, 0)), default=0)
            if words else 0
        )
        silence_gap = max(0, chunk_duration - last_word_end_in_chunk)
|
|
|
for word in words: |
|
start = word["start"] or 0 |
|
end = word["end"] or start |
|
|
|
adjusted_timestamps.append({ |
|
"word": word["word"], |
|
"start": round(last_global_end + start, 3), |
|
"end": round(last_global_end + end, 3) |
|
}) |
|
|
|
|
|
last_global_end += chunk_duration |
|
|
|
return adjusted_timestamps |
|
|
|
|
|
|
|
import string |
|
|
|
def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuation=True): |
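    """Writes one SRT cue per word, optionally skipping punctuation-only tokens."""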
|
    def format_srt_time(seconds):
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        sec = int(seconds % 60)
        millisec = int((seconds % 1) * 1000)
        return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"

    with open(output_file, "w", encoding="utf-8") as f:
        index = 1

        for entry in word_level_timestamps:
|
word = entry["word"] |
|
|
|
|
|
if skip_punctuation and all(char in string.punctuation for char in word): |
|
continue |
|
|
|
start_time = entry["start"] |
|
end_time = entry["end"] |
|
|
|
|
|
            start_srt = format_srt_time(start_time)
            end_srt = format_srt_time(end_time)
|
|
|
|
|
f.write(f"{index}\n{start_srt} --> {end_srt}\n{word}\n\n") |
|
index += 1 |
|
|
|
|
|
|
|
|
def split_line_by_char_limit(text, max_chars=30): |
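    """Wraps text into lines of at most max_chars characters, merging a trailing one-word line into the previous line."""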
|
words = text.split() |
|
lines = [] |
|
current_line = "" |
|
|
|
for word in words: |
|
if len(current_line + " " + word) <= max_chars: |
|
current_line = (current_line + " " + word).strip() |
|
        else:
            if current_line:  # skip the empty first line when a single word is longer than max_chars
                lines.append(current_line)
            current_line = word
|
|
|
if current_line: |
|
|
|
if len(current_line.split()) == 1 and len(lines) > 0: |
|
|
|
lines[-1] += " " + current_line |
|
else: |
|
lines.append(current_line) |
|
|
|
return "\n".join(lines) |
|
|
|
|
|
def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_words=8, min_pause=0.1): |
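    """Groups word-level timestamps into subtitle cues, breaking on sentence-ending punctuation
    (after at least 5 words), on max_words, or on a pause longer than min_pause seconds."""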
|
subtitles = [] |
|
subtitle_words = [] |
|
start_time = None |
|
|
|
remove_punctuation = ['"',"—"] |
|
|
|
for i, entry in enumerate(word_level_timestamps): |
|
word = entry["word"] |
|
word_start = entry["start"] |
|
word_end = entry["end"] |
|
|
|
|
|
if word in remove_punctuation: |
|
continue |
|
|
|
|
|
if word in string.punctuation: |
|
if subtitle_words: |
|
subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1]) |
|
continue |
|
|
|
|
|
if start_time is None: |
|
start_time = word_start |
|
|
|
|
|
if subtitle_words: |
|
last_word_end = subtitle_words[-1][1] |
|
pause_duration = word_start - last_word_end |
|
else: |
|
pause_duration = 0 |
|
|
|
|
|
if (word.endswith(('.', '!', '?')) and len(subtitle_words) >= 5) or len(subtitle_words) >= max_words or pause_duration > min_pause: |
|
end_time = subtitle_words[-1][1] |
|
subtitle_text = " ".join(w[0] for w in subtitle_words) |
|
subtitles.append((start_time, end_time, subtitle_text)) |
|
|
|
|
|
subtitle_words = [(word, word_end)] |
|
start_time = word_start |
|
|
|
continue |
|
|
|
|
|
subtitle_words.append((word, word_end)) |
|
|
|
|
|
if subtitle_words: |
|
end_time = subtitle_words[-1][1] |
|
subtitle_text = " ".join(w[0] for w in subtitle_words) |
|
subtitles.append((start_time, end_time, subtitle_text)) |
|
|
|
|
|
def format_srt_time(seconds): |
|
hours = int(seconds // 3600) |
|
minutes = int((seconds % 3600) // 60) |
|
sec = int(seconds % 60) |
|
millisec = int((seconds % 1) * 1000) |
|
return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}" |
|
|
|
|
|
with open(output_file, "w", encoding="utf-8") as f: |
|
for i, (start, end, text) in enumerate(subtitles, start=1): |
|
text=split_line_by_char_limit(text, max_chars=30) |
|
f.write(f"{i}\n{format_srt_time(start)} --> {format_srt_time(end)}\n{text}\n\n") |
|
|
|
|
|
|
|
|
|
import json |
|
|
|
|
def fix_punctuation(text): |
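    """Tightens spacing around punctuation and double quotes so sentences rebuilt from word tokens read naturally."""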
|
|
|
text = re.sub(r'\s([.,?!])', r'\1', text) |
|
|
|
|
|
text = text.replace('" ', '"') |
|
text = text.replace(' "', '"') |
|
text = text.replace('" ', '"') |
|
|
|
|
|
track = 0 |
|
result = [] |
|
|
|
for index, char in enumerate(text): |
|
if char == '"': |
|
track += 1 |
|
result.append(char) |
|
|
|
if track % 2 == 0: |
|
result.append(' ') |
|
else: |
|
result.append(char) |
|
text=''.join(result) |
|
return text.strip() |
|
|
|
|
|
|
|
def make_json(word_timestamps, json_file_name): |
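    """Groups word timestamps into sentences (splitting on ., ?, ! outside quotes) and writes
    them, with start/end/duration and per-word timings, to json_file_name."""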
|
    data = {}
    if not word_timestamps:  # nothing to group; still write a valid (empty) JSON file
        with open(json_file_name, 'w') as json_file:
            json.dump(data, json_file, indent=4)
        return json_file_name
    temp = []
    inside_quote = False
    start_time = word_timestamps[0]['start']
    end_time = word_timestamps[0]['end']
|
words_in_sentence = [] |
|
sentence_id = 0 |
|
|
|
|
|
for i, word_data in enumerate(word_timestamps): |
|
word = word_data['word'] |
|
word_start = word_data['start'] |
|
word_end = word_data['end'] |
|
|
|
|
|
words_in_sentence.append({'word': word, 'start': word_start, 'end': word_end}) |
|
|
|
|
|
end_time = word_end |
|
|
|
|
|
if word == '"': |
|
if inside_quote: |
|
temp[-1] += '"' |
|
else: |
|
temp.append('"') |
|
inside_quote = not inside_quote |
|
else: |
|
temp.append(word) |
|
|
|
|
|
if word.endswith(('.', '?', '!')) and not inside_quote: |
|
|
|
if i + 1 < len(word_timestamps): |
|
next_word = word_timestamps[i + 1]['word'] |
|
if next_word[0].islower(): |
|
continue |
|
|
|
|
|
sentence = " ".join(temp) |
|
sentence = fix_punctuation(sentence) |
|
data[sentence_id] = { |
|
'text': sentence, |
|
'duration': end_time - start_time, |
|
'start': start_time, |
|
'end': end_time, |
|
'words': words_in_sentence |
|
} |
|
|
|
|
|
temp = [] |
|
words_in_sentence = [] |
|
start_time = word_data['start'] |
|
sentence_id += 1 |
|
|
|
|
|
if temp: |
|
sentence = " ".join(temp) |
|
sentence = fix_punctuation(sentence) |
|
data[sentence_id] = { |
|
'text': sentence, |
|
'duration': end_time - start_time, |
|
'start': start_time, |
|
'end': end_time, |
|
'words': words_in_sentence |
|
} |
|
|
|
|
|
with open(json_file_name, 'w') as json_file: |
|
json.dump(data, json_file, indent=4) |
|
return json_file_name |
|
|
|
|
|
|
|
|
|
|
|
|
def modify_filename(save_path: str, prefix: str = ""): |
|
directory, filename = os.path.split(save_path) |
|
name, ext = os.path.splitext(filename) |
|
new_filename = f"{prefix}{name}{ext}" |
|
return os.path.join(directory, new_filename) |
|
import shutil |
|
def save_current_data(): |
|
if os.path.exists("./last"): |
|
shutil.rmtree("./last") |
|
os.makedirs("./last",exist_ok=True) |
|
|
|
def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,translate_text=False,remove_silence=False,keep_silence_up_to=0.05): |
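    """End-to-end entry point: optional translation, TTS generation, and (for English, when
    silence removal is off) word/sentence SRT plus sentence-timestamp JSON files."""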
|
if translate_text: |
|
text=bulk_translate(text, Language, chunk_size=500) |
|
save_path,timestamps=generate_and_save_audio(text=text, Language=Language,voice=voice, speed=speed,remove_silence=remove_silence,keep_silence_up_to=keep_silence_up_to) |
|
    if not remove_silence:
|
if Language in ["American English", "British English"]: |
|
word_level_timestamps=adjust_timestamps(timestamps) |
|
word_level_srt = modify_filename(save_path.replace(".wav", ".srt"), prefix="word_level_") |
|
normal_srt = modify_filename(save_path.replace(".wav", ".srt"), prefix="sentence_") |
|
json_file = modify_filename(save_path.replace(".wav", ".json"), prefix="duration_") |
|
write_word_srt(word_level_timestamps, output_file=word_level_srt, skip_punctuation=True) |
|
write_sentence_srt(word_level_timestamps, output_file=normal_srt, min_pause=0.01) |
|
make_json(word_level_timestamps, json_file) |
|
save_current_data() |
|
shutil.copy(save_path, "./last/") |
|
shutil.copy(word_level_srt, "./last/") |
|
shutil.copy(normal_srt, "./last/") |
|
shutil.copy(json_file, "./last/") |
|
return save_path,save_path,word_level_srt,normal_srt,json_file |
|
return save_path,save_path,None,None,None |
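
# Example call (a sketch; the text, language, and voice below are placeholder values):
# audio_path, _, word_srt, sentence_srt, duration_json = KOKORO_TTS_API(
#     "Hello there!", Language="American English", voice="af_bella", speed=1)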
|
|
|
|
|
|
|
def toggle_autoplay(autoplay): |
|
return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay) |
|
lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese'] |
|
voice_names = get_voice_names("hexgrad/Kokoro-82M") |
|
def ui(): |
|
|
|
dummy_examples = [ |
|
["Hey, y'all, let’s grab some coffee and catch up!", "American English", "af_bella"], |
|
["I'd like a large coffee, please.", "British English", "bf_isabella"], |
|
["नमस्ते, कैसे हो?", "Hindi", "hf_alpha"], |
|
["Hola, ¿cómo estás?", "Spanish", "ef_dora"], |
|
["Bonjour, comment ça va?", "French", "ff_siwis"], |
|
["Ciao, come stai?", "Italian", "if_sara"], |
|
["Olá, como você está?", "Brazilian Portuguese", "pf_dora"], |
|
["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"], |
|
["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"] |
|
] |
|
|
|
with gr.Blocks() as demo: |
|
|
|
gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/Kokoro-TTS-Subtitle)") |
|
|
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
text = gr.Textbox(label='📝 Enter Text', lines=3) |
|
|
|
with gr.Row(): |
|
language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0]) |
|
|
|
with gr.Row(): |
|
voice_name = gr.Dropdown(voice_names, label="🎙️ Choose VoicePack", value='af_heart') |
|
|
|
with gr.Row(): |
|
generate_btn = gr.Button('🚀 Generate', variant='primary') |
|
|
|
with gr.Accordion('🎛️ Audio Settings', open=False): |
|
speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1, label='⚡️Speed', info='Adjust the speaking speed') |
|
translate_text = gr.Checkbox(value=False, label='🌐 Translate Text to Selected Language') |
|
remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS') |
|
|
|
with gr.Column(): |
|
audio = gr.Audio(interactive=False, label='🔊 Output Audio', autoplay=True) |
|
audio_file = gr.File(label='📥 Download Audio') |
|
|
|
|
|
|
|
with gr.Accordion('🎬 Autoplay, Subtitle, Timestamp', open=False): |
|
autoplay = gr.Checkbox(value=True, label='▶️ Autoplay') |
|
autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio]) |
|
word_level_srt_file = gr.File(label='📝 Download Word-Level SRT') |
|
srt_file = gr.File(label='📜 Download Sentence-Level SRT') |
|
sentence_duration_file = gr.File(label='⏳ Download Sentence Timestamp JSON') |
|
|
|
text.submit(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file]) |
|
generate_btn.click(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file]) |
|
|
|
|
|
gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name]) |
|
|
|
return demo |
|
|
|
def tutorial(): |
|
|
|
explanation = """ |
|
## Language Code Explanation: |
|
Example: `'af_bella'` |
|
- **'a'** stands for **American English**. |
|
- **'f_'** stands for **Female** (If it were 'm_', it would mean Male). |
|
- **'bella'** refers to the specific voice. |
|
|
|
The first character in the voice code stands for the language: |
|
- **"a"**: American English |
|
- **"b"**: British English |
|
- **"h"**: Hindi |
|
- **"e"**: Spanish |
|
- **"f"**: French |
|
- **"i"**: Italian |
|
- **"p"**: Brazilian Portuguese |
|
- **"j"**: Japanese |
|
- **"z"**: Mandarin Chinese |
|
|
|
The second character stands for gender: |
|
- **"f_"**: Female |
|
- **"m_"**: Male |
|
""" |
|
with gr.Blocks() as demo2: |
|
|
|
gr.Markdown(explanation) |
|
return demo2 |
|
|
|
|
|
|
|
import platform
import datetime
import subprocess

import pysrt
import librosa
import soundfile as sf
from tqdm.auto import tqdm
|
|
|
|
|
|
|
def get_current_time(): |
|
return datetime.datetime.now().strftime("%I_%M_%p") |
|
|
|
def get_subtitle_Dub_path(srt_file_path, Language): |
|
file_name = os.path.splitext(os.path.basename(srt_file_path))[0] |
|
full_base_path = os.path.join(os.getcwd(), "TTS_DUB") |
|
os.makedirs(full_base_path, exist_ok=True) |
|
random_string = str(uuid.uuid4())[:6] |
|
lang = language_map_local.get(Language, Language.replace(" ", "_")) |
|
new_path = os.path.join(full_base_path, f"{file_name}_{lang}_{random_string}.wav") |
|
return new_path.replace("__", "_") |
|
|
|
def clean_srt(input_path): |
|
def clean_srt_line(text): |
|
for bad in ["[", "]", "♫"]: |
|
text = text.replace(bad, "") |
|
return text.strip() |
|
|
|
subs = pysrt.open(input_path, encoding='utf-8') |
|
    output_path = os.path.splitext(input_path)[0] + "_.srt"
|
with open(output_path, "w", encoding='utf-8') as file: |
|
for sub in subs: |
|
file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n") |
|
return output_path |
|
|
|
def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000): |
|
output_path = input_path.replace(".srt", f"{target_language}.srt") |
|
subs = pysrt.open(input_path, encoding='utf-8') |
|
if len(subs) > max_segments: |
|
gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.") |
|
return input_path |
|
|
|
original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)] |
|
full_text = "\n".join(original) |
|
|
|
chunks, start = [], 0 |
|
while start < len(full_text): |
|
end = start + chunk_size |
|
        split_point = full_text.rfind("<#", start, end) if end < len(full_text) else len(full_text)
        if split_point <= start:  # no segment marker found in this window; hard-split to avoid looping forever
            split_point = min(end, len(full_text))
        chunks.append(full_text[start:split_point])
        start = split_point
|
|
|
lang_code = language_map_local.get(target_language, "en") |
|
translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks] |
|
translated_text = "\n".join(translated_chunks) |
|
|
|
pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL) |
|
translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)} |
|
|
|
for i, sub in enumerate(subs): |
|
sub.text = translated_dict.get(i, sub.text) |
|
|
|
subs.save(output_path, encoding='utf-8') |
|
return output_path |
|
|
|
def prepare_srt(srt_path, target_language, translate=False): |
|
path = clean_srt(srt_path) |
|
return translate_srt(path, target_language) if translate else path |
|
|
|
|
|
def is_ffmpeg_installed(): |
|
ffmpeg_exe = "ffmpeg.exe" if platform.system() == "Windows" else "ffmpeg" |
|
try: |
|
subprocess.run([ffmpeg_exe, "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) |
|
return True, ffmpeg_exe |
|
except Exception: |
|
gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20) |
|
return False, ffmpeg_exe |
|
|
|
def speedup_audio_librosa(input_file, output_file, speedup_factor): |
|
try: |
|
y, sr = librosa.load(input_file, sr=None) |
|
y_stretched = librosa.effects.time_stretch(y, rate=speedup_factor) |
|
sf.write(output_file, y_stretched, sr) |
|
except Exception as e: |
|
gr.Warning(f"Librosa speedup failed: {e}") |
|
shutil.copy(input_file, output_file) |
|
|
|
def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path): |
|
if use_ffmpeg: |
|
try: |
|
            # ffmpeg's atempo filter is documented for factors of roughly 0.5-2.0 per pass; very large
            # factors may need chained atempo filters (an assumption worth checking against your build).
            subprocess.run([ffmpeg_path, "-i", input_file, "-filter:a", f"atempo={speedup_factor}", output_file, "-y"], check=True)
|
except Exception as e: |
|
gr.Error(f"FFmpeg speedup error: {e}") |
|
speedup_audio_librosa(input_file, output_file, speedup_factor) |
|
else: |
|
speedup_audio_librosa(input_file, output_file, speedup_factor) |
|
|
|
def remove_edge_silence(input_path, output_path): |
|
y, sr = librosa.load(input_path, sr=None) |
|
trimmed_audio, _ = librosa.effects.trim(y, top_db=30) |
|
sf.write(output_path, trimmed_audio, sr) |
|
return output_path |
|
|
|
|
|
|
|
class SRTDubbing: |
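    """Dubs a subtitle file: synthesizes each SRT entry with Kokoro TTS, fits it into the entry's
    time slot (regenerating without silence, speeding up, or padding as needed), and joins
    everything with the original inter-subtitle pauses."""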
|
def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"): |
|
self.use_ffmpeg = use_ffmpeg |
|
self.ffmpeg_path = ffmpeg_path |
|
self.cache_dir = "./cache" |
|
os.makedirs("./dummy", exist_ok=True) |
|
os.makedirs(self.cache_dir, exist_ok=True) |
|
|
|
@staticmethod |
|
def convert_to_millisecond(t): |
|
return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds) |
|
|
|
@staticmethod |
|
def read_srt_file(file_path): |
|
subs = pysrt.open(file_path, encoding='utf-8') |
|
entries = [] |
|
prev_end = 0 |
|
for idx, sub in enumerate(subs, 1): |
|
start, end = SRTDubbing.convert_to_millisecond(sub.start), SRTDubbing.convert_to_millisecond(sub.end) |
|
pause = start - prev_end if idx > 1 else start |
|
entries.append({ |
|
'entry_number': idx, |
|
'start_time': start, |
|
'end_time': end, |
|
'text': sub.text.strip(), |
|
'pause_time': pause, |
|
'audio_name': f"{idx}.wav", |
|
'previous_pause': f"{idx}_before_pause.wav", |
|
}) |
|
prev_end = end |
|
return entries |
|
|
|
def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration): |
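        """Generates TTS for one subtitle entry and fits it to actual_duration (ms): retry without
        silence if too long, then retry at a higher speed, then time-stretch; pad with silence if short."""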
|
temp = "./cache/temp.wav" |
|
|
|
path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=False, keep_silence_up_to=0.05) |
|
|
|
remove_edge_silence(path, temp) |
|
|
|
audio = AudioSegment.from_file(temp) |
|
|
|
|
|
if actual_duration == 0: |
|
shutil.move(temp, audio_path) |
|
return |
|
|
|
|
|
if len(audio) > actual_duration: |
|
path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=True, keep_silence_up_to=0.05) |
|
remove_edge_silence(path, temp) |
|
audio = AudioSegment.from_file(temp) |
|
|
|
|
|
if len(audio) > actual_duration: |
|
factor = len(audio) / actual_duration |
|
path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=factor, remove_silence=True, keep_silence_up_to=0.05) |
|
remove_edge_silence(path, temp) |
|
audio = AudioSegment.from_file(temp) |
|
|
|
|
|
if len(audio) > actual_duration: |
|
factor = len(audio) / actual_duration |
|
final_temp = "./cache/speedup_temp.wav" |
|
change_speed(temp, final_temp, factor, self.use_ffmpeg, self.ffmpeg_path) |
|
shutil.move(final_temp, audio_path) |
|
|
|
|
|
elif len(audio) < actual_duration: |
|
silence = AudioSegment.silent(duration=actual_duration - len(audio)) |
|
(audio + silence).export(audio_path, format="wav") |
|
|
|
else: |
|
shutil.move(temp, audio_path) |
|
|
|
@staticmethod |
|
def make_silence(duration, path): |
|
AudioSegment.silent(duration=duration).export(path, format="wav") |
|
|
|
@staticmethod |
|
def create_folder_for_srt(srt_file_path): |
|
base = os.path.splitext(os.path.basename(srt_file_path))[0] |
|
folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}" |
|
os.makedirs(folder, exist_ok=True) |
|
return folder |
|
|
|
@staticmethod |
|
def concatenate_audio_files(paths, output): |
|
audio = sum([AudioSegment.from_file(p) for p in paths], AudioSegment.silent(duration=0)) |
|
audio.export(output, format="wav") |
|
|
|
def srt_to_dub(self, srt_path, output_path, language, voice): |
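        """Renders every SRT entry (preceded by its leading pause) to audio and concatenates the pieces into output_path."""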
|
entries = self.read_srt_file(srt_path) |
|
folder = self.create_folder_for_srt(srt_path) |
|
all_audio = [] |
|
for entry in tqdm(entries): |
|
self.make_silence(entry['pause_time'], os.path.join(folder, entry['previous_pause'])) |
|
all_audio.append(os.path.join(folder, entry['previous_pause'])) |
|
|
|
tts_path = os.path.join(folder, entry['audio_name']) |
|
self.text_to_speech_srt(entry['text'], tts_path, language, voice, entry['end_time'] - entry['start_time']) |
|
all_audio.append(tts_path) |
|
|
|
self.concatenate_audio_files(all_audio, output_path) |
|
|
|
|
|
|
|
def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False): |
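    """Gradio handler: validates the uploaded .srt file, optionally translates it, then dubs it to a wav."""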
|
if not srt_path.endswith(".srt"): |
|
gr.Error("Please upload a valid .srt file", duration=5) |
|
return None |
|
|
|
use_ffmpeg, ffmpeg_path = is_ffmpeg_installed() |
|
processed_srt = prepare_srt(srt_path, Language, translate) |
|
output_path = get_subtitle_Dub_path(srt_path, Language) |
|
|
|
SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name) |
|
return output_path,output_path |
|
|
|
def subtitle_ui(): |
|
with gr.Blocks() as demo: |
|
|
|
gr.Markdown( |
|
""" |
|
# Generate Audio File From Subtitle [Upload Only .srt file] |
|
|
|
To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle) |
|
|
|
""" |
|
) |
|
with gr.Row(): |
|
with gr.Column(): |
|
srt_file = gr.File(label='Upload .srt Subtitle File Only') |
|
|
|
language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0]) |
|
|
|
voice = gr.Dropdown( |
|
voice_names, |
|
value='af_bella', |
|
allow_custom_value=False, |
|
label='🎙️ Choose VoicePack', |
|
) |
|
with gr.Row(): |
|
generate_btn_ = gr.Button('Generate', variant='primary') |
|
|
|
with gr.Accordion('Other Settings', open=False): |
|
translate_text = gr.Checkbox(value=False, label='🌐 Translate Subtitle to Selected Language') |
|
|
|
|
|
|
|
with gr.Column(): |
|
audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True) |
|
audio_file = gr.File(label='📥 Download Audio') |
|
with gr.Accordion('Enable Autoplay', open=False): |
|
autoplay = gr.Checkbox(value=True, label='Autoplay') |
|
autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
generate_btn_.click( |
|
srt_process, |
|
inputs=[srt_file,language_name,voice,translate_text], |
|
outputs=[audio,audio_file] |
|
) |
|
return demo |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import click |
|
@click.command() |
|
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.") |
|
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.") |
|
def main(debug, share): |
|
|
|
demo1 = ui() |
|
demo2 = subtitle_ui() |
|
demo3 = tutorial() |
|
demo = gr.TabbedInterface([demo1, demo2,demo3],["Multilingual TTS","SRT Dubbing","VoicePack Explanation"],title="Kokoro TTS") |
|
demo.queue().launch(debug=debug, share=share) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
last_used_language = "a" |
|
pipeline = KPipeline(lang_code=last_used_language) |
|
temp_folder = create_audio_dir() |
|
if __name__ == "__main__": |
|
main() |