Update app.py
Browse files
app.py
CHANGED
@@ -619,11 +619,17 @@ def tutorial():
     **"m_"**: Male
     """
     with gr.Blocks() as demo2:
-
         gr.Markdown(explanation)  # Display the explanation
     return demo2


 #@title subtitle
 import os
 import re
@@ -632,7 +638,8 @@ import shutil
 import platform
 import datetime
 import subprocess
-
 import pysrt
 import librosa
 import soundfile as sf
@@ -640,11 +647,13 @@ from tqdm.auto import tqdm
 from pydub import AudioSegment
 from deep_translator import GoogleTranslator

-
 # ---------------------- Utility Functions ----------------------
 def get_current_time():
     return datetime.datetime.now().strftime("%I_%M_%p")

 def get_subtitle_Dub_path(srt_file_path, Language):
     file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
     full_base_path = os.path.join(os.getcwd(), "TTS_DUB")
@@ -654,6 +663,7 @@ def get_subtitle_Dub_path(srt_file_path, Language):
     new_path = os.path.join(full_base_path, f"{file_name}_{lang}_{random_string}.wav")
     return new_path.replace("__", "_")

 def clean_srt(input_path):
     def clean_srt_line(text):
         for bad in ["[", "]", "♫"]:
@@ -667,16 +677,20 @@ def clean_srt(input_path):
             file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
     return output_path

 def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
     output_path = input_path.replace(".srt", f"{target_language}.srt")
     subs = pysrt.open(input_path, encoding='utf-8')
     if len(subs) > max_segments:
         gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.")
         return input_path

     original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)]
     full_text = "\n".join(original)

     chunks, start = [], 0
     while start < len(full_text):
         end = start + chunk_size
@@ -688,20 +702,24 @@ def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
     translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
     translated_text = "\n".join(translated_chunks)

     pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
     translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)}

     for i, sub in enumerate(subs):
         sub.text = translated_dict.get(i, sub.text)

     subs.save(output_path, encoding='utf-8')
     return output_path

 def prepare_srt(srt_path, target_language, translate=False):
     path = clean_srt(srt_path)
     return translate_srt(path, target_language) if translate else path

-
 def is_ffmpeg_installed():
     ffmpeg_exe = "ffmpeg.exe" if platform.system() == "Windows" else "ffmpeg"
     try:
@@ -711,6 +729,21 @@ def is_ffmpeg_installed():
         gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20)
         return False, ffmpeg_exe

 def speedup_audio_librosa(input_file, output_file, speedup_factor):
     try:
         y, sr = librosa.load(input_file, sr=None)
@@ -720,23 +753,29 @@ def speedup_audio_librosa(input_file, output_file, speedup_factor):
         gr.Warning(f"Librosa speedup failed: {e}")
         shutil.copy(input_file, output_file)

 def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path):
     if use_ffmpeg:
         try:
-            subprocess.run(
         except Exception as e:
             gr.Error(f"FFmpeg speedup error: {e}")
             speedup_audio_librosa(input_file, output_file, speedup_factor)
     else:
         speedup_audio_librosa(input_file, output_file, speedup_factor)

 def remove_edge_silence(input_path, output_path):
     y, sr = librosa.load(input_path, sr=None)
     trimmed_audio, _ = librosa.effects.trim(y, top_db=30)
     sf.write(output_path, trimmed_audio, sr)
     return output_path

-
 # ---------------------- Main Class ----------------------
 class SRTDubbing:
     def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"):
@@ -747,16 +786,63 @@ class SRTDubbing:
         os.makedirs(self.cache_dir, exist_ok=True)

     @staticmethod
     def convert_to_millisecond(t):
         return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds)

-
-    def read_srt_file(file_path):
         subs = pysrt.open(file_path, encoding='utf-8')
         entries = []
         prev_end = 0
         for idx, sub in enumerate(subs, 1):
-            start
             pause = start - prev_end if idx > 1 else start
             entries.append({
                 'entry_number': idx,
@@ -768,55 +854,133 @@ class SRTDubbing:
                 'previous_pause': f"{idx}_before_pause.wav",
             })
             prev_end = end
         return entries

-
-
-
-
-
         remove_edge_silence(path, temp)
-        # 📏 Load the trimmed audio and get its duration in milliseconds.
-        audio = AudioSegment.from_file(temp)

-        #
-
         shutil.move(temp, audio_path)
         return

-        # Step
-        if
-        path, _ = generate_and_save_audio(
-
-
-
-
-
-
-
         remove_edge_silence(path, temp)
-
-
-
-
-
-
-
-
-
-
-
-
-        (audio + silence).export(audio_path, format="wav")
-        # ➡️ Fallback: If TTS already perfectly matches subtitle duration, save as-is.
         else:
-            shutil.move(temp, audio_path)

     @staticmethod
     def make_silence(duration, path):
         AudioSegment.silent(duration=duration).export(path, format="wav")

     @staticmethod
     def create_folder_for_srt(srt_file_path):
         base = os.path.splitext(os.path.basename(srt_file_path))[0]
         folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
@@ -824,27 +988,30 @@ class SRTDubbing:
         return folder

     @staticmethod
     def concatenate_audio_files(paths, output):
         audio = sum([AudioSegment.from_file(p) for p in paths], AudioSegment.silent(duration=0))
         audio.export(output, format="wav")

-
         entries = self.read_srt_file(srt_path)
         folder = self.create_folder_for_srt(srt_path)
         all_audio = []
         for entry in tqdm(entries):
             self.make_silence(entry['pause_time'], os.path.join(folder, entry['previous_pause']))
             all_audio.append(os.path.join(folder, entry['previous_pause']))
-
             tts_path = os.path.join(folder, entry['audio_name'])
-            self.text_to_speech_srt(entry['text'], tts_path, language, voice, entry['end_time'] - entry['start_time'])
             all_audio.append(tts_path)
-
         self.concatenate_audio_files(all_audio, output_path)

-
 # ---------------------- Entrypoint ----------------------
-def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False):
     if not srt_path.endswith(".srt"):
         gr.Error("Please upload a valid .srt file", duration=5)
         return None
@@ -853,8 +1020,16 @@ def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False):
     processed_srt = prepare_srt(srt_path, Language, translate)
     output_path = get_subtitle_Dub_path(srt_path, Language)

-    SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name)
-    return output_path, output_path

 def subtitle_ui():
     with gr.Blocks() as demo:
@@ -862,9 +1037,9 @@ def subtitle_ui():
         gr.Markdown(
             """
             # Generate Audio File From Subtitle [Upload Only .srt file]
-
-            To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
-
             """
         )
         with gr.Row():
@@ -874,19 +1049,20 @@ def subtitle_ui():
             language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
             # with gr.Row():
             voice = gr.Dropdown(
-                voice_names,
-                value='af_bella',
-                allow_custom_value=False,
-                label='🎙️ Choose VoicePack',
             )
             with gr.Row():
                 generate_btn_ = gr.Button('Generate', variant='primary')

             with gr.Accordion('Other Settings', open=False):
                 translate_text = gr.Checkbox(value=False, label='🌐 Translate Subtitle to Selected Language')
-
-
-
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             audio_file = gr.File(label='📥 Download Audio')
@@ -895,23 +1071,18 @@ def subtitle_ui():
         autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

         # srt_file.submit(
-        #     srt_process,
-        #     inputs=[srt_file, voice],
         #     outputs=[audio]
         # )
         generate_btn_.click(
-            srt_process,
-            inputs=[srt_file, language_name, voice, translate_text],
             outputs=[audio, audio_file]
         )
     return demo
-


-# Example usage:
-# srt_file_path = "/content/me.srt"
-# dub_audio_path = srt_process(srt_file_path, Language="American English", voice_name="af_bella", translate=False)
-# print(f"Audio file saved at: {dub_audio_path}")

 import click
 @click.command()
@@ -937,4 +1108,4 @@ last_used_language = "a"
|
|
937 |
pipeline = KPipeline(lang_code=last_used_language)
|
938 |
temp_folder = create_audio_dir()
|
939 |
if __name__ == "__main__":
|
940 |
-
main()
|
@@ -619,11 +619,17 @@ def tutorial():
     **"m_"**: Male
     """
     with gr.Blocks() as demo2:
+        gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/Kokoro-TTS-Subtitle)")
         gr.Markdown(explanation)  # Display the explanation
     return demo2


+
+
+
+
+
+
 #@title subtitle
 import os
 import re
@@ -632,7 +638,8 @@ import shutil
 import platform
 import datetime
 import subprocess
+import math
+import json
 import pysrt
 import librosa
 import soundfile as sf
@@ -640,11 +647,13 @@ from tqdm.auto import tqdm
 from pydub import AudioSegment
 from deep_translator import GoogleTranslator

 # ---------------------- Utility Functions ----------------------
+
+# Returns the current time formatted as HH_MM_AM/PM (for filenames or logs)
 def get_current_time():
     return datetime.datetime.now().strftime("%I_%M_%p")

+# Constructs an output file path for the final dubbed audio
 def get_subtitle_Dub_path(srt_file_path, Language):
     file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
     full_base_path = os.path.join(os.getcwd(), "TTS_DUB")
@@ -654,6 +663,7 @@ def get_subtitle_Dub_path(srt_file_path, Language):
     new_path = os.path.join(full_base_path, f"{file_name}_{lang}_{random_string}.wav")
     return new_path.replace("__", "_")

+# Removes noise characters like [, ], and ♫ from the subtitle text and saves a cleaned SRT
 def clean_srt(input_path):
     def clean_srt_line(text):
         for bad in ["[", "]", "♫"]:
@@ -667,16 +677,20 @@ def clean_srt(input_path):
             file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
     return output_path

+# Translates subtitles with deep_translator while preserving subtitle index order
 def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
     output_path = input_path.replace(".srt", f"{target_language}.srt")
     subs = pysrt.open(input_path, encoding='utf-8')
+    # Refuse very large jobs so the free Google Translate endpoint is not flooded with requests
     if len(subs) > max_segments:
         gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.")
         return input_path

+    # Tag each subtitle with <#index> so the mapping survives translation
    original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)]
     full_text = "\n".join(original)

+    # Split the text into chunks the Google Translate API can handle
     chunks, start = [], 0
     while start < len(full_text):
         end = start + chunk_size
@@ -688,20 +702,24 @@ def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
     translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
     translated_text = "\n".join(translated_chunks)

+    # Rebuild the index-to-text mapping from the translated output
     pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
     translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)}

+    # Assign translated text back to the subtitle entries
     for i, sub in enumerate(subs):
         sub.text = translated_dict.get(i, sub.text)

     subs.save(output_path, encoding='utf-8')
     return output_path

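# Example (illustrative): for two subtitles ["Hello", "How are you?"] the tagged text is
#   "<#0>Hello\n<#1>How are you?"
# and after translation the regex above recovers {0: "...", 1: "..."} from the <#index>
# tags, so each subtitle keeps its original timing even if the translator reflows lines.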
+# Cleans and optionally translates an SRT file before dubbing
 def prepare_srt(srt_path, target_language, translate=False):
     path = clean_srt(srt_path)
     return translate_srt(path, target_language) if translate else path

+# Audio speed can be changed with either FFmpeg or librosa.
+# Checks whether FFmpeg is available on the system; if not, warns the user so the librosa fallback is used.
 def is_ffmpeg_installed():
     ffmpeg_exe = "ffmpeg.exe" if platform.system() == "Windows" else "ffmpeg"
     try:
@@ -711,6 +729,21 @@ def is_ffmpeg_installed():
         gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20)
         return False, ffmpeg_exe

+# FFmpeg's atempo filter only accepts factors between 0.5x and 2.0x, so factors
+# outside that range are built up as a chain of supported atempo steps
+def atempo_chain(factor):
+    if 0.5 <= factor <= 2.0:
+        return f"atempo={factor:.3f}"
+    parts = []
+    while factor > 2.0:
+        parts.append("atempo=2.0")
+        factor /= 2.0
+    while factor < 0.5:
+        parts.append("atempo=0.5")
+        factor *= 2.0
+    parts.append(f"atempo={factor:.3f}")
+    return ",".join(parts)
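
# Example (illustrative): the chained factors multiply back to the requested speed.
#   atempo_chain(1.25) -> "atempo=1.250"
#   atempo_chain(5.0)  -> "atempo=2.0,atempo=2.0,atempo=1.250"   (2.0 * 2.0 * 1.25 = 5.0)
#   atempo_chain(0.2)  -> "atempo=0.5,atempo=0.5,atempo=0.800"   (0.5 * 0.5 * 0.8 = 0.2)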
+
+# If FFmpeg is not found, librosa is used instead
 def speedup_audio_librosa(input_file, output_file, speedup_factor):
     try:
         y, sr = librosa.load(input_file, sr=None)
@@ -720,23 +753,29 @@ def speedup_audio_librosa(input_file, output_file, speedup_factor):
         gr.Warning(f"Librosa speedup failed: {e}")
         shutil.copy(input_file, output_file)

+# Speed up a clip when it runs longer than its SRT segment allows
 def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path):
     if use_ffmpeg:
         try:
+            subprocess.run(
+                [ffmpeg_path, "-i", input_file, "-filter:a", atempo_chain(speedup_factor), output_file, "-y"],
+                check=True,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL
+            )
         except Exception as e:
             gr.Error(f"FFmpeg speedup error: {e}")
             speedup_audio_librosa(input_file, output_file, speedup_factor)
     else:
         speedup_audio_librosa(input_file, output_file, speedup_factor)

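# Example (illustrative): with speedup_factor=1.25 the subprocess call above is
# equivalent to running
#   ffmpeg -i input.wav -filter:a atempo=1.250 output.wav -y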
+# Remove silence from the start and end of the audio
 def remove_edge_silence(input_path, output_path):
     y, sr = librosa.load(input_path, sr=None)
     trimmed_audio, _ = librosa.effects.trim(y, top_db=30)
     sf.write(output_path, trimmed_audio, sr)
     return output_path

 # ---------------------- Main Class ----------------------
 class SRTDubbing:
     def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"):
@@ -747,16 +786,63 @@ class SRTDubbing:
         os.makedirs(self.cache_dir, exist_ok=True)

     @staticmethod
+    # Because the target is single-speaker SRT dubbing, estimate the speaker's
+    # average talking speed in characters per second
+    def get_avg_speaker_speed(srt_path):
+        subs = pysrt.open(srt_path, encoding='utf-8')
+        speeds = []
+        for sub in subs:
+            duration_sec = (sub.end.ordinal - sub.start.ordinal) / 1000
+            char_count = len(sub.text.replace(" ", ""))
+            if duration_sec > 0 and char_count > 0:
+                speeds.append(char_count / duration_sec)
+        return sum(speeds) / len(speeds) if speeds else 14
+
+    @staticmethod
+    # Estimate the speaker's overall speed factor relative to the TTS baseline (e.g., 0.5x, 1x, 1.5x)
+    def get_speed_factor(srt_path, default_tts_rate=14):
+        avg_rate = SRTDubbing.get_avg_speaker_speed(srt_path)
+        speed_factor = avg_rate / default_tts_rate if default_tts_rate > 0 else 1.0
+        return math.floor(speed_factor * 100) / 100  # Truncate to two decimal places
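
    # Example (illustrative): a speaker averaging 17.5 characters per second against
    # the default TTS baseline of 14 gives math.floor(17.5 / 14 * 100) / 100 = 1.25,
    # so generated clips are sped up 1.25x before any per-segment correction.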
+
+    @staticmethod
+    # Merge consecutive SRT segments when the gap between them is small and the
+    # merged span stays under max_merged_duration_ms
+    def merge_fast_entries(entries, max_pause_gap=1000, max_merged_duration_ms=8000):
+        merged = []
+        i = 0
+        n = len(entries)
+        while i < n:
+            curr = entries[i].copy()
+            j = i + 1
+            while j < n:
+                next_ = entries[j]
+                gap = next_["start_time"] - curr["end_time"]
+                new_duration = next_["end_time"] - curr["start_time"]
+                if gap > max_pause_gap or new_duration > max_merged_duration_ms:
+                    break
+                if not curr["text"].strip().endswith((".", "!", "?")):
+                    curr["text"] = curr["text"].strip() + ","
+                curr["text"] += " " + next_["text"]
+                curr["end_time"] = next_["end_time"]
+                j += 1
+            merged.append(curr)
+            i = j
+        return merged
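
    # Example (illustrative): "Hello" at 0-1500 ms followed by "world." at 1800-3000 ms
    # has a 300 ms gap and a 3000 ms merged span, both within the defaults, so the two
    # entries collapse into one entry "Hello, world." spanning 0-3000 ms.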
+
+    @staticmethod
+    # Convert an SRT timestamp to milliseconds
     def convert_to_millisecond(t):
         return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds)

+    # Read the SRT file and convert it into the dictionary format used for dubbing
+    def read_srt_file(self, file_path):
         subs = pysrt.open(file_path, encoding='utf-8')
         entries = []
         prev_end = 0
         for idx, sub in enumerate(subs, 1):
+            start = self.convert_to_millisecond(sub.start)
+            end = self.convert_to_millisecond(sub.end)
             pause = start - prev_end if idx > 1 else start
             entries.append({
                 'entry_number': idx,
@@ -768,55 +854,133 @@ class SRTDubbing:
                 'previous_pause': f"{idx}_before_pause.wav",
             })
             prev_end = end
+
+        entries = self.merge_fast_entries(entries)
+
+        ## For debugging:
+        # with open("./old.json", "w", encoding="utf-8") as f:
+        #     json.dump(entries, f, indent=2, ensure_ascii=False)
+        # with open("/content/new.json", "w", encoding="utf-8") as f:
+        #     json.dump(entries, f, indent=2, ensure_ascii=False)
+
         return entries
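    # Each entry dict is consumed by srt_to_dub below; the fields referenced there and
    # in text_to_speech_srt are 'entry_number', 'text', 'start_time' and 'end_time'
    # (milliseconds), 'pause_time', 'audio_name', and 'previous_pause'.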

+    # For TTS, modify this function in the future to use a different TTS or voice cloning tool.
+    # An earlier draft of text_to_speech_srt, kept for reference:
+    # def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration, default_speed_factor=None):
+    #     temp = "./cache/temp.wav"
+    #     if default_speed_factor is None:
+    #         default_speed_factor = 1.0
+
+    #     # Step 1: Generate clean TTS audio at 1.0x speed (avoid Kokoro noise issue)
+    #     path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1.0, remove_silence=False, keep_silence_up_to=0.05)

+    #     # Step 2: Always adjust the generated TTS to the user's speaking speed
+    #     if default_speed_factor != 1.0:
+    #         temp_wav = path.replace(".wav", "_user_speed.wav")
+    #         change_speed(path, temp_wav, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
+    #         path = temp_wav

+    #     # Step 3: Trim edges
+    #     remove_edge_silence(path, temp)
+    #     audio = AudioSegment.from_file(temp)

+    #     # Step 4: If no target duration given, save and exit
+    #     if actual_duration == 0:
+    #         shutil.move(temp, audio_path)
+    #         return

+    #     # Step 5: Try regeneration with silence removal if needed
+    #     if len(audio) > actual_duration:
+    #         path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1.0, remove_silence=True, keep_silence_up_to=0.05)
+    #         if default_speed_factor != 1.0:
+    #             temp_wav = path.replace(".wav", "_tight_user_speed.wav")
+    #             change_speed(path, temp_wav, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
+    #             path = temp_wav
+    #         remove_edge_silence(path, temp)
+    #         audio = AudioSegment.from_file(temp)

+    #     # Step 6: Final fallback: force-compress the audio to fit
+    #     if len(audio) > actual_duration:
+    #         factor = len(audio) / actual_duration
+    #         final_temp = "./cache/speedup_temp.wav"
+    #         change_speed(temp, final_temp, factor, self.use_ffmpeg, self.ffmpeg_path)
+    #         shutil.move(final_temp, audio_path)
+    #     elif len(audio) < actual_duration:
+    #         silence = AudioSegment.silent(duration=actual_duration - len(audio))
+    #         (audio + silence).export(audio_path, format="wav")
+    #     else:
+    #         shutil.move(temp, audio_path)

+    # For TTS, modify this function in the future to use a different TTS or voice cloning tool
+    def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration, default_speed_factor=None):
+        import soundfile as sf
+        from librosa import get_duration
+
+        TOLERANCE_MS = 30
+        temp = os.path.join(self.cache_dir, "temp.wav")
+
+        if default_speed_factor is None:
+            default_speed_factor = 1.0
+
+        # Step 1: Generate clean TTS audio (Kokoro safe speed)
+        path, _ = generate_and_save_audio(
+            text, Language=language, voice=voice,
+            speed=1.0, remove_silence=False, keep_silence_up_to=0.05
+        )
+
+        # Step 2: Apply user-defined speaking speed
+        if default_speed_factor != 1.0:
+            user_speed_path = path.replace(".wav", "_user.wav")
+            change_speed(path, user_speed_path, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
+            path = user_speed_path
+
+        # Step 3: Trim silence
         remove_edge_silence(path, temp)

+        # Step 4: Duration analysis (high precision)
+        y, sr = sf.read(temp)
+        duration_ms = int(get_duration(y=y, sr=sr) * 1000)
+
+        # Step 5: If very close, skip correction
+        if abs(duration_ms - actual_duration) <= TOLERANCE_MS:
             shutil.move(temp, audio_path)
             return

+        # Step 6: Try regenerating with silence removal if too long
+        if duration_ms > actual_duration:
+            path, _ = generate_and_save_audio(
+                text, Language=language, voice=voice,
+                speed=1.0, remove_silence=True, keep_silence_up_to=0.05
+            )
+            if default_speed_factor != 1.0:
+                tighter = path.replace(".wav", "_tight_user.wav")
+                change_speed(path, tighter, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
+                path = tighter
             remove_edge_silence(path, temp)
+            y, sr = sf.read(temp)
+            duration_ms = int(get_duration(y=y, sr=sr) * 1000)
+
+        # Step 7: Final correction
+        if duration_ms > actual_duration + TOLERANCE_MS:
+            factor = duration_ms / actual_duration
+            corrected = os.path.join(self.cache_dir, "speed_final.wav")
+            change_speed(temp, corrected, factor, self.use_ffmpeg, self.ffmpeg_path)
+            shutil.move(corrected, audio_path)
+        elif duration_ms < actual_duration - TOLERANCE_MS:
+            silence = AudioSegment.silent(duration=actual_duration - duration_ms)
+            (AudioSegment.from_file(temp) + silence).export(audio_path, format="wav")
         else:
+            shutil.move(temp, audio_path)
+
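    # Design note: the 30 ms tolerance avoids re-encoding clips that already fit their
    # slot; longer clips are compressed via change_speed and shorter ones are padded
    # with trailing silence, so every clip lands exactly on its subtitle timing.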

     @staticmethod
+    # Generate a silent gap between two segments
     def make_silence(duration, path):
         AudioSegment.silent(duration=duration).export(path, format="wav")

     @staticmethod
+    # Create a working folder named after the SRT file
     def create_folder_for_srt(srt_file_path):
         base = os.path.splitext(os.path.basename(srt_file_path))[0]
         folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
@@ -824,27 +988,30 @@ class SRTDubbing:
         return folder

     @staticmethod
+    # Join the chunked audio files in order
     def concatenate_audio_files(paths, output):
         audio = sum([AudioSegment.from_file(p) for p in paths], AudioSegment.silent(duration=0))
         audio.export(output, format="wav")

+    # Orchestrator: build the silence gaps and TTS clips, then concatenate them in order
+    def srt_to_dub(self, srt_path, output_path, language, voice, speaker_talk_speed=True):
         entries = self.read_srt_file(srt_path)
         folder = self.create_folder_for_srt(srt_path)
         all_audio = []
+        if speaker_talk_speed:
+            default_speed_factor = self.get_speed_factor(srt_path)
+        else:
+            default_speed_factor = 1.0
         for entry in tqdm(entries):
             self.make_silence(entry['pause_time'], os.path.join(folder, entry['previous_pause']))
             all_audio.append(os.path.join(folder, entry['previous_pause']))
             tts_path = os.path.join(folder, entry['audio_name'])
+            self.text_to_speech_srt(entry['text'], tts_path, language, voice, entry['end_time'] - entry['start_time'], default_speed_factor)
             all_audio.append(tts_path)
         self.concatenate_audio_files(all_audio, output_path)

 # ---------------------- Entrypoint ----------------------
+def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False, speaker_talk_speed=True):
     if not srt_path.endswith(".srt"):
         gr.Error("Please upload a valid .srt file", duration=5)
         return None
@@ -853,8 +1020,16 @@ def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False):
     processed_srt = prepare_srt(srt_path, Language, translate)
     output_path = get_subtitle_Dub_path(srt_path, Language)

+    SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name, speaker_talk_speed)
+    return output_path, output_path
+
+# Example usage:
+# srt_file_path = "/content/last.srt"  # @param {type: "string"}
+# dub_audio_path, _ = srt_process(srt_file_path, Language="American English", voice_name="af_bella", translate=False, speaker_talk_speed=False)
+# print(f"Audio file saved at: {dub_audio_path}")

 def subtitle_ui():
     with gr.Blocks() as demo:
@@ -862,9 +1037,9 @@ def subtitle_ui():
         gr.Markdown(
             """
             # Generate Audio File From Subtitle [Upload Only .srt file]
+
+            To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
+
             """
         )
         with gr.Row():
@@ -874,19 +1049,20 @@ def subtitle_ui():
             language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
             # with gr.Row():
             voice = gr.Dropdown(
+                voice_names,
+                value='af_bella',
+                allow_custom_value=False,
+                label='🎙️ Choose VoicePack',
             )
             with gr.Row():
                 generate_btn_ = gr.Button('Generate', variant='primary')

             with gr.Accordion('Other Settings', open=False):
+                speaker_speed_ = gr.Checkbox(value=True, label="⚡ Match With Speaker's Average Talking Speed")
                 translate_text = gr.Checkbox(value=False, label='🌐 Translate Subtitle to Selected Language')
+
+
+
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             audio_file = gr.File(label='📥 Download Audio')
@@ -895,23 +1071,18 @@ def subtitle_ui():
         autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

         # srt_file.submit(
+        #     srt_process,
+        #     inputs=[srt_file, voice],
         #     outputs=[audio]
         # )
         generate_btn_.click(
+            srt_process,
+            inputs=[srt_file, language_name, voice, translate_text, speaker_speed_],
             outputs=[audio, audio_file]
         )
     return demo


 import click
 @click.command()
@@ -937,4 +1108,4 @@ last_used_language = "a"
 pipeline = KPipeline(lang_code=last_used_language)
 temp_folder = create_audio_dir()
 if __name__ == "__main__":
+    main()