NeuralFalcon committed on
Commit a15bd1b · verified · 1 Parent(s): 75dd551

Update app.py

Files changed (1)
  1. app.py +242 -71
app.py CHANGED
@@ -619,11 +619,17 @@ def tutorial():
619
  - **"m_"**: Male
620
  """
621
  with gr.Blocks() as demo2:
622
- # gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
623
  gr.Markdown(explanation) # Display the explanation
624
  return demo2
625
 
626

627
  #@title subtitle
628
  import os
629
  import re
@@ -632,7 +638,8 @@ import shutil
632
  import platform
633
  import datetime
634
  import subprocess
635
-
 
636
  import pysrt
637
  import librosa
638
  import soundfile as sf
@@ -640,11 +647,13 @@ from tqdm.auto import tqdm
640
  from pydub import AudioSegment
641
  from deep_translator import GoogleTranslator
642
 
643
-
644
  # ---------------------- Utility Functions ----------------------
 
 
645
  def get_current_time():
646
  return datetime.datetime.now().strftime("%I_%M_%p")
647
 
 
648
  def get_subtitle_Dub_path(srt_file_path, Language):
649
  file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
650
  full_base_path = os.path.join(os.getcwd(), "TTS_DUB")
@@ -654,6 +663,7 @@ def get_subtitle_Dub_path(srt_file_path, Language):
654
  new_path = os.path.join(full_base_path, f"{file_name}_{lang}_{random_string}.wav")
655
  return new_path.replace("__", "_")
656
 
 
657
  def clean_srt(input_path):
658
  def clean_srt_line(text):
659
  for bad in ["[", "]", "♫"]:
@@ -667,16 +677,20 @@ def clean_srt(input_path):
667
  file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
668
  return output_path
669
 
 
670
  def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
671
  output_path = input_path.replace(".srt", f"{target_language}.srt")
672
  subs = pysrt.open(input_path, encoding='utf-8')
 
673
  if len(subs) > max_segments:
674
  gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.")
675
  return input_path
676
 
 
677
  original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)]
678
  full_text = "\n".join(original)
679
 
 
680
  chunks, start = [], 0
681
  while start < len(full_text):
682
  end = start + chunk_size
@@ -688,20 +702,24 @@ def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_s
688
  translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
689
  translated_text = "\n".join(translated_chunks)
690
 
 
691
  pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
692
  translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)}
693
 
 
694
  for i, sub in enumerate(subs):
695
  sub.text = translated_dict.get(i, sub.text)
696
 
697
  subs.save(output_path, encoding='utf-8')
698
  return output_path
699
 
 
700
  def prepare_srt(srt_path, target_language, translate=False):
701
  path = clean_srt(srt_path)
702
  return translate_srt(path, target_language) if translate else path
703
 
704
-
 
705
  def is_ffmpeg_installed():
706
  ffmpeg_exe = "ffmpeg.exe" if platform.system() == "Windows" else "ffmpeg"
707
  try:
@@ -711,6 +729,21 @@ def is_ffmpeg_installed():
711
  gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20)
712
  return False, ffmpeg_exe
713

714
  def speedup_audio_librosa(input_file, output_file, speedup_factor):
715
  try:
716
  y, sr = librosa.load(input_file, sr=None)
@@ -720,23 +753,29 @@ def speedup_audio_librosa(input_file, output_file, speedup_factor):
720
  gr.Warning(f"Librosa speedup failed: {e}")
721
  shutil.copy(input_file, output_file)
722
 
 
723
  def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path):
724
  if use_ffmpeg:
725
  try:
726
- subprocess.run([ffmpeg_path, "-i", input_file, "-filter:a", f"atempo={speedup_factor}", output_file, "-y"], check=True)
727
  except Exception as e:
728
  gr.Error(f"FFmpeg speedup error: {e}")
729
  speedup_audio_librosa(input_file, output_file, speedup_factor)
730
  else:
731
  speedup_audio_librosa(input_file, output_file, speedup_factor)
732
 
 
733
  def remove_edge_silence(input_path, output_path):
734
  y, sr = librosa.load(input_path, sr=None)
735
  trimmed_audio, _ = librosa.effects.trim(y, top_db=30)
736
  sf.write(output_path, trimmed_audio, sr)
737
  return output_path
738
 
739
-
740
  # ---------------------- Main Class ----------------------
741
  class SRTDubbing:
742
  def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"):
@@ -747,16 +786,63 @@ class SRTDubbing:
747
  os.makedirs(self.cache_dir, exist_ok=True)
748
 
749
  @staticmethod
750
  def convert_to_millisecond(t):
751
  return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds)
752
 
753
- @staticmethod
754
- def read_srt_file(file_path):
755
  subs = pysrt.open(file_path, encoding='utf-8')
756
  entries = []
757
  prev_end = 0
758
  for idx, sub in enumerate(subs, 1):
759
- start, end = SRTDubbing.convert_to_millisecond(sub.start), SRTDubbing.convert_to_millisecond(sub.end)
 
760
  pause = start - prev_end if idx > 1 else start
761
  entries.append({
762
  'entry_number': idx,
@@ -768,55 +854,133 @@ class SRTDubbing:
768
  'previous_pause': f"{idx}_before_pause.wav",
769
  })
770
  prev_end = end
771
  return entries
772
 
773
- def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration):
774
- temp = "./cache/temp.wav"
775
- # Step 1: Generate initial audio
776
- path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=False, keep_silence_up_to=0.05)
777
- # ✂️ Remove leading and trailing silence to make timing tight without trimming actual speech.
778
  remove_edge_silence(path, temp)
779
- # 📏 Load the trimmed audio and get its duration in milliseconds.
780
- audio = AudioSegment.from_file(temp)
781
 
782
- # ⏱️ If no duration is specified (edge case), use the TTS as-is without speed/timing adjustments.
783
- if actual_duration == 0:
 
 
 
 
784
  shutil.move(temp, audio_path)
785
  return
786
 
787
- # Step 2: If TTS audio is longer, retry with remove_silence=True
788
- if len(audio) > actual_duration:
789
- path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=True, keep_silence_up_to=0.05)
790
- remove_edge_silence(path, temp)
791
- audio = AudioSegment.from_file(temp)
792
-
793
- # Step 3: If still longer → speed up
794
- if len(audio) > actual_duration:
795
- factor = len(audio) / actual_duration
796
- path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=factor, remove_silence=True, keep_silence_up_to=0.05)
797
  remove_edge_silence(path, temp)
798
- audio = AudioSegment.from_file(temp)
799
-
800
- # Final Adjustment: Speed up via FFmpeg or librosa
801
- if len(audio) > actual_duration:
802
- factor = len(audio) / actual_duration
803
- final_temp = "./cache/speedup_temp.wav"
804
- change_speed(temp, final_temp, factor, self.use_ffmpeg, self.ffmpeg_path)
805
- shutil.move(final_temp, audio_path)
806
-
807
- # Add silence if too short
808
- elif len(audio) < actual_duration:
809
- silence = AudioSegment.silent(duration=actual_duration - len(audio))
810
- (audio + silence).export(audio_path, format="wav")
811
- # ➡️ Fallback: If TTS already perfectly matches subtitle duration, save as-is.
812
  else:
813
- shutil.move(temp, audio_path) #bad code
 
814
 
815
  @staticmethod
 
816
  def make_silence(duration, path):
817
  AudioSegment.silent(duration=duration).export(path, format="wav")
818
 
819
  @staticmethod
 
820
  def create_folder_for_srt(srt_file_path):
821
  base = os.path.splitext(os.path.basename(srt_file_path))[0]
822
  folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
@@ -824,27 +988,30 @@ class SRTDubbing:
824
  return folder
825
 
826
  @staticmethod
 
827
  def concatenate_audio_files(paths, output):
828
  audio = sum([AudioSegment.from_file(p) for p in paths], AudioSegment.silent(duration=0))
829
  audio.export(output, format="wav")
830
 
831
- def srt_to_dub(self, srt_path, output_path, language, voice):
 
832
  entries = self.read_srt_file(srt_path)
833
  folder = self.create_folder_for_srt(srt_path)
834
  all_audio = []
 
 
 
 
835
  for entry in tqdm(entries):
836
  self.make_silence(entry['pause_time'], os.path.join(folder, entry['previous_pause']))
837
  all_audio.append(os.path.join(folder, entry['previous_pause']))
838
-
839
  tts_path = os.path.join(folder, entry['audio_name'])
840
- self.text_to_speech_srt(entry['text'], tts_path, language, voice, entry['end_time'] - entry['start_time'])
841
  all_audio.append(tts_path)
842
-
843
  self.concatenate_audio_files(all_audio, output_path)
844
 
845
-
846
  # ---------------------- Entrypoint ----------------------
847
- def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False):
848
  if not srt_path.endswith(".srt"):
849
  gr.Error("Please upload a valid .srt file", duration=5)
850
  return None
@@ -853,8 +1020,16 @@ def srt_process(srt_path, Language="American English", voice_name="af_bella", tr
853
  processed_srt = prepare_srt(srt_path, Language, translate)
854
  output_path = get_subtitle_Dub_path(srt_path, Language)
855
 
856
- SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name)
857
- return output_path,output_path
858
 
859
  def subtitle_ui():
860
  with gr.Blocks() as demo:
@@ -862,9 +1037,9 @@ def subtitle_ui():
862
  gr.Markdown(
863
  """
864
  # Generate Audio File From Subtitle [Upload Only .srt file]
865
-
866
- To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
867
-
868
  """
869
  )
870
  with gr.Row():
@@ -874,19 +1049,20 @@ def subtitle_ui():
874
  language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
875
  # with gr.Row():
876
  voice = gr.Dropdown(
877
- voice_names,
878
- value='af_bella',
879
- allow_custom_value=False,
880
- label='🎙️ Choose VoicePack',
881
  )
882
  with gr.Row():
883
  generate_btn_ = gr.Button('Generate', variant='primary')
884
 
885
  with gr.Accordion('Other Settings', open=False):
 
886
  translate_text = gr.Checkbox(value=False, label='🌐 Translate Subtitle to Selected Language')
887
-
888
-
889
-
890
  with gr.Column():
891
  audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
892
  audio_file = gr.File(label='📥 Download Audio')
@@ -895,23 +1071,18 @@ def subtitle_ui():
895
  autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
896
 
897
  # srt_file.submit(
898
- # srt_process,
899
- # inputs=[srt_file, voice],
900
  # outputs=[audio]
901
  # )
902
  generate_btn_.click(
903
- srt_process,
904
- inputs=[srt_file,language_name,voice,translate_text],
905
  outputs=[audio,audio_file]
906
  )
907
  return demo
908
-
909
 
910
 
911
- # Example usage:
912
- # srt_file_path = "/content/me.srt"
913
- # dub_audio_path = srt_process(srt_file_path, Language="American English", voice_name="af_bella", translate=False)
914
- # print(f"Audio file saved at: {dub_audio_path}")
915
 
916
  import click
917
  @click.command()
@@ -937,4 +1108,4 @@ last_used_language = "a"
937
  pipeline = KPipeline(lang_code=last_used_language)
938
  temp_folder = create_audio_dir()
939
  if __name__ == "__main__":
940
- main()
 
619
  - **"m_"**: Male
620
  """
621
  with gr.Blocks() as demo2:
622
+ gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/Kokoro-TTS-Subtitle)")
623
  gr.Markdown(explanation) # Display the explanation
624
  return demo2
625
 
626
 
627
+
628
+
629
+
630
+
631
+
632
+
633
  #@title subtitle
634
  import os
635
  import re
 
638
  import platform
639
  import datetime
640
  import subprocess
641
+ import math
642
+ import json
643
  import pysrt
644
  import librosa
645
  import soundfile as sf
 
647
  from pydub import AudioSegment
648
  from deep_translator import GoogleTranslator
649
 
 
650
  # ---------------------- Utility Functions ----------------------
651
+
652
+ # Returns the current time formatted as HH_MM_AM/PM (for filenames or logs)
653
  def get_current_time():
654
  return datetime.datetime.now().strftime("%I_%M_%p")
655
 
656
+ # Constructs an output file path for the final dubbed audio
657
  def get_subtitle_Dub_path(srt_file_path, Language):
658
  file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
659
  full_base_path = os.path.join(os.getcwd(), "TTS_DUB")
 
663
  new_path = os.path.join(full_base_path, f"{file_name}_{lang}_{random_string}.wav")
664
  return new_path.replace("__", "_")
665
 
666
+ # Removes noise characters like [♫] from the subtitle text and saves a cleaned SRT
667
  def clean_srt(input_path):
668
  def clean_srt_line(text):
669
  for bad in ["[", "]", "♫"]:
 
677
  file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
678
  return output_path
679
 
680
+ # Translates subtitles using Deep Translator while preserving subtitle index order
681
  def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
682
  output_path = input_path.replace(".srt", f"{target_language}.srt")
683
  subs = pysrt.open(input_path, encoding='utf-8')
684
+ # Refuse overly large translation jobs so the free Google Translate endpoint is not abused.
685
  if len(subs) > max_segments:
686
  gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.")
687
  return input_path
688
 
689
+ # Annotate original subtitles with <#index> to preserve mapping during translation
690
  original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)]
691
  full_text = "\n".join(original)
692
 
693
+ # Split into manageable chunks for Google Translate API
694
  chunks, start = [], 0
695
  while start < len(full_text):
696
  end = start + chunk_size
 
702
  translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
703
  translated_text = "\n".join(translated_chunks)
704
 
705
+ # Rebuild subtitle dictionary after translation
706
  pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
707
  translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)}
708
 
709
+ # Assign translated text back to subtitle entries
710
  for i, sub in enumerate(subs):
711
  sub.text = translated_dict.get(i, sub.text)
712
 
713
  subs.save(output_path, encoding='utf-8')
714
  return output_path
715
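A small illustration of the index-tag round trip used above, assuming GoogleTranslator leaves the <#i> markers intact:

    tagged = "<#0>Hello\n<#1>World"   # what a translated chunk might look like
    pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
    {int(i): t.strip() for i, t in pattern.findall(tagged)}   # -> {0: 'Hello', 1: 'World'}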
 
716
+ # Cleans and optionally translates an SRT file before dubbing
717
  def prepare_srt(srt_path, target_language, translate=False):
718
  path = clean_srt(srt_path)
719
  return translate_srt(path, target_language) if translate else path
720
 
721
+ # Checks if FFmpeg is available on the system; if not, warns user and returns fallback
722
+ # Audio speed can be adjusted with either FFmpeg or librosa.
723
  def is_ffmpeg_installed():
724
  ffmpeg_exe = "ffmpeg.exe" if platform.system() == "Windows" else "ffmpeg"
725
  try:
 
729
  gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20)
730
  return False, ffmpeg_exe
731
 
732
+ # FFmpeg's atempo filter only accepts factors from 0.5x to 2.0x, so chain several filters for anything outside that range.
733
+ def atempo_chain(factor):
734
+ if 0.5 <= factor <= 2.0:
735
+ return f"atempo={factor:.3f}"
736
+ parts = []
737
+ while factor > 2.0:
738
+ parts.append("atempo=2.0")
739
+ factor /= 2.0
740
+ while factor < 0.5:
741
+ parts.append("atempo=0.5")
742
+ factor *= 2.0
743
+ parts.append(f"atempo={factor:.3f}")
744
+ return ",".join(parts)
745
+
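A quick sketch of the filter strings this chain produces (values derived from the function above):

    atempo_chain(1.5)   # -> "atempo=1.500"
    atempo_chain(3.0)   # -> "atempo=2.0,atempo=1.500"
    atempo_chain(0.3)   # -> "atempo=0.5,atempo=0.600"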
746
+ # Fallback used when FFmpeg is not available: change the speed with librosa instead.
747
  def speedup_audio_librosa(input_file, output_file, speedup_factor):
748
  try:
749
  y, sr = librosa.load(input_file, sr=None)
 
753
  gr.Warning(f"Librosa speedup failed: {e}")
754
  shutil.copy(input_file, output_file)
755
 
756
+ # Change the audio speed (e.g., when a clip overruns its original SRT segment).
757
  def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path):
758
  if use_ffmpeg:
759
  try:
760
+ subprocess.run(
761
+ [ffmpeg_path, "-i", input_file, "-filter:a", atempo_chain(speedup_factor), output_file, "-y"],
762
+ check=True,
763
+ stdout=subprocess.DEVNULL,
764
+ stderr=subprocess.DEVNULL
765
+ )
766
  except Exception as e:
767
  gr.Error(f"FFmpeg speedup error: {e}")
768
  speedup_audio_librosa(input_file, output_file, speedup_factor)
769
  else:
770
  speedup_audio_librosa(input_file, output_file, speedup_factor)
771
 
772
+ # Remove silence from the start and end of the audio.
773
  def remove_edge_silence(input_path, output_path):
774
  y, sr = librosa.load(input_path, sr=None)
775
  trimmed_audio, _ = librosa.effects.trim(y, top_db=30)
776
  sf.write(output_path, trimmed_audio, sr)
777
  return output_path
778
 
 
779
  # ---------------------- Main Class ----------------------
780
  class SRTDubbing:
781
  def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"):
 
786
  os.makedirs(self.cache_dir, exist_ok=True)
787
 
788
  @staticmethod
789
+ # Since the target is single-speaker SRT dubbing,
790
+ # estimate the speaker's average speaking rate in characters per second.
791
+ def get_avg_speaker_speed(srt_path):
792
+ subs = pysrt.open(srt_path, encoding='utf-8')
793
+ speeds = []
794
+ for sub in subs:
795
+ duration_sec = (sub.end.ordinal - sub.start.ordinal) / 1000
796
+ char_count = len(sub.text.replace(" ", ""))
797
+ if duration_sec > 0 and char_count > 0:
798
+ speeds.append(char_count / duration_sec)
799
+ return sum(speeds) / len(speeds) if speeds else 14
800
+
801
+ @staticmethod
802
+ # Convert the average rate into a speed factor relative to the TTS baseline (e.g., 0.5x, 1x, 1.5x)
803
+ def get_speed_factor(srt_path, default_tts_rate=14):
804
+ avg_rate = SRTDubbing.get_avg_speaker_speed(srt_path)
805
+ speed_factor = avg_rate / default_tts_rate if default_tts_rate > 0 else 1.0
806
+ return math.floor(speed_factor * 100) / 100 # Truncate
807
+
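As a rough worked example with made-up numbers: if the cues average about 21 non-space characters per second and the assumed TTS baseline is 14 chars/sec, the factor is 21 / 14 = 1.5 (truncated to two decimals), so the dub is later played back 1.5x faster to keep pace with the speaker.

    SRTDubbing.get_speed_factor("talk.srt")   # hypothetical file; e.g. 21 / 14 -> 1.5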
808
+ @staticmethod
809
+ # Merge multiple SRT segments if the gap is small and total duration
810
+ # stays under max_merged_duration_ms
811
+ def merge_fast_entries(entries, max_pause_gap=1000, max_merged_duration_ms=8000):
812
+ merged = []
813
+ i = 0
814
+ n = len(entries)
815
+ while i < n:
816
+ curr = entries[i].copy()
817
+ j = i + 1
818
+ while j < n:
819
+ next_ = entries[j]
820
+ gap = next_["start_time"] - curr["end_time"]
821
+ new_duration = next_["end_time"] - curr["start_time"]
822
+ if gap > max_pause_gap or new_duration > max_merged_duration_ms:
823
+ break
824
+ if not curr["text"].strip().endswith((".", "!", "?")):
825
+ curr["text"] = curr["text"].strip() + ","
826
+ curr["text"] += " " + next_["text"]
827
+ curr["end_time"] = next_["end_time"]
828
+ j += 1
829
+ merged.append(curr)
830
+ i = j
831
+ return merged
832
+
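A minimal sketch of the merge behaviour, using made-up entries that carry only the fields the method touches:

    entries = [
        {"start_time": 0,    "end_time": 2000, "text": "Hello there"},
        {"start_time": 2500, "end_time": 4000, "text": "how are you?"},
    ]
    SRTDubbing.merge_fast_entries(entries)
    # -> [{"start_time": 0, "end_time": 4000, "text": "Hello there, how are you?"}]
    # gap (500 ms) <= max_pause_gap and merged length (4000 ms) <= max_merged_duration_ms, so the two cues collapse into one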
833
+ @staticmethod
834
+ # Convert SRT timestamp to milliseconds
835
  def convert_to_millisecond(t):
836
  return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds)
837
 
838
+ # Read SRT file and convert it to our required dictionary format for dubbing
839
+ def read_srt_file(self, file_path):
840
  subs = pysrt.open(file_path, encoding='utf-8')
841
  entries = []
842
  prev_end = 0
843
  for idx, sub in enumerate(subs, 1):
844
+ start = self.convert_to_millisecond(sub.start)
845
+ end = self.convert_to_millisecond(sub.end)
846
  pause = start - prev_end if idx > 1 else start
847
  entries.append({
848
  'entry_number': idx,
 
854
  'previous_pause': f"{idx}_before_pause.wav",
855
  })
856
  prev_end = end
857
+
858
+ entries = self.merge_fast_entries(entries)
859
+
860
+ ## For debugging
861
+ # with open("./old.json", "w", encoding="utf-8") as f:
862
+ # json.dump(entries, f, indent=2, ensure_ascii=False)
863
+ # with open("/content/new.json", "w", encoding="utf-8") as f:
864
+ # json.dump(entries, f, indent=2, ensure_ascii=False)
865
+
866
  return entries
867
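For example, a first cue running 00:00:02,000 --> 00:00:04,500 yields start = 2000 and end = 4500 from convert_to_millisecond, so pause_time = 2000 (silence written before the cue) and the slot handed to TTS is 4500 - 2000 = 2500 ms.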
 
868
+ # TTS hook: modify this function in the future to plug in a different TTS or voice-cloning tool
869
+ # def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration, default_speed_factor=None):
870
+ # temp = "./cache/temp.wav"
871
+ # if default_speed_factor is None:
872
+ # default_speed_factor = 1.0
873
+
874
+ # # Step 1: Generate clean TTS audio at 1.0x speed (avoid Kokoro noise issue)
875
+ # path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1.0, remove_silence=False, keep_silence_up_to=0.05)
876
+
877
+ # # Step 2: Always adjust the generated TTS to user's speaking speed
878
+ # if default_speed_factor != 1.0:
879
+ # temp_wav = path.replace(".wav", "_user_speed.wav")
880
+ # change_speed(path, temp_wav, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
881
+ # path = temp_wav
882
+
883
+ # # Step 3: Trim edges
884
+ # remove_edge_silence(path, temp)
885
+ # audio = AudioSegment.from_file(temp)
886
+
887
+ # # Step 4: If no target duration given, save and exit
888
+ # if actual_duration == 0:
889
+ # shutil.move(temp, audio_path)
890
+ # return
891
+
892
+ # # Step 5: Try regeneration with silence removal if needed
893
+ # if len(audio) > actual_duration:
894
+ # path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1.0, remove_silence=True, keep_silence_up_to=0.05)
895
+ # if default_speed_factor != 1.0:
896
+ # temp_wav = path.replace(".wav", "_tight_user_speed.wav")
897
+ # change_speed(path, temp_wav, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
898
+ # path = temp_wav
899
+ # remove_edge_silence(path, temp)
900
+ # audio = AudioSegment.from_file(temp)
901
+
902
+ # # Step 6: Final fallback — force compress audio to fit
903
+ # if len(audio) > actual_duration:
904
+ # factor = len(audio) / actual_duration
905
+ # final_temp = "./cache/speedup_temp.wav"
906
+ # change_speed(temp, final_temp, factor, self.use_ffmpeg, self.ffmpeg_path)
907
+ # shutil.move(final_temp, audio_path)
908
+ # elif len(audio) < actual_duration:
909
+ # silence = AudioSegment.silent(duration=actual_duration - len(audio))
910
+ # (audio + silence).export(audio_path, format="wav")
911
+ # else:
912
+ # shutil.move(temp, audio_path)
913
+
914
+
915
+ # TTS hook: modify this function in the future to plug in a different TTS or voice-cloning tool
916
+ def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration, default_speed_factor=None):
917
+ import soundfile as sf
918
+ from librosa import get_duration
919
+
920
+ TOLERANCE_MS = 30
921
+ temp = os.path.join(self.cache_dir, "temp.wav")
922
+
923
+ if default_speed_factor is None:
924
+ default_speed_factor = 1.0
925
+
926
+ # Step 1: Generate clean TTS audio (Kokoro safe speed)
927
+ path, _ = generate_and_save_audio(
928
+ text, Language=language, voice=voice,
929
+ speed=1.0, remove_silence=False, keep_silence_up_to=0.05
930
+ )
931
+
932
+ # Step 2: Apply user-defined speaking speed
933
+ if default_speed_factor != 1.0:
934
+ user_speed_path = path.replace(".wav", "_user.wav")
935
+ change_speed(path, user_speed_path, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
936
+ path = user_speed_path
937
+
938
+ # Step 3: Trim silence
939
  remove_edge_silence(path, temp)
 
 
940
 
941
+ # Step 4: Duration analysis (high precision)
942
+ y, sr = sf.read(temp)
943
+ duration_ms = int(get_duration(y=y, sr=sr) * 1000)
944
+
945
+ # Step 5: If very close, skip correction
946
+ if abs(duration_ms - actual_duration) <= TOLERANCE_MS:
947
  shutil.move(temp, audio_path)
948
  return
949
 
950
+ # Step 6: Try regenerating with silence removal if too long
951
+ if duration_ms > actual_duration:
952
+ path, _ = generate_and_save_audio(
953
+ text, Language=language, voice=voice,
954
+ speed=1.0, remove_silence=True, keep_silence_up_to=0.05
955
+ )
956
+ if default_speed_factor != 1.0:
957
+ tighter = path.replace(".wav", "_tight_user.wav")
958
+ change_speed(path, tighter, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
959
+ path = tighter
960
  remove_edge_silence(path, temp)
961
+ y, sr = sf.read(temp)
962
+ duration_ms = int(get_duration(y=y, sr=sr) * 1000)
963
+
964
+ # Step 7: Final correction
965
+ if duration_ms > actual_duration + TOLERANCE_MS:
966
+ factor = duration_ms / actual_duration
967
+ corrected = os.path.join(self.cache_dir, "speed_final.wav")
968
+ change_speed(temp, corrected, factor, self.use_ffmpeg, self.ffmpeg_path)
969
+ shutil.move(corrected, audio_path)
970
+ elif duration_ms < actual_duration - TOLERANCE_MS:
971
+ silence = AudioSegment.silent(duration=actual_duration - duration_ms)
972
+ (AudioSegment.from_file(temp) + silence).export(audio_path, format="wav")
 
 
973
  else:
974
+ shutil.move(temp, audio_path)
975
+
976
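As a rough worked illustration of the final correction, with hypothetical numbers: given TOLERANCE_MS = 30, a trimmed clip of 5200 ms against a 4000 ms subtitle slot gives factor = 5200 / 4000 = 1.3 and change_speed compresses it via the atempo chain (or librosa when FFmpeg is missing); a 3500 ms clip instead gets 500 ms of silence appended; anything within ±30 ms is moved to audio_path unchanged.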
 
977
  @staticmethod
978
+ # Insert a silent gap between two segments
979
  def make_silence(duration, path):
980
  AudioSegment.silent(duration=duration).export(path, format="wav")
981
 
982
  @staticmethod
983
+ # Create a working folder for this SRT file's audio chunks
984
  def create_folder_for_srt(srt_file_path):
985
  base = os.path.splitext(os.path.basename(srt_file_path))[0]
986
  folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
 
988
  return folder
989
 
990
  @staticmethod
991
+ # Join the chunked audio files into one track
992
  def concatenate_audio_files(paths, output):
993
  audio = sum([AudioSegment.from_file(p) for p in paths], AudioSegment.silent(duration=0))
994
  audio.export(output, format="wav")
995
 
996
+ # Utility function that drives the other helpers
997
+ def srt_to_dub(self, srt_path, output_path, language, voice, speaker_talk_speed=True):
998
  entries = self.read_srt_file(srt_path)
999
  folder = self.create_folder_for_srt(srt_path)
1000
  all_audio = []
1001
+ if speaker_talk_speed:
1002
+ default_speed_factor = self.get_speed_factor(srt_path)
1003
+ else:
1004
+ default_speed_factor = 1.0
1005
  for entry in tqdm(entries):
1006
  self.make_silence(entry['pause_time'], os.path.join(folder, entry['previous_pause']))
1007
  all_audio.append(os.path.join(folder, entry['previous_pause']))
 
1008
  tts_path = os.path.join(folder, entry['audio_name'])
1009
+ self.text_to_speech_srt(entry['text'], tts_path, language, voice, entry['end_time'] - entry['start_time'], default_speed_factor)
1010
  all_audio.append(tts_path)
 
1011
  self.concatenate_audio_files(all_audio, output_path)
1012
 
 
1013
  # ---------------------- Entrypoint ----------------------
1014
+ def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False, speaker_talk_speed=True):
1015
  if not srt_path.endswith(".srt"):
1016
  gr.Error("Please upload a valid .srt file", duration=5)
1017
  return None
 
1020
  processed_srt = prepare_srt(srt_path, Language, translate)
1021
  output_path = get_subtitle_Dub_path(srt_path, Language)
1022
 
1023
+ SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name, speaker_talk_speed)
1024
+ return output_path, output_path
1025
+
1026
+ # Example usage
1027
+ # srt_file_path = "/content/last.srt" # @param {type: "string"}
1028
+ # dub_audio_path, _ = srt_process(srt_file_path, Language="American English", voice_name="af_bella", translate=False,speaker_talk_speed=False)
1029
+ # print(f"Audio file saved at: {dub_audio_path}")
1030
+
1031
+
1032
+
1033
 
1034
  def subtitle_ui():
1035
  with gr.Blocks() as demo:
 
1037
  gr.Markdown(
1038
  """
1039
  # Generate Audio File From Subtitle [Upload Only .srt file]
1040
+
1041
+ To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
1042
+
1043
  """
1044
  )
1045
  with gr.Row():
 
1049
  language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
1050
  # with gr.Row():
1051
  voice = gr.Dropdown(
1052
+ voice_names,
1053
+ value='af_bella',
1054
+ allow_custom_value=False,
1055
+ label='🎙️ Choose VoicePack',
1056
  )
1057
  with gr.Row():
1058
  generate_btn_ = gr.Button('Generate', variant='primary')
1059
 
1060
  with gr.Accordion('Other Settings', open=False):
1061
+ speaker_speed_ = gr.Checkbox(value=True, label="⚡ Match With Speaker's Average Talking Speed")
1062
  translate_text = gr.Checkbox(value=False, label='🌐 Translate Subtitle to Selected Language')
1063
+
1064
+
1065
+
1066
  with gr.Column():
1067
  audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
1068
  audio_file = gr.File(label='📥 Download Audio')
 
1071
  autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
1072
 
1073
  # srt_file.submit(
1074
+ # srt_process,
1075
+ # inputs=[srt_file, voice],
1076
  # outputs=[audio]
1077
  # )
1078
  generate_btn_.click(
1079
+ srt_process,
1080
+ inputs=[srt_file, language_name, voice, translate_text, speaker_speed_],
1081
  outputs=[audio,audio_file]
1082
  )
1083
  return demo
 
1084
 
1085

1086
 
1087
  import click
1088
  @click.command()
 
1108
  pipeline = KPipeline(lang_code=last_used_language)
1109
  temp_folder = create_audio_dir()
1110
  if __name__ == "__main__":
1111
+ main()