qqwjq1981 committed on
Commit
4cba5c4
·
verified ·
1 Parent(s): 68eec85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -30
app.py CHANGED
@@ -408,7 +408,7 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
408
  logger.error(f"\u274c Failed to create subtitle clip: {e}")
409
  return None
410
 
411
- def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
412
  logger.debug(f"Processing entry {i}: {entry}")
413
  error_message = None
414
 
@@ -424,13 +424,15 @@ def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover,
424
  try:
425
  segment_audio_path = f"segment_{i}_voiceover.wav"
426
  desired_duration = entry["end"] - entry["start"]
427
- speaker = entry.get("speaker", "default")
428
- speaker_wav_path = f"speaker_{speaker}_sample.wav"
429
 
430
- output_path, status_msg, tts_error = generate_voiceover_clone([entry], tts_model, desired_duration, target_language, speaker_wav_path, segment_audio_path)
 
 
 
431
 
432
- if tts_error:
433
- error_message = error_message + " | " + tts_error if error_message else tts_error
434
 
435
  if not output_path or not os.path.exists(segment_audio_path):
436
  raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
@@ -453,28 +455,28 @@ def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover,
453
 
454
  return i, txt_clip, audio_segment, error_message
455
 
456
- def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
457
  video = VideoFileClip(video_path)
458
  font_path = "./NotoSansSC-Regular.ttf"
459
 
460
  text_clips = []
461
  audio_segments = []
462
  error_messages = []
463
-
464
- global tts_model
465
- if tts_model is None:
466
- try:
467
- print("πŸ”„ Loading XTTS model...")
468
- tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
469
- print("βœ… XTTS model loaded successfully.")
470
- except Exception as e:
471
- print("❌ Error loading XTTS model:")
472
- traceback.print_exc()
473
- return f"Error loading XTTS model: {e}"
474
- ## Need to implement backup option.
475
 
476
  with concurrent.futures.ThreadPoolExecutor() as executor:
477
- futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths, background_audio_path="background_segments.wav")
478
  for i, entry in enumerate(translated_json)]
479
 
480
  results = []
@@ -526,26 +528,56 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
526
 
527
  return error_messages
528
 
529
- def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path, use_clone=False):
530
- try:
531
- full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
532
- if not full_text.strip():
533
- msg = "❌ Translated text is empty."
534
- logger.error(msg)
535
- return None, msg, msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
536
 
 
 
 
 
 
 
 
 
 
 
537
  if not speaker_wav_path or not os.path.exists(speaker_wav_path):
538
  msg = f"❌ Speaker audio not found: {speaker_wav_path}"
539
  logger.error(msg)
540
  return None, msg, msg
541
 
542
- speed_tts = calibrated_speed(full_text, desired_duration)
543
  tts_model.tts_to_file(
544
  text=full_text,
545
  speaker_wav=speaker_wav_path,
546
  language=target_language,
547
  file_path=output_audio_path,
548
- speed=speed_tts,
549
  split_sentences=True
550
  )
551
 
@@ -584,7 +616,6 @@ def calibrated_speed(text, desired_duration):
584
  slope = (2 - 1.0) / (30 - 14)
585
  return 1.0 + slope * (cps - 14)
586
 
587
-
588
  def upload_and_manage(file, target_language, mode="transcription"):
589
  if file is None:
590
  logger.info("No file uploaded. Please upload a video/audio file.")
@@ -702,6 +733,7 @@ def build_interface():
702
  return demo
703
 
704
  tts_model = None
 
705
  # Launch the Gradio interface
706
  demo = build_interface()
707
  demo.launch()
 
408
  logger.error(f"\u274c Failed to create subtitle clip: {e}")
409
  return None
410
 
411
+ def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover, target_language, font_path, use_clone, speaker_sample_paths=None):
412
  logger.debug(f"Processing entry {i}: {entry}")
413
  error_message = None
414
 
 
424
  try:
425
  segment_audio_path = f"segment_{i}_voiceover.wav"
426
  desired_duration = entry["end"] - entry["start"]
427
+ desired_speed = calibrated_speed(entry['translated'], desired_duration)
 
428
 
429
+ if use_clone:
430
+ speaker = entry.get("speaker", "default")
431
+ speaker_wav_path = f"speaker_{speaker}_sample.wav"
432
+ generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
433
 
434
+ else:
435
+ generate_voiceover_OpenAI(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
436
 
437
  if not output_path or not os.path.exists(segment_audio_path):
438
  raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
 
455
 
456
  return i, txt_clip, audio_segment, error_message
457
 
458
+ def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None, use_clone=False):
459
  video = VideoFileClip(video_path)
460
  font_path = "./NotoSansSC-Regular.ttf"
461
 
462
  text_clips = []
463
  audio_segments = []
464
  error_messages = []
465
+
466
+ if use_clone:
467
+ if tts_model is None:
468
+ try:
469
+ print("πŸ”„ Loading XTTS model...")
470
+ tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
471
+ print("βœ… XTTS model loaded successfully.")
472
+ except Exception as e:
473
+ print("❌ Error loading XTTS model:")
474
+ traceback.print_exc()
475
+ return f"Error loading XTTS model: {e}"
476
+ ## Need to implement backup option.
477
 
478
  with concurrent.futures.ThreadPoolExecutor() as executor:
479
+ futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, use_clone, speaker_sample_paths)
480
  for i, entry in enumerate(translated_json)]
481
 
482
  results = []
 
528
 
529
  return error_messages
530
 
531
def generate_voiceover_OpenAI(full_text, language, desired_speed, output_audio_path):
    """
    Generate a voiceover for ``full_text`` with the OpenAI TTS API and save it.

    Args:
        full_text: Text to synthesize.
        language: Target language. Currently unused: the voice is fixed to
            'alloy' regardless of language.
        desired_speed: Requested speed multiplier. Clamped to the 0.25-4.0
            range accepted by the speech endpoint; ``None`` is passed through
            unchanged.
        output_audio_path: Path of the file the audio bytes are written to.

    Returns:
        None. The audio is written to ``output_audio_path`` as a side effect.

    Raises:
        ValueError: If every attempt fails. The last underlying error is
            attached as ``__cause__``.

    NOTE(review): the call visible in process_entry passes six positional
    arguments (text, tts_model, speed, language, speaker path, output path),
    which does not match this four-parameter signature - align the call site.
    NOTE(review): no response_format is requested, so the bytes written are in
    the API's default format even if the path ends in ".wav" - confirm that
    downstream readers sniff the content rather than trust the extension.
    """
    # Define the voice based on the language (for now, use 'alloy' as default)
    voice = "alloy"  # Adjust based on language if needed

    # Define the model (use tts-1 for real-time applications)
    model = "tts-1"

    max_retries = 3
    retry_delay_seconds = 5

    # An out-of-range speed is rejected by the API on every attempt, so clamp
    # it once up front instead of burning all retries on the same error.
    speed = desired_speed
    if speed is not None:
        speed = min(4.0, max(0.25, speed))

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            # Create the speech using OpenAI TTS API
            response = client.audio.speech.create(
                model=model,
                voice=voice,
                input=full_text,
                speed=speed
            )
            # Save the audio to the specified path
            with open(output_audio_path, 'wb') as f:
                for chunk in response.iter_bytes():
                    f.write(chunk)
            logging.info(f"Voiceover generated successfully for {output_audio_path}")
            return
        except Exception as e:
            last_error = e
            logging.error(f"Error generating voiceover (retry {attempt}/{max_retries}): {e}")
            # Only wait when another attempt will actually follow.
            if attempt < max_retries:
                time.sleep(retry_delay_seconds)

    # Chain the last failure so the root cause stays visible to the caller.
    raise ValueError(f"Failed to generate voiceover after {max_retries} retries.") from last_error
567
+
568
+ def generate_voiceover_clone(full_text, tts_model, desired_speed, target_language, speaker_wav_path, output_audio_path, use_clone=False):
569
+ try:
570
  if not speaker_wav_path or not os.path.exists(speaker_wav_path):
571
  msg = f"❌ Speaker audio not found: {speaker_wav_path}"
572
  logger.error(msg)
573
  return None, msg, msg
574
 
 
575
  tts_model.tts_to_file(
576
  text=full_text,
577
  speaker_wav=speaker_wav_path,
578
  language=target_language,
579
  file_path=output_audio_path,
580
+ speed=desired_speed,
581
  split_sentences=True
582
  )
583
 
 
616
  slope = (2 - 1.0) / (30 - 14)
617
  return 1.0 + slope * (cps - 14)
618
 
 
619
  def upload_and_manage(file, target_language, mode="transcription"):
620
  if file is None:
621
  logger.info("No file uploaded. Please upload a video/audio file.")
 
733
  return demo
734
 
735
  tts_model = None
736
# No `global tts_model` here: at module scope the statement has no effect, and
# placing it after the `tts_model = None` assignment above is a SyntaxError.
# A function that rebinds tts_model (e.g. to lazily load the TTS model) must
# declare `global tts_model` inside its own body instead.
# Launch the Gradio interface
demo = build_interface()
demo.launch()