Update app.py
Browse files
app.py
CHANGED
@@ -408,7 +408,7 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
|
|
408 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
409 |
return None
|
410 |
|
411 |
-
def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
|
412 |
logger.debug(f"Processing entry {i}: {entry}")
|
413 |
error_message = None
|
414 |
|
@@ -424,13 +424,15 @@ def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover,
|
|
424 |
try:
|
425 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
426 |
desired_duration = entry["end"] - entry["start"]
|
427 |
-
|
428 |
-
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
429 |
|
430 |
-
|
|
|
|
|
|
|
431 |
|
432 |
-
|
433 |
-
|
434 |
|
435 |
if not output_path or not os.path.exists(segment_audio_path):
|
436 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
@@ -453,28 +455,28 @@ def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover,
|
|
453 |
|
454 |
return i, txt_clip, audio_segment, error_message
|
455 |
|
456 |
-
def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
|
457 |
video = VideoFileClip(video_path)
|
458 |
font_path = "./NotoSansSC-Regular.ttf"
|
459 |
|
460 |
text_clips = []
|
461 |
audio_segments = []
|
462 |
error_messages = []
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
|
476 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
477 |
-
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path,
|
478 |
for i, entry in enumerate(translated_json)]
|
479 |
|
480 |
results = []
|
@@ -526,26 +528,56 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
|
|
526 |
|
527 |
return error_messages
|
528 |
|
529 |
-
def
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
536 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
537 |
if not speaker_wav_path or not os.path.exists(speaker_wav_path):
|
538 |
msg = f"❌ Speaker audio not found: {speaker_wav_path}"
|
539 |
logger.error(msg)
|
540 |
return None, msg, msg
|
541 |
|
542 |
-
speed_tts = calibrated_speed(full_text, desired_duration)
|
543 |
tts_model.tts_to_file(
|
544 |
text=full_text,
|
545 |
speaker_wav=speaker_wav_path,
|
546 |
language=target_language,
|
547 |
file_path=output_audio_path,
|
548 |
-
speed=
|
549 |
split_sentences=True
|
550 |
)
|
551 |
|
@@ -584,7 +616,6 @@ def calibrated_speed(text, desired_duration):
|
|
584 |
slope = (2 - 1.0) / (30 - 14)
|
585 |
return 1.0 + slope * (cps - 14)
|
586 |
|
587 |
-
|
588 |
def upload_and_manage(file, target_language, mode="transcription"):
|
589 |
if file is None:
|
590 |
logger.info("No file uploaded. Please upload a video/audio file.")
|
@@ -702,6 +733,7 @@ def build_interface():
|
|
702 |
return demo
|
703 |
|
704 |
tts_model = None
|
|
|
705 |
# Launch the Gradio interface
|
706 |
demo = build_interface()
|
707 |
demo.launch()
|
|
|
408 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
409 |
return None
|
410 |
|
411 |
+
def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover, target_language, font_path, use_clone, speaker_sample_paths=None):
|
412 |
logger.debug(f"Processing entry {i}: {entry}")
|
413 |
error_message = None
|
414 |
|
|
|
424 |
try:
|
425 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
426 |
desired_duration = entry["end"] - entry["start"]
|
427 |
+
desired_speed = calibrated_speed(entry['translated'], desired_duration)
|
|
|
428 |
|
429 |
+
if use_clone:
|
430 |
+
speaker = entry.get("speaker", "default")
|
431 |
+
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
432 |
+
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
433 |
|
434 |
+
else:
|
435 |
+
generate_voiceover_OpenAI(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
436 |
|
437 |
if not output_path or not os.path.exists(segment_audio_path):
|
438 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
|
|
455 |
|
456 |
return i, txt_clip, audio_segment, error_message
|
457 |
|
458 |
+
def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None, use_clone=False):
|
459 |
video = VideoFileClip(video_path)
|
460 |
font_path = "./NotoSansSC-Regular.ttf"
|
461 |
|
462 |
text_clips = []
|
463 |
audio_segments = []
|
464 |
error_messages = []
|
465 |
+
|
466 |
+
if use_clone:
|
467 |
+
if tts_model is None:
|
468 |
+
try:
|
469 |
+
print("🔄 Loading XTTS model...")
|
470 |
+
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
|
471 |
+
print("✅ XTTS model loaded successfully.")
|
472 |
+
except Exception as e:
|
473 |
+
print("❌ Error loading XTTS model:")
|
474 |
+
traceback.print_exc()
|
475 |
+
return f"Error loading XTTS model: {e}"
|
476 |
+
## Need to implement backup option.
|
477 |
|
478 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
479 |
+
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, use_clone, speaker_sample_paths)
|
480 |
for i, entry in enumerate(translated_json)]
|
481 |
|
482 |
results = []
|
|
|
528 |
|
529 |
return error_messages
|
530 |
|
531 |
+
def generate_voiceover_OpenAI(full_text, language, desired_speed, output_audio_path):
|
532 |
+
"""
|
533 |
+
Generate voiceover from translated text for a given language using OpenAI TTS API.
|
534 |
+
"""
|
535 |
+
# Define the voice based on the language (for now, use 'alloy' as default)
|
536 |
+
voice = "alloy" # Adjust based on language if needed
|
537 |
+
|
538 |
+
# Define the model (use tts-1 for real-time applications)
|
539 |
+
model = "tts-1"
|
540 |
+
|
541 |
+
max_retries = 3
|
542 |
+
retry_count = 0
|
543 |
+
|
544 |
+
while retry_count < max_retries:
|
545 |
+
try:
|
546 |
+
# Create the speech using OpenAI TTS API
|
547 |
+
response = client.audio.speech.create(
|
548 |
+
model=model,
|
549 |
+
voice=voice,
|
550 |
+
input=full_text,
|
551 |
+
speed=desired_speed
|
552 |
+
)
|
553 |
+
# Save the audio to the specified path
|
554 |
+
with open(output_audio_path, 'wb') as f:
|
555 |
+
for chunk in response.iter_bytes():
|
556 |
+
f.write(chunk)
|
557 |
+
logging.info(f"Voiceover generated successfully for {output_audio_path}")
|
558 |
+
break
|
559 |
|
560 |
+
except Exception as e:
|
561 |
+
retry_count += 1
|
562 |
+
logging.error(f"Error generating voiceover (retry {retry_count}/{max_retries}): {e}")
|
563 |
+
time.sleep(5) # Wait 5 seconds before retrying
|
564 |
+
|
565 |
+
if retry_count == max_retries:
|
566 |
+
raise ValueError(f"Failed to generate voiceover after {max_retries} retries.")
|
567 |
+
|
568 |
+
def generate_voiceover_clone(full_text, tts_model, desired_speed, target_language, speaker_wav_path, output_audio_path, use_clone=False):
|
569 |
+
try:
|
570 |
if not speaker_wav_path or not os.path.exists(speaker_wav_path):
|
571 |
msg = f"❌ Speaker audio not found: {speaker_wav_path}"
|
572 |
logger.error(msg)
|
573 |
return None, msg, msg
|
574 |
|
|
|
575 |
tts_model.tts_to_file(
|
576 |
text=full_text,
|
577 |
speaker_wav=speaker_wav_path,
|
578 |
language=target_language,
|
579 |
file_path=output_audio_path,
|
580 |
+
speed=desired_speed,
|
581 |
split_sentences=True
|
582 |
)
|
583 |
|
|
|
616 |
slope = (2 - 1.0) / (30 - 14)
|
617 |
return 1.0 + slope * (cps - 14)
|
618 |
|
|
|
619 |
def upload_and_manage(file, target_language, mode="transcription"):
|
620 |
if file is None:
|
621 |
logger.info("No file uploaded. Please upload a video/audio file.")
|
|
|
733 |
return demo
|
734 |
|
735 |
tts_model = None
|
736 |
+
global tts_model
|
737 |
# Launch the Gradio interface
|
738 |
demo = build_interface()
|
739 |
demo.launch()
|