Update app.py
app.py CHANGED
@@ -126,55 +126,31 @@ def handle_feedback(feedback):
     return "Thank you for your feedback!", None
 
 def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
-
-    """
-    Detects and extracts non-speech (background) segments from audio using pyannote VAD.
-
-    Parameters:
-    - audio_path (str): Path to input audio (.wav).
-    - segment_audio_path (str): Path to save the output non-speech audio.
-    - hf_token (str): Hugging Face auth token for pyannote.
-
-    Returns:
-    - List of non-speech timestamp tuples (start, end) in seconds.
-    """
-
-    # Step 1: Load pipeline
     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
-
-    # Step 2: Apply VAD to get speech segments
     vad_result = pipeline(audio_path)
-    print("✅ Speech segments detected.")
 
-    # Step 3: Get full duration of the audio
     full_audio = AudioSegment.from_wav(audio_path)
     full_duration_sec = len(full_audio) / 1000.0
 
-    # Step 4: Compute non-speech segments
-    background_segments = []
     current_time = 0.0
+    result_audio = AudioSegment.empty()
 
     for segment in vad_result.itersegments():
+        # Background segment before the speech
         if current_time < segment.start:
-            background_segments.append((current_time, segment.start))
+            bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
+            result_audio += bg
+        # Add silence for the speech duration
+        silence_duration = segment.end - segment.start
+        result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
         current_time = segment.end
 
+    # Handle any remaining background after the last speech
     if current_time < full_duration_sec:
-        background_segments.append((current_time, full_duration_sec))
-
-    print(f"🔊 Non-speech segments: {background_segments}")
-
-    # Step 5: Extract and combine non-speech segments
-    non_speech_audio = AudioSegment.empty()
-    for start, end in background_segments:
-        segment = full_audio[int(start * 1000):int(end * 1000)]
-        non_speech_audio += segment
-
-    # Step 6: Export the non-speech audio
-    non_speech_audio.export(background_audio_path, format="wav")
-    print(f"🎵 Non-speech audio saved to: {background_audio_path}")
+        result_audio += full_audio[int(current_time * 1000):]
 
-
+    result_audio.export(background_audio_path, format="wav")
+    return background_audio_path
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
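
Note on the new logic: instead of exporting only the concatenated non-speech chunks, the function now rebuilds a background track of roughly the original length, copying background audio and substituting silence of equal length wherever speech was detected, and returns the output path. A minimal usage sketch, assuming the audio has already been extracted to a .wav file and that hf_api_key is configured for pyannote; the file name below is illustrative, not taken from this commit:

from pydub import AudioSegment

extracted_wav = "extracted_audio.wav"  # assumed name of the audio pulled from the video
bg_path = segment_background_audio(extracted_wav)

# Speech spans are replaced by silence of equal length, so the background track
# should match the original duration to within a few milliseconds of rounding.
print(len(AudioSegment.from_wav(bg_path)), len(AudioSegment.from_wav(extracted_wav)))
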
@@ -427,9 +403,11 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
     speaker = entry.get("speaker", "default")
     speaker_wav_path = f"speaker_{speaker}_sample.wav"
 
-    if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path):
+    # Assume this is the list of supported languages for the TTS model
+    supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
+
+    if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
         generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
-
     else:
         generate_voiceover_OpenAI(entry['translated'], target_language, desired_speed, segment_audio_path)
 
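
One caveat with the added language check: on single-language Coqui TTS models the nested language_manager can be missing or None, so the attribute lookup itself may raise before the OpenAI fallback is ever reached. A defensive variant, shown only as a sketch and not part of this commit, could fall back to an empty set:

def get_supported_languages(tts_model):
    # Sketch: walks the same attribute chain as the commit, but tolerates
    # models that expose no language manager (e.g. single-language TTS).
    try:
        manager = tts_model.synthesizer.tts_model.language_manager
        return set(manager.name_to_id.keys()) if manager else set()
    except AttributeError:
        return set()
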
@@ -504,9 +482,9 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
     voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
 
     if background_audio_path and os.path.exists(background_audio_path):
-        # background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
-        # final_audio = CompositeAudioClip([voice_audio, background_audio])
-        final_audio = voice_audio
+        background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
+        final_audio = CompositeAudioClip([voice_audio, background_audio])
+        # final_audio = voice_audio
         logger.info("✅ Background audio loaded and merged with voiceover.")
     else:
         final_audio = voice_audio
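
With this hunk the background track is mixed at its original gain. If it turns out to compete with the voiceover, one possible follow-up (a sketch under the assumption that moviepy 1.x is in use, not something this commit does) is to attenuate it before compositing:

from moviepy.editor import AudioFileClip, CompositeAudioClip
from moviepy.audio.fx.all import volumex

# voice_audio, video and background_audio_path come from the surrounding
# add_transcript_voiceover code; the 0.3 factor is an arbitrary example.
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
background_audio = volumex(background_audio, 0.3)  # lower background to ~30% volume
final_audio = CompositeAudioClip([voice_audio, background_audio])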