qqwjq1981 committed
Commit 32a25f2 · verified · 1 Parent(s): 7963262

Update app.py

Files changed (1)
  1. app.py +18 -40
app.py CHANGED
@@ -126,55 +126,31 @@ def handle_feedback(feedback):
     return "Thank you for your feedback!", None
 
 def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
-
-    """
-    Detects and extracts non-speech (background) segments from audio using pyannote VAD.
-
-    Parameters:
-    - audio_path (str): Path to input audio (.wav).
-    - segment_audio_path (str): Path to save the output non-speech audio.
-    - hf_token (str): Hugging Face auth token for pyannote.
-
-    Returns:
-    - List of non-speech timestamp tuples (start, end) in seconds.
-    """
-
-    # Step 1: Load pipeline
     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
-
-    # Step 2: Apply VAD to get speech segments
     vad_result = pipeline(audio_path)
-    print("✅ Speech segments detected.")
 
-    # Step 3: Get full duration of the audio
     full_audio = AudioSegment.from_wav(audio_path)
     full_duration_sec = len(full_audio) / 1000.0
 
-    # Step 4: Compute non-speech segments
-    background_segments = []
     current_time = 0.0
+    result_audio = AudioSegment.empty()
 
     for segment in vad_result.itersegments():
+        # Background segment before the speech
         if current_time < segment.start:
-            background_segments.append((current_time, segment.start))
+            bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
+            result_audio += bg
+        # Add silence for the speech duration
+        silence_duration = segment.end - segment.start
+        result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
         current_time = segment.end
 
+    # Handle any remaining background after the last speech
     if current_time < full_duration_sec:
-        background_segments.append((current_time, full_duration_sec))
-
-    print(f"🕒 Non-speech segments: {background_segments}")
-
-    # Step 5: Extract and combine non-speech segments
-    non_speech_audio = AudioSegment.empty()
-    for start, end in background_segments:
-        segment = full_audio[int(start * 1000):int(end * 1000)]
-        non_speech_audio += segment
-
-    # Step 6: Export the non-speech audio
-    non_speech_audio.export(background_audio_path, format="wav")
-    print(f"🎵 Non-speech audio saved to: {background_audio_path}")
+        result_audio += full_audio[int(current_time * 1000):]
 
-    return background_segments
+    result_audio.export(background_audio_path, format="wav")
+    return background_audio_path
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
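Note on this hunk: the rewrite changes the contract of `segment_background_audio`. Instead of collecting (start, end) tuples and exporting only the concatenated non-speech audio, the function now builds a track of the same overall length in which each detected speech span is replaced by silence, exports it, and returns the output path. A minimal usage sketch, assuming pydub and pyannote.audio are installed, `hf_api_key` holds a valid Hugging Face token, and the input file name is a placeholder:

```python
# Minimal usage sketch; "input.wav" is a placeholder file name.
from pydub import AudioSegment

bg_path = segment_background_audio("input.wav", background_audio_path="background_segments.wav")

# The exported file keeps background passages and replaces each speech span
# with silence of equal length, so its duration roughly matches the input.
bg = AudioSegment.from_wav(bg_path)
print(f"Background-only track: {len(bg) / 1000.0:.2f} s")
```

Keeping the silent gaps in place means the background track can later be layered under the generated voiceover without any time realignment.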
@@ -427,9 +403,11 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
     speaker = entry.get("speaker", "default")
     speaker_wav_path = f"speaker_{speaker}_sample.wav"
 
-    if process_mode>2 and speaker_wav_path and os.path.exists(speaker_wav_path):
+    # Assume this is the list of supported languages for the TTS model
+    supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
+
+    if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
         generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
-
     else:
         generate_voiceover_OpenAI(entry['translated'], target_language, desired_speed, segment_audio_path)
 
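Note on this hunk: the voice-cloning path is now additionally gated on whether `target_language` is among the languages the TTS model exposes via `language_manager.name_to_id`; unsupported languages fall back to `generate_voiceover_OpenAI`. A hedged sketch of the same decision in isolation; the helper name is illustrative, while the attribute chain is the one used in the diff:

```python
import os

# Illustrative helper, not part of the repository: returns which backend the
# new condition would select for a given entry.
def choose_tts_backend(tts_model, target_language, process_mode, speaker_wav_path):
    supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
    can_clone = (
        process_mode > 2
        and speaker_wav_path
        and os.path.exists(speaker_wav_path)
        and target_language in supported_languages
    )
    return "clone" if can_clone else "openai"
```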
 
@@ -504,9 +482,9 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
     voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
 
     if background_audio_path and os.path.exists(background_audio_path):
-        # background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
-        # final_audio = CompositeAudioClip([voice_audio, background_audio])
-        final_audio = voice_audio
+        background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
+        final_audio = CompositeAudioClip([voice_audio, background_audio])
+        # final_audio = voice_audio
         logger.info("✅ Background audio loaded and merged with voiceover.")
     else:
         final_audio = voice_audio
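Note on this hunk: the previously commented-out mixing is re-enabled, so the exported background track is layered under the voiceover with moviepy's `CompositeAudioClip`. A minimal sketch of that mixing step, assuming the moviepy 1.x API and placeholder file names rather than paths from the repository:

```python
# Minimal mixing sketch (moviepy 1.x); all file names are placeholders.
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip

video = VideoFileClip("input_video.mp4")
voice_audio = AudioFileClip("voiceover.wav").set_duration(video.duration)
background_audio = AudioFileClip("background_segments.wav").set_duration(video.duration)

# Because the background file is silent wherever speech was detected,
# overlaying it does not mask the generated voiceover.
final_audio = CompositeAudioClip([voice_audio, background_audio])
video = video.set_audio(final_audio)
video.write_videofile("output_video.mp4", audio_codec="aac")
```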
 