BoldActionMan committed
Commit 2106546
1 Parent(s): 73ce293

Update app.py

Files changed (1)
  1. app.py +162 -119
app.py CHANGED
@@ -40,7 +40,6 @@ def process_video(video_file, language_choice):
     enhanced = enhance(model, df_state, audio)
     save_audio(reference_audio, enhanced, df_state.sr())
     reference_speaker = reference_audio # This is the voice you want to clone
-    target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)

     src_path = os.path.join(output_dir, "tmp.wav")

@@ -58,123 +57,18 @@ def process_video(video_file, language_choice):
     # Get the segments with start and end times
     segments = sttresult['segments']

-    # Choose the target language for translation
-    language = 'EN_NEWEST'
-    match language_choice:
-        case 'en':
-            language = 'EN_NEWEST'
-        case 'es':
-            language = 'ES'
-        case 'fr':
-            language = 'FR'
-        case 'zh-CN' | 'zh-TW':
-            language = 'ZH'
-        case 'ja':
-            language = 'JP'
-        case 'ko':
-            language = 'KR'
-        case _:
-            language = 'EN_NEWEST'
-
-    # Translate the transcription segment by segment
-    def translate_segment(segment):
-        return segment["start"], segment["end"], ts.translate_text(query_text=segment["text"], translator="google", to_language=language_choice)
-
-    # Batch translation to reduce memory load
-    batch_size = 2
-    translation_segments = []
-    for i in range(0, len(segments), batch_size):
-        batch = segments[i:i + batch_size]
-        with ThreadPoolExecutor(max_workers=5) as executor:
-            batch_translations = list(executor.map(translate_segment, batch))
-        translation_segments.extend(batch_translations)
-
-    # Generate the translated audio for each segment
-    model = TTS(language=language, device=device)
-    speaker_ids = model.hps.data.spk2id
-
-    def generate_segment_audio(segment, speaker_id):
-        start, end, translated_text = segment
-        segment_path = os.path.join(output_dir, f'segment_{start}_{end}.wav')
-        model.tts_to_file(translated_text, speaker_id, segment_path, speed=speed)
-        return segment_path, start, end, translated_text
-
-    for speaker_key in speaker_ids.keys():
-        speaker_id = speaker_ids[speaker_key]
-        speaker_key = speaker_key.lower().replace('_', '-')
-
-        source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)
-
-        segment_files = []
-        subtitle_entries = []
-        for segment in translation_segments:
-            segment_file, start, end, translated_text = generate_segment_audio(segment, speaker_id)
-
-            # Run the tone color converter
-            encode_message = "@MyShell"
-            tone_color_converter.convert(
-                audio_src_path=segment_file,
-                src_se=source_se,
-                tgt_se=target_se,
-                output_path=segment_file,
-                message=encode_message)
-
-            segment_files.append((segment_file, start, end, translated_text))
-
-        # Combine the audio segments
-        combined_audio = AudioSegment.empty()
-        video_segments = []
-        previous_end = 0
-        subtitle_counter = 1
-        for segment_file, start, end, translated_text in segment_files:
-            segment_audio = AudioSegment.from_file(segment_file)
-            combined_audio += segment_audio
-
-            # Calculate the duration of the audio segment
-            audio_duration = len(segment_audio) / 1000.0
-
-            # Add the subtitle entry for this segment
-            subtitle_entries.append((subtitle_counter, previous_end, previous_end + audio_duration, translated_text))
-            subtitle_counter += 1
-
-            # Get the corresponding video segment and adjust its speed to match the audio duration
-            video_segment = (
-                ffmpeg
-                .input(reference_video.filename, ss=start, to=end)
-                .filter('setpts', f'PTS / {(end - start) / audio_duration}')
-            )
-            video_segments.append((video_segment, ffmpeg.input(segment_file)))
-            previous_end += audio_duration
-
-        save_path = os.path.join(output_dir, f'output_v2_{speaker_key}.wav')
-        combined_audio.export(save_path, format="wav")
-
-        # Combine video and audio segments using ffmpeg
-        video_and_audio_files = [item for sublist in video_segments for item in sublist]
-        joined = (
-            ffmpeg
-            .concat(*video_and_audio_files, v=1, a=1)
-            .node
-        )
-
-        final_video_path = os.path.join(output_dir, f'final_video_{speaker_key}.mp4')
-        try:
-            (
-                ffmpeg
-                .output(joined[0], joined[1], final_video_path, vcodec='libx264', acodec='aac')
-                .run(overwrite_output=True)
-            )
-        except ffmpeg.Error as e:
-            print('ffmpeg error:', e)
-            print(e.stderr.decode('utf-8'))
-
-        print(f"Final video without subtitles saved to: {final_video_path}")
+    if sttresult["language"] == language_choice[0:2]:
+        print("Chosen language is the same as the video's original language. Only adding subtitles.")
+        segments = sttresult['segments']

         # Generate subtitles file in SRT format
         srt_path = os.path.join(output_dir, 'subtitles.srt')
         with open(srt_path, 'w', encoding='utf-8') as srt_file:
-            for entry in subtitle_entries:
-                index, start, end, text = entry
+            for i, segment in enumerate(segments):
+                start = segment['start']
+                end = segment['end']
+                text = segment['text']
+
                 start_hours, start_minutes = divmod(int(start), 3600)
                 start_minutes, start_seconds = divmod(start_minutes, 60)
                 start_milliseconds = int((start * 1000) % 1000)
@@ -183,17 +77,17 @@ def process_video(video_file, language_choice):
                 end_minutes, end_seconds = divmod(end_minutes, 60)
                 end_milliseconds = int((end * 1000) % 1000)

-                srt_file.write(f"{index}\n")
+                srt_file.write(f"{i+1}\n")
                 srt_file.write(f"{start_hours:02}:{start_minutes:02}:{start_seconds:02},{start_milliseconds:03} --> "
                                f"{end_hours:02}:{end_minutes:02}:{end_seconds:02},{end_milliseconds:03}\n")
                 srt_file.write(f"{text}\n\n")

         # Add subtitles to the video
-        final_video_with_subs_path = os.path.join(output_dir, f'final_video_with_subs_{speaker_key}.mp4')
+        final_video_with_subs_path = os.path.join(output_dir, f'final_video_with_subs.mp4')
         try:
             (
                 ffmpeg
-                .input(final_video_path)
+                .input(video_file)
                 .output(final_video_with_subs_path, vf=f"subtitles={srt_path}")
                 .run(overwrite_output=True)
             )
@@ -202,8 +96,156 @@ def process_video(video_file, language_choice):
             print(e.stderr.decode('utf-8'))

         print(f"Final video with subtitles saved to: {final_video_with_subs_path}")
-
         return final_video_with_subs_path
+    else:
+        target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)
+
+        # Choose the target language for translation
+        language = 'EN_NEWEST'
+        match language_choice[0:2]:
+            case 'en':
+                language = 'EN_NEWEST'
+            case 'es':
+                language = 'ES'
+            case 'fr':
+                language = 'FR'
+            case 'zh':
+                language = 'ZH'
+            case 'ja':
+                language = 'JP'
+            case 'ko':
+                language = 'KR'
+            case _:
+                language = 'EN_NEWEST'
+
+        # Translate the transcription segment by segment
+        def translate_segment(segment):
+            return segment["start"], segment["end"], ts.translate_text(query_text=segment["text"], translator="google", to_language=language_choice)
+
+        # Batch translation to reduce memory load
+        batch_size = 2
+        translation_segments = []
+        for i in range(0, len(segments), batch_size):
+            batch = segments[i:i + batch_size]
+            with ThreadPoolExecutor(max_workers=5) as executor:
+                batch_translations = list(executor.map(translate_segment, batch))
+            translation_segments.extend(batch_translations)
+
+        # Generate the translated audio for each segment
+        model = TTS(language=language, device=device)
+        speaker_ids = model.hps.data.spk2id
+
+        def generate_segment_audio(segment, speaker_id):
+            start, end, translated_text = segment
+            segment_path = os.path.join(output_dir, f'segment_{start}_{end}.wav')
+            model.tts_to_file(translated_text, speaker_id, segment_path, speed=speed)
+            return segment_path, start, end, translated_text
+
+        for speaker_key in speaker_ids.keys():
+            speaker_id = speaker_ids[speaker_key]
+            speaker_key = speaker_key.lower().replace('_', '-')
+
+            source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)
+
+            segment_files = []
+            subtitle_entries = []
+            for segment in translation_segments:
+                segment_file, start, end, translated_text = generate_segment_audio(segment, speaker_id)
+
+                # Run the tone color converter
+                encode_message = "@MyShell"
+                tone_color_converter.convert(
+                    audio_src_path=segment_file,
+                    src_se=source_se,
+                    tgt_se=target_se,
+                    output_path=segment_file,
+                    message=encode_message)
+
+                segment_files.append((segment_file, start, end, translated_text))
+
+            # Combine the audio segments
+            combined_audio = AudioSegment.empty()
+            video_segments = []
+            previous_end = 0
+            subtitle_counter = 1
+            for segment_file, start, end, translated_text in segment_files:
+                segment_audio = AudioSegment.from_file(segment_file)
+                combined_audio += segment_audio
+
+                # Calculate the duration of the audio segment
+                audio_duration = len(segment_audio) / 1000.0
+
+                # Add the subtitle entry for this segment
+                subtitle_entries.append((subtitle_counter, previous_end, previous_end + audio_duration, translated_text))
+                subtitle_counter += 1
+
+                # Get the corresponding video segment and adjust its speed to match the audio duration
+                video_segment = (
+                    ffmpeg
+                    .input(reference_video.filename, ss=start, to=end)
+                    .filter('setpts', f'PTS / {(end - start) / audio_duration}')
+                )
+                video_segments.append((video_segment, ffmpeg.input(segment_file)))
+                previous_end += audio_duration
+
+            save_path = os.path.join(output_dir, f'output_v2_{speaker_key}.wav')
+            combined_audio.export(save_path, format="wav")
+
+            # Combine video and audio segments using ffmpeg
+            video_and_audio_files = [item for sublist in video_segments for item in sublist]
+            joined = (
+                ffmpeg
+                .concat(*video_and_audio_files, v=1, a=1)
+                .node
+            )
+
+            final_video_path = os.path.join(output_dir, f'final_video_{speaker_key}.mp4')
+            try:
+                (
+                    ffmpeg
+                    .output(joined[0], joined[1], final_video_path, vcodec='libx264', acodec='aac')
+                    .run(overwrite_output=True)
+                )
+            except ffmpeg.Error as e:
+                print('ffmpeg error:', e)
+                print(e.stderr.decode('utf-8'))
+
+            print(f"Final video without subtitles saved to: {final_video_path}")
+
+            # Generate subtitles file in SRT format
+            srt_path = os.path.join(output_dir, 'subtitles.srt')
+            with open(srt_path, 'w', encoding='utf-8') as srt_file:
+                for entry in subtitle_entries:
+                    index, start, end, text = entry
+                    start_hours, start_minutes = divmod(int(start), 3600)
+                    start_minutes, start_seconds = divmod(start_minutes, 60)
+                    start_milliseconds = int((start * 1000) % 1000)
+
+                    end_hours, end_minutes = divmod(int(end), 3600)
+                    end_minutes, end_seconds = divmod(end_minutes, 60)
+                    end_milliseconds = int((end * 1000) % 1000)
+
+                    srt_file.write(f"{index}\n")
+                    srt_file.write(f"{start_hours:02}:{start_minutes:02}:{start_seconds:02},{start_milliseconds:03} --> "
+                                   f"{end_hours:02}:{end_minutes:02}:{end_seconds:02},{end_milliseconds:03}\n")
+                    srt_file.write(f"{text}\n\n")
+
+            # Add subtitles to the video
+            final_video_with_subs_path = os.path.join(output_dir, f'final_video_with_subs_{speaker_key}.mp4')
+            try:
+                (
+                    ffmpeg
+                    .input(final_video_path)
+                    .output(final_video_with_subs_path, vf=f"subtitles={srt_path}")
+                    .run(overwrite_output=True)
+                )
+            except ffmpeg.Error as e:
+                print('ffmpeg error:', e)
+                print(e.stderr.decode('utf-8'))
+
+            print(f"Final video with subtitles saved to: {final_video_with_subs_path}")
+
+            return final_video_with_subs_path


 # Define Gradio interface
@@ -211,12 +253,13 @@ def gradio_interface(video_file, language_choice):
     return process_video(video_file, language_choice)

 language_choices = ts.get_languages("google")["en"]
+language_choices.pop("auto")

 gr.Interface(
     fn=gradio_interface,
     inputs=[
         gr.Video(label="Upload Video", sources=['upload']),
-        gr.Dropdown(choices=language_choices, label="Choose Language for Translation")
+        gr.Dropdown(choices=language_choices, label="Choose Language for Translation (Expressed in ISO 639-1 code)")
     ],
     outputs=gr.Video(label="Translated Video"),
     title="Video Translation and Voice Cloning",
 
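The early-exit branch added above compares sttresult["language"], a bare ISO 639-1 code as reported by Whisper-style transcribers, against the first two characters of the selected translators code, which can carry a region tag such as "zh-CN". A minimal sketch of that comparison, assuming Whisper-style codes (same_base_language is a hypothetical name, not from app.py):

# Illustrative sketch; `same_base_language` is a hypothetical helper, not part of this commit.
def same_base_language(detected: str, choice: str) -> bool:
    # Whisper reports e.g. "zh"; translators codes may be region-tagged, e.g. "zh-CN".
    # Comparing the two-letter base code mirrors `language_choice[0:2]` in the diff.
    return detected == choice[:2]

print(same_base_language("zh", "zh-CN"))  # True  -> subtitle-only path
print(same_base_language("en", "fr"))     # False -> translate and re-voice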
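Both SRT-writing loops in the diff convert fractional seconds to HH:MM:SS,mmm stamps with chained divmod calls, reusing the 3600 remainder under the *_minutes name before splitting it into minutes and seconds. A standalone sketch of the same arithmetic using only the standard library (to_srt_timestamp is an illustrative name, not from the commit):

# Illustrative sketch of the timestamp math in both SRT loops; the helper
# name is hypothetical, not part of this commit.
def to_srt_timestamp(seconds: float) -> str:
    hours, remainder = divmod(int(seconds), 3600)  # whole hours, leftover seconds
    minutes, secs = divmod(remainder, 60)          # whole minutes, leftover seconds
    milliseconds = int((seconds * 1000) % 1000)    # fractional part in milliseconds
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

assert to_srt_timestamp(3723.5) == "01:02:03,500"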