text_to_speech_sync_video

Running

App Files Community

TDN-M committed on Jan 20

Commit

1ad0926

verified ·

1 Parent(s): 74c50f3

Update avatar.py

Browse files

Files changed (1) hide show

avatar.py +14 -9

avatar.py CHANGED Viewed

@@ -10,8 +10,9 @@ import os
 import torch
 import io
 import soundfile as sf
 from Wav2Lip.models import Wav2Lip
-from models import Wav2Lip
 import face_detection
 import audio
@@ -244,10 +245,12 @@ class Avatar:
     # print ("face_detect_img_results[1][1] shape = " +str(self.face_detect_img_results[1][0].shape)) #this is cropped image
-  def datagen(self, full_frames, mels, face_detect_results,progress_bar):
     img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
     print(len(full_frames))
-    progress_bar.progress(0)
     for i, m in enumerate(mels):
       idx = i%len(full_frames)
       frame_to_save = full_frames[idx].copy()
@@ -272,8 +275,9 @@ class Avatar:
         #print(f"len(img_batch)>{self.datagen_batch_size} now len(img_batch)=" + str(len(img_batch)))
         yield img_batch, mel_batch, frame_batch, coords_batch
         img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
-    progress_percentage = (i + 1) / len(mels)
-    progress_bar.progress(progress_percentage)
     if len(img_batch) > 0:
       img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
@@ -503,8 +507,6 @@ class Avatar:
     self.image_frame_num_current=0
   def text_to_lip_video_parallel(self, input_text, pre_base_video_frames,pre_face_detect_results):
     mel_chunks_from_audio, audio_segment = self.create_mel_from_audio(input_text)
     print(str(len(self.face_detect_img_results)))
     video_full_frames_copy=self.video_full_frames.copy()
@@ -513,7 +515,7 @@ class Avatar:
       self.video_audio_adjust_parallel(mel_chunks_from_audio,video_full_frames_copy,
                                        pre_base_video_frames,pre_face_detect_results))
-    gen=self.datagen(video_full_frames_copy,mel_chunks_from_audio,face_detect_results)
     if self.export_video:
       video_write_handle = cv2.VideoWriter(self.temp_lip_video_no_voice_path + self.temp_lip_video_no_voice_filename,
@@ -523,13 +525,16 @@ class Avatar:
       video_write_handle=0
     self.make_lip_video(gen,video_write_handle, mel_chunks_from_audio,len(mel_chunks_from_audio)>self.datagen_batch_size,audio_segment)
     if self.export_video:
       self.sync_video_audio(self.output_audio_path + self.output_audio_filename,
                             self.temp_lip_video_no_voice_path + self.temp_lip_video_no_voice_filename,
                             self.output_video_path + self.split_current_file_name)
     image_frame_num_current=0
     return post_base_video_frames,post_face_detect_results
   def delete_files_in_path(self,dir_path):
     # Get the list of files in the directory

 import torch
 import io
 import soundfile as sf
+from Wav2Lip.face_detection.models import Wav2Lip
+from Wav2Lip.face_detection import FACE_DETECTION
 from Wav2Lip.models import Wav2Lip
 import face_detection
 import audio
     # print ("face_detect_img_results[1][1] shape = " +str(self.face_detect_img_results[1][0].shape)) #this is cropped image
+  def datagen(self, full_frames, mels, face_detect_results, progress_bar=None):
     img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
     print(len(full_frames))
+    if progress_bar:
+        progress_bar.progress(0)
     for i, m in enumerate(mels):
       idx = i%len(full_frames)
       frame_to_save = full_frames[idx].copy()
         #print(f"len(img_batch)>{self.datagen_batch_size} now len(img_batch)=" + str(len(img_batch)))
         yield img_batch, mel_batch, frame_batch, coords_batch
         img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+    if progress_bar:
+            progress_percentage = (i + 1) / len(mels)
+            progress_bar.progress(progress_percentage)
     if len(img_batch) > 0:
       img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
     self.image_frame_num_current=0
   def text_to_lip_video_parallel(self, input_text, pre_base_video_frames,pre_face_detect_results):
     mel_chunks_from_audio, audio_segment = self.create_mel_from_audio(input_text)
     print(str(len(self.face_detect_img_results)))
     video_full_frames_copy=self.video_full_frames.copy()
       self.video_audio_adjust_parallel(mel_chunks_from_audio,video_full_frames_copy,
                                        pre_base_video_frames,pre_face_detect_results))
+    gen = self.datagen(video_full_frames_copy, mel_chunks_from_audio, face_detect_results, progress_bar)
     if self.export_video:
       video_write_handle = cv2.VideoWriter(self.temp_lip_video_no_voice_path + self.temp_lip_video_no_voice_filename,
       video_write_handle=0
     self.make_lip_video(gen,video_write_handle, mel_chunks_from_audio,len(mel_chunks_from_audio)>self.datagen_batch_size,audio_segment)
+    if self.export_video and video_write_handle:
+        video_write_handle.release()
     if self.export_video:
       self.sync_video_audio(self.output_audio_path + self.output_audio_filename,
                             self.temp_lip_video_no_voice_path + self.temp_lip_video_no_voice_filename,
                             self.output_video_path + self.split_current_file_name)
     image_frame_num_current=0
     return post_base_video_frames,post_face_detect_results
   def delete_files_in_path(self,dir_path):
     # Get the list of files in the directory