status placeholder and progress bars added
app.py
CHANGED
@@ -16,12 +16,14 @@ from avatar import Avatar
 
 options = ['Aude', 'Kyla', 'Liv']
 images = ['ref_videos/Aude.png', 'ref_videos/Kyla.png', 'ref_videos/Liv.png']
+current_status_placeholder = st.empty()
+init_progress_bar = st.progress(0)
 if 'is_initialized' not in st.session_state:
     st.session_state.avatar = Avatar()
     st.session_state.avatar.export_video = False
-
+    current_status_placeholder.write("load model")
     st.session_state.avatar.load_model("checkpoint/wav2lip_gan.pth")
-
+    current_status_placeholder.write("load model finished")
     st.session_state.avatar.device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(st.session_state.avatar.device)
     st.session_state.avatar.output_audio_path = "audio/"
@@ -36,7 +38,7 @@ if 'is_initialized' not in st.session_state:
     st.session_state.avatar.get_video_full_frames(st.session_state.avatar.ref_video_path_and_filename)
     st.session_state.avatar.face_detect_batch_size = 16
     # avatar.create_face_detection_results(avatar.video_full_frames,True)
-
+    current_status_placeholder.write("load face detection result")
     st.session_state.face_det_results_dict={}
     for option in options:
         with open(f'ref_videos/{option}_face_det_result.pkl', 'rb') as file:
@@ -49,8 +51,8 @@ if 'is_initialized' not in st.session_state:
     # with open(self.face_det_results_path_and_name, 'rb') as file:
     #     self.face_detect_img_results = pickle.load(file)
     input_text = "Hi How are you?"
-    st.session_state.avatar.text_to_lip_video(input_text)
-
+    st.session_state.avatar.text_to_lip_video(input_text,init_progress_bar)
+    current_status_placeholder.write("load face detection result done")
 
     st.session_state['is_initialized'] = True
 
@@ -64,6 +66,7 @@ img_col1, img_col2 = st.columns([1,1])
 with img_col1:
     st.image(images[options.index(selected_option)])
 
+
 if st.session_state.selected_option != selected_option:
     print("The selected option has changed!")
     st.session_state.selected_option = selected_option
@@ -75,12 +78,13 @@ if st.session_state.selected_option != selected_option:
 from avatar import Avatar
 # Create a text input box and store the input in a variable
 user_input = st.text_input("Enter your text:")
+inference_progress_bar = st.progress(0)
 if user_input:
     st.session_state.avatar.dir_clean_up()
     # Display the entered text
     st.write("You entered:", user_input)
     st.session_state.avatar.export_video=True
-    st.session_state.avatar.text_to_lip_video(user_input)
+    st.session_state.avatar.text_to_lip_video(user_input,inference_progress_bar)
     col1, col2, col3 = st.columns([1, 4, 1])
 
     # with col1:
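The pattern in app.py is standard Streamlit: `st.empty()` reserves a placeholder whose content is overwritten in place, so each `current_status_placeholder.write(...)` replaces the previous status message rather than appending a new one, and `st.progress(0)` creates a bar that is advanced by calling `.progress()` with an int in 0-100 or a float in 0.0-1.0. A minimal, self-contained sketch of the same pattern (`slow_init` and its step count are illustrative stand-ins, not code from this repo):

```python
import time
import streamlit as st

status = st.empty()        # placeholder; each write() overwrites the last
progress = st.progress(0)  # accepts ints 0-100 or floats 0.0-1.0

def slow_init(bar):
    # Stand-in for the model-loading / face-detection setup steps.
    steps = 10
    for i in range(steps):
        time.sleep(0.1)                # simulate work
        bar.progress((i + 1) / steps)  # advance the bar passed in by the page

status.write("load model")
slow_init(progress)
status.write("load model finished")    # replaces the earlier status line
```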
avatar.py
CHANGED
@@ -244,9 +244,10 @@ class Avatar:
     # print ("face_detect_img_results[1][1] shape = " +str(self.face_detect_img_results[1][0].shape)) #this is cropped image
 
 
-    def datagen(self, full_frames, mels, face_detect_results):
+    def datagen(self, full_frames, mels, face_detect_results,progress_bar):
         img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
         print(len(full_frames))
+        progress_bar.progress(0)
         for i, m in enumerate(mels):
             idx = i%len(full_frames)
             frame_to_save = full_frames[idx].copy()
@@ -271,7 +272,8 @@ class Avatar:
                 #print(f"len(img_batch)>{self.datagen_batch_size} now len(img_batch)=" + str(len(img_batch)))
                 yield img_batch, mel_batch, frame_batch, coords_batch
                 img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
-
+            progress_percentage = (i + 1) / len(mels)
+            progress_bar.progress(progress_percentage)
         if len(img_batch) > 0:
             img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
 
@@ -474,7 +476,7 @@ class Avatar:
         return out_frame_chunks_from_video,out_faces_from_detect_results,post_frame_chunks_from_video,post_faces_from_detect_results
 
 
-    def text_to_lip_video(self, input_text):
+    def text_to_lip_video(self, input_text,progress_bar):
 
         mel_chunks_from_audio, audio_segment =self.create_mel_from_audio(input_text)
         print(str(len(self.face_detect_img_results)))
@@ -482,7 +484,7 @@ class Avatar:
 
         video_full_frames_copy,face_detect_results=self.video_audio_adjust(mel_chunks_from_audio,video_full_frames_copy)
 
-        gen=self.datagen(video_full_frames_copy, mel_chunks_from_audio,face_detect_results)
+        gen=self.datagen(video_full_frames_copy, mel_chunks_from_audio,face_detect_results,progress_bar)
 
         if self.export_video:
             video_write_handle = cv2.VideoWriter(self.temp_lip_video_no_voice_path+self.temp_lip_video_no_voice_filename,
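Threading the `st.progress` handle through `text_to_lip_video` into `datagen` works, but it couples avatar.py to Streamlit. A looser alternative, sketched below under the assumption that the signatures are free to change (`datagen_with_progress` and `on_progress` are hypothetical names, not part of this repo), is to accept any callable and let the caller pass in the bar's bound `progress` method:

```python
from typing import Callable, Iterable, Optional

def datagen_with_progress(mels: Iterable,
                          on_progress: Optional[Callable[[float], None]] = None):
    # Stand-in for the real batch assembly: yields one item per mel chunk
    # and reports the fraction complete through the injected callback.
    mels = list(mels)
    for i, m in enumerate(mels):
        yield m
        if on_progress is not None:
            on_progress((i + 1) / len(mels))  # value in (0.0, 1.0]

# In app.py the call site could then pass the bar's bound method directly,
# e.g. on_progress=inference_progress_bar.progress, which keeps avatar.py
# importable and testable without a running Streamlit session.
for _ in datagen_with_progress(range(5), on_progress=print):
    pass
```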