status placeholder and progress bars added
app.py
CHANGED
@@ -16,12 +16,14 @@ from avatar import Avatar
 
 options = ['Aude', 'Kyla', 'Liv']
 images = ['ref_videos/Aude.png', 'ref_videos/Kyla.png', 'ref_videos/Liv.png']
+current_status_placeholder = st.empty()
+init_progress_bar = st.progress(0)
 if 'is_initialized' not in st.session_state:
     st.session_state.avatar = Avatar()
     st.session_state.avatar.export_video = False
-
+    current_status_placeholder.write("load model")
     st.session_state.avatar.load_model("checkpoint/wav2lip_gan.pth")
-
+    current_status_placeholder.write("load model finished")
     st.session_state.avatar.device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(st.session_state.avatar.device)
     st.session_state.avatar.output_audio_path = "audio/"
@@ -36,7 +38,7 @@ if 'is_initialized' not in st.session_state:
     st.session_state.avatar.get_video_full_frames(st.session_state.avatar.ref_video_path_and_filename)
     st.session_state.avatar.face_detect_batch_size = 16
     # avatar.create_face_detection_results(avatar.video_full_frames,True)
-
+    current_status_placeholder.write("load face detection result")
     st.session_state.face_det_results_dict={}
     for option in options:
         with open(f'ref_videos/{option}_face_det_result.pkl', 'rb') as file:
@@ -49,8 +51,8 @@ if 'is_initialized' not in st.session_state:
     # with open(self.face_det_results_path_and_name, 'rb') as file:
     #     self.face_detect_img_results = pickle.load(file)
     input_text = "Hi How are you?"
-    st.session_state.avatar.text_to_lip_video(input_text)
-
+    st.session_state.avatar.text_to_lip_video(input_text,init_progress_bar)
+    current_status_placeholder.write("load face detection result done")
 
     st.session_state['is_initialized'] = True
 
@@ -64,6 +66,7 @@ img_col1, img_col2 = st.columns([1,1])
 with img_col1:
     st.image(images[options.index(selected_option)])
 
+
 if st.session_state.selected_option != selected_option:
     print("The selected option has changed!")
     st.session_state.selected_option = selected_option
@@ -75,12 +78,13 @@ if st.session_state.selected_option != selected_option:
 from avatar import Avatar
 # Create a text input box and store the input in a variable
 user_input = st.text_input("Enter your text:")
+inference_progress_bar = st.progress(0)
 if user_input:
     st.session_state.avatar.dir_clean_up()
     # Display the entered text
     st.write("You entered:", user_input)
     st.session_state.avatar.export_video=True
-    st.session_state.avatar.text_to_lip_video(user_input)
+    st.session_state.avatar.text_to_lip_video(user_input,inference_progress_bar)
     col1, col2, col3 = st.columns([1, 4, 1])
 
     # with col1:
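The pattern in app.py is standard Streamlit: `st.empty()` reserves a placeholder whose content is overwritten in place, so each `current_status_placeholder.write(...)` replaces the previous status message rather than appending a new one, and `st.progress(0)` creates a bar that is advanced by calling `.progress()` with an int in 0-100 or a float in 0.0-1.0. A minimal, self-contained sketch of the same pattern (`slow_init` and its step count are illustrative stand-ins, not code from this repo):

```python
import time
import streamlit as st

status = st.empty()        # placeholder; each write() overwrites the last
progress = st.progress(0)  # accepts ints 0-100 or floats 0.0-1.0

def slow_init(bar):
    # Stand-in for the model-loading / face-detection setup steps.
    steps = 10
    for i in range(steps):
        time.sleep(0.1)                # simulate work
        bar.progress((i + 1) / steps)  # advance the bar passed in by the page

status.write("load model")
slow_init(progress)
status.write("load model finished")    # replaces the earlier status line
```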
avatar.py
CHANGED
@@ -244,9 +244,10 @@ class Avatar:
     # print ("face_detect_img_results[1][1] shape = " +str(self.face_detect_img_results[1][0].shape)) #this is cropped image
 
 
-    def datagen(self, full_frames, mels, face_detect_results):
+    def datagen(self, full_frames, mels, face_detect_results,progress_bar):
         img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
         print(len(full_frames))
+        progress_bar.progress(0)
         for i, m in enumerate(mels):
             idx = i%len(full_frames)
             frame_to_save = full_frames[idx].copy()
@@ -271,7 +272,8 @@ class Avatar:
                 #print(f"len(img_batch)>{self.datagen_batch_size} now len(img_batch)=" + str(len(img_batch)))
                 yield img_batch, mel_batch, frame_batch, coords_batch
                 img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
-
+            progress_percentage = (i + 1) / len(mels)
+            progress_bar.progress(progress_percentage)
         if len(img_batch) > 0:
             img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
 
@@ -474,7 +476,7 @@ class Avatar:
         return out_frame_chunks_from_video,out_faces_from_detect_results,post_frame_chunks_from_video,post_faces_from_detect_results
 
 
-    def text_to_lip_video(self, input_text):
+    def text_to_lip_video(self, input_text,progress_bar):
 
         mel_chunks_from_audio, audio_segment =self.create_mel_from_audio(input_text)
         print(str(len(self.face_detect_img_results)))
@@ -482,7 +484,7 @@ class Avatar:
 
         video_full_frames_copy,face_detect_results=self.video_audio_adjust(mel_chunks_from_audio,video_full_frames_copy)
 
-        gen=self.datagen(video_full_frames_copy, mel_chunks_from_audio,face_detect_results)
+        gen=self.datagen(video_full_frames_copy, mel_chunks_from_audio,face_detect_results,progress_bar)
 
         if self.export_video:
             video_write_handle = cv2.VideoWriter(self.temp_lip_video_no_voice_path+self.temp_lip_video_no_voice_filename,
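Threading the `st.progress` handle through `text_to_lip_video` into `datagen` works, but it couples avatar.py to Streamlit. A looser alternative, sketched below under the assumption that the signatures are free to change (`datagen_with_progress` and `on_progress` are hypothetical names, not part of this repo), is to accept any callable and let the caller pass in the bar's bound `progress` method:

```python
from typing import Callable, Iterable, Optional

def datagen_with_progress(mels: Iterable,
                          on_progress: Optional[Callable[[float], None]] = None):
    # Stand-in for the real batch assembly: yields one item per mel chunk
    # and reports the fraction complete through the injected callback.
    mels = list(mels)
    for i, m in enumerate(mels):
        yield m
        if on_progress is not None:
            on_progress((i + 1) / len(mels))  # value in (0.0, 1.0]

# In app.py the call site could then pass the bar's bound method directly,
# e.g. on_progress=inference_progress_bar.progress, which keeps avatar.py
# importable and testable without a running Streamlit session.
for _ in datagen_with_progress(range(5), on_progress=print):
    pass
```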