zmbfeng committed
Commit d35ea54 · 1 Parent(s): 75c8c65

before load ref video and face detection result

Files changed (5)
  1. app.py +31 -2
  2. avatar.py +559 -0
  3. requirements.txt +21 -0
  4. results/readme.txt +1 -0
  5. temp/readme.txt +1 -0
app.py CHANGED
@@ -1,4 +1,33 @@
  import streamlit as st
+ import os
+ import sys
+ import torch
+ path_to_add = os.path.join(os.path.dirname(__file__), "Wav2Lip")
+ if path_to_add not in sys.path:
+     sys.path.insert(0, path_to_add)
+ from avatar import Avatar
 
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+
+ if 'is_initialized' not in st.session_state:
+     st.session_state.avatar = Avatar()
+     st.session_state.avatar.export_video = False
+     st.session_state.avatar.load_model("checkpoint/wav2lip_gan.pth")
+     print("load model finished")
+     st.session_state.avatar.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     print(st.session_state.avatar.device)
+     st.session_state.avatar.output_audio_path = "audio/"
+     st.session_state.avatar.output_audio_filename = "result.wav"
+     st.session_state.avatar.temp_lip_video_no_voice_path = "temp/"
+     st.session_state.avatar.temp_lip_video_no_voice_filename = "result.avi"
+     st.session_state.avatar.output_video_path = "results/"
+     st.session_state.avatar.output_video_name = "result_voice.mp4"
+
+     st.session_state['is_initialized'] = True
+
+
+ from avatar import Avatar
+ # Create a text input box and store the input in a variable
+ user_input = st.text_input("Enter your text:")
+ st.session_state.avatar.dir_clean_up()
+ # Display the entered text
+ st.write("You entered:", user_input)
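Editor's note: the `is_initialized` guard in the new app.py is the usual Streamlit pattern for one-time setup. Streamlit re-runs the whole script on every widget interaction, so heavy work such as constructing `Avatar` and calling `load_model` should run once and be cached in `st.session_state`. A minimal sketch of the same pattern in isolation (the `load_heavy_resource` helper is a hypothetical stand-in, not part of this commit):

import streamlit as st

def load_heavy_resource():
    # hypothetical stand-in for Avatar() + load_model("checkpoint/wav2lip_gan.pth")
    return object()

if 'is_initialized' not in st.session_state:
    # runs only on the first execution of a browser session;
    # subsequent re-runs skip straight to the UI code below
    st.session_state.resource = load_heavy_resource()
    st.session_state['is_initialized'] = True

user_input = st.text_input("Enter your text:")
st.write("You entered:", user_input)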
avatar.py ADDED
@@ -0,0 +1,559 @@
+ from gtts import gTTS
+ import cv2
+ from pydub import AudioSegment
+ from tqdm import tqdm
+ import numpy as np
+ import pickle
+ import time
+ import subprocess, platform
+ import os
+ import torch
+ import io
+ import soundfile as sf
+
+ from models import Wav2Lip
+ import face_detection
+ import audio
+
+
+ class Avatar:
+     image_frame_num_current = 0
+     image_frame_num_goal = 0
+     wav2lip_gan_model = []  # should be a model
+     video_full_frames = []
+     images_and_audio_list = []
+     images_list = []
+     mel_step_size = 16
+     output_audio_path = ""
+     output_audio_filename = ""
+     temp_lip_video_no_voice_path = ""
+     temp_lip_video_no_voice_filename = ""
+     input_audio_path = ""
+     input_video_path = ""
+     output_video_path = ""
+     output_video_name = ""
+     lip_video_no_voice_path = ""
+     split_current_file_name = ""
+     fps = 30.0
+     face_detect_img_results = []
+     device = ""
+     face_detect_batch_size = 16
+     face_det_results_path_and_name = ""
+     datagen_batch_size = 512
+     frame_count = 0
+     video_width = 0
+     video_height = 0
+     export_video = False
+
+     def __init__(self):
+         print("Avatar init")
+
+     def _load(self, checkpoint_path):
+         device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         if device == 'cuda':
+             checkpoint = torch.load(checkpoint_path)
+         else:
+             checkpoint = torch.load(checkpoint_path,
+                                     map_location=lambda storage, loc: storage)
+         return checkpoint
+
+     def load_model(self, path):
+         device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         model = Wav2Lip()
+         print("Load checkpoint from: {}".format(path))
+         checkpoint = self._load(path)
+         s = checkpoint["state_dict"]
+         new_s = {}
+         for k, v in s.items():
+             new_s[k.replace('module.', '')] = v
+         model.load_state_dict(new_s)
+
+         model = model.to(device)
+         self.wav2lip_gan_model = model.eval()
+         # return model.eval()
+
+     def get_video_full_frames(self, video_path):
+         video_stream = cv2.VideoCapture(video_path)
+
+         self.fps = video_stream.get(cv2.CAP_PROP_FPS)
+         self.frame_count = video_stream.get(cv2.CAP_PROP_FRAME_COUNT)
+         self.video_width = video_stream.get(cv2.CAP_PROP_FRAME_WIDTH)
+         self.video_height = video_stream.get(cv2.CAP_PROP_FRAME_HEIGHT)
+         print("fps=" + str(self.fps))
+         print('Reading video frames...')
+
+         self.video_full_frames = []
+
+         is_first_frame = True
+
+         while 1:
+             still_reading, frame = video_stream.read()
+             if not still_reading:
+                 video_stream.release()
+                 break
+             self.video_full_frames.append(frame)
+         # if is_first_frame:
+         #     first_frame_shape = frame.shape
+         #     first_frame = frame
+         #     is_first_frame = False
+         #     # [] is a list
+         #     # a full frame list is a video, so 4 dimensions
+         #     print("CV2 frame_count=" + str(self.frame_count))
+         #     print("CV2 video_width=" + str(self.video_width))
+         #     print("CV2 video_height=" + str(self.video_height))
+         #     print("first frame shape=" + str(first_frame_shape))
+         #
+         #     print("Number of frames available for inference: " + str(len(self.video_full_frames)))
+         #     print("frame element=" + str(first_frame[100][100][0]))
+         #     print("frame element type=" + str(type(first_frame[100][100][0])))
+         #     # the value range for numpy.uint8 is from 0 to 255
+         #     print("len(full_frames)" + str(len(self.video_full_frames)))
+
+
+     def create_mel_from_audio(self, input_text):
+         tts = gTTS(text=input_text, lang="en")
+         tts.save(f"{self.output_audio_path}input_audio.mp3")
+         sound = AudioSegment.from_mp3(f"{self.output_audio_path}input_audio.mp3")
+
+         # Get the duration in seconds
+         sound_duration = sound.duration_seconds
+
+         sound.export(f"{self.output_audio_path}temp_{self.output_audio_filename}", format="wav")
+
+         wav = audio.load_wav(f"{self.output_audio_path}temp_{self.output_audio_filename}", 16000)
+
+         mel = audio.melspectrogram(wav)
+         # print(mel.shape)
+         # e.g. (80, 97): the mel spectrogram has 80 mel frequency bands and 97 time frames.
+         # Mel frequency bands do overlap; a longer wav gives more time frames, e.g. (80, 344).
+         # mel is a numpy.ndarray and each element is numpy.float64, so values can be negative.
+         # print("mel data type =" + str(type(mel)))
+         # print("mel element type =" + str(type(mel[1][2])))
+         # print("len(mel[0])=" + str(len(mel[0])))
+
+         mel_chunks = []
+         mel_idx_multiplier = 80. / self.fps
+         # the mel spectrogram has roughly 80 time frames per second of audio,
+         # so 80/fps maps a video-frame index to the matching mel time-frame index
+         # (about 2.66 mel frames per video frame at 30 fps)
+         # print("mel_idx_multiplier=" + str(mel_idx_multiplier))
+         i = 0
+         while 1:
+             start_idx = int(i * mel_idx_multiplier)
+             # len(mel[0]) is the number of time frames of the audio
+             if start_idx + self.mel_step_size > len(mel[0]):
+                 mel_chunks.append(mel[:, len(mel[0]) - self.mel_step_size:])
+                 break
+             mel_chunks.append(mel[:, start_idx : start_idx + self.mel_step_size])
+             i += 1
+         # for b_index, b_item in enumerate(reversed(mel_chunks)):
+         #     print(str(b_index) + " " + str(np.average(b_item)))
+
+         # walk backwards over the mel chunks to find where the trailing silence ends
+         for index, item in enumerate(reversed(mel_chunks)):
+             # print(str(index) + " " + str(np.average(item)))
+             if np.average(item) > -4.0:
+                 break
+         print("stop at " + str(index))
+         num_frames_to_trim = index - 1
+         mel_chunks = mel_chunks[:-num_frames_to_trim]
+         print("wav length={} duration={} num_frames_to_trim={} result={}".format(len(wav), sound_duration, num_frames_to_trim, str(16000 * num_frames_to_trim // 30)))
+         wav = wav[:-(16000 * num_frames_to_trim // 30)]
+         sf.write(f"{self.output_audio_path}{self.output_audio_filename}", wav, 16000)
+         sound_file = io.BytesIO(open(f"{self.output_audio_path}{self.output_audio_filename}", "rb").read())
+
+         # Load the wav file as an AudioSegment object
+         audio_segment_sound = AudioSegment.from_wav(sound_file)
+         return mel_chunks, audio_segment_sound
+
+     def get_smoothened_boxes(self, boxes, T):
+         for i in range(len(boxes)):
+             if i + T > len(boxes):
+                 window = boxes[len(boxes) - T:]
+             else:
+                 window = boxes[i : i + T]
+             boxes[i] = np.mean(window, axis=0)
+         return boxes
+
+
+     def create_face_detection_results(self, full_frames, save_result=True):
+         detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
+                                                 flip_input=False, device=self.device)
+         images = full_frames
+         while 1:
+             predictions = []
+             try:
+                 for i in tqdm(range(0, len(images), self.face_detect_batch_size)):
+                     predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + self.face_detect_batch_size])))
+             except RuntimeError:
+                 if self.face_detect_batch_size == 1:
+                     raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument')
+                 self.face_detect_batch_size //= 2
+                 print('Recovering from OOM error; New batch size: {}'.format(self.face_detect_batch_size))
+                 continue
+             break
+
+         face_detect_results = []
+         pady1, pady2, padx1, padx2 = [0, 10, 0, 0]
+         for rect, image in zip(predictions, images):
+             if rect is None:
+                 cv2.imwrite('temp_faulty_frame.jpg', image)  # check this frame where the face was not detected
+                 raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')
+
+             y1 = max(0, rect[1] - pady1)
+             y2 = min(image.shape[0], rect[3] + pady2)
+             x1 = max(0, rect[0] - padx1)
+             x2 = min(image.shape[1], rect[2] + padx2)
+
+             face_detect_results.append([x1, y1, x2, y2])
+         # print("\n")
+         # print("face_detect_results length = " + str(len(face_detect_results)))
+         # print("face_detect_results[2]=" + str(face_detect_results[2]))
+
+         boxes = np.array(face_detect_results)
+         boxes = self.get_smoothened_boxes(boxes, T=5)
+         # print("boxes number of dim=" + str(boxes.ndim))
+         # print("boxes shape=" + str(boxes.shape))
+
+         self.face_detect_img_results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]
+         # print("face_detect_img_results type =" + str(type(self.face_detect_img_results)))
+         # print("face_detect_img_results length =" + str(len(self.face_detect_img_results)))
+         # print("face_detect_img_results[1] type =" + str(type(self.face_detect_img_results[1])))
+         # print("face_detect_img_results[1] length =" + str(len(self.face_detect_img_results[1])))
+         # print("face_detect_img_results[1][1] = " + str(self.face_detect_img_results[1][1]))  # this is the box
+         # print("face_detect_img_results[1][0] shape = " + str(self.face_detect_img_results[1][0].shape))  # this is the cropped image
+         if save_result:
+             with open(self.face_det_results_path_and_name, 'wb') as file:
+                 pickle.dump(self.face_detect_img_results, file)
+
+
+     def load_face_detection_results(self):
+         with open(self.face_det_results_path_and_name, 'rb') as file:
+             self.face_detect_img_results = pickle.load(file)
+         # print("face_detect_img_results type =" + str(type(self.face_detect_img_results)))
+         # print("face_detect_img_results length =" + str(len(self.face_detect_img_results)))
+         # print("face_detect_img_results[1] type =" + str(type(self.face_detect_img_results[1])))
+         # print("face_detect_img_results[1] length =" + str(len(self.face_detect_img_results[1])))
+         # print("face_detect_img_results[1][1] = " + str(self.face_detect_img_results[1][1]))  # this is the box
+         # print("face_detect_img_results[1][0] shape = " + str(self.face_detect_img_results[1][0].shape))  # this is the cropped image
+
+
+     def datagen(self, full_frames, mels, face_detect_results):
+         img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+         print(len(full_frames))
+         for i, m in enumerate(mels):
+             idx = i % len(full_frames)
+             frame_to_save = full_frames[idx].copy()
+             face, coords = face_detect_results[idx].copy()
+             img_size = 96  # the Wav2Lip model is trained on 96x96 face crops
+             face = cv2.resize(face, (img_size, img_size))
+
+             img_batch.append(face)
+             mel_batch.append(m)
+             frame_batch.append(frame_to_save)
+             coords_batch.append(coords)
+
+             if len(img_batch) >= self.datagen_batch_size:
+                 img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
+
+                 img_masked = img_batch.copy()
+                 img_masked[:, img_size//2:] = 0
+
+                 img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
+                 mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
+                 # print(f"len(img_batch)>{self.datagen_batch_size} now len(img_batch)=" + str(len(img_batch)))
+                 yield img_batch, mel_batch, frame_batch, coords_batch
+                 img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+
+         if len(img_batch) > 0:
+             img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
+
+             img_masked = img_batch.copy()
+             img_masked[:, img_size//2:] = 0
+
+             img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
+             mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
+             # print("len(img_batch)>0 now len(img_batch)=" + str(len(img_batch)))
+             yield img_batch, mel_batch, frame_batch, coords_batch
+         # datagen_result = datagen(video_full_frames_copy, 512, mel_chunks_from_audio, face_results)
+         # for img, mel, frame, coords in datagen_result:
+         #     gen_img = img.copy()
+         #     gen_mel = mel.copy()
+         #     gen_frame = frame.copy()
+         #     gen_coords = coords.copy()
+         #     print("gen image shape =" + str(gen_img.shape))
+         #     print("gen mel shape = " + str(gen_mel.shape))
+         #     print("gen_coords length = " + str(len(gen_coords)))
+         #     print("gen_coords[0] = " + str(gen_coords[0]))
+         #     print("gen_frame length =" + str(len(gen_frame)))
+         #     print("gen_frame[0] type =" + str(type(gen_frame[0])))
+         #     print("gen_frame[0] shape =" + str(gen_frame[0].shape))
+         #     print(str(gen_frame[0].shape))
+         # img_batch has shape (batch_size, 96, 96, 6): each detected face is cropped and resized to 96x96,
+         # a copy with the lower half zeroed out is made, and that masked copy is concatenated with the
+         # original crop along the channel dimension, giving the 6-channel input Wav2Lip expects.
+
+
+
+     def make_lip_video(self, datagen_result, video_write_out, mel_chunks, need_split, audio_sound):
+
+         for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(datagen_result,
+                                                                         total=int(np.ceil(float(len(mel_chunks)) / self.datagen_batch_size)))):
+             # print("\nin the for loop to unpack datagen_result, only run once")
+
+             img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(self.device)
+             mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(self.device)
+             inf_start_time = time.time()  # get the start time
+             with torch.no_grad():
+                 pred = self.wav2lip_gan_model(mel_batch, img_batch)
+
+             pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
+             # print("type of pred" + str(type(pred)))
+             # print("shape of pred" + str(pred.shape))
+             for p, f, c in zip(pred, frames, coords):
+                 y1, y2, x1, x2 = c
+                 p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))  # the face was scaled down to 96x96 before inference
+                 f[y1:y2, x1:x2] = p  # paste face back
+                 self.images_list.append(f)
+                 if need_split:
+                     self.image_frame_num_current = self.image_frame_num_current + 1
+                 if self.export_video:
+                     video_write_out.write(f)
+
+             if need_split:
+                 # print("GLOBAL_IMAGE_FRAME_NUM_CURRENT=" + str(self.image_frame_num_current))
+                 if self.image_frame_num_current >= self.image_frame_num_goal:
+                     self.images_and_audio_list.append([self.images_list, audio_sound])
+                     self.images_list = []
+                     # print("video_write_out release in need split")
+                     if self.export_video:
+                         video_write_out.release()
+             else:
+                 self.images_and_audio_list.append([self.images_list, audio_sound])
+                 self.images_list = []
+                 # print("video_write_out release")
+                 if self.export_video:
+                     video_write_out.release()
+         inf_end_time = time.time()  # get the end time
+         print(f"Inference time: {inf_end_time - inf_start_time} seconds")  # print the difference
+         # print("img_batch length=" + str(len(img_batch)))
+     def sync_video_audio(self, input_audio_path_and_name, input_video_path_and_name, output_video_path_and_name):
+         # ipdb.set_trace()
+         command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(input_audio_path_and_name, input_video_path_and_name,
+                                                                       output_video_path_and_name)
+         subprocess.call(command, shell=platform.system() != 'Windows')
+         # command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(input_audio_path, input_video_path, res[0] + '/temp' + res[1])
+         # subprocess.call(command, shell=platform.system() != 'Windows')
+         # command = 'ffmpeg -ss 00:00:00 -t {} -i {} -c copy {}'.format(str(round(num_of_frames/fps, 2)), res[0] + '/temp' + res[1], output_video_path)
+         # subprocess.call(command, shell=platform.system() != 'Windows')
+
+
+     def video_audio_adjust(self, mel_chunks_from_audio, frame_chunks_from_video):
+         out_frame_chunks_from_video = frame_chunks_from_video.copy()
+         out_faces_from_detect_results = self.face_detect_img_results.copy()
+         audio_duration = len(mel_chunks_from_audio)
+         video_duration = len(frame_chunks_from_video)
+         # print("mel length=" + str(audio_duration))
+         # print("frame length=" + str(video_duration))
+         if audio_duration != video_duration:
+             if audio_duration > video_duration:
+                 difference = audio_duration - video_duration
+                 # calculate how many times the video should be repeated
+                 times = difference / video_duration
+
+                 # if times has a fractional part, round up generously so the repeated video is long enough
+                 if times % 1 != 0:
+                     times += 2
+                 out_frame_chunks_from_video = out_frame_chunks_from_video * int(times)
+                 out_faces_from_detect_results = out_faces_from_detect_results * int(times)
+                 new_video_duration = len(out_frame_chunks_from_video)
+                 # print("extending video frames and face detect")
+                 # print("new frame length=" + str(new_video_duration))
+                 if new_video_duration > audio_duration:
+                     out_frame_chunks_from_video = out_frame_chunks_from_video[:audio_duration]
+                     out_faces_from_detect_results = out_faces_from_detect_results[:audio_duration]
+                     new_video_duration = len(out_frame_chunks_from_video)
+                     # print("new frame length=" + str(new_video_duration))
+             else:
+                 # print("truncate video frames and face detect")
+                 out_frame_chunks_from_video = out_frame_chunks_from_video[:audio_duration]
+                 out_faces_from_detect_results = out_faces_from_detect_results[:audio_duration]
+         return out_frame_chunks_from_video, out_faces_from_detect_results
+
+     def video_audio_adjust_parallel(self, mel_chunks_from_audio, frame_chunks_from_video,
+                                     pre_frame_chunks_from_video, pre_faces_from_detect_results):
+
+         out_frame_chunks_from_video = frame_chunks_from_video.copy()
+         out_faces_from_detect_results = self.face_detect_img_results.copy()
+         audio_duration = len(mel_chunks_from_audio)
+         video_duration = len(frame_chunks_from_video)
+
+         post_frame_chunks_from_video = []
+         post_faces_from_detect_results = []
+         pre_video_duration = len(pre_frame_chunks_from_video)
+
+         # print("video_audio_adjust_parallel mel length=" + str(audio_duration))
+         # print("video_audio_adjust_parallel frame length=" + str(video_duration))
+         # print("video_audio_adjust_parallel pre frame length=" + str(pre_video_duration))
+         if (audio_duration - pre_video_duration) != video_duration:
+             if (audio_duration - pre_video_duration) > video_duration:
+                 # print("in Case 1")
+                 difference = (audio_duration - pre_video_duration) - video_duration
+                 # calculate how many times the video should be repeated
+                 times = difference / video_duration
+
+                 # if times has a fractional part, round up generously so the repeated video is long enough
+                 if times % 1 != 0:
+                     times += 2
+                 # print("video_audio_adjust_parallel times=" + str(times))
+                 out_frame_chunks_from_video = out_frame_chunks_from_video * int(times)
+                 # print("video_audio_adjust_parallel video length after multiplying with times =" + str(len(out_frame_chunks_from_video)))
+                 # if len(pre_frame_chunks_from_video) > 0:
+                 #     cv2.imwrite('/content/pre_video_first_' + DEBUG_GLOBAL_CURRENT_FILE_NAME, pre_frame_chunks_from_video[0])
+                 #     cv2.imwrite('/content/pre_video_last_' + DEBUG_GLOBAL_CURRENT_FILE_NAME, pre_frame_chunks_from_video[-1])
+                 # cv2.imwrite('/content/video_first_' + DEBUG_GLOBAL_CURRENT_FILE_NAME, out_frame_chunks_from_video[0])
+                 out_frame_chunks_from_video = pre_frame_chunks_from_video + out_frame_chunks_from_video
+                 out_faces_from_detect_results = out_faces_from_detect_results * int(times)
+                 out_faces_from_detect_results = pre_faces_from_detect_results + out_faces_from_detect_results
+                 new_video_duration = len(out_frame_chunks_from_video)
+                 # print("extending video frames and face detect")
+                 # print("new frame length=" + str(new_video_duration))
+                 if new_video_duration > audio_duration:
+                     # print("in Case 1a")
+                     c = np.absolute(out_frame_chunks_from_video[audio_duration - 1] - out_frame_chunks_from_video[audio_duration])  # or c = a - b
+                     # print("video_audio_adjust_parallel difference at cut off is " + str(np.mean(c)))
+
+                     out_frame_chunks_from_video_copy = out_frame_chunks_from_video.copy()
+                     out_frame_chunks_from_video = out_frame_chunks_from_video[:audio_duration]
+                     post_frame_chunks_from_video = out_frame_chunks_from_video_copy[audio_duration:]
+                     out_faces_from_detect_results_copy = out_faces_from_detect_results.copy()
+                     out_faces_from_detect_results = out_faces_from_detect_results[:audio_duration]
+                     post_faces_from_detect_results = out_faces_from_detect_results_copy[audio_duration:]
+
+                 # else:
+                 #     print("unhandled case 1")
+                 # new_video_duration = len(out_frame_chunks_from_video)
+                 # print("new frame length=" + str(new_video_duration))
+             else:
+                 # print("in Case 2")
+                 # print("truncate video frames and face detect")
+
+                 # print("video_audio_adjust_parallel video length pre_frame_chunks_from_video =" + str(len(pre_frame_chunks_from_video)))
+                 # print("video_audio_adjust_parallel video length out_frame_chunks_from_video =" + str(len(out_frame_chunks_from_video)))
+
+                 out_frame_chunks_from_video = pre_frame_chunks_from_video + out_frame_chunks_from_video
+                 c = np.absolute(out_frame_chunks_from_video[audio_duration - 1] - out_frame_chunks_from_video[audio_duration])  # or c = a - b
+                 # print("video_audio_adjust_parallel difference at cut off is " + str(np.mean(c)))
+                 out_faces_from_detect_results = pre_faces_from_detect_results + out_faces_from_detect_results
+                 out_frame_chunks_from_video_copy = out_frame_chunks_from_video.copy()
+                 out_frame_chunks_from_video = out_frame_chunks_from_video[:audio_duration]
+                 post_frame_chunks_from_video = out_frame_chunks_from_video_copy[audio_duration:]
+
+                 out_faces_from_detect_results_copy = out_faces_from_detect_results.copy()
+                 out_faces_from_detect_results = out_faces_from_detect_results[:audio_duration]
+                 post_faces_from_detect_results = out_faces_from_detect_results_copy[audio_duration:]
+                 # cv2.imwrite('/content/video_last_' + DEBUG_GLOBAL_CURRENT_FILE_NAME, out_frame_chunks_from_video[-1])
+                 # if len(post_frame_chunks_from_video) > 0:
+                 #     cv2.imwrite('/content/post_video_first_' + DEBUG_GLOBAL_CURRENT_FILE_NAME, post_frame_chunks_from_video[0])
+                 #     cv2.imwrite('/content/post_video_last_' + DEBUG_GLOBAL_CURRENT_FILE_NAME, post_frame_chunks_from_video[-1])
+         # else:
+         #     print("unhandled case 2")
+         return out_frame_chunks_from_video, out_faces_from_detect_results, post_frame_chunks_from_video, post_faces_from_detect_results
+
+
+     def text_to_lip_video(self, input_text):
+
+         mel_chunks_from_audio, audio_segment = self.create_mel_from_audio(input_text)
+         print(str(len(self.face_detect_img_results)))
+         video_full_frames_copy = self.video_full_frames.copy()
+
+         video_full_frames_copy, face_detect_results = self.video_audio_adjust(mel_chunks_from_audio, video_full_frames_copy)
+
+         gen = self.datagen(video_full_frames_copy, mel_chunks_from_audio, face_detect_results)
+
+         if self.export_video:
+             video_write_handle = cv2.VideoWriter(self.temp_lip_video_no_voice_path + self.temp_lip_video_no_voice_filename,
+                                                  cv2.VideoWriter_fourcc(*'DIVX'), self.fps,
+                                                  (self.video_full_frames[0].shape[0], self.video_full_frames[0].shape[1]))
+         else:
+             video_write_handle = 0
+
+         self.make_lip_video(gen, video_write_handle, mel_chunks_from_audio, len(mel_chunks_from_audio) > self.datagen_batch_size,
+                             audio_segment)
+         if self.export_video:
+             self.sync_video_audio(self.output_audio_path + self.output_audio_filename,
+                                   self.temp_lip_video_no_voice_path + self.temp_lip_video_no_voice_filename,
+                                   self.output_video_path + self.output_video_name
+                                   )
+         self.image_frame_num_current = 0
+
+     def text_to_lip_video_parallel(self, input_text, pre_base_video_frames, pre_face_detect_results):
+
+         mel_chunks_from_audio, audio_segment = self.create_mel_from_audio(input_text)
+         print(str(len(self.face_detect_img_results)))
+         video_full_frames_copy = self.video_full_frames.copy()
+
+         video_full_frames_copy, face_detect_results, post_base_video_frames, post_face_detect_results = (
+             self.video_audio_adjust_parallel(mel_chunks_from_audio, video_full_frames_copy,
+                                              pre_base_video_frames, pre_face_detect_results))
+
+         gen = self.datagen(video_full_frames_copy, mel_chunks_from_audio, face_detect_results)
+
+         if self.export_video:
+             video_write_handle = cv2.VideoWriter(self.temp_lip_video_no_voice_path + self.temp_lip_video_no_voice_filename,
+                                                  cv2.VideoWriter_fourcc(*'DIVX'), self.fps,
+                                                  (self.video_full_frames[0].shape[0], self.video_full_frames[0].shape[1]))
+         else:
+             video_write_handle = 0
+
+         self.make_lip_video(gen, video_write_handle, mel_chunks_from_audio, len(mel_chunks_from_audio) > self.datagen_batch_size, audio_segment)
+         if self.export_video:
+             self.sync_video_audio(self.output_audio_path + self.output_audio_filename,
+                                   self.temp_lip_video_no_voice_path + self.temp_lip_video_no_voice_filename,
+                                   self.output_video_path + self.split_current_file_name)
+         self.image_frame_num_current = 0
+
+         return post_base_video_frames, post_face_detect_results
+     def delete_files_in_path(self, dir_path):
+         # Get the list of files in the directory
+         files = os.listdir(dir_path)
+
+         # Check if the list is not empty
+         if files:
+             # Loop through the files
+             for file in files:
+                 # Join the file name with the directory path
+                 file_path = os.path.join(dir_path, file)
+                 # Check if the file is a regular file (not a directory or a link)
+                 if os.path.isfile(file_path):
+                     # Delete the file
+                     os.remove(file_path)
+
+     def dir_clean_up(self):
+         if os.path.isdir(self.output_audio_path):
+             self.delete_files_in_path(self.output_audio_path)
+         else:
+             os.mkdir(self.output_audio_path)
+
+         if self.export_video:
+             if os.path.isdir(self.temp_lip_video_no_voice_path):
+                 self.delete_files_in_path(self.temp_lip_video_no_voice_path)
+             else:
+                 os.mkdir(self.temp_lip_video_no_voice_path)
+
+             if os.path.isdir(self.output_video_path):
+                 self.delete_files_in_path(self.output_video_path)
+             else:
+                 os.mkdir(self.output_video_path)
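Editor's note: a rough usage sketch for the Avatar class added above, using only methods defined in this file. The reference-video path and the face-detection pickle path are placeholders; per the commit message, loading the reference video and the saved face-detection result is not wired into app.py yet.

import torch
from avatar import Avatar

avatar = Avatar()
avatar.device = 'cuda' if torch.cuda.is_available() else 'cpu'
avatar.export_video = True
avatar.load_model("checkpoint/wav2lip_gan.pth")

avatar.output_audio_path = "audio/"
avatar.output_audio_filename = "result.wav"
avatar.temp_lip_video_no_voice_path = "temp/"
avatar.temp_lip_video_no_voice_filename = "result.avi"
avatar.output_video_path = "results/"
avatar.output_video_name = "result_voice.mp4"
avatar.face_det_results_path_and_name = "face_det_results.pkl"    # placeholder path
avatar.dir_clean_up()

avatar.get_video_full_frames("ref_video.mp4")                     # placeholder reference video
avatar.create_face_detection_results(avatar.video_full_frames)    # or avatar.load_face_detection_results()
avatar.text_to_lip_video("Hello, I am your avatar.")              # gTTS -> mel chunks -> lip-synced frames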
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ librosa==0.8.0
+ transformers==4.22.2
+ ffmpeg-python
+ SpeechRecognition
+ pydub
+ gTTS
+ numpy
+ opencv-contrib-python
+ opencv-python
+ torchvision
+ tqdm
+ numba
+ basicsr>=1.4.2
+ facexlib>=0.2.5
+ lmdb
+ pyyaml
+ scipy
+ tb-nightly
+ torch>=1.7
+ yapf
+ realesrgan
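Editor's note: these pins cover what avatar.py and the bundled Wav2Lip code pull in at run time (librosa and numba presumably for Wav2Lip's audio module, OpenCV for frame handling, gTTS/pydub/soundfile for speech synthesis and trimming); basicsr, facexlib, and realesrgan are not imported by avatar.py in this commit. The ffmpeg binary used by sync_video_audio has to be available on the system separately. Assuming a standard pip environment, setup is the usual:

pip install -r requirements.txt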
results/readme.txt ADDED
@@ -0,0 +1 @@
+ results folder
temp/readme.txt ADDED
@@ -0,0 +1 @@
+ temp folder