import os
from multiprocessing.pool import ThreadPool

import cv2
import decord
import numpy as np
from moviepy.editor import AudioFileClip, VideoFileClip
from tqdm import tqdm

from src.models.dwpose.dwpose_detector import dwpose_detector as dwprocessor
from src.models.dwpose.util import draw_pose
from src.utils.img_utils import save_video_from_cv2_list

##################################
base_dir = "root"
tasks = ["emtd"]

process_num = 800 #1266

start = 0
end = process_num + start
#################################
MAX_SIZE = 768
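# NOTE (assumption about the expected layout): `base_dir` is joined to each task
# name by plain string concatenation, so it should end with '/'. Output locations
# are derived by replacing the substring 'original_videos' in each input path, so
# the raw clips are expected to live under an 'original_videos' directory tree,
# e.g. base_dir = "<root>/original_videos/" with clips in "<root>/original_videos/emtd/".
# `start`/`end` select a shard of the file list so several workers can split the data.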


def convert_fps(src_path, tgt_path, tgt_fps=24, tgt_sr=16000):
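    """Re-encode the clip at `src_path` at `tgt_fps` (and its audio at `tgt_sr` Hz)
    and write it to `tgt_path` as H.264 video with AAC audio."""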
    clip = VideoFileClip(src_path)
    new_clip = clip.set_fps(tgt_fps)
    # resample the audio track to tgt_sr; skip clips that have no audio
    if tgt_sr is not None and new_clip.audio is not None:
        audio = new_clip.audio.set_fps(tgt_sr)
        new_clip = new_clip.set_audio(audio)
    
    new_clip.write_videofile(tgt_path, codec='libx264', audio_codec='aac')
    
def get_video_pose(video_path: str, sample_stride: int = 1, max_frame=None):
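    """Run DWPose on the video at `video_path`.

    Frames are sampled with `sample_stride`, which is scaled up so that videos
    above 24 fps are effectively read at ~24 fps; `max_frame` optionally limits
    the number of frames. Returns (detected_poses, height, width, frames).
    """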

    # read input video
    vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
    sample_stride *= max(1, int(vr.get_avg_fps() / 24))

    frames = vr.get_batch(list(range(0, len(vr), sample_stride))).asnumpy()
    if max_frame is not None:
        frames = frames[0:max_frame,:,:]
    height, width, _ = frames[0].shape
    # detected_poses = [dwprocessor(frm) for frm in tqdm(frames, desc="DWPose")]
    detected_poses = [dwprocessor(frm) for frm in frames]
    dwprocessor.release_memory()

    return detected_poses, height, width, frames
    
def resize_and_pad(img, max_size):
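    """Resize `img` to fit inside a `max_size` x `max_size` square while keeping its
    aspect ratio, and pad the remaining area with black (letterboxing)."""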
    img_new = np.zeros((max_size, max_size, 3)).astype('uint8')
    imh, imw = img.shape[0], img.shape[1]
    half = max_size // 2
    if imh > imw:
        imh_new = max_size
        imw_new = int(round(imw/imh * imh_new))
        half_w = imw_new // 2
        rb, re = 0, max_size
        cb = half-half_w
        ce = cb + imw_new
    else:
        imw_new = max_size
        imh_new = int(round(imh/imw * imw_new))
        half_h = imh_new // 2
        cb, ce = 0, max_size
        rb = half-half_h
        re = rb + imh_new

    img_resize = cv2.resize(img, (imw_new, imh_new))
    img_new[rb:re,cb:ce,:] = img_resize
    return img_new

def resize_and_pad_param(imh, imw, max_size):
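    """Compute the letterbox geometry used by `resize_and_pad` without touching pixels:
    returns (imh_new, imw_new, rb, re, cb, ce), i.e. the resized shape and the
    row/column range it occupies inside the `max_size` square canvas."""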
    half = max_size // 2
    if imh > imw:
        imh_new = max_size
        imw_new = int(round(imw/imh * imh_new))
        half_w = imw_new // 2
        rb, re = 0, max_size
        cb = half-half_w
        ce = cb + imw_new
    else:
        imw_new = max_size
        imh_new = int(round(imh/imw * imw_new))
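        # the override below forces a square output; in this pipeline the incoming
        # crop is already square (width_new == height_new), so it matches the
        # aspect-preserving value computed above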
        imh_new = max_size

        half_h = imh_new // 2
        cb, ce = 0, max_size
        rb = half-half_h
        re = rb + imh_new
        
    return imh_new, imw_new, rb, re, cb, ce

def get_pose_params(detected_poses, max_size, height, width):
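    """Derive a square, upper-body-centred crop that covers the subject in all frames.

    `height`/`width` are the source frame dimensions in pixels. Returns a dict with
    'draw_pose_params' (letterbox geometry for drawing), 'pose_params' (normalized
    crop bounds w_min, w_max, h_min, h_max) and 'video_params' (the pixel crop
    h_min_real, h_max_real, w_min_real, w_max_real).
    """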
    print('get_pose_params...')
    # pose rescale 
    w_min_all, w_max_all, h_min_all, h_max_all = [], [], [], []
    mid_all = []
    for num, detected_pose in enumerate(detected_poses):
        detected_poses[num]['num'] = num
        candidate_body = detected_pose['bodies']['candidate']
        score_body = detected_pose['bodies']['score']
        candidate_face = detected_pose['faces']
        score_face = detected_pose['faces_score']
        candidate_hand = detected_pose['hands']
        score_hand = detected_pose['hands_score']

        # keep a single face when several are detected (the first entry is used)
        if candidate_face.shape[0] > 1:
            index = 0
            candidate_face = candidate_face[index]
            score_face = score_face[index]
            detected_poses[num]['faces'] = candidate_face.reshape(1, candidate_face.shape[0], candidate_face.shape[1])
            detected_poses[num]['faces_score'] = score_face.reshape(1, score_face.shape[0])
        else:
            candidate_face = candidate_face[0]
            score_face = score_face[0]

        # keep the body with the highest mean keypoint confidence
        if score_body.shape[0] > 1:
            tmp_score = []
            for k in range(0, score_body.shape[0]):
                tmp_score.append(score_body[k].mean())
            index = np.argmax(tmp_score)
            candidate_body = candidate_body[index*18:(index+1)*18,:]
            score_body = score_body[index]
            score_hand = score_hand[(index*2):(index*2+2),:]
            candidate_hand = candidate_hand[(index*2):(index*2+2),:,:]
        else:
            score_body = score_body[0]
        all_pose = np.concatenate((candidate_body, candidate_face))
        all_score = np.concatenate((score_body, score_face))
        all_pose = all_pose[all_score > 0.8]  # high-confidence keypoints (currently unused below)

        body_pose = np.concatenate((candidate_body,))
        mid_ = body_pose[1, 0]  # x of the neck keypoint, used as the horizontal centre

        face_pose = candidate_face
        hand_pose = candidate_hand

        # vertical extent: from the top of the face down to the lowest of the first
        # 7 body keypoints (nose/neck/shoulders/elbows)
        h_min, h_max = np.min(face_pose[:,1]), np.max(body_pose[:7,1])

        h_ = h_max - h_min
        
        mid_w = mid_
        w_min = mid_w - h_ // 2
        w_max = mid_w + h_ // 2
        
        w_min_all.append(w_min)
        w_max_all.append(w_max)
        h_min_all.append(h_min)
        h_max_all.append(h_max)
        mid_all.append(mid_w)

    w_min = np.min(w_min_all)
    w_max = np.max(w_max_all)
    h_min = np.min(h_min_all)
    h_max = np.max(h_max_all)
    mid = np.mean(mid_all)
    print(mid)

    margin_ratio = 0.25
    h_margin = (h_max-h_min)*margin_ratio
    
    h_min = max(h_min-h_margin*0.65, 0)
    h_max = min(h_max+h_margin*0.5, 1)

    h_new = h_max - h_min
    
    h_min_real = int(h_min*height)
    h_max_real = int(h_max*height)
    mid_real = int(mid*width)
    
    
    height_new = h_max_real-h_min_real+1
    width_new = height_new
    w_min_real = mid_real - height_new // 2

    w_max_real = w_min_real + width_new
    w_min = w_min_real / width
    w_max = w_max_real / width

    print(width_new, height_new)

    imh_new, imw_new, rb, re, cb, ce = resize_and_pad_param(height_new, width_new, max_size)
    res = {'draw_pose_params': [imh_new, imw_new, rb, re, cb, ce], 
           'pose_params': [w_min, w_max, h_min, h_max],
           'video_params': [h_min_real, h_max_real, w_min_real, w_max_real],
           }
    return res

def save_pose_params_item(input_items):
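    """Worker for `save_pose_params`: rescale one frame's body/face/hand keypoints
    into the crop's normalized coordinates and save them as '<num>.npy' in `save_dir`."""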
    detected_pose, pose_params, draw_pose_params, save_dir = input_items
    w_min, w_max, h_min, h_max = pose_params
    num = detected_pose['num']
    candidate_body = detected_pose['bodies']['candidate']
    candidate_face = detected_pose['faces'][0]
    candidate_hand = detected_pose['hands']
    candidate_body[:,0] = (candidate_body[:,0]-w_min)/(w_max-w_min)
    candidate_body[:,1] = (candidate_body[:,1]-h_min)/(h_max-h_min)
    candidate_face[:,0] = (candidate_face[:,0]-w_min)/(w_max-w_min)
    candidate_face[:,1] = (candidate_face[:,1]-h_min)/(h_max-h_min)
    candidate_hand[:,:,0] = (candidate_hand[:,:,0]-w_min)/(w_max-w_min)
    candidate_hand[:,:,1] = (candidate_hand[:,:,1]-h_min)/(h_max-h_min)
    detected_pose['bodies']['candidate'] = candidate_body
    detected_pose['faces'] = candidate_face.reshape(1, candidate_face.shape[0], candidate_face.shape[1])
    detected_pose['hands'] = candidate_hand
    detected_pose['draw_pose_params'] = draw_pose_params
    np.save(save_dir+'/'+str(num)+'.npy', detected_pose)

def save_pose_params(detected_poses, pose_params, draw_pose_params, ori_video_path):
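    """Save every frame's normalized pose (one .npy per frame) into a per-video
    directory under the mirrored 'image_audio_features/pose/' tree, using a small
    thread pool."""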
    save_dir = ori_video_path.replace('original_videos', 'image_audio_features/pose/')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    input_list = []
    for i, detected_pose in enumerate(detected_poses):
        input_list.append([detected_pose, pose_params, draw_pose_params, save_dir])

    pool = ThreadPool(8)
    pool.map(save_pose_params_item, input_list)
    pool.close()
    pool.join()
       
def save_processed_video(ori_frames, video_params, ori_video_path, max_size):
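    """Crop each frame to the pixel bounds in `video_params`, letterbox it to a
    `max_size` square, and write a 24 fps video to the mirrored 'processed/video/'
    path. Returns the list of cropped frames."""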
    save_path = ori_video_path.replace('original_videos', 'processed/video/')
    save_dir = os.path.dirname(save_path)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    h_min_real, h_max_real, w_min_real, w_max_real = video_params
    video_frame_crop = []
    for img in ori_frames:
        img = img[h_min_real:h_max_real,w_min_real:w_max_real,:]
        img = resize_and_pad(img, max_size=max_size)
        video_frame_crop.append(img)
    save_video_from_cv2_list(video_frame_crop, save_path, fps=24.0, rgb2bgr=True)
    return video_frame_crop

def save_audio(ori_video_path, sub_task):
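    """Extract the audio track from the 24 fps copy of the clip and write it as a
    .wav file to the mirrored 'processed/audio/' path."""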
    save_path = ori_video_path.replace('original_videos', 'processed/audio/')
    save_dir = os.path.dirname(save_path)
    save_path = save_path + '.wav'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    ori_video_path = ori_video_path.replace(sub_task, sub_task+'_24fps')
    audio_clip = AudioFileClip(ori_video_path)
    audio_clip.write_audiofile(save_path)

def draw_pose_video(pose_params_path, save_path, max_size, ori_frames=None):
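    """Render the saved pose parameters back into a `max_size` square video for
    visual inspection, optionally blended over the cropped frames."""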
    pose_files = os.listdir(pose_params_path)
    # render a pose image for every saved frame
    output_pose_img = []
    for i in range(0, len(pose_files)):
        pose_params_path_tmp = pose_params_path + '/' + str(i) + '.npy'
        detected_pose = np.load(pose_params_path_tmp, allow_pickle=True).tolist()
        imh_new, imw_new, rb, re, cb, ce = detected_pose['draw_pose_params']
        im = draw_pose(detected_pose, imh_new, imw_new, ref_w=800)
        im = np.transpose(np.array(im),(1,2,0))
        img_new = np.zeros((max_size, max_size, 3)).astype('uint8')
        img_new[rb:re,cb:ce,:] = im
        if ori_frames is not None:
            img_new = img_new * 0.6 + ori_frames[i] * 0.4
            img_new = img_new.astype('uint8')
        output_pose_img.append(img_new)

    output_pose_img = np.stack(output_pose_img)
    save_video_from_cv2_list(output_pose_img, save_path, fps=24.0, rgb2bgr=True)
    print('save to ' + save_path)

visualization = False
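# Main loop: for each raw clip, convert it to 24 fps, run DWPose, derive the square
# upper-body crop, then save the normalized pose parameters, the cropped video and
# the extracted audio (and optionally a pose visualization).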
for sub_task in tasks:

    # sort so that the [start:end] shard is deterministic across runs
    mp4_list = sorted(os.listdir(base_dir+sub_task))[start:end]

    new_dir = base_dir+sub_task+'_24fps'
    if not os.path.exists(new_dir):
        os.makedirs(new_dir)
    index = 1
    for i, mp4_file in enumerate(mp4_list):
        ori_video_path = base_dir+sub_task+'/'+mp4_file
        if ori_video_path.endswith(('.mp4', '.MOV')):
            try:
                # convert frame rate to 24 fps
                ori_video_path_new = ori_video_path.replace(sub_task, sub_task+'_24fps')
                if '.MOV' in ori_video_path_new:
                    ori_video_path_new = ori_video_path_new.replace('.MOV', '.mp4')
                convert_fps(ori_video_path, ori_video_path_new)
                print([index+start, ori_video_path, start, end])
                # extract poses with DWPose
                detected_poses, height, width, ori_frames = get_video_pose(ori_video_path_new, max_frame=None)
                print(height, width)
                # compute crop / resize parameters
                res_params = get_pose_params(detected_poses, MAX_SIZE, height, width)
                # save pose parameters
                save_pose_params(detected_poses, res_params['pose_params'], res_params['draw_pose_params'], ori_video_path)
                # save the cropped video
                video_frame_crop = save_processed_video(ori_frames, res_params['video_params'], ori_video_path, MAX_SIZE)
                # save the audio track
                save_audio(ori_video_path, sub_task)
                index += 1
                if visualization:
                    # render a pose visualization video
                    pose_params_path = ori_video_path.replace('original_videos', 'image_audio_features/pose')
                    save_path = "./vis_pose_results/" + os.path.basename(ori_video_path)
                    draw_pose_video(pose_params_path, save_path, MAX_SIZE, ori_frames=video_frame_crop)
            except Exception:
                print(["extract crash!", index+start, ori_video_path, start, end])
                continue

    print(["All Finished", sub_task, start, end])