root
Reinitialize Git repository with LFS support
c83dd81
raw
history blame
12.5 kB
import sys
from src.utils.img_utils import pil_to_cv2, cv2_to_pil, center_crop_cv2, pils_from_video, save_videos_from_pils, save_video_from_cv2_list
from PIL import Image
import cv2
from IPython import embed
import numpy as np
import copy
from src.utils.motion_utils import motion_sync
import pathlib
import torch
import pickle
from glob import glob
import os
from src.models.dwpose.dwpose_detector import dwpose_detector as dwprocessor
from src.models.dwpose.util import draw_pose
import decord
from tqdm import tqdm
from moviepy.editor import AudioFileClip, VideoFileClip
from multiprocessing.pool import ThreadPool
##################################
base_dir = "root"
tasks = ["emtd"]
process_num = 800 #1266
start = 0
end = process_num + start
#################################
MAX_SIZE = 768
def convert_fps(src_path, tgt_path, tgt_fps=24, tgt_sr=16000):
clip = VideoFileClip(src_path)
new_clip = clip.set_fps(tgt_fps)
if tgt_fps is not None:
audio = new_clip.audio
audio = audio.set_fps(tgt_sr)
new_clip = new_clip.set_audio(audio)
new_clip.write_videofile(tgt_path, codec='libx264', audio_codec='aac')
def get_video_pose(
video_path: str,
sample_stride: int=1,
max_frame=None):
# read input video
vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
sample_stride *= max(1, int(vr.get_avg_fps() / 24))
frames = vr.get_batch(list(range(0, len(vr), sample_stride))).asnumpy()
if max_frame is not None:
frames = frames[0:max_frame,:,:]
height, width, _ = frames[0].shape
# detected_poses = [dwprocessor(frm) for frm in tqdm(frames, desc="DWPose")]
detected_poses = [dwprocessor(frm) for frm in frames]
dwprocessor.release_memory()
return detected_poses, height, width, frames
def resize_and_pad(img, max_size):
img_new = np.zeros((max_size, max_size, 3)).astype('uint8')
imh, imw = img.shape[0], img.shape[1]
half = max_size // 2
if imh > imw:
imh_new = max_size
imw_new = int(round(imw/imh * imh_new))
half_w = imw_new // 2
rb, re = 0, max_size
cb = half-half_w
ce = cb + imw_new
else:
imw_new = max_size
imh_new = int(round(imh/imw * imw_new))
half_h = imh_new // 2
cb, ce = 0, max_size
rb = half-half_h
re = rb + imh_new
img_resize = cv2.resize(img, (imw_new, imh_new))
img_new[rb:re,cb:ce,:] = img_resize
return img_new
def resize_and_pad_param(imh, imw, max_size):
half = max_size // 2
if imh > imw:
imh_new = max_size
imw_new = int(round(imw/imh * imh_new))
half_w = imw_new // 2
rb, re = 0, max_size
cb = half-half_w
ce = cb + imw_new
else:
imw_new = max_size
imh_new = int(round(imh/imw * imw_new))
imh_new = max_size
half_h = imh_new // 2
cb, ce = 0, max_size
rb = half-half_h
re = rb + imh_new
return imh_new, imw_new, rb, re, cb, ce
def get_pose_params(detected_poses, max_size):
print('get_pose_params...')
# pose rescale
w_min_all, w_max_all, h_min_all, h_max_all = [], [], [], []
mid_all = []
for num, detected_pose in enumerate(detected_poses):
detected_poses[num]['num'] = num
candidate_body = detected_pose['bodies']['candidate']
score_body = detected_pose['bodies']['score']
candidate_face = detected_pose['faces']
score_face = detected_pose['faces_score']
candidate_hand = detected_pose['hands']
score_hand = detected_pose['hands_score']
# 选取置信度最高的face
if candidate_face.shape[0] > 1:
index = 0
candidate_face = candidate_face[index]
score_face = score_face[index]
detected_poses[num]['faces'] = candidate_face.reshape(1, candidate_face.shape[0], candidate_face.shape[1])
detected_poses[num]['faces_score'] = score_face.reshape(1, score_face.shape[0])
else:
candidate_face = candidate_face[0]
score_face = score_face[0]
# 选取置信度最高的body
if score_body.shape[0] > 1:
tmp_score = []
for k in range(0, score_body.shape[0]):
tmp_score.append(score_body[k].mean())
index = np.argmax(tmp_score)
candidate_body = candidate_body[index*18:(index+1)*18,:]
score_body = score_body[index]
score_hand = score_hand[(index*2):(index*2+2),:]
candidate_hand = candidate_hand[(index*2):(index*2+2),:,:]
else:
score_body = score_body[0]
all_pose = np.concatenate((candidate_body, candidate_face))
all_score = np.concatenate((score_body, score_face))
all_pose = all_pose[all_score>0.8]
body_pose = np.concatenate((candidate_body,))
mid_ = body_pose[1, 0]
face_pose = candidate_face
hand_pose = candidate_hand
h_min, h_max = np.min(face_pose[:,1]), np.max(body_pose[:7,1])
h_ = h_max - h_min
mid_w = mid_
w_min = mid_w - h_ // 2
w_max = mid_w + h_ // 2
w_min_all.append(w_min)
w_max_all.append(w_max)
h_min_all.append(h_min)
h_max_all.append(h_max)
mid_all.append(mid_w)
w_min = np.min(w_min_all)
w_max = np.max(w_max_all)
h_min = np.min(h_min_all)
h_max = np.max(h_max_all)
mid = np.mean(mid_all)
print(mid)
margin_ratio = 0.25
h_margin = (h_max-h_min)*margin_ratio
h_min = max(h_min-h_margin*0.65, 0)
h_max = min(h_max+h_margin*0.5, 1)
h_new = h_max - h_min
h_min_real = int(h_min*height)
h_max_real = int(h_max*height)
mid_real = int(mid*width)
height_new = h_max_real-h_min_real+1
width_new = height_new
w_min_real = mid_real - height_new // 2
w_max_real = w_min_real + width_new
w_min = w_min_real / width
w_max = w_max_real / width
print(width_new, height_new)
imh_new, imw_new, rb, re, cb, ce = resize_and_pad_param(height_new, width_new, max_size)
res = {'draw_pose_params': [imh_new, imw_new, rb, re, cb, ce],
'pose_params': [w_min, w_max, h_min, h_max],
'video_params': [h_min_real, h_max_real, w_min_real, w_max_real],
}
return res
def save_pose_params_item(input_items):
detected_pose, pose_params, draw_pose_params, save_dir = input_items
w_min, w_max, h_min, h_max = pose_params
num = detected_pose['num']
candidate_body = detected_pose['bodies']['candidate']
candidate_face = detected_pose['faces'][0]
candidate_hand = detected_pose['hands']
candidate_body[:,0] = (candidate_body[:,0]-w_min)/(w_max-w_min)
candidate_body[:,1] = (candidate_body[:,1]-h_min)/(h_max-h_min)
candidate_face[:,0] = (candidate_face[:,0]-w_min)/(w_max-w_min)
candidate_face[:,1] = (candidate_face[:,1]-h_min)/(h_max-h_min)
candidate_hand[:,:,0] = (candidate_hand[:,:,0]-w_min)/(w_max-w_min)
candidate_hand[:,:,1] = (candidate_hand[:,:,1]-h_min)/(h_max-h_min)
detected_pose['bodies']['candidate'] = candidate_body
detected_pose['faces'] = candidate_face.reshape(1, candidate_face.shape[0], candidate_face.shape[1])
detected_pose['hands'] = candidate_hand
detected_pose['draw_pose_params'] = draw_pose_params
np.save(save_dir+'/'+str(num)+'.npy', detected_pose)
def save_pose_params(detected_poses, pose_params, draw_pose_params, ori_video_path):
save_dir = ori_video_path.replace('original_videos', 'image_audio_features/pose/')
if not os.path.exists(save_dir):
os.makedirs(save_dir)
input_list = []
for i, detected_pose in enumerate(detected_poses):
input_list.append([detected_pose, pose_params, draw_pose_params, save_dir])
pool = ThreadPool(8)
pool.map(save_pose_params_item, input_list)
pool.close()
pool.join()
def save_processed_video(ori_frames, video_params, ori_video_path, max_size):
save_path = ori_video_path.replace('original_videos', 'processed/video/')
save_dir = os.path.dirname(save_path)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
h_min_real, h_max_real, w_min_real, w_max_real = video_params
video_frame_crop = []
for img in ori_frames:
img = img[h_min_real:h_max_real,w_min_real:w_max_real,:]
img = resize_and_pad(img, max_size=max_size)
video_frame_crop.append(img)
save_video_from_cv2_list(video_frame_crop, save_path, fps=24.0, rgb2bgr=True)
return video_frame_crop
def save_audio(ori_video_path, sub_task):
save_path = ori_video_path.replace('original_videos', 'processed/audio/')
save_dir = os.path.dirname(save_path)
save_path = save_path + '.wav'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
ori_video_path = ori_video_path.replace(sub_task, sub_task+'_24fps')
audio_clip = AudioFileClip(ori_video_path)
audio_clip.write_audiofile(save_path)
def draw_pose_video(pose_params_path, save_path, max_size, ori_frames=None):
pose_files = os.listdir(pose_params_path)
# 生成Pose图cd pro
output_pose_img = []
for i in range(0, len(pose_files)):
pose_params_path_tmp = pose_params_path + '/' + str(i) + '.npy'
detected_pose = np.load(pose_params_path_tmp, allow_pickle=True).tolist()
imh_new, imw_new, rb, re, cb, ce = detected_pose['draw_pose_params']
im = draw_pose(detected_pose, imh_new, imw_new, ref_w=800)
im = np.transpose(np.array(im),(1,2,0))
img_new = np.zeros((max_size, max_size, 3)).astype('uint8')
img_new[rb:re,cb:ce,:] = im
if ori_frames is not None:
img_new = img_new * 0.6 + ori_frames[i] * 0.4
img_new = img_new.astype('uint8')
output_pose_img.append(img_new)
output_pose_img = np.stack(output_pose_img)
save_video_from_cv2_list(output_pose_img, save_path, fps=24.0, rgb2bgr=True)
print('save to ' + save_path)
visualization = False
for sub_task in tasks:
ori_list = os.listdir(base_dir+sub_task)[start:end]
mp4_list = ori_list
new_dir = base_dir+sub_task+'_24fps'
if not os.path.exists(new_dir):
os.makedirs(new_dir)
index = 1
for i, mp4_file in enumerate(mp4_list):
ori_video_path = base_dir+sub_task+'/'+mp4_file
if ori_video_path[-3:]=='mp4' or ori_video_path[-3:] =='MOV':
try:
# 转换祯率
ori_video_path_new = ori_video_path.replace(sub_task, sub_task+'_24fps')
if '.MOV' in ori_video_path_new:
ori_video_path_new.replace('.MOV', '.mp4')
convert_fps(ori_video_path, ori_video_path_new)
print([index+start, ori_video_path, start, end])
# 提取Pose
detected_poses, height, width, ori_frames = get_video_pose(ori_video_path_new, max_frame=None)
print(height, width)
# 提取相关参数
res_params = get_pose_params(detected_poses, MAX_SIZE)
# 存储Pose参数
save_pose_params(detected_poses, res_params['pose_params'], res_params['draw_pose_params'], ori_video_path)
# 存储截取视频
video_frame_crop = save_processed_video(ori_frames, res_params['video_params'], ori_video_path, MAX_SIZE)
# 存储音频
save_audio(ori_video_path, sub_task)
index += 1
if visualization:
# 绘制pose图
pose_params_path = ori_video_path.replace('original_videos', 'image_audio_features/pose')
save_path = "./vis_pose_results/" + os.path.basename(ori_video_path)
draw_pose_video(pose_params_path, save_path, ori_frames=video_frame_crop)
except:
print(["extract crash!", index+start, ori_video_path, start, end])
continue
print(["All Finished", sub_task, start, end])