Spaces:
Sleeping
Sleeping
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from torch.nn.parallel import DistributedDataParallel as DDP | |
import torch.nn.init as init | |
import torchvision.models as models | |
import nvdiffrast.torch as dr | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import os | |
import os.path as osp | |
import pickle | |
from video3d.render.regularizer import get_edge_length, normal_consistency, laplace_regularizer_const | |
from . import networks | |
from .renderer import * | |
from .utils import misc, meters, flow_viz, arap, custom_loss | |
from .dataloaders import get_sequence_loader, get_image_loader | |
from .dataloaders_ddp import get_sequence_loader_ddp, get_image_loader_ddp | |
from .cub_dataloaders import get_cub_loader | |
from .cub_dataloaders_ddp import get_cub_loader_ddp | |
from .utils.skinning_v4 import estimate_bones, skinning | |
import lpips | |
from einops import rearrange, repeat | |
import clip | |
import torchvision.transforms.functional as tvf | |
from . import discriminator_architecture | |
from .geometry.dmtet import DMTetGeometry | |
from .geometry.dlmesh import DLMesh | |
from .triplane_texture.triplane_predictor import TriPlaneTex | |
from .render import renderutils as ru | |
from .render import material | |
from .render import mlptexture | |
from .render import util | |
from .render import mesh | |
from .render import light | |
from .render import render | |
from .diffusion.sd import StableDiffusion | |
from .diffusion.vsd import StableDiffusion_VSD | |
from .diffusion.sd_utils import rand_poses, rand_lights, append_text_direction | |
EPS = 1e-7 | |
def get_optimizer(model, lr=0.0001, betas=(0.9, 0.999), weight_decay=0):
    """Create an Adam optimizer over the trainable parameters of ``model``.

    Parameters whose ``requires_grad`` flag is False are not handed to the
    optimizer.

    Args:
        model: Object exposing ``parameters()`` (e.g. an ``nn.Module``).
        lr: Learning rate forwarded to ``torch.optim.Adam``.
        betas: Adam beta coefficients.
        weight_decay: Weight decay forwarded to ``torch.optim.Adam``.

    Returns:
        The configured ``torch.optim.Adam`` instance.
    """
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    return torch.optim.Adam(
        trainable_params,
        lr=lr,
        betas=betas,
        weight_decay=weight_decay,
    )
def set_requires_grad(model, requires_grad):
    """Set ``requires_grad`` on every parameter of ``model``.

    A ``None`` model is accepted and silently ignored.

    Args:
        model: Object exposing ``parameters()``, or ``None``.
        requires_grad: Value assigned to each parameter's ``requires_grad``.
    """
    if model is None:
        return
    for weight in model.parameters():
        weight.requires_grad = requires_grad
def forward_to_matrix(vec_forward, up=(0, 1, 0)):
    """Build a rotation-like 3x3 matrix from forward vectors and an up hint.

    The rows of the result are ``[right, up, forward]`` where
    ``right = normalize(up x forward)`` and ``up = normalize(forward x right)``.
    ``vec_forward`` itself is NOT normalized here, so the last row keeps the
    magnitude of the input.

    Args:
        vec_forward: Tensor of shape ``(..., 3)``.
        up: Up hint as a length-3 sequence or tensor. It is converted to the
            dtype and device of ``vec_forward``.

    Returns:
        Tensor of shape ``(..., 3, 3)``.
    """
    # Match the input's dtype/device so non-float32 inputs do not mix dtypes,
    # and so the tensor is created directly where it is needed.
    up_vec = torch.as_tensor(up, dtype=vec_forward.dtype, device=vec_forward.device)
    vec_right = torch.cross(up_vec.expand_as(vec_forward), vec_forward, dim=-1)
    vec_right = nn.functional.normalize(vec_right, p=2, dim=-1)
    vec_up = torch.cross(vec_forward, vec_right, dim=-1)
    vec_up = nn.functional.normalize(vec_up, p=2, dim=-1)
    # x right, y up, z forward, stacked as rows.
    return torch.stack([vec_right, vec_up, vec_forward], -2)
def sample_pose_hypothesis_from_quad_prediction(poses_raw, total_iter, batch_size, num_frames, pose_xflip_recon=False, input_image_xflip_flag=None, rot_temp_scalar=1., num_hypos=4, naive_probs_iter=2000, best_pose_start_iter=6000, random_sample=True, temp_clip_low = 1., temp_clip_high=100.):
    """Pick one rotation hypothesis per sample from a multi-hypothesis pose prediction.

    The last axis of ``poses_raw`` is read as ``num_hypos`` groups of
    ``(logit, forward_xyz)`` followed by 3 translation values.

    Args:
        poses_raw: Raw pose prediction; assumed to have
            ``batch_size * num_frames`` rows of ``num_hypos * 4 + 3`` values
            (TODO confirm against caller).
        total_iter: Current training iteration; drives the softmax
            temperature and both annealing schedules.
        batch_size, num_frames: Their product is the number of samples.
        pose_xflip_recon: Not supported; ``True`` raises ``NotImplementedError``.
        input_image_xflip_flag: Unused.
        rot_temp_scalar: Divides ``total_iter / 1000`` before clipping.
        num_hypos: Number of rotation hypotheses per sample.
        naive_probs_iter: Iteration at which the uniform prior starts to be
            blended out (over the following 2000 iterations).
        best_pose_start_iter: Iteration at which the best hypothesis starts to
            be preferred over a random one (ramping over 2000 iterations to a
            maximum fraction of 0.8).
        random_sample: If False, always take the most probable hypothesis.
        temp_clip_low, temp_clip_high: Clip range for the inverse temperature.

    Returns:
        Tuple ``(pose_raw, pose, rot_idx, rot_prob, rot_logit, rots_probs,
        rand_flag)``: the selected forward vector plus translation, the same
        with the forward vector expanded to a flattened 3x3 matrix, the chosen
        hypothesis index, its probability and logit, the full probability
        table, and a 0/1 flag marking randomly chosen samples.

    Raises:
        NotImplementedError: If ``pose_xflip_recon`` is truthy.
    """
    # Fail fast: previously this was raised only after all sampling work had
    # been done (and the global RNG state had been advanced).
    if pose_xflip_recon:
        raise NotImplementedError

    num_samples = batch_size * num_frames
    device = poses_raw.device

    rots_pred = poses_raw[..., :num_hypos*4].view(-1, num_hypos, 4)
    rots_logits = rots_pred[..., 0]  # N x K
    # Dividing by temp multiplies the logits by the clipped iteration ratio,
    # so the distribution sharpens as training progresses.
    temp = 1 / np.clip(total_iter / 1000 / rot_temp_scalar, temp_clip_low, temp_clip_high)
    # Note the sign: a LOWER logit yields a HIGHER probability.
    rots_probs = torch.nn.functional.softmax(-rots_logits / temp, dim=1)  # N x K

    # Blend with a uniform prior whose weight decays from 1 to 0.
    naive_probs = torch.ones(num_hypos).to(rots_logits.device)
    naive_probs = naive_probs / naive_probs.sum()
    naive_probs_weight = np.clip(1 - (total_iter - naive_probs_iter) / 2000, 0, 1)
    rots_probs = naive_probs.view(1, num_hypos) * naive_probs_weight + rots_probs * (1 - naive_probs_weight)

    rots_pred = rots_pred[..., 1:4]
    trans_pred = poses_raw[..., -3:]
    best_rot_idx = torch.argmax(rots_probs, dim=1)  # N

    if random_sample:
        # Order of the two randperm calls is kept so RNG consumption is unchanged.
        rand_rot_idx = torch.randperm(num_samples, device=device) % num_hypos  # N
        best_fraction = np.clip((total_iter - best_pose_start_iter)/2000, 0, 0.8)
        best_flag = (torch.randperm(num_samples, device=device) / num_samples < best_fraction).long()
        rand_flag = 1 - best_flag
        rot_idx = best_rot_idx * best_flag + rand_rot_idx * (1 - best_flag)
    else:
        rand_flag = torch.zeros_like(best_rot_idx)
        rot_idx = best_rot_idx

    rot_pred = torch.gather(rots_pred, 1, rot_idx[:, None, None].expand(-1, 1, 3))[:, 0]  # N x 3
    pose_raw = torch.cat([rot_pred, trans_pred], -1)
    rot_prob = torch.gather(rots_probs, 1, rot_idx[:, None].expand(-1, 1))[:, 0]  # N
    rot_logit = torch.gather(rots_logits, 1, rot_idx[:, None].expand(-1, 1))[:, 0]  # N

    rot_mat = forward_to_matrix(pose_raw[:, :3], up=[0, 1, 0])
    pose = torch.cat([rot_mat.view(num_samples, -1), pose_raw[:, 3:]], -1)
    return pose_raw, pose, rot_idx, rot_prob, rot_logit, rots_probs, rand_flag
def get_joints_20_bones(bones, aux):
    """Convert a bone tensor into a joint tensor using the mappings in ``aux``.

    Args:
        bones: Tensor indexed as ``[batch, frame, bone, endpoint, xyz]``
            (the original comment gives ``[1, 1, 20, 2, 3]``).
        aux: Dict with
            ``'bones_to_joints'``: one ``(start_joint, end_joint)`` pair per
            body bone, and
            ``'legs'``: a sequence of dicts, each holding
            ``'leg_bones_to_joints'`` pairs for that leg.

    Returns:
        Tensor ``[batch, frame, num_joints, 3]`` with the body joints first,
        followed by each leg's joints, on the same device and with the same
        dtype as ``bones``.
    """
    batch, frames = bones.shape[0], bones.shape[1]

    body_map = aux['bones_to_joints']
    num_body_bones = len(body_map)
    body_bones = bones[:, :, :num_body_bones, :, :]
    # new_empty keeps the device/dtype of ``bones``; a plain torch.empty would
    # silently produce a CPU tensor of the default dtype.
    body_joints = bones.new_empty((batch, frames, num_body_bones + 1, 3))
    for bone_idx, (start_joint, end_joint) in enumerate(body_map):
        body_joints[:, :, start_joint, :] = body_bones[:, :, bone_idx, 0, :]
        body_joints[:, :, end_joint, :] = body_bones[:, :, bone_idx, 1, :]

    all_joints = [body_joints]
    for leg_idx, leg in enumerate(aux['legs']):
        leg_map = leg['leg_bones_to_joints']
        # Leg bones are read from fixed slots: 3 per leg, starting at index 8.
        leg_bones = bones[:, :, 8 + leg_idx * 3:11 + leg_idx * 3, :, :]
        leg_joints = bones.new_empty((batch, frames, len(leg_map), 3))
        # The last entry of the mapping is skipped here, as in the original.
        for bone_idx in range(len(leg_map) - 1):
            start_joint = leg_map[bone_idx][0]
            end_joint = leg_map[bone_idx][1]
            leg_joints[:, :, start_joint, :] = leg_bones[:, :, bone_idx, 0, :]
            leg_joints[:, :, end_joint, :] = leg_bones[:, :, bone_idx, 1, :]
        all_joints.append(leg_joints)

    return torch.cat(all_joints, dim=2)
def get_20_bones_joints(joints, aux):
    """Assemble a bone tensor from a joint tensor using the mappings in ``aux``.

    Args:
        joints: Tensor indexed as ``[batch, frame, joint, xyz]``
            (the original comment gives ``[1, 1, 21, 3]``).
        aux: Dict with ``'bones_to_joints'`` (joint index pairs for the body
            bones) and ``'legs'`` (per-leg dicts holding
            ``'leg_bones_to_joints'`` and ``'body_bone_idx'``).

    Returns:
        Tensor ``[batch, frame, num_bones, 2, 3]``: body bones first, then the
        bones of each leg.
    """
    def make_bone(start, end):
        # Pair two joint positions into one [batch, frame, 2, 3] bone.
        return torch.stack([start, end], dim=2)

    body_segments = [
        make_bone(joints[:, :, a, :], joints[:, :, b, :])
        for a, b in aux['bones_to_joints']
    ]
    body_bones = torch.stack(body_segments, dim=2)  # [1, 1, 8, 2, 3]

    per_leg_bones = []
    for leg_idx, leg_aux in enumerate(aux['legs']):
        mapping = leg_aux['leg_bones_to_joints']
        # Leg joints are stored after the first 9 joints, 3 per leg.
        offset = 9 + leg_idx * 3
        segments = [
            make_bone(
                joints[:, :, offset + mapping[k][0], :],
                joints[:, :, offset + mapping[k][1], :],
            )
            for k in range(len(mapping) - 1)
        ]
        # The last bone starts at the end point of a body bone.
        segments.append(make_bone(
            body_bones[:, :, leg_aux['body_bone_idx'], 1, :],
            joints[:, :, offset + mapping[-1][1], :],
        ))
        per_leg_bones.append(torch.stack(segments, dim=2))

    return torch.cat([body_bones] + per_leg_bones, dim=2)
class FixedDirectionLight(torch.nn.Module):
    """Directional light with a fixed direction and ambient/diffuse intensities.

    ``amb``/``diff`` may be plain numbers, in which case the light parameters
    are broadcast over the batch, or non-numeric objects (e.g. tensors), in
    which case they are concatenated with ``direction`` as given.
    """

    def __init__(self, direction, amb, diff):
        super().__init__()
        self.light_dir = direction
        self.amb = amb
        self.diff = diff
        # True when ``amb`` is not a plain Python number.
        self.is_hacking = not isinstance(self.amb, (float, int))

    def forward(self, feat):
        """Return the light parameters ``[direction(3), ambient, diffuse]``.

        ``feat`` is only used for its leading (batch) dimension.
        """
        batch_size = feat.shape[0]
        if self.is_hacking:
            return torch.cat([self.light_dir, self.amb, self.diff], -1)
        intensities = torch.FloatTensor([self.amb, self.diff]).to(self.light_dir.device)
        light_params = torch.cat([self.light_dir, intensities], -1)
        return light_params.expand(batch_size, -1)

    def shade(self, feat, kd, normal):
        """Shade ``kd`` with ambient plus clamped-diffuse lighting.

        Returns:
            ``(shaded, shading)`` where ``shaded = shading * kd``.
        """
        params = self.forward(feat)
        # Insert two singleton axes after the batch axis for broadcasting.
        direction = params[..., :3][:, None, None, :]
        ambient = params[..., 3:4][:, None, None, :]
        diffuse = params[..., 4:5][:, None, None, :]
        n_dot_l = torch.clamp(util.dot(direction, normal), min=0.0)
        shading = ambient + diffuse * n_dot_l
        return shading * kd, shading
class SmoothLoss(nn.Module):
    """Temporal smoothness penalty along one axis of a tensor.

    Args:
        dim: Axis along which frames are laid out.
        smooth_type: One of
            ``None`` (loss disabled, ``forward`` returns ``0.``),
            ``'mid_frame'`` (distance of every frame to the middle frame),
            ``'dislocation'`` (distance between consecutive frames),
            ``'avg'`` (distance of every frame to the mean over ``dim``).
        loss_type: ``'l2'``/``'mse'`` (``MSELoss``) or ``'l1'`` (``L1Loss``).
    """

    SUPPORTED_SMOOTH_TYPES = ['mid_frame', 'dislocation', 'avg']
    SUPPORTED_LOSS_TYPES = ['l2', 'mse', 'l1']

    def __init__(self, dim=0, smooth_type=None, loss_type="l2"):
        super(SmoothLoss, self).__init__()
        self.dim = dim

        # ``None`` is the documented default and is handled by ``forward``;
        # it used to be rejected here, making the default unusable.
        assert smooth_type is None or smooth_type in self.SUPPORTED_SMOOTH_TYPES, \
            f"supported smooth type: {self.SUPPORTED_SMOOTH_TYPES}"
        self.smooth_type = smooth_type

        assert loss_type in self.SUPPORTED_LOSS_TYPES, \
            f"supported loss type: {self.SUPPORTED_LOSS_TYPES}"
        self.loss_type = loss_type

        if self.loss_type in ['l2', 'mse']:
            self.loss_fn = torch.nn.MSELoss(reduction='mean')
        elif self.loss_type in ['l1']:
            self.loss_fn = torch.nn.L1Loss()
        else:
            raise NotImplementedError

    def mid_frame_smooth(self, inputs):
        """Loss between every frame and frame ``(nframe - 1) // 2``."""
        nframe = inputs.shape[self.dim]
        mid_num = (nframe - 1) // 2
        mid_frame = torch.index_select(inputs, self.dim, torch.tensor([mid_num], device=inputs.device))
        smooth = mid_frame.repeat(self.get_repeat_num(inputs))
        return self.loss_fn(inputs, smooth)

    def dislocation_smooth(self, inputs):
        """Loss between frames ``[0, n-1)`` and frames ``[1, n)``."""
        nframe = inputs.shape[self.dim]
        t = torch.index_select(inputs, self.dim, torch.arange(0, nframe - 1).to(inputs.device))
        t_1 = torch.index_select(inputs, self.dim, torch.arange(1, nframe).to(inputs.device))
        return self.loss_fn(t, t_1)

    def avg_smooth(self, inputs):
        """Loss between every frame and the mean over ``dim``."""
        avg = inputs.mean(dim=self.dim, keepdim=True)
        smooth = avg.repeat(self.get_repeat_num(inputs))
        return self.loss_fn(inputs, smooth)

    def get_repeat_num(self, inputs):
        """Return ``repeat`` sizes that tile a size-1 ``dim`` back to full length."""
        repeat_num = [1] * inputs.dim()
        repeat_num[self.dim] = inputs.shape[self.dim]
        return repeat_num

    def forward(self, inputs):
        """Compute the configured smoothness loss (``0.`` when disabled)."""
        # A per-call debug print of the smooth type was removed from here.
        if self.smooth_type is None:
            return 0.
        elif self.smooth_type == 'mid_frame':
            return self.mid_frame_smooth(inputs)
        elif self.smooth_type == 'dislocation':
            return self.dislocation_smooth(inputs)
        elif self.smooth_type == 'avg':
            return self.avg_smooth(inputs)
        else:
            raise NotImplementedError()
class PriorPredictor(nn.Module):
    """Category-level prior: a DMTet shape network plus an optional DINO feature field.

    Args:
        cfgs: Mapping-like config, read only through ``cfgs.get``.
            ``train_data_dir`` decides the number of categories: a ``dict``
            gives one category per key (in iteration order); a ``str`` or any
            other value is treated as a single category.
    """

    def __init__(self, cfgs):
        super().__init__()
        dmtet_grid = cfgs.get('dmtet_grid', 64)
        grid_scale = cfgs.get('grid_scale', 5)
        prior_sdf_mode = cfgs.get('prior_sdf_mode', 'mlp')
        num_layers_shape = cfgs.get('num_layers_shape', 5)
        hidden_size = cfgs.get('hidden_size', 64)
        embedder_freq_shape = cfgs.get('embedder_freq_shape', 8)
        embed_concat_pts = cfgs.get('embed_concat_pts', True)
        init_sdf = cfgs.get('init_sdf', None)
        jitter_grid = cfgs.get('jitter_grid', 0.)
        perturb_sdf_iter = cfgs.get('perturb_sdf_iter', 10000)
        sym_prior_shape = cfgs.get('sym_prior_shape', False)

        # Category bookkeeping. Defaults cover a missing / unexpected
        # ``train_data_dir`` (previously ``num_of_classes`` was left unbound).
        train_data_dir = cfgs.get("train_data_dir", None)
        self.category_id_map = {}
        num_of_classes = 1
        if isinstance(train_data_dir, dict):
            num_of_classes = len(train_data_dir)
            for class_idx, class_name in enumerate(train_data_dir):
                self.category_id_map[class_name] = class_idx

        dim_of_classes = cfgs.get('dim_of_classes', 256) if num_of_classes > 1 else 0
        condition_choice = cfgs.get('prior_condition_choice', 'concat')

        self.netShape = DMTetGeometry(dmtet_grid, grid_scale, prior_sdf_mode, num_layers=num_layers_shape, hidden_size=hidden_size, embedder_freq=embedder_freq_shape, embed_concat_pts=embed_concat_pts, init_sdf=init_sdf, jitter_grid=jitter_grid, perturb_sdf_iter=perturb_sdf_iter, sym_prior_shape=sym_prior_shape,
                                      dim_of_classes=dim_of_classes, condition_choice=condition_choice)

        mlp_hidden_size = cfgs.get('hidden_size', 64)
        tet_bbox = self.netShape.getAABB()
        self.render_dino_mode = cfgs.get('render_dino_mode', None)
        num_layers_dino = cfgs.get("num_layers_dino", 5)
        dino_feature_recon_dim = cfgs.get('dino_feature_recon_dim', 64)
        sym_dino = cfgs.get("sym_dino", False)
        dino_min = torch.zeros(dino_feature_recon_dim) + cfgs.get('dino_min', 0.)
        dino_max = torch.zeros(dino_feature_recon_dim) + cfgs.get('dino_max', 1.)
        min_max = torch.stack((dino_min, dino_max), dim=0)

        # Always define netDINO: ``forward`` returns it even when DINO
        # rendering is disabled.
        self.netDINO = None
        if self.render_dino_mode is None:
            pass
        elif self.render_dino_mode == 'feature_mlpnv':
            # MLPTexture3D predicts the DINO feature for each single point.
            self.netDINO = mlptexture.MLPTexture3D(tet_bbox, channels=dino_feature_recon_dim, internal_dims=mlp_hidden_size, hidden=num_layers_dino-1, feat_dim=0, min_max=min_max, bsdf=None, perturb_normal=False, symmetrize=sym_dino)
        elif self.render_dino_mode == 'feature_mlp':
            embedder_scaler = 2 * np.pi / grid_scale * 0.9  # originally (-0.5*s, 0.5*s) rescale to (-pi, pi) * 0.9
            self.netDINO = networks.MLPTextureSimple(
                3,  # x, y, z coordinates
                dino_feature_recon_dim,
                num_layers_dino,
                nf=mlp_hidden_size,
                dropout=0,
                activation="sigmoid",
                min_max=min_max,
                n_harmonic_functions=cfgs.get('embedder_freq_dino', 8),
                omega0=embedder_scaler,
                extra_dim=dim_of_classes,
                embed_concat_pts=embed_concat_pts,
                perturb_normal=False,
                symmetrize=sym_dino
            )
        elif self.render_dino_mode == 'cluster':
            dino_cluster_dim = cfgs.get('dino_cluster_dim', 64)
            self.netDINO = mlptexture.MLPTexture3D(tet_bbox, channels=dino_cluster_dim, internal_dims=mlp_hidden_size, hidden=num_layers_dino-1, feat_dim=0, min_max=None, bsdf=None, perturb_normal=False, symmetrize=sym_dino)
        else:
            raise NotImplementedError

        # One learnable embedding per category; only needed with >1 category.
        self.classes_vectors = None
        if num_of_classes > 1:
            self.classes_vectors = torch.nn.Parameter(torch.nn.init.uniform_(torch.empty(num_of_classes, dim_of_classes), a=-0.05, b=0.05))

    def forward(self, category_name=None, perturb_sdf=False, total_iter=None, is_training=True, class_embedding=None):
        """Extract the prior mesh, optionally conditioned on a category.

        Args:
            category_name: Category key; ``None`` means unconditioned.
            perturb_sdf, total_iter: Forwarded to ``netShape.getMesh``.
            is_training: Forwarded to ``getMesh`` as ``jitter_grid``.
            class_embedding: If given together with ``category_name``, its
                first entry is used as the class vector instead of the
                learned table.

        Returns:
            ``(prior_shape, netDINO, class_vectors)``. ``class_vectors`` is the
            supplied embedding when ``class_embedding`` was used, otherwise
            the learned table (``None`` for a single category).
        """
        class_vector = None
        # Default result; previously unbound when ``category_name`` was None.
        return_classes_vectors = self.classes_vectors
        if category_name is not None:
            if class_embedding is not None:
                class_vector = class_embedding[0]
                return_classes_vectors = class_vector
            elif self.classes_vectors is not None:
                class_vector = self.classes_vectors[self.category_id_map[category_name]]
            # else: single-category model, nothing to condition on.

        prior_shape = self.netShape.getMesh(perturb_sdf=perturb_sdf, total_iter=total_iter, jitter_grid=is_training, class_vector=class_vector)
        return prior_shape, self.netDINO, return_classes_vectors
class InstancePredictor(nn.Module): | |
def __init__(self, cfgs, tet_bbox=None): | |
super().__init__() | |
self.cfgs = cfgs | |
self.grid_scale = cfgs.get('grid_scale', 5) | |
self.enable_encoder = cfgs.get('enable_encoder', False) | |
if self.enable_encoder: | |
encoder_latent_dim = cfgs.get('latent_dim', 256) | |
encoder_pretrained = cfgs.get('encoder_pretrained', False) | |
encoder_frozen = cfgs.get('encoder_frozen', False) | |
encoder_arch = cfgs.get('encoder_arch', 'simple') | |
in_image_size = cfgs.get('in_image_size', 256) | |
self.dino_feature_input = cfgs.get('dino_feature_input', False) | |
dino_feature_dim = cfgs.get('dino_feature_dim', 64) | |
if encoder_arch == 'simple': | |
if self.dino_feature_input: | |
self.netEncoder = networks.EncoderWithDINO(cin_rgb=3, cin_dino=dino_feature_dim, cout=encoder_latent_dim, in_size=in_image_size, zdim=None, nf=64, activation=None) | |
else: | |
self.netEncoder = networks.Encoder(cin=3, cout=encoder_latent_dim, in_size=in_image_size, zdim=None, nf=64, activation=None) | |
elif encoder_arch == 'vgg': | |
self.netEncoder = networks.VGGEncoder(cout=encoder_latent_dim, pretrained=encoder_pretrained) | |
elif encoder_arch == 'resnet': | |
self.netEncoder = networks.ResnetEncoder(cout=encoder_latent_dim, pretrained=encoder_pretrained) | |
elif encoder_arch == 'vit': | |
which_vit = cfgs.get('which_vit', 'dino_vits8') | |
vit_final_layer_type = cfgs.get('vit_final_layer_type', 'conv') | |
root_dir = cfgs.get('root_dir', '/root') | |
self.netEncoder = networks.ViTEncoder(cout=encoder_latent_dim, which_vit=which_vit, pretrained=encoder_pretrained, frozen=encoder_frozen, in_size=in_image_size, final_layer_type=vit_final_layer_type, root=root_dir) | |
else: | |
raise NotImplementedError | |
else: | |
encoder_latent_dim = 0 | |
mlp_hidden_size = cfgs.get('hidden_size', 64) | |
bsdf = cfgs.get("bsdf", 'diffuse') | |
num_layers_tex = cfgs.get("num_layers_tex", 5) | |
feat_dim = cfgs.get("latent_dim", 64) if self.enable_encoder else 0 | |
perturb_normal = cfgs.get("perturb_normal", False) | |
sym_texture = cfgs.get("sym_texture", False) | |
kd_min = torch.FloatTensor(cfgs.get('kd_min', [0., 0., 0., 0.])) | |
kd_max = torch.FloatTensor(cfgs.get('kd_max', [1., 1., 1., 1.])) | |
ks_min = torch.FloatTensor(cfgs.get('ks_min', [0., 0., 0.])) | |
ks_max = torch.FloatTensor(cfgs.get('ks_max', [0., 0., 0.])) | |
nrm_min = torch.FloatTensor(cfgs.get('nrm_min', [-1., -1., 0.])) | |
nrm_max = torch.FloatTensor(cfgs.get('nrm_max', [1., 1., 1.])) | |
mlp_min = torch.cat((kd_min[0:3], ks_min, nrm_min), dim=0) | |
mlp_max = torch.cat((kd_max[0:3], ks_max, nrm_max), dim=0) | |
min_max = torch.stack((mlp_min, mlp_max), dim=0) | |
out_chn = 9 | |
# TODO: if the tet verts are deforming, we need to recompute tet_bbox | |
texture_mode = cfgs.get("texture_mode", 'mlp') | |
if texture_mode == 'mlpnv': | |
self.netTexture = mlptexture.MLPTexture3D(tet_bbox, channels=out_chn, internal_dims=mlp_hidden_size, hidden=num_layers_tex-1, feat_dim=feat_dim, min_max=min_max, bsdf=bsdf, perturb_normal=perturb_normal, symmetrize=sym_texture) | |
elif texture_mode == 'mlp': | |
embedder_scaler = 2 * np.pi / self.grid_scale * 0.9 # originally (-0.5*s, 0.5*s) rescale to (-pi, pi) * 0.9 | |
embed_concat_pts = cfgs.get('embed_concat_pts', True) | |
self.texture_way = cfgs.get('texture_way', None) | |
if self.texture_way is None: | |
texture_act = cfgs.get('texture_act', 'relu') | |
texture_bias = cfgs.get('texture_bias', False) | |
self.netTexture = networks.MLPTextureSimple( | |
3, # x, y, z coordinates | |
out_chn, | |
num_layers_tex, | |
nf=mlp_hidden_size, | |
dropout=0, | |
activation="sigmoid", | |
min_max=min_max, | |
n_harmonic_functions=cfgs.get('embedder_freq_tex', 10), | |
omega0=embedder_scaler, | |
extra_dim=feat_dim, | |
embed_concat_pts=embed_concat_pts, | |
perturb_normal=perturb_normal, | |
symmetrize=sym_texture, | |
texture_act=texture_act, | |
linear_bias=texture_bias | |
) | |
else: | |
self.netTexture = networks.MLPTextureTriplane( | |
3, # x, y, z coordinates | |
out_chn, | |
num_layers_tex, | |
nf=mlp_hidden_size, | |
dropout=0, | |
activation="sigmoid", | |
min_max=min_max, | |
n_harmonic_functions=cfgs.get('embedder_freq_tex', 10), | |
omega0=embedder_scaler, | |
extra_dim=feat_dim, | |
embed_concat_pts=embed_concat_pts, | |
perturb_normal=perturb_normal, | |
symmetrize=sym_texture, | |
texture_act='relu', | |
linear_bias=False, | |
cam_pos_z_offset=cfgs.get('cam_pos_z_offset', 10.), | |
grid_scale=self.grid_scale | |
) | |
# if 'lift' in self.texture_way: | |
# # GET3D use global feature to get a tri-plane | |
# self.netTexture = TriPlaneTex( | |
# w_dim=512, | |
# img_channels=out_chn, | |
# tri_plane_resolution=256, | |
# device=cfgs.get('device', 'cpu'), | |
# mlp_latent_channel=32, | |
# n_implicit_layer=1, | |
# feat_dim=256, | |
# n_mapping_layer=8, | |
# sym_texture=sym_texture, | |
# grid_scale=self.grid_scale, | |
# min_max=min_max, | |
# perturb_normal=perturb_normal | |
# ) | |
# # # project the local feature map into a grid | |
# # self.netTexture = networks.LiftTexture( | |
# # 3, # x, y, z coordinates | |
# # out_chn, | |
# # num_layers_tex, | |
# # nf=mlp_hidden_size, | |
# # dropout=0, | |
# # activation="sigmoid", | |
# # min_max=min_max, | |
# # n_harmonic_functions=cfgs.get('embedder_freq_tex', 10), | |
# # omega0=embedder_scaler, | |
# # extra_dim=feat_dim, | |
# # embed_concat_pts=embed_concat_pts, | |
# # perturb_normal=perturb_normal, | |
# # symmetrize=sym_texture, | |
# # texture_way=self.texture_way, | |
# # cam_pos_z_offset=cfgs.get('cam_pos_z_offset', 10.), | |
# # grid_scale=self.grid_scale, | |
# # local_feat_dim=cfgs.get("lift_local_feat_dim", 128), | |
# # grid_size=cfgs.get("lift_grid_size", 32), | |
# # optim_latent=cfgs.get("lift_optim_latent", False) | |
# # ) | |
# else: | |
# # a texture mlp with local feature map from patch_out | |
# self.netTexture = networks.MLPTextureLocal( | |
# 3, # x, y, z coordinates | |
# out_chn, | |
# num_layers_tex, | |
# nf=mlp_hidden_size, | |
# dropout=0, | |
# activation="sigmoid", | |
# min_max=min_max, | |
# n_harmonic_functions=cfgs.get('embedder_freq_tex', 10), | |
# omega0=embedder_scaler, | |
# extra_dim=feat_dim, | |
# embed_concat_pts=embed_concat_pts, | |
# perturb_normal=perturb_normal, | |
# symmetrize=sym_texture, | |
# texture_way=self.texture_way, | |
# larger_tex_dim=cfgs.get('larger_tex_dim', False), | |
# cam_pos_z_offset=cfgs.get('cam_pos_z_offset', 10.), | |
# grid_scale=self.grid_scale | |
# ) | |
self.rot_rep = cfgs.get('rot_rep', 'euler_angle') | |
self.enable_pose = cfgs.get('enable_pose', False) | |
if self.enable_pose: | |
cam_pos_z_offset = cfgs.get('cam_pos_z_offset', 10.) | |
fov = cfgs.get('crop_fov_approx', 25) | |
half_range = np.tan(fov /2 /180 * np.pi) * cam_pos_z_offset # 2.22 | |
self.max_trans_xy_range = half_range * cfgs.get('max_trans_xy_range_ratio', 1.) | |
self.max_trans_z_range = half_range * cfgs.get('max_trans_z_range_ratio', 1.) | |
self.lookat_init = cfgs.get('lookat_init', None) | |
self.lookat_zeroy = cfgs.get('lookat_zeroy', False) | |
self.rot_temp_scalar = cfgs.get('rot_temp_scalar', 1.) | |
self.naive_probs_iter = cfgs.get('naive_probs_iter', 2000) | |
self.best_pose_start_iter = cfgs.get('best_pose_start_iter', 6000) | |
if self.rot_rep == 'euler_angle': | |
pose_cout = 6 | |
elif self.rot_rep == 'quaternion': | |
pose_cout = 7 | |
elif self.rot_rep == 'lookat': | |
pose_cout = 6 | |
elif self.rot_rep == 'quadlookat': | |
self.num_pose_hypos = 4 | |
pose_cout = (3 + 1) * self.num_pose_hypos + 3 # 4 forward vectors for 4 quadrants, 4 quadrant classification logits, 3 for translation | |
self.orthant_signs = torch.FloatTensor([[1,1,1], [-1,1,1], [-1,1,-1], [1,1,-1]]) | |
elif self.rot_rep == 'octlookat': | |
self.num_pose_hypos = 8 | |
pose_cout = (3 + 1) * self.num_pose_hypos + 3 # 4 forward vectors for 8 octants, 8 octant classification logits, 3 for translation | |
self.orthant_signs = torch.stack(torch.meshgrid([torch.arange(1, -2, -2)] *3), -1).view(-1, 3) # 8x3 | |
else: | |
raise NotImplementedError | |
self.pose_arch = cfgs.get('pose_arch', 'mlp') | |
if self.pose_arch == 'mlp': | |
num_layers_pose = cfgs.get('num_layers_pose', 5) | |
self.netPose = networks.MLP( | |
encoder_latent_dim, | |
pose_cout, | |
num_layers_pose, | |
nf=mlp_hidden_size, | |
dropout=0, | |
activation=None | |
) | |
elif self.pose_arch == 'encoder': | |
if self.dino_feature_input: | |
dino_feature_dim = cfgs.get('dino_feature_dim', 64) | |
self.netPose = networks.EncoderWithDINO(cin_rgb=3, cin_dino=dino_feature_dim, cout=pose_cout, in_size=in_image_size, zdim=None, nf=64, activation=None) | |
else: | |
self.netPose = networks.Encoder(cin=3, cout=pose_cout, in_size=in_image_size, zdim=None, nf=64, activation=None) | |
elif self.pose_arch in ['encoder_dino_patch_out', 'encoder_dino_patch_key']: | |
if which_vit == 'dino_vits8': | |
dino_feat_dim = 384 | |
elif which_vit == 'dinov2_vits14': | |
dino_feat_dim = 384 | |
elif which_vit == 'dino_vitb8': | |
dino_feat_dim = 768 | |
self.netPose = networks.Encoder32(cin=dino_feat_dim, cout=pose_cout, nf=256, activation=None) | |
elif self.pose_arch == 'vit': | |
encoder_pretrained = cfgs.get('encoder_pretrained', False) | |
encoder_frozen = cfgs.get('encoder_frozen', False) | |
which_vit = cfgs.get('which_vit', 'dino_vits8') | |
vit_final_layer_type = cfgs.get('vit_final_layer_type', 'conv') | |
root_dir = cfgs.get('root_dir', '/root') | |
self.netPose = networks.ViTEncoder(cout=encoder_latent_dim, which_vit=which_vit, pretrained=encoder_pretrained, frozen=encoder_frozen, in_size=in_image_size, final_layer_type=vit_final_layer_type, root=root_dir) | |
else: | |
raise NotImplementedError | |
self.enable_deform = cfgs.get('enable_deform', False) | |
if self.enable_deform: | |
embedder_scaler = 2 * np.pi / self.grid_scale * 0.9 # originally (-0.5*s, 0.5*s) rescale to (-pi, pi) * 0.9 | |
embed_concat_pts = cfgs.get('embed_concat_pts', True) | |
num_layers_deform = cfgs.get('num_layers_deform', 5) | |
self.deform_epochs = np.arange(*cfgs.get('deform_epochs', [0, 0])) | |
sym_deform = cfgs.get("sym_deform", False) | |
self.netDeform = networks.MLPWithPositionalEncoding( | |
3, # x, y, z coordinates | |
3, # dx, dy, dz deformation | |
num_layers_deform, | |
nf=mlp_hidden_size, | |
dropout=0, | |
activation=None, | |
n_harmonic_functions=cfgs.get('embedder_freq_deform', 10), | |
omega0=embedder_scaler, | |
extra_dim=encoder_latent_dim, | |
embed_concat_pts=embed_concat_pts, | |
symmetrize=sym_deform | |
) | |
# self.avg_deform = cfgs.get('avg_deform', False) | |
# print(f'********avg_deform: {self.avg_deform}********') | |
self.enable_articulation = cfgs.get('enable_articulation', False) | |
if self.enable_articulation: | |
self.num_body_bones = cfgs.get('num_body_bones', 4) | |
self.articulation_multiplier = cfgs.get('articulation_multiplier', 1) | |
self.static_root_bones = cfgs.get('static_root_bones', False) | |
self.skinning_temperature = cfgs.get('skinning_temperature', 1) | |
self.articulation_epochs = np.arange(*cfgs.get('articulation_epochs', [0, 0])) | |
self.num_legs = cfgs.get('num_legs', 0) | |
self.num_leg_bones = cfgs.get('num_leg_bones', 0) | |
self.body_bones_type = cfgs.get('body_bones_type', 'z_minmax') | |
self.perturb_articulation_epochs = np.arange(*cfgs.get('perturb_articulation_epochs', [0, 0])) | |
self.num_bones = self.num_body_bones + self.num_legs * self.num_leg_bones | |
self.constrain_legs = cfgs.get('constrain_legs', False) | |
self.attach_legs_to_body_epochs = np.arange(*cfgs.get('attach_legs_to_body_epochs', [0, 0])) | |
self.max_arti_angle = cfgs.get('max_arti_angle', 60) | |
num_layers_arti = cfgs.get('num_layers_arti', 5) | |
which_vit = cfgs.get('which_vit', 'dino_vits8') | |
if which_vit == 'dino_vits8': | |
dino_feat_dim = 384 | |
elif which_vit == 'dino_vitb8': | |
dino_feat_dim = 768 | |
self.articulation_arch = cfgs.get('articulation_arch', 'mlp') | |
self.articulation_feature_mode = cfgs.get('articulation_feature_mode', 'sample') | |
embedder_freq_arti = cfgs.get('embedder_freq_arti', 8) | |
if self.articulation_feature_mode == 'global': | |
feat_dim = encoder_latent_dim | |
elif self.articulation_feature_mode == 'sample': | |
feat_dim = dino_feat_dim | |
elif self.articulation_feature_mode == 'sample+global': | |
feat_dim = encoder_latent_dim + dino_feat_dim | |
if self.articulation_feature_mode == 'attention': | |
arti_feat_attn_zdim = cfgs.get('arti_feat_attn_zdim', 128) | |
pos_dim = 1 + 2 + 3*2 | |
self.netFeatureAttn = networks.FeatureAttention(which_vit, pos_dim, embedder_freq_arti, arti_feat_attn_zdim, img_size=in_image_size) | |
embedder_scaler = np.pi * 0.9 # originally (-1, 1) rescale to (-pi, pi) * 0.9 | |
enable_articulation_idadd = cfgs.get('enable_articulation_idadd', False) | |
self.netArticulation = networks.ArticulationNetwork(self.articulation_arch, feat_dim, 1+2+3*2, num_layers_arti, mlp_hidden_size, n_harmonic_functions=embedder_freq_arti, omega0=embedder_scaler, | |
enable_articulation_idadd=enable_articulation_idadd) | |
self.kinematic_tree_epoch = -1 | |
self.enable_lighting = cfgs.get('enable_lighting', False) | |
if self.enable_lighting: | |
num_layers_light = cfgs.get('num_layers_light', 5) | |
amb_diff_min = torch.FloatTensor(cfgs.get('amb_diff_min', [0., 0.])) | |
amb_diff_max = torch.FloatTensor(cfgs.get('amb_diff_max', [1., 1.])) | |
intensity_min_max = torch.stack((amb_diff_min, amb_diff_max), dim=0) | |
self.netLight = light.DirectionalLight(encoder_latent_dim, num_layers_light, mlp_hidden_size, intensity_min_max=intensity_min_max) | |
self.cam_pos_z_offset = cfgs.get('cam_pos_z_offset', 10.) | |
self.crop_fov_approx = cfgs.get("crop_fov_approx", 25) | |
self.temp_clip_low = cfgs.get('temp_clip_low', 1.) | |
self.temp_clip_high = cfgs.get('temp_clip_high', 100.) | |
# if the articulation and deformation is set as iterations, then use iteration to decide, not epoch | |
self.iter_articulation_start = cfgs.get('iter_articulation_start', None) | |
self.iter_deformation_start = cfgs.get('iter_deformation_start', None) | |
self.iter_nozeroy_start = cfgs.get('iter_nozeroy_start', None) | |
self.iter_attach_leg_to_body_start = cfgs.get('iter_attach_leg_to_body_start', None) | |
def forward_encoder(self, images, dino_features=None): | |
images_in = images.view(-1, *images.shape[2:]) * 2 - 1 # rescale to (-1, 1) | |
patch_out = patch_key = None | |
if self.dino_feature_input and self.cfgs.get('encoder_arch', 'simple') != 'vit': | |
dino_features_in = dino_features.view(-1, *dino_features.shape[2:]) * 2 - 1 # rescale to (-1, 1) | |
feat_out = self.netEncoder(images_in, dino_features_in) # Shape: (B, latent_dim) | |
elif self.cfgs.get('encoder_arch', 'simple') == 'vit': | |
feat_out, feat_key, patch_out, patch_key = self.netEncoder(images_in, return_patches=True) | |
else: | |
feat_out = self.netEncoder(images_in) # Shape: (B, latent_dim) | |
return feat_out, feat_key, patch_out, patch_key | |
def forward_pose(self, images, feat, patch_out, patch_key, dino_features, iteration=None):
    """Predict the raw pose vector (rotation channels followed by translation).

    The input fed to ``self.netPose`` depends on ``self.pose_arch``:
    ``'mlp'`` uses ``feat``; ``'encoder'`` uses the rescaled images (plus the
    rescaled DINO features when ``self.dino_feature_input`` is set);
    ``'vit'`` uses the rescaled images; ``'encoder_dino_patch_out'`` and
    ``'encoder_dino_patch_key'`` use ``patch_out`` / ``patch_key``.

    Args:
        images: input frames; the first two dimensions are flattened and the
            values are mapped with ``x * 2 - 1`` before use.
        feat: feature passed to the pose network for ``pose_arch == 'mlp'``.
        patch_out: input for ``pose_arch == 'encoder_dino_patch_out'``.
        patch_key: input for ``pose_arch == 'encoder_dino_patch_key'``.
        dino_features: only read for ``pose_arch == 'encoder'`` with
            ``self.dino_feature_input`` set.
        iteration: current training iteration. Only needed when
            ``self.rot_rep == 'euler_angle'`` and ``self.gradually_expand_yaw``
            is set; it drives the yaw pre-activation multiplier.

    Returns:
        ``torch.cat([rot_pred, trans_pred], -1)``. The rotation part is, by
        ``self.rot_rep``: three Euler angles in radians (``'euler_angle'``),
        a unit quaternion with non-negative real part (``'quaternion'``),
        a unit forward vector (``'lookat'``), or ``num_pose_hypos * 4``
        channels holding one logit and one unit forward vector per
        hypothesis (``'quadlookat'`` / ``'octlookat'``). The translation is
        the last three raw channels passed through ``tanh`` and scaled by
        the configured xy / z ranges.

    Raises:
        NotImplementedError: for an unknown ``pose_arch`` or ``rot_rep``.
        ValueError: if yaw expansion is enabled but ``iteration`` is missing.
    """
    if self.pose_arch == 'mlp':
        pose = self.netPose(feat)
    elif self.pose_arch == 'encoder':
        images_in = images.view(-1, *images.shape[2:]) * 2 - 1  # rescale to (-1, 1)
        if self.dino_feature_input:
            dino_features_in = dino_features.view(-1, *dino_features.shape[2:]) * 2 - 1  # rescale to (-1, 1)
            pose = self.netPose(images_in, dino_features_in)  # Shape: (B, latent_dim)
        else:
            pose = self.netPose(images_in)  # Shape: (B, latent_dim)
    elif self.pose_arch == 'vit':
        images_in = images.view(-1, *images.shape[2:]) * 2 - 1  # rescale to (-1, 1)
        pose = self.netPose(images_in)
    elif self.pose_arch == 'encoder_dino_patch_out':
        pose = self.netPose(patch_out)  # Shape: (B, latent_dim)
    elif self.pose_arch == 'encoder_dino_patch_key':
        pose = self.netPose(patch_key)  # Shape: (B, latent_dim)
    else:
        raise NotImplementedError

    trans_range = torch.FloatTensor([self.max_trans_xy_range, self.max_trans_xy_range, self.max_trans_z_range]).to(pose.device)
    trans_pred = pose[..., -3:].tanh() * trans_range

    if self.rot_rep == 'euler_angle':
        multiplier = 1.
        if self.gradually_expand_yaw:
            if iteration is None:
                # The schedule cannot be evaluated without an iteration
                # counter; fail with a clear message instead of a NameError.
                raise ValueError("forward_pose needs `iteration` when gradually_expand_yaw is enabled")
            # x1.2 every 500 iterations, capped at 20000 (1.2**40 ~= 1469.8)
            multiplier *= 1.2 ** (min(iteration, 20000) // 500)
        rot_pred = torch.cat([pose[..., :1], pose[..., 1:2] * multiplier, pose[..., 2:3]], -1).tanh()
        rot_range = torch.FloatTensor([self.max_rot_x_range, self.max_rot_y_range, self.max_rot_z_range]).to(pose.device)
        rot_pred = rot_pred * rot_range / 180 * np.pi  # degrees -> radians

    elif self.rot_rep == 'quaternion':
        quat_init = torch.FloatTensor([0.01, 0, 0, 0]).to(pose.device)
        rot_pred = pose[..., :4] + quat_init
        rot_pred = nn.functional.normalize(rot_pred, p=2, dim=-1)
        # Make the real part non-negative. Multiplying by sign() would zero
        # the whole quaternion when the real part is exactly 0, so flip only
        # the strictly negative ones.
        rot_pred = torch.where(rot_pred[..., :1] < 0, -rot_pred, rot_pred)

    elif self.rot_rep == 'lookat':
        vec_forward_raw = pose[..., :3]
        if self.lookat_init is not None:
            vec_forward_raw = vec_forward_raw + torch.FloatTensor(self.lookat_init).to(pose.device)
        if self.lookat_zeroy:
            vec_forward_raw = vec_forward_raw * torch.FloatTensor([1, 0, 1]).to(pose.device)
        vec_forward_raw = nn.functional.normalize(vec_forward_raw, p=2, dim=-1)  # x right, y up, z forward
        rot_pred = vec_forward_raw

    elif self.rot_rep in ['quadlookat', 'octlookat']:
        rots_pred = pose[..., :self.num_pose_hypos * 4].view(-1, self.num_pose_hypos, 4)  # (N, K, 4)
        rots_logits = rots_pred[..., :1]
        vec_forward_raw = rots_pred[..., 1:4]
        xs, ys, zs = vec_forward_raw.unbind(-1)
        margin = 0.
        # softplus(0, beta) = log(2) / beta, so these betas map a zero raw
        # output to 0.5.
        xs = nn.functional.softplus(xs, beta=np.log(2) / (0.5 + margin)) - margin  # initialize to 0.5
        if self.rot_rep == 'octlookat':
            ys = nn.functional.softplus(ys, beta=np.log(2) / (0.5 + margin)) - margin  # initialize to 0.5
        if self.lookat_zeroy:
            ys = ys * 0
        zs = nn.functional.softplus(zs, beta=2 * np.log(2))  # initialize to 0.5
        vec_forward_raw = torch.stack([xs, ys, zs], -1)
        # One sign pattern per hypothesis places each vector in its orthant.
        vec_forward_raw = vec_forward_raw * self.orthant_signs.to(pose.device)
        vec_forward_raw = nn.functional.normalize(vec_forward_raw, p=2, dim=-1)  # x right, y up, z forward
        rot_pred = torch.cat([rots_logits, vec_forward_raw], -1).view(-1, self.num_pose_hypos * 4)

    else:
        raise NotImplementedError

    pose = torch.cat([rot_pred, trans_pred], -1)
    return pose
def forward_deformation(self, shape, feat=None, batch_size=None, num_frames=None):
    """Predict per-vertex offsets with ``self.netDeform`` and apply them.

    Args:
        shape: mesh-like object exposing ``v_pos`` (indexed here as
            ``(batch, num_verts, ...)``) and a ``deform(offsets)`` method.
        feat: optional conditioning feature. When given, it is tiled to one
            copy per vertex and the vertices are repeated ``len(feat)`` times
            along the first dimension.
        batch_size: unused; kept so existing callers keep working.
        num_frames: unused; kept so existing callers keep working.

    Returns:
        ``(shape, deformation)``: the result of ``shape.deform(deformation)``
        and the predicted offsets (network output scaled by 0.1).
    """
    original_verts = shape.v_pos
    num_verts = original_verts.shape[1]

    # ``feat`` is optional in the signature, so the conditioning input must
    # have a value on that path too (it was left unbound before).
    # NOTE(review): assumes netDeform accepts ``None`` as "no conditioning" -
    # confirm against its forward() signature.
    deform_feat = None
    if feat is not None:
        deform_feat = feat[:, None, :].repeat(1, num_verts, 1)  # Shape: (B, num_verts, latent_dim)
        original_verts = original_verts.repeat(len(feat), 1, 1)

    deformation = self.netDeform(original_verts, deform_feat) * 0.1  # Shape: (B, num_verts, 3)
    shape = shape.deform(deformation)
    return shape, deformation
def forward_articulation(self, shape, feat, patch_feat, mvp, w2c, batch_size, num_frames, epoch, category, total_iter=None):
    """
    Forward propagation of articulation. For each bone, the network takes: 1) the 3D location of the bone; 2) an image
    feature for the bone (global, sampled at the bone's projection, or attended, depending on
    ``self.articulation_feature_mode``); and 3) an encoding of the bone's index, and predicts the bone's rotation
    (represented by Euler angles).

    Args:
        shape: a Mesh object, whose v_pos has batch size BxF or 1.
        feat: global image feature, tiled per bone in the 'global' and 'sample+global' modes. Shape: (BxF, feat_dim)
        patch_feat: spatial feature map sampled with ``F.grid_sample`` in the 'sample' modes (so it must be 4-D,
            presumably (BxF, feat_dim, H, W)) or passed to ``self.netFeatureAttn`` in 'attention' mode.
        mvp: the model-view-projection matrix. Shape: (BxF, 4, 4)
        w2c: the world-to-camera matrix. Shape: (BxF, 4, 4)
        batch_size: B.
        num_frames: F.
        epoch: current epoch; drives the epoch-based schedules.
        category: unused in this method.
        total_iter: current iteration, or None when the caller schedules by epoch. Iteration-based schedules
            are only applied when it is given.

    Returns:
        shape: a Mesh object, whose v_pos has batch size BxF (collapsed).
        articulation_angles: the predicted bone rotations. Shape: (B, F, num_bones, 3)
        aux: a dictionary containing auxiliary information.

    Raises:
        NotImplementedError: for an unsupported ``body_bone_idx_preset`` type, feature mode or body-rotation mode.
        ValueError: if ``body_bone_idx_preset`` is iteration-keyed but ``total_iter`` is None.
    """
    verts = shape.v_pos
    if len(verts) == 1:
        verts = verts[None]
    else:
        verts = verts.view(batch_size, num_frames, *verts.shape[1:])

    if self.kinematic_tree_epoch != epoch:
        if total_iter is not None and self.iter_attach_leg_to_body_start is not None:
            attach_legs_to_body = total_iter > self.iter_attach_leg_to_body_start
        else:
            attach_legs_to_body = epoch in self.attach_legs_to_body_epochs

        bone_y_thresh = self.cfgs.get('bone_y_thresh', None)

        # trivial set here
        body_bone_idx_preset_cfg = self.cfgs.get('body_bone_idx_preset', [0, 0, 0, 0])
        if isinstance(body_bone_idx_preset_cfg, list):
            body_bone_idx_preset = body_bone_idx_preset_cfg
        elif isinstance(body_bone_idx_preset_cfg, dict):
            if total_iter is None:
                # The dict is keyed by iteration; without a counter the
                # comparison below would fail with an opaque TypeError.
                raise ValueError("an iteration-keyed body_bone_idx_preset requires total_iter")
            # Expects two entries: key 0 (used from the start) and a second
            # key giving the iteration after which its preset takes over.
            iter_point = list(body_bone_idx_preset_cfg.keys())[1]
            if total_iter <= iter_point:
                body_bone_idx_preset = body_bone_idx_preset_cfg[0]  # the first is start from 0 iter
            else:
                body_bone_idx_preset = body_bone_idx_preset_cfg[iter_point]
        else:
            raise NotImplementedError

        bones, self.kinematic_tree, self.bone_aux = estimate_bones(
            verts.detach(), self.num_body_bones,
            n_legs=self.num_legs, n_leg_bones=self.num_leg_bones,
            body_bones_type=self.body_bones_type,
            compute_kinematic_chain=True,
            attach_legs_to_body=attach_legs_to_body,
            bone_y_threshold=bone_y_thresh,
            body_bone_idx_preset=body_bone_idx_preset)
        # NOTE(review): self.kinematic_tree_epoch is not updated here (the
        # assignment was commented out), so this branch is taken again on the
        # next call unless something else sets it.
        # self.kinematic_tree_epoch = epoch
    else:
        bones = estimate_bones(
            verts.detach(), self.num_body_bones,
            n_legs=self.num_legs, n_leg_bones=self.num_leg_bones,
            body_bones_type=self.body_bones_type,
            compute_kinematic_chain=False, aux=self.bone_aux)

    bones_pos = bones  # Shape: (B, F, K, 2, 3)
    if batch_size > bones_pos.shape[0] or num_frames > bones_pos.shape[1]:
        assert bones_pos.shape[0] == 1 and bones_pos.shape[1] == 1, "If there is a mismatch, then there must be only one canonical mesh."
        bones_pos = bones_pos.repeat(batch_size, num_frames, 1, 1, 1)
    num_bones = bones_pos.shape[2]
    bones_pos = bones_pos.view(batch_size * num_frames, num_bones, 2, 3)  # NxKx2x3
    bones_mid_pos = bones_pos.mean(2)  # NxKx3
    bones_idx = torch.arange(num_bones).to(bones_pos.device)

    # Project the bone midpoints with the MVP matrix to get their 2D
    # locations (used for feature sampling and as a network input).
    bones_mid_pos_world4 = torch.cat([bones_mid_pos, torch.ones_like(bones_mid_pos[..., :1])], -1)  # NxKx4
    bones_mid_pos_clip4 = bones_mid_pos_world4 @ mvp.transpose(-1, -2)
    bones_mid_pos_uv = bones_mid_pos_clip4[..., :2] / bones_mid_pos_clip4[..., 3:4]
    bones_mid_pos_uv = bones_mid_pos_uv.detach()

    # Bone end points in camera space, shifted by the camera z offset.
    bones_pos_world4 = torch.cat([bones_pos, torch.ones_like(bones_pos[..., :1])], -1)  # NxKx2x4
    bones_pos_cam4 = bones_pos_world4 @ w2c[:, None].transpose(-1, -2)
    bones_pos_cam3 = bones_pos_cam4[..., :3] / bones_pos_cam4[..., 3:4]
    bones_pos_cam3 = bones_pos_cam3 + torch.FloatTensor([0, 0, self.cam_pos_z_offset]).to(bones_pos_cam3.device).view(1, 1, 1, 3)
    bones_pos_in = bones_pos_cam3.view(batch_size * num_frames, num_bones, 2 * 3) / self.grid_scale * 2  # (-1, 1), NxKx(2*3)

    bones_idx_in = ((bones_idx[None, :, None] + 0.5) / num_bones * 2 - 1).repeat(batch_size * num_frames, 1, 1)  # (-1, 1)
    bones_pos_in = torch.cat([bones_mid_pos_uv, bones_pos_in, bones_idx_in], -1).detach()

    if self.articulation_feature_mode == 'global':
        bones_patch_features = feat[:, None].repeat(1, num_bones, 1)  # (BxF, K, feat_dim)
    elif self.articulation_feature_mode == 'sample':
        bones_patch_features = F.grid_sample(patch_feat, bones_mid_pos_uv.view(batch_size * num_frames, 1, -1, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # (BxF, K, feat_dim)
    elif self.articulation_feature_mode == 'sample+global':
        bones_patch_features = F.grid_sample(patch_feat, bones_mid_pos_uv.view(batch_size * num_frames, 1, -1, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1)  # (BxF, K, feat_dim)
        bones_patch_features = torch.cat([feat[:, None].repeat(1, num_bones, 1), bones_patch_features], -1)
    elif self.articulation_feature_mode == 'attention':
        bones_patch_features = self.netFeatureAttn(bones_pos_in, patch_feat)
    else:
        raise NotImplementedError

    articulation_angles = self.netArticulation(bones_patch_features, bones_pos_in).view(batch_size, num_frames, num_bones, 3) * self.articulation_multiplier

    if self.static_root_bones:
        root_bones = [self.num_body_bones // 2 - 1, self.num_body_bones - 1]
        tmp_mask = torch.ones_like(articulation_angles)
        tmp_mask[:, :, root_bones] = 0
        articulation_angles = articulation_angles * tmp_mask

    articulation_angles = articulation_angles.tanh()

    # The iteration-based leg schedule only applies when an iteration counter
    # was supplied; on the epoch-scheduled path (total_iter is None) the
    # current self.constrain_legs value is used unchanged.
    leg_rotation_start = self.cfgs.get('iter_leg_rotation_start', -1)
    use_leg_schedule = leg_rotation_start > 0 and total_iter is not None
    if use_leg_schedule:
        if total_iter <= leg_rotation_start:
            self.constrain_legs = True
        else:
            self.constrain_legs = False

    if self.constrain_legs:
        half_leg_bones = self.num_leg_bones * self.num_legs // 2
        leg_bones_posx = [self.num_body_bones + i for i in range(half_leg_bones)]
        leg_bones_negx = [self.num_body_bones + half_leg_bones + i for i in range(half_leg_bones)]

        # Scale the third angle component of all leg bones by 0.3.
        tmp_mask = torch.zeros_like(articulation_angles)
        tmp_mask[:, :, leg_bones_posx + leg_bones_negx, 2] = 1
        articulation_angles = tmp_mask * (articulation_angles * 0.3) + (1 - tmp_mask) * articulation_angles  # no twist

        # Scale the second angle component of all leg bones by 0.3.
        tmp_mask = torch.zeros_like(articulation_angles)
        tmp_mask[:, :, leg_bones_posx + leg_bones_negx, 1] = 1
        articulation_angles = tmp_mask * (articulation_angles * 0.3) + (1 - tmp_mask) * articulation_angles  # limit side bending

    # new regularizations, for bottom 2 bones of each leg, they can only rotate around x-axis,
    # and for the toppest bone of legs, restrict its angles in a smaller range
    if use_leg_schedule and total_iter > leg_rotation_start:
        if self.cfgs.get('forbid_leg_rotate', False):
            if self.cfgs.get('small_leg_angle', False):
                # regularize the rotation angle of first leg bones
                # (hard-coded bone indices)
                leg_bones_top = [8, 11, 14, 17]
                tmp_mask = torch.zeros_like(articulation_angles)
                tmp_mask[:, :, leg_bones_top, 1] = 1
                tmp_mask[:, :, leg_bones_top, 2] = 1
                articulation_angles = tmp_mask * (articulation_angles * 0.05) + (1 - tmp_mask) * articulation_angles

            # Zero the second and third angle components of the lower leg
            # bones (hard-coded bone indices).
            leg_bones_bottom = [9, 10, 12, 13, 15, 16, 18, 19]
            tmp_mask = torch.ones_like(articulation_angles)
            tmp_mask[:, :, leg_bones_bottom, 1] = 0
            tmp_mask[:, :, leg_bones_bottom, 2] = 0
            articulation_angles = tmp_mask * articulation_angles

    if epoch in self.perturb_articulation_epochs:
        articulation_angles = articulation_angles + torch.randn_like(articulation_angles) * 0.1

    # tanh output in (-1, 1) -> radians, bounded by max_arti_angle degrees
    articulation_angles = articulation_angles * self.max_arti_angle / 180 * np.pi

    # check if regularize the leg-connecting body bones z-rotation first
    # then check if regularize all the body bones z-rotation
    # regularize z-rotation using 0.1 in pi-space
    body_rotate_mult = self.cfgs.get('reg_body_rotate_mult', 0.1)
    body_rotate_mult = body_rotate_mult * 180 * 1.0 / (self.max_arti_angle * np.pi)  # the max angle = mult*original_max_angle
    body_rotate_reg_mode = self.cfgs.get('body_rotate_reg_mode', 'nothing')
    body_bones_by_mode = {
        'leg-connect': [2, 3, 4, 5],
        'all-bones': [0, 1, 2, 3, 4, 5, 6, 7],
    }
    if body_rotate_reg_mode in body_bones_by_mode:
        body_bones_mask = body_bones_by_mode[body_rotate_reg_mode]
        tmp_body_mask = torch.zeros_like(articulation_angles)
        tmp_body_mask[:, :, body_bones_mask, 2] = 1
        articulation_angles = tmp_body_mask * (articulation_angles * body_rotate_mult) + (1 - tmp_body_mask) * articulation_angles
    elif body_rotate_reg_mode != 'nothing':
        raise NotImplementedError

    verts_articulated, aux = skinning(verts, bones, self.kinematic_tree, articulation_angles,
                                      output_posed_bones=True, temperature=self.skinning_temperature)
    verts_articulated = verts_articulated.view(batch_size * num_frames, *verts_articulated.shape[2:])

    # Rebuild the mesh with the articulated vertices; texture coordinates are
    # tiled when the incoming mesh had a different batch size.
    v_tex = shape.v_tex
    if len(v_tex) != len(verts_articulated):
        v_tex = v_tex.repeat(len(verts_articulated), 1, 1)
    shape = mesh.make_mesh(
        verts_articulated,
        shape.t_pos_idx,
        v_tex,
        shape.t_tex_idx,
        shape.material)
    return shape, articulation_angles, aux
def get_camera_extrinsics_from_pose(self, pose, znear=0.1, zfar=1000., crop_fov_approx=None, offset_extra=None):
    """Turn flat pose vectors into camera matrices.

    Args:
        pose: tensor indexed as ``(N, D)``; the first nine channels are read
            as a 3x3 rotation (transposed after reshaping) and the last three
            as the translation.
        znear: near plane passed to the projection helper.
        zfar: far plane passed to the projection helper.
        crop_fov_approx: field of view in degrees, either a Python number or
            a tensor (batched projection). Defaults to ``self.crop_fov_approx``.
        offset_extra: optional extra distance subtracted from the camera z
            offset in addition to ``self.cam_pos_z_offset``.

    Returns:
        ``(mvp, w2c, campos)``: projection @ world-to-camera, the Nx4x4
        world-to-camera matrices, and the ``(N, 3)`` camera positions.

    Raises:
        ValueError: if the field of view is neither a number nor a tensor.
    """
    fov = self.crop_fov_approx if crop_fov_approx is None else crop_fov_approx
    num_poses = len(pose)
    device = pose.device

    z_shift = -self.cam_pos_z_offset
    if offset_extra is not None:
        z_shift = z_shift - offset_extra
    cam_pos_offset = torch.FloatTensor([0, 0, z_shift]).to(device)

    rotation = pose[:, :9].view(num_poses, 3, 3).transpose(2, 1)
    translation = (pose[:, -3:] + cam_pos_offset[None, None, :]).view(num_poses, 3, 1)

    # Stack [R | t] on top of the homogeneous row [0, 0, 0, 1].
    upper_rows = torch.cat([rotation, translation], dim=2)  # Nx3x4
    bottom_row = torch.FloatTensor([0, 0, 0, 1]).repeat(num_poses, 1, 1).to(device)
    w2c = torch.cat([upper_rows, bottom_row], dim=1)  # Nx4x4

    # We assume the images are perfect square.
    if isinstance(fov, torch.Tensor):
        proj = util.batched_perspective(fov / 180 * np.pi, 1, znear, zfar).to(device)
    elif isinstance(fov, (float, int)):
        proj = util.perspective(fov / 180 * np.pi, 1, znear, zfar)[None].to(device)
    else:
        raise ValueError('crop_fov_approx must be float or torch.Tensor')

    mvp = torch.matmul(proj, w2c)
    campos = -torch.matmul(rotation.transpose(2, 1), translation).view(num_poses, 3)
    return mvp, w2c, campos
def forward(self, category=None, images=None, prior_shape=None, epoch=None, dino_features=None, dino_clusters=None, total_iter=None, is_training=True):
    """Run the instance pipeline: encode, predict pose, deform, articulate.

    Deformation and articulation are each gated by their enable flag and by
    a schedule: iteration-based when the matching ``iter_*_start`` attribute
    is set, otherwise epoch-based.

    Args:
        category: forwarded to ``forward_articulation``.
        images: input frames; the first two dimensions are read as batch
            size and number of frames.
        prior_shape: starting mesh that is deformed / articulated.
        epoch: current epoch, used by the epoch-based schedules.
        dino_features: forwarded to the encoder and the pose network.
        dino_clusters: not used here.
        total_iter: current iteration, used by the iteration-based schedules
            and by pose-hypothesis sampling.
        is_training: enables random sampling of the pose hypothesis.

    Returns:
        ``(shape, pose_raw, pose, mvp, w2c, campos, texture, feat_out,
        patch_key, deformation, arti_params, light, aux)``, with
        ``patch_out`` appended when ``self.texture_way`` is not None.

    Raises:
        NotImplementedError: if pose prediction is disabled.
    """
    batch_size, num_frames = images.shape[:2]

    feat_out = feat_key = patch_out = patch_key = None
    if self.enable_encoder:
        feat_out, feat_key, patch_out, patch_key = self.forward_encoder(images, dino_features)

    shape = prior_shape
    texture = self.netTexture

    # From this iteration on, stop zeroing the y component of look-at vectors.
    if self.iter_nozeroy_start is not None and total_iter >= self.iter_nozeroy_start:
        self.lookat_zeroy = False

    if not self.enable_pose:
        raise NotImplementedError

    poses_raw = self.forward_pose(images, feat_out, patch_out, patch_key, dino_features)
    sampled = sample_pose_hypothesis_from_quad_prediction(
        poses_raw, total_iter, batch_size, num_frames,
        rot_temp_scalar=self.rot_temp_scalar,
        num_hypos=self.num_pose_hypos,
        naive_probs_iter=self.naive_probs_iter,
        best_pose_start_iter=self.best_pose_start_iter,
        random_sample=is_training,
        temp_clip_low=self.temp_clip_low,
        temp_clip_high=self.temp_clip_high)
    pose_raw, pose, rot_idx, rot_prob, rot_logit, rots_probs, rand_pose_flag = sampled
    multi_hypothesis_aux = {
        'rot_idx': rot_idx,
        'rot_prob': rot_prob,
        'rot_logit': rot_logit,
        'rots_probs': rots_probs,
        'rand_pose_flag': rand_pose_flag,
    }

    mvp, w2c, campos = self.get_camera_extrinsics_from_pose(pose)

    deformation = None
    if self.enable_deform:
        if self.iter_deformation_start is not None:
            deform_now = total_iter >= self.iter_deformation_start
        else:
            deform_now = epoch in self.deform_epochs
        if deform_now:
            shape, deformation = self.forward_deformation(shape, feat_key, batch_size, num_frames)

    arti_params, articulation_aux = None, {}
    if self.enable_articulation:
        if self.iter_articulation_start is not None:
            articulate_now = total_iter >= self.iter_articulation_start
            arti_iter = total_iter
        else:
            # Epoch-scheduled articulation is called without an iteration.
            articulate_now = epoch in self.articulation_epochs
            arti_iter = None
        if articulate_now:
            shape, arti_params, articulation_aux = self.forward_articulation(
                shape, feat_key, patch_key, mvp, w2c, batch_size, num_frames, epoch, category, total_iter=arti_iter)

    light_net = self.netLight if self.enable_lighting else None

    aux = articulation_aux
    aux.update(multi_hypothesis_aux)

    outputs = (shape, pose_raw, pose, mvp, w2c, campos, texture, feat_out, patch_key, deformation, arti_params, light_net, aux)
    # if using texture_way to control a local texture, output patch_out
    if self.texture_way is not None:
        outputs = outputs + (patch_out,)
    return outputs
class Unsup3DDDP: | |
def __init__(self, cfgs):
    """Build the trainer state from the config mapping ``cfgs``.

    Every option is read with ``cfgs.get(key, default)``. Besides plain
    settings this constructs the prior / instance predictors, the smoothness
    losses, the rasterizer context and, depending on the flags, the LPIPS
    loss, CLIP model, mask / texture discriminators and the diffusion
    guidance model.

    Args:
        cfgs: mapping of configuration values (only ``.get`` is used).
    """
    self.cfgs = cfgs
    self.device = cfgs.get('device', 'cpu')
    self.in_image_size = cfgs.get('in_image_size', 128)
    self.out_image_size = cfgs.get('out_image_size', 128)

    self.num_epochs = cfgs.get('num_epochs', 10)
    self.lr = cfgs.get('lr', 1e-4)
    self.use_scheduler = cfgs.get('use_scheduler', False)
    if self.use_scheduler:
        scheduler_milestone = cfgs.get('scheduler_milestone', [1, 2, 3, 4, 5])
        scheduler_gamma = cfgs.get('scheduler_gamma', 0.5)
        self.make_scheduler = lambda optim: torch.optim.lr_scheduler.MultiStepLR(optim, milestones=scheduler_milestone, gamma=scheduler_gamma)

    self.cam_pos_z_offset = cfgs.get('cam_pos_z_offset', 10.)
    self.full_size_h = cfgs.get('full_size_h', 1080)
    self.full_size_w = cfgs.get('full_size_w', 1920)
    self.crop_fov_approx = cfgs.get("crop_fov_approx", 25)
    self.mesh_regularization_mode = cfgs.get('mesh_regularization_mode', 'seq')

    self.enable_prior = cfgs.get('enable_prior', False)
    if self.enable_prior:
        self.netPrior = PriorPredictor(self.cfgs)  # DOR - add label
        self.prior_lr = cfgs.get('prior_lr', self.lr)
        self.prior_weight_decay = cfgs.get('prior_weight_decay', 0.)
        self.prior_only_epochs = cfgs.get('prior_only_epochs', 0)
    # The instance predictor takes its bounding box from the prior network,
    # so self.netPrior must exist at this point.
    self.netInstance = InstancePredictor(self.cfgs, tet_bbox=self.netPrior.netShape.getAABB())
    self.perturb_sdf = cfgs.get('perturb_sdf', False)
    self.blur_mask = cfgs.get('blur_mask', False)
    self.blur_mask_iter = cfgs.get('blur_mask_iter', 1)

    # Epoch schedules are given as np.arange arguments, e.g. [start, stop].
    self.seqshape_epochs = np.arange(*cfgs.get('seqshape_epochs', [0, self.num_epochs]))
    self.avg_texture_epochs = np.arange(*cfgs.get('avg_texture_epochs', [0, 0]))
    self.swap_texture_epochs = np.arange(*cfgs.get('swap_texture_epochs', [0, 0]))
    self.swap_priorshape_epochs = np.arange(*cfgs.get('swap_priorshape_epochs', [0, 0]))
    self.avg_seqshape_epochs = np.arange(*cfgs.get('avg_seqshape_epochs', [0, 0]))
    self.swap_seqshape_epochs = np.arange(*cfgs.get('swap_seqshape_epochs', [0, 0]))
    self.pose_epochs = np.arange(*cfgs.get('pose_epochs', [0, 0]))
    self.pose_iters = cfgs.get('pose_iters', 0)
    self.deform_type = cfgs.get('deform_type', None)
    self.mesh_reg_decay_epoch = cfgs.get('mesh_reg_decay_epoch', 0)
    self.sdf_reg_decay_start_iter = cfgs.get('sdf_reg_decay_start_iter', 0)
    self.mesh_reg_decay_rate = cfgs.get('mesh_reg_decay_rate', 1)
    self.texture_epochs = np.arange(*cfgs.get('texture_epochs', [0, self.num_epochs]))
    self.zflip_epochs = np.arange(*cfgs.get('zflip_epochs', [0, self.num_epochs]))
    self.lookat_zflip_loss_epochs = np.arange(*cfgs.get('lookat_zflip_loss_epochs', [0, self.num_epochs]))
    self.lookat_zflip_no_other_losses = cfgs.get('lookat_zflip_no_other_losses', False)
    self.flow_loss_epochs = np.arange(*cfgs.get('flow_loss_epochs', [0, self.num_epochs]))
    self.sdf_inflate_reg_loss_epochs = np.arange(*cfgs.get('sdf_inflate_reg_loss_epochs', [0, self.num_epochs]))
    self.arti_reg_loss_epochs = np.arange(*cfgs.get('arti_reg_loss_epochs', [0, self.num_epochs]))
    self.background_mode = cfgs.get('background_mode', 'background')
    self.shape_prior_type = cfgs.get('shape_prior_type', 'deform')
    self.backward_prior = cfgs.get('backward_prior', True)
    self.resume_prior_optim = cfgs.get('resume_prior_optim', True)
    self.dmtet_grid_smaller_epoch = cfgs.get('dmtet_grid_smaller_epoch', 0)
    self.dmtet_grid_smaller = cfgs.get('dmtet_grid_smaller', 128)
    self.dmtet_grid = cfgs.get('dmtet_grid', 256)
    self.pose_xflip_recon_epochs = np.arange(*cfgs.get('pose_xflip_recon_epochs', [0, 0]))
    self.rot_rand_quad_epochs = np.arange(*cfgs.get('rot_rand_quad_epochs', [0, 0]))
    self.rot_all_quad_epochs = np.arange(*cfgs.get('rot_all_quad_epochs', [0, 0]))
    self.calc_dino_features = cfgs.get('calc_dino_features', False)

    ## smooth losses
    # One group of settings per smoothed quantity: articulation, deformation,
    # camera pose, articulation velocity, bone, bone velocity. For each
    # prefix this sets <prefix>_smooth_type, <prefix>_smooth_loss_type,
    # <prefix>_smooth_loss_weight, using_<prefix>_smooth_loss and
    # <prefix>_smooth_loss_fn (None when the loss is not in use).
    for prefix in ('arti', 'deform', 'campos', 'artivel', 'bone', 'bonevel'):
        smooth_type = cfgs.get(f'{prefix}_smooth_type', None)
        loss_type = cfgs.get(f'{prefix}_smooth_loss_type', None)
        loss_weight = cfgs.get(f'{prefix}_smooth_loss_weight', 0.)
        using_loss = smooth_type and loss_type and loss_weight > 0.
        if using_loss:
            loss_fn = SmoothLoss(dim=1, smooth_type=smooth_type, loss_type=loss_type)
        else:
            loss_fn = None
        setattr(self, f'{prefix}_smooth_type', smooth_type)
        setattr(self, f'{prefix}_smooth_loss_type', loss_type)
        setattr(self, f'{prefix}_smooth_loss_weight', loss_weight)
        setattr(self, f'using_{prefix}_smooth_loss', using_loss)
        setattr(self, f'{prefix}_smooth_loss_fn', loss_fn)

    ## perceptual loss
    if cfgs.get('perceptual_loss_weight', 0.) > 0:
        self.perceptual_loss_use_lin = cfgs.get('perceptual_loss_use_lin', True)
        self.perceptual_loss = lpips.LPIPS(net='vgg', lpips=self.perceptual_loss_use_lin)

    self.glctx = dr.RasterizeGLContext()
    self.render_flow = self.cfgs.get('flow_loss_weight', 0.) > 0.
    self.extra_renders = cfgs.get('extra_renders', [])
    self.renderer_spp = cfgs.get('renderer_spp', 1)
    self.dino_feature_recon_dim = cfgs.get('dino_feature_recon_dim', 64)

    self.total_loss = 0.
    self.all_scores = torch.Tensor()
    self.checkpoint_dir = cfgs.get('checkpoint_dir', 'results')

    # iter
    self.iter_arti_reg_loss_start = cfgs.get('iter_arti_reg_loss_start', None)

    # mask distribution
    self.enable_mask_distribution = cfgs.get('enable_mask_distribution', False)
    self.random_mask_law = cfgs.get('random_mask_law', 'batch_swap_noy')  # batch_swap, batch_swap_noy, # random_azimuth # random_all
    self.mask_distribution_path = cfgs.get('mask_distribution_path', None)
    if self.enable_mask_distribution and (self.mask_distribution_path is not None):
        self.class_mask_distribution = {}
        for category in os.listdir(self.mask_distribution_path):
            # Here we assume the category names are identical
            category_dir = osp.join(self.mask_distribution_path, category)
            if not osp.isdir(category_dir):
                # Stray files next to the category folders are not categories.
                continue
            distribution_file = osp.join(category_dir, "raw_mask_distribution.npy")
            self.class_mask_distribution[category] = np.load(distribution_file)  # [256, 256]
        self.mask_distribution_loss_weight = cfgs.get("mask_distribution_loss_weight", 0.1)
        self.mask_distribution_loss_freq = cfgs.get("mask_distribution_loss_freq", 1)
        self.mask_distribution_average = cfgs.get("mask_distribution_average", False)
    else:
        # Without a path there is nothing to load, so the feature is off.
        self.enable_mask_distribution = False

    self.enable_clip = cfgs.get('enable_clip', False)
    if self.enable_clip:
        self.clip_model, _ = clip.load('ViT-B/32', self.device)
        self.clip_model = self.clip_model.eval().requires_grad_(False)
        self.clip_mean = [0.48145466, 0.4578275, 0.40821073]
        self.clip_std = [0.26862954, 0.26130258, 0.27577711]
        self.clip_reso = 224
        self.clip_render_size = 64
        self.enable_clip_text = cfgs.get('enable_clip_text', False)
        if self.enable_clip_text:
            # Pre-compute one text embedding per known category.
            self.clip_text_feature = {}
            for category_name in ['bear', 'elephant', 'horse', 'sheep', 'cow', 'zebra', 'giraffe']:
                text_input = clip.tokenize(['A photo of ' + category_name]).to(self.device)
                text_feature = self.clip_model.encode_text(text_input).detach()  # [1, 512]
                self.clip_text_feature.update({category_name: text_feature})

    self.enable_disc = cfgs.get('enable_disc', False)
    if self.enable_disc:
        self.mask_discriminator_iter = cfgs.get('mask_discriminator_iter', [0, 0])
        # this module is not in netInstance or netPrior
        self.mask_disc_feat_condition = cfgs.get('mask_disc_feat_condition', False)
        if self.mask_disc_feat_condition:
            self.mask_disc = discriminator_architecture.DCDiscriminator(in_dim=(cfgs.get('dim_of_classes', 128) + 1)).to(self.device)
        else:
            # One input channel per category known to the prior, plus one.
            self.mask_disc = discriminator_architecture.DCDiscriminator(in_dim=(len(list(self.netPrior.category_id_map.keys())) + 1)).to(self.device)
        self.disc_gt = cfgs.get('disc_gt', True)
        self.disc_iv = cfgs.get('disc_iv', False)  # whether to use input view render in disc loss
        self.disc_iv_label = cfgs.get('disc_iv_label', 'Fake')
        self.disc_reg_mul = cfgs.get('disc_reg_mul', 10.)
        self.record_mask_gt = None
        self.record_mask_iv = None
        self.record_mask_rv = None
        self.discriminator_loss = 0.
        self.discriminator_loss_weight = cfgs.get('discriminator_loss_weight', 0.1)

    # the local texture for fine-tune process stage
    if (self.cfgs.get('texture_way', None) is not None) or self.cfgs.get('gan_tex', False):
        if self.cfgs.get('gan_tex', False):
            self.few_shot_gan_tex = True
            self.few_shot_gan_tex_reso = self.cfgs.get('few_shot_gan_tex_reso', 64)  # used to render novel view, will upsample to out_image_size ASAP
            self.few_shot_gan_tex_patch = self.cfgs.get('few_shot_gan_tex_patch', 0)  # used to sample patch size on out_image_size image
            if self.few_shot_gan_tex_patch > 0:
                self.few_shot_gan_tex_patch_max = self.cfgs.get('few_shot_gan_tex_patch_max', 128)
                assert self.few_shot_gan_tex_patch_max > self.few_shot_gan_tex_patch
                self.few_shot_gan_tex_patch_num = self.cfgs.get('few_shot_gan_tex_patch_num', 1)
                self.discriminator_texture = discriminator_architecture.DCDiscriminator(in_dim=3, img_size=self.few_shot_gan_tex_patch).to(self.device)
            else:
                self.discriminator_texture = discriminator_architecture.DCDiscriminator(in_dim=3, img_size=self.out_image_size).to(self.device)
            self.few_shot_gan_tex_real = self.cfgs.get('few_shot_gan_tex_real', 'gt')
            self.few_shot_gan_tex_fake = self.cfgs.get('few_shot_gan_tex_fake', 'rv')
        else:
            self.few_shot_gan_tex = False

        if self.cfgs.get('clip_tex', False):
            self.few_shot_clip_tex = True
            self.clip_model, _ = clip.load('ViT-B/32', self.device)
            self.clip_model = self.clip_model.eval().requires_grad_(False)
            self.clip_mean = [0.48145466, 0.4578275, 0.40821073]
            self.clip_std = [0.26862954, 0.26130258, 0.27577711]
            self.clip_reso = 224
            # NOTE: this overrides any enable_clip_text value set above.
            self.enable_clip_text = False
        else:
            self.few_shot_clip_tex = False
    else:
        self.few_shot_gan_tex = False
        self.few_shot_clip_tex = False

    self.enable_sds = cfgs.get('enable_sds', False)
    self.enable_vsd = cfgs.get('enable_vsd', False)
    if self.enable_sds:
        diffusion_torch_dtype = torch.float16 if cfgs.get('diffusion_precision', 'float16') == 'float16' else torch.float32

        # decide if use SDS or VSD
        if self.enable_vsd:
            self.stable_diffusion = StableDiffusion_VSD(device=self.device, torch_dtype=diffusion_torch_dtype)
            self.diffusion_guidance_scale_lora = cfgs.get('diffusion_guidance_scale_lora', 1.)
            self.diffusion_guidance_scale = cfgs.get('diffusion_guidance_scale', 7.5)
        else:
            self.stable_diffusion = misc.LazyClass(StableDiffusion, device=self.device, torch_dtype=diffusion_torch_dtype)
            self.diffusion_guidance_scale = cfgs.get('diffusion_guidance_scale', 100.)

        self.diffusion_loss_weight = cfgs.get('diffusion_loss_weight', 1.)
        self.diffusion_num_random_cameras = cfgs.get('diffusion_num_random_cameras', 1)

        # For prompts
        self.diffusion_prompt = cfgs.get('diffusion_prompt', '')
        self.diffusion_negative_prompt = cfgs.get('diffusion_negative_prompt', '')

        # For image sampling
        self.diffusion_albedo_ratio = cfgs.get('diffusion_albedo_ratio', 0.2)
        self.diffusion_shading_ratio = cfgs.get('diffusion_shading_ratio', 0.4)
        self.diffusion_light_ambient = cfgs.get('diffusion_light_ambient', 0.5)
        self.diffusion_light_diffuse = cfgs.get('diffusion_light_diffuse', 0.8)
        self.diffusion_radius_range = cfgs.get('diffusion_radius_range', [0.8, 1.4])
        self.diffusion_uniform_sphere_rate = cfgs.get('diffusion_uniform_sphere_rate', 0.5)
        self.diffusion_theta_range = cfgs.get('diffusion_theta_range', [0, 120])
        self.diffusion_phi_offset = cfgs.get('diffusion_phi_offset', 180)
        self.diffusion_resolution = cfgs.get('diffusion_resolution', 256)

        print('-----------------------------------------------')
        print(f"!!!!!! the phi offset for diffusion is set as {self.diffusion_phi_offset}!!!!!!!!!!!!!")
        print('-----------------------------------------------')

        # For randomizing light (ambient / diffuse intensities are read once,
        # with the image-sampling settings above)
        self.diffusion_random_light = cfgs.get('diffusion_random_light', False)

        # For noise scheduling
        self.diffusion_max_step = cfgs.get('diffusion_max_step', 0.98)

        # For view-dependent prompting
        self.diffusion_append_prompt_directions = cfgs.get('diffusion_append_prompt_directions', False)
        self.diffusion_angle_overhead = cfgs.get('diffusion_angle_overhead', 30)
        self.diffusion_angle_front = cfgs.get('diffusion_angle_front', 60)
def get_data_loaders(cfgs, dataset, in_image_size=256, out_image_size=256, batch_size=64, num_workers=4, run_train=False, run_test=False, train_data_dir=None, val_data_dir=None, test_data_dir=None, flow_bool=False):
    """Create the train / validation / test loaders for a non-distributed run.

    Args:
        cfgs: mapping of configuration values, read through ``cfgs.get``.
        dataset: ``'video'`` selects the sequence loader, ``'cub'`` the CUB
            loader and any other value the generic image loader.
        in_image_size, out_image_size, batch_size, num_workers: forwarded to
            the selected loader factory (``out_image_size`` only for video).
        run_train: build the training loader and, alongside it, the
            validation loader.
        run_test: build the test loader.
        train_data_dir, val_data_dir, test_data_dir: data directories.
        flow_bool: forwarded to the sequence loader (video dataset only).

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``; loaders that were
        not requested are ``None``.

    Raises:
        AssertionError: if a directory that is about to be loaded is missing.
    """
    train_loader, val_loader, test_loader = None, None, None
    jitter_train = cfgs.get('color_jitter_train', None)
    jitter_val = cfgs.get('color_jitter_val', None)
    flip_train = cfgs.get('random_flip_train', False)

    if dataset == 'video':
        # Settings shared by every sequence loader built below.
        sequence_kwargs = dict(
            mode=cfgs.get('data_loader_mode', 'n_frame'),
            batch_size=batch_size,
            num_workers=num_workers,
            in_image_size=in_image_size,
            out_image_size=out_image_size,
            debug_seq=cfgs.get('debug_seq', False),
            skip_beginning=cfgs.get('skip_beginning', 4),
            skip_end=cfgs.get('skip_end', 4),
            num_sample_frames=cfgs.get('num_sample_frames', 2),
            min_seq_len=cfgs.get('min_seq_len', 10),
            max_seq_len=cfgs.get('max_seq_len', 10),
            load_background=cfgs.get('background_mode', 'none') == 'background',
            rgb_suffix=cfgs.get('rgb_suffix', '.png'),
            load_dino_feature=cfgs.get('load_dino_feature', False),
            load_dino_cluster=cfgs.get('load_dino_cluster', False),
            dino_feature_dim=cfgs.get('dino_feature_dim', 64),
            flow_bool=flow_bool,
        )
        sample_train_frames = cfgs.get('random_sample_train_frames', False)
        shuffle_train = cfgs.get('shuffle_train_seqs', False)
        sample_val_frames = cfgs.get('random_sample_val_frames', False)

        def make_loader(**extra):
            return get_sequence_loader(**sequence_kwargs, **extra)

        if run_train:
            assert osp.isdir(train_data_dir), f"Training data directory does not exist: {train_data_dir}"
            print(f"Loading training data from {train_data_dir}")
            train_loader = make_loader(
                data_dir=train_data_dir, is_validation=False,
                random_sample=sample_train_frames, shuffle=shuffle_train,
                dense_sample=True, color_jitter=jitter_train, random_flip=flip_train)
            if val_data_dir is not None:
                assert osp.isdir(val_data_dir), f"Validation data directory does not exist: {val_data_dir}"
                print(f"Loading validation data from {val_data_dir}")
                val_loader = make_loader(
                    data_dir=val_data_dir, is_validation=True,
                    random_sample=sample_val_frames, shuffle=False,
                    dense_sample=False, color_jitter=jitter_val, random_flip=False)
        if run_test:
            assert osp.isdir(test_data_dir), f"Testing data directory does not exist: {test_data_dir}"
            print(f"Loading testing data from {test_data_dir}")
            test_loader = make_loader(
                data_dir=test_data_dir, is_validation=True,
                dense_sample=False, color_jitter=None, random_flip=False)

    elif dataset == 'cub':
        def make_loader(**extra):
            return get_cub_loader(
                batch_size=batch_size, num_workers=num_workers,
                image_size=in_image_size, **extra)

        if run_train:
            assert osp.isdir(train_data_dir), f"Training data directory does not exist: {train_data_dir}"
            print(f"Loading training data from {train_data_dir}")
            train_loader = make_loader(data_dir=train_data_dir, split='train', is_validation=False)
            # The validation split is always loaded together with training.
            val_loader = make_loader(data_dir=val_data_dir, split='val', is_validation=True)
        if run_test:
            assert osp.isdir(test_data_dir), f"Testing data directory does not exist: {test_data_dir}"
            print(f"Loading testing data from {test_data_dir}")
            test_loader = make_loader(data_dir=test_data_dir, split='test', is_validation=True)

    else:
        def make_loader(**extra):
            return get_image_loader(
                batch_size=batch_size, num_workers=num_workers,
                image_size=in_image_size, **extra)

        if run_train:
            assert osp.isdir(train_data_dir), f"Training data directory does not exist: {train_data_dir}"
            print(f"Loading training data from {train_data_dir}")
            train_loader = make_loader(data_dir=train_data_dir, is_validation=False, color_jitter=jitter_train)
            if val_data_dir is not None:
                assert osp.isdir(val_data_dir), f"Validation data directory does not exist: {val_data_dir}"
                print(f"Loading validation data from {val_data_dir}")
                val_loader = make_loader(data_dir=val_data_dir, is_validation=True, color_jitter=jitter_val)
        if run_test:
            assert osp.isdir(test_data_dir), f"Testing data directory does not exist: {test_data_dir}"
            print(f"Loading testing data from {test_data_dir}")
            test_loader = make_loader(data_dir=test_data_dir, is_validation=True, color_jitter=None)

    return train_loader, val_loader, test_loader
def get_data_loaders_ddp(cfgs, dataset, rank, world_size, in_image_size=256, out_image_size=256, batch_size=64, num_workers=4, run_train=False, run_test=False, train_data_dir=None, val_data_dir=None, test_data_dir=None, flow_bool=False):
    """Create the train / validation / test loaders for a distributed run.

    Args:
        cfgs: mapping of configuration values, read through ``cfgs.get``.
        dataset: ``'video'`` selects the sequence loaders, ``'cub'`` the CUB
            loader and any other value the generic image loader.
        rank, world_size: forwarded to the distributed loader factories.
        in_image_size, out_image_size, batch_size, num_workers: forwarded to
            the selected loader factory (``out_image_size`` only for video).
        run_train: build the training loader and, alongside it, the
            validation loader.
        run_test: build the test loader.
        train_data_dir, val_data_dir, test_data_dir: data directories. For
            the video dataset the train / val directories may be a string or
            a dict whose values are directory paths.
        flow_bool: forwarded to the distributed sequence loader.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``; loaders that were
        not requested are ``None``.

    Raises:
        AssertionError: if a directory that is about to be loaded is missing.
        ValueError: if a video train / val directory is neither ``str`` nor
            ``dict``.
    """
    train_loader = val_loader = test_loader = None
    color_jitter_train = cfgs.get('color_jitter_train', None)
    color_jitter_val = cfgs.get('color_jitter_val', None)
    random_flip_train = cfgs.get('random_flip_train', False)

    def check_data_dirs(data_dir, label, arg_name):
        # Accept a single path or a dict of paths; messages name the split
        # that is actually being validated.
        if isinstance(data_dir, dict):
            for data_path in data_dir.values():
                assert osp.isdir(data_path), f"{label} data directory does not exist: {data_path}"
        elif isinstance(data_dir, str):
            assert osp.isdir(data_dir), f"{label} data directory does not exist: {data_dir}"
        else:
            raise ValueError(f"{arg_name} must be a string or a dict of strings")

    ## video dataset
    if dataset == 'video':
        sequence_kwargs = dict(
            mode=cfgs.get('data_loader_mode', 'n_frame'),
            batch_size=batch_size,
            num_workers=num_workers,
            in_image_size=in_image_size,
            out_image_size=out_image_size,
            debug_seq=cfgs.get('debug_seq', False),
            skip_beginning=cfgs.get('skip_beginning', 4),
            skip_end=cfgs.get('skip_end', 4),
            num_sample_frames=cfgs.get('num_sample_frames', 2),
            min_seq_len=cfgs.get('min_seq_len', 10),
            max_seq_len=cfgs.get('max_seq_len', 10),
            load_background=cfgs.get('background_mode', 'none') == 'background',
            rgb_suffix=cfgs.get('rgb_suffix', '.png'),
            load_dino_feature=cfgs.get('load_dino_feature', False),
            load_dino_cluster=cfgs.get('load_dino_cluster', False),
            dino_feature_dim=cfgs.get('dino_feature_dim', 64),
        )
        random_sample_train_frames = cfgs.get('random_sample_train_frames', False)
        shuffle_train_seqs = cfgs.get('shuffle_train_seqs', False)
        random_sample_val_frames = cfgs.get('random_sample_val_frames', False)

        def get_loader_ddp(**kwargs):
            return get_sequence_loader_ddp(**sequence_kwargs, flow_bool=flow_bool, **kwargs)

        def get_loader(**kwargs):
            # NOTE(review): unlike get_data_loaders, flow_bool is not
            # forwarded to this non-distributed loader — confirm intended.
            return get_sequence_loader(**sequence_kwargs, **kwargs)

        if run_train:
            check_data_dirs(train_data_dir, "Training", "train_data_dir")
            print("Loading training data...")
            train_loader = get_loader_ddp(
                data_dir=train_data_dir, rank=rank, world_size=world_size,
                is_validation=False, random_sample=random_sample_train_frames,
                shuffle=shuffle_train_seqs, dense_sample=True,
                color_jitter=color_jitter_train, random_flip=random_flip_train)
            if val_data_dir is not None:
                check_data_dirs(val_data_dir, "Validation", "val_data_dir")
                print("Loading validation data...")
                # No need for data parallel for the validation data loader.
                val_loader = get_loader(
                    data_dir=val_data_dir, is_validation=True,
                    random_sample=random_sample_val_frames, shuffle=False,
                    dense_sample=False, color_jitter=color_jitter_val, random_flip=False)
        if run_test:
            assert osp.isdir(test_data_dir), f"Testing data directory does not exist: {test_data_dir}"
            print(f"Loading testing data from {test_data_dir}")
            test_loader = get_loader_ddp(
                data_dir=test_data_dir, rank=rank, world_size=world_size,
                is_validation=True, dense_sample=False, color_jitter=None, random_flip=False)

    ## CUB dataset
    elif dataset == 'cub':
        def get_loader(**kwargs):
            return get_cub_loader_ddp(
                batch_size=batch_size, num_workers=num_workers,
                image_size=in_image_size, **kwargs)

        if run_train:
            assert osp.isdir(train_data_dir), f"Training data directory does not exist: {train_data_dir}"
            print(f"Loading training data from {train_data_dir}")
            train_loader = get_loader(data_dir=train_data_dir, rank=rank, world_size=world_size, split='train', is_validation=False)
            val_loader = get_loader(data_dir=val_data_dir, rank=rank, world_size=world_size, split='val', is_validation=True)
        if run_test:
            assert osp.isdir(test_data_dir), f"Testing data directory does not exist: {test_data_dir}"
            print(f"Loading testing data from {test_data_dir}")
            test_loader = get_loader(data_dir=test_data_dir, rank=rank, world_size=world_size, split='test', is_validation=True)

    ## other datasets
    else:
        def get_loader(**kwargs):
            return get_image_loader_ddp(
                batch_size=batch_size, num_workers=num_workers,
                image_size=in_image_size, **kwargs)

        if run_train:
            assert osp.isdir(train_data_dir), f"Training data directory does not exist: {train_data_dir}"
            print(f"Loading training data from {train_data_dir}")
            train_loader = get_loader(data_dir=train_data_dir, rank=rank, world_size=world_size, is_validation=False, color_jitter=color_jitter_train)
            if val_data_dir is not None:
                assert osp.isdir(val_data_dir), f"Validation data directory does not exist: {val_data_dir}"
                print(f"Loading validation data from {val_data_dir}")
                val_loader = get_loader(data_dir=val_data_dir, rank=rank, world_size=world_size, is_validation=True, color_jitter=color_jitter_val)
        if run_test:
            assert osp.isdir(test_data_dir), f"Testing data directory does not exist: {test_data_dir}"
            print(f"Loading testing data from {test_data_dir}")
            test_loader = get_loader(data_dir=test_data_dir, rank=rank, world_size=world_size, is_validation=True, color_jitter=None)

    return train_loader, val_loader, test_loader
def load_model_state(self, cp):
    """Restore network weights from the checkpoint dict ``cp``.

    When the instance network uses a custom ``texture_way`` or a non-'relu'
    ``texture_act``, every checkpoint entry whose key contains 'netTexture'
    is dropped and the current texture weights are kept instead. The mask
    discriminator and prior network are restored when enabled.
    """
    # TODO: very hacky: if using local texture, which is also usually finetuned from global texture
    # we need to check if needs some handcrafted load in netInstance
    use_checkpoint_texture = (
        self.netInstance.texture_way is None
        and self.cfgs.get('texture_act', 'relu') == 'relu'
    )
    if use_checkpoint_texture:
        self.netInstance.load_state_dict(cp["netInstance"])
    else:
        merged_weights = {}
        # Everything except the texture network comes from the checkpoint.
        for key, value in cp['netInstance'].items():
            if 'netTexture' not in key:
                merged_weights[key] = value
        # The texture network keeps its current weights.
        for key, value in self.netInstance.netTexture.state_dict().items():
            merged_weights['netTexture.' + key] = value
        self.netInstance.load_state_dict(merged_weights)

    if self.enable_disc and "net_mask_disc" in cp:
        self.mask_disc.load_state_dict(cp["net_mask_disc"])
    if self.enable_prior:
        self.netPrior.load_state_dict(cp["netPrior"])
def load_optimizer_state(self, cp):
    """Restore optimizer and scheduler state from the checkpoint dict ``cp``.

    With a custom ``texture_way`` the instance optimizer is reloaded from a
    filtered copy of its *own* current state rather than from ``cp``; the
    filter removes entries whose key / id matches ``id()`` of a texture
    parameter. Otherwise the checkpointed optimizer state is loaded.
    """
    # TODO: also very hacky here, as the load_model_state above
    if self.netInstance.texture_way is None:
        self.optimizerInstance.load_state_dict(cp["optimizerInstance"])
    else:
        current_state = self.optimizerInstance.state_dict()
        texture_param_ids = {id(p) for p in self.netInstance.netTexture.parameters()}

        kept_state = {}
        for key, value in current_state['state'].items():
            if key not in texture_param_ids:
                kept_state[key] = value

        kept_groups = []
        for group in current_state['param_groups']:
            trimmed = {name: value for name, value in group.items() if name != 'params'}
            trimmed['params'] = [pid for pid in group['params'] if pid not in texture_param_ids]
            kept_groups.append(trimmed)

        self.optimizerInstance.load_state_dict({'state': kept_state, 'param_groups': kept_groups})

    if self.use_scheduler and 'schedulerInstance' in cp:
        self.schedulerInstance.load_state_dict(cp["schedulerInstance"])

    if self.enable_disc and "optimizerDiscriminator" in cp:
        self.optimizerDiscriminator.load_state_dict(cp["optimizerDiscriminator"])

    if self.enable_prior and self.resume_prior_optim:
        self.optimizerPrior.load_state_dict(cp["optimizerPrior"])
        if self.use_scheduler and 'schedulerPrior' in cp:
            self.schedulerPrior.load_state_dict(cp["schedulerPrior"])
def get_model_state(self):
    """Return a dict of network ``state_dict``s for checkpointing.

    Always contains 'netInstance'; 'net_mask_disc' and 'netPrior' are added
    when the discriminator / prior are enabled.
    """
    snapshot = {}
    snapshot["netInstance"] = self.netInstance.state_dict()
    optional_nets = (
        (self.enable_disc, "net_mask_disc", "mask_disc"),
        (self.enable_prior, "netPrior", "netPrior"),
    )
    for enabled, key, attr_name in optional_nets:
        if enabled:
            snapshot[key] = getattr(self, attr_name).state_dict()
    return snapshot
def get_optimizer_state(self):
    """Return a dict of optimizer / scheduler ``state_dict``s for checkpointing.

    Always contains 'optimizerInstance'. The discriminator optimizer, the
    instance scheduler, the prior optimizer and the prior scheduler are added
    according to ``enable_disc``, ``use_scheduler`` and ``enable_prior``.
    """
    snapshot = {"optimizerInstance": self.optimizerInstance.state_dict()}
    if self.enable_disc:
        snapshot['optimizerDiscriminator'] = self.optimizerDiscriminator.state_dict()
    if self.use_scheduler:
        snapshot["schedulerInstance"] = self.schedulerInstance.state_dict()
    if not self.enable_prior:
        return snapshot
    snapshot["optimizerPrior"] = self.optimizerPrior.state_dict()
    if self.use_scheduler:
        snapshot["schedulerPrior"] = self.schedulerPrior.state_dict()
    return snapshot
def to(self, device):
    """Record ``device`` and move the networks (and loose tensors) onto it.

    Besides the instance and prior networks, every attribute of
    ``netPrior.netShape`` whose type is exactly ``torch.Tensor`` is moved,
    as is the perceptual loss when one exists.
    """
    self.device = device
    self.netInstance.to(device)
    if self.enable_prior:
        self.netPrior.to(device)
        shape_net = self.netPrior.netShape
        # Plain tensor attributes are not moved by Module.to, so move them by hand.
        for attr_name in vars(shape_net):
            value = getattr(shape_net, attr_name)
            if type(value) is torch.Tensor:
                setattr(shape_net, attr_name, value.to(device))
    if hasattr(self, 'perceptual_loss'):
        self.perceptual_loss.to(device)
def ddp(self, rank, world_size):
    """Store ``rank`` / ``world_size`` and wrap the networks in DDP.

    With more than one process the instance network, the prior network (if
    enabled) and the perceptual loss (if present) are wrapped; the plain
    attributes are re-pointed at the wrappers' ``.module``. With a single
    process nothing is wrapped and a notice is printed.
    """
    self.rank = rank
    self.world_size = world_size
    if not self.world_size > 1:
        print('actually no DDP for model')
        return

    def wrap(module):
        return DDP(module, device_ids=[rank], find_unused_parameters=True)

    self.netInstance_ddp = wrap(self.netInstance)
    self.netInstance_ddp._set_static_graph()
    self.netInstance = self.netInstance_ddp.module

    if self.enable_prior:
        self.netPrior_ddp = wrap(self.netPrior)
        self.netPrior_ddp._set_static_graph()
        self.netPrior = self.netPrior_ddp.module

    if hasattr(self, 'perceptual_loss'):
        # The perceptual loss wrapper is not switched to a static graph.
        self.perceptual_loss_ddp = wrap(self.perceptual_loss)
        self.perceptual_loss = self.perceptual_loss_ddp.module
def set_train(self):
    """Switch the networks to training mode.

    In the multi-process case the DDP wrappers are toggled; otherwise the
    plain instance network, the mask discriminator (if enabled) and the prior
    network (if enabled) are toggled.
    """
    if self.world_size > 1:
        targets = [self.netInstance_ddp]
        if self.enable_prior:
            targets.append(self.netPrior_ddp)
    else:
        targets = [self.netInstance]
        if self.enable_disc:
            targets.append(self.mask_disc)
        if self.enable_prior:
            targets.append(self.netPrior)
    for module in targets:
        module.train()
def set_eval(self):
    """Switch the networks to evaluation mode.

    In the multi-process case the DDP wrappers are toggled; otherwise the
    plain instance network, the mask discriminator (if enabled) and the prior
    network (if enabled) are toggled.
    """
    if self.world_size > 1:
        targets = [self.netInstance_ddp]
        if self.enable_prior:
            targets.append(self.netPrior_ddp)
    else:
        targets = [self.netInstance]
        if self.enable_disc:
            targets.append(self.mask_disc)
        if self.enable_prior:
            targets.append(self.netPrior)
    for module in targets:
        module.eval()
def reset_optimizers(self):
    """Recreate all optimizers (and schedulers, when used) from scratch."""
    print("Resetting optimizers...")
    with_scheduler = self.use_scheduler

    self.optimizerInstance = get_optimizer(self.netInstance, self.lr)
    if self.enable_disc:
        self.optimizerDiscriminator = get_optimizer(self.mask_disc, self.lr)
    if with_scheduler:
        self.schedulerInstance = self.make_scheduler(self.optimizerInstance)

    if not self.enable_prior:
        return
    # The prior network has its own learning rate and weight decay.
    self.optimizerPrior = get_optimizer(
        self.netPrior, lr=self.prior_lr, weight_decay=self.prior_weight_decay)
    if self.use_scheduler:
        self.schedulerPrior = self.make_scheduler(self.optimizerPrior)
def reset_only_disc_optimizer(self):
    """Recreate the mask discriminator's optimizer; no-op when it is disabled."""
    if not self.enable_disc:
        return
    self.optimizerDiscriminator = get_optimizer(self.mask_disc, self.lr)
def backward(self):
    """Run one optimisation step on ``self.total_loss`` and reset it to 0.

    The instance optimizer always steps; the prior optimizer steps only when
    ``self.backward_prior`` is set.
    """
    active_optimizers = [self.optimizerInstance]
    if self.backward_prior:
        active_optimizers.append(self.optimizerPrior)

    for optimizer in active_optimizers:
        optimizer.zero_grad()
    self.total_loss.backward()
    for optimizer in active_optimizers:
        optimizer.step()

    self.total_loss = 0.
def scheduler_step(self):
    """Advance the learning-rate schedulers; no-op when schedulers are unused."""
    if not self.use_scheduler:
        return
    self.schedulerInstance.step()
    if self.enable_prior:
        self.schedulerPrior.step()
def zflip_pose(self, pose):
    """Return ``pose`` with the z component of its forward vector negated.

    Only the 'lookat' rotation representation is supported. The forward
    vector is read from ``pose[:, :, 6:9]``, its z component is negated, and
    right / up vectors are rebuilt from it against the +y axis. The output
    concatenates the flattened (right, up, forward) rows with
    ``pose[:, :, 9:]``.

    Raises:
        NotImplementedError: for any other ``self.rot_rep``.
    """
    if self.rot_rep != 'lookat':
        raise NotImplementedError

    z_sign = torch.tensor([1, 1, -1], dtype=torch.float32, device=pose.device).view(1, 1, 3)
    forward = pose[:, :, 6:9] * z_sign
    world_up = torch.tensor([0, 1, 0], dtype=torch.float32, device=pose.device).view(1, 1, 3)

    right = torch.cross(world_up.expand_as(forward), forward, dim=-1)
    right = nn.functional.normalize(right, p=2, dim=-1)
    up = torch.cross(forward, right, dim=-1)
    up = nn.functional.normalize(up, p=2, dim=-1)

    rotation = torch.stack([right, up, forward], 2)
    flat_rotation = rotation.reshape(*pose.shape[:-1], -1)
    return torch.cat([flat_rotation, pose[:, :, 9:]], -1)
def render(self, shape, texture, mvp, w2c, campos, resolution, background='none', im_features=None, light=None, prior_shape=None, render_flow=False, dino_pred=None, class_vector=None, render_mode='diffuse', two_sided_shading=True, num_frames=None, spp=1, bg_image=None, im_features_map=None):
    """Rasterise ``shape`` with ``texture`` and split the result into maps.

    Args:
        shape, texture, mvp, w2c, campos, light, prior_shape, im_features,
            dino_pred, class_vector, num_frames, spp, im_features_map,
            two_sided_shading: forwarded to ``render.render_mesh``.
        resolution: ``(height, width)`` of the rendered images.
        background: used only when ``bg_image`` is None; one of 'none',
            'black', 'white', 'checkerboard', 'random' (per-pixel noise) or
            'random-pure' (one random grey level per image).
        render_flow: also return the rendered flow.
        render_mode: passed to the renderer as ``bsdf``.
        bg_image: explicit background; overrides ``background``.

    Returns:
        ``(image_pred, mask_pred, flow_pred, dino_feat_im_pred, albedo,
        shading)``. ``flow_pred``, ``dino_feat_im_pred`` and ``shading`` are
        None when not requested / not produced by the renderer.

    Raises:
        NotImplementedError: for an unknown ``background`` name.
    """
    h, w = resolution
    N = len(mvp)
    if bg_image is None:
        # All backgrounds are created on the same device as the camera matrices.
        device = mvp.device
        if background in ['none', 'black']:
            bg_image = torch.zeros((N, h, w, 3), device=device)
        elif background == 'white':
            bg_image = torch.ones((N, h, w, 3), device=device)
        elif background == 'checkerboard':
            # torch.FloatTensor(data, device=...) rejects non-CPU devices;
            # as_tensor builds the float32 tensor directly on the target device.
            board = torch.as_tensor(util.checkerboard((h, w), 8), dtype=torch.float32, device=device)
            bg_image = board.repeat(N, 1, 1, 1)  # NxHxWxC
        elif background == 'random':
            bg_image = torch.rand((N, h, w, 3), device=device)  # NxHxWxC
        elif background == 'random-pure':
            random_values = torch.rand(N)
            bg_image = random_values[..., None, None, None].repeat(1, h, w, 3).to(device)
        else:
            raise NotImplementedError

    # inside render_mesh -> render_layer -> shade
    frame_rendered = render.render_mesh(
        self.glctx,
        shape,
        mtx_in=mvp,
        w2c=w2c,
        view_pos=campos,
        material=texture,
        lgt=light,
        resolution=resolution,
        spp=spp,
        msaa=True,
        background=bg_image,
        bsdf=render_mode,
        feat=im_features,
        prior_mesh=prior_shape,
        two_sided_shading=two_sided_shading,
        render_flow=render_flow,
        dino_pred=dino_pred,
        class_vector=class_vector,
        num_frames=num_frames,
        im_features_map=im_features_map)

    # Renderer outputs are channel-last; convert to channel-first.
    shaded = frame_rendered['shaded'].permute(0, 3, 1, 2)
    image_pred = shaded[:, :3, :, :]
    mask_pred = shaded[:, 3, :, :]
    albedo = frame_rendered['kd'].permute(0, 3, 1, 2)[:, :3, :, :]

    if 'shading' in frame_rendered:
        shading = frame_rendered['shading'].permute(0, 3, 1, 2)[:, :1, :, :]
    else:
        shading = None

    if render_flow:
        flow_pred = frame_rendered['flow'].permute(0, 3, 1, 2)[:, :2, :, :]
    else:
        flow_pred = None

    if dino_pred is not None:
        # The last channel of the rendered DINO map is dropped.
        dino_feat_im_pred = frame_rendered['dino_feat_im_pred'].permute(0, 3, 1, 2)[:, :-1]
    else:
        dino_feat_im_pred = None

    return image_pred, mask_pred, flow_pred, dino_feat_im_pred, albedo, shading
def compute_reconstruction_losses(self, image_pred, image_gt, mask_pred, mask_gt, mask_dt, mask_valid, flow_pred, flow_gt, dino_feat_im_gt, dino_feat_im_pred, background_mode='none', reduce=False):
    """Compute the per-frame reconstruction losses.

    Args:
        image_pred, image_gt: predicted / target images, BxFxCxHxW.
        mask_pred, mask_gt, mask_valid: predicted / target / validity masks
            (assumed BxFxHxW — TODO confirm against caller).
        mask_dt: distance transforms; channel 1 weights the predicted mask
            and channel 0 weights its complement.
        flow_pred, flow_gt: flow between consecutive frames, or None to skip
            the flow loss.
        dino_feat_im_gt, dino_feat_im_pred: DINO feature images; the loss is
            skipped when either is None.
        background_mode: for 'background' / 'input' the RGB and perceptual
            losses use the full image, otherwise only the masked region.
        reduce: when True every loss is averaged to a scalar.

    Returns:
        Dict of losses, each of shape BxF (Bx(F-1) for the flow loss), or
        scalars when ``reduce`` is True.

    Side effects:
        Stores the large-flow indicator in ``self.large_flow`` when the flow
        loss is computed.
    """
    losses = {}
    batch_size, num_frames, _, h, w = image_pred.shape  # BxFxCxHxW

    image_loss = (image_pred - image_gt).abs()

    ## silhouette loss
    mask_pred_valid = mask_pred * mask_valid
    mask_loss = (mask_pred_valid - mask_gt) ** 2
    losses['silhouette_loss'] = mask_loss.view(batch_size, num_frames, -1).mean(2)
    losses['silhouette_dt_loss'] = (mask_pred * mask_dt[:, :, 1]).view(batch_size, num_frames, -1).mean(2)
    losses['silhouette_inv_dt_loss'] = ((1 - mask_pred) * mask_dt[:, :, 0]).view(batch_size, num_frames, -1).mean(2)

    # Region covered by both masks, eroded by 1 pixel via a 3x3 average pool.
    mask_pred_binary = (mask_pred_valid > 0.).float().detach()
    mask_both_binary = (mask_pred_binary * mask_gt).view(batch_size * num_frames, 1, *mask_pred.shape[2:])
    mask_both_binary = (nn.functional.avg_pool2d(mask_both_binary, 3, stride=1, padding=1).view(batch_size, num_frames, *mask_pred.shape[2:]) > 0.99).float().detach()

    ## reconstruction loss
    if background_mode not in ['background', 'input']:
        image_loss = image_loss * mask_both_binary.unsqueeze(2)
    losses['rgb_loss'] = image_loss.reshape(batch_size, num_frames, -1).mean(2)

    if self.cfgs.get('perceptual_loss_weight', 0.) > 0:
        if background_mode in ['background', 'input']:
            perc_image_pred = image_pred
            perc_image_gt = image_gt
        else:
            # Outside the predicted mask both images are filled with grey (0.5).
            perc_image_pred = image_pred * mask_pred_binary.unsqueeze(2) + 0.5 * (1 - mask_pred_binary.unsqueeze(2))
            perc_image_gt = image_gt * mask_pred_binary.unsqueeze(2) + 0.5 * (1 - mask_pred_binary.unsqueeze(2))
        # Inputs are rescaled with x*2-1 before the perceptual loss.
        losses['perceptual_loss'] = self.perceptual_loss(perc_image_pred.view(-1, *image_pred.shape[2:]) * 2 - 1, perc_image_gt.view(-1, *image_gt.shape[2:]) * 2 - 1).view(batch_size, num_frames)

    ## flow loss - between first and second frame
    if flow_pred is not None:
        flow_loss = (flow_pred - flow_gt).abs()
        flow_loss_mask = mask_both_binary[:, :-1].unsqueeze(2).expand_as(flow_gt).detach()

        ## ignore frames where GT flow is too large (likely inaccurate)
        large_flow = (flow_gt.abs() > 0.5).float() * flow_loss_mask
        large_flow = (large_flow.view(batch_size, num_frames - 1, -1).sum(2) > 0).float()
        self.large_flow = large_flow

        flow_loss = flow_loss * flow_loss_mask * (1 - large_flow[:, :, None, None, None])
        num_mask_pixels = flow_loss_mask.reshape(batch_size, num_frames - 1, -1).sum(2).clamp(min=1)
        losses['flow_loss'] = (flow_loss.reshape(batch_size, num_frames - 1, -1).sum(2) / num_mask_pixels)

    if dino_feat_im_pred is not None and dino_feat_im_gt is not None:
        dino_feat_loss = (dino_feat_im_pred - dino_feat_im_gt) ** 2
        dino_feat_loss = dino_feat_loss * mask_both_binary.unsqueeze(2)
        losses['dino_feat_im_loss'] = dino_feat_loss.reshape(batch_size, num_frames, -1).mean(2)

    if reduce:
        # dict has .items(), not .item(); the old call raised AttributeError.
        losses = {name: value.mean() for name, value in losses.items()}
    return losses
def compute_pose_xflip_reg_loss(self, input_image, dino_feat_im, pose_raw, input_image_xflip_flag=None):
    """Penalise disagreement between the pose of an image and of its mirror.

    The input (and DINO features, if given) are flipped along dim 4, the pose
    is re-predicted on the flipped input, its x components are negated and
    the squared difference to ``pose_raw`` on component 0 is averaged.

    Args:
        input_image: images; dim 4 is flipped and ``shape[:2]`` is read as
            (batch, frames).
        dino_feat_im: optional DINO feature images, flipped the same way.
        pose_raw: raw pose predicted for the un-flipped input.
        input_image_xflip_flag: optional per-frame flag; where it is 1 the
            flipped-input prediction is itself x-flipped before comparison.

    Returns:
        ``(pose_xflip_reg_loss, pose_xflip_raw)``.

    Raises:
        NotImplementedError: for any ``rot_rep`` other than 'lookat'. The
            former 'euler_angle' / 'soft_calss' / 'quaternion' branches used
            undefined names (``pose``, ``pose_xflip``, ``pytorch3d``) and
            could only fail with NameError.
    """
    image_xflip = input_image.flip(4)
    dino_feat_im_xflip = dino_feat_im.flip(4) if dino_feat_im is not None else None

    # NOTE(review): with world_size > 1 this calls forward_encoder /
    # forward_pose / rot_rep on the DDP wrapper — confirm the wrapper exposes them.
    netInst = self.netInstance_ddp if self.world_size > 1 else self.netInstance

    feat_xflip, _ = netInst.forward_encoder(image_xflip, dino_feat_im_xflip)
    batch_size, num_frames = input_image.shape[:2]
    pose_xflip_raw = netInst.forward_pose(image_xflip, feat_xflip, dino_feat_im_xflip)

    # Negates forward-x and translation-x.
    x_sign = torch.tensor([-1, 1, 1, -1, 1, 1], dtype=torch.float32, device=pose_raw.device)

    if input_image_xflip_flag is not None:
        flag = input_image_xflip_flag.view(batch_size * num_frames, 1)
        pose_xflip_raw = pose_xflip_raw * (1 - flag) + (pose_xflip_raw * x_sign) * flag

    rot_rep = netInst.rot_rep
    if rot_rep != 'lookat':
        raise NotImplementedError(
            f"pose x-flip regularisation is only implemented for rot_rep='lookat', got {rot_rep!r}")

    pose_xflip_raw_xflip = pose_xflip_raw * x_sign
    pose_xflip_reg_loss = ((pose_xflip_raw_xflip - pose_raw)[..., 0] ** 2.)  # compute x only
    pose_xflip_reg_loss = pose_xflip_reg_loss.mean()
    return pose_xflip_reg_loss, pose_xflip_raw
def compute_edge_length_reg_loss(self, mesh, prior_mesh):
    """Penalise mesh edges longer than 1.1x the longest prior-mesh edge.

    Returns:
        ``(loss, edge_lengths)`` where ``loss`` is the mean squared excess
        over the threshold and ``edge_lengths`` are the lengths for ``mesh``.
    """
    # The threshold is detached, so it receives no gradient.
    longest_prior_edge = get_edge_length(prior_mesh.v_pos, prior_mesh.t_pos_idx).max()
    length_limit = longest_prior_edge.detach() * 1.1

    edge_lengths = get_edge_length(mesh.v_pos, mesh.t_pos_idx)
    excess = (edge_lengths - length_limit).clamp(min=0)
    return (excess ** 2).mean(), edge_lengths
def compute_regularizers(self, mesh, prior_mesh, input_image, dino_feat_im, pose_raw, input_image_xflip_flag=None, arti_params=None, deformation=None, mid_img_idx=0, posed_bones=None, class_vector=None):
    """Collect the regularisation losses for the current batch.

    Depending on configuration flags and on which optional inputs are given,
    this gathers: SDF prior regularisers, pose x-flip consistency, camera
    smoothness, a pose "entropy" term, mesh normal / Laplacian / edge-length
    terms, articulation magnitude and smoothness, deformation magnitude and
    smoothness, and bone smoothness.

    Returns:
        ``(losses, aux)``: a dict of loss terms and a dict of auxiliary
        outputs ('pose_xflip_raw' when computed, and 'edge_lengths').
    """
    losses = {}
    aux = {}

    if self.enable_prior:
        losses.update(self.netPrior.netShape.get_sdf_reg_loss(class_vector=class_vector))

    if self.cfgs.get('pose_xflip_reg_loss_weight', 0.) > 0:
        xflip_loss, xflip_pose = self.compute_pose_xflip_reg_loss(input_image, dino_feat_im, pose_raw, input_image_xflip_flag)
        losses["pose_xflip_reg_loss"] = xflip_loss
        aux['pose_xflip_raw'] = xflip_pose

    if self.using_campos_smooth_loss:
        per_sequence_pose = pose_raw.view(self.bs, self.nf, *pose_raw.shape[1:])
        losses['campos_smooth_loss'] = self.campos_smooth_loss_fn(per_sequence_pose)

    # Dot product between the forward vectors of the two halves of the batch.
    b, _ = input_image.shape[:2]
    if b >= 2:
        half = b // 2
        forward_vecs = pose_raw[..., :3]
        losses['pose_entropy_loss'] = (forward_vecs[:half] * forward_vecs[half:half * 2]).sum(-1).mean()
    else:
        losses['pose_entropy_loss'] = 0.

    ## mesh regularisers
    losses['mesh_normal_consistency_loss'] = normal_consistency(mesh.v_pos, mesh.t_pos_idx)
    losses['mesh_laplacian_consistency_loss'] = laplace_regularizer_const(mesh.v_pos, mesh.t_pos_idx)
    losses['mesh_edge_length_loss'], aux['edge_lengths'] = self.compute_edge_length_reg_loss(mesh, prior_mesh)

    ## articulation
    if arti_params is not None:
        losses['arti_reg_loss'] = (arti_params ** 2).mean()  # TODO dor Rart
        if self.using_arti_smooth_loss:
            losses['arti_smooth_loss'] = self.arti_smooth_loss_fn(arti_params)
        if self.using_artivel_smooth_loss:
            _, nf, _, _ = arti_params.shape
            # Frame-to-frame difference of the articulation parameters.
            arti_velocity = arti_params[:, 1:nf, :, :] - arti_params[:, :(nf - 1), :, :]
            losses['artivel_smooth_loss'] = self.artivel_smooth_loss_fn(arti_velocity)

    ## deformation
    if deformation is not None:
        losses['deformation_reg_loss'] = (deformation ** 2).mean()  # TODO dor - Rdef

        # Deformation gathered at the three corners of every face.
        faces = mesh.t_pos_idx[0]
        corner_deform = [deformation[:, faces[:, corner], :] for corner in range(3)]

        num_samples = 5000
        num_faces = corner_deform[0].shape[1]
        # One independent random subset of faces per corner pair.
        sampled = [torch.randperm(num_faces)[:num_samples].to(self.device) for _ in range(3)]

        smoothness = None
        for face_idx, (first, second) in zip(sampled, ((0, 1), (1, 2), (2, 0))):
            gap = ((corner_deform[first][:, face_idx, :] - corner_deform[second][:, face_idx, :]) ** 2).mean()
            smoothness = gap if smoothness is None else smoothness + gap
        losses['smooth_deformation_loss'] = smoothness

        if self.using_deform_smooth_loss:
            per_sequence_deform = deformation.view(self.bs, self.nf, *deformation.shape[1:])
            losses['deform_smooth_loss'] = self.deform_smooth_loss_fn(per_sequence_deform)

    ## posed bones.
    if posed_bones is not None:
        if self.using_bone_smooth_loss:
            losses['bone_smooth_loss'] = self.bone_smooth_loss_fn(posed_bones)
        if self.using_bonevel_smooth_loss:
            _, nf, _, _, _ = posed_bones.shape
            bone_velocity = posed_bones[:, 1:nf, ...] - posed_bones[:, :(nf - 1), ...]
            losses['bonevel_smooth_loss'] = self.bonevel_smooth_loss_fn(bone_velocity)

    return losses, aux
def score_distillation_sampling(self, shape, texture, resolution, im_features, light, prior_shape, random_light=False, prompts=None, classes_vectors=None, im_features_map=None, w2c_pred=None):
    """Render the shape from random cameras and run one diffusion guidance step.

    Cameras are sampled with ``rand_poses``. The shape and the image features
    are detached before rendering, while ``texture`` and ``light`` get
    ``requires_grad`` switched on. Every rendered sample is then randomly
    replaced by its albedo or its shading (composited over a random uniform
    background) or kept as the full render, and the result is handed to
    ``self.stable_diffusion.train_step``.

    Args:
        shape: mesh batch; must expose ``v_pos``, ``extend`` and ``detach``.
        texture: texture network forwarded to ``self.render``.
        resolution: ``(height, width)`` of the renders.
        im_features: per-instance features; the first dimension is used as
            the number of instances.
        light: light model used when ``random_light`` is false.
        prior_shape: forwarded to ``self.render``.
        random_light: sample lights with ``rand_lights`` instead.
        prompts: optional prompt string shared by all cameras; ``None``
            selects ``self.diffusion_prompt``.
        classes_vectors: condition label for the VSD train step.
        im_features_map: optional feature map forwarded to the renderer
            together with the input-view MVP matrices.
        w2c_pred: world-to-camera matrices of the input views; only needed
            when ``im_features_map`` is given.

    Returns:
        ``(rendered_component, aux)`` where ``aux`` holds the loss value(s),
        the sampled view directions, the train-step aux output and the
        rendered (extended, detached) shape.

    Raises:
        ValueError: if ``im_features_map`` is given without ``w2c_pred``.
    """
    num_instances = im_features.shape[0]
    n_total_random_cameras = num_instances * self.diffusion_num_random_cameras

    poses, dirs = rand_poses(
        n_total_random_cameras, self.device, radius_range=self.diffusion_radius_range, uniform_sphere_rate=self.diffusion_uniform_sphere_rate,
        cam_z_offset=self.cam_pos_z_offset, theta_range=self.diffusion_theta_range, phi_offset=self.diffusion_phi_offset, return_dirs=True,
        angle_front=self.diffusion_angle_front, angle_overhead=self.diffusion_angle_overhead,
    )
    mvp, w2c, campos = self.netInstance.get_camera_extrinsics_from_pose(poses, crop_fov_approx=self.crop_fov_approx)

    if random_light:
        lights = rand_lights(campos, fixed_ambient=self.diffusion_light_ambient, fixed_diffuse=self.diffusion_light_diffuse)
    else:
        lights = light

    # The input-view MVP is only consumed together with the feature map, so it
    # is only built (and w2c_pred only required) in that case.
    feature_map_input = None
    if im_features_map is not None:
        if w2c_pred is None:
            raise ValueError("w2c_pred is required when im_features_map is given")
        proj = util.perspective(self.crop_fov_approx / 180 * np.pi, 1, n=0.1, f=1000.0).repeat(num_instances, 1, 1).to(self.device)
        original_mvp = torch.bmm(proj, w2c_pred)
        feature_map_input = {"original_mvp": original_mvp, "im_features_map": im_features_map.detach()}

    im_features = im_features.repeat(self.diffusion_num_random_cameras, 1)

    num_shapes = shape.v_pos.shape[0]
    assert n_total_random_cameras % num_shapes == 0
    shape = shape.extend(n_total_random_cameras // num_shapes)

    bg_color = torch.rand((n_total_random_cameras, 3), device=self.device)  # channel-wise random
    background = repeat(bg_color, 'b c -> b h w c', h=resolution[0], w=resolution[1])

    # only train the texture
    shape = shape.detach()
    im_features = im_features.detach()
    set_requires_grad(texture, True)
    set_requires_grad(light, True)

    image_pred, mask_pred, _, _, albedo, shading = self.render(
        shape,
        texture,
        mvp,
        w2c,
        campos,
        resolution,
        im_features=im_features,
        light=lights,
        prior_shape=prior_shape,
        dino_pred=None,
        spp=self.renderer_spp,
        bg_image=background,
        im_features_map=feature_map_input
    )

    # Prompts (shared by the VSD and the plain SDS path).
    if prompts is None:
        prompts = n_total_random_cameras * [self.diffusion_prompt]
    else:
        # NOTE(review): the original indentation of this file was lost; the
        # prefix is applied to caller-supplied prompts only, on the assumption
        # that self.diffusion_prompt is already a full sentence -- confirm.
        prompts = n_total_random_cameras * [prompts.replace('_', ' ')]
        prompts = ['a high-resolution DSLR image of ' + x for x in prompts]
    negative_prompts = n_total_random_cameras * [self.diffusion_negative_prompt]

    if self.enable_vsd:
        assert self.diffusion_append_prompt_directions
        # TODO: check if this implementation is aligned with stable-diffusion-prompt-processor
        prompts_vd = append_text_direction(prompts, dirs)
        text_embeddings = self.stable_diffusion.get_text_embeds(prompts, negative_prompts)  # [BB, 77, 768]
        text_embeddings_vd = self.stable_diffusion.get_text_embeds(prompts_vd, negative_prompts)
        # Camera condition is the camera-to-world matrix ('c2w'); the
        # alternative considered in the original code was mvp.detach().
        camera_condition = torch.linalg.inv(w2c).detach()
    else:
        if self.diffusion_append_prompt_directions:
            prompts = append_text_direction(prompts, dirs)
        text_embeddings = self.stable_diffusion.get_text_embeds(prompts, negative_prompts)  # [2, 77, 768]

    # Alternate among albedo, shading, and image
    rand = torch.rand(n_total_random_cameras, device=self.device)
    rendered_component = torch.zeros_like(image_pred)
    mask_pred = mask_pred[:, None]
    background = rearrange(background, 'b h w c -> b c h w')
    albedo_flag = rand > (1 - self.diffusion_albedo_ratio)
    rendered_component[albedo_flag] = albedo[albedo_flag] * mask_pred[albedo_flag] + (1 - mask_pred[albedo_flag]) * background[albedo_flag]
    shading_flag = (rand > (1 - self.diffusion_albedo_ratio - self.diffusion_shading_ratio)) & (rand <= (1 - self.diffusion_albedo_ratio))
    rendered_component[shading_flag] = shading.repeat(1, 3, 1, 1)[shading_flag] / 2 * mask_pred[shading_flag] + (1 - mask_pred[shading_flag]) * background[shading_flag]
    image_flag = ~(albedo_flag | shading_flag)
    rendered_component[image_flag] = image_pred[image_flag]

    if self.enable_vsd:
        condition_label = classes_vectors
        sd_loss, sd_aux = self.stable_diffusion.train_step(
            text_embeddings,
            text_embeddings_vd,
            rendered_component,
            camera_condition,  # TODO: can we input category condition in lora?
            condition_label,
            guidance_scale=self.diffusion_guidance_scale,
            guidance_scale_lora=self.diffusion_guidance_scale_lora,
            loss_weight=self.diffusion_loss_weight,
            max_step_pct=self.diffusion_max_step,
            return_aux=True
        )
        aux = {'loss': sd_loss['loss_vsd'], 'loss_lora': sd_loss['loss_lora'], 'dirs': dirs, 'sd_aux': sd_aux, 'rendered_shape': shape}
    else:
        sd_loss, sd_aux = self.stable_diffusion.train_step(
            text_embeddings, rendered_component, guidance_scale=self.diffusion_guidance_scale, loss_weight=self.diffusion_loss_weight, max_step_pct=self.diffusion_max_step, return_aux=True)
        aux = {'loss': sd_loss, 'dirs': dirs, 'sd_aux': sd_aux, 'rendered_shape': shape}

    return rendered_component, aux
def parse_dict_definition(self, dict_config, total_iter):
    '''
    Look up the scheduled weight for iteration ``total_iter``.

    ``dict_config`` maps NUM_ITERATION_WEIGHT_BEGIN -> WEIGHT, with the keys in
    ascending order. For example,
        {0: 0.1, 1000: 0.2, 10000: 0.3}
    gives 0.1 from the start, 0.2 from iteration 1k, and 0.3 from 10k onwards.

    The weight of the first interval [begin, next_begin) containing
    ``total_iter`` is returned; when no interval matches (including when
    ``total_iter`` is below the first key) the last weight is returned.
    '''
    begins = list(dict_config.keys())
    weights = list(dict_config.values())
    # zip stops at the shortest input, so the last weight is never paired here
    for begin, next_begin, weight in zip(begins, begins[1:], weights):
        if begin <= total_iter < next_begin:
            return weight
    return weights[-1]
def compute_clip_loss(self, random_image_pred, image_pred, category):
    """Compare two image batches in CLIP feature space.

    Both batches are resized to ``self.clip_reso``, normalized with
    ``self.clip_mean`` / ``self.clip_std`` and encoded by ``self.clip_model``.
    The loss is one minus the mean cosine similarity between the paired image
    features. When ``self.enable_clip_text`` is set, one minus the mean
    similarity of each batch to ``self.clip_text_feature[category]`` is added.

    ``image_pred`` has its dimension 1 squeezed before resizing.

    Returns:
        dict with the single key ``'clip_all_loss'``.
    """
    cosine = torch.nn.functional.cosine_similarity
    clip_size = (self.clip_reso, self.clip_reso)

    def preprocess(images):
        # image preprocess for CLIP: resize, then channel-wise normalization
        resized = torch.nn.functional.interpolate(images, clip_size, mode='bilinear')
        return tvf.normalize(resized, self.clip_mean, self.clip_std)

    random_input = preprocess(random_image_pred)
    pred_input = preprocess(image_pred.squeeze(1))

    random_feat = self.clip_model.encode_image(random_input)
    pred_feat = self.clip_model.encode_image(pred_input)

    clip_all_loss = 1 - cosine(random_feat, pred_feat).mean()

    if self.enable_clip_text:
        text_feature = self.clip_text_feature[category].repeat(random_feat.shape[0], 1)
        random_text_sim = cosine(random_feat, text_feature).mean()
        pred_text_sim = cosine(pred_feat, text_feature).mean()
        clip_all_loss = clip_all_loss + (1 - random_text_sim) + (1 - pred_text_sim)

    return {'clip_all_loss': clip_all_loss}
def generate_patch_crop(self, images, masks, patch_size=128, patch_num_per_mask=1):
    """Crop random square patches centred on foreground mask pixels.

    For every mask, up to ``patch_num_per_mask`` centres are drawn at random
    from the pixels where the mask is positive and that lie far enough from
    the border for a ``patch_size`` crop. Each crop is taken from the matching
    image and resized to ``self.few_shot_gan_tex_patch``. A mask with fewer
    valid centres than requested contributes all-zero patches instead.

    Args:
        images: image batch indexed as ``images[i, :, y, x]``.
        masks: mask batch of shape [B, 1, H, W].
        patch_size: side length of the crop before resizing.
        patch_num_per_mask: number of patches per mask.

    Returns:
        Tensor of shape [B*K, 3, ps, ps].
    """
    _, _, height, width = masks.shape
    half = patch_size // 2
    out_size = self.few_shot_gan_tex_patch

    all_patches = []
    for i, mask in enumerate(masks):
        # mask: [1, H, W] -> rows of (channel, y, x)
        foreground = torch.nonzero(mask > 0, as_tuple=False)  # [K', 3]
        ys, xs = foreground[:, 1], foreground[:, 2]
        away_from_border = (ys > half) & (ys < (height - 1 - half)) & (xs > half) & (xs < (width - 1 - half))
        candidates = foreground[away_from_border]
        centres = candidates[torch.randperm(candidates.shape[0])[:patch_num_per_mask]]  # [K, 3]

        if centres.shape[0] < patch_num_per_mask:
            # not enough usable foreground pixels: emit blank patches
            all_patches.append(torch.zeros(patch_num_per_mask, 3, out_size, out_size).to(self.device))
            continue

        crops = []
        for _, y, x in centres:
            top = max(0, y - half)
            bottom = min(height, top + patch_size)
            left = max(0, x - half)
            right = min(width, left + patch_size)
            crop = images[i, :, top:bottom, left:right]
            crops.append(F.interpolate(crop.unsqueeze(0), size=out_size, mode='bilinear'))  # [1, 3, ps, ps]
        all_patches.append(torch.cat(crops, dim=0))  # [K, 3, ps, ps]

    return torch.concat(all_patches, dim=0)  # [B*K, 3, ps, ps]
def compute_gan_tex_loss(self, category, image_gt, mask_gt, iv_image_pred, iv_mask_pred, w2c_pred, campos_pred, shape, prior_shape, texture, dino_pred, im_features, light, class_vector, num_frames, im_features_map, bins=360):
    '''
    GAN / CLIP loss on the texture, rendered from a view rotated around the y-axis.

    This part is used to do gan training on texture, this is meant to only be used in fine-tuning, with local texture network.
    Ideally this loss only contributes to the Texture: the shape, image features and class vector
    are detached, and requires_grad is enabled only for ``texture``.

    The input-view camera is rotated by a random multiple (120..239) of ``2*pi/bins``, the mesh is
    rendered there, and the random-view render ('rv'), the input-view render ('iv') and the masked
    ground truth are optionally cropped into patches before being scored.

    Returns:
        (return_loss, return_aux): ``return_loss`` may contain 'gan_tex_loss' and 'clip_tex_loss';
        ``return_aux`` holds the three image batches clamped to [0, 1].
        The unclamped batches are also stored on ``self.record_image_{iv,rv,gt}``.
    '''
    b = len(shape)

    # Random y-axis rotation per sample; 120 + [0, 120) bins.
    rand_degree = torch.randint(120, [b]) + 120
    # rand_degree = torch.ones(b) * 180 # we want to see the reversed side
    angles = (2 * np.pi / bins * rand_degree).tolist()  # one host transfer instead of b .item() calls
    delta_rot_matrix = torch.stack([
        torch.FloatTensor([
            [np.cos(angle), 0, np.sin(angle), 0],
            [0, 1, 0, 0],
            [-np.sin(angle), 0, np.cos(angle), 0],
            [0, 0, 0, 1],
        ]).to(self.device)
        for angle in angles
    ], dim=0)

    proj = util.perspective(self.crop_fov_approx / 180 * np.pi, 1, n=0.1, f=1000.0).repeat(b, 1, 1).to(self.device)
    original_mvp = torch.bmm(proj, w2c_pred)
    original_campos = campos_pred
    mvp = torch.matmul(original_mvp, delta_rot_matrix)
    campos = torch.matmul(delta_rot_matrix[:, :3, :3].transpose(2, 1), original_campos[:, :, None])[:, :, 0]
    w2c = w2c_pred

    resolution = (self.few_shot_gan_tex_reso, self.few_shot_gan_tex_reso)
    out_size = (self.out_image_size, self.out_image_size)

    # only train the texture
    safe_detach = lambda x: x.detach() if x is not None else None
    mesh = safe_detach(shape)
    im_features = safe_detach(im_features)
    im_features_map = safe_detach(im_features_map)
    class_vector = safe_detach(class_vector)

    set_requires_grad(texture, True)
    set_requires_grad(dino_pred, False)
    set_requires_grad(light, False)

    # Background mode is fixed here; 'random' and 'random-pure' are kept as alternatives.
    background_for_reverse = 'none'
    # background_for_reverse = 'random-pure'

    image_pred, mask_pred, _, _, _, _ = self.render(
        mesh,
        texture,
        mvp,
        w2c,
        campos,
        resolution,
        background=background_for_reverse,
        im_features=im_features,
        light=light,
        prior_shape=prior_shape,
        render_flow=False,
        dino_pred=dino_pred,
        spp=self.renderer_spp,
        class_vector=class_vector,
        render_mode='diffuse',
        two_sided_shading=False,
        num_frames=num_frames,
        im_features_map={"original_mvp": original_mvp, "im_features_map": im_features_map} if im_features_map is not None else None  # in other views we need to pass the original mvp
    )

    mask_pred = mask_pred.unsqueeze(1)
    if self.few_shot_gan_tex_reso != self.out_image_size:
        image_pred = torch.nn.functional.interpolate(image_pred, out_size, mode='bilinear')
        mask_pred = torch.nn.functional.interpolate(mask_pred, out_size, mode='bilinear')

    def as_rgb_mask(mask):
        # add a channel axis at dim 2 and tile it to 3 channels
        return mask.unsqueeze(2).repeat(1, 1, 3, 1, 1)

    if background_for_reverse in ('random', 'random-pure'):
        # as we set a random background for rendering, we also need another random background for
        # the input view and the ground truth: sampled at a small resolution, then upsampled
        def sample_background():
            reso = self.few_shot_gan_tex_reso
            if background_for_reverse == 'random':
                bg = torch.rand(self.bs, self.nf, 3, reso, reso).to(self.device)
            else:
                # the background is random but with one color
                bg = torch.rand(b)[..., None, None, None, None].repeat(1, 1, 3, reso, reso).to(self.device)
            return torch.nn.functional.interpolate(bg.squeeze(1), out_size, mode='bilinear').unsqueeze(1)

        random_bg = sample_background()
        iv_mask_pred = as_rgb_mask(iv_mask_pred)
        iv_image_pred = (iv_image_pred * iv_mask_pred + random_bg * (1. - iv_mask_pred)).squeeze(1)

        random_bg_gt = sample_background()
        mask_gt = as_rgb_mask(mask_gt)
        image_gt = (image_gt * mask_gt + random_bg_gt * (1. - mask_gt)).squeeze(1)
    elif background_for_reverse == 'none':
        iv_image_pred = iv_image_pred.squeeze(1)
        iv_mask_pred = as_rgb_mask(iv_mask_pred)
        mask_gt = as_rgb_mask(mask_gt)
        image_gt = (image_gt * mask_gt).squeeze(1)
    else:
        raise NotImplementedError

    # we need to let discriminator think this reverse view is Real sample
    if self.cfgs.get('few_shot_gan_tex_patch', 0) > 0:
        patch_min = self.few_shot_gan_tex_patch
        patch_max = self.few_shot_gan_tex_patch_max
        # torch.randint requires low < high; a non-increasing range means fixed-size patches
        if patch_max > patch_min:
            patch_size = torch.randint(patch_min, patch_max, (1,)).item()
        else:
            patch_size = patch_min
        # random view
        image_pred = self.generate_patch_crop(image_pred, mask_pred, patch_size, self.few_shot_gan_tex_patch_num)
        # input view
        iv_image_pred = self.generate_patch_crop(iv_image_pred, iv_mask_pred.squeeze(1)[:, 0:1, :, :], patch_size, self.few_shot_gan_tex_patch_num)
        # gt view
        image_gt = self.generate_patch_crop(image_gt, mask_gt.squeeze(1)[:, 0:1, :, :], patch_size, self.few_shot_gan_tex_patch_num)

    return_loss = {}
    if self.few_shot_gan_tex:
        # here we compute the fake sample as real loss
        gan_tex_loss = 0.0
        if 'rv' in self.few_shot_gan_tex_fake:
            d_rv = self.discriminator_texture(image_pred)
            gan_tex_loss += discriminator_architecture.bce_loss_target(d_rv, 1)
        if 'iv' in self.few_shot_gan_tex_fake:
            d_iv = self.discriminator_texture(iv_image_pred)
            gan_tex_loss += discriminator_architecture.bce_loss_target(d_iv, 1)
        return_loss['gan_tex_loss'] = gan_tex_loss

    if self.few_shot_clip_tex:
        clip_tex_loss_rv_iv = self.compute_clip_loss(image_pred, iv_image_pred.unsqueeze(1), category='none')
        clip_tex_loss_rv_gt = self.compute_clip_loss(image_pred, image_gt.unsqueeze(1), category='none')
        return_loss['clip_tex_loss'] = clip_tex_loss_rv_iv['clip_all_loss'] + clip_tex_loss_rv_gt['clip_all_loss']

    return_aux = {
        'gan_tex_render_image': image_pred.clone().clamp(0, 1),
        'gan_tex_inpview_image': iv_image_pred.clone().clamp(0, 1),
        'gan_tex_gt_image': image_gt.clone().clamp(0, 1)
    }

    # keep unclamped copies on the model for later use outside this method
    with torch.no_grad():
        self.record_image_iv = iv_image_pred.clone()
        self.record_image_rv = image_pred.clone()
        self.record_image_gt = image_gt.clone()

    return return_loss, return_aux
def compute_mask_distribution_loss(self, category, w2c_pred, shape, prior_shape, texture, dino_pred, im_features, light, class_vector, num_frames, im_features_map, bins=360):
    """Render the shape from a re-sampled camera and compare its mask with a reference distribution.

    The camera is chosen according to ``self.random_mask_law``:
      * 'batch_swap': predicted poses shuffled so the two halves of the batch trade places;
      * 'batch_swap_noy': as above, but each sample keeps its own y-translation;
      * 'random_azimuth': canonical camera with the predicted translation, rotated about y by a random bin;
      * 'random_all': canonical camera with a preset translation, rotated about y by a random bin.

    With ``self.enable_clip`` the textured image is rendered as well (texture frozen);
    otherwise only the mask is rendered.

    Returns:
        (out_loss, out_aux): ``out_loss['mask_distribution_loss']`` is the BCE against the
        reference mask multiplied by 0 (the term is currently disabled); ``out_aux`` carries
        the rendered mask, the reference mask, ``rand_degree`` and, with CLIP enabled, the render.

    Raises:
        NotImplementedError: for an unknown ``self.random_mask_law``.
    """
    b = len(shape)
    law = self.random_mask_law

    def make_projection():
        return util.perspective(self.crop_fov_approx / 180 * np.pi, 1, n=0.1, f=1000.0).repeat(b, 1, 1).to(self.device)

    if law in ('batch_swap', 'batch_swap_noy'):
        # shuffle in predicted poses: permute each half, then swap the halves
        n_pose = w2c_pred.shape[0]
        half = int(n_pose // 2)
        front_perm = torch.randperm(half)
        back_perm = torch.randperm(n_pose - half) + half
        rand_degree = torch.cat([back_perm, front_perm], dim=0).long().to(w2c_pred.device)
        w2c = w2c_pred[rand_degree]
        if law == 'batch_swap_noy':
            # we don't random swap the y-translation in discriminator loss
            w2c[:, 1, 3] = w2c_pred[:, 1, 3]
        mvp = torch.bmm(make_projection(), w2c)
        campos = -w2c[:, :3, 3]
    elif law in ('random_azimuth', 'random_all'):
        # the render rotation matrix is different
        rand_degree = torch.randint(bins, [b])
        angles = (2 * np.pi / bins * rand_degree).tolist()
        delta_rot_matrix = torch.stack([
            torch.FloatTensor([
                [np.cos(angle), 0, np.sin(angle), 0],
                [0, 1, 0, 0],
                [-np.sin(angle), 0, np.cos(angle), 0],
                [0, 0, 0, 1],
            ]).to(self.device)
            for angle in angles
        ], dim=0)

        w2c = torch.FloatTensor(np.diag([1., 1., 1., 1]))
        w2c[:3, 3] = torch.FloatTensor([0, 0, -self.cam_pos_z_offset * 1.4])
        w2c = w2c.repeat(b, 1, 1).to(self.device)
        if law == 'random_azimuth':
            # use the predicted translation ('random_all' keeps the preset one)
            w2c_pred = w2c_pred.detach()
            w2c[:, :3, 3] = w2c_pred[:b][:, :3, 3]

        mvp = torch.bmm(make_projection(), w2c)
        campos = -w2c[:, :3, 3]
        mvp = torch.matmul(mvp, delta_rot_matrix)
        campos = torch.matmul(delta_rot_matrix[:, :3, :3].transpose(2, 1), campos[:, :, None])[:, :, 0]
    else:
        raise NotImplementedError

    # render the articulated shape
    mesh = shape
    if self.enable_clip:
        resolution = (self.clip_render_size, self.clip_render_size)
        set_requires_grad(texture, False)
        image_pred, mask_pred, _, _, _, _ = self.render(
            mesh,
            texture,
            mvp,
            w2c,
            campos,
            resolution,
            background='none',
            im_features=im_features,
            light=light,
            prior_shape=prior_shape,
            render_flow=False,
            dino_pred=dino_pred,
            spp=self.renderer_spp,
            class_vector=class_vector,
            render_mode='diffuse',
            two_sided_shading=False,
            num_frames=num_frames,
            im_features_map=im_features_map
        )
        if resolution[0] != self.out_image_size:
            target_size = (self.out_image_size, self.out_image_size)
            image_pred = torch.nn.functional.interpolate(image_pred, target_size, mode='bilinear')
            mask_pred = torch.nn.functional.interpolate(mask_pred.unsqueeze(1), target_size, mode='bilinear').squeeze(1)
    else:
        resolution = (self.out_image_size, self.out_image_size)
        # mask only: no texture, features or light are needed
        _, mask_pred, _, _, _, _ = self.render(
            mesh,
            None,
            mvp,
            w2c,
            campos,
            resolution,
            background='none',
            im_features=None,
            light=None,
            prior_shape=prior_shape,
            render_flow=False,
            dino_pred=None,
            class_vector=class_vector,
            render_mode='diffuse',
            two_sided_shading=False,
            num_frames=num_frames,
            im_features_map=None
        )
        image_pred = None

    # TODO: disable mask distribution and isolate mask discriminator loss
    # The per-category lookup (self.class_mask_distribution[category]) is
    # bypassed; the "zebra" entry is used for every category.
    reference = torch.Tensor(self.class_mask_distribution["zebra"]).to(self.device)
    mask_distribution = reference.unsqueeze(0).repeat(b, 1, 1)

    if self.mask_distribution_average:
        # if use mask_distribution_average, then first average across batch then compute the loss
        mask_pred = mask_pred.mean(dim=0).unsqueeze(0).repeat(b, 1, 1)

    mask_pred = mask_pred.clamp(0, 1)
    mask_distribution = mask_distribution.clamp(0, 1)
    distribution_loss = torch.nn.functional.binary_cross_entropy(mask_pred, mask_distribution)

    out_loss = {'mask_distribution_loss': 0 * distribution_loss}
    out_aux = {
        'mask_random_pred': mask_pred.unsqueeze(1),
        'mask_distribution': mask_distribution.unsqueeze(1),
        'rand_degree': rand_degree
    }
    if self.enable_clip:
        out_aux.update({'random_render_image': image_pred})

    return out_loss, out_aux
def use_line_correct_valid_mask(self, mask_valid, p1, p2, mvp, mask_gt):
    """Project the 3D segment ``p1``-``p2`` into the image (unfinished helper).

    The segment end points are transformed by ``mvp``, perspective-divided and
    mapped to pixel coordinates of ``mask_valid``; the slope of the projected
    line and the pixel coordinates of the foreground of ``mask_gt`` are then
    computed. None of these quantities is applied yet: ``mask_valid`` is
    returned unchanged.

    Args:
        mask_valid: mask whose last two dimensions define the image size.
        p1, p2: segment end points, [B, 3].
        mvp: model-view-projection matrices, [B, 4, 4].
        mask_gt: ground-truth mask; channel 0 is read as ``mask_gt[:, 0]``.

    Returns:
        ``mask_valid``, unmodified.
    """
    line = torch.cat([p1.unsqueeze(-2), p2.unsqueeze(-2)], dim=-2)  # [B, 2, 3]
    line_world4 = torch.cat([line, torch.ones_like(line[..., :1])], -1)
    line_clip4 = line_world4 @ mvp.transpose(-1, -2)
    line_uv = (line_clip4[..., :2] / line_clip4[..., 3:4]).detach()

    # map normalized device coordinates to pixel coordinates
    b, _, n_uv = line_uv.shape
    half_size = torch.Tensor([mask_valid.shape[-2] // 2, mask_valid.shape[-1] // 2]).to(line_uv.device)
    # NOTE(review): the half sizes are broadcast along dim 1 of line_uv (the
    # end-point axis), which is only equivalent to per-coordinate scaling for
    # square images -- confirm before relying on non-square masks.
    half_size = half_size.unsqueeze(0).unsqueeze(-1).repeat(b, 1, n_uv)
    line_uv = line_uv * half_size + half_size

    # TODO: the quantities below are not used yet.
    line_slope = (line_uv[:, 0, 1] - line_uv[:, 1, 1]) / (line_uv[:, 0, 0] - line_uv[:, 1, 0])

    uv = np.mgrid[0:mask_valid.shape[-2], 0:mask_valid.shape[-1]].astype(np.int32)
    uv = torch.from_numpy(np.flip(uv, axis=0).copy()).float().unsqueeze(0).repeat(b, 1, 1, 1)  # [B, 2, H, W]
    # keep the grid on the mask's device so the boolean indexing is valid
    uv = uv.to(mask_gt.device)
    foreground = mask_gt[:, 0, ...].bool()
    tmp_u = uv[:, 0, ...][foreground]
    tmp_v = uv[:, 1, ...][foreground]

    return mask_valid
def discriminator_step(self):
    """Run one optimizer step of the mask discriminator on the recorded masks.

    Uses the masks stored on ``self`` (``record_mask_gt``, ``record_mask_iv``,
    ``record_mask_rv``). Random-view masks are always labelled fake (0).
    Ground-truth masks (if ``self.disc_gt``) are labelled real (1). Input-view
    masks (if ``self.disc_iv``) are labelled real when
    ``self.disc_iv_label == 'Real'`` and fake otherwise. Real-labelled terms
    get a gradient penalty scaled by ``self.disc_reg_mul`` whenever the
    discriminator output carries gradients. The averaged loss is scaled by
    ``self.discriminator_loss_weight``, back-propagated and applied with
    ``self.optimizerDiscriminator``.

    Returns:
        (stats, grad_loss): a dict with the averaged loss, the per-source
        losses and the raw discriminator outputs, plus the averaged gradient
        penalty.
    """
    bce_target = discriminator_architecture.bce_loss_target

    def real_label_loss(scores, inputs):
        # Without gradients on the output (the test case) there is no penalty.
        if scores.requires_grad is False:
            return bce_target(scores, 1), None
        penalty = self.disc_reg_mul * discriminator_architecture.compute_grad2(scores, inputs)
        return bce_target(scores, 1) + penalty, penalty

    mask_gt = self.record_mask_gt
    mask_pred = self.record_mask_iv
    mask_random_pred = self.record_mask_rv

    self.optimizerDiscriminator.zero_grad()

    # the random view masks are fake for the discriminator
    d_random_pred = self.mask_disc(mask_random_pred)
    disc_loss = bce_target(d_random_pred, 0)
    discriminator_loss_rv = disc_loss.detach()

    grad_loss = 0.0
    count = 1
    discriminator_loss_gt = 0.0
    discriminator_loss_iv = 0.
    d_gt = None
    d_iv = None

    if self.disc_gt:
        mask_gt.requires_grad_()
        d_gt = self.mask_disc(mask_gt)
        disc_gt_loss, penalty = real_label_loss(d_gt, mask_gt)
        if penalty is not None:
            grad_loss += penalty
        disc_loss = disc_loss + disc_gt_loss
        discriminator_loss_gt = disc_gt_loss
        count += 1

    if self.disc_iv:
        mask_pred.requires_grad_()
        d_iv = self.mask_disc(mask_pred)
        if self.disc_iv_label == 'Real':
            disc_iv_loss, penalty = real_label_loss(d_iv, mask_pred)
            if penalty is not None:
                grad_loss += penalty
        else:
            disc_iv_loss = bce_target(d_iv, 0)
        disc_loss = disc_loss + disc_iv_loss
        count += 1
        discriminator_loss_iv = disc_iv_loss

    disc_loss = disc_loss / count
    grad_loss = grad_loss / count

    self.discriminator_loss = disc_loss * self.discriminator_loss_weight
    self.discriminator_loss.backward()
    self.optimizerDiscriminator.step()
    self.discriminator_loss = 0.

    stats = {
        'discriminator_loss': disc_loss,
        'discriminator_loss_rv': discriminator_loss_rv,
        'discriminator_loss_iv': discriminator_loss_iv,
        'discriminator_loss_gt': discriminator_loss_gt,
        'd_rv': d_random_pred,
        'd_iv': d_iv,
        'd_gt': d_gt,
    }
    return stats, grad_loss
def compute_mask_disc_loss_gen(self, mask_gt, mask_pred, mask_random_pred, category_name=None, condition_feat=None):
    """Generator-side loss of the mask discriminator.

    A condition map is concatenated to every mask along dim 1: either a
    one-hot category map (when ``self.mask_disc_feat_condition`` is false) or
    the detached ``condition_feat`` tiled over the image. The random-view mask
    is then pushed towards the "real" label; the input-view mask is added to
    the loss only when ``self.disc_iv`` is set and the discriminator treats
    input views as fake (``self.disc_iv_label != 'Real'``).

    The conditioned masks are stored, detached, on ``self.record_mask_gt``,
    ``self.record_mask_iv`` and ``self.record_mask_rv`` for the discriminator
    update.

    Args:
        mask_gt, mask_pred, mask_random_pred: mask batches; the last two
            dimensions are the image size and dim 1 is the channel axis.
        category_name: key looked up in ``self.netPrior.category_id_map``.
        condition_feat: feature used as the condition in feature mode.

    Returns:
        dict with 'mask_disc_loss', 'mask_disc_loss_rv', 'mask_disc_loss_iv'.
    """
    if not self.mask_disc_feat_condition:
        category_names = list(self.netPrior.category_id_map.keys())
        try:
            class_idx = category_names.index(category_name)
        except ValueError:
            # unknown category.
            # NOTE(review): one_hot below requires class_idx < num_classes, so
            # this fallback presumably relies on more than 100 categories -- confirm.
            class_idx = 100
        num_classes = len(category_names)
        class_idx = torch.LongTensor([class_idx])
        class_one_hot = torch.nn.functional.one_hot(class_idx, num_classes=num_classes).unsqueeze(-1).unsqueeze(-1).to(mask_gt.device)
        class_one_hot = class_one_hot.repeat(mask_gt.shape[0], 1, mask_gt.shape[-2], mask_gt.shape[-1])
        # TODO: a hack try here -- truncate to the channels the discriminator accepts besides the mask
        class_one_hot = class_one_hot[:, :(self.mask_disc.in_dim - 1), :, :]
    else:
        class_one_hot = condition_feat.detach()
        class_one_hot = class_one_hot.reshape(1, -1, 1, 1).repeat(mask_gt.shape[0], 1, mask_gt.shape[-2], mask_gt.shape[-1])

    # concat the condition to every mask along the channel axis
    mask_gt = torch.cat([mask_gt, class_one_hot], dim=1)
    mask_pred = torch.cat([mask_pred, class_one_hot], dim=1)
    mask_random_pred = torch.cat([mask_random_pred, class_one_hot], dim=1)

    # the random view masks are fake for the discriminator
    d_random_pred = self.mask_disc(mask_random_pred)
    disc_loss = discriminator_architecture.bce_loss_target(d_random_pred, 1)  # in gen loss, train it to be real
    count = 1

    disc_loss_rv = disc_loss.detach()
    disc_loss_iv = 0.0

    if self.disc_iv and self.disc_iv_label != 'Real':  # consider the input view also fake
        d_iv = self.mask_disc(mask_pred)
        disc_iv_loss = discriminator_architecture.bce_loss_target(d_iv, 1)  # so now we need to train them to be real
        disc_loss = disc_loss + disc_iv_loss
        count = count + 1
        disc_loss_iv = disc_iv_loss.detach()

    disc_loss = disc_loss / count

    # record the masks for discriminator training
    self.record_mask_gt = mask_gt.clone().detach()
    self.record_mask_iv = mask_pred.clone().detach()
    self.record_mask_rv = mask_random_pred.clone().detach()

    return {
        'mask_disc_loss': disc_loss,
        'mask_disc_loss_rv': disc_loss_rv,
        'mask_disc_loss_iv': disc_loss_iv,
    }
def forward(self, batch, epoch, iter, is_train=True, viz_logger=None, total_iter=None, save_results=False, save_dir=None, which_data='', logger_prefix='', is_training=True, bank_embedding=None):
    """Run one forward pass: predict shape/pose, render, and compute all losses.

    Args:
        batch: sequence of 12 entries, unpacked as (input_image, mask_gt,
            mask_dt, mask_valid, flow_gt, bbox, bg_image, dino_feat_im,
            dino_cluster_im, seq_idx, frame_idx, category_name). Tensors are
            moved to ``self.device``. ``input_image`` is BxFxCxHxW. 2-D
            ``flow_gt`` / ``dino_cluster_im`` / ``dino_feat_im`` tensors are
            treated as "not loaded" placeholders.
        epoch: current epoch; gates pose-only training, x-flip augmentation
            and several epoch-scheduled losses.
        iter: unused in this method (kept for interface compatibility).
        is_train: when False, SDF perturbation of the prior is disabled.
        viz_logger: optional logger exposing ``add_image`` / ``add_histogram``
            / ``add_video``; when given, visualizations are logged.
        total_iter: global iteration counter used by the weight schedules.
        save_results: when True (and ``self.rank == 0``) images and poses
            are written to ``save_dir``.
        save_dir: output directory for ``save_results``.
        which_data: ground-truth flow is dropped unless this is ``'video'``.
        logger_prefix: prefix prepended to every logged tag.
        is_training: forwarded to the prior and instance networks.
        bank_embedding: optional indexable; ``[0]`` is the batch embedding,
            ``[1]`` the per-sample embeddings, ``[2]`` a dict with
            ``'weights'`` and ``'pick_idx'`` (used for logging only).

    Returns:
        dict with ``'loss'`` (weighted total of this call) plus every
        individual loss term and ``'logit_loss_target'``.

    Side effects:
        Sets ``self.bs``, ``self.nf`` and ``self.train_pose_only``, and adds
        the weighted total to ``self.total_loss``.
    """
    batch = [x.to(self.device) if x is not None and isinstance(x, torch.Tensor) else x for x in batch]
    input_image, mask_gt, mask_dt, mask_valid, flow_gt, bbox, bg_image, dino_feat_im, dino_cluster_im, seq_idx, frame_idx, category_name = batch

    batch_size, num_frames, _, h0, w0 = input_image.shape  # BxFxCxHxW
    self.bs = batch_size
    self.nf = num_frames
    mid_img_idx = int((input_image.shape[1]-1)//2)
    h = w = self.out_image_size

    def collapseF(x):
        # Merge the batch and frame dimensions: (B, F, ...) -> (B*F, ...).
        return None if x is None else x.view(batch_size * num_frames, *x.shape[2:])

    def expandF(x):
        # Split the leading dimension back: (B*F, ...) -> (B, F, ...).
        return None if x is None else x.view(batch_size, num_frames, *x.shape[1:])

    if flow_gt.dim() == 2:  # dummy tensor for not loading flow
        flow_gt = None
    if dino_cluster_im.dim() == 2:  # dummy tensor for not loading dino clusters
        dino_cluster_im = None
        dino_cluster_im_gt = None
    else:
        dino_cluster_im_gt = expandF(torch.nn.functional.interpolate(collapseF(dino_cluster_im), size=[h, w], mode="nearest"))

    seq_idx = seq_idx.squeeze(1)
    frame_id, crop_x0, crop_y0, crop_w, crop_h, full_w, full_h, sharpness, label = bbox.unbind(2)
    bbox = torch.stack([crop_x0, crop_y0, crop_w, crop_h], 2)
    mask_gt = (mask_gt[:, :, 0, :, :] > 0.9).float()  # BxFxHxW
    mask_dt = mask_dt / self.in_image_size
    if which_data != 'video':
        flow_gt = None
    aux_viz = {}

    ## GT
    image_gt = input_image
    if self.out_image_size != self.in_image_size:
        image_gt = expandF(torch.nn.functional.interpolate(collapseF(image_gt), size=[h, w], mode='bilinear'))
        if flow_gt is not None:
            flow_gt = torch.nn.functional.interpolate(flow_gt.view(batch_size*(num_frames-1), 2, h0, w0), size=[h, w], mode="bilinear").view(batch_size, num_frames-1, 2, h, w)

    # Alternate between pose-only and full training every `pose_iters` iterations.
    self.train_pose_only = False
    if epoch in self.pose_epochs:
        if (total_iter // self.pose_iters) % 2 == 0:
            self.train_pose_only = True

    ## flip input and pose
    if epoch in self.pose_xflip_recon_epochs:
        input_image_xflip = input_image.flip(-1)
        input_image_xflip_flag = torch.randint(0, 2, (batch_size, num_frames), device=input_image.device)
        flip_weight = input_image_xflip_flag[:, :, None, None, None]
        input_image = input_image * (1 - flip_weight) + input_image_xflip * flip_weight
    else:
        input_image_xflip_flag = None

    # ==============================================================================================
    # Predict prior mesh.
    # ==============================================================================================
    if not self.enable_prior:
        raise NotImplementedError
    use_ddp = self.world_size > 1
    prior_net = self.netPrior_ddp if use_ddp else self.netPrior
    prior_net_shape = (prior_net.module if use_ddp else prior_net).netShape
    # Use the coarser tetrahedral grid during the early epochs.
    target_grid_res = self.dmtet_grid_smaller if epoch < self.dmtet_grid_smaller_epoch else self.dmtet_grid
    if prior_net_shape.grid_res != target_grid_res:
        prior_net_shape.load_tets(target_grid_res)

    perturb_sdf = self.perturb_sdf if is_train else False
    # Only the first category name of the batch is used to query the prior.
    prior_shape, dino_pred, classes_vectors = prior_net(category_name=category_name[0], perturb_sdf=perturb_sdf, total_iter=total_iter, is_training=is_training, class_embedding=bank_embedding)

    # ==============================================================================================
    # Predict instance-specific quantities (frame dim collapsed, N = B*F).
    # ==============================================================================================
    # Both the DDP and the plain module go through the same unpacking so that
    # `im_features_map` is always bound (previously it was undefined under DDP).
    instance_net = self.netInstance_ddp if use_ddp else self.netInstance
    instance_out = instance_net(category_name, input_image, prior_shape, epoch, dino_feat_im, dino_cluster_im, total_iter, is_training=is_training)
    if len(instance_out) == 13:
        shape, pose_raw, pose, mvp, w2c, campos, texture, im_features, dino_feat_im_calc, deformation, arti_params, light, forward_aux = instance_out
        im_features_map = None  # the network produced no patch feature map
    else:
        shape, pose_raw, pose, mvp, w2c, campos, texture, im_features, dino_feat_im_calc, deformation, arti_params, light, forward_aux, im_features_map = instance_out

    if self.calc_dino_features == True:
        # L2-normalize the computed DINO features over the flattened trailing
        # dimensions. Local names are used on purpose: rebinding `batch_size`
        # here would silently change collapseF/expandF and everything below.
        n_feat, d1, d2, d3 = dino_feat_im_calc.shape
        dino_feat_im = dino_feat_im_calc.reshape(n_feat, d1, -1)
        norm = torch.norm(dino_feat_im, dim=-1, keepdim=True)
        dino_feat_im = dino_feat_im / norm
        dino_feat_im = dino_feat_im.reshape(n_feat, d1, d2, d3)
        dino_feat_im = dino_feat_im.unsqueeze(1)

    if dino_feat_im.dim() == 2:  # dummy tensor for not loading dino features
        dino_feat_im = None
        dino_feat_im_gt = None
    else:
        dino_feat_im_gt = expandF(torch.nn.functional.interpolate(collapseF(dino_feat_im), size=[h, w], mode="bilinear"))[:, :, :self.dino_feature_recon_dim]

    rot_logit = forward_aux['rot_logit']
    rot_prob = forward_aux['rot_prob']
    posed_bones = forward_aux['posed_bones'] if self.using_bonevel_smooth_loss else None
    aux_viz.update(forward_aux)

    if self.train_pose_only:
        # Cut gradients to everything except the pose prediction.
        def safe_detach(x):
            return x.detach() if x is not None else None

        prior_shape = safe_detach(prior_shape)
        shape = safe_detach(shape)
        im_features = safe_detach(im_features)
        arti_params = safe_detach(arti_params)
        deformation = safe_detach(deformation)
    for module in (texture, light, dino_pred):
        set_requires_grad(module, not self.train_pose_only)

    render_flow = self.render_flow and num_frames > 1

    # Select the class vector of the current category (if the prior provides any).
    class_vector = None
    if classes_vectors is not None:
        if len(classes_vectors.shape) == 1:
            class_vector = classes_vectors
        else:
            class_vector = classes_vectors[self.netPrior.category_id_map[category_name[0]], :]
    # One row per frame; stays None when there is no class vector instead of
    # failing on `None[None, :]`.
    class_vector_batch = None
    if class_vector is not None:
        class_vector_batch = class_vector[None, :].expand(batch_size * num_frames, -1)

    # Main rendering pass.
    image_pred, mask_pred, flow_pred, dino_feat_im_pred, albedo, shading = self.render(
        shape, texture, mvp, w2c, campos, (h, w),
        background=self.background_mode, im_features=im_features, light=light,
        prior_shape=prior_shape, render_flow=render_flow, dino_pred=dino_pred,
        class_vector=class_vector_batch, num_frames=num_frames,
        spp=self.renderer_spp, im_features_map=im_features_map)
    image_pred, mask_pred, flow_pred, dino_feat_im_pred = map(expandF, (image_pred, mask_pred, flow_pred, dino_feat_im_pred))
    if flow_pred is not None:
        flow_pred = flow_pred[:, :-1]  # Bx(F-1)x2xHxW

    if self.blur_mask:
        # Blur strength anneals with the iteration count; below 0.5 no blur is applied.
        sigma = max(0.5, 3 * (1 - total_iter / self.blur_mask_iter))
        if sigma > 0.5:
            mask_gt = util.blur_image(mask_gt, kernel_size=9, sigma=sigma, mode='gaussian')

    losses = self.compute_reconstruction_losses(image_pred, image_gt, mask_pred, mask_gt, mask_dt, mask_valid, flow_pred, flow_gt, dino_feat_im_gt, dino_feat_im_pred, background_mode=self.background_mode, reduce=False)

    sdf_reg_names = ('sdf_bce_reg_loss', 'sdf_gradient_reg_loss', 'sdf_inflate_reg_loss')

    def decay_sdf_reg_weight(name, weight):
        # Linearly decay SDF regularizer weights over 10000 iterations after
        # `sdf_reg_decay_start_iter`, floored at the configured minimum.
        if name in sdf_reg_names and total_iter >= self.sdf_reg_decay_start_iter:
            decay_rate = max(0, 1 - (total_iter - self.sdf_reg_decay_start_iter) / 10000)
            return max(weight * decay_rate, self.cfgs.get(f"{name}_min_weight", 0.))
        return weight

    ## TODO: assume flow loss is not used
    logit_loss_target = torch.zeros_like(expandF(rot_logit))
    final_losses = {}
    for name, loss in losses.items():
        if name == 'flow_loss':
            continue
        loss_weight_logit = self.cfgs.get(f"{name}_weight", 0.)
        if isinstance(loss_weight_logit, dict):
            loss_weight_logit = self.parse_dict_definition(loss_weight_logit, total_iter)
        loss_weight_logit = decay_sdf_reg_weight(name, loss_weight_logit)
        if name == 'dino_feat_im_loss':
            dino_feat_im_loss_multipler = self.cfgs.get("logit_loss_dino_feat_im_loss_multiplier", 1.)
            if isinstance(dino_feat_im_loss_multipler, dict):
                dino_feat_im_loss_multipler = self.parse_dict_definition(dino_feat_im_loss_multipler, total_iter)
            loss_weight_logit = loss_weight_logit * dino_feat_im_loss_multipler
        if loss_weight_logit > 0:
            logit_loss_target += loss * loss_weight_logit
        if self.netInstance.rot_rep in ['quadlookat', 'octlookat']:
            # Weight the per-frame loss by the probability of the sampled pose hypothesis.
            loss = loss * rot_prob.detach().view(batch_size, num_frames)[:, :loss.shape[1]] * self.netInstance.num_pose_hypos
        final_losses[name] = loss.mean()
    final_losses['logit_loss'] = ((expandF(rot_logit) - logit_loss_target.detach())**2.).mean()

    ## score distillation sampling
    sds_random_images = None
    if self.enable_sds:
        prompts = category_name[0] if classes_vectors is not None else None
        sds_random_images, sds_aux = self.score_distillation_sampling(
            shape, texture, [self.diffusion_resolution, self.diffusion_resolution],
            im_features, light, prior_shape, prompts=prompts,
            classes_vectors=class_vector_batch, im_features_map=im_features_map, w2c_pred=w2c)
        if self.enable_vsd:
            final_losses.update({'vsd_loss': sds_aux['loss']})
            final_losses.update({'vsd_lora_loss': sds_aux['loss_lora']})
        else:
            final_losses.update({'sds_loss': sds_aux['loss']})

    ## mask distribution loss
    mask_distribution_aux = None
    if self.enable_mask_distribution:
        if total_iter % self.mask_distribution_loss_freq == 0:
            mask_distribution_loss, mask_distribution_aux = self.compute_mask_distribution_loss(category_name[0], w2c, shape, prior_shape, texture, dino_pred, im_features, light, class_vector_batch, num_frames, im_features_map)
            final_losses.update(mask_distribution_loss)

            # this also follows the iteration frequency
            if self.enable_clip:
                random_render_image = mask_distribution_aux["random_render_image"]
                clip_all_loss = self.compute_clip_loss(random_render_image, image_pred, category_name[0])  # a dict
                final_losses.update(clip_all_loss)

    # Mask discriminator (generator side).
    # NOTE(review): this reads mask_distribution_aux, which is None on iterations
    # where the mask distribution loss was not computed — confirm the configs
    # guarantee both run on the same iterations.
    if self.enable_disc and (self.mask_discriminator_iter[0] < total_iter) and (self.mask_discriminator_iter[1] > total_iter):
        disc_loss = self.compute_mask_disc_loss_gen(mask_gt, mask_pred, mask_distribution_aux['mask_random_pred'], category_name=category_name[0], condition_feat=class_vector)
        final_losses.update(disc_loss)

    # GAN training for local texture in fine-tuning: always without a logger,
    # and with a logger only for the 'train_' prefix.
    gan_tex_aux = None
    if self.few_shot_gan_tex and (viz_logger is None or logger_prefix == 'train_'):
        gan_tex_loss, gan_tex_aux = self.compute_gan_tex_loss(category_name[0], image_gt, mask_gt, image_pred, mask_pred, w2c, campos, shape, prior_shape, texture, dino_pred, im_features, light, class_vector_batch, num_frames, im_features_map)
        final_losses.update(gan_tex_loss)

    # Memory-bank loss: pull per-sample embeddings towards the batch embedding.
    if bank_embedding is not None:
        batch_embedding = bank_embedding[0]  # [d]
        embeddings = bank_embedding[1]  # [B, d]
        bank_mean_dist = torch.nn.functional.mse_loss(embeddings, batch_embedding.unsqueeze(0).repeat(batch_size, 1))
        final_losses.update({'bank_mean_dist_loss': bank_mean_dist})

    ## regularizers
    regularizers, aux = self.compute_regularizers(shape, prior_shape, input_image, dino_feat_im, pose_raw, input_image_xflip_flag, arti_params, deformation, mid_img_idx, posed_bones=posed_bones, class_vector=class_vector.detach() if class_vector is not None else None)
    final_losses.update(regularizers)
    aux_viz.update(aux)

    # Weighted sum of all active losses, subject to the epoch/iteration schedules.
    pose_only_losses = ['silhouette_loss', 'silhouette_dt_loss', 'silhouette_inv_dt_loss', 'flow_loss', 'pose_xflip_reg_loss', 'lookat_zflip_loss', 'dino_feat_im_loss']
    total_loss = 0
    for name, loss in final_losses.items():
        loss_weight = self.cfgs.get(f"{name}_weight", 0.)
        if isinstance(loss_weight, dict):
            loss_weight = self.parse_dict_definition(loss_weight, total_iter)
        if loss_weight <= 0:
            continue

        if self.train_pose_only and name not in pose_only_losses:
            continue
        if epoch not in self.flow_loss_epochs and name == 'flow_loss':
            continue
        if epoch not in self.texture_epochs and name in ['rgb_loss', 'perceptual_loss']:
            continue
        if epoch not in self.lookat_zflip_loss_epochs and name == 'lookat_zflip_loss':
            continue
        if name in ['mesh_laplacian_smoothing_loss', 'mesh_normal_consistency_loss']:
            if total_iter < self.cfgs.get('mesh_reg_start_iter', 0):
                continue
            if epoch >= self.mesh_reg_decay_epoch:
                decay_rate = self.mesh_reg_decay_rate ** (epoch - self.mesh_reg_decay_epoch)
                loss_weight = max(loss_weight * decay_rate, self.cfgs.get(f"{name}_min_weight", 0.))
        if epoch not in self.sdf_inflate_reg_loss_epochs and name == 'sdf_inflate_reg_loss':
            continue
        # The articulation regularizer is gated by iteration when a start
        # iteration is configured, otherwise by epoch.
        if self.iter_arti_reg_loss_start is not None:
            if total_iter <= self.iter_arti_reg_loss_start and name == 'arti_reg_loss':
                continue
        else:
            if epoch not in self.arti_reg_loss_epochs and name == 'arti_reg_loss':
                continue
        loss_weight = decay_sdf_reg_weight(name, loss_weight)

        total_loss += loss * loss_weight

    self.total_loss += total_loss  # reset to 0 in backward step

    if torch.isnan(self.total_loss):
        print("NaN in loss...")
        # NOTE(review): interactive breakpoint kept from the original; it needs
        # ipdb and a terminal, so it will not work in unattended runs.
        import ipdb
        ipdb.set_trace()

    final_losses['logit_loss_target'] = logit_loss_target.mean()
    metrics = {'loss': total_loss, **final_losses}

    ## log visuals
    if viz_logger is not None:
        b0 = max(min(batch_size, 16//num_frames), 1)
        # image_gt may have been resized to (h, w), so reshape with its own shape.
        viz_logger.add_image(logger_prefix+'image/image_gt', misc.image_grid(image_gt.detach().cpu()[:b0,:].reshape(-1,*image_gt.shape[2:]).clamp(0,1)), total_iter)
        viz_logger.add_image(logger_prefix+'image/image_pred', misc.image_grid(image_pred.detach().cpu()[:b0,:].reshape(-1,*image_pred.shape[2:]).clamp(0,1)), total_iter)
        viz_logger.add_image(logger_prefix+'image/mask_gt', misc.image_grid(mask_gt.detach().cpu()[:b0,:].reshape(-1,*mask_gt.shape[2:]).unsqueeze(1).repeat(1,3,1,1).clamp(0,1)), total_iter)
        viz_logger.add_image(logger_prefix+'image/mask_pred', misc.image_grid(mask_pred.detach().cpu()[:b0,:].reshape(-1,*mask_pred.shape[2:]).unsqueeze(1).repeat(1,3,1,1).clamp(0,1)), total_iter)

        if self.render_flow and flow_gt is not None:
            flow_gt = flow_gt.detach().cpu()
            # Append a zero third channel and shift so the 2-D flow displays as an image.
            flow_gt_viz = torch.cat([flow_gt[:b0], torch.zeros_like(flow_gt[:b0,:,:1])], 2) + 0.5  # -0.5~1.5
            flow_gt_viz = torch.nn.functional.pad(flow_gt_viz, pad=[0, 0, 0, 0, 0, 0, 0, 1])
            viz_logger.add_image(logger_prefix+'image/flow_gt', misc.image_grid(flow_gt_viz.reshape(-1,*flow_gt_viz.shape[2:])), total_iter)

        if self.render_flow and flow_pred is not None:
            flow_pred = flow_pred.detach().cpu()
            flow_pred_viz = torch.cat([flow_pred[:b0], torch.zeros_like(flow_pred[:b0,:,:1])], 2) + 0.5  # -0.5~1.5
            flow_pred_viz = torch.nn.functional.pad(flow_pred_viz, pad=[0, 0, 0, 0, 0, 0, 0, 1])
            viz_logger.add_image(logger_prefix+'image/flow_pred', misc.image_grid(flow_pred_viz.reshape(-1,*flow_pred_viz.shape[2:])), total_iter)

        if sds_random_images is not None:
            viz_logger.add_image(
                logger_prefix + 'image/sds_image',
                self.vis_sds_image(sds_random_images, sds_aux),
                total_iter)
            viz_logger.add_image(
                logger_prefix + 'image/sds_grad',
                self.vis_sds_grads(sds_aux), total_iter)

        if mask_distribution_aux is not None:
            degree_text = mask_distribution_aux['rand_degree']
            mask_random_pred = mask_distribution_aux['mask_random_pred'].detach().cpu().clamp(0, 1)
            mask_distribution_data = mask_distribution_aux['mask_distribution'].detach().cpu().clamp(0, 1)
            mask_random_pred_image = [misc.add_text_to_image(img, str(text.item())) for img, text in zip(mask_random_pred, degree_text)]
            mask_random_pred_image = misc.image_grid(mask_random_pred_image)
            mask_distribution_image = misc.image_grid(mask_distribution_data)
            viz_logger.add_image(
                logger_prefix + 'image/mask_random_pred',
                mask_random_pred_image,
                total_iter)
            viz_logger.add_image(
                logger_prefix + 'image/mask_distribution',
                mask_distribution_image,
                total_iter)

        if gan_tex_aux is not None:
            for aux_key in ('gan_tex_render_image', 'gan_tex_inpview_image', 'gan_tex_gt_image'):
                gan_tex_image = misc.image_grid(gan_tex_aux[aux_key].detach().cpu().clamp(0, 1))
                viz_logger.add_image(
                    logger_prefix + 'image/' + aux_key,
                    gan_tex_image,
                    total_iter)

        if light is not None:
            param_names = ['dir_x', 'dir_y', 'dir_z', 'int_ambient', 'int_diffuse']
            for name, param in zip(param_names, light.light_params.unbind(-1)):
                viz_logger.add_histogram(logger_prefix+'light/'+name, param, total_iter)
            viz_logger.add_image(
                logger_prefix + f'image/albedo',
                misc.image_grid(expandF(albedo)[:b0, ...].view(-1, *albedo.shape[1:])),
                total_iter)
            viz_logger.add_image(
                logger_prefix + f'image/shading',
                misc.image_grid(expandF(shading)[:b0, ...].view(-1, *shading.shape[1:]).repeat(1, 3, 1, 1) /2.),
                total_iter)

        viz_logger.add_histogram(logger_prefix+'sdf', self.netPrior.netShape.get_sdf(perturb_sdf=False, class_vector=class_vector), total_iter)
        viz_logger.add_histogram(logger_prefix+'coordinates', shape.v_pos, total_iter)
        if arti_params is not None:
            viz_logger.add_histogram(logger_prefix+'arti_params', arti_params, total_iter)
        viz_logger.add_histogram(logger_prefix+'edge_lengths', aux_viz['edge_lengths'], total_iter)
        if deformation is not None:
            viz_logger.add_histogram(logger_prefix+'deformation', deformation, total_iter)

        euler_names = ['rot_x', 'rot_y', 'rot_z', 'trans_x', 'trans_y', 'trans_z']
        quat_names = ['qt_0', 'qt_1', 'qt_2', 'qt_3', 'trans_x', 'trans_y', 'trans_z']
        rot_rep = self.netInstance.rot_rep
        if rot_rep == 'euler_angle' or rot_rep == 'soft_calss':
            for i, name in enumerate(euler_names):
                viz_logger.add_histogram(logger_prefix+'pose/'+name, pose[...,i], total_iter)
        elif rot_rep == 'quaternion':
            for i, name in enumerate(quat_names):
                viz_logger.add_histogram(logger_prefix+'pose/'+name, pose[...,i], total_iter)
            # NOTE(review): `pytorch3d` is not imported in the visible file header;
            # presumably it comes in through `from .renderer import *` — verify.
            rot_euler = pytorch3d.transforms.matrix_to_euler_angles(pytorch3d.transforms.quaternion_to_matrix(pose.detach().cpu()[...,:4]), convention='XYZ')
            for i, name in enumerate(['rot_x', 'rot_y', 'rot_z']):
                viz_logger.add_histogram(logger_prefix+'pose/'+name, rot_euler[...,i], total_iter)
        elif rot_rep in ['lookat', 'quadlookat', 'octlookat']:
            for i, name in enumerate(['fwd_x', 'fwd_y', 'fwd_z']):
                viz_logger.add_histogram(logger_prefix+'pose/'+name, pose_raw[...,i], total_iter)
            for i, name in enumerate(['trans_x', 'trans_y', 'trans_z']):
                viz_logger.add_histogram(logger_prefix+'pose/'+name, pose_raw[...,-3+i], total_iter)

        if rot_rep in ['quadlookat', 'octlookat']:
            for i, rp in enumerate(forward_aux['rots_probs'].unbind(-1)):
                viz_logger.add_histogram(logger_prefix+'pose/rot_prob_%d'%i, rp, total_iter)

        if bank_embedding is not None:
            weights_for_emb = bank_embedding[2]['weights']  # [B, k]
            for i, weight_for_emb in enumerate(weights_for_emb.unbind(-1)):
                viz_logger.add_histogram(logger_prefix+'bank_embedding/emb_weight_%d'%i, weight_for_emb, total_iter)
            indices_for_emb = bank_embedding[2]['pick_idx']  # [B, k]
            for i, idx_for_emb in enumerate(indices_for_emb.unbind(-1)):
                viz_logger.add_histogram(logger_prefix+'bank_embedding/emb_idx_%d'%i, idx_for_emb, total_iter)

        if 'pose_xflip_raw' in aux_viz:
            pose_xflip_raw = aux_viz['pose_xflip_raw']
            # `pose_xflip` was referenced without ever being defined (NameError).
            # Read it from the aux dict when available and skip these histograms
            # otherwise. NOTE(review): confirm the key name used by
            # compute_regularizers for the processed flipped pose.
            pose_xflip = aux_viz.get('pose_xflip')
            if rot_rep == 'euler_angle' or rot_rep == 'soft_calss':
                if pose_xflip is not None:
                    for i, name in enumerate(euler_names):
                        viz_logger.add_histogram(logger_prefix+'pose_xflip/'+name, pose_xflip[...,i], total_iter)
            elif rot_rep == 'quaternion':
                if pose_xflip is not None:
                    for i, name in enumerate(quat_names):
                        viz_logger.add_histogram(logger_prefix+'pose_xflip/'+name, pose_xflip[...,i], total_iter)
                    rot_euler = pytorch3d.transforms.matrix_to_euler_angles(pytorch3d.transforms.quaternion_to_matrix(pose_xflip.detach().cpu()[...,:4]), convention='XYZ')
                    for i, name in enumerate(['rot_x', 'rot_y', 'rot_z']):
                        viz_logger.add_histogram(logger_prefix+'pose_xflip/'+name, rot_euler[...,i], total_iter)
            elif rot_rep in ['lookat', 'quadlookat', 'octlookat']:
                for i, name in enumerate(['fwd_x', 'fwd_y', 'fwd_z']):
                    viz_logger.add_histogram(logger_prefix+'pose_xflip/'+name, pose_xflip_raw[...,i], total_iter)
                for i, name in enumerate(['trans_x', 'trans_y', 'trans_z']):
                    viz_logger.add_histogram(logger_prefix+'pose_xflip/'+name, pose_xflip_raw[...,-3+i], total_iter)

        # Only the first three feature channels are shown as an RGB image.
        if dino_feat_im_gt is not None:
            dino_feat_im_gt_first3 = dino_feat_im_gt[:,:,:3]
            viz_logger.add_image(logger_prefix+'image/dino_feat_im_gt', misc.image_grid(dino_feat_im_gt_first3.detach().cpu()[:b0,:].reshape(-1,*dino_feat_im_gt_first3.shape[2:]).clamp(0,1)), total_iter)

        if dino_cluster_im_gt is not None:
            viz_logger.add_image(logger_prefix+'image/dino_cluster_im_gt', misc.image_grid(dino_cluster_im_gt.detach().cpu()[:b0,:].reshape(-1,*dino_cluster_im_gt.shape[2:]).clamp(0,1)), total_iter)

        if dino_feat_im_pred is not None:
            dino_feat_im_pred_first3 = dino_feat_im_pred[:,:,:3]
            viz_logger.add_image(logger_prefix+'image/dino_feat_im_pred', misc.image_grid(dino_feat_im_pred_first3.detach().cpu()[:b0,:].reshape(-1,*dino_feat_im_pred_first3.shape[2:]).clamp(0,1)), total_iter)

        for which_shape, modes in self.extra_renders.items():
            # Rendering the prior shape here is not supported.
            if which_shape == "instance":
                shape_to_render = shape
                needed_im_features = im_features
            else:
                raise NotImplementedError

            for mode in modes:
                # Renders below are for visualization only.
                if mode in ['gray']:
                    gray_light = FixedDirectionLight(direction=torch.FloatTensor([0, 0, 1]).to(self.device), amb=0.2, diff=0.7)
                    _, render_mask, _, _, _, rendered = self.render(shape_to_render, texture, mvp, w2c, campos, (h, w), background=self.background_mode, im_features=needed_im_features, prior_shape=prior_shape, render_mode='diffuse', light=gray_light, render_flow=False, dino_pred=None, im_features_map=im_features_map)
                    if self.background_mode == 'white':
                        # we want to render shading here, which is always black background, so modify here
                        render_mask = render_mask.unsqueeze(1)
                        rendered[render_mask == 0] = 1
                    rendered = rendered.repeat(1, 3, 1, 1)
                else:
                    rendered, _, _, _, _, _ = self.render(shape_to_render, texture, mvp, w2c, campos, (h, w), background=self.background_mode, im_features=needed_im_features, prior_shape=prior_shape, render_mode=mode, render_flow=False, dino_pred=None, im_features_map=im_features_map)
                if 'kd' in mode:
                    rendered = util.rgb_to_srgb(rendered)
                rendered = rendered.detach().cpu()
                rendered_wo_bones = rendered

                if 'posed_bones' in aux_viz:
                    # Overlay the rendered bones at 80% opacity.
                    rendered_bone_image = self.render_bones(mvp, aux_viz['posed_bones'], (h, w))
                    rendered_bone_image_mask = (rendered_bone_image < 1).any(1, keepdim=True).float()
                    rendered = rendered_bone_image_mask*0.8 * rendered_bone_image + (1-rendered_bone_image_mask*0.8) * rendered

                if rot_rep in ['quadlookat', 'octlookat']:
                    # Mark frames flagged by `rand_pose_flag` with a red square in the corner.
                    rand_pose_flag = forward_aux['rand_pose_flag'].detach().cpu()
                    rand_pose_marker_mask = torch.zeros_like(rendered)
                    rand_pose_marker_mask[:,:,:16,:16] = 1.
                    rand_pose_marker_mask = rand_pose_marker_mask * rand_pose_flag[:,None,None,None]
                    red = torch.FloatTensor([1,0,0])[None,:,None,None]
                    rendered = rand_pose_marker_mask * red + (1-rand_pose_marker_mask) * rendered

                viz_logger.add_image(
                    logger_prefix + f'image/{which_shape}_{mode}',
                    misc.image_grid(expandF(rendered)[:b0, ...].view(-1, *rendered.shape[1:])),
                    total_iter)

                if rendered_wo_bones is not None:
                    viz_logger.add_image(
                        logger_prefix + f'image/{which_shape}_{mode}_raw',
                        misc.image_grid(expandF(rendered_wo_bones)[:b0, ...].view(-1, *rendered_wo_bones.shape[1:])),
                        total_iter)

                if mode in ['gray']:
                    viz_logger.add_video(
                        logger_prefix + f'animation/{which_shape}_{mode}',
                        self.render_rotation_frames(shape_to_render, texture, gray_light, (h, w), background=self.background_mode, im_features=needed_im_features, prior_shape=prior_shape, num_frames=15, render_mode='diffuse', b=1, im_features_map=im_features_map, original_mvp=mvp, original_w2c=w2c, original_campos=campos, render_gray=True).detach().cpu().unsqueeze(0),
                        total_iter,
                        fps=2)
                else:
                    viz_logger.add_video(
                        logger_prefix + f'animation/{which_shape}_{mode}',
                        self.render_rotation_frames(shape_to_render, texture, light, (h, w), background=self.background_mode, im_features=needed_im_features, prior_shape=prior_shape, num_frames=15, render_mode=mode, b=1, im_features_map=im_features_map, original_mvp=mvp, original_w2c=w2c, original_campos=campos).detach().cpu().unsqueeze(0),
                        total_iter,
                        fps=2)

        viz_logger.add_video(
            logger_prefix+'animation/prior_image_rotation',
            self.render_rotation_frames(prior_shape, texture, light, (h, w), background=self.background_mode, im_features=im_features, num_frames=15, b=1, text=category_name[0], im_features_map=im_features_map, original_mvp=mvp).detach().cpu().unsqueeze(0).clamp(0,1),
            total_iter,
            fps=2)

        viz_logger.add_video(
            logger_prefix+'animation/prior_normal_rotation',
            self.render_rotation_frames(prior_shape, texture, light, (h, w), background=self.background_mode, im_features=im_features, num_frames=15, render_mode='geo_normal', b=1, text=category_name[0], im_features_map=im_features_map, original_mvp=mvp).detach().cpu().unsqueeze(0),
            total_iter,
            fps=2)

    # Only rank 0 writes results to disk.
    if save_results and self.rank == 0:
        b0 = self.cfgs.get('num_saved_from_each_batch', batch_size*num_frames)
        fnames = [f'{total_iter:07d}_{fid:010d}' for fid in collapseF(frame_id.int())][:b0]
        os.makedirs(save_dir, exist_ok=True)

        misc.save_images(save_dir, collapseF(image_gt)[:b0].clamp(0,1).detach().cpu().numpy(), suffix='image_gt', fnames=fnames)
        misc.save_images(save_dir, collapseF(image_pred)[:b0].clamp(0,1).detach().cpu().numpy(), suffix='image_pred', fnames=fnames)
        misc.save_images(save_dir, collapseF(mask_gt)[:b0].unsqueeze(1).repeat(1,3,1,1).clamp(0,1).detach().cpu().numpy(), suffix='mask_gt', fnames=fnames)
        misc.save_images(save_dir, collapseF(mask_pred)[:b0].unsqueeze(1).repeat(1,3,1,1).clamp(0,1).detach().cpu().numpy(), suffix='mask_pred', fnames=fnames)

        if self.render_flow and flow_gt is not None:
            flow_gt_viz = torch.cat([flow_gt, torch.zeros_like(flow_gt[:,:,:1])], 2) + 0.5  # -0.5~1.5
            flow_gt_viz = flow_gt_viz.view(-1, *flow_gt_viz.shape[2:])
            misc.save_images(save_dir, flow_gt_viz[:b0].clamp(0,1).detach().cpu().numpy(), suffix='flow_gt', fnames=fnames)
        if flow_pred is not None:
            flow_pred_viz = torch.cat([flow_pred, torch.zeros_like(flow_pred[:,:,:1])], 2) + 0.5  # -0.5~1.5
            flow_pred_viz = flow_pred_viz.view(-1, *flow_pred_viz.shape[2:])
            misc.save_images(save_dir, flow_pred_viz[:b0].clamp(0,1).detach().cpu().numpy(), suffix='flow_pred', fnames=fnames)

        misc.save_txt(save_dir, pose[:b0].detach().cpu().numpy(), suffix='pose', fnames=fnames)

    return metrics
def save_scores(self, path):
    """Write the accumulated scores to `path` and print the summary header.

    The header lists the score column names followed by the per-column mean
    and standard deviation of `self.all_scores`, both reduced over dim 0.
    """
    column_names = (
        'mask_mse, '
        'mask_iou, '
        'image_mse, '
        'flow_mse'
    )
    col_mean = self.all_scores.mean(0)
    col_std = self.all_scores.std(0)
    mean_line = 'Mean: ' + ',\t'.join('%.8f' % v for v in col_mean)
    std_line = 'Std: ' + ',\t'.join('%.8f' % v for v in col_std)
    header = '\n'.join([column_names, mean_line, std_line])
    misc.save_scores(path, self.all_scores, header=header)
    print(header)
def render_rotation_frames(self, mesh, texture, light, resolution, background='none', im_features=None, prior_shape=None, num_frames=36, render_mode='diffuse', b=None, text=None, im_features_map=None, original_mvp=None, original_w2c=None, original_campos=None, render_gray=False):
    """Render `num_frames` views of `mesh` while rotating it about the Y axis.

    Each step right-multiplies the MVP matrix by a fixed Y-axis rotation of
    2*pi/num_frames, so the sequence covers one full turn.

    Args:
        mesh, texture, light, resolution, background, prior_shape, render_mode:
            forwarded to `self.render`.
        im_features: optional per-sample features; the first `b` are used.
        num_frames: number of rendered views.
        b: number of samples to render; defaults to `len(mesh)`.
        text: optional caption drawn onto every frame.
        im_features_map: optional per-sample feature map; when given it is
            passed to the renderer together with `original_mvp`.
        original_mvp, original_w2c, original_campos: when all three are given
            they replace the default frontal camera as the starting pose.
        render_gray: use the sixth output of `self.render` (replicated to
            three channels) instead of the first.

    Returns:
        The stacked frames; per the original author's note the layout is
        (T, C, H, W).
    """
    if b is None:
        b = len(mesh)
    else:
        mesh = mesh.first_n(b)

    # Slice unconditionally so `feat` is always bound, including when `b` was
    # not supplied; with b == len(mesh) the slices simply keep the first b
    # entries.
    feat = im_features[:b] if im_features is not None else None
    im_features_map = im_features_map[:b] if im_features_map is not None else None
    original_mvp = original_mvp[:b] if original_mvp is not None else None  # [b, 4, 4]
    if im_features_map is not None:
        im_features_map = {'im_features_map': im_features_map, 'original_mvp': original_mvp}

    # Incremental rotation about the Y axis applied between consecutive frames.
    delta_angle = np.pi / num_frames * 2
    cos_d, sin_d = np.cos(delta_angle), np.sin(delta_angle)
    delta_rot_matrix = torch.FloatTensor([
        [cos_d, 0, sin_d, 0],
        [0, 1, 0, 0],
        [-sin_d, 0, cos_d, 0],
        [0, 0, 0, 1],
    ]).to(self.device).repeat(b, 1, 1)

    # Default camera: identity rotation, translated along -Z by 1.1x the
    # configured camera offset.
    w2c = torch.eye(4, dtype=torch.float32)
    w2c[:3, 3] = torch.FloatTensor([0, 0, -self.cam_pos_z_offset * 1.1])
    w2c = w2c.repeat(b, 1, 1).to(self.device)
    proj = util.perspective(self.crop_fov_approx / 180 * np.pi, 1, n=0.1, f=1000.0).repeat(b, 1, 1).to(self.device)
    mvp = torch.bmm(proj, w2c)
    campos = -w2c[:, :3, 3]

    # A caller-supplied camera overrides the default only if it is complete.
    if original_w2c is not None and original_campos is not None and original_mvp is not None:
        w2c = original_w2c[:b]
        campos = original_campos[:b]
        mvp = original_mvp[:b]

    def rotate_pose(mvp, campos):
        # Rotate the object via the MVP and move the camera position by the
        # transposed rotation so the two stay consistent.
        mvp = torch.matmul(mvp, delta_rot_matrix)
        campos = torch.matmul(delta_rot_matrix[:, :3, :3].transpose(2, 1), campos[:, :, None])[:, :, 0]
        return mvp, campos

    render_kwargs = dict(
        background=background,
        im_features=feat,
        light=light,
        prior_shape=prior_shape,
        render_flow=False,
        dino_pred=None,
        render_mode=render_mode,
        two_sided_shading=False,
        im_features_map=im_features_map,
    )

    frames = []
    for _ in range(num_frames):
        if render_gray:
            _, render_mask, _, _, _, image_pred = self.render(mesh, texture, mvp, w2c, campos, resolution, **render_kwargs)
            if self.background_mode == 'white':
                # The gray output is rendered on black; paint uncovered pixels
                # white to match the configured background.
                render_mask = render_mask.unsqueeze(1)
                image_pred[render_mask == 0] = 1
            # NOTE(review): the flattened source does not show whether this
            # repeat was nested under the white-background check; it is applied
            # to every gray render here — confirm against the original file.
            image_pred = image_pred.repeat(1, 3, 1, 1)
        else:
            # For rotation frames only the first renderer output is needed.
            image_pred, _, _, _, _, _ = self.render(mesh, texture, mvp, w2c, campos, resolution, **render_kwargs)
        image_pred = image_pred.clamp(0, 1)
        frames.append(misc.image_grid(image_pred))
        mvp, campos = rotate_pose(mvp, campos)

    if text is not None:
        frames = [torch.Tensor(misc.add_text_to_image(f, text)).permute(2, 0, 1) for f in frames]
    return torch.stack(frames, dim=0)  # Shape: (T, C, H, W)
def render_bones(self, mvp, bones_pred, size=(256, 256)):
    """Draw projected bone segments with matplotlib, one image per (batch, frame).

    Args:
        mvp: model-view-projection matrices; reshaped here to (b, f, 1, 4, 4).
        bones_pred: bone end points in world space. The reshapes below require
            a (b, f, num_bones, 2, 3) layout.
        size: (height, width) of the output in pixels. The figure is sized in
            whole inches at 32 dpi, so each side is rounded down to a multiple
            of 32.

    Returns:
        A float64 tensor of shape (b * f, 3, H, W) with values in [0, 1],
        ordered batch-major then frame.
    """
    # Homogeneous coordinates, then project every end point with the MVP.
    bone_world4 = torch.cat([bones_pred, torch.ones_like(bones_pred[..., :1])], dim=-1)
    b, f, num_bones = bone_world4.shape[:3]

    dpi = 32
    fx, fy = size[1] // dpi, size[0] // dpi
    if b * f == 0:
        # Nothing to draw; np.stack would raise on an empty list.
        return torch.empty((0, 3, fy * dpi, fx * dpi), dtype=torch.float64)

    bones_clip4 = (bone_world4.view(b, f, num_bones * 2, 1, 4) @ mvp.transpose(-1, -2).reshape(b, f, 1, 4, 4)).view(b, f, num_bones, 2, 4)
    bones_uv = bones_clip4[..., :2] / bones_clip4[..., 3:4]  # b, f, num_bones, 2, 2
    # detach() so this also works on tensors that are part of an autograd graph.
    bones_uv = bones_uv.detach().cpu().numpy().reshape(b * f, num_bones, 2, 2)

    rendered = []
    for frame_bones_uv in bones_uv:
        fig = plt.figure(figsize=(fx, fy), dpi=dpi, frameon=False)
        try:
            ax = plt.Axes(fig, [0., 0., 1., 1.])
            ax.set_axis_off()
            for bone in frame_bones_uv:
                ax.plot(bone[:, 0], bone[:, 1], marker='o', linewidth=8, markersize=20)
            ax.set_xlim(-1, 1)
            ax.set_ylim(-1, 1)
            ax.invert_yaxis()
            fig.add_axes(ax)

            # Convert to image. draw() renders synchronously, whereas
            # draw_idle() may defer the work on interactive backends.
            canvas = fig.canvas
            canvas.draw()
            if hasattr(canvas, 'buffer_rgba'):
                image = np.asarray(canvas.buffer_rgba())[..., :3]
            else:
                # Legacy path for canvases without buffer_rgba.
                w, h = canvas.get_width_height()
                image = np.frombuffer(canvas.tostring_rgb(), dtype=np.uint8).reshape(h, w, 3)
            # The division copies the pixels out of the canvas buffer before
            # the figure is released.
            rendered.append(image / 255.)
        finally:
            # pyplot keeps every figure alive until closed; without this each
            # call leaks b * f figures.
            plt.close(fig)
    return torch.from_numpy(np.stack(rendered, 0).transpose(0, 3, 1, 2))
def render_deformation_frames(self, mesh, texture, batch_size, num_frames, resolution, background='none', im_features=None, render_mode='diffuse', b=None):
    """Placeholder for deformation-sequence rendering; not implemented.

    Every argument is accepted and ignored, and the method returns None.
    """
    # TODO: the abandoned draft defaulted `b` to `batch_size`, kept the first
    # `num_frames * b` meshes and then walked over each sample's run of
    # `num_frames` consecutive meshes. It was never completed.
    return None
def vis_sds_image(self, sds_image, sds_aux):
    """Build a grid of the SDS images, each annotated with its entry of `sds_aux['dirs']`.

    The images are detached, moved to CPU and clamped to [0, 1] before the
    text is drawn.
    """
    images = sds_image.detach().cpu().clamp(0, 1)
    labelled = []
    for img, direction in zip(images, sds_aux['dirs']):
        labelled.append(misc.add_text_to_image(img, direction))
    return misc.image_grid(labelled)
def vis_sds_grads(self, sds_aux):
    """Visualise the norm of the SDS gradients as a captioned image grid.

    Reads `grad`, `t` and `w` from `sds_aux['sd_aux']`. The gradient norm is
    taken over dim 1 and upsampled 4x with nearest-neighbour interpolation.
    Each caption shows the sample's `t`, `w` and its maximum norm value.
    """
    sd_aux = sds_aux['sd_aux']
    grad_norm = sd_aux['grad'].detach().cpu().norm(dim=1, keepdim=True)
    grad_norm = F.interpolate(grad_norm, scale_factor=4, mode='nearest')

    # Largest norm per sample, taken over all remaining dimensions.
    peak = grad_norm.view(grad_norm.shape[0], -1).max(dim=1)[0]

    captions = []
    for t_, w_, n_ in zip(sd_aux['t'], sd_aux['w'], peak):
        captions.append(f"t: {t_} w: {w_:.2f} n: {n_:.2e}")
    return misc.image_grid_multi_channel(grad_norm, texts=captions, font_scale=0.5)