import kornia
import numpy as np
import torch
import torch.nn.functional as F
from einops import rearrange
from torch import nn

import cwm.eval.Flow.masking_flow as masking


def boltzmann(x, beta=1, eps=1e-9):
    """Sharpen a spatial map by exponentiating with inverse temperature beta,
    normalizing so the per-map maximum is 1. beta=None is a no-op."""
    if beta is None:
        return x
    x = torch.exp(x * beta)
    return x / x.amax((-1, -2), keepdim=True).clamp(min=eps)
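

# A small illustrative sketch (an addition, not original code): larger beta
# sharpens the map toward its argmax, and the output maximum is normalized to 1.
def _example_boltzmann():
    x = torch.tensor([[0.1, 0.2], [0.3, 0.4]]).view(1, 1, 2, 2)
    out = boltzmann(x, beta=10.0)
    assert torch.isclose(out.amax((-1, -2)), torch.tensor(1.0)).all()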


IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)


def imagenet_normalize(x, temporal_dim=1):
    """Normalize with ImageNet statistics.

    Expects [B, T, C, H, W] for temporal_dim=1 (default) or [B, C, T, H, W] for temporal_dim=2.
    """
    mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(x)[None, None, :, None, None]
    std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(x)[None, None, :, None, None]
    if temporal_dim == 2:
        mean = mean.transpose(1, 2)
        std = std.transpose(1, 2)
    return (x - mean) / std


def imagenet_unnormalize(x, temporal_dim=2):
    """Invert imagenet_normalize; expects [B, C, T, H, W] for the default temporal_dim=2."""
    mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(x)[None, None, :, None, None]
    std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(x)[None, None, :, None, None]
    if temporal_dim == 2:
        mean = mean.transpose(1, 2)
        std = std.transpose(1, 2)
    return x * std + mean
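

# A minimal round-trip sketch (an addition for illustration, not original code):
# unnormalizing with the matching temporal_dim recovers the input.
def _example_imagenet_roundtrip():
    video = torch.rand(2, 3, 3, 32, 32)  # [B, T, C, H, W]
    normed = imagenet_normalize(video, temporal_dim=1)
    recovered = imagenet_unnormalize(normed, temporal_dim=1)
    assert torch.allclose(video, recovered, atol=1e-6)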


def coordinate_ims(batch_size, seq_length, imsize, normalize=True, dtype_out=torch.float32):
    """Return a [B, T, H, W, 2] grid of (h, w) coordinates, in [-1, 1] if normalize=True,
    else integer pixel indices. If seq_length == 0, the temporal dimension is squeezed
    and a [B, H, W, 2] grid is returned."""
    static = False
    if seq_length == 0:
        static = True
        seq_length = 1
    B = batch_size
    T = seq_length
    H, W = imsize
    ones = torch.ones([B, H, W, 1], dtype=dtype_out)
    if normalize:
        h = torch.divide(torch.arange(H).to(ones), torch.tensor(H - 1, dtype=dtype_out))
        h = 2.0 * ((h.view(1, H, 1, 1) * ones) - 0.5)
        w = torch.divide(torch.arange(W).to(ones), torch.tensor(W - 1, dtype=dtype_out))
        w = 2.0 * ((w.view(1, 1, W, 1) * ones) - 0.5)
    else:
        h = torch.arange(H).to(ones).view(1, H, 1, 1) * ones
        w = torch.arange(W).to(ones).view(1, 1, W, 1) * ones
    h = torch.stack([h] * T, 1)
    w = torch.stack([w] * T, 1)
    hw_ims = torch.cat([h, w], -1)
    if static:
        hw_ims = hw_ims[:, 0]
    return hw_ims
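

# A minimal sketch (not part of the original module) showing the output shapes:
# normalized grids span [-1, 1]; unnormalized grids hold integer pixel indices.
def _example_coordinate_ims():
    grid = coordinate_ims(2, 4, (8, 8))  # [2, 4, 8, 8, 2], values in [-1, 1]
    assert grid.shape == (2, 4, 8, 8, 2)
    static = coordinate_ims(2, 0, (8, 8), normalize=False)  # [2, 8, 8, 2], pixel indices
    assert static.shape == (2, 8, 8, 2)
    assert static[..., 0].max() == 7 and static[..., 1].max() == 7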


def get_distribution_centroid(dist, eps=1e-9, normalize=False):
    """Compute the spatial centroid of a [B, T, 1, H, W] probability map in (h, w) coordinates."""
    B, T, C, H, W = dist.shape
    assert C == 1
    dist_sum = dist.sum((-2, -1), keepdim=True).clamp(min=eps)
    dist = dist / dist_sum

    grid = coordinate_ims(B, T, [H, W], normalize=normalize).to(dist.device)
    grid = grid.permute(0, 1, 4, 2, 3)
    centroid = (grid * dist).sum((-2, -1))
    return centroid
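

# A small sanity-check sketch (an addition, not original code): the centroid of
# a delta distribution recovers the coordinates of its nonzero pixel.
def _example_get_distribution_centroid():
    dist = torch.zeros(1, 1, 1, 8, 8)
    dist[0, 0, 0, 3, 5] = 1.0
    centroid = get_distribution_centroid(dist, normalize=False)  # [B, T, 2]
    assert torch.allclose(centroid, torch.tensor([[[3.0, 5.0]]]))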


class FlowToRgb(object):
    """Render a 2-channel flow field as an RGB image via the standard HSV flow wheel."""

    def __init__(self, max_speed=1.0, from_image_coordinates=True, from_sampling_grid=False):
        self.max_speed = max_speed
        self.from_image_coordinates = from_image_coordinates
        self.from_sampling_grid = from_sampling_grid

    def __call__(self, flow):
        assert flow.size(-3) == 2, flow.shape
        if self.from_sampling_grid:
            flow_x, flow_y = torch.split(flow, [1, 1], dim=-3)
            flow_y = -flow_y
        elif not self.from_image_coordinates:
            flow_x, flow_y = torch.split(flow, [1, 1], dim=-3)
        else:
            flow_h, flow_w = torch.split(flow, [1, 1], dim=-3)
            flow_x, flow_y = [flow_w, -flow_h]

        angle = torch.atan2(flow_y, flow_x)
        speed = torch.sqrt(flow_x ** 2 + flow_y ** 2) / self.max_speed

        # torch.remainder (unlike torch.fmod) maps negative angles into [0, 2*pi),
        # the hue range kornia's hsv_to_rgb expects.
        hue = torch.remainder(angle, 2 * np.pi)
        sat = torch.ones_like(hue)
        val = speed

        hsv = torch.cat([hue, sat, val], -3)
        rgb = kornia.color.hsv_to_rgb(hsv)
        return rgb
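

# A brief usage sketch (added for illustration; the constants are arbitrary):
# uniform rightward motion maps to a single hue, with brightness proportional
# to speed relative to max_speed.
def _example_flow_to_rgb():
    flow = torch.zeros(1, 2, 16, 16)
    flow[:, 1] = 2.0  # uniform rightward flow in (h, w) image coordinates
    rgb = FlowToRgb(max_speed=4.0)(flow)
    assert rgb.shape == (1, 3, 16, 16)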


class Patchify(nn.Module):
    """Convert a set of images or a movie into patch vectors."""

    def __init__(self,
                 patch_size=(16, 16),
                 temporal_dim=1,
                 squeeze_channel_dim=True):
        super().__init__()
        self.set_patch_size(patch_size)
        self.temporal_dim = temporal_dim
        assert self.temporal_dim in [1, 2], self.temporal_dim
        self._squeeze_channel_dim = squeeze_channel_dim

    @property
    def num_patches(self):
        if (self.T is None) or (self.H is None) or (self.W is None):
            return None
        return (self.T // self.pt) * (self.H // self.ph) * (self.W // self.pw)

    def set_patch_size(self, patch_size):
        self.patch_size = patch_size
        if len(self.patch_size) == 2:
            self.ph, self.pw = self.patch_size
            self.pt = 1
            self._patches_are_3d = False
        elif len(self.patch_size) == 3:
            self.pt, self.ph, self.pw = self.patch_size
            self._patches_are_3d = True
        else:
            # wrap in a 1-tuple so %-formatting doesn't unpack the patch_size sequence
            raise ValueError("patch_size must be a 2- or 3-tuple, but is %s" % (self.patch_size,))

        self.shape_inp = self.rank_inp = self.H = self.W = self.T = None
        self.D = self.C = self.E = self.embed_dim = None

    def _check_shape(self, x):
        self.shape_inp = x.shape
        self.rank_inp = len(self.shape_inp)
        self.H, self.W = self.shape_inp[-2:]
        assert (self.H % self.ph) == 0 and (self.W % self.pw) == 0, (self.shape_inp, self.patch_size)
        if (self.rank_inp == 5) and self._patches_are_3d:
            self.T = self.shape_inp[self.temporal_dim]
            assert (self.T % self.pt) == 0, (self.T, self.pt)
        elif self.rank_inp == 5:
            self.T = self.shape_inp[self.temporal_dim]
        else:
            self.T = 1

    def split_by_time(self, x):
        shape = x.shape
        assert shape[1] % self.T == 0, (shape, self.T)
        return x.view(shape[0], self.T, shape[1] // self.T, *shape[2:])

    def merge_by_time(self, x):
        shape = x.shape
        return x.view(shape[0], shape[1] * shape[2], *shape[3:])

    def video_to_patches(self, x):
        if self.rank_inp == 4:
            assert self.pt == 1, (self.pt, x.shape)
            x = rearrange(x, 'b c (h ph) (w pw) -> b (h w) (ph pw) c', ph=self.ph, pw=self.pw)
        else:
            assert self.rank_inp == 5, (x.shape, self.rank_inp, self.shape_inp)
            dim_order = 'b (t pt) c (h ph) (w pw)' if self.temporal_dim == 1 else 'b c (t pt) (h ph) (w pw)'
            x = rearrange(x, dim_order + ' -> b (t h w) (pt ph pw) c', pt=self.pt, ph=self.ph, pw=self.pw)

        self.N, self.D, self.C = x.shape[-3:]
        self.embed_dim = self.E = self.D * self.C
        return x

    def patches_to_video(self, x):
        shape = x.shape
        rank = len(shape)
        if rank == 4:
            B, _N, _D, _C = shape
        else:
            assert rank == 3, rank
            B, _N, _E = shape
            assert (_E % self.D == 0), (_E, self.D)
            x = x.view(B, _N, self.D, -1)

        if _N < self.num_patches:
            masked_patches = self.get_masked_patches(
                x,
                num_patches=(self.num_patches - _N),
                mask_mode=self.mask_mode)
            x = torch.cat([x, masked_patches], 1)

        x = rearrange(
            x,
            'b (t h w) (pt ph pw) c -> b c (t pt) (h ph) (w pw)',
            pt=self.pt, ph=self.ph, pw=self.pw,
            t=(self.T // self.pt), h=(self.H // self.ph), w=(self.W // self.pw))

        if self.rank_inp == 5 and (self.temporal_dim == 1):
            x = x.transpose(1, 2)
        elif self.rank_inp == 4:
            assert x.shape[2] == 1, x.shape
            x = x[:, :, 0]
        return x

    @staticmethod
    def get_masked_patches(x, num_patches, mask_mode='zeros'):
        shape = x.shape
        patches_shape = (shape[0], num_patches, *shape[2:])
        if mask_mode == 'zeros':
            return torch.zeros(patches_shape).to(x.device).to(x.dtype).detach()
        elif mask_mode == 'gray':
            return 0.5 * torch.ones(patches_shape).to(x.device).to(x.dtype).detach()
        else:
            raise NotImplementedError("Haven't implemented mask_mode == %s" % mask_mode)

    def average_within_patches(self, z):
        if len(z.shape) == 3:
            z = rearrange(z, 'b n (d c) -> b n d c', c=self.C)
        return z.mean(-2, True).repeat(1, 1, z.shape[-2], 1)

    def forward(self, x, to_video=False, mask_mode='zeros'):
        if not to_video:
            self._check_shape(x)
            x = self.video_to_patches(x)
            return x if not self._squeeze_channel_dim else x.view(x.size(0), self.N, -1)
        else:
            assert (self.shape_inp is not None) and (self.num_patches is not None)
            self.mask_mode = mask_mode
            return self.patches_to_video(x)
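

# A round-trip sketch (added for illustration): patchify a 2-frame clip and
# reconstruct it; with no patches dropped, the reconstruction is exact.
def _example_patchify_roundtrip():
    patchify = Patchify(patch_size=(1, 8, 8), temporal_dim=1, squeeze_channel_dim=True)
    video = torch.rand(2, 2, 3, 32, 32)  # [B, T, C, H, W]
    patches = patchify(video)            # [B, N, pt*ph*pw*C]
    assert patches.shape == (2, 2 * 4 * 4, 8 * 8 * 3)
    recon = patchify(patches, to_video=True)
    assert torch.allclose(video, recon)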


class DerivativeFlowGenerator(nn.Module):
    """Estimate the flow of a two-frame predictor using torch autograd."""

    def __init__(self,
                 predictor,
                 perturbation_patch_size=None,
                 aggregation_patch_size=None,
                 agg_power=None,
                 agg_channel_func=None,
                 num_samples=1,
                 leave_one_out_sampling=False,
                 average_jacobian=True,
                 confidence_thresh=None,
                 temporal_dim=2,
                 imagenet_normalize_inputs=True):
        super().__init__()

        self.predictor = predictor
        self.patchify = Patchify(self.patch_size, temporal_dim=1, squeeze_channel_dim=True)
        self.set_temporal_dim(temporal_dim)
        self.imagenet_normalize_inputs = imagenet_normalize_inputs

        self.perturbation_patch_size = self._get_patch_size(perturbation_patch_size) or self.patch_size
        self.aggregation_patch_size = self._get_patch_size(aggregation_patch_size) or self.patch_size
        self.agg_patchify = Patchify(self.aggregation_patch_size,
                                     temporal_dim=1,
                                     squeeze_channel_dim=False)
        self.agg_channel_func = agg_channel_func or (lambda x: F.relu(x).sum(-3, True))
        self.average_jacobian = average_jacobian
        self.confidence_thresh = confidence_thresh

        self.num_samples = num_samples
        self.leave_one_out_sampling = leave_one_out_sampling
        self.agg_power = agg_power
        self.t_dim = temporal_dim

    def _get_patch_size(self, p):
        if p is None:
            return None
        elif isinstance(p, int):
            return (1, p, p)
        elif len(p) == 2:
            return (1, p[0], p[1])
        else:
            assert len(p) == 3, p
            return (p[0], p[1], p[2])

    def set_temporal_dim(self, t_dim):
        if t_dim == 1:
            self.predictor.t_dim = 1
            self.predictor.c_dim = 2
        elif t_dim == 2:
            self.predictor.c_dim = 1
            self.predictor.t_dim = 2
        else:
            raise ValueError("temporal_dim must be 1 or 2")

    @property
    def c_dim(self):
        if self.predictor is None:
            return None
        return self.predictor.c_dim

    @property
    def patch_size(self):
        if self.predictor is None:
            return None
        elif hasattr(self.predictor, 'patch_size'):
            return self.predictor.patch_size
        elif hasattr(self.predictor.encoder.patch_embed, 'proj'):
            return self.predictor.encoder.patch_embed.proj.kernel_size
        else:
            return None

    @property
    def S(self):
        return self.num_samples

    @property
    def sequence_length(self):
        if self.predictor is None:
            return None
        elif hasattr(self.predictor, 'sequence_length'):
            return self.predictor.sequence_length
        elif hasattr(self.predictor, 'num_frames'):
            return self.predictor.num_frames
        else:
            return 2

    @property
    def mask_shape(self):
        if self.predictor is None:
            return None
        elif hasattr(self.predictor, 'mask_shape'):
            return self.predictor.mask_shape

        assert self.patch_size is not None
        pt, ph, pw = self.patch_size
        return (self.sequence_length // pt,
                self.inp_shape[-2] // ph,
                self.inp_shape[-1] // pw)

    @property
    def perturbation_mask_shape(self):
        return (
            self.mask_shape[0],
            self.inp_shape[-2] // self.perturbation_patch_size[-2],
            self.inp_shape[-1] // self.perturbation_patch_size[-1]
        )

    @property
    def p_mask_shape(self):
        return self.perturbation_mask_shape

    @property
    def aggregation_mask_shape(self):
        return (
            1,
            self.inp_shape[-2] // self.aggregation_patch_size[-2],
            self.inp_shape[-1] // self.aggregation_patch_size[-1]
        )

    @property
    def a_mask_shape(self):
        return self.aggregation_mask_shape

    def get_perturbation_input(self, x):
        self.set_input(x)
        y = torch.zeros((self.B, *self.p_mask_shape), dtype=x.dtype, device=x.device, requires_grad=True)
        y = y.unsqueeze(2).repeat(1, 1, x.shape[2], 1, 1)
        return y

    def pred_patches_to_video(self, y, x, mask):
        """Place the input at visible positions and predictions at masked positions."""
        B, C = y.shape[0], y.shape[-1]
        self.patchify._check_shape(x)
        self.patchify.D = np.prod(self.patch_size)
        x = self.patchify(x)
        y_out = torch.zeros_like(x)
        x_vis = x[~mask]

        y_out[~mask] = x_vis.view(-1, C)
        # reshape handles the non-contiguous case that view cannot
        y_out[mask] = y.reshape(-1, C)

        return self.patchify(y_out, to_video=True)

    def set_image_size(self, *args, **kwargs):
        assert self.predictor is not None, "Can't set the image size without a predictor"
        if hasattr(self.predictor, 'set_image_size'):
            self.predictor.set_image_size(*args, **kwargs)
        else:
            self.predictor.image_size = args[0]

    def predict(self, x=None, mask=None, forward_full=False):
        if x is None:
            x = self.x
        if mask is None:
            mask = self.generate_mask(x)

        self.set_image_size(x.shape[-2:])
        y = self.predictor(
            self._preprocess(x),
            mask if (x.size(0) == 1) else self.mask_rectangularizer(mask),
            forward_full=forward_full)

        y = self.pred_patches_to_video(y, x, mask=mask)

        # keep only the last predicted frame
        return y[:, -1:]

    def _get_perturbation_func(self, x=None, mask=None):
        if x is not None:
            self.set_input(x, mask)

        def forward_mini_image(y):
            # upsample the patch-level perturbation to pixel resolution, add it
            # to the input, and aggregate the prediction within aggregation patches
            y = y.repeat_interleave(self.perturbation_patch_size[-2], -2)
            y = y.repeat_interleave(self.perturbation_patch_size[-1], -1)
            x_pred = self.predict(self.x + y, self.mask)
            x_pred = self.agg_patchify(x_pred).mean(-2).sum(-1).view(self.B, *self.a_mask_shape)
            return x_pred[self.targets]

        return forward_mini_image

    def _postprocess_jacobian(self, jac):
        _jac = torch.zeros((self.B, *self.a_mask_shape, *jac.shape[1:])).to(jac.device).to(jac.dtype)
        _jac[self.targets] = jac
        jac = self.agg_channel_func(_jac)
        assert jac.size(-3) == 1, jac.shape
        jac = jac.squeeze(-3)[..., 0, :, :]
        jac = jac.view(self.B, self.a_mask_shape[-2], self.a_mask_shape[-1],
                       self.B, self.p_mask_shape[-2], self.p_mask_shape[-1])
        # keep only the diagonal batch-to-batch blocks of the jacobian
        bs = torch.arange(0, self.B).long().to(jac.device)
        jac = jac[bs, :, :, bs, :, :]
        return jac

    def _confident_jacobian(self, jac):
        if self.confidence_thresh is None:
            return torch.ones_like(jac[:, None, ..., 0, 0])
        conf = (jac.amax((-2, -1)) > self.confidence_thresh).float()[:, None]
        return conf

    def set_input(self, x, mask=None, timestamps=None):
        shape = x.shape
        if len(shape) == 4:
            x = x.unsqueeze(1)
        else:
            assert len(shape) == 5, \
                "Input must be a movie of shape [B,T,C,H,W] " + \
                "or a single frame of shape [B,C,H,W]"

        self.inp_shape = x.shape
        self.x = x
        self.B = self.inp_shape[0]
        self.T = self.inp_shape[1]
        self.C = self.inp_shape[2]
        if mask is not None:
            self.mask = mask

        if timestamps is not None:
            self.timestamps = timestamps

    def _preprocess(self, x):
        if self.imagenet_normalize_inputs:
            x = imagenet_normalize(x)
        if self.t_dim != 1:
            x = x.transpose(self.t_dim, self.c_dim)
        return x

    def _jacobian_to_flows(self, jac):
        # either take the argmax of the jacobian or sharpen it with a power
        if self.agg_power is None:
            jac = (jac == jac.amax((-2, -1), True)).float()
        else:
            jac = torch.pow(jac, self.agg_power)

        # flow = centroid of the (sharpened) jacobian minus the pixel coordinates,
        # scaled from patch units back to pixels
        jac = jac.view(self.B * np.prod(self.a_mask_shape[-2:]), 1, 1, *self.p_mask_shape[-2:])
        centroids = get_distribution_centroid(jac, normalize=False).view(
            self.B, self.a_mask_shape[-2], self.a_mask_shape[-1], 2)
        rescale = [self.a_mask_shape[-2] / self.p_mask_shape[-2],
                   self.a_mask_shape[-1] / self.p_mask_shape[-1]]
        centroids = centroids * torch.tensor(rescale, device=centroids.device).view(1, 1, 1, 2)

        flows = centroids - \
            coordinate_ims(1, 0, self.a_mask_shape[-2:], normalize=False).to(jac.device)
        flows = flows.permute(0, 3, 1, 2)
        px_scale = torch.tensor(self.aggregation_patch_size[-2:]).float().to(flows.device).view(1, 2, 1, 1)
        flows *= px_scale

        return flows

    def set_targets(self, targets=None, frame=-1):
        frame = frame % self.mask_shape[0]
        if targets is None:
            targets = self.get_mask_image(self.mask)[:, frame:frame + 1]
        else:
            assert len(targets.shape) == 4, targets.shape
            targets = targets[:, frame:frame + 1]
        self.targets = ~masking.upsample_masks(~targets, self.a_mask_shape[-2:])

    def _get_mask_partition(self, mask):
        mask = self.get_mask_image(mask)
        mask_list = masking.partition_masks(
            mask[:, 1:], num_samples=self.S, leave_one_out=self.leave_one_out_sampling)
        return [torch.cat([mask[:, 0:1].view(m.size(0), -1), m], -1)
                for m in mask_list]

    def _compute_jacobian(self, y):
        perturbation_func = self._get_perturbation_func()
        jac = torch.autograd.functional.jacobian(
            perturbation_func,
            y,
            vectorize=False)
        jac = self._postprocess_jacobian(jac)
        return jac

    def _upsample_mask(self, mask):
        return masking.upsample_masks(
            mask.view(mask.size(0), -1, *self.mask_shape[-2:]).float(), self.inp_shape[-2:])

    def get_mask_image(self, mask, upsample=False, invert=False, shape=None):
        if shape is None:
            shape = self.mask_shape
        mask = mask.view(-1, *shape)
        if upsample:
            mask = self._upsample_mask(mask)
        if invert:
            mask = 1 - mask
        return mask

    def forward(self, x, mask, targets=None):
        self.set_input(x, mask)
        y = self.get_perturbation_input(x)
        mask_list = self._get_mask_partition(mask)

        jacobian, flows, confident = [], [], []
        for mask_sample in mask_list:
            self.set_input(x, mask_sample)
            self.set_targets(targets)

            jac = self._compute_jacobian(y)
            conf_jac = masking.upsample_masks(self._confident_jacobian(jac), self.a_mask_shape[-2:])
            jacobian.append(jac)
            confident.append(conf_jac)
            if not self.average_jacobian:
                flow = self._jacobian_to_flows(jac) * self.targets * conf_jac * \
                    masking.upsample_masks(self.get_mask_image(self.mask)[:, 1:], self.a_mask_shape[-2:])
                flows.append(flow)

        jacobian = torch.stack(jacobian, -1)
        confident = torch.stack(confident, -1)
        valid = torch.stack([masking.upsample_masks(
            self.get_mask_image(m)[:, 1:], self.a_mask_shape[-2:]) for m in mask_list], -1)
        valid = valid * confident

        if self.average_jacobian:
            # average the jacobian over mask samples, then convert it to flow
            _valid = valid[:, 0].unsqueeze(-2).unsqueeze(-2)
            jac = (jacobian * _valid.float()).sum(-1) / _valid.float().sum(-1).clamp(min=1)
            flows = self._jacobian_to_flows(jac) * \
                masking.upsample_masks(_valid[:, None, ..., 0, 0, :].amax(-1).bool(), self.a_mask_shape[-2:])
            if targets is not None:
                self.set_targets(targets)
                flows *= self.targets
        else:
            # average the per-sample flows instead
            flows = torch.stack(flows, -1)
            flows = flows.sum(-1) / valid.float().sum(-1).clamp(min=1)

        valid = valid * (targets[:, -1:].unsqueeze(-1) if targets is not None else 1)

        return (jacobian, flows, valid)
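

# An end-to-end sketch (an illustrative addition, not original code). The toy
# predictor below is a hypothetical stand-in satisfying the interface this
# module assumes (a `patch_size` attribute, `num_frames`, and a call signature
# of (frames, mask, forward_full)); it predicts gray patches, so the recovered
# "flow" is meaningless and the point is only to show the plumbing. It also
# assumes masking.partition_masks / upsample_masks behave as used above.
class _ToyPredictor(nn.Module):
    patch_size = (1, 8, 8)
    num_frames = 2

    def forward(self, x, mask, forward_full=False):
        # one gray patch vector (pt*ph*pw*C = 192 dims) per masked position
        num_masked = int(mask.sum())
        return 0.5 * torch.ones(num_masked, 8 * 8 * 3, device=x.device, dtype=x.dtype)


def _example_derivative_flow_generator():
    generator = DerivativeFlowGenerator(_ToyPredictor(), num_samples=1)
    video = torch.rand(1, 2, 3, 32, 32)  # [B, T, C, H, W]
    # boolean mask over the 2*4*4 = 32 patch positions: frame 0 visible, frame 1 masked
    mask = torch.cat([torch.zeros(1, 16, dtype=torch.bool),
                      torch.ones(1, 16, dtype=torch.bool)], -1)
    jacobian, flows, valid = generator(video, mask)
    return jacobian, flows, valid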