#!/usr/bin/env python3
"""
Script to decode tokenized video into images/video.
Example usage: See https://github.com/1x-technologies/1xgpt?tab=readme-ov-file#1x-genie-baseline
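Example command (flags are defined in parse_args below; the token directory is illustrative):
    python visualize.py --token_dir data/genie_generated --fps 2 --max_example 4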
"""
import argparse
import math
import os
from PIL import Image, ImageDraw
import numpy as np
import torch
import torch.distributed.optim
import torch.utils.checkpoint
import torch.utils.data
import torchvision.transforms.v2.functional as transforms_f
from diffusers import AutoencoderKLTemporalDecoder
from einops import rearrange
from matplotlib import pyplot as plt
from cont_data import RawFeatureDataset
from data import RawTokenDataset
from datasets.utils import get_image_encoder
from magvit2.config import VQConfig
from magvit2.models.lfqgan import VQModel
from common.eval_utils import decode_tokens, decode_features
import wandb
# Provide the WandB API key via the WANDB_API_KEY environment variable rather than hard-coding it here.
wandb.login()
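# Scaling factor of the Stable Diffusion / SVD VAE latent space; continuous features are divided by it before decoding.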
SVD_SCALE = 0.18215
def parse_args():
parser = argparse.ArgumentParser(description="Visualize tokenized video as GIF or comic.")
parser.add_argument(
"--stride",
type=int,
default=1,
help="Frame skip",
)
parser.add_argument(
"--token_dir",
type=str,
default="data/genie_generated",
help="Directory of tokens, in the format of `video.bin` and `metadata.json`. "
"Visualized gif and comic will be written here.",
)
parser.add_argument(
"--offset", type=int, default=0, help="Offset to start generating images from"
)
parser.add_argument(
"--fps", type=int, default=2, help="Frames per second"
)
parser.add_argument(
"--max_images", type=int, default=None, help="Maximum number of images to generate. None for all."
)
parser.add_argument(
"--example_ind", type=int, default=0,
help="The index in the dataset of the example to generate on."
)
parser.add_argument(
"--project_prefix", type=str, default="", help="Project suffix."
)
parser.add_argument(
"--disable_comic", action="store_true",
help="Comic generation assumes `token_dir` follows the same format as generate: e.g., "
"`prompt | predictions | gtruth` in `video.bin`, `window_size` in `metadata.json`."
"Therefore, comic should be disabled when visualizing videos without this format, such as the dataset."
)
parser.add_argument(
"--batch_size", type=int, default=4,
help="Batch size, current script only supports a single GPU."
)
parser.add_argument(
"--max_example", type=int, default=4,
help="Maximum number of examples."
)
parser.add_argument(
"--use_feature", action="store_true",
help="visualize the features rather than tokens"
)
args = parser.parse_args()
return args
def export_to_gif(frames: list, output_gif_path: str, fps: int):
"""
Export a list of frames to a GIF.
Args:
- frames (list): List of frames (as numpy arrays or PIL Image objects).
- output_gif_path (str): Path to save the output GIF.
- fps (int): Desired frames per second.
"""
# Convert numpy arrays to PIL Images if needed
pil_frames = [Image.fromarray(frame) if isinstance(
frame, np.ndarray) else frame for frame in frames]
duration_ms = 1000 / fps
pil_frames[0].save(output_gif_path.replace(".mp4", ".gif"),
format="GIF",
append_images=pil_frames[1:],
save_all=True,
duration=duration_ms,
loop=0)
# return the gif
return output_gif_path.replace(".mp4", ".gif")
def unnormalize_imgs(normalized_imgs):
"""
[-1, 1] -> [0, 255]
Important: clip to [0, 255]
"""
normalized_imgs = torch.clamp(normalized_imgs, -1, 1)
rescaled_output = ((normalized_imgs.detach().cpu() + 1) * 127.5)
clipped_output = torch.clamp(rescaled_output, 0, 255).to(dtype=torch.uint8)
return clipped_output
def decode_latents_wrapper(
batch_size: int = 16,
encoder_type: str = "magvit",
encoder_name_or_path: str = "data/magvit2.ckpt",
max_images: int = None,
device: str = "cuda",
):
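    """Load the image decoder once and return a closure that decodes batches of latents into PIL images."""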
    dtype = torch.bfloat16
model = get_image_encoder(encoder_type, encoder_name_or_path)
model = model.to(device=device, dtype=dtype)
@torch.no_grad()
def decode_latents(video_data: np.array):
"""
video_data: (b, h, w) for quantized data, or (b, c, h, w) for continuous data,
where b is `batch_size` and different from training/eval batch size.
"""
decoded_imgs = []
for shard_ind in range(math.ceil(len(video_data) / batch_size)):
shard_data = video_data[shard_ind * batch_size: (shard_ind + 1) * batch_size]
if isinstance(model, VQModel): # TODO: class agnostic wrapper
# expecting quantized
assert shard_data.ndim == 3, f"{shard_data.shape=} {shard_data.dtype=}"
torch_shard = torch.from_numpy(shard_data.astype(np.int64))
# if model.use_ema: # EMA does nothing in bugged VQModel
# with model.ema_scope():
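                # Map discrete token indices back to codebook embeddings, reshape to (b, h, w, c), then decode to pixels.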
quant = model.quantize.get_codebook_entry(rearrange(torch_shard, "b h w -> b (h w)"),
bhwc=torch_shard.shape + (model.quantize.codebook_dim,)).flip(1)
normalized_imgs = model.decode(quant.to(device=device, dtype=dtype))
elif isinstance(model, AutoencoderKLTemporalDecoder):
# expecting continuous
assert shard_data.ndim == 4, f"{shard_data.shape=} {shard_data.dtype=}"
torch_shard = torch.from_numpy(shard_data)
                # Manually clamp extreme latent values before decoding to guard against outliers in generated features.
                torch_shard = torch.clamp(torch_shard, -25, 25)
                # `.sample` holds the decoded frames (the mean of the decoder output).
                normalized_imgs = model.decode(torch_shard.to(device=device, dtype=dtype), num_frames=1).sample
else:
raise NotImplementedError(f"{model=}")
decoded_imgs.append(unnormalize_imgs(normalized_imgs))
if max_images and len(decoded_imgs) * batch_size >= max_images:
break
return [transforms_f.to_pil_image(img) for img in torch.cat(decoded_imgs)]
return decode_latents
def caption_image(pil_image: Image, caption: str):
"""
Add a bit of empty space at the top, and add the caption there
"""
border_size = 36
font_size = 24
# convert pil_image to PIL.Image.Image if it's not already
if not isinstance(pil_image, Image.Image):
pil_image = transforms_f.to_pil_image(pil_image)
width, height = pil_image.size
new_width = width
new_height = height + border_size
new_image = Image.new("RGB", (new_width, new_height), "white")
new_image.paste(pil_image, (0, border_size))
# Draw the caption
draw = ImageDraw.Draw(new_image)
# Center text (`align` keyword doesn't work)
_, _, text_w, text_h = draw.textbbox((0, 0), caption, font_size=font_size)
draw.text(((width - text_w) / 2, (border_size - text_h) / 2), caption, fill="black", font_size=font_size)
return new_image
@torch.no_grad()
def main():
args = parse_args()
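    # Derive model and dataset names from the token_dir path; the fixed offsets below assume a specific
    # parent-directory naming convention around the substring "nodes".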
name = args.token_dir.split('/')[-2]
name_split = name.find('nodes')
model = name[:name_split-7]
dataset = name[name_split+8:]
    # Load tokens or continuous features
    if args.use_feature:
        token_dataset = RawFeatureDataset(args.token_dir, 1, compute_stride_from_freq_table=False,
                                          filter_interrupts=False, filter_overlaps=False)
    else:
        token_dataset = RawTokenDataset(args.token_dir, 1, compute_stride_from_freq_table=False,
                                        filter_interrupts=False, filter_overlaps=False)
    video_tokens = token_dataset.data
    print(f"Loaded {video_tokens.shape=}")
metadata = token_dataset.metadata
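    # Each example holds num_prompt_frames prompt frames, (window_size - num_prompt_frames) predicted frames,
    # and (window_size - num_prompt_frames) ground-truth frames: window_size * 2 - num_prompt_frames in total.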
video_tokens = video_tokens.reshape(-1, metadata["window_size"] * 2 - metadata["num_prompt_frames"], *video_tokens.shape[1:])
decode_func = decode_latents_wrapper
print(metadata)
print(f"Reshape {video_tokens.shape=}")
    wandb.init(project='video_eval_vis', settings=wandb.Settings(start_method="thread"),
               name=f"{args.project_prefix}vis_{model}", id=f"{args.project_prefix}vis_{model}", resume="allow")
for example_id in range(min(args.max_example, len(video_tokens))):
if args.use_feature:
if "encoder_type" not in metadata:
metadata["encoder_type"] = "temporalvae"
metadata["encoder_name_or_path"] = "stabilityai/stable-video-diffusion-img2vid"
            decode_latents = decode_func(max_images=args.max_images, encoder_name_or_path=metadata["encoder_name_or_path"],
                                         encoder_type=metadata["encoder_type"])
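            # Rescale features by 1 / SVD_SCALE before decoding (the inverse of the scaling applied at encoding time).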
this_video_token = torch.FloatTensor(video_tokens[example_id].copy())[None] / SVD_SCALE
this_video_token = rearrange(this_video_token, "b t c h w -> b t h w c")
video_frames = decode_features(this_video_token, decode_latents)
video_frames = rearrange(video_frames, "b t c h w -> b t h w c")
video_frames = video_frames.detach().cpu().numpy()[0].astype(np.uint8)
else:
decode_latents = decode_func(max_images=args.max_images)
this_video_token = torch.LongTensor(video_tokens[example_id])[None]
video_frames = decode_tokens(this_video_token, decode_latents)
video_frames = rearrange(video_frames, "b t c h w -> b t h w c")
video_frames = video_frames.detach().cpu().numpy()[0].astype(np.uint8)
        output_gif_path = os.path.join(args.token_dir, f"example{example_id}.gif")
# `generate` should populate `metadata.json` with these keys, while ground truth metadata does not have them
is_generated_data = all(key in metadata for key in ("num_prompt_frames", "window_size"))
if is_generated_data:
if video_tokens[example_id].shape[0] != metadata["window_size"] * 2 - metadata["num_prompt_frames"]:
raise ValueError(f"Unexpected {video_tokens.shape=} given {metadata['window_size']=}, {metadata['num_prompt_frames']=}")
captioned_frames = []
for i, frame in enumerate(video_frames):
if i < metadata["num_prompt_frames"]:
caption = "Prompt"
elif i < metadata["window_size"]:
caption = "Generated"
else:
caption = "Ground truth"
captioned_frames.append(caption_image(frame, caption))
else:
# Leave ground truth frames uncaptioned
captioned_frames = video_frames
gif_path = export_to_gif(captioned_frames, output_gif_path, args.fps)
print(f"Saved to {output_gif_path}")
if not args.disable_comic:
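            # Comic layout: top row = prompt + predicted frames, bottom row = prompt + ground-truth frames.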
fig, axs = plt.subplots(nrows=2, ncols=metadata["window_size"], figsize=(3 * metadata["window_size"], 3 * 2))
for i, image in enumerate(video_frames):
if i < metadata["num_prompt_frames"]:
curr_axs = [axs[0, i], axs[1, i]]
title = "Prompt"
elif i < metadata["window_size"]:
curr_axs = [axs[0, i]]
title = "Prediction"
else:
curr_axs = [axs[1, i - metadata["window_size"] + metadata["num_prompt_frames"]]]
title = "Ground truth"
for ax in curr_axs:
ax.set_title(title)
ax.imshow(image)
ax.axis("off")
            output_comic_path = os.path.join(args.token_dir, f"example{example_id}.png")
plt.savefig(output_comic_path, bbox_inches="tight")
plt.close()
print(f"Saved to {output_comic_path}")
wandb.log({f"{dataset}/gif_{example_id}": wandb.Video(gif_path)})
# wandb.log({f"{dataset}/comic_{args.example_ind}": wandb.Image(output_comic_path)})
wandb.run.summary["model_checkpoint"] = metadata["model_checkpoint"]
wandb.run.summary["dataset"] = metadata["dataset"]
wandb.run.summary["trained_steps"] = metadata["trained_steps"]
wandb.finish()
if __name__ == "__main__":
main()