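# Gradio demo for semanticist: class-conditional ImageNet generation with a
# stage-2 autoregressive GPT over slot tokens, decoded by the stage-1
# DiffuseSlot tokenizer. Checkpoints are downloaded from the
# 'tennant/semanticist' repository on the Hugging Face Hub.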
import gradio as gr
import numpy as np
from PIL import Image
import os.path as osp
import torch
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
from tqdm import tqdm
from huggingface_hub import hf_hub_download
from semanticist.engine.trainer_utils import instantiate_from_config
from semanticist.stage1.diffuse_slot import DiffuseSlot
from semanticist.stage2.gpt import GPT_models
from semanticist.stage2.generate import generate
from safetensors import safe_open
from semanticist.utils.datasets import vae_transforms
from imagenet_classes import imagenet_classes

transform = vae_transforms('test')
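
# Helpers for normalizing tensors and converting model outputs into
# displayable numpy arrays / PIL images.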
def norm_ip(img, low, high):
    img.clamp_(min=low, max=high)
    img.sub_(low).div_(max(high - low, 1e-5))


def norm_range(t, value_range):
    if value_range is not None:
        norm_ip(t, value_range[0], value_range[1])
    else:
        norm_ip(t, float(t.min()), float(t.max()))

def convert_np(img):
    """Convert a CHW float tensor in [0, 1] to an HWC uint8 numpy array."""
    ndarr = img.mul(255).add_(0.5).clamp_(0, 255)\
        .permute(1, 2, 0).to("cpu", torch.uint8).numpy()
    return ndarr


def convert_PIL(img):
    """Convert a CHW float tensor in [0, 1] to a PIL image."""
    ndarr = img.mul(255).add_(0.5).clamp_(0, 255)\
        .permute(1, 2, 0).to("cpu", torch.uint8).numpy()
    img = Image.fromarray(ndarr)
    return img


def norm_slots(slots):
    """Standardize slots to zero mean and unit variance along the last dim."""
    mean = torch.mean(slots, dim=-1, keepdim=True)
    std = torch.std(slots, dim=-1, keepdim=True)
    return (slots - mean) / std
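
# Checkpoint-loading helpers: accept plain PyTorch state dicts as well as
# safetensors files or checkpoint directories.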
def load_state_dict(state_dict, model):
    """Helper to load a state dict with proper prefix handling."""
    if 'state_dict' in state_dict:
        state_dict = state_dict['state_dict']
    # Remove the '_orig_mod.' prefix (added by torch.compile) if present
    state_dict = {k.replace('_orig_mod.', ''): v for k, v in state_dict.items()}
    missing, unexpected = model.load_state_dict(
        state_dict, strict=False
    )
    # print(f"Loaded model. Missing: {missing}, Unexpected: {unexpected}")


def load_safetensors(path, model):
    """Helper to load a safetensors checkpoint."""
    from safetensors.torch import safe_open
    with safe_open(path, framework="pt", device="cpu") as f:
        state_dict = {k: f.get_tensor(k) for k in f.keys()}
    load_state_dict(state_dict, model)

def load_checkpoint(ckpt_path, model):
    if ckpt_path is None or not osp.exists(ckpt_path):
        return
    if osp.isdir(ckpt_path):
        # ckpt_path is something like 'path/to/models/step10/'
        model_path = osp.join(ckpt_path, "model.safetensors")
        if osp.exists(model_path):
            load_safetensors(model_path, model)
    else:
        # ckpt_path is something like 'path/to/models/step10.pt'
        if ckpt_path.endswith(".safetensors"):
            load_safetensors(ckpt_path, model)
        else:
            state_dict = torch.load(ckpt_path, map_location="cpu")
            load_state_dict(state_dict, model)
    print(f"Loaded checkpoint from {ckpt_path}")
device = "cuda" if torch.cuda.is_available() else "cpu" | |
print(f"Is CUDA available: {torch.cuda.is_available()}") | |
if device == 'cuda': | |
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}") | |
ckpt_path = hf_hub_download(repo_id='tennant/semanticist', filename="semanticist_ar_gen_L.pkl", cache_dir='/mnt/ceph_rbd/mnt_pvc_vid_data/zbc/cache/') | |
config_path = 'configs/autoregressive_xl.yaml' | |
cfg = OmegaConf.load(config_path) | |
params = cfg.trainer.params | |
ae_model = instantiate_from_config(params.ae_model).to(device) | |
ae_model_path = hf_hub_download(repo_id='tennant/semanticist', filename="semanticist_tok_XL.pkl", cache_dir='/mnt/ceph_rbd/mnt_pvc_vid_data/zbc/cache/') | |
load_checkpoint(ae_model_path, ae_model) | |
ae_model.eval() | |
gpt_model = GPT_models[params.gpt_model.target](**params.gpt_model.params).to(device) | |
load_checkpoint(ckpt_path, gpt_model) | |
gpt_model.eval(); | |
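
# Decode the same slots while keeping only the first k tokens (the rest are
# dropped via the nested sampler) to visualize the coarse-to-fine decomposition.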
def viz_diff_slots(model, slots, nums, cfg=1.0, return_figs=False):
    n_slots_inf = []
    for num_slots_to_inference in nums:
        drop_mask = model.nested_sampler(slots.shape[0], device, num_slots_to_inference)
        recon_n = model.sample(slots, drop_mask=drop_mask, cfg=cfg)
        n_slots_inf.append(recon_n)
    return [convert_np(n_slots_inf[i][0]) for i in range(len(n_slots_inf))]

num_slots = params.ae_model.params.num_slots
slot_dim = params.ae_model.params.slot_dim
dtype = torch.bfloat16
# the model is trained with only 32 tokens.
num_slots_to_gen = 32

# Generate slot tokens for a given ImageNet class with the AR model.
def generate_from_class(class_id, cfg_scale):
    with torch.no_grad():
        dtype = torch.float
        num_slots_to_gen = 32
        with torch.autocast(device, dtype=dtype):
            slots_gen = generate(
                gpt_model,
                torch.tensor([class_id]).to(device),
                num_slots_to_gen,
                cfg_scale=cfg_scale,
                cfg_schedule="linear"
            )
        if num_slots_to_gen < num_slots:
            # Pad the remaining positions with the learned null condition.
            null_slots = ae_model.dit.null_cond.expand(slots_gen.shape[0], -1, -1)
            null_slots = null_slots[:, num_slots_to_gen:, :]
            slots_gen = torch.cat([slots_gen, null_slots], dim=1)
        return slots_gen
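
# Gradio UI: choose an ImageNet class, set the CFG scale and the token counts
# to visualize, then generate an image and (optionally) a gallery showing
# reconstructions from different numbers of tokens.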
with gr.Blocks() as demo:
    with gr.Row():
        # First column - Input and configs
        with gr.Column(scale=1):
            gr.Markdown("## Input")

            # Map class indices to names and build the dropdown entries
            imagenet_classes = {k: v for k, v in enumerate(imagenet_classes)}
            class_choices = [f"{id}: {name}" for id, name in imagenet_classes.items()]

            # Dropdown for class selection
            class_dropdown = gr.Dropdown(
                choices=class_choices,
                label="Select ImageNet Class",
                value=class_choices[0] if class_choices else None
            )

            # Option to enter class ID directly
            class_id_input = gr.Number(
                label="Or enter class ID directly (0-999)",
                value=0,
                minimum=0,
                maximum=999,
                step=1
            )

            with gr.Group():
                gr.Markdown("### Configuration")
                show_gallery = gr.Checkbox(label="Show Gallery", value=True)
                slider = gr.Slider(minimum=0.1, maximum=20.0, value=4.0, label="CFG value")
                labels_input = gr.Textbox(
                    label="Number of tokens to reconstruct (comma-separated)",
                    value="1, 2, 4, 8, 16",
                    placeholder="Enter comma-separated numbers for the number of slots to use"
                )

        # Second column - Output (conditionally rendered)
        with gr.Column(scale=1):
            gr.Markdown("## Output")

            # Container for conditional rendering
            with gr.Group(visible=True) as gallery_container:
                gallery = gr.Gallery(label="Result Gallery", columns=3, height="auto", show_label=True)

            # Always visible output image
            output_image = gr.Image(label="Generated Image", type="numpy")

    # Handle form submission
    submit_btn = gr.Button("Generate")

    # Define the processing logic
    def update_outputs(class_selection, class_id, show_gallery_value, slider_value, labels_text):
        # Determine which class to use - either from the dropdown or the direct input
        if class_selection:
            # Extract the class ID from the dropdown selection
            selected_class_id = int(class_selection.split(":")[0])
        else:
            selected_class_id = int(class_id)

        try:
            # Parse the token counts from the text input
            if labels_text and "," in labels_text:
                labels = [int(label.strip()) for label in labels_text.split(",")]
            else:
                # Default labels if none are provided or the format is wrong
                labels = [1, 4, 16, 64, 256]
        except Exception:
            labels = [1, 4, 16, 64, 256]
        while len(labels) < 3:
            labels.append(256)

        # Generate the image based on the selected class
        slots_gen = generate_from_class(selected_class_id, cfg_scale=slider_value)
        recon = viz_diff_slots(ae_model, slots_gen, [32], cfg=slider_value)[0]
        # Always generate the model decomposition for potential gallery display
        model_decompose = viz_diff_slots(ae_model, slots_gen, labels, cfg=slider_value)

        # Toggle the gallery container via an update; assigning to
        # gallery_container.visible after render has no effect in Gradio.
        if not show_gallery_value:
            # If only the image should be shown, return just the processed image
            return gr.update(visible=False), [], recon
        else:
            # Create image variations and pair them with labels
            gallery_images = [
                (recon, f'Generated from class {selected_class_id}'),
            ] + [(img, 'Gen. with ' + str(label) + ' tokens') for img, label in zip(model_decompose, labels)]
            return gr.update(visible=True), gallery_images, recon

    # Connect the inputs and outputs
    submit_btn.click(
        fn=update_outputs,
        inputs=[class_dropdown, class_id_input, show_gallery, slider, labels_input],
        outputs=[gallery_container, gallery, output_image]
    )

    # Also update when checkbox changes
    show_gallery.change(
        fn=lambda value: gr.update(visible=value),
        inputs=[show_gallery],
        outputs=[gallery_container]
    )
# Add examples | |
examples = [ | |
# ["0: tench, Tinca tinca", 0, True, 4.0, "1,2,4,8,16"], | |
["1: goldfish", 1, True, 4.0, "1,2,4,8,16"], | |
# ["2: great white shark, white shark", 2, True, 4.0, "1,2,4,8,16"], | |
] | |
gr.Examples( | |
examples=examples, | |
inputs=[class_dropdown, class_id_input, show_gallery, slider, labels_input], | |
outputs=[gallery_container, gallery, output_image], | |
fn=update_outputs, | |
cache_examples=False | |
) | |

# Launch the demo
if __name__ == "__main__":
    demo.launch()