# semanticist/engine/trainer_utils.py
import os
import torch
import cv2
import numpy as np
import torch_fidelity
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
import importlib
from torch.optim import AdamW
from semanticist.utils.lr_scheduler import build_scheduler
def get_obj_from_str(string, reload=False):
"""Get object from string path."""
module, cls = string.rsplit(".", 1)
if reload:
module_imp = importlib.import_module(module)
importlib.reload(module_imp)
return getattr(importlib.import_module(module, package=None), cls)
def instantiate_from_config(config):
"""Instantiate an object from a config dictionary."""
if not "target" in config:
raise KeyError("Expected key `target` to instantiate.")
return get_obj_from_str(config["target"])(**config.get("params", dict()))
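
# Example (a hedged sketch; the config dict below is hypothetical and uses a stock
# torch class just to illustrate the `target`/`params` convention):
#   cfg = {"target": "torch.nn.Linear", "params": {"in_features": 8, "out_features": 4}}
#   layer = instantiate_from_config(cfg)  # equivalent to torch.nn.Linear(8, 4)
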
def is_dist_avail_and_initialized():
    """Check if distributed training is available and initialized."""
    if not torch.distributed.is_available():
        return False
    if not torch.distributed.is_initialized():
        return False
    return True
def is_main_process():
"""Check if the current process is the main process."""
return not is_dist_avail_and_initialized() or torch.distributed.get_rank() == 0
def concat_all_gather(tensor):
"""
    Performs all_gather operation on the provided tensor.
*** Warning ***: torch.distributed.all_gather has no gradient.
"""
tensors_gather = [torch.ones_like(tensor)
for _ in range(torch.distributed.get_world_size())]
torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
output = torch.cat(tensors_gather, dim=0)
return output
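
# Example (illustrative only; requires an initialized process group, and `local_feats`
# is a hypothetical per-rank tensor):
#   local_feats = torch.randn(8, 256, device="cuda")
#   all_feats = concat_all_gather(local_feats)  # shape (8 * world_size, 256); gradients are not propagated
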
def requires_grad(model, flag=True):
"""Set requires_grad flag for all model parameters."""
for p in model.parameters():
p.requires_grad = flag
def save_img(img, save_path):
"""Save a single image to disk."""
img = np.clip(img.float().numpy().transpose([1, 2, 0]) * 255, 0, 255)
    img = img.astype(np.uint8)[:, :, ::-1]  # RGB to BGR for cv2.imwrite
cv2.imwrite(save_path, img)
def save_img_batch(imgs, save_paths):
"""Process and save multiple images at once using a thread pool."""
# Convert to numpy and prepare all images in one go
imgs = np.clip(imgs.float().numpy().transpose(0, 2, 3, 1) * 255, 0, 255).astype(np.uint8)
imgs = imgs[:, :, :, ::-1] # RGB to BGR for all images at once
with ThreadPoolExecutor(max_workers=32) as pool:
# Submit all tasks at once
futures = [pool.submit(cv2.imwrite, path, img)
for path, img in zip(save_paths, imgs)]
# Wait for all tasks to complete
for future in futures:
future.result() # This will raise any exceptions that occurred
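
# Example (hypothetical tensor and paths; assumes an NCHW float tensor of RGB images in [0, 1] on CPU):
#   imgs = torch.rand(4, 3, 256, 256)
#   paths = [f"/tmp/sample_{i}.png" for i in range(4)]
#   save_img_batch(imgs, paths)
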
def get_fid_stats(real_dir, rec_dir, fid_stats):
"""Calculate FID statistics between real and reconstructed images."""
stats = torch_fidelity.calculate_metrics(
input1=rec_dir,
input2=real_dir,
fid_statistics_file=fid_stats,
cuda=True,
isc=True,
fid=True,
kid=False,
prc=False,
verbose=False,
)
return stats
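
# Example (hypothetical directories and statistics file; torch_fidelity returns a dict of metrics):
#   stats = get_fid_stats("results/real", "results/recon", "fid_stats/custom_stats.npz")
#   print(stats["frechet_inception_distance"], stats["inception_score_mean"])
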
def create_scheduler(optimizer, num_epoch, steps_per_epoch, lr_min, warmup_steps,
warmup_lr_init, decay_steps, cosine_lr):
"""Create a learning rate scheduler."""
scheduler = build_scheduler(
optimizer,
num_epoch,
steps_per_epoch,
lr_min,
warmup_steps,
warmup_lr_init,
decay_steps,
cosine_lr,
)
return scheduler
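
# Example (hyper-parameter values are illustrative; the scheduler itself comes from
# semanticist.utils.lr_scheduler.build_scheduler, so its stepping API follows that module):
#   scheduler = create_scheduler(optimizer, num_epoch=100, steps_per_epoch=1000,
#                                lr_min=1e-5, warmup_steps=5000, warmup_lr_init=1e-6,
#                                decay_steps=80000, cosine_lr=True)
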
def load_state_dict(state_dict, model):
"""Helper to load a state dict with proper prefix handling."""
if 'state_dict' in state_dict:
state_dict = state_dict['state_dict']
# Remove '_orig_mod' prefix if present
state_dict = {k.replace('_orig_mod.', ''): v for k, v in state_dict.items()}
missing, unexpected = model.load_state_dict(
state_dict, strict=False
)
if is_main_process():
print(f"Loaded model. Missing: {missing}, Unexpected: {unexpected}")
def load_safetensors(path, model):
"""Helper to load a safetensors checkpoint."""
from safetensors.torch import safe_open
with safe_open(path, framework="pt", device="cpu") as f:
state_dict = {k: f.get_tensor(k) for k in f.keys()}
load_state_dict(state_dict, model)
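
# Example (hypothetical checkpoint path; `model` must already be constructed with matching parameter names):
#   load_safetensors("checkpoints/model.safetensors", model)
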
def setup_result_folders(result_folder):
"""Setup result folders for saving models and images."""
model_saved_dir = os.path.join(result_folder, "models")
os.makedirs(model_saved_dir, exist_ok=True)
image_saved_dir = os.path.join(result_folder, "images")
os.makedirs(image_saved_dir, exist_ok=True)
return model_saved_dir, image_saved_dir
def create_optimizer(model, weight_decay, learning_rate, betas=(0.9, 0.95)):
"""Create an AdamW optimizer with weight decay for 2D parameters only."""
# start with all of the candidate parameters
param_dict = {pn: p for pn, p in model.named_parameters()}
# filter out those that do not require grad
param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
    # Create optim groups: any parameter with dim >= 2 is weight-decayed, the rest are not,
    # i.e. all weight tensors in matmuls + embeddings decay; biases and layernorms don't.
decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
optim_groups = [
{'params': decay_params, 'weight_decay': weight_decay},
{'params': nodecay_params, 'weight_decay': 0.0}
]
num_decay_params = sum(p.numel() for p in decay_params)
num_nodecay_params = sum(p.numel() for p in nodecay_params)
if is_main_process():
print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
optimizer = AdamW(optim_groups, lr=learning_rate, betas=betas)
return optimizer
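
# Example (hyper-parameters are illustrative; only parameters with requires_grad=True are optimized):
#   optimizer = create_optimizer(model, weight_decay=0.05, learning_rate=1e-4, betas=(0.9, 0.95))
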
class EMAModel:
"""Model Exponential Moving Average."""
def __init__(self, model, device, decay=0.999):
self.device = device
self.decay = decay
self.ema_params = OrderedDict(
(name, param.clone().detach().to(device))
for name, param in model.named_parameters()
if param.requires_grad
)
@torch.no_grad()
def update(self, model):
for name, param in model.named_parameters():
if param.requires_grad:
if name in self.ema_params:
self.ema_params[name].lerp_(param.data, 1 - self.decay)
else:
self.ema_params[name] = param.data.clone().detach()
def state_dict(self):
return self.ema_params
def load_state_dict(self, params):
self.ema_params = OrderedDict(
(name, param.clone().detach().to(self.device))
for name, param in params.items()
)
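
# Example (a sketch of the usual EMA loop; `model` and `loader` are hypothetical):
#   ema = EMAModel(model, device="cuda", decay=0.999)
#   for batch in loader:
#       ...  # forward/backward/optimizer step
#       ema.update(model)  # EMA tracks trainable parameters only
#   torch.save(ema.state_dict(), "ema.pt")
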
class PaddedDataset(torch.utils.data.Dataset):
"""Dataset wrapper that pads a dataset to ensure even distribution across processes."""
def __init__(self, dataset, padding_size):
self.dataset = dataset
self.padding_size = padding_size
def __len__(self):
return len(self.dataset) + self.padding_size
def __getitem__(self, idx):
if idx < len(self.dataset):
return self.dataset[idx]
return self.dataset[0]
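
# Example (illustrative; pad so the dataset length divides evenly across `world_size` processes):
#   world_size = torch.distributed.get_world_size()
#   pad = (world_size - len(dataset) % world_size) % world_size
#   padded = PaddedDataset(dataset, pad)  # the extra indices simply return dataset[0]
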
class CacheDataLoader:
"""DataLoader-like interface for cached data with epoch-based shuffling."""
def __init__(self, slots, targets=None, batch_size=32, num_augs=1, seed=None):
self.slots = slots
self.targets = targets
self.batch_size = batch_size
self.num_augs = num_augs
self.seed = seed
self.epoch = 0
# Original dataset size (before augmentations)
self.num_samples = len(slots) // num_augs
def set_epoch(self, epoch):
"""Set epoch for deterministic shuffling."""
self.epoch = epoch
def __len__(self):
"""Return number of batches based on original dataset size."""
return self.num_samples // self.batch_size
def __iter__(self):
"""Return random indices for current epoch."""
g = torch.Generator()
g.manual_seed(self.seed + self.epoch if self.seed is not None else self.epoch)
# Randomly sample indices from the entire augmented dataset
indices = torch.randint(
0, len(self.slots),
(self.num_samples,),
generator=g
).numpy()
# Yield batches of indices
for start in range(0, self.num_samples, self.batch_size):
end = min(start + self.batch_size, self.num_samples)
batch_indices = indices[start:end]
            if self.targets is None:
                # Targets may be omitted when only slots are cached
                yield torch.from_numpy(self.slots[batch_indices])
            else:
                yield (
                    torch.from_numpy(self.slots[batch_indices]),
                    torch.from_numpy(self.targets[batch_indices])
                )
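
# Example (hypothetical cached arrays; `slots` and `targets` are numpy arrays holding
# num_augs augmented copies of each sample):
#   loader = CacheDataLoader(slots, targets, batch_size=256, num_augs=10, seed=0)
#   for epoch in range(num_epochs):
#       loader.set_epoch(epoch)  # makes the per-epoch sampling deterministic
#       for slot_batch, target_batch in loader:
#           ...  # training step on the cached batch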