dreambooth / 00_created /04_ShivamShrirao /train_dreambooth_mod.py

Upload 00_created with huggingface_hub

9725440 over 2 years ago

26.6 kB

	import argparse
	import math
	import os
	from contextlib import nullcontext
	from pathlib import Path
	from typing import Optional

	import torch
	import torch.nn.functional as F
	import torch.utils.checkpoint
	from torch.utils.data import Dataset

	from accelerate import Accelerator
	from accelerate.logging import get_logger
	from accelerate.utils import set_seed
	from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
	from diffusers.optimization import get_scheduler
	from huggingface_hub import HfFolder, Repository, whoami
	from PIL import Image
	from torchvision import transforms
	from tqdm.auto import tqdm
	from transformers import CLIPTextModel, CLIPTokenizer


	# Module-level logger, created with the get_logger helper imported from accelerate.logging.
	logger = get_logger(__name__)


	def parse_args(input_args=None):
	    """
	    Build the command-line parser for the training script and return the parsed arguments.

	    Args:
	        input_args: optional list of argument strings to parse. When ``None`` (the default)
	            ``argparse`` reads ``sys.argv`` as before.

	    Returns:
	        The ``argparse.Namespace`` holding every option. ``local_rank`` is overridden by the
	        ``LOCAL_RANK`` environment variable when that variable is set and differs.

	    Raises:
	        ValueError: if ``--instance_prompt`` is missing, or if ``--with_prior_preservation`` is
	            given without ``--class_data_dir`` or ``--class_prompt``.
	    """
	    parser = argparse.ArgumentParser(description="Simple example of a training script.")
	    parser.add_argument(
	        "--pretrained_model_name_or_path",
	        type=str,
	        default=None,
	        required=True,
	        help="Path to pretrained model or model identifier from huggingface.co/models.",
	    )
	    parser.add_argument(
	        "--tokenizer_name",
	        type=str,
	        default=None,
	        help="Pretrained tokenizer name or path if not the same as model_name",
	    )
	    parser.add_argument(
	        "--instance_data_dir",
	        type=str,
	        default=None,
	        required=True,
	        help="A folder containing the training data of instance images.",
	    )
	    parser.add_argument(
	        "--class_data_dir",
	        type=str,
	        default=None,
	        required=False,
	        help="A folder containing the training data of class images.",
	    )
	    parser.add_argument(
	        "--instance_prompt",
	        type=str,
	        default=None,
	        help="The prompt with identifier specifying the instance",
	    )
	    parser.add_argument(
	        "--class_prompt",
	        type=str,
	        default=None,
	        help="The prompt to specify images in the same class as provided instance images.",
	    )
	    parser.add_argument(
	        "--with_prior_preservation",
	        default=False,
	        action="store_true",
	        help="Flag to add prior preservation loss.",
	    )
	    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
	    parser.add_argument(
	        "--num_class_images",
	        type=int,
	        default=100,
	        help=(
	            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
	            " sampled with class_prompt."
	        ),
	    )
	    parser.add_argument(
	        "--output_dir",
	        type=str,
	        default="text-inversion-model",
	        help="The output directory where the model predictions and checkpoints will be written.",
	    )
	    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
	    parser.add_argument(
	        "--resolution",
	        type=int,
	        default=512,
	        help=(
	            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
	            " resolution"
	        ),
	    )
	    parser.add_argument(
	        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
	    )
	    parser.add_argument(
	        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
	    )
	    parser.add_argument(
	        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
	    )
	    parser.add_argument("--num_train_epochs", type=int, default=1)
	    parser.add_argument(
	        "--max_train_steps",
	        type=int,
	        default=None,
	        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
	    )
	    parser.add_argument(
	        "--gradient_accumulation_steps",
	        type=int,
	        default=1,
	        help="Number of updates steps to accumulate before performing a backward/update pass.",
	    )
	    parser.add_argument(
	        "--gradient_checkpointing",
	        action="store_true",
	        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
	    )
	    parser.add_argument(
	        "--learning_rate",
	        type=float,
	        default=5e-6,
	        help="Initial learning rate (after the potential warmup period) to use.",
	    )
	    parser.add_argument(
	        "--scale_lr",
	        action="store_true",
	        default=False,
	        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
	    )
	    parser.add_argument(
	        "--lr_scheduler",
	        type=str,
	        default="constant",
	        help=(
	            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
	            ' "constant", "constant_with_warmup"]'
	        ),
	    )
	    parser.add_argument(
	        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
	    )
	    parser.add_argument(
	        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
	    )
	    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
	    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
	    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
	    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
	    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
	    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
	    parser.add_argument(
	        "--use_auth_token",
	        action="store_true",
	        help=(
	            "Will use the token generated when running `huggingface-cli login` (necessary to use this script with"
	            " private models)."
	        ),
	    )
	    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
	    parser.add_argument(
	        "--hub_model_id",
	        type=str,
	        default=None,
	        help="The name of the repository to keep in sync with the local `output_dir`.",
	    )
	    parser.add_argument(
	        "--logging_dir",
	        type=str,
	        default="logs",
	        help="[TensorBoard](https://www.tensorflow.org/tensorboard) log directory, resolved relative to output_dir.",
	    )
	    parser.add_argument("--log_interval", type=int, default=10, help="Log every N steps.")
	    parser.add_argument(
	        "--mixed_precision",
	        type=str,
	        default="no",
	        choices=["no", "fp16", "bf16"],
	        help=(
	            # Adjacent literals are concatenated verbatim, so each piece carries its own separator.
	            "Whether to use mixed precision. Choose"
	            " between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10"
	            " and an Nvidia Ampere GPU."
	        ),
	    )
	    parser.add_argument("--not_cache_latents", action="store_true", help="Do not precompute and cache latents from VAE.")
	    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")

	    # parse_args(None) falls back to sys.argv, so existing callers are unaffected.
	    args = parser.parse_args(input_args)

	    # A LOCAL_RANK set in the environment takes precedence over the CLI flag.
	    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
	    if env_local_rank != -1 and env_local_rank != args.local_rank:
	        args.local_rank = env_local_rank

	    # --instance_data_dir is enforced by argparse (required=True); the instance prompt is not,
	    # yet it is always tokenized, so reject a missing one here instead of failing mid-run.
	    if args.instance_prompt is None:
	        raise ValueError("You must specify a prompt for instance images.")

	    if args.with_prior_preservation:
	        if args.class_data_dir is None:
	            raise ValueError("You must specify a data directory for class images.")
	        if args.class_prompt is None:
	            raise ValueError("You must specify prompt for class images.")

	    return args


	class DreamBoothDataset(Dataset):
	    """
	    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
	    It pre-processes the images and tokenizes the prompts.

	    Every item is a dict with ``"instance_images"`` and ``"instance_prompt_ids"``; when a class
	    folder was supplied it also holds ``"class_images"`` and ``"class_prompt_ids"``. The dataset
	    length is the larger of the two image counts, and indices wrap around the smaller set.

	    Raises:
	        ValueError: if the instance folder does not exist or holds no files, or if a class folder
	            is supplied but holds no files.
	    """

	    def __init__(
	        self,
	        instance_data_root,
	        instance_prompt,
	        tokenizer,
	        class_data_root=None,
	        class_prompt=None,
	        size=512,
	        center_crop=False,
	    ):
	        self.size = size
	        self.center_crop = center_crop
	        self.tokenizer = tokenizer

	        self.instance_data_root = Path(instance_data_root)
	        if not self.instance_data_root.exists():
	            raise ValueError("Instance images root doesn't exists.")

	        # Sorted so that the index -> file mapping does not depend on filesystem listing order.
	        self.instance_images_path = sorted(p for p in self.instance_data_root.iterdir() if p.is_file())
	        self.num_instance_images = len(self.instance_images_path)
	        if self.num_instance_images == 0:
	            # __getitem__ indexes with `index % num_instance_images`; fail here with a clear message.
	            raise ValueError(f"No instance images found in {self.instance_data_root}.")
	        self.instance_prompt = instance_prompt
	        self._length = self.num_instance_images

	        if class_data_root is not None:
	            self.class_data_root = Path(class_data_root)
	            self.class_data_root.mkdir(parents=True, exist_ok=True)
	            self.class_images_path = sorted(p for p in self.class_data_root.iterdir() if p.is_file())
	            self.num_class_images = len(self.class_images_path)
	            if self.num_class_images == 0:
	                raise ValueError(f"No class images found in {self.class_data_root}.")
	            self._length = max(self.num_class_images, self.num_instance_images)
	            self.class_prompt = class_prompt
	        else:
	            self.class_data_root = None

	        self.image_transforms = transforms.Compose(
	            [
	                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
	                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
	                transforms.ToTensor(),
	                transforms.Normalize([0.5], [0.5]),
	            ]
	        )

	    def __len__(self):
	        return self._length

	    def _load_image(self, path):
	        """Open ``path``, convert it to RGB if needed and apply the image transforms."""
	        # The context manager closes the underlying file once the pixels have been consumed.
	        with Image.open(path) as image:
	            if image.mode != "RGB":
	                image = image.convert("RGB")
	            return self.image_transforms(image)

	    def _tokenize(self, prompt):
	        """Return the un-padded, truncated token ids of ``prompt``."""
	        return self.tokenizer(
	            prompt,
	            padding="do_not_pad",
	            truncation=True,
	            max_length=self.tokenizer.model_max_length,
	        ).input_ids

	    def __getitem__(self, index):
	        example = {}
	        # Indices wrap around so the smaller of the two image sets is cycled.
	        example["instance_images"] = self._load_image(
	            self.instance_images_path[index % self.num_instance_images]
	        )
	        example["instance_prompt_ids"] = self._tokenize(self.instance_prompt)

	        if self.class_data_root:
	            example["class_images"] = self._load_image(self.class_images_path[index % self.num_class_images])
	            example["class_prompt_ids"] = self._tokenize(self.class_prompt)

	        return example


	class PromptDataset(Dataset):
	    """Yield the same prompt ``num_samples`` times, each item tagged with its own index."""

	    def __init__(self, prompt, num_samples):
	        self.num_samples = num_samples
	        self.prompt = prompt

	    def __len__(self):
	        return self.num_samples

	    def __getitem__(self, index):
	        # Key order matches the original dict: "prompt" first, then "index".
	        return {"prompt": self.prompt, "index": index}


	class LatentsDataset(Dataset):
	    """Pair each cached latent entry with the text entry stored at the same position."""

	    def __init__(self, latents_cache, texts):
	        self.texts = texts
	        self.latents_cache = latents_cache

	    def __len__(self):
	        # The length follows the latent cache; ``texts`` is indexed in lockstep with it.
	        return len(self.latents_cache)

	    def __getitem__(self, index):
	        latent = self.latents_cache[index]
	        text = self.texts[index]
	        return latent, text


	class AverageMeter:
	    """Keep a running weighted sum, a sample count and their ratio."""

	    def __init__(self, name=None):
	        self.name = name
	        self.reset()

	    def reset(self):
	        """Set the sum, the count and the average back to zero."""
	        self.sum = 0
	        self.count = 0
	        self.avg = 0

	    def update(self, val, n=1):
	        """Add ``val`` weighted by ``n`` samples and refresh the average."""
	        weighted = val * n
	        self.sum += weighted
	        self.count += n
	        self.avg = self.sum / self.count


	def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
	    """
	    Return ``"<owner>/<model_id>"`` for a repository.

	    The owner is ``organization`` when one is given; otherwise it is the ``"name"`` field returned
	    by ``whoami`` for the token. When ``token`` is ``None`` it is first read via ``HfFolder.get_token()``.
	    """
	    if token is None:
	        token = HfFolder.get_token()

	    owner = organization if organization is not None else whoami(token)["name"]
	    return f"{owner}/{model_id}"


	def main():
	    """
	    Run DreamBooth fine-tuning of the UNet and the text encoder of a Stable Diffusion checkpoint.

	    In order: parse the CLI arguments, sample any missing class images when prior preservation is
	    enabled, load the tokenizer / text encoder / VAE / UNet, optionally pre-compute the VAE latent
	    distributions, train with the noise-prediction MSE loss (plus the weighted prior loss), then
	    save the resulting pipeline to ``output_dir`` and optionally push it to the Hub.
	    """
	    args = parse_args()
	    logging_dir = Path(args.output_dir, args.logging_dir)

	    accelerator = Accelerator(
	        gradient_accumulation_steps=args.gradient_accumulation_steps,
	        mixed_precision=args.mixed_precision,
	        log_with="tensorboard",
	        logging_dir=logging_dir,
	    )

	    if args.seed is not None:
	        set_seed(args.seed)

	    if args.with_prior_preservation:
	        class_images_dir = Path(args.class_data_dir)
	        if not class_images_dir.exists():
	            class_images_dir.mkdir(parents=True)
	        cur_class_images = len(list(class_images_dir.iterdir()))

	        if cur_class_images < args.num_class_images:
	            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
	            pipeline = StableDiffusionPipeline.from_pretrained(
	                args.pretrained_model_name_or_path, use_auth_token=args.use_auth_token, torch_dtype=torch_dtype
	            )
	            pipeline.set_progress_bar_config(disable=True)

	            num_new_images = args.num_class_images - cur_class_images
	            logger.info(f"Number of class images to sample: {num_new_images}.")

	            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
	            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)

	            sample_dataloader = accelerator.prepare(sample_dataloader)
	            pipeline.to(accelerator.device)

	            # nullcontext has to be instantiated: the bare class is not a context manager.
	            context = torch.autocast("cuda") if accelerator.device.type == "cuda" else nullcontext()
	            for example in tqdm(
	                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
	            ):
	                with context:
	                    images = pipeline(example["prompt"]).images

	                # File names continue after the images that already exist in the folder.
	                for i, image in enumerate(images):
	                    image.save(class_images_dir / f"{example['index'][i] + cur_class_images}.jpg")

	            del pipeline
	            if torch.cuda.is_available():
	                torch.cuda.empty_cache()

	    # Handle the repository creation
	    if accelerator.is_main_process:
	        if args.push_to_hub:
	            if args.hub_model_id is None:
	                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
	            else:
	                repo_name = args.hub_model_id
	            repo = Repository(args.output_dir, clone_from=repo_name)

	            # Append only the missing ignore patterns; opening with "w+" would wipe an existing file.
	            gitignore_path = Path(args.output_dir) / ".gitignore"
	            gitignore_text = gitignore_path.read_text() if gitignore_path.exists() else ""
	            known_patterns = gitignore_text.splitlines()
	            missing_patterns = [p for p in ("step_*", "epoch_*") if p not in known_patterns]
	            if missing_patterns:
	                with open(gitignore_path, "a") as gitignore:
	                    if gitignore_text and not gitignore_text.endswith("\n"):
	                        gitignore.write("\n")
	                    gitignore.writelines(f"{pattern}\n" for pattern in missing_patterns)
	        elif args.output_dir is not None:
	            os.makedirs(args.output_dir, exist_ok=True)

	    # Load the tokenizer
	    if args.tokenizer_name:
	        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
	    elif args.pretrained_model_name_or_path:
	        tokenizer = CLIPTokenizer.from_pretrained(
	            args.pretrained_model_name_or_path, subfolder="tokenizer", use_auth_token=args.use_auth_token
	        )

	    # Load models and create wrapper for stable diffusion
	    text_encoder = CLIPTextModel.from_pretrained(
	        args.pretrained_model_name_or_path, subfolder="text_encoder", use_auth_token=args.use_auth_token
	    )
	    vae = AutoencoderKL.from_pretrained(
	        args.pretrained_model_name_or_path, subfolder="vae", use_auth_token=args.use_auth_token
	    )
	    unet = UNet2DConditionModel.from_pretrained(
	        args.pretrained_model_name_or_path, subfolder="unet", use_auth_token=args.use_auth_token
	    )

	    if args.gradient_checkpointing:
	        unet.enable_gradient_checkpointing()

	    if args.scale_lr:
	        args.learning_rate = (
	            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
	        )

	    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
	    if args.use_8bit_adam:
	        try:
	            import bitsandbytes as bnb
	        except ImportError:
	            raise ImportError(
	                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
	            )
	        print("Use AdamW8bit optimizer")
	        optimizer_class = bnb.optim.AdamW8bit
	    else:
	        optimizer_class = torch.optim.AdamW

	    class WrapperModel(torch.nn.Module):
	        """Bundle the UNet and the text encoder so both are prepared and optimized as one module."""

	        def __init__(self, un, te):
	            super().__init__()
	            self.unet = un
	            self.text_encoder = te

	        def forward(self, noisy_latents, timesteps, input_ids):
	            # Running both sub-modules through forward() means the training step calls the object
	            # returned by accelerator.prepare rather than the raw sub-modules.
	            encoder_hidden_states = self.text_encoder(input_ids)[0]
	            return self.unet(noisy_latents, timesteps, encoder_hidden_states).sample

	    model = WrapperModel(unet, text_encoder)

	    # Optimize the UNet and the text encoder together.
	    optimizer = optimizer_class(
	        model.parameters(),
	        lr=args.learning_rate,
	        betas=(args.adam_beta1, args.adam_beta2),
	        weight_decay=args.adam_weight_decay,
	        eps=args.adam_epsilon,
	    )

	    noise_scheduler = DDPMScheduler(
	        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
	    )

	    train_dataset = DreamBoothDataset(
	        instance_data_root=args.instance_data_dir,
	        instance_prompt=args.instance_prompt,
	        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
	        class_prompt=args.class_prompt,
	        tokenizer=tokenizer,
	        size=args.resolution,
	        center_crop=args.center_crop,
	    )

	    def collate_fn(examples):
	        input_ids = [example["instance_prompt_ids"] for example in examples]
	        pixel_values = [example["instance_images"] for example in examples]

	        # Concat class and instance examples for prior preservation.
	        # We do this to avoid doing two forward passes.
	        if args.with_prior_preservation:
	            input_ids += [example["class_prompt_ids"] for example in examples]
	            pixel_values += [example["class_images"] for example in examples]

	        pixel_values = torch.stack(pixel_values)
	        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()

	        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids

	        batch = {
	            "input_ids": input_ids,
	            "pixel_values": pixel_values,
	        }
	        return batch

	    train_dataloader = torch.utils.data.DataLoader(
	        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn, pin_memory=True
	    )

	    # Move vae to gpu
	    vae.to(accelerator.device)

	    cache_latents = not args.not_cache_latents
	    if cache_latents:
	        latents_cache = []
	        texts = []
	        for batch in tqdm(train_dataloader, desc="Caching latents"):
	            with torch.no_grad():
	                batch["pixel_values"] = batch["pixel_values"].to(accelerator.device, non_blocking=True)
	                latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
	                texts.append(batch["input_ids"])
	        # Every cached entry is already a full batch, hence batch_size=1 and an identity collate.
	        train_dataset = LatentsDataset(latents_cache, texts)
	        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, collate_fn=lambda x: x, shuffle=True)

	        # The VAE is only needed again when latents are NOT cached, so free it in this branch only.
	        del vae
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()

	    # Scheduler and math around the number of training steps.
	    overrode_max_train_steps = False
	    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
	    if args.max_train_steps is None:
	        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
	        overrode_max_train_steps = True

	    lr_scheduler = get_scheduler(
	        args.lr_scheduler,
	        optimizer=optimizer,
	        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
	        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
	    )

	    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
	        model, optimizer, train_dataloader, lr_scheduler
	    )

	    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
	    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
	    if overrode_max_train_steps:
	        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
	    # Afterwards we recalculate our number of training epochs
	    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

	    # We need to initialize the trackers we use, and also store our configuration.
	    # The trackers initializes automatically on the main process.
	    if accelerator.is_main_process:
	        accelerator.init_trackers("dreambooth", config=vars(args))

	    # Train!
	    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

	    logger.info("*** Running training ***")
	    logger.info(f" Num examples = {len(train_dataset)}")
	    logger.info(f" Num batches each epoch = {len(train_dataloader)}")
	    logger.info(f" Num Epochs = {args.num_train_epochs}")
	    logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
	    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
	    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
	    logger.info(f" Total optimization steps = {args.max_train_steps}")
	    # Only show the progress bar once on each machine.
	    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
	    progress_bar.set_description("Steps")
	    global_step = 0
	    loss_avg = AverageMeter()
	    for _ in range(args.num_train_epochs):
	        model.train()
	        for batch in train_dataloader:
	            with accelerator.accumulate(model):
	                # Convert images to latent space
	                with torch.no_grad():
	                    if cache_latents:
	                        # The identity collate yields a one-element list holding (latent_dist, input_ids).
	                        latent_dist, input_ids = batch[0]
	                    else:
	                        latent_dist = vae.encode(batch["pixel_values"]).latent_dist
	                        input_ids = batch["input_ids"]
	                    latents = latent_dist.sample() * 0.18215

	                # Sample noise that we'll add to the latents
	                noise = torch.randn(latents.shape).to(latents.device)
	                bsz = latents.shape[0]
	                # Sample a random timestep for each image
	                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
	                timesteps = timesteps.long()

	                # Add noise to the latents according to the noise magnitude at each timestep
	                # (this is the forward diffusion process)
	                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

	                # Encode the prompt (with gradients) and predict the noise residual.
	                noise_pred = model(noisy_latents, timesteps, input_ids)

	                if args.with_prior_preservation:
	                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
	                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
	                    noise, noise_prior = torch.chunk(noise, 2, dim=0)

	                    # Compute instance loss
	                    loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()

	                    # Compute prior loss
	                    prior_loss = F.mse_loss(noise_pred_prior, noise_prior, reduction="none").mean([1, 2, 3]).mean()

	                    # Add the prior loss to the instance loss.
	                    loss = loss + args.prior_loss_weight * prior_loss
	                else:
	                    loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()

	                accelerator.backward(loss)
	                if accelerator.sync_gradients:
	                    accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm)
	                optimizer.step()
	                lr_scheduler.step()
	                optimizer.zero_grad(set_to_none=True)
	                loss_avg.update(loss.detach_(), bsz)

	            # max_train_steps counts optimizer updates, so only advance the step counter (and log)
	            # on iterations where the accumulated gradients were actually synchronized.
	            if accelerator.sync_gradients:
	                if not global_step % args.log_interval:
	                    logs = {"loss": loss_avg.avg.item(), "lr": lr_scheduler.get_last_lr()[0]}
	                    progress_bar.set_postfix(**logs)
	                    accelerator.log(logs, step=global_step)

	                progress_bar.update(1)
	                global_step += 1

	            if global_step >= args.max_train_steps:
	                break

	    accelerator.wait_for_everyone()

	    # Create the pipeline using using the trained modules and save it.
	    if accelerator.is_main_process:
	        unwrapped = accelerator.unwrap_model(model)
	        pipeline = StableDiffusionPipeline.from_pretrained(
	            args.pretrained_model_name_or_path,
	            text_encoder=unwrapped.text_encoder,
	            unet=unwrapped.unet,
	            use_auth_token=args.use_auth_token,
	        )
	        pipeline.save_pretrained(args.output_dir)

	        if args.push_to_hub:
	            # Repository.push_to_hub takes no args/pipeline/repo positionals; passing them would
	            # collide with the commit_message keyword.
	            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)

	    accelerator.end_training()


	# Start training only when the file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()