import argparse


def _get_model_args(parser: argparse.ArgumentParser) -> None:
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default=None,
        required=False,
        help="Revision of pretrained model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--variant",
        type=str,
        default=None,
        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, e.g. fp16.",
    )
    parser.add_argument(
        "--cache_dir",
        type=str,
        default=None,
        help="The directory where the downloaded models and datasets will be stored.",
    )


def _get_dataset_args(parser: argparse.ArgumentParser) -> None:
    parser.add_argument(
        "--data_root",
        type=str,
        default=None,
        help="A folder containing the training data.",
    )
    parser.add_argument(
        "--dataset_file",
        type=str,
        default=None,
        help="Path to a CSV file if loading prompts/video paths using this format.",
    )
    parser.add_argument(
        "--video_column",
        type=str,
        default="video",
        help="The column of the dataset containing videos. Or, the name of the file in the `--data_root` folder containing the line-separated paths to video data.",
    )
    parser.add_argument(
        "--caption_column",
        type=str,
        default="text",
        help="The column of the dataset containing the instance prompt for each video. Or, the name of the file in the `--data_root` folder containing the line-separated instance prompts.",
    )
    parser.add_argument(
        "--id_token",
        type=str,
        default=None,
        help="Identifier token appended to the start of each prompt if provided.",
    )
    parser.add_argument(
        "--height_buckets",
        nargs="+",
        type=int,
        default=[256, 320, 384, 480, 512, 576, 720, 768, 960, 1024, 1280, 1536],
    )
    parser.add_argument(
        "--width_buckets",
        nargs="+",
        type=int,
        default=[256, 320, 384, 480, 512, 576, 720, 768, 960, 1024, 1280, 1536],
    )
    parser.add_argument(
        "--frame_buckets",
        nargs="+",
        type=int,
        default=[49],
        help="CogVideoX1.5 needs to guarantee that ((num_frames - 1) // vae_scale_factor_temporal + 1) % patch_size_t == 0, e.g. 53.",
    )
    parser.add_argument(
        "--load_tensors",
        action="store_true",
        help="Whether to use a pre-encoded tensor dataset of latents and prompt embeddings instead of videos and text prompts. The expected format is that saved by running the `prepare_dataset.py` script.",
    )
    parser.add_argument(
        "--random_flip",
        type=float,
        default=None,
        help="If random horizontal flip augmentation is to be used, this should be the flip probability.",
    )
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=0,
        help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
    )
    parser.add_argument(
        "--pin_memory",
        action="store_true",
        help="Whether or not to use the pinned memory setting in the PyTorch dataloader.",
    )
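
# A minimal, hypothetical sketch (not part of the original script) of how the CogVideoX1.5
# frame-count constraint from the `--frame_buckets` help text above could be validated up front.
# The defaults `vae_scale_factor_temporal=4` and `patch_size_t=2` are assumptions; the real
# values come from the loaded VAE and transformer configs at runtime.
def _example_check_frame_bucket(num_frames: int, vae_scale_factor_temporal: int = 4, patch_size_t: int = 2) -> bool:
    """Return True if `num_frames` satisfies the CogVideoX1.5 temporal patching constraint."""
    latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
    return latent_frames % patch_size_t == 0


# Under the assumed defaults: _example_check_frame_bucket(53) -> True,
# while _example_check_frame_bucket(49) -> False.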

def _get_validation_args(parser: argparse.ArgumentParser) -> None:
    parser.add_argument(
        "--validation_prompt",
        type=str,
        default=None,
        help="One or more prompts used during validation to verify that the model is learning. Multiple validation prompts should be separated by the '--validation_prompt_separator' string.",
    )
    parser.add_argument(
        "--validation_images",
        type=str,
        default=None,
        help="One or more image paths/URLs used during validation to verify that the model is learning. Multiple validation paths should be separated by the '--validation_prompt_separator' string. These should correspond to the order of the validation prompts.",
    )
    parser.add_argument(
        "--validation_prompt_separator",
        type=str,
        default=":::",
        help="String that separates multiple validation prompts.",
    )
    parser.add_argument(
        "--num_validation_videos",
        type=int,
        default=1,
        help="Number of videos that should be generated during validation per `validation_prompt`.",
    )
    parser.add_argument(
        "--validation_epochs",
        type=int,
        default=None,
        help="Run validation every X training epochs. Validation consists of running the validation prompt `args.num_validation_videos` times.",
    )
    parser.add_argument(
        "--validation_steps",
        type=int,
        default=None,
        help="Run validation every X training steps. Validation consists of running the validation prompt `args.num_validation_videos` times.",
    )
    parser.add_argument(
        "--guidance_scale",
        type=float,
        default=6,
        help="The guidance scale to use while sampling validation videos.",
    )
    parser.add_argument(
        "--use_dynamic_cfg",
        action="store_true",
        default=False,
        help="Whether or not to use the default cosine dynamic guidance schedule when sampling validation videos.",
    )
    parser.add_argument(
        "--enable_model_cpu_offload",
        action="store_true",
        default=False,
        help="Whether or not to enable model-wise CPU offloading when performing validation/testing to save memory.",
    )
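
# A minimal sketch (hypothetical helper, not defined in the original script) of how the
# multi-prompt validation flags above are expected to be consumed: each flag carries several
# values joined by `--validation_prompt_separator`, split in matching order.
def _example_split_validation_inputs(args):
    prompts = args.validation_prompt.split(args.validation_prompt_separator) if args.validation_prompt else []
    images = args.validation_images.split(args.validation_prompt_separator) if args.validation_images else []
    return prompts, images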

def _get_training_args(parser: argparse.ArgumentParser) -> None:
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument("--rank", type=int, default=64, help="The rank for LoRA matrices.")
    parser.add_argument(
        "--lora_alpha",
        type=int,
        default=64,
        help="The lora_alpha to compute the scaling factor (lora_alpha / rank) for LoRA matrices.",
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 and an Nvidia Ampere GPU. "
            "Defaults to the value of the accelerate config of the current system or the flag passed with the `accelerate.launch` command. Use this "
            "argument to override the accelerate config."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="cogvideox-sft",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--height",
        type=int,
        default=480,
        help="All input videos are resized to this height.",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=720,
        help="All input videos are resized to this width.",
    )
    parser.add_argument(
        "--video_reshape_mode",
        type=str,
        default=None,
        help="All input videos are reshaped to this mode. Choose between ['center', 'random', 'none'].",
    )
    parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.")
    parser.add_argument(
        "--max_num_frames",
        type=int,
        default=49,
        help="All input videos will be truncated to this many frames.",
    )
    parser.add_argument(
        "--skip_frames_start",
        type=int,
        default=0,
        help="Number of frames to skip from the beginning of each input video. Useful if training data contains intro sequences.",
    )
    parser.add_argument(
        "--skip_frames_end",
        type=int,
        default=0,
        help="Number of frames to skip from the end of each input video. Useful if training data contains outro sequences.",
    )
    parser.add_argument(
        "--train_batch_size",
        type=int,
        default=4,
        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides `--num_train_epochs`.",
    )
    parser.add_argument(
        "--checkpointing_steps",
        type=int,
        default=500,
        help=(
            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
            " training using `--resume_from_checkpoint`."
        ),
    )
    parser.add_argument(
        "--checkpoints_total_limit",
        type=int,
        default=None,
        help="Max number of checkpoints to store.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help=(
            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
        ),
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of a slower backward pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=1e-4,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"].'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps",
        type=int,
        default=500,
        help="Number of steps for the warmup in the lr scheduler.",
    )
    parser.add_argument(
        "--lr_num_cycles",
        type=int,
        default=1,
        help="Number of hard resets of the lr in the cosine_with_restarts scheduler.",
    )
    parser.add_argument(
        "--lr_power",
        type=float,
        default=1.0,
        help="Power factor of the polynomial scheduler.",
    )
    parser.add_argument(
        "--enable_slicing",
        action="store_true",
        default=False,
        help="Whether or not to use VAE slicing for saving memory.",
    )
    parser.add_argument(
        "--enable_tiling",
        action="store_true",
        default=False,
        help="Whether or not to use VAE tiling for saving memory.",
    )
    parser.add_argument(
        "--noised_image_dropout",
        type=float,
        default=0.05,
        help="Image condition dropout probability when finetuning image-to-video.",
    )
    parser.add_argument(
        "--ignore_learned_positional_embeddings",
        action="store_true",
        default=False,
        help=(
            "Whether to ignore the learned positional embeddings when training CogVideoX Image-to-Video. This setting "
            "should be used when performing multi-resolution training, because CogVideoX-I2V does not support it "
            "otherwise. Please read the comments in https://github.com/a-r-r-o-w/cogvideox-factory/issues/26 to understand why."
        ),
    )
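
# A minimal sketch (an assumption, not the original training loop) of how `--scale_lr` is
# conventionally applied in diffusers-style scripts: the base learning rate is multiplied by
# the effective batch size. `num_processes` would come from `accelerate.Accelerator` at runtime.
def _example_scale_lr(args, num_processes: int) -> float:
    if args.scale_lr:
        return args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes
    return args.learning_rate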

def _get_optimizer_args(parser: argparse.ArgumentParser) -> None:
    parser.add_argument(
        "--optimizer",
        type=lambda s: s.lower(),
        default="adam",
        choices=["adam", "adamw", "prodigy", "came"],
        help="The optimizer type to use.",
    )
    parser.add_argument(
        "--use_8bit",
        action="store_true",
        help="Whether or not to use 8-bit optimizers from `bitsandbytes` or `torchao`.",
    )
    parser.add_argument(
        "--use_4bit",
        action="store_true",
        help="Whether or not to use 4-bit optimizers from `torchao`.",
    )
    parser.add_argument(
        "--use_torchao",
        action="store_true",
        help="Whether or not to use the `torchao` backend for optimizers.",
    )
    parser.add_argument(
        "--beta1",
        type=float,
        default=0.9,
        help="The beta1 parameter for the Adam and Prodigy optimizers.",
    )
    parser.add_argument(
        "--beta2",
        type=float,
        default=0.95,
        help="The beta2 parameter for the Adam and Prodigy optimizers.",
    )
    parser.add_argument(
        "--beta3",
        type=float,
        default=None,
        help="Coefficient for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the square root of beta2.",
    )
    parser.add_argument(
        "--prodigy_decouple",
        action="store_true",
        help="Use AdamW-style decoupled weight decay.",
    )
    parser.add_argument(
        "--weight_decay",
        type=float,
        default=1e-04,
        help="Weight decay to use for the optimizer.",
    )
    parser.add_argument(
        "--epsilon",
        type=float,
        default=1e-8,
        help="Epsilon value for the Adam and Prodigy optimizers.",
    )
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--prodigy_use_bias_correction",
        action="store_true",
        help="Turn on Adam's bias correction.",
    )
    parser.add_argument(
        "--prodigy_safeguard_warmup",
        action="store_true",
        help="Remove lr from the denominator of the D estimate to avoid issues during the warm-up stage.",
    )
    parser.add_argument(
        "--use_cpu_offload_optimizer",
        action="store_true",
        help="Whether or not to use the CPUOffloadOptimizer from TorchAO to perform the optimization step and maintain parameters on the CPU.",
    )
    parser.add_argument(
        "--offload_gradients",
        action="store_true",
        help="Whether or not to offload the gradients to CPU when using the CPUOffloadOptimizer from TorchAO.",
    )


def _get_configuration_args(parser: argparse.ArgumentParser) -> None:
    parser.add_argument("--tracker_name", type=str, default=None, help="Project tracker name.")
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether or not to push the model to the Hub.",
    )
    parser.add_argument(
        "--hub_token",
        type=str,
        default=None,
        help="The token to use to push to the Model Hub.",
    )
    parser.add_argument(
        "--hub_model_id",
        type=str,
        default=None,
        help="The name of the repository to keep in sync with the local `output_dir`.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help="Directory where logs are stored.",
    )
    parser.add_argument(
        "--allow_tf32",
        action="store_true",
        help=(
            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
        ),
    )
    parser.add_argument(
        "--nccl_timeout",
        type=int,
        default=600,
        help="Maximum timeout duration, in seconds, after which allgather, or related, operations fail in multi-GPU/multi-node training settings.",
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default=None,
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
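
# A hypothetical sketch (not part of this file) of how `--allow_tf32` and `--nccl_timeout` are
# typically consumed: TF32 is toggled via the torch backends flag, and the NCCL timeout is
# handed to `accelerate.Accelerator` through `InitProcessGroupKwargs`.
def _example_apply_runtime_config(args):
    from datetime import timedelta

    import torch
    from accelerate import Accelerator
    from accelerate.utils import InitProcessGroupKwargs

    if args.allow_tf32:
        torch.backends.cuda.matmul.allow_tf32 = True  # speeds up matmuls on Ampere+ GPUs
    init_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=args.nccl_timeout))
    return Accelerator(kwargs_handlers=[init_kwargs])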

def get_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script for CogVideoX.")
    _get_model_args(parser)
    _get_dataset_args(parser)
    _get_training_args(parser)
    _get_validation_args(parser)
    _get_optimizer_args(parser)
    _get_configuration_args(parser)
    return parser.parse_args()
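
# A small usage sketch: running this module directly parses and echoes the arguments, which is
# a convenient smoke test for the parser definitions above. Note that
# `--pretrained_model_name_or_path` is required, so it must be supplied on the command line.
if __name__ == "__main__":
    args = get_args()
    print(args)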