# Paths
model = '/workspace/model'
output_dir = '/workspace/out'
# LoRA configuration
# Can use full_fine_tune = true and no quantization to train the whole model instead of a LoRA.
#full_fine_tune = true
lora_rank = 16
lora_alpha = 32
lora_dropout = 0.05
# Train only specific modules. This is passed to the parameter of the same name in the LoraConfig.
# If not set, adapt all linear modules.
# Note, this ALSO affects full fine tuning. In that case, if this is set, only weights containing one
# of these keys as a substring will have requires_grad set. If not set, everything is trained.
#target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
# You can also specify which layers to adapt with LoRA if you want.
#layers_to_transform = '16:31'
# For Mixtral, set the load balancing loss coefficient.
# load_balancing_loss_coef = 0.02
# Optimization configuration
epochs = 2
lr_scheduler = 'cosine' # can also be 'constant'
warmup_steps = 50
# Might be useful if you're resuming from a checkpoint and want to force the LR to a specific value.
#force_constant_lr = 5e-5
# hard clamp the magnitude of the LoRA weights
#scale_weight_norms = 1.0
# dynamic batch size, targeting this many tokens per batch, per device
# if set, completely ignores the batch size in the deepspeed JSON config file
# can be thought of as a replacement for sample packing
batch_size_tokens = 10000
# Performance settings
pipeline_stages = 8 # number of pipeline parallel stages, must evenly divide the number of GPUs you launch the script with
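# (Assumption, not stated in this config: with DeepSpeed-style pipeline parallelism, launching with more
# GPUs than pipeline_stages presumably adds data parallelism, e.g. 16 GPUs with pipeline_stages = 8 would
# run 2 data-parallel copies of the 8-stage pipeline.)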
logging_steps = 10 # how often to log to TensorBoard
eval_steps = 500
save_steps = 500
checkpoint_every_n_minutes = 60
eval_before_first_step = false # do an eval before any training happens
# dtype to load the underlying model weights in
model_weight_dtype = 'bfloat16'
# dtype for the LoRA weights
lora_weight_dtype = 'bfloat16'
# The saved weights can be a different dtype. You don't need to set this. Could be useful for
# training in float32 but saving in float16.
#save_dtype = 'bfloat16'
# Keep this number of stepXXXX (model saves) and global_stepXXX (checkpoint saves) and delete the rest
# (this only applies to the current training session; resumed training sessions will not touch
# old saves)
keep_states = 5
# sort examples by length before dividing them into batches
# this makes all examples in a batch approximately the same length, to minimize padding
# the batches are still shuffled after that
# you should probably always have this set to true
group_by_length = true
# This can also be 'unsloth' to offload hidden states to CPU, saving potentially a lot of VRAM
# for a minor performance hit.
# Example: 4x4090, PCIE 3.0 16x, pipeline_stages=4, training QLoRA on Llama 3 70B with 4096 sequence length.
# true: 75s step time, 19.7G peak per-GPU VRAM usage.
# 'unsloth': 78s step time, 16.2G peak per-GPU VRAM usage.
activation_checkpointing = 'unsloth'
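# For reference, the plain (non-offloading) variant benchmarked above would be:
#activation_checkpointing = true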
# Keep MLP weights on system RAM until they are needed. Can save a ton of VRAM with a
# moderate hit to performance. If using an MoE model, this can also be an integer, in
# which case only that many experts are offloaded (tradeoff between VRAM and speed).
offload_mlp_to_cpu = 2
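# Hypothetical alternative, not in the original config: for a dense (non-MoE) model, or to offload all
# of the MLP weights, this can presumably be set to a boolean instead of an expert count.
#offload_mlp_to_cpu = true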
# Resume a prior run
# if true, we attempt to resume training from the most recent directory inside output_dir (the directory names are timestamps)
# so, to resume, just run the exact same command but set this to true first
resume_from_checkpoint = false
# Loading the optimizer states seems to cause some kind of unavoidable VRAM memory leak.
# It's very small, only about 0.2 GB in cases I've seen. But if you are very close to the
# limit, it can cause resuming from checkpoint to OOM. As a last resort, you can uncomment
# this to not load the optimizer states and hopefully the resumption won't OOM.
#load_optimizer_states = false
# Dataset configuration
# How to combine multiple datasets if you have more than one.
# Can be 'concatenate' or 'interleave'. Will be 'concatenate' if not set.
dataset_combination_mode = 'interleave'
# When to stop interleaving datasets when using mode 'interleave'. Either 'first_exhausted' or 'all_exhausted'.
# Default if not set: 'first_exhausted'
dataset_interleave_stopping_strategy = 'all_exhausted'
# Can be set lower than the training value, so we don't drop as many examples when trying to make equal-sized batches.
# Default if not set: same as the training gradient accumulation steps.
eval_gradient_accumulation_steps = 1
# bitsandbytes 4-bit quantization. The parameters here become arguments to the Transformers BitsAndBytesConfig.
#[quantization.bnb]
#load_in_4bit = true
#bnb_4bit_use_double_quant = false
#bnb_4bit_compute_dtype = 'bfloat16'
# HQQ quantization. The parameters here become arguments to CustomHQQConfig.
# [quantization.hqq]
# nbits = 4
# group_size = 64
# compute_dtype = 'bfloat16'
# (Optional) You can override the quant params for certain modules. This does substring matching, e.g. if 'gate_proj'
# is a substring of the full module name, anything specified here overwrites the defaults in [quantization.hqq].
# [quantization.hqq.dynamic_config]
# gate_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
# up_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
# down_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
[optimizer]
# options: adamw_kahan, AdamW, AdamW8bit
type = 'adamw_kahan'
lr = 5e-5
beta1 = 0.9
beta2 = 0.99
weight_decay = 0.1
[[datasets]]
# Arbitrary name, used only for separately logging eval metrics. Will be dataset0, dataset1, etc. if not set.
name = 'c2'
dataset_type = 'axolotl'
dataset_path = '../axolotl/sorc.yml'
sequence_len = 8192
eval_size = 0.01
# Relative sampling weight, when using combination mode 'interleave'. Will be 1 if not set.
sample_weight = 1
#[[datasets]]
#name = 'capybara'
#dataset_type = 'axolotl'
#dataset_path = 'examples/capybara.yml'
#sequence_len = 2048
#eval_size = 0.02
#sample_weight = 1.5
# In addition to using eval_size, which splits off some of the dataset, we can have completely separate datasets for eval.
# This can be useful if you're training on raw text data, so that the eval set remains completely fixed even if
# you change the training sequence_len, etc.
# This is just an example; typically you wouldn't have an eval dataset overlap a training dataset.
# [[eval_datasets]]
# name = 'capybara'
# dataset_type = 'axolotl'
# dataset_path = 'examples/capybara.yml'
# sequence_len = 2048