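# torchtune full-finetune config for llama3-5b. Presumably launched with
# torchtune's single-device full-finetune recipe, e.g.:
#   tune run full_finetune_single_device --config <path to this file>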
# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: ../tokenizer.model
# Dataset and Sampler
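# NOTE: custom_datasets.orpo_dpo_mix_40k_dataset is a local builder, not part of
# torchtune; going by its name, it presumably loads the orpo-dpo-mix-40k
# preference mix and tokenizes it to max_seq_len.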
dataset:
  _component_: custom_datasets.orpo_dpo_mix_40k_dataset
  max_seq_len: 8192  # Llama 3's native context length
#dataset:
#  _component_: torchtune.datasets.stack_exchanged_paired_dataset
seed: 42
shuffle: True
batch_size: 2
# Model Arguments
model:
  _component_: torchtune.models.llama3.llama3
  vocab_size: 128256
  num_layers: 20
  num_heads: 32
  num_kv_heads: 8
  embed_dim: 4096
  max_seq_len: 8192
  intermediate_dim: 14336
  attn_dropout: 0.0
  norm_eps: 1e-5
  rope_base: 500000.0
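# Same dimensions as Llama-3-8B except num_layers (20 vs. 32), which works out
# to roughly 5.4B parameters: ~0.53B input embeddings + 20 x ~0.22B per layer
# + ~0.53B output head.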
checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: ../
  checkpoint_files: [
    model-00001-of-00003.safetensors,
    model-00002-of-00003.safetensors,
    model-00003-of-00003.safetensors
  ]
  recipe_checkpoint: null
  output_dir: ./llama3-5b/
  model_type: LLAMA3
resume_from_checkpoint: False
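# FullModelHFCheckpointer reads and writes Hugging Face-format safetensors
# shards. To resume an interrupted run, set resume_from_checkpoint: True and
# point recipe_checkpoint at the recipe state file saved alongside the weights
# in output_dir.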
# Fine-tuning arguments
epochs: 5
optimizer:
  _component_: torch.optim.AdamW  # alternative: bitsandbytes.optim.PagedAdamW8bit (8-bit paged states, lower memory)
  lr: 3e-6
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 1500
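# Linear warmup from 0 to lr over the first 1500 steps, then cosine decay to 0
# over the remaining training steps.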
#loss:
#  _component_: torchtune.modules.loss.DPOLoss
#  beta: 0.1
#  label_smoothing: 0
#  loss_type: sigmoid
loss:
  _component_: torch.nn.CrossEntropyLoss
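# Plain next-token cross-entropy: the preference pairs are presumably trained
# on as ordinary supervised text, with the DPO preference loss above left
# disabled.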
max_steps_per_epoch: null
gradient_accumulation_steps: 2
optimizer_in_bwd: False # False if grad accum > 1
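# Effective batch size = batch_size (2) x gradient_accumulation_steps (2) = 4
# sequences per optimizer step. optimizer_in_bwd fuses the optimizer step into
# the backward pass to free gradient memory early, which is incompatible with
# gradient accumulation, hence False here.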
compile: False
# Training environment
device: cuda
# Memory management
enable_activation_checkpointing: True
# Reduced precision
dtype: bf16  # alternative: fp32
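# bf16 keeps fp32's dynamic range at half the memory cost; it needs hardware
# support (e.g., NVIDIA Ampere or newer).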
# Logging
# enable logging to the built-in WandBLogger
metric_logger:
  _component_: torchtune.utils.metric_logging.WandBLogger
  # the W&B project to log to
  project: llama3-5b
output_dir: ./logs/
log_every_n_steps: 1
log_peak_memory_stats: False