# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json

# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
# ``model_config``. (type: Optional[str], default: null)
model_name: "Llama-3.2-1B"

# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
# ``model_name``. (type: Optional[Config], default: null)
model_config:
  padded_vocab_size: 32768
  vocab_size: 32768
  block_size: 131072
  n_layer: 16
  n_head: 32
  head_size: 64
  n_embd: 768
  n_query_groups: 8
  rotary_percentage: 1.0
  parallel_residual: false
  shared_attention_norm: false
  bias: false
  norm_class_name: "RMSNorm"
  mlp_class_name: "LLaMAMLP"
  intermediate_size: 2048
  rope_base: 500000
  rope_adjustments:
    factor: 32.0
    low_freq_factor: 1.0
    high_freq_factor: 4.0
    original_max_seq_len: 8192
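  # Note: the ``rope_adjustments`` above appear to mirror the ``rope_scaling`` block of the
  # referenced Llama-3.2-1B config.json (Llama-3-style RoPE extension from an original
  # 8192-token context window).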

# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
out_dir: "../out/pretrain/"

# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
# precision: bf16-mixed
precision: bf16-true

# Optional path to a checkpoint directory to initialize the model from.
# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
initial_checkpoint_dir:

# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
# (type: Union[bool, Literal["auto"], Path], default: False)
# resume: false
resume: "auto"

# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
data:
  class_path: LitData

  init_args:
    data_path: "../pretrain-data/"
    num_workers: 32
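  # Assumption: ``data_path`` points to a dataset that was already tokenized and optimized into
  # the LitData streaming format (e.g. via ``litdata.optimize``) using the tokenizer referenced
  # by ``tokenizer_dir`` below.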

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:
  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
  save_interval: 100

  # Number of iterations between logging calls (type: int, default: 1)
  log_interval: 1

  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
  global_batch_size: 512

  # Number of samples per data-parallel rank (type: int, default: 4)
  micro_batch_size: 2
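  # With ``global_batch_size: 512`` and ``micro_batch_size: 2``, litgpt would presumably
  # accumulate gradients over 512 / (2 * num_devices * num_nodes) micro-batches per optimizer
  # step (assuming 512 is divisible by the world size).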

  # Number of iterations with learning rate warmup active (type: int, default: 2000)
  lr_warmup_steps: 0

  # Number of epochs to train on (type: Optional[int], default: null)
  epochs:

  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
  # max_tokens: 26141568960 # 319072 * 8193 * 10
  max_tokens: 7842470688 # 319072 * 8193 * 3
  # max_tokens: 5228313792 # 319072 * 8193 * 2
  # max_tokens: 2614156896 # 319072 * 8193 * 1
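  # Assumption behind the arithmetic above: 319072 is the number of packed sequences in the
  # dataset and 8193 matches ``max_seq_length`` below, so the trailing multiplier is roughly
  # the number of passes over the training data (3 here).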

  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
  max_steps:

  # Limits the length of samples. Off by default (type: Optional[int], default: null)
  max_seq_length: 8193
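  # Assumption: 8193 (= 8192 + 1) matches the packed sample length used when preparing the
  # dataset, where the extra token provides the shifted next-token target.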

  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
  tie_embeddings: true

  # Maximum gradient norm for gradient clipping (type: Optional[float], default: 1.0)
  max_norm: 1.0

  # Minimum learning rate reached at the end of the decay schedule (type: float, default: 4e-05)
  min_lr: 4e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:
  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
  # interval: 100
  interval: 20

  # Number of tokens to generate (type: Optional[int], default: null)
  max_new_tokens:

  # Number of iterations (type: int, default: 100)
  max_iters: 100

  # Whether to evaluate on the validation set at the beginning of the training
  initial_validation: false

  # Whether to evaluate on the validation set at the end of the training
  final_validation: true

# Optimizer-related arguments
# optimizer:
#   # class_path: torch.optim.AdamW
#   class_path: grokadamw.GrokAdamW
#
#   init_args:
#     #   (type: float, default: 0.001)
#     lr: 4e-04
#
#     #   (type: float, default: 0.01)
#     weight_decay: 0.1
#
#     #   (type: tuple, default: (0.9,0.999))
#     betas:
#       - 0.9
#       - 0.95

optimizer:
  class_path: sophia_opt.SophiaG

  init_args:
    lr: 2e-4
    betas:
      - 0.965
      - 0.99
    rho: 0.01
    weight_decay: 1e-1
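  # Note: ``sophia_opt.SophiaG`` is assumed to be a locally installed implementation of the
  # Sophia optimizer; it is not bundled with litgpt or PyTorch, so the module must be
  # importable in the training environment.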

# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
devices: auto

# How many nodes to use. (type: int, default: 1)
num_nodes: 1

# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
# modules require this. (type: Optional[Path], default: null)
tokenizer_dir: "../"

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
logger_name: "wandb"

# The random seed to use for reproducibility. (type: int, default: 42)
seed: 23
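
# Typical launch (a sketch, assuming this file is saved as pretrain.yaml and litgpt plus the
# custom optimizer module are installed):
#
#   litgpt pretrain --config pretrain.yaml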