pretrain final

Browse files

Files changed (6) hide show

out/pretrain/final/config.json +39 -0
out/pretrain/final/hyperparameters.yaml +74 -0
out/pretrain/final/lit_model.pth +3 -0
out/pretrain/final/model_config.yaml +38 -0
out/pretrain/final/tokenizer.json +3 -0
out/pretrain/final/tokenizer_config.json +0 -0

out/pretrain/final/config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": [
+    2,
+    5,
+    6
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 256,
+  "initializer_range": 0.02,
+  "intermediate_size": 1024,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.0.dev0",
+  "use_cache": true,
+  "vocab_size": 262144
+}

out/pretrain/final/hyperparameters.yaml ADDED Viewed

	@@ -0,0 +1,74 @@

+model_name: Llama-3.1-8B
+model_config:
+  name: ''
+  hf_config: {}
+  scale_embeddings: false
+  block_size: 8192
+  vocab_size: 262144
+  padding_multiple: 512
+  padded_vocab_size: 262144
+  n_layer: 32
+  n_head: 32
+  head_size: 64
+  n_embd: 256
+  rotary_percentage: 1.0
+  parallel_residual: false
+  bias: false
+  lm_head_bias: false
+  n_query_groups: 8
+  shared_attention_norm: false
+  norm_class_name: RMSNorm
+  post_attention_norm: false
+  post_mlp_norm: false
+  norm_eps: 1.0e-05
+  mlp_class_name: LLaMAMLP
+  gelu_approximate: none
+  intermediate_size: 1024
+  rope_condense_ratio: 1
+  rope_base: 500000
+  rope_adjustments:
+    factor: 32.0
+    low_freq_factor: 1.0
+    high_freq_factor: 4.0
+    original_max_seq_len: 8192
+  n_expert: 0
+  n_expert_per_token: 0
+out_dir: ../out/pretrain
+precision: bf16-true
+resume: auto
+data:
+  class_path: litgpt.data.LitData
+  init_args:
+    data_path: ../pretrain-data/
+    seed: 42
+    num_workers: 32
+train:
+  save_interval: 500
+  log_interval: 1
+  global_batch_size: 512
+  micro_batch_size: 11
+  lr_warmup_steps: 2000
+  max_tokens: 9889496064
+  max_seq_length: 512
+  tie_embeddings: true
+  max_norm: 1.0
+  min_lr: 4.0e-05
+eval:
+  interval: 100
+  max_iters: 100
+  initial_validation: false
+  final_validation: true
+  evaluate_example: first
+optimizer:
+  class_path: grokadamw.GrokAdamW
+  init_args:
+    lr: 0.001
+    weight_decay: 0.01
+    betas:
+    - 0.9
+    - 0.999
+devices: auto
+num_nodes: 1
+tokenizer_dir: ..
+logger_name: wandb
+seed: 23

out/pretrain/final/lit_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58836dae5410566815233a6a7dda727e660176f8272bc838e52e37b39dd6d513
+size 1074196642

out/pretrain/final/model_config.yaml ADDED Viewed

	@@ -0,0 +1,38 @@

+attention_logit_softcapping: null
+attention_scores_scalar: null
+bias: false
+block_size: 8192
+final_logit_softcapping: null
+gelu_approximate: none
+head_size: 64
+hf_config: {}
+intermediate_size: 1024
+lm_head_bias: false
+mlp_class_name: LLaMAMLP
+n_embd: 256
+n_expert: 0
+n_expert_per_token: 0
+n_head: 32
+n_layer: 32
+n_query_groups: 8
+name: ''
+norm_class_name: RMSNorm
+norm_eps: 1.0e-05
+padded_vocab_size: 262144
+padding_multiple: 512
+parallel_residual: false
+post_attention_norm: false
+post_mlp_norm: false
+rope_adjustments:
+  factor: 32.0
+  high_freq_factor: 4.0
+  low_freq_factor: 1.0
+  original_max_seq_len: 8192
+rope_base: 500000
+rope_condense_ratio: 1
+rotary_percentage: 1.0
+scale_embeddings: false
+shared_attention_norm: false
+sliding_window_layer_placing: null
+sliding_window_size: null
+vocab_size: 262144

out/pretrain/final/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87f950ea35683d2d80e36a35aa88b4c80fe5bd6d53aaab0a3ff3380ed0bc823e
+size 19928939

out/pretrain/final/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff