codebyzeb committed in c2792b4 (verified · parent: c3a53ba)

Copying trained fw57M-tied model
llm/fw57M-tied/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
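These are the stock Hugging Face Hub LFS rules: every matching file type (archives, serialized arrays, model weights) is routed through Git LFS instead of being stored in the Git tree, which is why `model.safetensors` and `tb_logs.parquet` below appear as small pointer files rather than raw binaries.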
llm/fw57M-tied/README.md ADDED
@@ -0,0 +1,90 @@
+ ---
+ {}
+ ---
+ ## Experiment Configuration
+ ```yaml
+ callbacks:
+   grad_accum:
+     _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
+     scheduling:
+       0: 4
+   grad_norm:
+     _target_: src.callbacks.grad_norm.GradNorm
+     check_clipping: false
+     group_separator: /
+     histogram_freq: null
+     log_weight_distribution: false
+     norm_type: 2
+     only_total: true
+   lr_monitor:
+     _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
+   model_checkpoint:
+     _target_: src.callbacks.model_checkpoint.ModelCheckpoint
+     dirpath: .checkpoints
+     enable_version_counter: false
+     every_n_train_steps: 2000
+     filename: '{step}'
+     save_initial_checkpoint: true
+     save_last: link
+     save_top_k: -1
+     verbose: true
+   speed_monitor:
+     _target_: src.callbacks.speed_monitor.SpeedMonitor
+ data:
+   batch_size: 32
+   drop_last: false
+   eval_batch_size: 128
+   multiprocessing_context: null
+   num_workers: 12
+   persistent_workers: false
+   pin_memory: true
+   prefetch_factor: 2
+   shuffle: true
+ dataset: finewebedu-20B
+ loggers:
+   tensorboard:
+     _target_: src.trainer.TensorBoardLogger
+     name: ''
+     save_dir: ./
+     version: null
+ model: fw57M-tied
+ optim:
+   lr: 0.0006
+   num_warmup_steps: 2000
+   optim_kwargs:
+     betas:
+     - 0.9
+     - 0.95
+     eps: 1.0e-08
+     fused: true
+   optim_name: adamw
+   scheduler_kwargs:
+     min_lr_ratio: 0.01
+     num_decay_steps: 4000
+     num_stable_steps: 44000
+   scheduler_name: warmup_stable_decay
+   weight_decay: 0.01
+ out_parent_folder: model_train
+ pwd: /home/zg258/projects/infotokenization
+ resume_from_checkpoint: .checkpoints/last.ckpt
+ run_folder: .
+ save_initial_checkpoint: true
+ seed: 42
+ tok_name: bytelevel
+ torch_compile: true
+ train_data_path: /home/zg258/projects/infotokenization/data/finewebedu-20B/bytelevel-subset/train
+ trainer:
+   accelerator: gpu
+   deterministic: false
+   devices: 1
+   enable_progress_bar: true
+   fast_dev_run: false
+   gradient_clip_algorithm: norm
+   gradient_clip_val: 1.0
+   limit_val_batches: 500
+   log_every_n_steps: 1
+   max_steps: 50000
+   precision: bf16-true
+   val_check_interval: 2000
+ val_data_path: /home/zg258/projects/infotokenization/data/finewebedu-20B/bytelevel-subset/validation
+ ```
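The `optim` block configures a warmup-stable-decay schedule whose three phases sum exactly to `trainer.max_steps` (2000 + 44000 + 4000 = 50000), so the decay finishes at the end of training. A minimal sketch of the schedule those numbers imply, assuming linear warmup and linear decay; the repo's actual `warmup_stable_decay` implementation may shape the decay differently:

```python
# Sketch of the configured warmup-stable-decay schedule (assumed linear
# phases). Values mirror the YAML above; this is illustrative, not the
# repo's own scheduler code.
def wsd_lr(step: int,
           peak_lr: float = 6e-4,        # optim.lr
           warmup: int = 2_000,          # optim.num_warmup_steps
           stable: int = 44_000,         # scheduler_kwargs.num_stable_steps
           decay: int = 4_000,           # scheduler_kwargs.num_decay_steps
           min_lr_ratio: float = 0.01) -> float:
    if step < warmup:                    # phase 1: linear warmup to peak
        return peak_lr * step / warmup
    if step < warmup + stable:           # phase 2: constant at peak
        return peak_lr
    if step < warmup + stable + decay:   # phase 3: linear decay to the floor
        frac = (step - warmup - stable) / decay
        return peak_lr * (1 - frac * (1 - min_lr_ratio))
    return peak_lr * min_lr_ratio        # floor: 1% of peak
```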
llm/fw57M-tied/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": null,
+   "eos_token_id": 1,
+   "head_dim": 32,
+   "hidden_act": "silu",
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "max_position_embeddings": 2048,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 24,
+   "num_hidden_layers": 6,
+   "num_key_value_heads": 24,
+   "pad_token_id": 0,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.51.1",
+   "use_cache": true,
+   "vocab_size": 258
+ }
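The config describes a small Llama-style decoder (6 layers, hidden size 768, 24 heads, byte-level vocabulary of 258) with tied input/output embeddings, so it should load with stock transformers. A sketch, assuming the repo folder has been fetched with `git lfs pull`; the local path is illustrative:

```python
# Illustrative only: load the checkpoint with transformers. The path is
# a placeholder for wherever llm/fw57M-tied lives on disk.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "llm/fw57M-tied",            # local folder with config.json + weights
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16"
)
print(sum(p.numel() for p in model.parameters()))  # ~57M with tied embeddings
```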
llm/fw57M-tied/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.51.1"
+ }
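A hypothetical generation call against the defaults above. Note the commit ships no tokenizer file; with `tok_name: bytelevel` and `vocab_size: 258`, inputs are presumably raw UTF-8 bytes offset past a few special tokens, so the id mapping below is a guess:

```python
# Hypothetical: byte-level encoding with an assumed offset of 2 past the
# special tokens (pad=0, eos=1). Verify against the actual tokenizer
# before relying on this mapping.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("llm/fw57M-tied", torch_dtype=torch.bfloat16)
prompt_ids = torch.tensor([[2 + b for b in "Hello".encode("utf-8")]])
out = model.generate(prompt_ids, max_new_tokens=32)  # stops early on eos_token_id=1
```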
llm/fw57M-tied/hparams.yaml ADDED
@@ -0,0 +1,84 @@
+ loggers:
+   tensorboard:
+     _target_: src.trainer.TensorBoardLogger
+     save_dir: ./
+     name: ''
+     version: null
+ callbacks:
+   lr_monitor:
+     _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
+   grad_norm:
+     _target_: src.callbacks.grad_norm.GradNorm
+     norm_type: 2
+     group_separator: /
+     histogram_freq: null
+     check_clipping: false
+     log_weight_distribution: false
+     only_total: true
+   speed_monitor:
+     _target_: src.callbacks.speed_monitor.SpeedMonitor
+   grad_accum:
+     _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
+     scheduling:
+       0: 4
+   model_checkpoint:
+     _target_: src.callbacks.model_checkpoint.ModelCheckpoint
+     dirpath: .checkpoints
+     filename: '{step}'
+     enable_version_counter: false
+     every_n_train_steps: 2000
+     save_top_k: -1
+     save_last: link
+     verbose: true
+     save_initial_checkpoint: true
+ out_parent_folder: model_train
+ tok_name: bytelevel
+ run_folder: .
+ dataset: finewebedu-20B
+ pwd: /home/zg258/projects/infotokenization
+ train_data_path: /home/zg258/projects/infotokenization/data/finewebedu-20B/bytelevel-subset/train
+ val_data_path: /home/zg258/projects/infotokenization/data/finewebedu-20B/bytelevel-subset/validation
+ model: fw57M-tied
+ resume_from_checkpoint: .checkpoints/last.ckpt
+ save_initial_checkpoint: true
+ seed: 42
+ torch_compile: true
+ data:
+   batch_size: 32
+   eval_batch_size: 128
+   shuffle: true
+   drop_last: false
+   num_workers: 12
+   pin_memory: true
+   persistent_workers: false
+   prefetch_factor: 2
+   multiprocessing_context: null
+ optim:
+   optim_name: adamw
+   lr: 0.0006
+   weight_decay: 0.01
+   optim_kwargs:
+     fused: true
+     eps: 1.0e-08
+     betas:
+     - 0.9
+     - 0.95
+   scheduler_name: warmup_stable_decay
+   num_warmup_steps: 2000
+   scheduler_kwargs:
+     num_stable_steps: 44000
+     num_decay_steps: 4000
+     min_lr_ratio: 0.01
+ trainer:
+   accelerator: gpu
+   devices: 1
+   precision: bf16-true
+   deterministic: false
+   log_every_n_steps: 1
+   enable_progress_bar: true
+   fast_dev_run: false
+   gradient_clip_val: 1.0
+   gradient_clip_algorithm: norm
+   val_check_interval: 2000
+   max_steps: 50000
+   limit_val_batches: 500
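One detail worth spelling out: with gradient accumulation enabled from step 0, the effective optimizer batch is four times the per-device batch:

```python
# Effective batch size implied by the hparams above (illustrative arithmetic).
per_device_batch = 32    # data.batch_size
grad_accum_steps = 4     # callbacks.grad_accum.scheduling: {0: 4}
effective_batch = per_device_batch * grad_accum_steps
print(effective_batch)   # 128 sequences per optimizer step
```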
llm/fw57M-tied/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1c5989b2eb341dde8e68fbf142f4f53a66c40876344c9e2571331e98edb1bc3
+ size 113668624
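This is a Git LFS pointer, not the weights themselves; the real file arrives with `git lfs pull`. The size is consistent with the model name: 113,668,624 bytes at 2 bytes per bfloat16 value is about 56.8M parameters, i.e. ~57M as in `fw57M-tied`. A sketch for inspecting the tensors once the file is materialized:

```python
# Sketch: list tensor names and shapes without loading everything into
# memory. Assumes the pointer has been replaced by the real file.
from safetensors import safe_open

with safe_open("llm/fw57M-tied/model.safetensors", framework="pt") as f:
    for name in f.keys():
        print(name, f.get_slice(name).get_shape())
```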
llm/fw57M-tied/tb_logs.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:80adcbac1f2bb5ab25365d18ff532c89602efb8bdd02e03f6f5b1747f6ff6eee
+ size 2853290
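The exported TensorBoard logs are a plain Parquet table, so they can be inspected directly, e.g. with pandas. The column layout is whatever the export script produced, so check it before relying on specific names:

```python
# Sketch: load the training logs after `git lfs pull`.
import pandas as pd

df = pd.read_parquet("llm/fw57M-tied/tb_logs.parquet")
print(df.columns.tolist())  # inspect available metrics first
print(df.head())
```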