rerun softmax0/1 15M logging kurtosis over trainstep
Browse files- out/softmax0-15m-2023_08_26_00_08_49/ckpt.pt +3 -0
- out/softmax0-15m-2023_08_26_00_08_49/config.json +1 -0
- out/softmax0-15m-2023_08_26_00_08_49/model.bin +3 -0
- out/softmax1-15m-2023_08_25_11_40_38/config.json +1 -0
- out/softmax1-15m-2023_08_25_11_42_26/config.json +1 -0
- out/softmax1-15m-2023_08_25_11_45_19/config.json +1 -0
- out/softmax1-15m-2023_08_25_11_47_04/ckpt.pt +3 -0
- out/softmax1-15m-2023_08_25_11_47_04/config.json +1 -0
- out/softmax1-15m-2023_08_25_11_47_04/model.bin +3 -0
out/softmax0-15m-2023_08_26_00_08_49/ckpt.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:21cb2dc7f32f99593c5fc5ec727f99278c86c06615d7b2718a2d91ae7c538bc6
|
3 |
+
size 182361878
|
out/softmax0-15m-2023_08_26_00_08_49/config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"out_dir": "out/softmax0-15m-2023_08_26_00_08_49", "eval_interval": 1000, "log_interval": 1, "eval_iters": 50, "eval_only": false, "always_save_checkpoint": true, "init_from": "scratch", "wandb_log": true, "wandb_project": "softmax1-tinystories", "wandb_run_name": "softmax0-15m-2023_08_26_00_08_49", "batch_size": 72, "max_seq_len": 256, "vocab_source": "llama2", "vocab_size": 32000, "dim": 288, "n_layers": 6, "n_heads": 6, "n_kv_heads": 6, "multiple_of": 32, "dropout": 0.0, "gradient_accumulation_steps": 4, "learning_rate": 0.0005, "max_iters": 100000, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "decay_lr": true, "warmup_iters": 1000, "device": "cuda", "dtype": "float16", "compile": false, "softmax1": false}
|
out/softmax0-15m-2023_08_26_00_08_49/model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b7718351334d597c753d1df1af246df7bba9724c8b696d3581e05b3a1193dfc
|
3 |
+
size 60816028
|
out/softmax1-15m-2023_08_25_11_40_38/config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"out_dir": "out/softmax1-15m-2023_08_25_11_40_38", "eval_interval": 1000, "log_interval": 1, "eval_iters": 50, "eval_only": false, "always_save_checkpoint": true, "init_from": "scratch", "wandb_log": true, "wandb_project": "softmax1-tinystories", "wandb_run_name": "softmax1-15m-2023_08_25_11_40_38", "batch_size": 96, "max_seq_len": 256, "vocab_source": "llama2", "vocab_size": 32000, "dim": 288, "n_layers": 6, "n_heads": 6, "n_kv_heads": 6, "multiple_of": 32, "dropout": 0.0, "gradient_accumulation_steps": 4, "learning_rate": 0.0005, "max_iters": 100000, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "decay_lr": true, "warmup_iters": 1000, "device": "cuda", "dtype": "float16", "compile": true, "softmax1": true}
|
out/softmax1-15m-2023_08_25_11_42_26/config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"out_dir": "out/softmax1-15m-2023_08_25_11_42_26", "eval_interval": 1000, "log_interval": 1, "eval_iters": 50, "eval_only": false, "always_save_checkpoint": true, "init_from": "scratch", "wandb_log": true, "wandb_project": "softmax1-tinystories", "wandb_run_name": "softmax1-15m-2023_08_25_11_42_26", "batch_size": 96, "max_seq_len": 256, "vocab_source": "llama2", "vocab_size": 32000, "dim": 288, "n_layers": 6, "n_heads": 6, "n_kv_heads": 6, "multiple_of": 32, "dropout": 0.0, "gradient_accumulation_steps": 4, "learning_rate": 0.0005, "max_iters": 100000, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "decay_lr": true, "warmup_iters": 1000, "device": "cuda", "dtype": "float16", "compile": true, "softmax1": true}
|
out/softmax1-15m-2023_08_25_11_45_19/config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"out_dir": "out/softmax1-15m-2023_08_25_11_45_19", "eval_interval": 1000, "log_interval": 1, "eval_iters": 50, "eval_only": false, "always_save_checkpoint": true, "init_from": "scratch", "wandb_log": true, "wandb_project": "softmax1-tinystories", "wandb_run_name": "softmax1-15m-2023_08_25_11_45_19", "batch_size": 96, "max_seq_len": 256, "vocab_source": "llama2", "vocab_size": 32000, "dim": 288, "n_layers": 6, "n_heads": 6, "n_kv_heads": 6, "multiple_of": 32, "dropout": 0.0, "gradient_accumulation_steps": 4, "learning_rate": 0.0005, "max_iters": 100000, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "decay_lr": true, "warmup_iters": 1000, "device": "cuda", "dtype": "float16", "compile": false, "softmax1": true}
|
out/softmax1-15m-2023_08_25_11_47_04/ckpt.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:069a05750421d76aada282966c586f58054fdc04d13e951090c9a92677b7b309
|
3 |
+
size 182361878
|
out/softmax1-15m-2023_08_25_11_47_04/config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"out_dir": "out/softmax1-15m-2023_08_25_11_47_04", "eval_interval": 1000, "log_interval": 1, "eval_iters": 50, "eval_only": false, "always_save_checkpoint": true, "init_from": "scratch", "wandb_log": true, "wandb_project": "softmax1-tinystories", "wandb_run_name": "softmax1-15m-2023_08_25_11_47_04", "batch_size": 72, "max_seq_len": 256, "vocab_source": "llama2", "vocab_size": 32000, "dim": 288, "n_layers": 6, "n_heads": 6, "n_kv_heads": 6, "multiple_of": 32, "dropout": 0.0, "gradient_accumulation_steps": 4, "learning_rate": 0.0005, "max_iters": 100000, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "decay_lr": true, "warmup_iters": 1000, "device": "cuda", "dtype": "float16", "compile": false, "softmax1": true}
|
out/softmax1-15m-2023_08_25_11_47_04/model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9162e29b7644f017fffa0fcffbd81d112677102e14b807d7f57aec2e11c52df0
|
3 |
+
size 60816028
|