Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +29 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:731f9e5cdb09067bf8e15f162aff6438e7e79db5a9987c735e4b28b97fe7d2b7
|
3 |
+
size 100733974
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 384,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 768,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 20,
|
17 |
+
"device": "cuda:1",
|
18 |
+
"layer": 8,
|
19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
|
21 |
+
"submodule_name": "resid_post_layer_8"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 768,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 32,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:1"
|
31 |
+
}
|
32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 7.165702819824219, "l1_loss": 52.398841048731946, "l0": 19.910211158521246, "frac_variance_explained": 0.9256864298473705, "cossim": 0.9520920298316262, "l2_ratio": 0.9514842954548922, "relative_reconstruction_bias": 0.9993569128441088, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.9917597409450645, "loss_zero": 12.187079458525687, "frac_recovered": 0.9568525624997688, "frac_alive": 0.933837890625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:da5a8f71335eba31afa1eca0d8b6f01790dec259246cbd6bc7225951a5486d9a
|
3 |
+
size 100733974
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 384,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 768,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 40,
|
17 |
+
"device": "cuda:1",
|
18 |
+
"layer": 8,
|
19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
|
21 |
+
"submodule_name": "resid_post_layer_8"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 768,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 32,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:1"
|
31 |
+
}
|
32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 6.358675190896699, "l1_loss": 67.62095688328598, "l0": 39.82553227742513, "frac_variance_explained": 0.9409584890712391, "cossim": 0.9623859289920691, "l2_ratio": 0.9620392033548066, "relative_reconstruction_bias": 0.9997165492086699, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.852238994656187, "loss_zero": 12.187079458525687, "frac_recovered": 0.9724247419472897, "frac_alive": 0.92108154296875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe65f3e4ad7fe582508b307625716ff197a04a3d371834c7222c023169597fbd
|
3 |
+
size 100733974
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 384,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 768,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 80,
|
17 |
+
"device": "cuda:1",
|
18 |
+
"layer": 8,
|
19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
|
21 |
+
"submodule_name": "resid_post_layer_8"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 768,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 32,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:1"
|
31 |
+
}
|
32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 5.535941615249172, "l1_loss": 90.0740072076971, "l0": 79.61385576652758, "frac_variance_explained": 0.9549660375623992, "cossim": 0.9716276183272853, "l2_ratio": 0.9715044552629645, "relative_reconstruction_bias": 1.0009427467981975, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.7588755361961597, "loss_zero": 12.187079458525687, "frac_recovered": 0.9828645659215522, "frac_alive": 0.873291015625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:62d31b3f692684c53b5c5e8ff60b739baa9a0daf57abe707da33701733063e7d
|
3 |
+
size 100733974
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 384,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 768,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 160,
|
17 |
+
"device": "cuda:1",
|
18 |
+
"layer": 8,
|
19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
|
21 |
+
"submodule_name": "resid_post_layer_8"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 768,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 32,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:1"
|
31 |
+
}
|
32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 4.596404407963608, "l1_loss": 153.62816642992425, "l0": 159.1172462232185, "frac_variance_explained": 0.9690298311638109, "cossim": 0.98062437953371, "l2_ratio": 0.9803878791404493, "relative_reconstruction_bias": 1.0007607864611077, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6916814464511294, "loss_zero": 12.187079458525687, "frac_recovered": 0.9904219201116851, "frac_alive": 0.7470703125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff290fc3b27d35eee2431b923928da0a5530eaf02380cc605a3f04aa3bc61287
|
3 |
+
size 100733974
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 384,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 768,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 320,
|
17 |
+
"device": "cuda:1",
|
18 |
+
"layer": 8,
|
19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
|
21 |
+
"submodule_name": "resid_post_layer_8"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 768,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 32,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:1"
|
31 |
+
}
|
32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 3.1190587029312598, "l1_loss": 295.7740866921165, "l0": 318.5566424745502, "frac_variance_explained": 0.9860216288855581, "cossim": 0.9912758794697848, "l2_ratio": 0.9912153554685188, "relative_reconstruction_bias": 1.0004844918395535, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.63657668503848, "loss_zero": 12.187079458525687, "frac_recovered": 0.9965930906209078, "frac_alive": 0.32232666015625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:405bcdf303076eb4e67b6dcd34a2c041278a2574400fd19e2deb02eb9df12560
|
3 |
+
size 100733974
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 384,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 768,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 640,
|
17 |
+
"device": "cuda:1",
|
18 |
+
"layer": 8,
|
19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
|
21 |
+
"submodule_name": "resid_post_layer_8"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 768,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 32,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:1"
|
31 |
+
}
|
32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 1.5527055624759558, "l1_loss": 626.5316938920455, "l0": 639.1975338097775, "frac_variance_explained": 0.9963156291932771, "cossim": 0.9977991869955352, "l2_ratio": 0.9976590293826479, "relative_reconstruction_bias": 1.0002403476021506, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6137762756058662, "loss_zero": 12.187079458525687, "frac_recovered": 0.9991628256711093, "frac_alive": 0.06146240234375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50b23e31ad3ee499518d7bd643be829a3ee7b86b236059a0146799d475bb1c6c
|
3 |
+
size 100865046
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "GatedAutoEncoder",
|
4 |
+
"trainer_class": "GatedSAETrainer",
|
5 |
+
"activation_dim": 768,
|
6 |
+
"dict_size": 16384,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.012,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"sparsity_warmup_steps": 5000,
|
11 |
+
"decay_start": 195312,
|
12 |
+
"seed": 0,
|
13 |
+
"device": "cuda:0",
|
14 |
+
"layer": 8,
|
15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_12",
|
17 |
+
"submodule_name": "resid_post_layer_8"
|
18 |
+
},
|
19 |
+
"buffer": {
|
20 |
+
"d_submodule": 768,
|
21 |
+
"io": "out",
|
22 |
+
"n_ctxs": 244,
|
23 |
+
"ctx_len": 1024,
|
24 |
+
"refresh_batch_size": 32,
|
25 |
+
"out_batch_size": 2048,
|
26 |
+
"device": "cuda:0"
|
27 |
+
}
|
28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 2.449277211384601, "l1_loss": 233.9408302766731, "l0": 483.06496126105986, "frac_variance_explained": 0.9913476149001753, "cossim": 0.9946498181446489, "l2_ratio": 0.9939263398388782, "relative_reconstruction_bias": 0.9993272940796541, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6073691378156822, "loss_zero": 12.979128625019488, "frac_recovered": 0.9982530004288777, "frac_alive": 0.59857177734375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c989b27ea8ba899f086eb1e4bbf13a21a3e8ebc86900fb5d588fd7f804b814f4
|
3 |
+
size 100865046
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "GatedAutoEncoder",
|
4 |
+
"trainer_class": "GatedSAETrainer",
|
5 |
+
"activation_dim": 768,
|
6 |
+
"dict_size": 16384,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.018,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"sparsity_warmup_steps": 5000,
|
11 |
+
"decay_start": 195312,
|
12 |
+
"seed": 0,
|
13 |
+
"device": "cuda:0",
|
14 |
+
"layer": 8,
|
15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_13",
|
17 |
+
"submodule_name": "resid_post_layer_8"
|
18 |
+
},
|
19 |
+
"buffer": {
|
20 |
+
"d_submodule": 768,
|
21 |
+
"io": "out",
|
22 |
+
"n_ctxs": 244,
|
23 |
+
"ctx_len": 1024,
|
24 |
+
"refresh_batch_size": 32,
|
25 |
+
"out_batch_size": 2048,
|
26 |
+
"device": "cuda:0"
|
27 |
+
}
|
28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 3.718252821140979, "l1_loss": 174.70882599612315, "l0": 357.33407280244023, "frac_variance_explained": 0.9797811321465366, "cossim": 0.9874855103981064, "l2_ratio": 0.9873908984373851, "relative_reconstruction_bias": 1.0000614502343788, "loss_original": 2.591329812285412, "loss_reconstructed": 2.638556733906987, "loss_zero": 12.979128625019488, "frac_recovered": 0.9948906988264566, "frac_alive": 0.55438232421875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b0818ec859a15998633c73ad5dce110641f60f0b7b999aafd459d8bcaafa407d
|
3 |
+
size 100865046
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "GatedAutoEncoder",
|
4 |
+
"trainer_class": "GatedSAETrainer",
|
5 |
+
"activation_dim": 768,
|
6 |
+
"dict_size": 16384,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.024,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"sparsity_warmup_steps": 5000,
|
11 |
+
"decay_start": 195312,
|
12 |
+
"seed": 0,
|
13 |
+
"device": "cuda:0",
|
14 |
+
"layer": 8,
|
15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_14",
|
17 |
+
"submodule_name": "resid_post_layer_8"
|
18 |
+
},
|
19 |
+
"buffer": {
|
20 |
+
"d_submodule": 768,
|
21 |
+
"io": "out",
|
22 |
+
"n_ctxs": 244,
|
23 |
+
"ctx_len": 1024,
|
24 |
+
"refresh_batch_size": 32,
|
25 |
+
"out_batch_size": 2048,
|
26 |
+
"device": "cuda:0"
|
27 |
+
}
|
28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 4.538920862128935, "l1_loss": 141.84569384103798, "l0": 227.8396312299981, "frac_variance_explained": 0.9699712948626783, "cossim": 0.9812705624534424, "l2_ratio": 0.98167203241084, "relative_reconstruction_bias": 1.000891358737486, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6714729882148376, "loss_zero": 12.979128625019488, "frac_recovered": 0.9913170700331768, "frac_alive": 0.71710205078125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:01de966bee6924eeaadb356982edc47ee4ae5d8c402495317c21dfafaf2361f7
|
3 |
+
size 100865046
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "GatedAutoEncoder",
|
4 |
+
"trainer_class": "GatedSAETrainer",
|
5 |
+
"activation_dim": 768,
|
6 |
+
"dict_size": 16384,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.04,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"sparsity_warmup_steps": 5000,
|
11 |
+
"decay_start": 195312,
|
12 |
+
"seed": 0,
|
13 |
+
"device": "cuda:0",
|
14 |
+
"layer": 8,
|
15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_15",
|
17 |
+
"submodule_name": "resid_post_layer_8"
|
18 |
+
},
|
19 |
+
"buffer": {
|
20 |
+
"d_submodule": 768,
|
21 |
+
"io": "out",
|
22 |
+
"n_ctxs": 244,
|
23 |
+
"ctx_len": 1024,
|
24 |
+
"refresh_batch_size": 32,
|
25 |
+
"out_batch_size": 2048,
|
26 |
+
"device": "cuda:0"
|
27 |
+
}
|
28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 5.540974214852574, "l1_loss": 94.82435709022614, "l0": 95.3962563204478, "frac_variance_explained": 0.9552597058824746, "cossim": 0.9718739264700786, "l2_ratio": 0.9708090194736618, "relative_reconstruction_bias": 0.9997583469712591, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7390204301799637, "loss_zero": 12.979128625019488, "frac_recovered": 0.9840897005724619, "frac_alive": 0.9267578125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2eb06802eee053782d3d338803d4f408ce84b0a1cc281748a266cb4f772a956e
|
3 |
+
size 100865046
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "GatedAutoEncoder",
|
4 |
+
"trainer_class": "GatedSAETrainer",
|
5 |
+
"activation_dim": 768,
|
6 |
+
"dict_size": 16384,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.06,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"sparsity_warmup_steps": 5000,
|
11 |
+
"decay_start": 195312,
|
12 |
+
"seed": 0,
|
13 |
+
"device": "cuda:0",
|
14 |
+
"layer": 8,
|
15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_16",
|
17 |
+
"submodule_name": "resid_post_layer_8"
|
18 |
+
},
|
19 |
+
"buffer": {
|
20 |
+
"d_submodule": 768,
|
21 |
+
"io": "out",
|
22 |
+
"n_ctxs": 244,
|
23 |
+
"ctx_len": 1024,
|
24 |
+
"refresh_batch_size": 32,
|
25 |
+
"out_batch_size": 2048,
|
26 |
+
"device": "cuda:0"
|
27 |
+
}
|
28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 6.3163925567305235, "l1_loss": 64.96194687808853, "l0": 48.62305409075266, "frac_variance_explained": 0.9414439212126904, "cossim": 0.9631719384566847, "l2_ratio": 0.9623617971517954, "relative_reconstruction_bias": 0.9998143462531538, "loss_original": 2.591329812285412, "loss_reconstructed": 2.8256030183240592, "loss_zero": 12.979128625019488, "frac_recovered": 0.9748218806393175, "frac_alive": 0.92987060546875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:492aae9f99769dba8c556b726451e369b1eeb33b1baf558a90444dbfad618fb2
|
3 |
+
size 100865046
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "GatedAutoEncoder",
|
4 |
+
"trainer_class": "GatedSAETrainer",
|
5 |
+
"activation_dim": 768,
|
6 |
+
"dict_size": 16384,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.08,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"sparsity_warmup_steps": 5000,
|
11 |
+
"decay_start": 195312,
|
12 |
+
"seed": 0,
|
13 |
+
"device": "cuda:0",
|
14 |
+
"layer": 8,
|
15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_17",
|
17 |
+
"submodule_name": "resid_post_layer_8"
|
18 |
+
},
|
19 |
+
"buffer": {
|
20 |
+
"d_submodule": 768,
|
21 |
+
"io": "out",
|
22 |
+
"n_ctxs": 244,
|
23 |
+
"ctx_len": 1024,
|
24 |
+
"refresh_batch_size": 32,
|
25 |
+
"out_batch_size": 2048,
|
26 |
+
"device": "cuda:0"
|
27 |
+
}
|
28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 6.866190913211868, "l1_loss": 55.27316311755812, "l0": 30.90061481889472, "frac_variance_explained": 0.9303119121545769, "cossim": 0.9563094456511808, "l2_ratio": 0.9556090986154165, "relative_reconstruction_bias": 0.9998748388635107, "loss_original": 2.591329812285412, "loss_reconstructed": 2.912389753812767, "loss_zero": 12.979128625019488, "frac_recovered": 0.9655166954161173, "frac_alive": 0.86328125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35b66366e463e23602a12aa239decfaf82325071984f6e48fbfc53f1e97e1aac
|
3 |
+
size 100799263
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "JumpReluTrainer",
|
4 |
+
"dict_class": "JumpReluAutoEncoder",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 768,
|
9 |
+
"dict_size": 16384,
|
10 |
+
"device": "cuda:0",
|
11 |
+
"layer": 8,
|
12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_30",
|
14 |
+
"submodule_name": "resid_post_layer_8",
|
15 |
+
"bandwidth": 0.001,
|
16 |
+
"sparsity_penalty": 1.0,
|
17 |
+
"sparsity_warmup_steps": 5000,
|
18 |
+
"target_l0": 20
|
19 |
+
},
|
20 |
+
"buffer": {
|
21 |
+
"d_submodule": 768,
|
22 |
+
"io": "out",
|
23 |
+
"n_ctxs": 244,
|
24 |
+
"ctx_len": 1024,
|
25 |
+
"refresh_batch_size": 32,
|
26 |
+
"out_batch_size": 2048,
|
27 |
+
"device": "cuda:0"
|
28 |
+
}
|
29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 7.542313641812428, "l1_loss": 59.5405657664839, "l0": 20.316411776715015, "frac_variance_explained": 0.9169782827417534, "cossim": 0.9468502043241478, "l2_ratio": 0.9470895029694201, "relative_reconstruction_bias": 1.0005664681813804, "loss_original": 2.591329812285412, "loss_reconstructed": 3.063094075903835, "loss_zero": 12.979128625019488, "frac_recovered": 0.9492282486823668, "frac_alive": 0.423095703125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52a73e6d5540b5666e46b3ce47c868f81846d1ac46cc3f2fd0ed174158fe95e3
|
3 |
+
size 100799263
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "JumpReluTrainer",
|
4 |
+
"dict_class": "JumpReluAutoEncoder",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 768,
|
9 |
+
"dict_size": 16384,
|
10 |
+
"device": "cuda:0",
|
11 |
+
"layer": 8,
|
12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_31",
|
14 |
+
"submodule_name": "resid_post_layer_8",
|
15 |
+
"bandwidth": 0.001,
|
16 |
+
"sparsity_penalty": 1.0,
|
17 |
+
"sparsity_warmup_steps": 5000,
|
18 |
+
"target_l0": 40
|
19 |
+
},
|
20 |
+
"buffer": {
|
21 |
+
"d_submodule": 768,
|
22 |
+
"io": "out",
|
23 |
+
"n_ctxs": 244,
|
24 |
+
"ctx_len": 1024,
|
25 |
+
"refresh_batch_size": 32,
|
26 |
+
"out_batch_size": 2048,
|
27 |
+
"device": "cuda:0"
|
28 |
+
}
|
29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 6.596800962126399, "l1_loss": 68.14624850721245, "l0": 40.465813809130566, "frac_variance_explained": 0.9358430052378092, "cossim": 0.9595368070056639, "l2_ratio": 0.9597559456365654, "relative_reconstruction_bias": 1.0001378511808006, "loss_original": 2.591329812285412, "loss_reconstructed": 2.866633120071457, "loss_zero": 12.979128625019488, "frac_recovered": 0.9703360113752894, "frac_alive": 0.53546142578125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b355bdff67e88e1ccf0c30ff781cd2408f566c203a2ffea6d730c47faf7e6958
|
3 |
+
size 100799263
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "JumpReluTrainer",
|
4 |
+
"dict_class": "JumpReluAutoEncoder",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 768,
|
9 |
+
"dict_size": 16384,
|
10 |
+
"device": "cuda:0",
|
11 |
+
"layer": 8,
|
12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_32",
|
14 |
+
"submodule_name": "resid_post_layer_8",
|
15 |
+
"bandwidth": 0.001,
|
16 |
+
"sparsity_penalty": 1.0,
|
17 |
+
"sparsity_warmup_steps": 5000,
|
18 |
+
"target_l0": 80
|
19 |
+
},
|
20 |
+
"buffer": {
|
21 |
+
"d_submodule": 768,
|
22 |
+
"io": "out",
|
23 |
+
"n_ctxs": 244,
|
24 |
+
"ctx_len": 1024,
|
25 |
+
"refresh_batch_size": 32,
|
26 |
+
"out_batch_size": 2048,
|
27 |
+
"device": "cuda:0"
|
28 |
+
}
|
29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 5.868561931403287, "l1_loss": 87.49280005765249, "l0": 73.93405321419957, "frac_variance_explained": 0.9491253247462124, "cossim": 0.9681477079908531, "l2_ratio": 0.9674936364214104, "relative_reconstruction_bias": 0.9984568217432642, "loss_original": 2.591329812285412, "loss_reconstructed": 2.771394150802888, "loss_zero": 12.979128625019488, "frac_recovered": 0.9805464475269777, "frac_alive": 0.52166748046875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57b55991b9ed3a61bfbd12123f73db672397e9e3917e4eb1a14a0dc8e05792dc
|
3 |
+
size 100799263
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "JumpReluTrainer",
|
4 |
+
"dict_class": "JumpReluAutoEncoder",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 768,
|
9 |
+
"dict_size": 16384,
|
10 |
+
"device": "cuda:0",
|
11 |
+
"layer": 8,
|
12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_33",
|
14 |
+
"submodule_name": "resid_post_layer_8",
|
15 |
+
"bandwidth": 0.001,
|
16 |
+
"sparsity_penalty": 1.0,
|
17 |
+
"sparsity_warmup_steps": 5000,
|
18 |
+
"target_l0": 160
|
19 |
+
},
|
20 |
+
"buffer": {
|
21 |
+
"d_submodule": 768,
|
22 |
+
"io": "out",
|
23 |
+
"n_ctxs": 244,
|
24 |
+
"ctx_len": 1024,
|
25 |
+
"refresh_batch_size": 32,
|
26 |
+
"out_batch_size": 2048,
|
27 |
+
"device": "cuda:0"
|
28 |
+
}
|
29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 4.841383078012122, "l1_loss": 130.77086565867964, "l0": 155.94150111186934, "frac_variance_explained": 0.9658320416887123, "cossim": 0.9784838686506432, "l2_ratio": 0.978472747716559, "relative_reconstruction_bias": 1.0000444162322815, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6872186675129166, "loss_zero": 12.979128625019488, "frac_recovered": 0.9896216367382601, "frac_alive": 0.4205322265625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:123ae303303f92498d1269b0e4c32df87c055d51b5bb87f6e7d5cbfbb23efca7
|
3 |
+
size 100799263
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "JumpReluTrainer",
|
4 |
+
"dict_class": "JumpReluAutoEncoder",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 768,
|
9 |
+
"dict_size": 16384,
|
10 |
+
"device": "cuda:0",
|
11 |
+
"layer": 8,
|
12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_34",
|
14 |
+
"submodule_name": "resid_post_layer_8",
|
15 |
+
"bandwidth": 0.001,
|
16 |
+
"sparsity_penalty": 1.0,
|
17 |
+
"sparsity_warmup_steps": 5000,
|
18 |
+
"target_l0": 320
|
19 |
+
},
|
20 |
+
"buffer": {
|
21 |
+
"d_submodule": 768,
|
22 |
+
"io": "out",
|
23 |
+
"n_ctxs": 244,
|
24 |
+
"ctx_len": 1024,
|
25 |
+
"refresh_batch_size": 32,
|
26 |
+
"out_batch_size": 2048,
|
27 |
+
"device": "cuda:0"
|
28 |
+
}
|
29 |
+
}
|