diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b1b582e0d3ef63d7150f5952a6298bf3359f379 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:731f9e5cdb09067bf8e15f162aff6438e7e79db5a9987c735e4b28b97fe7d2b7 +size 100733974 diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..458059e795deab45bd7cfcd514c55dcfa55dd5e3 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 20, + "device": "cuda:1", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:1" + } +} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7336eb9ae2ab9f349f07814ea62bdc61a5139f60 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 7.165702819824219, "l1_loss": 52.398841048731946, "l0": 19.910211158521246, "frac_variance_explained": 0.9256864298473705, "cossim": 0.9520920298316262, "l2_ratio": 0.9514842954548922, "relative_reconstruction_bias": 0.9993569128441088, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.9917597409450645, "loss_zero": 12.187079458525687, "frac_recovered": 0.9568525624997688, "frac_alive": 0.933837890625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..4164de90d990579350e4bbc4849438441a9d8e78 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5a8f71335eba31afa1eca0d8b6f01790dec259246cbd6bc7225951a5486d9a +size 100733974 diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0f29dec665105aacf5042ad142c273d56be5252 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 40, + "device": "cuda:1", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:1" + } +} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ed4ede0ea39346ab3f72e328f0590fb391f35bb1 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.358675190896699, "l1_loss": 67.62095688328598, "l0": 39.82553227742513, "frac_variance_explained": 0.9409584890712391, "cossim": 0.9623859289920691, "l2_ratio": 0.9620392033548066, "relative_reconstruction_bias": 0.9997165492086699, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.852238994656187, "loss_zero": 12.187079458525687, "frac_recovered": 0.9724247419472897, "frac_alive": 0.92108154296875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..c499093e7c3a1514a99a6daa5b7905b8295a5ec7 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe65f3e4ad7fe582508b307625716ff197a04a3d371834c7222c023169597fbd +size 100733974 diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8868e9ce225ee28147affb75065695ae8dd68c89 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 80, + "device": "cuda:1", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:1" + } +} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..45871e1c23861f69029a835c6e75ee2084f4ef3e --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 5.535941615249172, "l1_loss": 90.0740072076971, "l0": 79.61385576652758, "frac_variance_explained": 0.9549660375623992, "cossim": 0.9716276183272853, "l2_ratio": 0.9715044552629645, "relative_reconstruction_bias": 1.0009427467981975, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.7588755361961597, "loss_zero": 12.187079458525687, "frac_recovered": 0.9828645659215522, "frac_alive": 0.873291015625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..1702746a9e4c7a3aba8fdc5ad707b70599f69848 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62d31b3f692684c53b5c5e8ff60b739baa9a0daf57abe707da33701733063e7d +size 100733974 diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..65ef6069ff892a6817c8715be256c3188038dce4 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 160, + "device": "cuda:1", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:1" + } +} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..046c6f8c4fc3a003e841b796a81df7b430f9d111 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 4.596404407963608, "l1_loss": 153.62816642992425, "l0": 159.1172462232185, "frac_variance_explained": 0.9690298311638109, "cossim": 0.98062437953371, "l2_ratio": 0.9803878791404493, "relative_reconstruction_bias": 1.0007607864611077, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6916814464511294, "loss_zero": 12.187079458525687, "frac_recovered": 0.9904219201116851, "frac_alive": 0.7470703125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee57c2cd2abc3017d37bb942f8e419a2be8052f6 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff290fc3b27d35eee2431b923928da0a5530eaf02380cc605a3f04aa3bc61287 +size 100733974 diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json new file mode 100644 index 0000000000000000000000000000000000000000..67bcb01401d37d5940bd75fdcf4a6ebccef68001 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 320, + "device": "cuda:1", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:1" + } +} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6300f72d7d835cf405df3350906851605fddbab6 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 3.1190587029312598, "l1_loss": 295.7740866921165, "l0": 318.5566424745502, "frac_variance_explained": 0.9860216288855581, "cossim": 0.9912758794697848, "l2_ratio": 0.9912153554685188, "relative_reconstruction_bias": 1.0004844918395535, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.63657668503848, "loss_zero": 12.187079458525687, "frac_recovered": 0.9965930906209078, "frac_alive": 0.32232666015625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..58b8fd913153c163606300a87a138c25e290a716 --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:405bcdf303076eb4e67b6dcd34a2c041278a2574400fd19e2deb02eb9df12560 +size 100733974 diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e95d277f1f104399df23bc91d19184e592bb2fd --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 640, + "device": "cuda:1", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:1" + } +} \ No newline at end of file diff --git a/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1329723c0a7ba346f313324a21d606bc21559fbf --- /dev/null +++ b/BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 1.5527055624759558, "l1_loss": 626.5316938920455, "l0": 639.1975338097775, "frac_variance_explained": 0.9963156291932771, "cossim": 0.9977991869955352, "l2_ratio": 0.9976590293826479, "relative_reconstruction_bias": 1.0002403476021506, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6137762756058662, "loss_zero": 12.187079458525687, "frac_recovered": 0.9991628256711093, "frac_alive": 0.06146240234375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..123c05984e9dfc2cdf0cad206cfd598cf308178f --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50b23e31ad3ee499518d7bd643be829a3ee7b86b236059a0146799d475bb1c6c +size 100865046 diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f9ee88b7030431880eb9cadfa2ba6a75b0ebd41 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json @@ -0,0 +1,28 @@ +{ + "trainer": { + "dict_class": "GatedAutoEncoder", + "trainer_class": "GatedSAETrainer", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.012, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_12", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b8f4a2f1673082ea8b7e0867cfd3e29c77b56d88 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 2.449277211384601, "l1_loss": 233.9408302766731, "l0": 483.06496126105986, "frac_variance_explained": 0.9913476149001753, "cossim": 0.9946498181446489, "l2_ratio": 0.9939263398388782, "relative_reconstruction_bias": 0.9993272940796541, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6073691378156822, "loss_zero": 12.979128625019488, "frac_recovered": 0.9982530004288777, "frac_alive": 0.59857177734375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..300b95739c9b0588db013f225064e858621f80b5 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c989b27ea8ba899f086eb1e4bbf13a21a3e8ebc86900fb5d588fd7f804b814f4 +size 100865046 diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2992e11e19dc426262b633e53173e8d1a444bee0 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json @@ -0,0 +1,28 @@ +{ + "trainer": { + "dict_class": "GatedAutoEncoder", + "trainer_class": "GatedSAETrainer", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.018, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_13", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..14f5bb189c55ba39eed9516bf12da133da342655 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 3.718252821140979, "l1_loss": 174.70882599612315, "l0": 357.33407280244023, "frac_variance_explained": 0.9797811321465366, "cossim": 0.9874855103981064, "l2_ratio": 0.9873908984373851, "relative_reconstruction_bias": 1.0000614502343788, "loss_original": 2.591329812285412, "loss_reconstructed": 2.638556733906987, "loss_zero": 12.979128625019488, "frac_recovered": 0.9948906988264566, "frac_alive": 0.55438232421875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..18d6ce617e1c78fb0506102e65a59fa59ec8fe81 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0818ec859a15998633c73ad5dce110641f60f0b7b999aafd459d8bcaafa407d +size 100865046 diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..16f02cbc3a6951bdfe75737befa9f914a64a83bb --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json @@ -0,0 +1,28 @@ +{ + "trainer": { + "dict_class": "GatedAutoEncoder", + "trainer_class": "GatedSAETrainer", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.024, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_14", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e9e7a4665b3ee8fd24ab14b595bf41606105c8d4 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 4.538920862128935, "l1_loss": 141.84569384103798, "l0": 227.8396312299981, "frac_variance_explained": 0.9699712948626783, "cossim": 0.9812705624534424, "l2_ratio": 0.98167203241084, "relative_reconstruction_bias": 1.000891358737486, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6714729882148376, "loss_zero": 12.979128625019488, "frac_recovered": 0.9913170700331768, "frac_alive": 0.71710205078125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..16dfc685588ff19c11192e2d32fb9970f23c68b0 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01de966bee6924eeaadb356982edc47ee4ae5d8c402495317c21dfafaf2361f7 +size 100865046 diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..929dede216c8793f48760402ed703029b628eabc --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json @@ -0,0 +1,28 @@ +{ + "trainer": { + "dict_class": "GatedAutoEncoder", + "trainer_class": "GatedSAETrainer", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.04, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_15", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..aa7a4fd81668d0a15e4b211b2ce5fdc961182a7c --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 5.540974214852574, "l1_loss": 94.82435709022614, "l0": 95.3962563204478, "frac_variance_explained": 0.9552597058824746, "cossim": 0.9718739264700786, "l2_ratio": 0.9708090194736618, "relative_reconstruction_bias": 0.9997583469712591, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7390204301799637, "loss_zero": 12.979128625019488, "frac_recovered": 0.9840897005724619, "frac_alive": 0.9267578125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c5f0ad1c398cf2d5ed98d5819fe0d1aabda9305 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eb06802eee053782d3d338803d4f408ce84b0a1cc281748a266cb4f772a956e +size 100865046 diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6988289fa5ee59d4bff095a23f4500d4585b33a1 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json @@ -0,0 +1,28 @@ +{ + "trainer": { + "dict_class": "GatedAutoEncoder", + "trainer_class": "GatedSAETrainer", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.06, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_16", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ad1111d0fb62b3c34b286aec188434ed9c20a8b3 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.3163925567305235, "l1_loss": 64.96194687808853, "l0": 48.62305409075266, "frac_variance_explained": 0.9414439212126904, "cossim": 0.9631719384566847, "l2_ratio": 0.9623617971517954, "relative_reconstruction_bias": 0.9998143462531538, "loss_original": 2.591329812285412, "loss_reconstructed": 2.8256030183240592, "loss_zero": 12.979128625019488, "frac_recovered": 0.9748218806393175, "frac_alive": 0.92987060546875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a0282bf0ad0583beea5c3b910b0299cae264efc --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:492aae9f99769dba8c556b726451e369b1eeb33b1baf558a90444dbfad618fb2 +size 100865046 diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d3322fe9d83c3577d957e59743d04f7db5a8c362 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json @@ -0,0 +1,28 @@ +{ + "trainer": { + "dict_class": "GatedAutoEncoder", + "trainer_class": "GatedSAETrainer", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.08, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_17", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..de769c10acdb9767a44d93c85a58246e51bad585 --- /dev/null +++ b/GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.866190913211868, "l1_loss": 55.27316311755812, "l0": 30.90061481889472, "frac_variance_explained": 0.9303119121545769, "cossim": 0.9563094456511808, "l2_ratio": 0.9556090986154165, "relative_reconstruction_bias": 0.9998748388635107, "loss_original": 2.591329812285412, "loss_reconstructed": 2.912389753812767, "loss_zero": 12.979128625019488, "frac_recovered": 0.9655166954161173, "frac_alive": 0.86328125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..695543f9f986719615d3952af0f6339c39279c82 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35b66366e463e23602a12aa239decfaf82325071984f6e48fbfc53f1e97e1aac +size 100799263 diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c986b4e7b29e94907864c929f387db328c43c884 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "trainer_class": "JumpReluTrainer", + "dict_class": "JumpReluAutoEncoder", + "lr": 0.0003, + "steps": 244140, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_30", + "submodule_name": "resid_post_layer_8", + "bandwidth": 0.001, + "sparsity_penalty": 1.0, + "sparsity_warmup_steps": 5000, + "target_l0": 20 + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3cb2d745c21e24eb40a19e006f5eb97d3a2aef20 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 7.542313641812428, "l1_loss": 59.5405657664839, "l0": 20.316411776715015, "frac_variance_explained": 0.9169782827417534, "cossim": 0.9468502043241478, "l2_ratio": 0.9470895029694201, "relative_reconstruction_bias": 1.0005664681813804, "loss_original": 2.591329812285412, "loss_reconstructed": 3.063094075903835, "loss_zero": 12.979128625019488, "frac_recovered": 0.9492282486823668, "frac_alive": 0.423095703125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e476d9d07e9142c7bfc00086d79fc350d471654 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52a73e6d5540b5666e46b3ce47c868f81846d1ac46cc3f2fd0ed174158fe95e3 +size 100799263 diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4220d1a6a91306fc837ce5059290545df307e280 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "trainer_class": "JumpReluTrainer", + "dict_class": "JumpReluAutoEncoder", + "lr": 0.0003, + "steps": 244140, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_31", + "submodule_name": "resid_post_layer_8", + "bandwidth": 0.001, + "sparsity_penalty": 1.0, + "sparsity_warmup_steps": 5000, + "target_l0": 40 + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..88ad7b289ff4c6fb87edbbc17af61d1d0eb0199f --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.596800962126399, "l1_loss": 68.14624850721245, "l0": 40.465813809130566, "frac_variance_explained": 0.9358430052378092, "cossim": 0.9595368070056639, "l2_ratio": 0.9597559456365654, "relative_reconstruction_bias": 1.0001378511808006, "loss_original": 2.591329812285412, "loss_reconstructed": 2.866633120071457, "loss_zero": 12.979128625019488, "frac_recovered": 0.9703360113752894, "frac_alive": 0.53546142578125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e4a128be831d42f4c79d74bb786956e597a554d --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b355bdff67e88e1ccf0c30ff781cd2408f566c203a2ffea6d730c47faf7e6958 +size 100799263 diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..eb68293669c19aee70f4b394008a2e5c0ca01601 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "trainer_class": "JumpReluTrainer", + "dict_class": "JumpReluAutoEncoder", + "lr": 0.0003, + "steps": 244140, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_32", + "submodule_name": "resid_post_layer_8", + "bandwidth": 0.001, + "sparsity_penalty": 1.0, + "sparsity_warmup_steps": 5000, + "target_l0": 80 + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..571a73c93b407b65a4ed10d85ba64b56019778dc --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 5.868561931403287, "l1_loss": 87.49280005765249, "l0": 73.93405321419957, "frac_variance_explained": 0.9491253247462124, "cossim": 0.9681477079908531, "l2_ratio": 0.9674936364214104, "relative_reconstruction_bias": 0.9984568217432642, "loss_original": 2.591329812285412, "loss_reconstructed": 2.771394150802888, "loss_zero": 12.979128625019488, "frac_recovered": 0.9805464475269777, "frac_alive": 0.52166748046875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..09d0bae64f94ada4ab3bbf35031614c5ba3f33f0 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57b55991b9ed3a61bfbd12123f73db672397e9e3917e4eb1a14a0dc8e05792dc +size 100799263 diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..65793daf008f41644144ecd066d4653398d984f4 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "trainer_class": "JumpReluTrainer", + "dict_class": "JumpReluAutoEncoder", + "lr": 0.0003, + "steps": 244140, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_33", + "submodule_name": "resid_post_layer_8", + "bandwidth": 0.001, + "sparsity_penalty": 1.0, + "sparsity_warmup_steps": 5000, + "target_l0": 160 + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a1177d398b120fad267d1d1d4229a736bba521bd --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 4.841383078012122, "l1_loss": 130.77086565867964, "l0": 155.94150111186934, "frac_variance_explained": 0.9658320416887123, "cossim": 0.9784838686506432, "l2_ratio": 0.978472747716559, "relative_reconstruction_bias": 1.0000444162322815, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6872186675129166, "loss_zero": 12.979128625019488, "frac_recovered": 0.9896216367382601, "frac_alive": 0.4205322265625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e50a3755441b5f2e0f84cd43e54740fff7005de --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:123ae303303f92498d1269b0e4c32df87c055d51b5bb87f6e7d5cbfbb23efca7 +size 100799263 diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6314406a5d5a74a551d10f2f7e4bdcb2337721a3 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "trainer_class": "JumpReluTrainer", + "dict_class": "JumpReluAutoEncoder", + "lr": 0.0003, + "steps": 244140, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_34", + "submodule_name": "resid_post_layer_8", + "bandwidth": 0.001, + "sparsity_penalty": 1.0, + "sparsity_warmup_steps": 5000, + "target_l0": 320 + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..406e4cb106faefcecff540f1c943aabfe7cc5524 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 3.403391051005168, "l1_loss": 212.64206833437265, "l0": 308.2046768004636, "frac_variance_explained": 0.9831308564507818, "cossim": 0.9895132137350289, "l2_ratio": 0.9899121196155088, "relative_reconstruction_bias": 1.0014854985547352, "loss_original": 2.591329812285412, "loss_reconstructed": 2.626830586467881, "loss_zero": 12.979128625019488, "frac_recovered": 0.9961226119334439, "frac_alive": 0.2725830078125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..3482c5f046b0c3288e6eef93f5e2411084f49bc8 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c59bff0e7f2d1ed3fbd1422f75d9aa82695904fd3c546e70456c2dee09dc6332 +size 100799263 diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json new file mode 100644 index 0000000000000000000000000000000000000000..58b539832e6e460e609af1e27f5fb9d5c5047827 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "trainer_class": "JumpReluTrainer", + "dict_class": "JumpReluAutoEncoder", + "lr": 0.0003, + "steps": 244140, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_35", + "submodule_name": "resid_post_layer_8", + "bandwidth": 0.001, + "sparsity_penalty": 1.0, + "sparsity_warmup_steps": 5000, + "target_l0": 640 + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8508adaba69816e6c50e08bc08063f63e8b678a4 --- /dev/null +++ b/JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 1.7013380031987846, "l1_loss": 438.71177857180675, "l0": 594.7365741040334, "frac_variance_explained": 0.9956594737897436, "cossim": 0.9974015067858868, "l2_ratio": 0.9970332251255771, "relative_reconstruction_bias": 0.9992425413016813, "loss_original": 2.591329812285412, "loss_reconstructed": 2.5989834702158547, "loss_zero": 12.979128625019488, "frac_recovered": 0.9991641662206995, "frac_alive": 0.0777587890625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..a47835b1dc8de6a926da4d4abe6b6a41b649e5f8 --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5090748ecf6e9ffe4a64932b5780db4a88daf2ec7ee242e8cb0990a6a390b20d +size 100734221 diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d66fce43059dfbda55eccbe31f6eeaeafe2d4cd --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json @@ -0,0 +1,53 @@ +{ + "trainer": { + "trainer_class": "MatryoshkaBatchTopKTrainer", + "dict_class": "MatryoshkaBatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "group_fractions": [ + 0.03125, + 0.0625, + 0.125, + 0.25, + 0.53125 + ], + "group_weights": [ + 0.2, + 0.2, + 0.2, + 0.2, + 0.2 + ], + "group_sizes": [ + 512, + 1024, + 2048, + 4096, + 8704 + ], + "k": 20, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3ba404242e8b38b53cca6cde040571dc44d9bec4 --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 7.503067392291444, "l1_loss": 57.648184805205375, "l0": 19.92774974938595, "frac_variance_explained": 0.9185879086003159, "cossim": 0.947629041744001, "l2_ratio": 0.9552449746565386, "relative_reconstruction_bias": 1.0052419864770137, "loss_original": 2.6064688870401094, "loss_reconstructed": 3.059862620902784, "loss_zero": 12.187079458525687, "frac_recovered": 0.9492561925541271, "frac_alive": 0.87738037109375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3d8f78bc8ade7877eace6f7986234f46460107a --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dc8f124e90c04d8dbf1c2cd006759a68eb8f6a386d135d7e12069aea3931eb4 +size 100734221 diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ff5739666b24d95ee415dbb7147cacf6fe0cfd1b --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json @@ -0,0 +1,53 @@ +{ + "trainer": { + "trainer_class": "MatryoshkaBatchTopKTrainer", + "dict_class": "MatryoshkaBatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "group_fractions": [ + 0.03125, + 0.0625, + 0.125, + 0.25, + 0.53125 + ], + "group_weights": [ + 0.2, + 0.2, + 0.2, + 0.2, + 0.2 + ], + "group_sizes": [ + 512, + 1024, + 2048, + 4096, + 8704 + ], + "k": 40, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5525dca0fc11a57f0631f7dfecfacd7b04a7747b --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.681113040808476, "l1_loss": 75.43565345532966, "l0": 39.8682194334088, "frac_variance_explained": 0.9349244598186377, "cossim": 0.95862359530998, "l2_ratio": 0.9662664424289357, "relative_reconstruction_bias": 1.0057821707292036, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.883022969419306, "loss_zero": 12.187079458525687, "frac_recovered": 0.9689650336901346, "frac_alive": 0.9034423828125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f7715268f27165bb850b3ed910e2081fa360b57 --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c8e6e60e103f69c509650e114bd54666baaa734ca06a694c90a33c5c83dd1b3 +size 100734221 diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..994c32c07deff1b89be53c01679f4638693063b6 --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json @@ -0,0 +1,53 @@ +{ + "trainer": { + "trainer_class": "MatryoshkaBatchTopKTrainer", + "dict_class": "MatryoshkaBatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "group_fractions": [ + 0.03125, + 0.0625, + 0.125, + 0.25, + 0.53125 + ], + "group_weights": [ + 0.2, + 0.2, + 0.2, + 0.2, + 0.2 + ], + "group_sizes": [ + 512, + 1024, + 2048, + 4096, + 8704 + ], + "k": 80, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d1b166782799ef0f1cd010874c2253ce905758e9 --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 5.880217046448679, "l1_loss": 107.21432587594697, "l0": 79.61381715716738, "frac_variance_explained": 0.9493099487189091, "cossim": 0.968103901906447, "l2_ratio": 0.9761826793352762, "relative_reconstruction_bias": 1.0057118047367444, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.775832931200663, "loss_zero": 12.187079458525687, "frac_recovered": 0.981011531569741, "frac_alive": 0.87103271484375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..e685a622b4ec81695b85c4e62cd9d5af5710617d --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:440e92fe914f875a73a4665859ef61a50183264f5eb036068812caaf8f162120 +size 100734221 diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..85d3f3ecb55b356d703544a287ea41566b9bdf6f --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json @@ -0,0 +1,53 @@ +{ + "trainer": { + "trainer_class": "MatryoshkaBatchTopKTrainer", + "dict_class": "MatryoshkaBatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "group_fractions": [ + 0.03125, + 0.0625, + 0.125, + 0.25, + 0.53125 + ], + "group_weights": [ + 0.2, + 0.2, + 0.2, + 0.2, + 0.2 + ], + "group_sizes": [ + 512, + 1024, + 2048, + 4096, + 8704 + ], + "k": 160, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6da9ae07871eb1b6c8db4623d28ba7feefba53b9 --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 4.954983465599291, "l1_loss": 193.02208964029947, "l0": 159.0796592018821, "frac_variance_explained": 0.9642201698187626, "cossim": 0.977724425720446, "l2_ratio": 0.9847773006468108, "relative_reconstruction_bias": 1.0048337452339404, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.7032747485420923, "loss_zero": 12.187079458525687, "frac_recovered": 0.9891494313875834, "frac_alive": 0.66717529296875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..11ce46f9f54008a6e45a0433ee4c6c559baaa10f --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2660886927bb35a44af3ac7c4308c5aa438f8d5df6ee102ce7d98995dc94c9cb +size 100734221 diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json new file mode 100644 index 0000000000000000000000000000000000000000..46eb41690b73c7acb988904efb9ab94fd472e90f --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json @@ -0,0 +1,53 @@ +{ + "trainer": { + "trainer_class": "MatryoshkaBatchTopKTrainer", + "dict_class": "MatryoshkaBatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "group_fractions": [ + 0.03125, + 0.0625, + 0.125, + 0.25, + 0.53125 + ], + "group_weights": [ + 0.2, + 0.2, + 0.2, + 0.2, + 0.2 + ], + "group_sizes": [ + 512, + 1024, + 2048, + 4096, + 8704 + ], + "k": 320, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a43a43ab2460bfbf2d92e2666b822c3cd808b23c --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 3.53629664218787, "l1_loss": 326.76424246123344, "l0": 318.4963119969224, "frac_variance_explained": 0.9821194139393893, "cossim": 0.9888840588656339, "l2_ratio": 0.9927890228502678, "relative_reconstruction_bias": 1.002148378979076, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.643577243342544, "loss_zero": 12.187079458525687, "frac_recovered": 0.9958121306968458, "frac_alive": 0.2459716796875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..231ef297f8be55c8c19964dec9bfb5a757f97d60 --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e6040d56b1128db7d560270eecaf952f5b97cd2c012fcac2321ce73976f0578 +size 100734221 diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a25f43a3c84afa5e88cf914d157ce6f8ee129f2 --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json @@ -0,0 +1,53 @@ +{ + "trainer": { + "trainer_class": "MatryoshkaBatchTopKTrainer", + "dict_class": "MatryoshkaBatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 384, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "group_fractions": [ + 0.03125, + 0.0625, + 0.125, + 0.25, + 0.53125 + ], + "group_weights": [ + 0.2, + 0.2, + 0.2, + 0.2, + 0.2 + ], + "group_sizes": [ + 512, + 1024, + 2048, + 4096, + 8704 + ], + "k": 640, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..037e6762491b842d43d5eacb234c66ab8693390f --- /dev/null +++ b/MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 1.4892961689920137, "l1_loss": 721.7255711410985, "l0": 638.8220381303267, "frac_variance_explained": 0.9966207399512782, "cossim": 0.9979786349065376, "l2_ratio": 0.9975055961897878, "relative_reconstruction_bias": 1.0003309213753901, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6123990217844644, "loss_zero": 12.187079458525687, "frac_recovered": 0.9993439009695342, "frac_alive": 0.05511474609375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b30606e43ac598e305a8b23cfe2312c21c8e8be --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88efda2c782a996a9511c0499be5a98e666e0bce366533478099e38a0f6e9abe +size 100733608 diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..cc8afec79d95c6f15093fd5478d6d3bf0fb9ad29 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json @@ -0,0 +1,35 @@ +{ + "trainer": { + "trainer_class": "PAnnealTrainer", + "dict_class": "AutoEncoder", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "sparsity_function": "Lp^p", + "sparsity_penalty": 0.006, + "p_start": 1.0, + "p_end": 0.2, + "anneal_start": 10000, + "sparsity_queue_length": 10, + "n_sparsity_updates": 10, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "resample_steps": null, + "steps": 244140, + "seed": 0, + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c70b350c85c4608cb0b4587c33dcc1e6eb1a98ca --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 3.3622507919748146, "l1_loss": 161.849647154291, "l0": 401.211550609175, "frac_variance_explained": 0.98363655913307, "cossim": 0.9897847663925354, "l2_ratio": 0.9845664928476494, "relative_reconstruction_bias": 0.9970926865037665, "loss_original": 2.591329812285412, "loss_reconstructed": 2.632457171578005, "loss_zero": 12.979128625019488, "frac_recovered": 0.9955474386014134, "frac_alive": 0.65032958984375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..30473e5f446631ff0df1a1d28609dccdf490bd9e --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6b4106db2d65bef813107285bf834ed71eaf5c8fc82879022548cba06677d79 +size 100733608 diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..36a58aef71fc791c531e5525cdc923e8ddcd3862 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json @@ -0,0 +1,35 @@ +{ + "trainer": { + "trainer_class": "PAnnealTrainer", + "dict_class": "AutoEncoder", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "sparsity_function": "Lp^p", + "sparsity_penalty": 0.008, + "p_start": 1.0, + "p_end": 0.2, + "anneal_start": 10000, + "sparsity_queue_length": 10, + "n_sparsity_updates": 10, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "resample_steps": null, + "steps": 244140, + "seed": 0, + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..bff0cf240b9ceba2577b30124040b9f5fe2eed47 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 4.5028303674904695, "l1_loss": 117.82930498238069, "l0": 255.23347252535532, "frac_variance_explained": 0.9703729996480137, "cossim": 0.9814757866313658, "l2_ratio": 0.9766738077244127, "relative_reconstruction_bias": 0.9983646517776581, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6821453894477294, "loss_zero": 12.979128625019488, "frac_recovered": 0.9901571744177715, "frac_alive": 0.65704345703125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f7e48b065e80ee5fa909a0d616aea6b2f11f30a --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dad50bccde025c7a6046b455a88b0e55df0339f49020426d1daa00968c7c3795 +size 100733608 diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2bc78206e48690bf0c7ad88ac0462c5164a3ab32 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json @@ -0,0 +1,35 @@ +{ + "trainer": { + "trainer_class": "PAnnealTrainer", + "dict_class": "AutoEncoder", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "sparsity_function": "Lp^p", + "sparsity_penalty": 0.01, + "p_start": 1.0, + "p_end": 0.2, + "anneal_start": 10000, + "sparsity_queue_length": 10, + "n_sparsity_updates": 10, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "resample_steps": null, + "steps": 244140, + "seed": 0, + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..540f977eb66eacdbe4d1a0af8829f2146041f8ac --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 5.202926089964717, "l1_loss": 93.26969257033015, "l0": 167.94699023143355, "frac_variance_explained": 0.9601707979139075, "cossim": 0.9751063757632152, "l2_ratio": 0.9704936541706682, "relative_reconstruction_bias": 0.9973056086574692, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7267462905631006, "loss_zero": 12.979128625019488, "frac_recovered": 0.9853492372007255, "frac_alive": 0.6607666015625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb888414ff55dbbfeef97c9a84a6f1659df1edec --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daaf6b8be0f0cef67781a9bf4d91cd5d3562fe23cfd66bda390c0cac573b5fd2 +size 100733608 diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3429fc0d9acb39b57f991fa3455e8b89d1662fd4 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json @@ -0,0 +1,35 @@ +{ + "trainer": { + "trainer_class": "PAnnealTrainer", + "dict_class": "AutoEncoder", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "sparsity_function": "Lp^p", + "sparsity_penalty": 0.015, + "p_start": 1.0, + "p_end": 0.2, + "anneal_start": 10000, + "sparsity_queue_length": 10, + "n_sparsity_updates": 10, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "resample_steps": null, + "steps": 244140, + "seed": 0, + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ff9fdd150243a23020546a79f50f302e148f1f75 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.263203859329224, "l1_loss": 64.18408386965832, "l0": 73.46823607295393, "frac_variance_explained": 0.9419336117893816, "cossim": 0.9636158436895853, "l2_ratio": 0.9587060428527464, "relative_reconstruction_bias": 0.9977090491587857, "loss_original": 2.591329812285412, "loss_reconstructed": 2.8357038907257905, "loss_zero": 12.979128625019488, "frac_recovered": 0.973673836653491, "frac_alive": 0.6658935546875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..3659d5978de63ec56024a3335d5709277df7734d --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5903f377dbd7c451f953ade79bdd870a482ef70c2d1619d643f4068d771143cb +size 100733608 diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json new file mode 100644 index 0000000000000000000000000000000000000000..cfaf59f27051d6a233dcc3a6dd5ecba541efbc49 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json @@ -0,0 +1,35 @@ +{ + "trainer": { + "trainer_class": "PAnnealTrainer", + "dict_class": "AutoEncoder", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "sparsity_function": "Lp^p", + "sparsity_penalty": 0.02, + "p_start": 1.0, + "p_end": 0.2, + "anneal_start": 10000, + "sparsity_queue_length": 10, + "n_sparsity_updates": 10, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "resample_steps": null, + "steps": 244140, + "seed": 0, + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a132ebc4bc280e415e77b091260a1f177d57bafe --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.8961237625903395, "l1_loss": 51.952152091336536, "l0": 41.2746576056423, "frac_variance_explained": 0.929740074528269, "cossim": 0.9557553210172308, "l2_ratio": 0.95038376586983, "relative_reconstruction_bias": 0.9986502840576401, "loss_original": 2.591329812285412, "loss_reconstructed": 2.9467881781509124, "loss_zero": 12.979128625019488, "frac_recovered": 0.9618090857942421, "frac_alive": 0.66766357421875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0c2ccdc31f3a0dc2a547da1ab6bceda74471436 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1848070778207251469acda3d933993543de5d68a38194a37ecb2792e5b4f813 +size 100733608 diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0c7871a7475550f23545e239b1b40aad3d032ce4 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json @@ -0,0 +1,35 @@ +{ + "trainer": { + "trainer_class": "PAnnealTrainer", + "dict_class": "AutoEncoder", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "sparsity_function": "Lp^p", + "sparsity_penalty": 0.025, + "p_start": 1.0, + "p_end": 0.2, + "anneal_start": 10000, + "sparsity_queue_length": 10, + "n_sparsity_updates": 10, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "decay_start": 195312, + "resample_steps": null, + "steps": 244140, + "seed": 0, + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f81e7b35f5a9e48dbf77d88d86b97d2d7922a5e2 --- /dev/null +++ b/PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 7.323505956006337, "l1_loss": 45.781148060258616, "l0": 27.82367721235896, "frac_variance_explained": 0.9210811422531864, "cossim": 0.9500051370586258, "l2_ratio": 0.9441123252891632, "relative_reconstruction_bias": 0.9976531385657299, "loss_original": 2.591329812285412, "loss_reconstructed": 3.0392324558223587, "loss_zero": 12.979128625019488, "frac_recovered": 0.951921901070928, "frac_alive": 0.6649169921875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..dda4b6c616aede0d43acccd29febc2ebb6ccc2f5 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:637d7a74d75dfeb5611865e6eefd62af3ff14bd794e71bb23b4b870eb3d66823 +size 100733608 diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..16f852c39d973f982f62e36ac304cc2b9d788fac --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.012, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "StandardTrainerNew-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_6", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4ba1bc994a2f285411f1d0bf09ac6e9ed595b1b8 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 4.413027272166976, "l1_loss": 86.36697617496353, "l0": 693.245662459408, "frac_variance_explained": 0.9715100422681097, "cossim": 0.9828673177454845, "l2_ratio": 0.9533848065927805, "relative_reconstruction_bias": 0.9820587484233351, "loss_original": 2.591329812285412, "loss_reconstructed": 2.683495038963226, "loss_zero": 12.979128625019488, "frac_recovered": 0.9900326175862048, "frac_alive": 0.74920654296875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bbe4302b832bef91c160b1f5a3122b6c797d73d --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4a64e454c445dde7f1db008c35629050bbc1b324f63915a78f6b6bdd0a8d4eb +size 100733608 diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..cf1b8cde6fe988ed27e1ca4b15145b3b45014489 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.015, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "StandardTrainerNew-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_7", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d6488e7c16cd03af3ab5e0ca2cd4f261f6261e --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 4.928134016243808, "l1_loss": 74.24602200611528, "l0": 495.6307437391166, "frac_variance_explained": 0.9644140550889164, "cossim": 0.9784741110830422, "l2_ratio": 0.946569280811103, "relative_reconstruction_bias": 0.9802226797643915, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7142408533268663, "loss_zero": 12.979128625019488, "frac_recovered": 0.9867081437484327, "frac_alive": 0.748779296875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..13b497e7f5d0a5982a60af454dea47e0af2b6de0 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2cc97dfc1553cda52477ef47bbd12bfd3d9b67cd84d00f299f16e760de4b4aa +size 100733608 diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..71825763c18ebd3144e3a958bd8518d834819b87 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.02, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "StandardTrainerNew-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_8", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..03d120debf579edfb258d5ed87b7d38295153443 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 5.58367193463337, "l1_loss": 60.69475557717932, "l0": 309.40750894201807, "frac_variance_explained": 0.9543272741587765, "cossim": 0.9720740648637335, "l2_ratio": 0.937349563980677, "relative_reconstruction_bias": 0.9785354941724295, "loss_original": 2.591329812285412, "loss_reconstructed": 2.765607705317348, "loss_zero": 12.979128625019488, "frac_recovered": 0.9812187724802868, "frac_alive": 0.7474365234375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..9670735a4ecdebc28b78fc517a92281626bf1e0b --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ff951fbe8914ed549cade9cde11e67532a9b930ab2fa67cc99aa2127fabef36 +size 100733608 diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..057b563135847c087bfcbf02006f3297d862fc55 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.03, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "StandardTrainerNew-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_9", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..062d028c32dd73261745b63931b7b2f2d415334a --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.448484552911966, "l1_loss": 46.181850984872106, "l0": 154.64578734248516, "frac_variance_explained": 0.9382677713790571, "cossim": 0.9623248900275633, "l2_ratio": 0.9227385366537485, "relative_reconstruction_bias": 0.9744020757904972, "loss_original": 2.591329812285412, "loss_reconstructed": 2.869060355496694, "loss_zero": 12.979128625019488, "frac_recovered": 0.9701569353241518, "frac_alive": 0.74444580078125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..945bce018d21cccc6325d7a61642d2dfb92ff49f --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abc8a0c9654e2036ee258402caf31b812b0bb67276a18f9aaf728e600751475 +size 100733608 diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d26d902a8ff3b2f84e71e6b1ed95b25d7cf0746 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.04, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "StandardTrainerNew-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_10", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..677885f825544382eed6e99912634f1f7443ded1 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 7.007212920361255, "l1_loss": 38.88443358547716, "l0": 96.43636142776673, "frac_variance_explained": 0.9274044618549118, "cossim": 0.9552818676793432, "l2_ratio": 0.911165978176048, "relative_reconstruction_bias": 0.9716554108154343, "loss_original": 2.591329812285412, "loss_reconstructed": 2.9624105699091072, "loss_zero": 12.979128625019488, "frac_recovered": 0.9601952480264457, "frac_alive": 0.74041748046875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..76d1cbb996c5147a7ec7c24e44ab330448a696eb --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d017a06d11ba8a400b8df31cf9f233ecc02488257dd59e369485bf785732bb2 +size 100733608 diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json new file mode 100644 index 0000000000000000000000000000000000000000..816096af22ba35bf04d9b18cfc8c716645ea9849 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 768, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.06, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 0, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "StandardTrainerNew-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_11", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e1f6f77521dba8b4ddfe4e354c4d67786aa91c32 --- /dev/null +++ b/Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 7.761060510773256, "l1_loss": 31.28638026225998, "l0": 51.94378997618894, "frac_variance_explained": 0.9111872135874737, "cossim": 0.9451250573956823, "l2_ratio": 0.891550215971039, "relative_reconstruction_bias": 0.9651189223829523, "loss_original": 2.591329812285412, "loss_reconstructed": 3.1271837712770485, "loss_zero": 12.979128625019488, "frac_recovered": 0.9424630604594587, "frac_alive": 0.7325439453125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..2823645518bd39dcfdabe846d84461e3d3c5af82 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab84d876da1b82169bb0f62636a8978a0b650df97e68406d5665d72630c56808 +size 100733974 diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b51618f24a577c78b72e258d6a14478a8fe5fb66 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 20, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "TopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b55121944c00ea81d3a9b50571e769bfb521dbea --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 7.10956572041367, "l1_loss": 51.419726747455016, "l0": 19.99874230587121, "frac_variance_explained": 0.9261308095671914, "cossim": 0.9529453786936674, "l2_ratio": 0.9528289130239775, "relative_reconstruction_bias": 1.0015099120862556, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.9800217296137954, "loss_zero": 12.187079458525687, "frac_recovered": 0.9583556218580767, "frac_alive": 0.97100830078125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..67b43e57abffb5f73ae86c66e5c41f2246405cbd --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc713e9ac91a6e19eaf21cea22a5b7163b6a8270d25e589aba4e168dba9917b +size 100733974 diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..aa13445c97a43918a608d8ed0e80d37b90196ddf --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 40, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "TopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..540471563fdf92a4fb779e354a36cfc151df0a76 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.298781915144487, "l1_loss": 65.19652025627367, "l0": 39.99936976577296, "frac_variance_explained": 0.9413592273538763, "cossim": 0.9631030758221945, "l2_ratio": 0.9631146662163012, "relative_reconstruction_bias": 1.0009909868240356, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.843784288926558, "loss_zero": 12.187079458525687, "frac_recovered": 0.9734502001242205, "frac_alive": 0.99285888671875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff756e7971b8c3593470d48b4d5cf03959e5a6d8 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd85a9da075fefc69162072f4024a0a3e17a0040b371a783a117507d21e8bbb0 +size 100733974 diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f0397f73c00248c66c8dd0fc295a0b5849ee2c6 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 80, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "TopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c54e69d95d57b54e2c6bfa307f958be75cfad5ca --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 5.48234257553563, "l1_loss": 88.40633161140211, "l0": 79.98960506554806, "frac_variance_explained": 0.9552180875431407, "cossim": 0.9720208825487079, "l2_ratio": 0.972093170339411, "relative_reconstruction_bias": 1.0002325664867053, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.7563261480042427, "loss_zero": 12.187079458525687, "frac_recovered": 0.9832018993117593, "frac_alive": 0.9927978515625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..08f3f014353c0386033c3c0cb691b3a105dc350d --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08529759e40db392f2e3b7a187bc51c1c033395f9c097975d1368215dbc096af +size 100733974 diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed15e40305f3c40e8f7c52fb5113effe03e84090 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 160, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "TopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3fec1840033465a734debe0f09cae295b138ba43 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 4.540305412176884, "l1_loss": 156.59205766157672, "l0": 159.97946166992188, "frac_variance_explained": 0.9691928065184391, "cossim": 0.9808063272273901, "l2_ratio": 0.9807897011439005, "relative_reconstruction_bias": 1.0003096334861987, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6975065975478203, "loss_zero": 12.187079458525687, "frac_recovered": 0.9897829980561228, "frac_alive": 0.94097900390625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fe1c1b1350daf809563357dedaa3452a0e97b5d --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45446e7dd3803d4145d6aa83dbe4ed406798232863ecbb4ea467cc796988ac41 +size 100733974 diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json new file mode 100644 index 0000000000000000000000000000000000000000..871240ce93caeca19614b54f6d20cf9ffe52d796 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 320, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "TopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e153f12f40f2f0fa9aa9c4a8f053958dd7b6a747 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 3.127272938237046, "l1_loss": 286.77250070282906, "l0": 320.0, "frac_variance_explained": 0.98568051330971, "cossim": 0.9909948363448634, "l2_ratio": 0.9907523158824805, "relative_reconstruction_bias": 0.9999155582803668, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6417208006887725, "loss_zero": 12.187079458525687, "frac_recovered": 0.9960357546806335, "frac_alive": 0.4578857421875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..8acced3f3aee0ad8bcb1d6bc58b09210c89feeaa --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30dc6ca83384efc071536423f981653499aeb1b6d8c34cb812ec4b2f92bd9055 +size 100733974 diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ae6c2d487f9a5cfb9ab442acd13243a5420a5e7f --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 0, + "activation_dim": 768, + "dict_size": 16384, + "k": 640, + "device": "cuda:0", + "layer": 8, + "lm_name": "EleutherAI/pythia-160m-deduped", + "wandb_name": "TopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5", + "submodule_name": "resid_post_layer_8" + }, + "buffer": { + "d_submodule": 768, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 32, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a2bd1080e742cedc8bff7ccc6de470c1ecef6b82 --- /dev/null +++ b/TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 1.6481144536625256, "l1_loss": 607.7699584960938, "l0": 639.8206953568892, "frac_variance_explained": 0.995967478463144, "cossim": 0.9975080237244115, "l2_ratio": 0.9976696462342234, "relative_reconstruction_bias": 0.9993832635157036, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6151728268825645, "loss_zero": 12.187079458525687, "frac_recovered": 0.9990135232607523, "frac_alive": 0.06243896484375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file