Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/eval_results.json +1 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/config.json +26 -0
- gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/eval_results.json +1 -0
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "0",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 20,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 137.7, "l1_loss": 110.15, "l0": 20.0, "frac_variance_explained": 0.06328125, "cossim": 0.2876953125, "l2_ratio": 0.1814453125, "relative_reconstruction_bias": 0.630078125, "loss_original": 2.440642213821411, "loss_reconstructed": 11.407014656066895, "loss_zero": 12.452932643890382, "frac_recovered": 0.10465058535337449, "frac_alive": 0.138454869389534, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "154",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 20,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 93.55, "l1_loss": 238.6, "l0": 20.0, "frac_variance_explained": 0.334375, "cossim": 0.755078125, "l2_ratio": 0.79140625, "relative_reconstruction_bias": 1.01484375, "loss_original": 2.440642213821411, "loss_reconstructed": 4.918577527999878, "loss_zero": 12.452932643890382, "frac_recovered": 0.7525505006313324, "frac_alive": 0.1292317658662796, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "1544",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 20,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 70.4, "l1_loss": 267.2, "l0": 20.0, "frac_variance_explained": 0.633984375, "cossim": 0.871484375, "l2_ratio": 0.87109375, "relative_reconstruction_bias": 1.0, "loss_original": 2.440642213821411, "loss_reconstructed": 3.1963499784469604, "loss_zero": 12.452932643890382, "frac_recovered": 0.9245945453643799, "frac_alive": 0.1436631977558136, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "15440",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 20,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 65.425, "l1_loss": 311.0, "l0": 20.0, "frac_variance_explained": 0.758984375, "cossim": 0.8875, "l2_ratio": 0.890234375, "relative_reconstruction_bias": 1.00078125, "loss_original": 2.440642213821411, "loss_reconstructed": 2.7933643102645873, "loss_zero": 12.452932643890382, "frac_recovered": 0.9648277342319489, "frac_alive": 0.157009556889534, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "48",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 20,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 105.15, "l1_loss": 179.8, "l0": 20.0, "frac_variance_explained": 0.18828125, "cossim": 0.692578125, "l2_ratio": 0.716015625, "relative_reconstruction_bias": 0.9984375, "loss_original": 2.440642213821411, "loss_reconstructed": 6.1452779293060305, "loss_zero": 12.452932643890382, "frac_recovered": 0.6302249014377594, "frac_alive": 0.1695421040058136, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "488",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 20,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 73.85, "l1_loss": 262.3, "l0": 20.0, "frac_variance_explained": 0.6015625, "cossim": 0.857421875, "l2_ratio": 0.85625, "relative_reconstruction_bias": 0.996875, "loss_original": 2.440642213821411, "loss_reconstructed": 3.600706672668457, "loss_zero": 12.452932643890382, "frac_recovered": 0.884125429391861, "frac_alive": 0.131618931889534, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "4882",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 20,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 66.1, "l1_loss": 277.0, "l0": 20.0, "frac_variance_explained": 0.680859375, "cossim": 0.8859375, "l2_ratio": 0.888671875, "relative_reconstruction_bias": 1.00390625, "loss_original": 2.440642213821411, "loss_reconstructed": 2.888194966316223, "loss_zero": 12.452932643890382, "frac_recovered": 0.9553665339946746, "frac_alive": 0.1568467915058136, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "0",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 40,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 142.2, "l1_loss": 218.0, "l0": 40.0, "frac_variance_explained": 0.10546875, "cossim": 0.36953125, "l2_ratio": 0.2482421875, "relative_reconstruction_bias": 0.669921875, "loss_original": 2.440642213821411, "loss_reconstructed": 10.688310146331787, "loss_zero": 12.452932643890382, "frac_recovered": 0.1763722062110901, "frac_alive": 0.212185338139534, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "154",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 40,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 87.2, "l1_loss": 379.6, "l0": 40.0, "frac_variance_explained": 0.449609375, "cossim": 0.7953125, "l2_ratio": 0.8234375, "relative_reconstruction_bias": 1.015625, "loss_original": 2.440642213821411, "loss_reconstructed": 4.139649343490601, "loss_zero": 12.452932643890382, "frac_recovered": 0.830296915769577, "frac_alive": 0.2669813334941864, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "1544",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 40,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 65.55, "l1_loss": 343.8, "l0": 40.0, "frac_variance_explained": 0.665625, "cossim": 0.88984375, "l2_ratio": 0.888671875, "relative_reconstruction_bias": 0.998046875, "loss_original": 2.440642213821411, "loss_reconstructed": 2.863617014884949, "loss_zero": 12.452932643890382, "frac_recovered": 0.9578355431556702, "frac_alive": 0.2782118022441864, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "15440",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 40,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 58.45, "l1_loss": 394.2, "l0": 40.0, "frac_variance_explained": 0.780078125, "cossim": 0.909375, "l2_ratio": 0.911328125, "relative_reconstruction_bias": 1.00234375, "loss_original": 2.440642213821411, "loss_reconstructed": 2.612251806259155, "loss_zero": 12.452932643890382, "frac_recovered": 0.9829114198684692, "frac_alive": 0.29541015625, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "48",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 40,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 108.0, "l1_loss": 338.6, "l0": 40.0, "frac_variance_explained": 0.305078125, "cossim": 0.733203125, "l2_ratio": 0.73984375, "relative_reconstruction_bias": 0.8953125, "loss_original": 2.440642213821411, "loss_reconstructed": 5.098966073989868, "loss_zero": 12.452932643890382, "frac_recovered": 0.7346842169761658, "frac_alive": 0.33349609375, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "488",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 40,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 69.6, "l1_loss": 384.6, "l0": 40.0, "frac_variance_explained": 0.665625, "cossim": 0.872265625, "l2_ratio": 0.8796875, "relative_reconstruction_bias": 1.0078125, "loss_original": 2.440642213821411, "loss_reconstructed": 3.157698321342468, "loss_zero": 12.452932643890382, "frac_recovered": 0.9283924698829651, "frac_alive": 0.2635633647441864, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "4882",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 40,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 61.075, "l1_loss": 350.0, "l0": 40.0, "frac_variance_explained": 0.703125, "cossim": 0.901171875, "l2_ratio": 0.901953125, "relative_reconstruction_bias": 1.000390625, "loss_original": 2.440642213821411, "loss_reconstructed": 2.661836934089661, "loss_zero": 12.452932643890382, "frac_recovered": 0.9779623448848724, "frac_alive": 0.2948133647441864, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "0",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 80,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 133.8, "l1_loss": 406.6, "l0": 80.0, "frac_variance_explained": 0.173828125, "cossim": 0.4671875, "l2_ratio": 0.3470703125, "relative_reconstruction_bias": 0.741796875, "loss_original": 2.440642213821411, "loss_reconstructed": 9.577101516723634, "loss_zero": 12.452932643890382, "frac_recovered": 0.2874674767255783, "frac_alive": 0.314019113779068, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "154",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 80,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 78.3, "l1_loss": 554.4, "l0": 80.0, "frac_variance_explained": 0.56953125, "cossim": 0.839453125, "l2_ratio": 0.851953125, "relative_reconstruction_bias": 1.0046875, "loss_original": 2.440642213821411, "loss_reconstructed": 3.426113796234131, "loss_zero": 12.452932643890382, "frac_recovered": 0.9015444159507752, "frac_alive": 0.5124782919883728, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "1544",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 80,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 61.05, "l1_loss": 524.6, "l0": 80.0, "frac_variance_explained": 0.731640625, "cossim": 0.9046875, "l2_ratio": 0.9078125, "relative_reconstruction_bias": 1.0046875, "loss_original": 2.440642213821411, "loss_reconstructed": 2.71366970539093, "loss_zero": 12.452932643890382, "frac_recovered": 0.9728043735027313, "frac_alive": 0.49365234375, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "15440",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 80,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 54.2, "l1_loss": 536.8, "l0": 80.0, "frac_variance_explained": 0.803125, "cossim": 0.925390625, "l2_ratio": 0.926171875, "relative_reconstruction_bias": 1.003125, "loss_original": 2.440642213821411, "loss_reconstructed": 2.552223062515259, "loss_zero": 12.452932643890382, "frac_recovered": 0.9888993203639984, "frac_alive": 0.4521484375, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "48",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 80,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 91.75, "l1_loss": 515.4, "l0": 80.0, "frac_variance_explained": 0.414453125, "cossim": 0.773828125, "l2_ratio": 0.78984375, "relative_reconstruction_bias": 0.96953125, "loss_original": 2.440642213821411, "loss_reconstructed": 4.03226523399353, "loss_zero": 12.452932643890382, "frac_recovered": 0.841141802072525, "frac_alive": 0.5571831464767456, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "488",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 80,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 69.125, "l1_loss": 497.8, "l0": 80.0, "frac_variance_explained": 0.642578125, "cossim": 0.873046875, "l2_ratio": 0.87734375, "relative_reconstruction_bias": 1.004296875, "loss_original": 2.440642213821411, "loss_reconstructed": 3.0880523681640626, "loss_zero": 12.452932643890382, "frac_recovered": 0.9353146016597748, "frac_alive": 0.5100911259651184, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "4882",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 80,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 54.45, "l1_loss": 593.0, "l0": 80.0, "frac_variance_explained": 0.847265625, "cossim": 0.921875, "l2_ratio": 0.923828125, "relative_reconstruction_bias": 1.0015625, "loss_original": 2.440642213821411, "loss_reconstructed": 2.5707311153411867, "loss_zero": 12.452932643890382, "frac_recovered": 0.9870538175106048, "frac_alive": 0.4774305522441864, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "0",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 160,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 121.8, "l1_loss": 744.8, "l0": 160.0, "frac_variance_explained": 0.2640625, "cossim": 0.56953125, "l2_ratio": 0.4927734375, "relative_reconstruction_bias": 0.8609375, "loss_original": 2.440642213821411, "loss_reconstructed": 6.267248868942261, "loss_zero": 12.452932643890382, "frac_recovered": 0.6180142462253571, "frac_alive": 0.4386935830116272, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "154",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 160,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 69.05, "l1_loss": 742.0, "l0": 160.0, "frac_variance_explained": 0.65234375, "cossim": 0.875390625, "l2_ratio": 0.885546875, "relative_reconstruction_bias": 1.0046875, "loss_original": 2.440642213821411, "loss_reconstructed": 2.9694986820220945, "loss_zero": 12.452932643890382, "frac_recovered": 0.9471549808979034, "frac_alive": 0.7988823652267456, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "1544",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 160,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 59.575, "l1_loss": 769.6, "l0": 160.0, "frac_variance_explained": 0.788671875, "cossim": 0.91484375, "l2_ratio": 0.916796875, "relative_reconstruction_bias": 1.003125, "loss_original": 2.440642213821411, "loss_reconstructed": 2.621287798881531, "loss_zero": 12.452932643890382, "frac_recovered": 0.9820147037506104, "frac_alive": 0.76953125, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TrainerTopK",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0001885618083164127,
|
6 |
+
"steps": "15440",
|
7 |
+
"seed": 0,
|
8 |
+
"activation_dim": 2304,
|
9 |
+
"dict_size": 18432,
|
10 |
+
"k": 160,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11"
|
16 |
+
},
|
17 |
+
"buffer": {
|
18 |
+
"d_submodule": 2304,
|
19 |
+
"io": "out",
|
20 |
+
"n_ctxs": 2000,
|
21 |
+
"ctx_len": 128,
|
22 |
+
"refresh_batch_size": 24,
|
23 |
+
"out_batch_size": 4096,
|
24 |
+
"device": "cuda:0"
|
25 |
+
}
|
26 |
+
}
|
gemma-2-2b_sweep_topk_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 48.3, "l1_loss": 731.6, "l0": 160.0, "frac_variance_explained": 0.833203125, "cossim": 0.93984375, "l2_ratio": 0.9390625, "relative_reconstruction_bias": 0.9984375, "loss_original": 2.440642213821411, "loss_reconstructed": 2.512040066719055, "loss_zero": 12.452932643890382, "frac_recovered": 0.9929002702236176, "frac_alive": 0.6171332597732544, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|