diff --git a/baseline-128x-k128/config.json b/baseline-128x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..22a009689c6b2e655fdc99c34db83c947a08346b --- /dev/null +++ b/baseline-128x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 2946, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 16, "grad_acc_steps": 2, "micro_acc_steps": 1, "stop_steps": 10000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 8000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "baseline-128x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/baseline-128x-k128/layers.10.mlp/cfg.json b/baseline-128x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..ae6faa29f4600acaf050e63cae9a0b010bf51265 --- /dev/null +++ b/baseline-128x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 2946, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-128x-k128/layers.10.mlp/sae.safetensors b/baseline-128x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9dbcf9ad11b618b783c892d8db3b36b3e6e5463d --- /dev/null +++ b/baseline-128x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:793aa1ee37e886b3210bd08102e19f72dfcdef6bb720805553eefc5387f6634d +size 341363360 diff --git a/baseline-128x-k128/layers.15.mlp/cfg.json b/baseline-128x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..ae6faa29f4600acaf050e63cae9a0b010bf51265 --- /dev/null +++ b/baseline-128x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 2946, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-128x-k128/layers.15.mlp/sae.safetensors b/baseline-128x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c0207545ed8f3c0ccfff527ba9a53fa4348ea412 --- /dev/null +++ b/baseline-128x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de9cebcc0c5a85e8a14aaaa1ae869a0a433cce9fdb90b0776896a8ddb10f0a34 +size 341363360 diff --git a/baseline-128x-k128/layers.20.mlp/cfg.json b/baseline-128x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..ae6faa29f4600acaf050e63cae9a0b010bf51265 --- /dev/null +++ b/baseline-128x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 2946, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-128x-k128/layers.20.mlp/sae.safetensors b/baseline-128x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d7c06874acab4e851b71fed6df8d73250c1998a --- /dev/null +++ b/baseline-128x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ba2e9ab845db5bd1d11871b3fbb4a40fd09b2fd1c3e77737a2e27dc504f768 +size 341363360 diff --git a/baseline-128x-k128/lr_scheduler_0.pt b/baseline-128x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..d377bedfe1e4173fc2f10a55133abdb5b9155b71 --- /dev/null +++ b/baseline-128x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7c91a088ea6ea032e6b80395a912fc02957b1f89ffdab30523b0aa45c5dde5b +size 1020 diff --git a/baseline-128x-k128/optimizer_0.pt b/baseline-128x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..73af8ad590674e0e8be0fa04a7a9345fdaf25c84 --- /dev/null +++ b/baseline-128x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9097295520720ab8af201776a2159e142529d4016acfad8b5cdc639ff127bda +size 1023200816 diff --git a/baseline-128x-k128/optimizer_1.pt b/baseline-128x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..be0888f7370c9b953e82bbe8dae541e431355316 --- /dev/null +++ b/baseline-128x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8891d988d9f5f59394457c76541beb13abda955df6a2fa8ca021dfb80b7b1f39 +size 1789112 diff --git a/baseline-128x-k128/rank_0_state.pt b/baseline-128x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6d416c4c99d2e14f58ac69c41b24a601547bdc --- /dev/null +++ b/baseline-128x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c649d647071cf460df941d9ff32dc81790402b26bf9b0b3a5cb5e35a656210 +size 1771319 diff --git a/baseline-128x-k128/state.pt b/baseline-128x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..3eaaa3d92d7e27c98efd0fa0a85d0a9126f947da --- /dev/null +++ b/baseline-128x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dfd1ef6b555df5ad7efb5e9597fba35316c7be30e2e221ea46de8e76592a08b +size 856 diff --git a/baseline-16x-k128/config.json b/baseline-16x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ca556ba7654c8c97fa7459a49cd88370d03c0a9 --- /dev/null +++ b/baseline-16x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 16, "normalize_decoder": true, "num_latents": 0, "k": 1539, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "baseline-16x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/baseline-16x-k128/layers.10.mlp/cfg.json b/baseline-16x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..a31a452a6bad617f85de9fa05c642728d4303ffa --- /dev/null +++ b/baseline-16x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 16, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-16x-k128/layers.10.mlp/sae.safetensors b/baseline-16x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2b9e93d2918821ef8ae045f27ea8894384e47852 --- /dev/null +++ b/baseline-16x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3892de718ae73bb56b8f8e0d0b3617ba1af9a511fc6472a9838333931afa9e1 +size 43834000 diff --git a/baseline-16x-k128/layers.15.mlp/cfg.json b/baseline-16x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..a31a452a6bad617f85de9fa05c642728d4303ffa --- /dev/null +++ b/baseline-16x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 16, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-16x-k128/layers.15.mlp/sae.safetensors b/baseline-16x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..830af344bc2d025db969ea725300282f9d241a5e --- /dev/null +++ b/baseline-16x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fa69afcb6ab83e7b7ae8307c3d87a1e7cf18886cc95782942190e3c16ea3ace +size 43834000 diff --git a/baseline-16x-k128/layers.20.mlp/cfg.json b/baseline-16x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..a31a452a6bad617f85de9fa05c642728d4303ffa --- /dev/null +++ b/baseline-16x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 16, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-16x-k128/layers.20.mlp/sae.safetensors b/baseline-16x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a625990d8c108510287249d29c5b53e0369dd8a9 --- /dev/null +++ b/baseline-16x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30485800adb27bd9975f5bf64627ed19a295128972ee22e9afc4653cbdb97144 +size 43834000 diff --git a/baseline-16x-k128/lr_scheduler_0.pt b/baseline-16x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..15b1a6f9897069305ce459e2bb0a13a278ea2d71 --- /dev/null +++ b/baseline-16x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd0d07701c3ba499d6e8cb644e6586e74a8ad17d38cab9cd0bf72cf61f04cb5 +size 1020 diff --git a/baseline-16x-k128/optimizer_0.pt b/baseline-16x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..aaa14950ce667a64e97058404df1836634d20499 --- /dev/null +++ b/baseline-16x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8815d008f3e428f71d070f111248c65b4bb197c5ff7e6fb17e79b585bc133fe1 +size 131386928 diff --git a/baseline-16x-k128/optimizer_1.pt b/baseline-16x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e2a7e9e59b414d90050b9a26846c4a814b11564 --- /dev/null +++ b/baseline-16x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c149707b13ee860f9e7d176dd7ac1633fd1344e086961e4087a5b0f8c30c12a1 +size 240824 diff --git a/baseline-16x-k128/rank_0_state.pt b/baseline-16x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..28cf51395dbf7a70154124708262360a8ec8ab51 --- /dev/null +++ b/baseline-16x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a549c9fbf3c27871751cf668af3d375a1b7767c3eb2b35581b3cd7376971a245 +size 222967 diff --git a/baseline-16x-k128/state.pt b/baseline-16x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ea4e3fb974a75b287242aa8260fc925119dfffe --- /dev/null +++ b/baseline-16x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241a3af5b7905444d1b0c938b38c55fee6d3d2c2be09cd70c374dc7c6bbf7954 +size 856 diff --git a/baseline-16x-k64/config.json b/baseline-16x-k64/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9280eae1f5701ab480f7142a50d04b117a9d8750 --- /dev/null +++ b/baseline-16x-k64/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 16, "normalize_decoder": true, "num_latents": 0, "k": 1491, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "baseline-16x-k64", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 24} \ No newline at end of file diff --git a/baseline-16x-k64/layers.10.mlp/cfg.json b/baseline-16x-k64/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..eba529fee4bcdd08603a51274a09efcde95aa2ee --- /dev/null +++ b/baseline-16x-k64/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 16, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-16x-k64/layers.10.mlp/sae.safetensors b/baseline-16x-k64/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2a008e8610c664e889ab8cf045e6cb7d0a9c4690 --- /dev/null +++ b/baseline-16x-k64/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d48eae0e58a8f8b47fbbb49fa60271eaffce37b04f8b8009e454c7fabe015f6e +size 43834000 diff --git a/baseline-16x-k64/layers.15.mlp/cfg.json b/baseline-16x-k64/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..eba529fee4bcdd08603a51274a09efcde95aa2ee --- /dev/null +++ b/baseline-16x-k64/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 16, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-16x-k64/layers.15.mlp/sae.safetensors b/baseline-16x-k64/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..34c485742b15b4344d7a5b375c20962361c6e89e --- /dev/null +++ b/baseline-16x-k64/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab216c35d7dc8aaf1c82e024fe7b1c21eff2b12a7f4af59f927b7128a8ea3ac +size 43834000 diff --git a/baseline-16x-k64/layers.20.mlp/cfg.json b/baseline-16x-k64/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..eba529fee4bcdd08603a51274a09efcde95aa2ee --- /dev/null +++ b/baseline-16x-k64/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 16, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-16x-k64/layers.20.mlp/sae.safetensors b/baseline-16x-k64/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ceb0db06cb2fffb29e823cc6840a1df0468ddc0 --- /dev/null +++ b/baseline-16x-k64/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5413673ade586e276a7c044cb017e4a5d06f83a8bac22fe3d518d09f7b2676ce +size 43834000 diff --git a/baseline-16x-k64/lr_scheduler_0.pt b/baseline-16x-k64/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..15b1a6f9897069305ce459e2bb0a13a278ea2d71 --- /dev/null +++ b/baseline-16x-k64/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd0d07701c3ba499d6e8cb644e6586e74a8ad17d38cab9cd0bf72cf61f04cb5 +size 1020 diff --git a/baseline-16x-k64/optimizer_0.pt b/baseline-16x-k64/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..705127e5c3135e3ff0ccf85db917d3f9ae7b4d85 --- /dev/null +++ b/baseline-16x-k64/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb057026e4b2721b14835d78031699e57541c6d20e01981078e40614908c4009 +size 131386928 diff --git a/baseline-16x-k64/optimizer_1.pt b/baseline-16x-k64/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d05f018b854790dfa1657e5f3685b5462990018 --- /dev/null +++ b/baseline-16x-k64/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c95b5224282279c73a1b83679700cff2318c88cdc05aa3dedd2a65e654fe0f7d +size 240824 diff --git a/baseline-16x-k64/rank_0_state.pt b/baseline-16x-k64/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..28cf51395dbf7a70154124708262360a8ec8ab51 --- /dev/null +++ b/baseline-16x-k64/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a549c9fbf3c27871751cf668af3d375a1b7767c3eb2b35581b3cd7376971a245 +size 222967 diff --git a/baseline-16x-k64/state.pt b/baseline-16x-k64/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ea4e3fb974a75b287242aa8260fc925119dfffe --- /dev/null +++ b/baseline-16x-k64/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241a3af5b7905444d1b0c938b38c55fee6d3d2c2be09cd70c374dc7c6bbf7954 +size 856 diff --git a/baseline-32x-k128/config.json b/baseline-32x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..99761176c21a694a1ab9bdc6a688e04a463e8d5f --- /dev/null +++ b/baseline-32x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 1539, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "baseline-32x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/baseline-32x-k128/layers.10.mlp/cfg.json b/baseline-32x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..b56339ff4e414b703c57783bb58258d50ffd2bb5 --- /dev/null +++ b/baseline-32x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-32x-k128/layers.10.mlp/sae.safetensors b/baseline-32x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2933cccb4b3d72fb8af3119aea3af97411b5d2c9 --- /dev/null +++ b/baseline-32x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e5d2ae1d462e1bf6a9f8210d6566b996420c5ce585887917acea1f31753a2a4 +size 86338200 diff --git a/baseline-32x-k128/layers.15.mlp/cfg.json b/baseline-32x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..b56339ff4e414b703c57783bb58258d50ffd2bb5 --- /dev/null +++ b/baseline-32x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-32x-k128/layers.15.mlp/sae.safetensors b/baseline-32x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6901d619c4da63d2ab841831994637364d1cf7f2 --- /dev/null +++ b/baseline-32x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e06c237ce049e55e1c68f8c5b6589ca5e4cc415c1c1bcb2ed577a42a61ccc25c +size 86338200 diff --git a/baseline-32x-k128/layers.20.mlp/cfg.json b/baseline-32x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..b56339ff4e414b703c57783bb58258d50ffd2bb5 --- /dev/null +++ b/baseline-32x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-32x-k128/layers.20.mlp/sae.safetensors b/baseline-32x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dc20bb0d66eb2fd48470d8091ca40f417811ea05 --- /dev/null +++ b/baseline-32x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:249fd0796c562caf9ca9014a1866c3aabcf90eb88ff2e282398808b2f1c605a2 +size 86338200 diff --git a/baseline-32x-k128/lr_scheduler_0.pt b/baseline-32x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..15b1a6f9897069305ce459e2bb0a13a278ea2d71 --- /dev/null +++ b/baseline-32x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd0d07701c3ba499d6e8cb644e6586e74a8ad17d38cab9cd0bf72cf61f04cb5 +size 1020 diff --git a/baseline-32x-k128/optimizer_0.pt b/baseline-32x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc0c6e484b7319aa611d2a6528fa68333b1ade39 --- /dev/null +++ b/baseline-32x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e926abd84c6ac57a220387b39065832038780cf2dd51877e99cb8cbca4dff0 +size 258788912 diff --git a/baseline-32x-k128/optimizer_1.pt b/baseline-32x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..de03bf02299ce1f74bbadd28e2708ca46e2818bd --- /dev/null +++ b/baseline-32x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdd9bd84a6b36c879668d085406271490da21f631d85e0e3c42fa45755fc8e3b +size 462008 diff --git a/baseline-32x-k128/rank_0_state.pt b/baseline-32x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..e51599e5f8cb69d85abadcb2ba4b571cb5ab4144 --- /dev/null +++ b/baseline-32x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e90ee6bac0c6e8e7f7d49a9ecee6ac28119c6154b76ae24b77492e193a36b88e +size 444151 diff --git a/baseline-32x-k128/state.pt b/baseline-32x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ea4e3fb974a75b287242aa8260fc925119dfffe --- /dev/null +++ b/baseline-32x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241a3af5b7905444d1b0c938b38c55fee6d3d2c2be09cd70c374dc7c6bbf7954 +size 856 diff --git a/baseline-32x-k64/config.json b/baseline-32x-k64/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f626c208d225fd6e6d1566d312ecc93310b5bfce --- /dev/null +++ b/baseline-32x-k64/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 1491, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "baseline-32x-k64", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 24} \ No newline at end of file diff --git a/baseline-32x-k64/layers.10.mlp/cfg.json b/baseline-32x-k64/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..4f31a0b964a34fe8df157fe345e49625dd2cd4d1 --- /dev/null +++ b/baseline-32x-k64/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-32x-k64/layers.10.mlp/sae.safetensors b/baseline-32x-k64/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b53beda51b6fc8fed7f2055259c7e8e6d83b692 --- /dev/null +++ b/baseline-32x-k64/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d5682f6a85dc3ab35972ad7625f9ed11988cead989ea1abc1dd28f014def4f6 +size 86338200 diff --git a/baseline-32x-k64/layers.15.mlp/cfg.json b/baseline-32x-k64/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..4f31a0b964a34fe8df157fe345e49625dd2cd4d1 --- /dev/null +++ b/baseline-32x-k64/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-32x-k64/layers.15.mlp/sae.safetensors b/baseline-32x-k64/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6cb934c25667317586339a765dc405e5ce572a29 --- /dev/null +++ b/baseline-32x-k64/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c19ecea8e2efd91962687d1e9b9bd611532ea94c37b5908640bd910161b813ce +size 86338200 diff --git a/baseline-32x-k64/layers.20.mlp/cfg.json b/baseline-32x-k64/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..4f31a0b964a34fe8df157fe345e49625dd2cd4d1 --- /dev/null +++ b/baseline-32x-k64/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-32x-k64/layers.20.mlp/sae.safetensors b/baseline-32x-k64/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..668c00c6af10538609c1fd010c8d5ff7f07505ba --- /dev/null +++ b/baseline-32x-k64/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0366ac3c10934b34819f642ea4e3fa0c01e1ef8cafbdf97e0de8664cd778c239 +size 86338200 diff --git a/baseline-32x-k64/lr_scheduler_0.pt b/baseline-32x-k64/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..86b53d8e22e1c4ad27c6f4c7c2466ec3f57470fd --- /dev/null +++ b/baseline-32x-k64/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d834baf40442747fae5c6cbde6c424301e95e721e39dde07e4ef8dc7d4521f33 +size 1431 diff --git a/baseline-32x-k64/optimizer_0.pt b/baseline-32x-k64/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..f383de72ceee0d02d8b088c40648c9144dc2bdb0 --- /dev/null +++ b/baseline-32x-k64/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1922cfd154008b4c28fd0df9bd2b78502b72e39f9261f2c806b19e150a6665dd +size 258789317 diff --git a/baseline-32x-k64/optimizer_1.pt b/baseline-32x-k64/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..0218d39d261b93b5454024eff07699cd9b2fd4ea --- /dev/null +++ b/baseline-32x-k64/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fbbe91cfad8bedd0283379e22602a57d15152a360335e22980699646a73f3ab +size 462413 diff --git a/baseline-32x-k64/rank_0_state.pt b/baseline-32x-k64/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..a74ed450570521148b49bf8dc19b5e8934d998ec --- /dev/null +++ b/baseline-32x-k64/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:addcbdc2eabca9d0af17ce951a1507f1044489cf835fa576d6d47e217a2535fd +size 444622 diff --git a/baseline-32x-k64/state.pt b/baseline-32x-k64/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7e2dcc9430dc6130114bfcbe8ce9d64ef30b85a --- /dev/null +++ b/baseline-32x-k64/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa523a84d85418107fd8b6545cb6654f796f295dfc25927e975091f1d1bb0d2c +size 1249 diff --git a/baseline-64x-k128/config.json b/baseline-64x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c7ce246d3786ec149d7935b2d90617959021de67 --- /dev/null +++ b/baseline-64x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 2947, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "baseline-64x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/baseline-64x-k128/layers.10.mlp/cfg.json b/baseline-64x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..88e77a6307823d94f12439b2dea26c1da1c6a46e --- /dev/null +++ b/baseline-64x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-64x-k128/layers.10.mlp/sae.safetensors b/baseline-64x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b6858fa8c7063c4958489215f069fa1740dc2b61 --- /dev/null +++ b/baseline-64x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac58ec457626a08c3072e6a463df772ea90d3a8c807ad1cbe8851dacf56b4795 +size 171346584 diff --git a/baseline-64x-k128/layers.15.mlp/cfg.json b/baseline-64x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..88e77a6307823d94f12439b2dea26c1da1c6a46e --- /dev/null +++ b/baseline-64x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-64x-k128/layers.15.mlp/sae.safetensors b/baseline-64x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..121d7f6cdbbbd1bab726e29629eff7741641641a --- /dev/null +++ b/baseline-64x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:467393096c1a77ebded7c051470a29a9c62d1c89f9beacce6b0cbc630b2b5ad1 +size 171346584 diff --git a/baseline-64x-k128/layers.20.mlp/cfg.json b/baseline-64x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..88e77a6307823d94f12439b2dea26c1da1c6a46e --- /dev/null +++ b/baseline-64x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-64x-k128/layers.20.mlp/sae.safetensors b/baseline-64x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce16266e64d4ee4ecac2e7e49ff72c85ee411b6a --- /dev/null +++ b/baseline-64x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39cf9c5eb56222718381a61c97d71f278c69f2603dc79f46dd57b23af7e78128 +size 171346584 diff --git a/baseline-64x-k128/lr_scheduler_0.pt b/baseline-64x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..532c1bb42e509bfeddc5bdf1f9154b5c4bf09846 --- /dev/null +++ b/baseline-64x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe3f6a6e0df78974469de9d9ff865c041c2257a9af228d24b06c2b553f74571 +size 1020 diff --git a/baseline-64x-k128/optimizer_0.pt b/baseline-64x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..36dc55643c743272d38995c84b99a154d800b808 --- /dev/null +++ b/baseline-64x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1941e34348b122722a09174210e7c182b8bb5011b95a94c614a45e4e6068fa15 +size 513592880 diff --git a/baseline-64x-k128/optimizer_1.pt b/baseline-64x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bdeb780d097abda1f5eb74fce7e70aedc79cf2b --- /dev/null +++ b/baseline-64x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978cb6431e9f2874d46c5d30b6e1ab3bd4a1b00994ea47b21b11ebab64557430 +size 904376 diff --git a/baseline-64x-k128/rank_0_state.pt b/baseline-64x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..94456e5567c57064e2dee71c409b8ef8735a8408 --- /dev/null +++ b/baseline-64x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed06e3a9e9abd343f337694615af3c6e15709fcbfcfff1e62ee83ed778ae863 +size 886519 diff --git a/baseline-64x-k128/state.pt b/baseline-64x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b5e32b8b9c3e498e0f53dd833feb60d4221015d --- /dev/null +++ b/baseline-64x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:242a8680bc461f0fe44d225f1a5cd054f84580bd2fc72d83e72f6f31e70e3ef0 +size 856 diff --git a/baseline-64x-k64/config.json b/baseline-64x-k64/config.json new file mode 100644 index 0000000000000000000000000000000000000000..285e2beb8fe31000e6ba5304742e455ea97c9307 --- /dev/null +++ b/baseline-64x-k64/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 2915, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "baseline-64x-k64", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 24} \ No newline at end of file diff --git a/baseline-64x-k64/layers.10.mlp/cfg.json b/baseline-64x-k64/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..6dab98eb48f7dee899f91664abf02774a1d8384d --- /dev/null +++ b/baseline-64x-k64/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-64x-k64/layers.10.mlp/sae.safetensors b/baseline-64x-k64/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1b20408539a566250d573a647e5a9ebb13efaa1e --- /dev/null +++ b/baseline-64x-k64/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92cf9c2838ce407cb95d815c12b5656979b684b15ea3df277baede815473725a +size 171346584 diff --git a/baseline-64x-k64/layers.15.mlp/cfg.json b/baseline-64x-k64/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..6dab98eb48f7dee899f91664abf02774a1d8384d --- /dev/null +++ b/baseline-64x-k64/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-64x-k64/layers.15.mlp/sae.safetensors b/baseline-64x-k64/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1405125a182149724fed3fe3733127c90de4a92b --- /dev/null +++ b/baseline-64x-k64/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ddd96ce6b9c247fb4be1cfa5a1dcc6bc31fc6b5ed763aa6b67ed6c996c2981 +size 171346584 diff --git a/baseline-64x-k64/layers.20.mlp/cfg.json b/baseline-64x-k64/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..6dab98eb48f7dee899f91664abf02774a1d8384d --- /dev/null +++ b/baseline-64x-k64/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "None_", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/baseline-64x-k64/layers.20.mlp/sae.safetensors b/baseline-64x-k64/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0f57f10320dae06328fc889fdb1632406106a7e7 --- /dev/null +++ b/baseline-64x-k64/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af618bf7b99b75e42eaf3901ed2016601fde644e731d110294f59e05f99a1a71 +size 171346584 diff --git a/baseline-64x-k64/lr_scheduler_0.pt b/baseline-64x-k64/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..532c1bb42e509bfeddc5bdf1f9154b5c4bf09846 --- /dev/null +++ b/baseline-64x-k64/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe3f6a6e0df78974469de9d9ff865c041c2257a9af228d24b06c2b553f74571 +size 1020 diff --git a/baseline-64x-k64/optimizer_0.pt b/baseline-64x-k64/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..136cef3ab348a4268b7be2415d80946e0f66bad1 --- /dev/null +++ b/baseline-64x-k64/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:112ebd09e6725d683aa6ed173b01a629b20c0825e215cca4fa3f25f8b0b8e734 +size 513592880 diff --git a/baseline-64x-k64/optimizer_1.pt b/baseline-64x-k64/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..39a5ac59dcfcfcb456cbd98a6e9175ab797e3600 --- /dev/null +++ b/baseline-64x-k64/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec51304150b2bc8d4a4f6b52f3f0f99fcb1466cbf974196629b33edf66b9c20c +size 904376 diff --git a/baseline-64x-k64/rank_0_state.pt b/baseline-64x-k64/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..94456e5567c57064e2dee71c409b8ef8735a8408 --- /dev/null +++ b/baseline-64x-k64/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed06e3a9e9abd343f337694615af3c6e15709fcbfcfff1e62ee83ed778ae863 +size 886519 diff --git a/baseline-64x-k64/state.pt b/baseline-64x-k64/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b5e32b8b9c3e498e0f53dd833feb60d4221015d --- /dev/null +++ b/baseline-64x-k64/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:242a8680bc461f0fe44d225f1a5cd054f84580bd2fc72d83e72f6f31e70e3ef0 +size 856 diff --git a/pkm-128x-k128/config.json b/pkm-128x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a30dcfda89baefc2ba42c0820b4973fb05bed022 --- /dev/null +++ b/pkm-128x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 16, "grad_acc_steps": 2, "micro_acc_steps": 1, "stop_steps": 10000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 8000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-128x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/pkm-128x-k128/layers.10.mlp/cfg.json b/pkm-128x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..6ded44206d50ed127c897311b151661c24f34acb --- /dev/null +++ b/pkm-128x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-128x-k128/layers.10.mlp/sae.safetensors b/pkm-128x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4650b2be61060e9ce8668943e278e3d189bcf2c4 --- /dev/null +++ b/pkm-128x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:157603b5ac12eee58294e416ed7975fa0039a4da975bb3feacb0b41cd3da6f35 +size 172454788 diff --git a/pkm-128x-k128/layers.15.mlp/cfg.json b/pkm-128x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..6ded44206d50ed127c897311b151661c24f34acb --- /dev/null +++ b/pkm-128x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-128x-k128/layers.15.mlp/sae.safetensors b/pkm-128x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..04ba81e030690ac0b3d3f276cb5d3315ffcfe9ea --- /dev/null +++ b/pkm-128x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71a4c0f590509afc43923dff16627bb557637a9415e43386e6571194b1700280 +size 172454788 diff --git a/pkm-128x-k128/layers.20.mlp/cfg.json b/pkm-128x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..6ded44206d50ed127c897311b151661c24f34acb --- /dev/null +++ b/pkm-128x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-128x-k128/layers.20.mlp/sae.safetensors b/pkm-128x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c51f5fb63af9440747b1e14005f0e2efbea09834 --- /dev/null +++ b/pkm-128x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0986694d0dd114ccbfe1d33a00a9852469e9cbca70df27085b623c89237f62c +size 172454788 diff --git a/pkm-128x-k128/lr_scheduler_0.pt b/pkm-128x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..70c4b5ae3c1b31ca53bc85787689b1c021f02b79 --- /dev/null +++ b/pkm-128x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3e9fd145eeeab56cc691b22568ad8915c031607ebf20b147b5682b270a85fa4 +size 1020 diff --git a/pkm-128x-k128/optimizer_0.pt b/pkm-128x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bd60eea6afbe40e99fc790027a04a8f5676be94 --- /dev/null +++ b/pkm-128x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18054c492e6860b501667d983de0b25439802f70e5e1c7cd04a11eb049c514f2 +size 517353008 diff --git a/pkm-128x-k128/optimizer_1.pt b/pkm-128x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6500f1d95a3ccffd800514fdcd857c6dd492d1d --- /dev/null +++ b/pkm-128x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9370c5d21546b992979347f5ba3f00f7203c74a9b710c6e2c91d477d2cbba41f +size 32696 diff --git a/pkm-128x-k128/rank_0_state.pt b/pkm-128x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..e69b9f3cdf32adf7949752361af5afe9f82231cf --- /dev/null +++ b/pkm-128x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1858a81490e176a72c12a3b316e42fe0c7f453d3082debd2ccefd4235e4561c4 +size 1771319 diff --git a/pkm-128x-k128/state.pt b/pkm-128x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..02b6c22a436e23e79f93854b7f45abe91a45a7f4 --- /dev/null +++ b/pkm-128x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f536eb8cc334058db0ba3306f599aa137bff9ffa61f29b5822dd6f31300bd3fb +size 856 diff --git a/pkm-128x-k64/config.json b/pkm-128x-k64/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d80cb932c7f2728c03d162e28e66a81ce1082a24 --- /dev/null +++ b/pkm-128x-k64/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 1490, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 16, "grad_acc_steps": 2, "micro_acc_steps": 1, "stop_steps": 10000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 8000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-128x-k64", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 24} \ No newline at end of file diff --git a/pkm-128x-k64/layers.10.mlp/cfg.json b/pkm-128x-k64/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..25bb138ade5e516eea2a97f7db25c0b5ba0093a0 --- /dev/null +++ b/pkm-128x-k64/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-128x-k64/layers.10.mlp/sae.safetensors b/pkm-128x-k64/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a3b9f2d5a33901c7f01794d84432fa4c1508d062 --- /dev/null +++ b/pkm-128x-k64/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29b8bb8024d1c43ad273e7cbcd1c42a4abb2571c5d5fb5601f74b9e260f56b5c +size 172454788 diff --git a/pkm-128x-k64/layers.15.mlp/cfg.json b/pkm-128x-k64/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..25bb138ade5e516eea2a97f7db25c0b5ba0093a0 --- /dev/null +++ b/pkm-128x-k64/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-128x-k64/layers.15.mlp/sae.safetensors b/pkm-128x-k64/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..06b1ca6f45ca24eae23f2ba7d1dfa00de9d89679 --- /dev/null +++ b/pkm-128x-k64/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59114ca56015ebfb52edad6bed9ebea9d79e63577dcd53584593c773ddef8cb7 +size 172454788 diff --git a/pkm-128x-k64/layers.20.mlp/cfg.json b/pkm-128x-k64/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..25bb138ade5e516eea2a97f7db25c0b5ba0093a0 --- /dev/null +++ b/pkm-128x-k64/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 128, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-128x-k64/layers.20.mlp/sae.safetensors b/pkm-128x-k64/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9f32276b65e21a7ef4f24965c354c3de0b2022f --- /dev/null +++ b/pkm-128x-k64/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba2d75da901db70384976f9ea4b9f93a85f0bc9ead1d6f8bdabbd59291a4f03c +size 172454788 diff --git a/pkm-128x-k64/lr_scheduler_0.pt b/pkm-128x-k64/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..36a78c98f6722749cd05118db66131e5187b8ddf --- /dev/null +++ b/pkm-128x-k64/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef1fc5b66f139920ff6b61c9d651e9e93b997b320cadfeb9ebabc94f4cbad68 +size 1020 diff --git a/pkm-128x-k64/optimizer_0.pt b/pkm-128x-k64/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..58b7acb14138d3b826cdd92e2868e47a04d0eca2 --- /dev/null +++ b/pkm-128x-k64/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a577879e9021288b6fb73184e43db57c153562682ccbb4100fb4453462561bb +size 517353008 diff --git a/pkm-128x-k64/optimizer_1.pt b/pkm-128x-k64/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4a4e899f685d70be5756fd98262019701fb7b9e --- /dev/null +++ b/pkm-128x-k64/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d9767909e3d600999afbf47ee782149e3c6a933b008b8443ac223c26c9d1b6 +size 32696 diff --git a/pkm-128x-k64/rank_0_state.pt b/pkm-128x-k64/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..f44e56437cff15ec0eacfc0ef57e5d5fb5ce6ed5 --- /dev/null +++ b/pkm-128x-k64/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1410df0f38e31df5f09e763f3f7cdab38f68475f1d83fbc5b3c1fbc00ed0716 +size 1771319 diff --git a/pkm-128x-k64/state.pt b/pkm-128x-k64/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..72b8b59525f4fb3dc4c50e77147dd552c1361c3b --- /dev/null +++ b/pkm-128x-k64/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67fd27c20f072718725e322035cde80dfa82004644cf91abb3977df2c4f9dd06 +size 856 diff --git a/pkm-256x-k128/config.json b/pkm-256x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d0a195169f9efb9b260c489923f43efac3afd0e0 --- /dev/null +++ b/pkm-256x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 256, "normalize_decoder": true, "num_latents": 0, "k": 1538, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 4, "grad_acc_steps": 8, "micro_acc_steps": 1, "stop_steps": 40000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 32000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-256x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/pkm-256x-k128/layers.10.mlp/cfg.json b/pkm-256x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..a71e16fcc005aaeb5afe6384399f6082efe3e49f --- /dev/null +++ b/pkm-256x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 256, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-256x-k128/layers.10.mlp/sae.safetensors b/pkm-256x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dfe18e8936e560cc989991671a95e78f803e9a11 --- /dev/null +++ b/pkm-256x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:758f39890dc3f91807491ee5f5aaa156ce561ec80d98ea572093eb7cfbf4969e +size 342841092 diff --git a/pkm-256x-k128/layers.15.mlp/cfg.json b/pkm-256x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..a71e16fcc005aaeb5afe6384399f6082efe3e49f --- /dev/null +++ b/pkm-256x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 256, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-256x-k128/layers.15.mlp/sae.safetensors b/pkm-256x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..87c66cd4e3b9a2d2b8f90e14baa99a2df9f50664 --- /dev/null +++ b/pkm-256x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c1f353009ebea96f322ec3ffdc4135546f1ed5ddea52dc95db9ab6f92b0e4ab +size 342841092 diff --git a/pkm-256x-k128/layers.20.mlp/cfg.json b/pkm-256x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..a71e16fcc005aaeb5afe6384399f6082efe3e49f --- /dev/null +++ b/pkm-256x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 256, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-256x-k128/layers.20.mlp/sae.safetensors b/pkm-256x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b82bd683954bb6580d7cd1c643e9bf7be1f89c12 --- /dev/null +++ b/pkm-256x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:972d3b21f8de1951b91bd35a30289c593e933c64b8ce572f251c4c4ca2a0d4e8 +size 342841092 diff --git a/pkm-256x-k128/lr_scheduler_0.pt b/pkm-256x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..03603e3268ed5bfdf0ff7c5abc0f7aaa01b4e1fd --- /dev/null +++ b/pkm-256x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a751e40040a573a5bee1650a8ca8953e8870748aff4955fe7e45d3c0307346d +size 1020 diff --git a/pkm-256x-k128/optimizer_0.pt b/pkm-256x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..44c773fcad47a5e78790231fce2eccf91d3d911a --- /dev/null +++ b/pkm-256x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4421fb258b91ef0b9bb4f0984729abcc0c24386efa8e7f0c61b367cd6fd82923 +size 1028509232 diff --git a/pkm-256x-k128/optimizer_1.pt b/pkm-256x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..52baaec71b858538da2a68d945a550a0e02b7aa7 --- /dev/null +++ b/pkm-256x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daa6850958e604d73852f8798b561f67935a6a8c39fc85e288f928ce87eb89ef +size 38072 diff --git a/pkm-256x-k128/rank_0_state.pt b/pkm-256x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc3c207acf7ef59d6bc26d31b55c228f272162c5 --- /dev/null +++ b/pkm-256x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60af2a7ec3d6064c9be4b5dfe0a3dd6dfbd3cd15d8bd185947fe78a7f683cf77 +size 3540791 diff --git a/pkm-256x-k128/state.pt b/pkm-256x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..1088569859992cae76d3bc8d03bbcf7d6aeae2ac --- /dev/null +++ b/pkm-256x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7beec48e9886b4f07208cd1d0a75d8dfe6d6de10e55cc43cc3a42af74894129d +size 856 diff --git a/pkm-256x-k64/config.json b/pkm-256x-k64/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4579da6c4bdd1183c87c0450b0fd415091d7bbba --- /dev/null +++ b/pkm-256x-k64/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 256, "normalize_decoder": true, "num_latents": 0, "k": 1490, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 4, "grad_acc_steps": 8, "micro_acc_steps": 1, "stop_steps": 40000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 32000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-256x-k64", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 24} \ No newline at end of file diff --git a/pkm-256x-k64/layers.10.mlp/cfg.json b/pkm-256x-k64/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..128d235464318095b43227ed4379cd5acdeb580f --- /dev/null +++ b/pkm-256x-k64/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 256, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-256x-k64/layers.10.mlp/sae.safetensors b/pkm-256x-k64/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..616e24da459220848e60588c2614469782058685 --- /dev/null +++ b/pkm-256x-k64/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b91736048002d2fa00acf85543c333737c3f6bb9bf0b93514cd8a4a8ba4db8 +size 342841092 diff --git a/pkm-256x-k64/layers.15.mlp/cfg.json b/pkm-256x-k64/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..128d235464318095b43227ed4379cd5acdeb580f --- /dev/null +++ b/pkm-256x-k64/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 256, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-256x-k64/layers.15.mlp/sae.safetensors b/pkm-256x-k64/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2f600039aa4ab70e92d6c0ffa705838ae24e175b --- /dev/null +++ b/pkm-256x-k64/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c108c90555768b2a16ead9684b57daf3013e87b6e9176d6e24d2c8b8ae71edb +size 342841092 diff --git a/pkm-256x-k64/layers.20.mlp/cfg.json b/pkm-256x-k64/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..128d235464318095b43227ed4379cd5acdeb580f --- /dev/null +++ b/pkm-256x-k64/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 256, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-256x-k64/layers.20.mlp/sae.safetensors b/pkm-256x-k64/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e975b545b9412b8366a262ed03d621d6f84c720f --- /dev/null +++ b/pkm-256x-k64/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef171faa0fa34b634bb18b4e08cc8c5024141d2d04bb1e70014c0b57e7c6ac7 +size 342841092 diff --git a/pkm-256x-k64/lr_scheduler_0.pt b/pkm-256x-k64/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..823e400a1e2a18bdea6ede945d0787c3935f4476 --- /dev/null +++ b/pkm-256x-k64/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaca5b57304f797e039b2c10dcc3a645d8644160413a00ad3927886c9798a78c +size 1020 diff --git a/pkm-256x-k64/optimizer_0.pt b/pkm-256x-k64/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..570152464bc9d915cafd0903cd83bebf405c6117 --- /dev/null +++ b/pkm-256x-k64/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7f4548a3b6f18380eef218c0c9f09311bb46917d53a27a3172d8c528408864f +size 1028509232 diff --git a/pkm-256x-k64/optimizer_1.pt b/pkm-256x-k64/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cd0099edd6323155a4f027e618801581ebb7b92 --- /dev/null +++ b/pkm-256x-k64/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48f6d430d02fe8f55bb4b3bb017c6a84a406b8575d826561639a3eca8c0d9163 +size 38072 diff --git a/pkm-256x-k64/rank_0_state.pt b/pkm-256x-k64/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..e809dc572e92d0f5b49d129d63aee6fab2d3f624 --- /dev/null +++ b/pkm-256x-k64/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f27cbfa0ccefe12959e19494579893ff179a2395314c989d17457fb0d2f096 +size 3540791 diff --git a/pkm-256x-k64/state.pt b/pkm-256x-k64/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..1088569859992cae76d3bc8d03bbcf7d6aeae2ac --- /dev/null +++ b/pkm-256x-k64/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7beec48e9886b4f07208cd1d0a75d8dfe6d6de10e55cc43cc3a42af74894129d +size 856 diff --git a/pkm-32x-k128/config.json b/pkm-32x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f39680082f3e44cde1a772db068b9a095017ac22 --- /dev/null +++ b/pkm-32x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 131, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-32x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/pkm-32x-k128/layers.10.mlp/cfg.json b/pkm-32x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..22e6d37171144ab2981353d70d14d5d46b7ed09c --- /dev/null +++ b/pkm-32x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-32x-k128/layers.10.mlp/sae.safetensors b/pkm-32x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..87a179afefc51ae553e50c53fd2ebbc945470955 --- /dev/null +++ b/pkm-32x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e752b24c66461aeb41021af7bd637789103576d8246a42fd1236a78233626dd +size 44425012 diff --git a/pkm-32x-k128/layers.15.mlp/cfg.json b/pkm-32x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..22e6d37171144ab2981353d70d14d5d46b7ed09c --- /dev/null +++ b/pkm-32x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-32x-k128/layers.15.mlp/sae.safetensors b/pkm-32x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dc840f8e6f99beeea682bf8e8b4ca6d32393d098 --- /dev/null +++ b/pkm-32x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc1d90bf32cd24eebf7de4e0f17b5eae90002b4332bcd5568b58d40048c2f905 +size 44425012 diff --git a/pkm-32x-k128/layers.20.mlp/cfg.json b/pkm-32x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..22e6d37171144ab2981353d70d14d5d46b7ed09c --- /dev/null +++ b/pkm-32x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-32x-k128/layers.20.mlp/sae.safetensors b/pkm-32x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0961338da2fa8dfaf9e01ff9a9a2b291bdaa7176 --- /dev/null +++ b/pkm-32x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb347b00d366a9ab9694e5955cfb7839d77e0eb9c5bdb224b9e9713365871c8 +size 44425012 diff --git a/pkm-32x-k128/lr_scheduler_0.pt b/pkm-32x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..58e539aca7ba5e60ea2f4edaf79a1691559bdc11 --- /dev/null +++ b/pkm-32x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:974d02a514cd88545cc0ccf016241fd8975718a01cc21418ebb308e029309749 +size 1020 diff --git a/pkm-32x-k128/optimizer_0.pt b/pkm-32x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0706520461372ea26be270d3f2e8c64e491b469 --- /dev/null +++ b/pkm-32x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba2a9226f6b3ed1a585c59a28221ad846725a745cccfe369ff3ace39360336ca +size 133266992 diff --git a/pkm-32x-k128/optimizer_1.pt b/pkm-32x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbec4f15fb4e375287e1cbb0534f5101429bdab8 --- /dev/null +++ b/pkm-32x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ec114bd9c8abfce5a153b45367295712f3aaed9b656bd015d21ef9073a0517a +size 26168 diff --git a/pkm-32x-k128/rank_0_state.pt b/pkm-32x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..e51599e5f8cb69d85abadcb2ba4b571cb5ab4144 --- /dev/null +++ b/pkm-32x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e90ee6bac0c6e8e7f7d49a9ecee6ac28119c6154b76ae24b77492e193a36b88e +size 444151 diff --git a/pkm-32x-k128/state.pt b/pkm-32x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f6d46f534b71a3058f9f95baacd3d1522325a95 --- /dev/null +++ b/pkm-32x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64c69df589c14c83e6c58325788b89ad8f95adbc7f968444e8430fc84c418e89 +size 856 diff --git a/pkm-32x-k64/config.json b/pkm-32x-k64/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f7e27e3a2a9b81035228b1cd980f8102e27413d5 --- /dev/null +++ b/pkm-32x-k64/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-32x-k64", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 24} \ No newline at end of file diff --git a/pkm-32x-k64/layers.10.mlp/cfg.json b/pkm-32x-k64/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..286ac1849ed3b791dfe17bdbde1d8d3a7cde4460 --- /dev/null +++ b/pkm-32x-k64/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-32x-k64/layers.10.mlp/sae.safetensors b/pkm-32x-k64/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5a67e0b01b589bb8a5afec6ca7b7b5ff3f3598d4 --- /dev/null +++ b/pkm-32x-k64/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd2476636a1686d6e981dac090cc6930fda85228153fbec9c4b69712c42f30f8 +size 44425012 diff --git a/pkm-32x-k64/layers.15.mlp/cfg.json b/pkm-32x-k64/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..286ac1849ed3b791dfe17bdbde1d8d3a7cde4460 --- /dev/null +++ b/pkm-32x-k64/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-32x-k64/layers.15.mlp/sae.safetensors b/pkm-32x-k64/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5734ff15a4e78cb3e11faf164898ed056bbdee16 --- /dev/null +++ b/pkm-32x-k64/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3c7d3260c726776e86d222c72e1311d1ed8a7d4743035a1a512c3e903d9bf7 +size 44425012 diff --git a/pkm-32x-k64/layers.20.mlp/cfg.json b/pkm-32x-k64/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..286ac1849ed3b791dfe17bdbde1d8d3a7cde4460 --- /dev/null +++ b/pkm-32x-k64/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 32, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-32x-k64/layers.20.mlp/sae.safetensors b/pkm-32x-k64/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1423a860f4faf616788f71df5bdfe2faa6210c55 --- /dev/null +++ b/pkm-32x-k64/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb0d67315c056d4d1f1f4b1ed1cabef7a455a434b54fdd8609df46198fd4704b +size 44425012 diff --git a/pkm-32x-k64/lr_scheduler_0.pt b/pkm-32x-k64/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6ad5f37aed94735a798743dbd990fa94b6be084 --- /dev/null +++ b/pkm-32x-k64/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7de4c12587d60631b5cff6c042fdfe6d606f111ca4d6e288ba467f5fa8d4ef21 +size 1431 diff --git a/pkm-32x-k64/optimizer_0.pt b/pkm-32x-k64/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a071574714dbeea1ba91deac42e69723e9ba376 --- /dev/null +++ b/pkm-32x-k64/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3960ac7e5e74b6822139af275ced89738841aa231a8804082cb4c308c2ec9053 +size 133267397 diff --git a/pkm-32x-k64/optimizer_1.pt b/pkm-32x-k64/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c46056534eec763ba90c2d61f84b4f41ce5a7f5 --- /dev/null +++ b/pkm-32x-k64/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7439dc051377e8c282a94cc0aa7f1737ef6640f332cfcbbe0f548999f8db0bf3 +size 26573 diff --git a/pkm-32x-k64/rank_0_state.pt b/pkm-32x-k64/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..606104158fb8b473194e360be963265553807023 --- /dev/null +++ b/pkm-32x-k64/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f432de8c82603081bde444cb89fd77c7ce676c0aae239b504d56f36b04a3d21 +size 444622 diff --git a/pkm-32x-k64/state.pt b/pkm-32x-k64/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..0552809d01a2730f9ddd77afade9f34f5d478d68 --- /dev/null +++ b/pkm-32x-k64/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63998f6ecb076120ab5b4a54a24fd2a7a001c7be8634fc6ccd9d6f6199117d6e +size 1249 diff --git a/pkm-48x-k128/config.json b/pkm-48x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b07acc58009d94e72a7e575fedc517a212704b7c --- /dev/null +++ b/pkm-48x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 48, "normalize_decoder": true, "num_latents": 0, "k": 131, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-48x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/pkm-48x-k128/layers.10.mlp/cfg.json b/pkm-48x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..8283fcce2b72f84622f71dcdd79ef2f7b45ab505 --- /dev/null +++ b/pkm-48x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 48, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-48x-k128/layers.10.mlp/sae.safetensors b/pkm-48x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..71611bf9fa5866028a48f63aaaa05c742ed4d1d5 --- /dev/null +++ b/pkm-48x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae2a9e69c973abdd4fbd24ad24fb2621f285e3712c78fef1b8b72667ac7c3dcd +size 65801772 diff --git a/pkm-48x-k128/layers.15.mlp/cfg.json b/pkm-48x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..8283fcce2b72f84622f71dcdd79ef2f7b45ab505 --- /dev/null +++ b/pkm-48x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 48, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-48x-k128/layers.15.mlp/sae.safetensors b/pkm-48x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..699ac89cb108fe12106a2c69beebf1333553ed93 --- /dev/null +++ b/pkm-48x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab63643a292993cf8388687cac6336df1c87474ed860fa923a4a69613496d3c +size 65801772 diff --git a/pkm-48x-k128/layers.20.mlp/cfg.json b/pkm-48x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..8283fcce2b72f84622f71dcdd79ef2f7b45ab505 --- /dev/null +++ b/pkm-48x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 48, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-48x-k128/layers.20.mlp/sae.safetensors b/pkm-48x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b9af4b8d2ea6a30f33b8e93916a151d0ffd5e3a --- /dev/null +++ b/pkm-48x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e9c42aa5baea56de18167e02dea1b6581fc51362f85b2b1fa95aa5f3b575012 +size 65801772 diff --git a/pkm-48x-k128/lr_scheduler_0.pt b/pkm-48x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..58e539aca7ba5e60ea2f4edaf79a1691559bdc11 --- /dev/null +++ b/pkm-48x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:974d02a514cd88545cc0ccf016241fd8975718a01cc21418ebb308e029309749 +size 1020 diff --git a/pkm-48x-k128/optimizer_0.pt b/pkm-48x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4607b15c7c0b84fd6ff0592c74e9e6e5cf8a978 --- /dev/null +++ b/pkm-48x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97382a99fcb5e037f9d01f47720eb0ec9f47b471c3cefb22ed9cbf90b0c61918 +size 197396528 diff --git a/pkm-48x-k128/optimizer_1.pt b/pkm-48x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca472fe3ee52f86867efec7a1cfd240b5ef4f128 --- /dev/null +++ b/pkm-48x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abf0f37fa49be7a910dbb89b4d27a35bf9e38e61e33be633f634f68c7cb3ce6e +size 27320 diff --git a/pkm-48x-k128/rank_0_state.pt b/pkm-48x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..12306d8d3c5357c816ed7ec74f9ad4d3a2ed07b3 --- /dev/null +++ b/pkm-48x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720f2dcbb4b4261d01f2e805e9582d3a47345cd51670f820fa0e38365ff66f26 +size 665335 diff --git a/pkm-48x-k128/state.pt b/pkm-48x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f6d46f534b71a3058f9f95baacd3d1522325a95 --- /dev/null +++ b/pkm-48x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64c69df589c14c83e6c58325788b89ad8f95adbc7f968444e8430fc84c418e89 +size 856 diff --git a/pkm-48x-k64/config.json b/pkm-48x-k64/config.json new file mode 100644 index 0000000000000000000000000000000000000000..97463168549546ceef62cea0299a9ab7b00439d2 --- /dev/null +++ b/pkm-48x-k64/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 48, "normalize_decoder": true, "num_latents": 0, "k": 1491, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-48x-k64", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 24} \ No newline at end of file diff --git a/pkm-48x-k64/layers.10.mlp/cfg.json b/pkm-48x-k64/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..7fe52e216d25cc3ff732c2f95319740ccc8d9d1d --- /dev/null +++ b/pkm-48x-k64/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 48, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-48x-k64/layers.10.mlp/sae.safetensors b/pkm-48x-k64/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a393fdb8ef1fa5f1d48f129468ec8490a209407a --- /dev/null +++ b/pkm-48x-k64/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74c69d906123a3173c4768272b1ee323e4469827ccc04aa430110c4b9718aeab +size 65801772 diff --git a/pkm-48x-k64/layers.15.mlp/cfg.json b/pkm-48x-k64/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..7fe52e216d25cc3ff732c2f95319740ccc8d9d1d --- /dev/null +++ b/pkm-48x-k64/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 48, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-48x-k64/layers.15.mlp/sae.safetensors b/pkm-48x-k64/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8dc99a429bba94b9fa61ee2b43b30a5ec9bc9508 --- /dev/null +++ b/pkm-48x-k64/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682f879fe52caa629b6cd1b535a6e995fb64961c01c2ef619dcbbff9f14c0cbc +size 65801772 diff --git a/pkm-48x-k64/layers.20.mlp/cfg.json b/pkm-48x-k64/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..7fe52e216d25cc3ff732c2f95319740ccc8d9d1d --- /dev/null +++ b/pkm-48x-k64/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 48, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-48x-k64/layers.20.mlp/sae.safetensors b/pkm-48x-k64/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0c3c27b98a05598e62d5df458eaab0338379bb7b --- /dev/null +++ b/pkm-48x-k64/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048b940e277a272792922058d65a997271612c6bce87e7f406c93c61d945423f +size 65801772 diff --git a/pkm-48x-k64/lr_scheduler_0.pt b/pkm-48x-k64/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..86b53d8e22e1c4ad27c6f4c7c2466ec3f57470fd --- /dev/null +++ b/pkm-48x-k64/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d834baf40442747fae5c6cbde6c424301e95e721e39dde07e4ef8dc7d4521f33 +size 1431 diff --git a/pkm-48x-k64/optimizer_0.pt b/pkm-48x-k64/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..60f8356aac9d913012fbed34c30f8679a628c31b --- /dev/null +++ b/pkm-48x-k64/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfe2f2987e11cb5ad447592d8a332ae99811bd21675b7b21136bb22c83466014 +size 197396933 diff --git a/pkm-48x-k64/optimizer_1.pt b/pkm-48x-k64/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..6feba159b820a487da46e8d01135943b3bc9243c --- /dev/null +++ b/pkm-48x-k64/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ffb389dcbd2c22a6b574833ff2d60c9bbcb3f803731e5f6287abc0be6ce930e +size 27725 diff --git a/pkm-48x-k64/rank_0_state.pt b/pkm-48x-k64/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..4295da63a41b4443180e53f96c6473ecf473d00a --- /dev/null +++ b/pkm-48x-k64/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:212d2f10cbec058a0f27bb194f55e9dbec2991d52efe508c662f6cce7acd42b9 +size 665806 diff --git a/pkm-48x-k64/state.pt b/pkm-48x-k64/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7e2dcc9430dc6130114bfcbe8ce9d64ef30b85a --- /dev/null +++ b/pkm-48x-k64/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa523a84d85418107fd8b6545cb6654f796f295dfc25927e975091f1d1bb0d2c +size 1249 diff --git a/pkm-512x-k128/config.json b/pkm-512x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..27f52dedeb3d6ebb6780f5ea1fc6903eaf74663e --- /dev/null +++ b/pkm-512x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 512, "normalize_decoder": true, "num_latents": 0, "k": 1537, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 2, "grad_acc_steps": 16, "micro_acc_steps": 1, "stop_steps": 80000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 64000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-512x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/pkm-512x-k128/layers.10.mlp/cfg.json b/pkm-512x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..a27ca35560ba425d117b2bf446755b0e07b21dda --- /dev/null +++ b/pkm-512x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 512, "normalize_decoder": true, "num_latents": 0, "k": 1537, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-512x-k128/layers.10.mlp/sae.safetensors b/pkm-512x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..abfda14a85bc2d2986427f45086942dfce1d0cf0 --- /dev/null +++ b/pkm-512x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a192a40eef09f85617ed536d1e440c29e9245d9d806a4bd05b0a0ede1ef9ad99 +size 683318276 diff --git a/pkm-512x-k128/layers.15.mlp/cfg.json b/pkm-512x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..a27ca35560ba425d117b2bf446755b0e07b21dda --- /dev/null +++ b/pkm-512x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 512, "normalize_decoder": true, "num_latents": 0, "k": 1537, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-512x-k128/layers.15.mlp/sae.safetensors b/pkm-512x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b95a8421d5b28739163982f8718b0782e2a2a55 --- /dev/null +++ b/pkm-512x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:553abeb034415fba59480969a67de0c084481cce53b75c68d7a9e0520a213f81 +size 683318276 diff --git a/pkm-512x-k128/layers.20.mlp/cfg.json b/pkm-512x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..a27ca35560ba425d117b2bf446755b0e07b21dda --- /dev/null +++ b/pkm-512x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 512, "normalize_decoder": true, "num_latents": 0, "k": 1537, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-512x-k128/layers.20.mlp/sae.safetensors b/pkm-512x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d180844e14d5ef12740a65164011c097079b4428 --- /dev/null +++ b/pkm-512x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3df843fc41c6f7b2fb20b49f8ec97388cb2a4ce8a67435f578e6e80b5be6a82 +size 683318276 diff --git a/pkm-512x-k128/lr_scheduler_0.pt b/pkm-512x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..42f49440ccd355f3d9db76560eaac148aea3947c --- /dev/null +++ b/pkm-512x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdd5a041512b51efd22767ffadf1e68c8649edafb3d64ea600d41b329339827b +size 1020 diff --git a/pkm-512x-k128/optimizer_0.pt b/pkm-512x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..b45518fcd94fbac3fc36373ffc93836e7e9baee7 --- /dev/null +++ b/pkm-512x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8adda488f55cae2c0c9f85d1a2f33072b2d876fb272a858f1207f406601926c0 +size 2049936944 diff --git a/pkm-512x-k128/optimizer_1.pt b/pkm-512x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cf6e9c032f19bd82a902fc0a5c7d4b4e1eab4c0 --- /dev/null +++ b/pkm-512x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f77b2f0bc37d27cb4c18f13e0aa1854fa711e0066544fd033c9c3c556d00653 +size 45752 diff --git a/pkm-512x-k128/rank_0_state.pt b/pkm-512x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..23ef49b06e8a81734a4048bd53233c971f89e850 --- /dev/null +++ b/pkm-512x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:062f1f6873f367478329d82b6fdc5354ce4b7067dffbbfc953ef94eda2819f98 +size 7079735 diff --git a/pkm-512x-k128/state.pt b/pkm-512x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..2895c060379bbdbae659f93affbd0c3f31d74cab --- /dev/null +++ b/pkm-512x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14cf0464b66120dc648bcb42656579fb663272f33d97406e93373b7da0037d00 +size 856 diff --git a/pkm-64x-k128/config.json b/pkm-64x-k128/config.json new file mode 100644 index 0000000000000000000000000000000000000000..518e3580b1284b91e492193d1b67f484a811d216 --- /dev/null +++ b/pkm-64x-k128/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-64x-k128", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 48} \ No newline at end of file diff --git a/pkm-64x-k128/layers.10.mlp/cfg.json b/pkm-64x-k128/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..2ac97d7c794fd958d1a71690cfa0334e1fbf5960 --- /dev/null +++ b/pkm-64x-k128/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-64x-k128/layers.10.mlp/sae.safetensors b/pkm-64x-k128/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..13849b0d337127826868603b941bac4f55a9e1f6 --- /dev/null +++ b/pkm-64x-k128/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfea97938e24b8c211fded447b9f11c836727286482441a7a283d3532e8880c0 +size 87150836 diff --git a/pkm-64x-k128/layers.15.mlp/cfg.json b/pkm-64x-k128/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..2ac97d7c794fd958d1a71690cfa0334e1fbf5960 --- /dev/null +++ b/pkm-64x-k128/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-64x-k128/layers.15.mlp/sae.safetensors b/pkm-64x-k128/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bcdfad748499a0173d9a81ef2b27778dc39bc9bd --- /dev/null +++ b/pkm-64x-k128/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:087b23e56de897a897bb563ab37602e8a8ccec99545daed37e68298a9df54ad8 +size 87150836 diff --git a/pkm-64x-k128/layers.20.mlp/cfg.json b/pkm-64x-k128/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..2ac97d7c794fd958d1a71690cfa0334e1fbf5960 --- /dev/null +++ b/pkm-64x-k128/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-64x-k128/layers.20.mlp/sae.safetensors b/pkm-64x-k128/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aadf0781269340d6b7438b2923837c28327d0c8b --- /dev/null +++ b/pkm-64x-k128/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:882b509357494fea387da8a382b9fc5bf64040a61fc4d4450a26373d6fa4e40c +size 87150836 diff --git a/pkm-64x-k128/lr_scheduler_0.pt b/pkm-64x-k128/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..59e3c1e166119f19c76fd4471b64d0390bccbd0f --- /dev/null +++ b/pkm-64x-k128/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a4df8929043fb69ba02c3a1816a7951a5f5e986bc1cbedef5a14c63f80e8aa +size 1020 diff --git a/pkm-64x-k128/optimizer_0.pt b/pkm-64x-k128/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c9f57e7de172904ef0256fd7e520e1399977d53 --- /dev/null +++ b/pkm-64x-k128/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a224b3e87ead235becfca0330559f62c8d86aa0fc5f97a7222a6da4981be882 +size 261443120 diff --git a/pkm-64x-k128/optimizer_1.pt b/pkm-64x-k128/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..504506d699af30a98cfd3bf7e16ff28a51249a30 --- /dev/null +++ b/pkm-64x-k128/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:744989902d884c435aec160d8a1d4fc964d11fb9bfe6ebf4a1dd28fbbc6ad150 +size 28856 diff --git a/pkm-64x-k128/rank_0_state.pt b/pkm-64x-k128/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..711e83b123716b46f7087056f1a2bde1488deea8 --- /dev/null +++ b/pkm-64x-k128/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70c38cfaa8b9251613a635353e7682c09e3f08bd3729a39fe946496d26310f1c +size 886519 diff --git a/pkm-64x-k128/state.pt b/pkm-64x-k128/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..928677e02c637686e7c84eebde68c52449a340df --- /dev/null +++ b/pkm-64x-k128/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59869c974db95128370f750f59e530d1a4db506a0e7e8c4374f7343dd0aebe95 +size 856 diff --git a/pkm-64x-k64/config.json b/pkm-64x-k64/config.json new file mode 100644 index 0000000000000000000000000000000000000000..94a77a1ad4d7b9eed161fe3172f3ed4e503f6112 --- /dev/null +++ b/pkm-64x-k64/config.json @@ -0,0 +1 @@ +{"sae": {"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 1491, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}}, "batch_size": 32, "grad_acc_steps": 1, "micro_acc_steps": 1, "stop_steps": 5000, "loss_fn": "fvu", "optimizer": "muon", "lr": 0.0008, "lr_warmup_steps": 1000, "k_decay_steps": 4000, "auxk_alpha": 0.0, "dead_feature_threshold": 10000000, "hookpoints": ["layers.10.mlp", "layers.15.mlp", "layers.20.mlp"], "init_seeds": [0], "layers": [], "layer_stride": 1, "distribute_modules": false, "save_every": 1000, "log_to_wandb": true, "run_name": "pkm-64x-k64", "wandb_log_frequency": 1, "wandb_project": "sparsify", "model": "HuggingFaceTB/SmolLM2-135M", "dataset": "EleutherAI/fineweb-edu-dedup-10b", "split": "train", "ctx_len": 2048, "hf_token": null, "revision": null, "load_in_8bit": false, "max_examples": null, "resume": false, "text_column": "text", "finetune": null, "shuffle_seed": 42, "data_preprocessing_num_proc": 24} \ No newline at end of file diff --git a/pkm-64x-k64/layers.10.mlp/cfg.json b/pkm-64x-k64/layers.10.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..484ed9025746fce27826bb50483378bab35100cf --- /dev/null +++ b/pkm-64x-k64/layers.10.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-64x-k64/layers.10.mlp/sae.safetensors b/pkm-64x-k64/layers.10.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9632182e65fd90fbb48b806a4193780e76f136 --- /dev/null +++ b/pkm-64x-k64/layers.10.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c0449a9cb064cfa1376b20207c9d682225ff9772d974a88d8854e5b471aa049 +size 87150836 diff --git a/pkm-64x-k64/layers.15.mlp/cfg.json b/pkm-64x-k64/layers.15.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..484ed9025746fce27826bb50483378bab35100cf --- /dev/null +++ b/pkm-64x-k64/layers.15.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-64x-k64/layers.15.mlp/sae.safetensors b/pkm-64x-k64/layers.15.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7c2e60c33c990191bb3bfa86b2778c6dba180b8f --- /dev/null +++ b/pkm-64x-k64/layers.15.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58777aa5b4ff887bb65c50134b13a3604e5d1b21901eeaa85b8432e48ac66fb2 +size 87150836 diff --git a/pkm-64x-k64/layers.20.mlp/cfg.json b/pkm-64x-k64/layers.20.mlp/cfg.json new file mode 100644 index 0000000000000000000000000000000000000000..484ed9025746fce27826bb50483378bab35100cf --- /dev/null +++ b/pkm-64x-k64/layers.20.mlp/cfg.json @@ -0,0 +1 @@ +{"activation": "topk", "expansion_factor": 64, "normalize_decoder": true, "num_latents": 0, "k": 64, "multi_topk": false, "skip_connection": true, "transcode": true, "optimized_encoder_config": "PKM", "pkm_config": {"pad": false, "softmax": false, "heads": 1, "bias": false, "init_scale": 1.0}, "kronecker_config": {"in_group": 2, "out_group": 4, "u": 4, "lora_dim": 1.0}, "d_in": 576} \ No newline at end of file diff --git a/pkm-64x-k64/layers.20.mlp/sae.safetensors b/pkm-64x-k64/layers.20.mlp/sae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5c99be3d42ce4e91a559ee1fb0aa9fbbd3bdb77 --- /dev/null +++ b/pkm-64x-k64/layers.20.mlp/sae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec0ac1c11d467f36cdc63b7e06f65ccb16e2cb740599586025730e92c36379c +size 87150836 diff --git a/pkm-64x-k64/lr_scheduler_0.pt b/pkm-64x-k64/lr_scheduler_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..86b53d8e22e1c4ad27c6f4c7c2466ec3f57470fd --- /dev/null +++ b/pkm-64x-k64/lr_scheduler_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d834baf40442747fae5c6cbde6c424301e95e721e39dde07e4ef8dc7d4521f33 +size 1431 diff --git a/pkm-64x-k64/optimizer_0.pt b/pkm-64x-k64/optimizer_0.pt new file mode 100644 index 0000000000000000000000000000000000000000..137151aac8d9f650354f7ddb220c4a580014bf21 --- /dev/null +++ b/pkm-64x-k64/optimizer_0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bf6133065edb70e24b67adc7365862c6f17fde1acd6c17db3c2d9cbf44b05b9 +size 261443525 diff --git a/pkm-64x-k64/optimizer_1.pt b/pkm-64x-k64/optimizer_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c84bf6b58055e7c3f734361d798cab27b0dcb06 --- /dev/null +++ b/pkm-64x-k64/optimizer_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93437cf6166585daab202445912c54de192fc3b6c5979ab85feb10db9594ad5b +size 29261 diff --git a/pkm-64x-k64/rank_0_state.pt b/pkm-64x-k64/rank_0_state.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b921bdb9cb613690b0332f32d4119b963894a89 --- /dev/null +++ b/pkm-64x-k64/rank_0_state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33bb6f36c072b9dc70ea9504c82ca50116e777629e41c9214d181990b2b7cb37 +size 886990 diff --git a/pkm-64x-k64/state.pt b/pkm-64x-k64/state.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7e2dcc9430dc6130114bfcbe8ce9d64ef30b85a --- /dev/null +++ b/pkm-64x-k64/state.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa523a84d85418107fd8b6545cb6654f796f295dfc25927e975091f1d1bb0d2c +size 1249