empty-michael commited on
Commit
123a4ce
·
verified ·
1 Parent(s): 9082573

Upload 2 files

Browse files
Files changed (2) hide show
  1. ae.pt +3 -0
  2. config.json +24 -0
ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ac8ee0cd77a073c0fcd13a1810873acc184cc83d27f94ea91b2f59d34df4d4e
3
+ size 3624052520
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "TrainerTopK",
4
+ "dict_class": "AutoEncoderTopK",
5
+ "lr": 0.00016329931618554522,
6
+ "steps": 30000,
7
+ "seed": 42,
8
+ "activation_dim": 18432,
9
+ "dict_size": 24576,
10
+ "k": 64,
11
+ "device": "cuda:0",
12
+ "wandb_name": "all_modules_topk64_dict24576_seed42_batch8196",
13
+ "submodule_list": "[GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n), GPT2SdpaAttention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n)]"
14
+ },
15
+ "buffer": {
16
+ "d_submodule": 768,
17
+ "io": "out",
18
+ "n_ctxs": 128,
19
+ "ctx_len": 128,
20
+ "refresh_batch_size": 256,
21
+ "out_batch_size": 8192,
22
+ "device": "cuda:0"
23
+ }
24
+ }