Commit
·
6ed6502
verified
·
0
Parent(s):
Duplicate from georgeyw/gpt-2-small
Browse filesCo-authored-by: George Wang <[email protected]>
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +35 -0
- checkpoint-100/config.json +31 -0
- checkpoint-100/model.safetensors +3 -0
- checkpoint-100/training_args.bin +3 -0
- checkpoint-1000/config.json +31 -0
- checkpoint-1000/model.safetensors +3 -0
- checkpoint-1000/training_args.bin +3 -0
- checkpoint-10000/config.json +31 -0
- checkpoint-10000/model.safetensors +3 -0
- checkpoint-10000/training_args.bin +3 -0
- checkpoint-10100/config.json +31 -0
- checkpoint-10100/model.safetensors +3 -0
- checkpoint-10100/training_args.bin +3 -0
- checkpoint-10200/config.json +31 -0
- checkpoint-10200/model.safetensors +3 -0
- checkpoint-10200/training_args.bin +3 -0
- checkpoint-10300/config.json +31 -0
- checkpoint-10300/model.safetensors +3 -0
- checkpoint-10300/training_args.bin +3 -0
- checkpoint-10400/config.json +31 -0
- checkpoint-10400/model.safetensors +3 -0
- checkpoint-10400/training_args.bin +3 -0
- checkpoint-10500/config.json +31 -0
- checkpoint-10500/model.safetensors +3 -0
- checkpoint-10500/training_args.bin +3 -0
- checkpoint-10600/config.json +31 -0
- checkpoint-10600/model.safetensors +3 -0
- checkpoint-10600/training_args.bin +3 -0
- checkpoint-10700/config.json +31 -0
- checkpoint-10700/model.safetensors +3 -0
- checkpoint-10700/training_args.bin +3 -0
- checkpoint-10800/config.json +31 -0
- checkpoint-10800/model.safetensors +3 -0
- checkpoint-10800/training_args.bin +3 -0
- checkpoint-10900/config.json +31 -0
- checkpoint-10900/model.safetensors +3 -0
- checkpoint-10900/training_args.bin +3 -0
- checkpoint-1100/config.json +31 -0
- checkpoint-1100/model.safetensors +3 -0
- checkpoint-1100/training_args.bin +3 -0
- checkpoint-11000/config.json +31 -0
- checkpoint-11000/model.safetensors +3 -0
- checkpoint-11000/training_args.bin +3 -0
- checkpoint-11100/config.json +31 -0
- checkpoint-11100/model.safetensors +3 -0
- checkpoint-11100/training_args.bin +3 -0
- checkpoint-11200/config.json +31 -0
- checkpoint-11200/model.safetensors +3 -0
- checkpoint-11200/training_args.bin +3 -0
- checkpoint-11300/config.json +31 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
checkpoint-100/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-100/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:03e56f5b3dd4afc21190625e190827309ea556064666d746669af2409569127a
|
3 |
+
size 324662984
|
checkpoint-100/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-1000/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-1000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a56aaeb73fd5a850495c644f8711e0fd2770f3e773c4f808af9f7d9b344fd53f
|
3 |
+
size 324662984
|
checkpoint-1000/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10000/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:601eb51db6fa707eba37f9f216c6ffc9ef4e32ec527c68a765795e43bc8fb8d3
|
3 |
+
size 324662984
|
checkpoint-10000/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10100/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10100/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:27a558976d50677b7e1442ead7e2855bff7b5b876d504ec9b153a3109d85e196
|
3 |
+
size 324662984
|
checkpoint-10100/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10200/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10200/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c2d9e0280973fece6e5e52c31bebb9e5f505e3445f4e067cffe0770508672937
|
3 |
+
size 324662984
|
checkpoint-10200/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10300/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10300/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:34b2881cddcf530c4f56798b9d9f54ebd1c448c3c3a19ecbe5fabe76f0be128c
|
3 |
+
size 324662984
|
checkpoint-10300/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10400/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10400/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be0afbf78250eeb4ad3b4f622ea40aeac0e6cd63d06d234cbd22308fb29b9f8c
|
3 |
+
size 324662984
|
checkpoint-10400/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10500/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10500/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed06b43dd3d7554cc68e351367ce5eead72a165a405ed21daa98a55d0de06463
|
3 |
+
size 324662984
|
checkpoint-10500/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10600/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10600/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:baf1409774fbd0059a81253e6a1783a2666f1794181fd01ed2a923cdf762b951
|
3 |
+
size 324662984
|
checkpoint-10600/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10700/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10700/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6c0d6f0f34b0e72b1f398d5350a1418d32daa962eada990b846d3d3aba240342
|
3 |
+
size 324662984
|
checkpoint-10700/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10800/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10800/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:241a4ebaaafe542b303716ff632a8e2009aaa45efa2db89340bef14f7e92670f
|
3 |
+
size 324662984
|
checkpoint-10800/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-10900/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-10900/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:842701f4fbad38879acacd251e3aee93cc1f8967942effddfa9a5ebf7576a45e
|
3 |
+
size 324662984
|
checkpoint-10900/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-1100/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-1100/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd0f0e5d1fdece8c968bb138920e72a81d0166e2538ade4d8d74175b147ae605
|
3 |
+
size 324662984
|
checkpoint-1100/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-11000/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-11000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b62d3a64f76e93d93111b2c51de044e3aee7043289749e9c1cdcd31ac0ff37ee
|
3 |
+
size 324662984
|
checkpoint-11000/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-11100/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-11100/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:29764862ba130e00fbec38ec91af863d3b1b57f42f29df49e93536cfd3a7ce37
|
3 |
+
size 324662984
|
checkpoint-11100/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-11200/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-11200/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6e08e950a505b355d9602f3c47d9eea94807466658b5a615ea3491faeb966a1f
|
3 |
+
size 324662984
|
checkpoint-11200/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-11300/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|