diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/checkpoint-100/config.json b/checkpoint-100/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916
--- /dev/null
+++ b/checkpoint-100/config.json
@@ -0,0 +1,31 @@
+{
+  "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.1,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 1024,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "rope_scaling": null,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.25,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": true,
+  "use_parallel_residual": true,
+  "vocab_size": 50304
+}
diff --git a/checkpoint-100/model.safetensors b/checkpoint-100/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..104970e5d5470ac82b4ca8b2488c8ab6af2d7f17
--- /dev/null
+++ b/checkpoint-100/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03e56f5b3dd4afc21190625e190827309ea556064666d746669af2409569127a
+size 324662984
diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f
--- /dev/null
+++ b/checkpoint-100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
+size 6520
diff --git a/checkpoint-1000/config.json b/checkpoint-1000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916
--- /dev/null
+++ b/checkpoint-1000/config.json
@@ -0,0 +1,31 @@
+{
+  "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.1,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 1024,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "rope_scaling": null,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.25,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": true,
+  "use_parallel_residual": true,
+  "vocab_size": 50304
+}
diff --git a/checkpoint-1000/model.safetensors b/checkpoint-1000/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..364ae86e9c839d84a7033c3be65921ab6343928d
--- /dev/null
+++ b/checkpoint-1000/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a56aaeb73fd5a850495c644f8711e0fd2770f3e773c4f808af9f7d9b344fd53f
+size 324662984
diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f
--- /dev/null
+++ b/checkpoint-1000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
+size 6520
diff --git a/checkpoint-10000/config.json b/checkpoint-10000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916
--- /dev/null
+++ b/checkpoint-10000/config.json
@@ -0,0 +1,31 @@
+{
+  "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.1,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 1024,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "rope_scaling": null,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.25,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": true,
+  "use_parallel_residual": true,
+  "vocab_size": 50304
+}
diff --git a/checkpoint-10000/model.safetensors b/checkpoint-10000/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e5323931e60e38b29624aab50590c614fbe024d9
--- /dev/null
+++ b/checkpoint-10000/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:601eb51db6fa707eba37f9f216c6ffc9ef4e32ec527c68a765795e43bc8fb8d3
+size 324662984
diff --git a/checkpoint-10000/training_args.bin b/checkpoint-10000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f
---
/dev/null +++ b/checkpoint-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-10100/config.json b/checkpoint-10100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-10100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-10100/model.safetensors b/checkpoint-10100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9794c3bb99125d9c4ba9dc77d713905029539e97 --- /dev/null +++ b/checkpoint-10100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27a558976d50677b7e1442ead7e2855bff7b5b876d504ec9b153a3109d85e196 +size 324662984 diff --git a/checkpoint-10100/training_args.bin b/checkpoint-10100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-10100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-10200/config.json b/checkpoint-10200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-10200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-10200/model.safetensors b/checkpoint-10200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7072c92ba460ec2cad12b852ab49148376f02300 --- /dev/null +++ b/checkpoint-10200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2d9e0280973fece6e5e52c31bebb9e5f505e3445f4e067cffe0770508672937 +size 324662984 diff --git a/checkpoint-10200/training_args.bin 
b/checkpoint-10200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-10200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-10300/config.json b/checkpoint-10300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-10300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-10300/model.safetensors b/checkpoint-10300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b95fc34d6e0f163ba1052b81d0184b62e9eb8e67 --- /dev/null +++ b/checkpoint-10300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b2881cddcf530c4f56798b9d9f54ebd1c448c3c3a19ecbe5fabe76f0be128c +size 324662984 diff --git a/checkpoint-10300/training_args.bin b/checkpoint-10300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-10300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-10400/config.json b/checkpoint-10400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-10400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-10400/model.safetensors b/checkpoint-10400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dbb4c7d081c0166a6e973f82b0caf61a55fb90f5 --- /dev/null +++ b/checkpoint-10400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:be0afbf78250eeb4ad3b4f622ea40aeac0e6cd63d06d234cbd22308fb29b9f8c +size 324662984 diff --git a/checkpoint-10400/training_args.bin b/checkpoint-10400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-10400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-10500/config.json b/checkpoint-10500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-10500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-10500/model.safetensors b/checkpoint-10500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..21ab297d279d1ec368897e4bcfc365516ee5d1d1 --- /dev/null +++ b/checkpoint-10500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed06b43dd3d7554cc68e351367ce5eead72a165a405ed21daa98a55d0de06463 +size 324662984 diff --git a/checkpoint-10500/training_args.bin b/checkpoint-10500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-10500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-10600/config.json b/checkpoint-10600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-10600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-10600/model.safetensors b/checkpoint-10600/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..35b9107be6761f109d64d22ae56026e8905b591f --- /dev/null +++ b/checkpoint-10600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baf1409774fbd0059a81253e6a1783a2666f1794181fd01ed2a923cdf762b951 +size 324662984 diff --git a/checkpoint-10600/training_args.bin b/checkpoint-10600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-10600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-10700/config.json b/checkpoint-10700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-10700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-10700/model.safetensors b/checkpoint-10700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..01aafb4129ab3e28f37b2f5e1ea458b708ecd51f --- /dev/null +++ b/checkpoint-10700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c0d6f0f34b0e72b1f398d5350a1418d32daa962eada990b846d3d3aba240342 +size 324662984 diff --git a/checkpoint-10700/training_args.bin b/checkpoint-10700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-10700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-10800/config.json b/checkpoint-10800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-10800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-10800/model.safetensors b/checkpoint-10800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ed47b21841209e5af4ce6bb6d918e6d6fd018120 --- /dev/null +++ b/checkpoint-10800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241a4ebaaafe542b303716ff632a8e2009aaa45efa2db89340bef14f7e92670f +size 324662984 diff --git a/checkpoint-10800/training_args.bin b/checkpoint-10800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-10800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-10900/config.json b/checkpoint-10900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-10900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-10900/model.safetensors b/checkpoint-10900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..39f23401fee5c03c60289c1f5bec6ca9eee8f9b6 --- /dev/null +++ b/checkpoint-10900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:842701f4fbad38879acacd251e3aee93cc1f8967942effddfa9a5ebf7576a45e +size 324662984 diff --git a/checkpoint-10900/training_args.bin b/checkpoint-10900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-10900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-1100/config.json b/checkpoint-1100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-1100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-1100/model.safetensors b/checkpoint-1100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b3cfaba185b99092400cffc59cf743a362cfe75f --- /dev/null +++ b/checkpoint-1100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd0f0e5d1fdece8c968bb138920e72a81d0166e2538ade4d8d74175b147ae605 +size 324662984 diff --git a/checkpoint-1100/training_args.bin b/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11000/config.json b/checkpoint-11000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11000/model.safetensors b/checkpoint-11000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6e2d0c9e7216f91f1414440c0f0c258289ca961a --- /dev/null +++ b/checkpoint-11000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62d3a64f76e93d93111b2c51de044e3aee7043289749e9c1cdcd31ac0ff37ee +size 324662984 diff --git a/checkpoint-11000/training_args.bin b/checkpoint-11000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11100/config.json b/checkpoint-11100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11100/model.safetensors b/checkpoint-11100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e60838b4eb3b24f87137c8550de0d8a40b937fff --- /dev/null +++ b/checkpoint-11100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29764862ba130e00fbec38ec91af863d3b1b57f42f29df49e93536cfd3a7ce37 +size 324662984 diff --git a/checkpoint-11100/training_args.bin b/checkpoint-11100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11200/config.json b/checkpoint-11200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11200/model.safetensors b/checkpoint-11200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7253d524de5cc2d492ef9af303df2ac3ad6867b1 --- /dev/null +++ b/checkpoint-11200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e08e950a505b355d9602f3c47d9eea94807466658b5a615ea3491faeb966a1f +size 324662984 diff --git a/checkpoint-11200/training_args.bin b/checkpoint-11200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11300/config.json b/checkpoint-11300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11300/model.safetensors b/checkpoint-11300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d14d399954cb31a2cceab535e2fb8cafa4b4f05 --- /dev/null +++ b/checkpoint-11300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:765ffd603fad69f4a013035a4d627bd7c23ce73d3eb1b047c2e79b9e3d60fdc8 +size 324662984 diff --git a/checkpoint-11300/training_args.bin b/checkpoint-11300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11400/config.json b/checkpoint-11400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11400/model.safetensors b/checkpoint-11400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..de9713e8affa5232678c85f8bb9119c450dc6ede --- /dev/null +++ b/checkpoint-11400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3873a98e96f9464619f4718dc2d06321b37c5f22d4ac044ffea17bfff0d34c72 +size 324662984 diff --git a/checkpoint-11400/training_args.bin b/checkpoint-11400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11500/config.json b/checkpoint-11500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11500/model.safetensors b/checkpoint-11500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b64500891e48aea76c93175f64b815d905131a8 --- /dev/null +++ b/checkpoint-11500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be68b37656c70f8115747fa16bf93c4cb97009dcb36879f43aeecde14d45fe3 +size 324662984 diff --git a/checkpoint-11500/training_args.bin b/checkpoint-11500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11600/config.json b/checkpoint-11600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11600/model.safetensors b/checkpoint-11600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7390f85bcc0acfb5cf197cfd154b901ecf59584 --- /dev/null +++ b/checkpoint-11600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2047a536e17c62bf50ed24513598612271accc41523769fb65e004dbc3b43ab +size 324662984 diff --git a/checkpoint-11600/training_args.bin b/checkpoint-11600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11700/config.json b/checkpoint-11700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + 
"bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11700/model.safetensors b/checkpoint-11700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6c1bec811233f79f9b4b7d76407cac7bd629c84e --- /dev/null +++ b/checkpoint-11700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fac2b8834fe78288702b051eaa9a56abec74516ca7969231a30b100003b90ec1 +size 324662984 diff --git a/checkpoint-11700/training_args.bin b/checkpoint-11700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11800/config.json b/checkpoint-11800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11800/model.safetensors b/checkpoint-11800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b5e77f281e0382a80915311fca969aefe7ec021d --- /dev/null +++ b/checkpoint-11800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:934553d7cb878192f865f45c4fb7c42674d9b883a7a063c9541ad73dcaa2a6e0 +size 324662984 diff --git a/checkpoint-11800/training_args.bin b/checkpoint-11800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-11900/config.json b/checkpoint-11900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-11900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-11900/model.safetensors b/checkpoint-11900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..73a9fa06ec8b744b465e62ac75015afd01656978 --- /dev/null +++ b/checkpoint-11900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ffb8f729299f0ada3cdc8868047f4ea44b7d51f5e5867cc3a06af245195ad2 +size 324662984 diff --git a/checkpoint-11900/training_args.bin b/checkpoint-11900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-11900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-1200/config.json b/checkpoint-1200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-1200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-1200/model.safetensors b/checkpoint-1200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..18042109b5adf9b31e89a5a0aa57ff13c263fc40 --- /dev/null +++ b/checkpoint-1200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c4e630c1f98d6eb6a219768abf83de401a7f158bb5309930dcff46c14820f7 +size 324662984 diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12000/config.json b/checkpoint-12000/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12000/model.safetensors b/checkpoint-12000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..265a2a8bb4bf92d0e78cc5fd90cc0f86ad5e798d --- /dev/null +++ b/checkpoint-12000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1e30e85474b576298c7bb9228c5a9a94092a8f9b2b3887679654ddffcdb09b0 +size 324662984 diff --git a/checkpoint-12000/training_args.bin b/checkpoint-12000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12100/config.json b/checkpoint-12100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12100/model.safetensors b/checkpoint-12100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba036f77254ca888a9c08180ca31a58326987162 --- /dev/null +++ b/checkpoint-12100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0f555b874cb9ebe975f93427a26c60d3c60d01e1728c0fb96cc6c0e9f7f5893 +size 324662984 diff --git a/checkpoint-12100/training_args.bin b/checkpoint-12100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12200/config.json b/checkpoint-12200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12200/model.safetensors b/checkpoint-12200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..44ce2988e835dff3c9a095eeb37393f19a69e233 --- /dev/null +++ b/checkpoint-12200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9591cffbdd95ec607ecbc952f4137a66f4dcf1400c10faca3bb23269cd709e20 +size 324662984 diff --git a/checkpoint-12200/training_args.bin b/checkpoint-12200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12300/config.json b/checkpoint-12300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12300/model.safetensors b/checkpoint-12300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..87931522758a1ffe93a5a3c97c656fc37d4ea7cb --- /dev/null +++ b/checkpoint-12300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf9296e12037b21cd7362c8c28ea7efed896a089a93af298f56300e15163d88 +size 324662984 diff --git a/checkpoint-12300/training_args.bin b/checkpoint-12300/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12400/config.json b/checkpoint-12400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12400/model.safetensors b/checkpoint-12400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fb5762d84dc8ee482e7d2a435c0e238fd9d860ae --- /dev/null +++ b/checkpoint-12400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c141e5492e8190c5f867d1857ae792a154ac66ceaec872cd8a0d73b1ff40578c +size 324662984 diff --git a/checkpoint-12400/training_args.bin b/checkpoint-12400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12500/config.json b/checkpoint-12500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12500/model.safetensors b/checkpoint-12500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..12d3e94cf2eb14ca6c365c23909f95d437498115 --- /dev/null +++ b/checkpoint-12500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d71096620aca34647e9f6bc9576976765b133a2c5bd11e24364613b84bc2bc1c +size 
324662984 diff --git a/checkpoint-12500/training_args.bin b/checkpoint-12500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12600/config.json b/checkpoint-12600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12600/model.safetensors b/checkpoint-12600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..60c50c4038515a363634dc95ccc57178fedbe8b5 --- /dev/null +++ b/checkpoint-12600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a339a8eba99999dcee345e15b46ccb98a9ef023bcdf5b84d22c026a19e9ade92 +size 324662984 diff --git a/checkpoint-12600/training_args.bin b/checkpoint-12600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12700/config.json b/checkpoint-12700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12700/model.safetensors b/checkpoint-12700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..47949f56c18998ead761ae671d8f63d433fac686 --- /dev/null +++ b/checkpoint-12700/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:05fc4ea5115ae015332fe9f2ca20febd78a80356a325b40581d745255fc8d658 +size 324662984 diff --git a/checkpoint-12700/training_args.bin b/checkpoint-12700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12800/config.json b/checkpoint-12800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12800/model.safetensors b/checkpoint-12800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae04f03fe6fc4be3dd5047607015c3b9d8f9d83d --- /dev/null +++ b/checkpoint-12800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4592f43e8713a9ac8c4baff25e76cd2afb06cc812dda27e89d1a23d88c510f7f +size 324662984 diff --git a/checkpoint-12800/training_args.bin b/checkpoint-12800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-12900/config.json b/checkpoint-12900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-12900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-12900/model.safetensors b/checkpoint-12900/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..1c844eaf7836ac5b3645e25bdc3910e69f703283 --- /dev/null +++ b/checkpoint-12900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ca667d2f2ac362968d2737801a46f90fdd220f03b4ef385fcd4dd6edfb79ee5 +size 324662984 diff --git a/checkpoint-12900/training_args.bin b/checkpoint-12900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-12900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-1300/config.json b/checkpoint-1300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-1300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-1300/model.safetensors b/checkpoint-1300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7bff8c6e57637f8f4a5170c12aa212d8bdd53999 --- /dev/null +++ b/checkpoint-1300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f4b990d356af6a6a3b3aa900eb8d1c800a2c6c295f6e1ec10c49503fc324d26 +size 324662984 diff --git a/checkpoint-1300/training_args.bin b/checkpoint-1300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-1300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13000/config.json b/checkpoint-13000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} 
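Each checkpoint directory in this diff carries the same config.json (a 12-layer, 768-hidden GPTNeoXForCausalLM in bfloat16) plus git-lfs pointer files for model.safetensors (~325 MB) and training_args.bin. A minimal sketch of loading one such checkpoint with transformers follows, assuming the repository has been cloned with git-lfs so the pointer files are replaced by the real weights; the directory name and tokenizer source are illustrative assumptions, not part of this diff:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Any checkpoint directory in this repo can be loaded the same way;
# "checkpoint-13000" is used here only as an example path.
model = AutoModelForCausalLM.from_pretrained(
    "./checkpoint-13000",
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
)

# The checkpoint directories contain no tokenizer files, so a tokenizer would
# have to come from elsewhere, e.g. the init repo named in "_name_or_path"
# (an assumption, not guaranteed by this diff).
tokenizer = AutoTokenizer.from_pretrained("georgeyw/gpt-2-small-init-seed-5")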
diff --git a/checkpoint-13000/model.safetensors b/checkpoint-13000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..37e66793aa04d30ac284f233d129cb73f5f64807 --- /dev/null +++ b/checkpoint-13000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44778d96aeb424090bf5e7f13ea42fc282757ad8ad02581fb812527e0c0efa1a +size 324662984 diff --git a/checkpoint-13000/training_args.bin b/checkpoint-13000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13100/config.json b/checkpoint-13100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-13100/model.safetensors b/checkpoint-13100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d0154db6eed7d807c43c26fcf1433c43c8b83fcf --- /dev/null +++ b/checkpoint-13100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab134cc7d59feeddd0b7305dcf51e6eb6a0ca2b06812f5bb4e71d8450d26af24 +size 324662984 diff --git a/checkpoint-13100/training_args.bin b/checkpoint-13100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13200/config.json b/checkpoint-13200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-13200/model.safetensors b/checkpoint-13200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..559d83dbe28cf66f74aa951fbe3e2446d47d2ae4 --- /dev/null +++ b/checkpoint-13200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a63ef113fb8b2456632589376cf0bb652e35c6506e143c3af1df28769e11e51 +size 324662984 diff --git a/checkpoint-13200/training_args.bin b/checkpoint-13200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13300/config.json b/checkpoint-13300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-13300/model.safetensors b/checkpoint-13300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f29882f7bfb77eefc931db44b1e318261f8156e5 --- /dev/null +++ b/checkpoint-13300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe2952ea6b548ccd45635efbba5d2c97e7830013b09c91b6a351d23e793b3f7 +size 324662984 diff --git a/checkpoint-13300/training_args.bin b/checkpoint-13300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13400/config.json b/checkpoint-13400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-13400/model.safetensors b/checkpoint-13400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..128d9b57f0c2ef953b10b048c1109e196cead8ac --- /dev/null +++ b/checkpoint-13400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f608ec53530441d49d3169a68bd734192c72742ddb944d6b605b509139f10e01 +size 324662984 diff --git a/checkpoint-13400/training_args.bin b/checkpoint-13400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13500/config.json b/checkpoint-13500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-13500/model.safetensors b/checkpoint-13500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4fbc2827dfa2703de26612d502afbdb3cb4ef082 --- /dev/null +++ b/checkpoint-13500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:748c13a00e6f44758f9756023b746ba3a3d94c21a9a919e3e580fe00c872c5e3 +size 324662984 diff --git a/checkpoint-13500/training_args.bin b/checkpoint-13500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13600/config.json b/checkpoint-13600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-13600/model.safetensors b/checkpoint-13600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..439a3ba317d2c0dfefc9f6224c530ea3abdd12b8 --- /dev/null +++ b/checkpoint-13600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64e84c7158de425286a4f5d86486a87a28b65d678561ff38b84b12f766ae6a6d +size 324662984 diff --git a/checkpoint-13600/training_args.bin b/checkpoint-13600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13700/config.json b/checkpoint-13700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-13700/model.safetensors b/checkpoint-13700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fb4296e26e263ac752fd4d5a6e0de3cb3326248b --- /dev/null +++ b/checkpoint-13700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee973e21e1df0fa6b321b83a21975a00bd53fbd8766fd4da6b33aa44c074db81 +size 324662984 diff --git a/checkpoint-13700/training_args.bin b/checkpoint-13700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13800/config.json b/checkpoint-13800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-13800/model.safetensors b/checkpoint-13800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dedabb206badc5835cf83c5aeaa81e0ae3d743f9 --- /dev/null +++ b/checkpoint-13800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a2a0449a2a9c089c5d377a7b8df808be37ff2296d51435d519d602600178af0 +size 324662984 diff --git a/checkpoint-13800/training_args.bin b/checkpoint-13800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-13900/config.json b/checkpoint-13900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-13900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-13900/model.safetensors b/checkpoint-13900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d1381d7c6b56f13a80417c8da0422dbc6009f64a --- /dev/null +++ b/checkpoint-13900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6af2b6fbb9bf5f09c347fedaaf26b1cac9acacf0a047dc2f61c0f6cf8f32728d +size 324662984 diff --git a/checkpoint-13900/training_args.bin b/checkpoint-13900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-13900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-1400/config.json b/checkpoint-1400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-1400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 
0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-1400/model.safetensors b/checkpoint-1400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a7aceab5181a77435b8b166ecba0dce7dfab553e --- /dev/null +++ b/checkpoint-1400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3854bc45d83dd58a407d96b8c6cc850351a6a8a6e274dce5375deba9b0ba054e +size 324662984 diff --git a/checkpoint-1400/training_args.bin b/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14000/config.json b/checkpoint-14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-14000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14000/model.safetensors b/checkpoint-14000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e316d9e213995f3d734c502c3a33539f79357ff3 --- /dev/null +++ b/checkpoint-14000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37090f18a5b1f2af061ef7c30907642afed2bdc411fef40c2a620a7549f78ddb +size 324662984 diff --git a/checkpoint-14000/training_args.bin b/checkpoint-14000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14100/config.json b/checkpoint-14100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-14100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + 
"architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14100/model.safetensors b/checkpoint-14100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aaa681aec7f28effebcbe3b0b43c7197f32d810f --- /dev/null +++ b/checkpoint-14100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5746eab2312a09e4f950ba53e434029d7c0601f5845e4ce41fce5fd99b49f419 +size 324662984 diff --git a/checkpoint-14100/training_args.bin b/checkpoint-14100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14200/config.json b/checkpoint-14200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-14200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14200/model.safetensors b/checkpoint-14200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3cfd89109ed2d10ad722867d2060cefe8861c207 --- /dev/null +++ b/checkpoint-14200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe40e5b91cdbcc9b2de68ebf9a6b2299847ecb845ff77ccf42b73b74627a936 +size 324662984 diff --git a/checkpoint-14200/training_args.bin b/checkpoint-14200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14300/config.json b/checkpoint-14300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- 
/dev/null +++ b/checkpoint-14300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14300/model.safetensors b/checkpoint-14300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d79263f47bdd008c813e4c703354a4eab339700 --- /dev/null +++ b/checkpoint-14300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bde6028a5850f35a55edead7ca4360a7ca774eee092bc79bc2043bdba6e80ee +size 324662984 diff --git a/checkpoint-14300/training_args.bin b/checkpoint-14300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14400/config.json b/checkpoint-14400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-14400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14400/model.safetensors b/checkpoint-14400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80f7c1b164691f3adff13829f4c661d35c15f59e --- /dev/null +++ b/checkpoint-14400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea2f191e62530d4fb60e382b8714cec4f71b4e851329f69262afc6e9a5d707d +size 324662984 diff --git a/checkpoint-14400/training_args.bin b/checkpoint-14400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14500/config.json 
b/checkpoint-14500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-14500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14500/model.safetensors b/checkpoint-14500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..292e4aeb33d56f986304046666bd1597e4d08e80 --- /dev/null +++ b/checkpoint-14500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27cf9555710565b1c05c3f2ace81be1c2fd0393a545ece1a949aa6279ea28012 +size 324662984 diff --git a/checkpoint-14500/training_args.bin b/checkpoint-14500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14600/config.json b/checkpoint-14600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-14600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14600/model.safetensors b/checkpoint-14600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e205da35fd95568ab3350bab8099725b95cd3953 --- /dev/null +++ b/checkpoint-14600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a3bbae9d0905b61bd83b75cdd1e7b0b8d19969ac59445b2f3920f0b687508e8 +size 324662984 diff --git a/checkpoint-14600/training_args.bin b/checkpoint-14600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14700/config.json b/checkpoint-14700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-14700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14700/model.safetensors b/checkpoint-14700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b4a39b1dca210127ebbe29c1fa5025a8f260c391 --- /dev/null +++ b/checkpoint-14700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3816c7692f9ac6e63f79e4997fbb2d28cb5506d769575a4b06c958516206ee49 +size 324662984 diff --git a/checkpoint-14700/training_args.bin b/checkpoint-14700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14800/config.json b/checkpoint-14800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-14800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14800/model.safetensors b/checkpoint-14800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..baefe6cd4c20a417f5f3e278179d302abc06ffa1 --- /dev/null +++ b/checkpoint-14800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6509a66a29d47396cb45dbb4acc93d5e0b8ba1e9688b0c1adf8e7223746aa5c +size 324662984 diff --git a/checkpoint-14800/training_args.bin b/checkpoint-14800/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-14900/config.json b/checkpoint-14900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-14900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-14900/model.safetensors b/checkpoint-14900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c4ce8766c32ff5de1696405ea3fe82d1bcb491b7 --- /dev/null +++ b/checkpoint-14900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eebce53f0bf285b4ef17ca9d76021e2ea04434584e514d801d6d9717c071efc7 +size 324662984 diff --git a/checkpoint-14900/training_args.bin b/checkpoint-14900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-14900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-1500/config.json b/checkpoint-1500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-1500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-1500/model.safetensors b/checkpoint-1500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8970ac593270844b1b32d49a9ec5801c4f4747cd --- /dev/null +++ b/checkpoint-1500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22d206416278b06e158de70512bb690a7f37ee47b7408a8e197f6e77b30b570c +size 
324662984 diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15000/config.json b/checkpoint-15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-15000/model.safetensors b/checkpoint-15000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..913450bfcbda9e5d5436bb4b8756324774027d26 --- /dev/null +++ b/checkpoint-15000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcf99c8f574f2ea9de76097a2e2ad8e8fa94d560765f95556c5e849c3d7c577c +size 324662984 diff --git a/checkpoint-15000/training_args.bin b/checkpoint-15000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15100/config.json b/checkpoint-15100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-15100/model.safetensors b/checkpoint-15100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..47ba418e005eae8efc834b275f3624e73d27980d --- /dev/null +++ b/checkpoint-15100/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:096d93b75e3e6477f8b39e8bc18148c86ef468f21f8dd2961ab2fa35512f07b3 +size 324662984 diff --git a/checkpoint-15100/training_args.bin b/checkpoint-15100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15200/config.json b/checkpoint-15200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-15200/model.safetensors b/checkpoint-15200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ff9fe3fdd71047aff875832003d36999a3b9ed8 --- /dev/null +++ b/checkpoint-15200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a7336f1f112e26a7e72ec9b02bfcdb11e1e26471e8af314f016343713f8feb3 +size 324662984 diff --git a/checkpoint-15200/training_args.bin b/checkpoint-15200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15300/config.json b/checkpoint-15300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-15300/model.safetensors b/checkpoint-15300/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..6a151371ab31269abb690717ecc4efb3f1489226 --- /dev/null +++ b/checkpoint-15300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24338a5cb0e5b7688d19a86be4676272cb223b71ee5d1892d4f69be46f3a0938 +size 324662984 diff --git a/checkpoint-15300/training_args.bin b/checkpoint-15300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15400/config.json b/checkpoint-15400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-15400/model.safetensors b/checkpoint-15400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9d85cd256f237b1725d438dc1123426243074306 --- /dev/null +++ b/checkpoint-15400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:995389b13079007f2f80684451c968e7329f47589b382ee6c9023b4ab39989b7 +size 324662984 diff --git a/checkpoint-15400/training_args.bin b/checkpoint-15400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15500/config.json b/checkpoint-15500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-15500/model.safetensors b/checkpoint-15500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b76b246c5b20c97ff50280e8af4542004ace6b7c --- /dev/null +++ b/checkpoint-15500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6ed2464dad65cbb58cd695a567258cc5f5bef72137b1b2488671dcd729d048b +size 324662984 diff --git a/checkpoint-15500/training_args.bin b/checkpoint-15500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15600/config.json b/checkpoint-15600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-15600/model.safetensors b/checkpoint-15600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..54eb59facb6be9e4ce7ac0f3c06cad0a67571609 --- /dev/null +++ b/checkpoint-15600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce1489b4216dd48a091a171c4e9343b73afe74974b110ca3a55630f52b5584d3 +size 324662984 diff --git a/checkpoint-15600/training_args.bin b/checkpoint-15600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15700/config.json b/checkpoint-15700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-15700/model.safetensors b/checkpoint-15700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8077478edc08d4e9896b6e170e22ea595a4b1d7e --- /dev/null +++ b/checkpoint-15700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cdf8111ded75dc61f1d49acde501e06876c52e71d5e9b97777eb99474f00e5f +size 324662984 diff --git a/checkpoint-15700/training_args.bin b/checkpoint-15700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15800/config.json b/checkpoint-15800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-15800/model.safetensors b/checkpoint-15800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..994f0ffc072af33b794b722974f54af59587722d --- /dev/null +++ b/checkpoint-15800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffb222092dfeeadc3730149b97d91649cbe037de05d7983d094f6481b5cde18f +size 324662984 diff --git a/checkpoint-15800/training_args.bin b/checkpoint-15800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-15900/config.json b/checkpoint-15900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-15900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-15900/model.safetensors b/checkpoint-15900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..85070487866284a99b280c1c439d3d1a33d34950 --- /dev/null +++ b/checkpoint-15900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c147dc2622fb84037de96dd33b4f94e6521862c7f76aef6b4588af61772714d +size 324662984 diff --git a/checkpoint-15900/training_args.bin b/checkpoint-15900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-15900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-1600/config.json b/checkpoint-1600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-1600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-1600/model.safetensors b/checkpoint-1600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..58628015f760b9c3b5e52bb2a67d6c98ec6f1f57 --- /dev/null +++ b/checkpoint-1600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acd3f3372cb519e32bd50476e8887fd3aa73405ac3eb86a42fb52b81ead6fad3 +size 324662984 diff --git a/checkpoint-1600/training_args.bin b/checkpoint-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16000/config.json b/checkpoint-16000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + 
"layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16000/model.safetensors b/checkpoint-16000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7a36547be3c40593d5bd0421511fe72db819adae --- /dev/null +++ b/checkpoint-16000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40613ff187dafb4147b5ace7d3a79738f1434354a923b7bf21b65aefcf144e07 +size 324662984 diff --git a/checkpoint-16000/training_args.bin b/checkpoint-16000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16100/config.json b/checkpoint-16100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16100/model.safetensors b/checkpoint-16100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d07a0fa6c95c0d6356dfd2e0efa4a5139da33194 --- /dev/null +++ b/checkpoint-16100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d6e55cc4101ce7d60aa30f18de04d21179fb41f27960a70536d4350525b6e63 +size 324662984 diff --git a/checkpoint-16100/training_args.bin b/checkpoint-16100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16200/config.json b/checkpoint-16200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + 
"hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16200/model.safetensors b/checkpoint-16200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99b0a71f5bbdb5384f791bd798c32183bb4c0293 --- /dev/null +++ b/checkpoint-16200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e749bc8b3c8b697cff944b09f272a4cff0ddc605e33dfe53100f39dcc3e37bc9 +size 324662984 diff --git a/checkpoint-16200/training_args.bin b/checkpoint-16200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16300/config.json b/checkpoint-16300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16300/model.safetensors b/checkpoint-16300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d9f30225452f0355b179f94282960d94593607cf --- /dev/null +++ b/checkpoint-16300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c6b51588451ed4705c0d503da0ed3594fd56d6ae280eea569e1863e0ffa888f +size 324662984 diff --git a/checkpoint-16300/training_args.bin b/checkpoint-16300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16400/config.json b/checkpoint-16400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + 
"attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16400/model.safetensors b/checkpoint-16400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ef3c501bf7faef7f3c8e569e3d039eac8284743a --- /dev/null +++ b/checkpoint-16400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b22d7697da5b72fa5e1737d5d617d767cefa981ed4d6c5cd201e08a695b960b6 +size 324662984 diff --git a/checkpoint-16400/training_args.bin b/checkpoint-16400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16500/config.json b/checkpoint-16500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16500/model.safetensors b/checkpoint-16500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e4896381d26db2582c599ffc950d3d48cda1215a --- /dev/null +++ b/checkpoint-16500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d840e479a0a16a99b7264902623db89c31adf59ec74d129d3ce9d21af7c72f1e +size 324662984 diff --git a/checkpoint-16500/training_args.bin b/checkpoint-16500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16600/config.json b/checkpoint-16600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16600/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16600/model.safetensors b/checkpoint-16600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29396b19ec5d9533d929324c17c9912887849355 --- /dev/null +++ b/checkpoint-16600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b238b87983d07f836888c6664fe7c19499e7ce0770ab50126377e02084ecb109 +size 324662984 diff --git a/checkpoint-16600/training_args.bin b/checkpoint-16600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16700/config.json b/checkpoint-16700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16700/model.safetensors b/checkpoint-16700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dfd344f4017dfc7705efaa975f27544507b45280 --- /dev/null +++ b/checkpoint-16700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21787f6e4fc817c625cb77e1be7a74aeb2f04694d913ad2133c3cc4e19aad4dc +size 324662984 diff --git a/checkpoint-16700/training_args.bin b/checkpoint-16700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16800/config.json b/checkpoint-16800/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16800/model.safetensors b/checkpoint-16800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b26ec1cd40aa042bd348b0f9d7d230a53238f600 --- /dev/null +++ b/checkpoint-16800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c0b338fbc81e76cd4cac214848798b9ab92baadf1e0d25b370999df0d125e68 +size 324662984 diff --git a/checkpoint-16800/training_args.bin b/checkpoint-16800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-16900/config.json b/checkpoint-16900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-16900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-16900/model.safetensors b/checkpoint-16900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..624fada9fdcb5237a486d436ab182da3b1ccfd65 --- /dev/null +++ b/checkpoint-16900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7ee8633d3b5ba21307ba6938bda987a2477e9832896f665470c52ad2cb0bdc4 +size 324662984 diff --git a/checkpoint-16900/training_args.bin b/checkpoint-16900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-16900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-1700/config.json b/checkpoint-1700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-1700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-1700/model.safetensors b/checkpoint-1700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a2eed74e9472eaa1fe7818bceb70f6015046af1e --- /dev/null +++ b/checkpoint-1700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ca036606f560cd39b04683892e7d45c8c10a77b9c323b2bc9329703e125b23f +size 324662984 diff --git a/checkpoint-1700/training_args.bin b/checkpoint-1700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-1700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17000/config.json b/checkpoint-17000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-17000/model.safetensors b/checkpoint-17000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e301b8fa12f35ced0800ffeba59ea88719f64a0f --- /dev/null +++ b/checkpoint-17000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a79b890f61c985a4edf77736a37b8a680ca6a2720094e88cce624df38252c138 +size 324662984 diff --git a/checkpoint-17000/training_args.bin b/checkpoint-17000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- 
/dev/null +++ b/checkpoint-17000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17100/config.json b/checkpoint-17100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-17100/model.safetensors b/checkpoint-17100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..423e31261ba33a261a687bc89c93caa9deaa820c --- /dev/null +++ b/checkpoint-17100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b06b667458c69e652cbd58b49070dbf8a6af7f9c87bd2bc89e21866fa896c3c0 +size 324662984 diff --git a/checkpoint-17100/training_args.bin b/checkpoint-17100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-17100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17200/config.json b/checkpoint-17200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-17200/model.safetensors b/checkpoint-17200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4414483129b1e0b4dc6e38abb77acde99bbdb9c0 --- /dev/null +++ b/checkpoint-17200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae7e7921cd174d39ade1bb67cc8bd74de4334428aaf5cc678e25a1936d0a6aa +size 324662984 diff --git a/checkpoint-17200/training_args.bin 
b/checkpoint-17200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-17200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17300/config.json b/checkpoint-17300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-17300/model.safetensors b/checkpoint-17300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..db9c57dec4104d5bdb52b9b019021149b6cf6387 --- /dev/null +++ b/checkpoint-17300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cd5f67ffaa611c3b20c04ed2e6d0701bf786bf3e840e4c33a4fb1f9b679123e +size 324662984 diff --git a/checkpoint-17300/training_args.bin b/checkpoint-17300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-17300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17400/config.json b/checkpoint-17400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-17400/model.safetensors b/checkpoint-17400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf9c2104c842daa1c5dadbf6fa89f69d5df3ec1f --- /dev/null +++ b/checkpoint-17400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:99758d0d8d1a3652a58ffb6e4968df3d287022fea1bf60606391326cba48aef4 +size 324662984 diff --git a/checkpoint-17400/training_args.bin b/checkpoint-17400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-17400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17500/config.json b/checkpoint-17500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-17500/model.safetensors b/checkpoint-17500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f431f9bba9e78fabceff0002c77bc487cd3c6afc --- /dev/null +++ b/checkpoint-17500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d26f0f4ef6e9a4d9015a111fa2dd1231ec2a51b9a9b8878a83d42463144d6c7 +size 324662984 diff --git a/checkpoint-17500/training_args.bin b/checkpoint-17500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-17500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17600/config.json b/checkpoint-17600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-17600/model.safetensors b/checkpoint-17600/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..b4e7c80018ca31750889b83108c5cebbf42f8d70 --- /dev/null +++ b/checkpoint-17600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15673ce18d15c27942abf1acfea2ef363f4d4f31e3107762ed1b6d50e1e8862e +size 324662984 diff --git a/checkpoint-17600/training_args.bin b/checkpoint-17600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-17600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17700/config.json b/checkpoint-17700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-17700/model.safetensors b/checkpoint-17700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e129732bd4b518427b1bda682e4401cd3943f448 --- /dev/null +++ b/checkpoint-17700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a5475c0199bc78730f186e94070eafbca66dfdb6abb0b3c0b1f9be13715fae +size 324662984 diff --git a/checkpoint-17700/training_args.bin b/checkpoint-17700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-17700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17800/config.json b/checkpoint-17800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-17800/model.safetensors b/checkpoint-17800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f0efcf846a9da2de2b1ba9ee73cc780afa0b8fed --- /dev/null +++ b/checkpoint-17800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a72e5dc5b519191a6d856325c350cace7f246e7ff94da832e560654556099622 +size 324662984 diff --git a/checkpoint-17800/training_args.bin b/checkpoint-17800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-17800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-17900/config.json b/checkpoint-17900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-17900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-17900/model.safetensors b/checkpoint-17900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..364f2746bb10cbebf8f927631f0b1e82023068a0 --- /dev/null +++ b/checkpoint-17900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1767bf5dbaf3a9f6394dfc927d81ab5daad57bbb9a4170922c5ad90f35552773 +size 324662984 diff --git a/checkpoint-17900/training_args.bin b/checkpoint-17900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-17900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-1800/config.json b/checkpoint-1800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-1800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-1800/model.safetensors b/checkpoint-1800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..57a02aa1771ba9a9d7e1ad6a608ff4fa0af2f035 --- /dev/null +++ b/checkpoint-1800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3e7c5e2f7d1e12de31ad34233245ab052ae0642e79fb54dde1330692d948362 +size 324662984 diff --git a/checkpoint-1800/training_args.bin b/checkpoint-1800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-1800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18000/config.json b/checkpoint-18000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18000/model.safetensors b/checkpoint-18000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..143625f00da4f5ea1cc524b404012b436cf8e2f2 --- /dev/null +++ b/checkpoint-18000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:271b78e9b466e8af42ba9fcc8607e3f23532d6dcd6742283ef1a0b4140851e85 +size 324662984 diff --git a/checkpoint-18000/training_args.bin b/checkpoint-18000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18100/config.json b/checkpoint-18100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18100/model.safetensors b/checkpoint-18100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3295c1ba7432e376da57b1bb7cfe9a7d440035a1 --- /dev/null +++ b/checkpoint-18100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:129c5754adbf4dc2d43180027ccf48d06fe5a0779ed38f4f3fae49e0f5144f4c +size 324662984 diff --git a/checkpoint-18100/training_args.bin b/checkpoint-18100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18200/config.json b/checkpoint-18200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18200/model.safetensors b/checkpoint-18200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..438556d1b650e26265fcb320e93f07191fe05799 --- /dev/null +++ b/checkpoint-18200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c2725ea2c7decb1aca69722816c9f6826da9f6a47c93a03648322e595866f31 +size 324662984 diff --git a/checkpoint-18200/training_args.bin b/checkpoint-18200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18300/config.json b/checkpoint-18300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18300/model.safetensors b/checkpoint-18300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..36e771ca6a4b108f0ce0b9077550232f0b7bca4f --- /dev/null +++ b/checkpoint-18300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bd75d4ae929b66b5e1045048ff41d6ae675963b2f57d306d2f3ca63316294c6 +size 324662984 diff --git a/checkpoint-18300/training_args.bin b/checkpoint-18300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18400/config.json b/checkpoint-18400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18400/model.safetensors b/checkpoint-18400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..442a3891dbae26df0d4d0e19360bdf45c2ac2617 --- /dev/null +++ b/checkpoint-18400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4072680ab39956c2237725db32eee7fc76729ad5c4fae66dbb8fbacc653ce60 +size 324662984 diff --git a/checkpoint-18400/training_args.bin b/checkpoint-18400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18500/config.json b/checkpoint-18500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18500/model.safetensors b/checkpoint-18500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d49ad73fa5badb3874bc2737f6c55f4caecc89d8 --- /dev/null +++ b/checkpoint-18500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a204f9e338921e84c00a651f69327542ca90ddf7ebfe95a9684680171b70a4e +size 324662984 diff --git a/checkpoint-18500/training_args.bin b/checkpoint-18500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18600/config.json b/checkpoint-18600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18600/model.safetensors b/checkpoint-18600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..db2581e094cb5ede42c394a80d2aff7554e65948 --- /dev/null +++ b/checkpoint-18600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70986ecdb79485bad6d1f8456aabcf614efc48262bb2507790965b82df63d2b2 +size 324662984 diff --git a/checkpoint-18600/training_args.bin b/checkpoint-18600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18700/config.json b/checkpoint-18700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + 
"bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18700/model.safetensors b/checkpoint-18700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..025267ec2878b6cf3f5175c5ff45bed10e921414 --- /dev/null +++ b/checkpoint-18700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc5cb049d3dad9e2826d30d56e7eadfaa59bdaf3092911c17fc38f90f9867d4e +size 324662984 diff --git a/checkpoint-18700/training_args.bin b/checkpoint-18700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18800/config.json b/checkpoint-18800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18800/model.safetensors b/checkpoint-18800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ebd76e710ff5cf0ce9d01ac68ec433ec13c672df --- /dev/null +++ b/checkpoint-18800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:456b73c40792bbb80740c7fc635b790bac038fa218cc7ff01a3a0efc8918c93c +size 324662984 diff --git a/checkpoint-18800/training_args.bin b/checkpoint-18800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-18900/config.json b/checkpoint-18900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-18900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-18900/model.safetensors b/checkpoint-18900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7b8eb2435b15f67b9d01c6d1b8f790229801b757 --- /dev/null +++ b/checkpoint-18900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d8cc6829f40c0293a0929ab25bdce78c5b397747babbb4670d1ce529cc9d941 +size 324662984 diff --git a/checkpoint-18900/training_args.bin b/checkpoint-18900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-18900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-1900/config.json b/checkpoint-1900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-1900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-1900/model.safetensors b/checkpoint-1900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..725206b2525c7804c9301a8cf0e9215fe1d10ea9 --- /dev/null +++ b/checkpoint-1900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb99584225576a311066f91c21b95bcfd25cf5e5abaa50f30cb48dd9e2694d4 +size 324662984 diff --git a/checkpoint-1900/training_args.bin b/checkpoint-1900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-1900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19000/config.json b/checkpoint-19000/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19000/model.safetensors b/checkpoint-19000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e68cd4f6f2cc598dfd7e7cec92e9dd4ca3be82e3 --- /dev/null +++ b/checkpoint-19000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8706678e5f6ce9372e57416292ef53ac5297c0c8db25aa46b994a331f0fea7b8 +size 324662984 diff --git a/checkpoint-19000/training_args.bin b/checkpoint-19000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19100/config.json b/checkpoint-19100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19100/model.safetensors b/checkpoint-19100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..90da5c300ee381d5621b6f83a121a54d89d90ece --- /dev/null +++ b/checkpoint-19100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7eba5bd6f673dde452494867e0104dd80b6b06f604b621e41cb473d0ebe0e56 +size 324662984 diff --git a/checkpoint-19100/training_args.bin b/checkpoint-19100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19200/config.json b/checkpoint-19200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19200/model.safetensors b/checkpoint-19200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b89a4ace0050342ce34e9dd26477431dfd2f15ff --- /dev/null +++ b/checkpoint-19200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:371c56e0883481691776c80dd6be5d20c41248f31b9af11f256b5b7055761fd1 +size 324662984 diff --git a/checkpoint-19200/training_args.bin b/checkpoint-19200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19300/config.json b/checkpoint-19300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19300/model.safetensors b/checkpoint-19300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2318ef051a62a1631a8bb2baf1aa965bf7c59607 --- /dev/null +++ b/checkpoint-19300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2721df7b39b585c75df26c206e92d2cb9d1265a55d3879afd83551f4600dfa8 +size 324662984 diff --git a/checkpoint-19300/training_args.bin b/checkpoint-19300/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19400/config.json b/checkpoint-19400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19400/model.safetensors b/checkpoint-19400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..17698a135d889822240ecce2d910874a9e3a3744 --- /dev/null +++ b/checkpoint-19400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea12b65b767dadde67d0d3376b518ca73d74383c9d82227ea30798510916f476 +size 324662984 diff --git a/checkpoint-19400/training_args.bin b/checkpoint-19400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19500/config.json b/checkpoint-19500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19500/model.safetensors b/checkpoint-19500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9c6632bb00e8e10007d6012d7770d13d0e923d5 --- /dev/null +++ b/checkpoint-19500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59e1933630c72c03653b1e51b0f7b4ce3367b3d456b92be4b14dbcb6710ad4ad +size 
324662984 diff --git a/checkpoint-19500/training_args.bin b/checkpoint-19500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19600/config.json b/checkpoint-19600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19600/model.safetensors b/checkpoint-19600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..785b319b5db33462576d44a3beac1d514a62db41 --- /dev/null +++ b/checkpoint-19600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1e6c26bfddb64bffb1450158376efb44dbbb86347900f821510b36fd0318513 +size 324662984 diff --git a/checkpoint-19600/training_args.bin b/checkpoint-19600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19700/config.json b/checkpoint-19700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19700/model.safetensors b/checkpoint-19700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ada7e4af19f0acd024bef9f8a64f2a1cac2c704 --- /dev/null +++ b/checkpoint-19700/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:6ead04dbf4bf0ee9765e8ed49df20cafdf365437a4827d80364aa94e4eca15d7 +size 324662984 diff --git a/checkpoint-19700/training_args.bin b/checkpoint-19700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19800/config.json b/checkpoint-19800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19800/model.safetensors b/checkpoint-19800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8a7707c32c2fa9390ff2e7555b969f7409d9a367 --- /dev/null +++ b/checkpoint-19800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22985af65b6017234bee4023dbdad0d2ab325bd8c421ed8486b407a96543a9e0 +size 324662984 diff --git a/checkpoint-19800/training_args.bin b/checkpoint-19800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-19900/config.json b/checkpoint-19900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-19900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-19900/model.safetensors b/checkpoint-19900/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..ce53a143d6c70ed1a4092105a5d7db18a3f85a78 --- /dev/null +++ b/checkpoint-19900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2cd33005188350bbae43353698e0522f60fe219604537f9d87d0092504ff92a +size 324662984 diff --git a/checkpoint-19900/training_args.bin b/checkpoint-19900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-19900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-200/config.json b/checkpoint-200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-200/model.safetensors b/checkpoint-200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5fb876a2321753da76fcb53075c9c87571be06c9 --- /dev/null +++ b/checkpoint-200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd23533d50f604282bb0a5f22d02f3f22b956993282389c73cafa912b443f37b +size 324662984 diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2000/config.json b/checkpoint-2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git 
a/checkpoint-2000/model.safetensors b/checkpoint-2000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82f86cc7620b2624c55239b433729957cd77ea2c --- /dev/null +++ b/checkpoint-2000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:771f9532ed221f79ba71582f9a4b296276f4b5f28399dfe80a5b0fba907356be +size 324662984 diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20000/config.json b/checkpoint-20000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20000/model.safetensors b/checkpoint-20000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..219cdbe4fab09af31e2615a2755c967c0fcd0232 --- /dev/null +++ b/checkpoint-20000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7719414201234cd93de119c7bcac71d70df132dc372519713945d02f97adf4fb +size 324662984 diff --git a/checkpoint-20000/training_args.bin b/checkpoint-20000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20100/config.json b/checkpoint-20100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + 
"transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20100/model.safetensors b/checkpoint-20100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6746231bef695395157081ead8c06c63184791e3 --- /dev/null +++ b/checkpoint-20100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d3ce53c8f9eeabf512bb312e00ac00388397a84d87b7d313c5a5c70b283bdfb +size 324662984 diff --git a/checkpoint-20100/training_args.bin b/checkpoint-20100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20200/config.json b/checkpoint-20200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20200/model.safetensors b/checkpoint-20200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9bafbe4558e7f2025b5df91c8d98bb02b6b0872f --- /dev/null +++ b/checkpoint-20200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb7c17aa74a0aa56b78d6524249e3842686ae18e1eedc8f936fac22eb3c83de +size 324662984 diff --git a/checkpoint-20200/training_args.bin b/checkpoint-20200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20300/config.json b/checkpoint-20300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20300/model.safetensors b/checkpoint-20300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ecb5b9a478f0f9982953bf79c7833bcd50e0d767 --- /dev/null +++ b/checkpoint-20300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:121852506f4604c102be365570979fc6db5135154a16212ff153e8302f44b621 +size 324662984 diff --git a/checkpoint-20300/training_args.bin b/checkpoint-20300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20400/config.json b/checkpoint-20400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20400/model.safetensors b/checkpoint-20400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b10884675ba08c1ee53a4fa1148a350025280337 --- /dev/null +++ b/checkpoint-20400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ce6c9ba4403912b8816e8f9f8197d9a04ac0e3dbca88a73025d3253284c363c +size 324662984 diff --git a/checkpoint-20400/training_args.bin b/checkpoint-20400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20500/config.json b/checkpoint-20500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20500/model.safetensors b/checkpoint-20500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..98357e683a5915ae74cdcf8248428ec7b7c7db37 --- /dev/null +++ b/checkpoint-20500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c99d9724a2474ce62de8eed9dbf490f1627ac5a5df8e77a8863848b826d44d91 +size 324662984 diff --git a/checkpoint-20500/training_args.bin b/checkpoint-20500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20600/config.json b/checkpoint-20600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20600/model.safetensors b/checkpoint-20600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc21727d8c8e54eecdc0639b0c763b24f520537c --- /dev/null +++ b/checkpoint-20600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07a83441fddff13dab094491b5e6a0db82308c0878ee0076b02db466289d5948 +size 324662984 diff --git a/checkpoint-20600/training_args.bin b/checkpoint-20600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20700/config.json b/checkpoint-20700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20700/model.safetensors b/checkpoint-20700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f0a1b3dbf704489d8da683eee2f04cf1f1634f75 --- /dev/null +++ b/checkpoint-20700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6652d95b477f79b80ba17fed8eb9620d65a43785fbab729a82c0c8385e0086e +size 324662984 diff --git a/checkpoint-20700/training_args.bin b/checkpoint-20700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20800/config.json b/checkpoint-20800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20800/model.safetensors b/checkpoint-20800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f63c92efda63b2e06636a57f750e644523ff09d0 --- /dev/null +++ b/checkpoint-20800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0e25e9c82684fd820b2579c6a4a94dae6b671225b103c7815c19295376f5996 +size 324662984 diff --git a/checkpoint-20800/training_args.bin b/checkpoint-20800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-20900/config.json b/checkpoint-20900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-20900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + 
"bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-20900/model.safetensors b/checkpoint-20900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..321968131f9788d9062b84a597671874ca3a54ed --- /dev/null +++ b/checkpoint-20900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97e230b2493988842ce672699883233eec55f931ab7845066c076ab4acd48f24 +size 324662984 diff --git a/checkpoint-20900/training_args.bin b/checkpoint-20900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-20900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2100/config.json b/checkpoint-2100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-2100/model.safetensors b/checkpoint-2100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..243b484655114075ef5e7414901f6e5f070ad881 --- /dev/null +++ b/checkpoint-2100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0be2dd85a1610aa48a9f1c247f5c014c3c7f5f035928c7f6559b20db7a09e3f +size 324662984 diff --git a/checkpoint-2100/training_args.bin b/checkpoint-2100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21000/config.json b/checkpoint-21000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21000/model.safetensors b/checkpoint-21000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9d3e22db58ccf10bea3da9043c18324b6e40ffa2 --- /dev/null +++ b/checkpoint-21000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2f8a2ac60356ac2f6abeff8a62175dda43f9602b1126d808dd6cbf589f5c9cc +size 324662984 diff --git a/checkpoint-21000/training_args.bin b/checkpoint-21000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21100/config.json b/checkpoint-21100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21100/model.safetensors b/checkpoint-21100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..51f660cb98a40d3f2e9c93619fa2692e0dd308e8 --- /dev/null +++ b/checkpoint-21100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3406383d78bf670787e228c99bd7b1c12f308047d38da463a449645f6a0afbab +size 324662984 diff --git a/checkpoint-21100/training_args.bin b/checkpoint-21100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21200/config.json b/checkpoint-21200/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21200/model.safetensors b/checkpoint-21200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e3bba66252598205a6c0706a0fb3c92cd9425eba --- /dev/null +++ b/checkpoint-21200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e60178009e9335ee4d84337ca66fe750eeb1bf1a36642ca75c32f49580ff75d3 +size 324662984 diff --git a/checkpoint-21200/training_args.bin b/checkpoint-21200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21300/config.json b/checkpoint-21300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21300/model.safetensors b/checkpoint-21300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1d27e1dfe1d1607d782021ef2350b625927f21fa --- /dev/null +++ b/checkpoint-21300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eb16aa360b5f12a7b3541a20c2de12fdb15b372a7788514fba695ea6b8cadec +size 324662984 diff --git a/checkpoint-21300/training_args.bin b/checkpoint-21300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21400/config.json b/checkpoint-21400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21400/model.safetensors b/checkpoint-21400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a831d7b06c88fdd477caf30dfcace8d1ec5dfef6 --- /dev/null +++ b/checkpoint-21400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da46accb39a44eafed68ff2f7da1082ad2e8b81a5dfa25cd1f18eadaa6638c11 +size 324662984 diff --git a/checkpoint-21400/training_args.bin b/checkpoint-21400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21500/config.json b/checkpoint-21500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21500/model.safetensors b/checkpoint-21500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ab2c47280c7b98639771a6eb556660de2bf54e44 --- /dev/null +++ b/checkpoint-21500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71345b0a9018c00b35bbf1ad9c1c2291b4030b7ff5616944c9fd945ffbb749e3 +size 324662984 diff --git a/checkpoint-21500/training_args.bin b/checkpoint-21500/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21600/config.json b/checkpoint-21600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21600/model.safetensors b/checkpoint-21600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..62cc169b96ec3d55c4dbf9876fa4d75e2a4b2f6f --- /dev/null +++ b/checkpoint-21600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138a566cada0a8fc5eaeb68b0a9848fc30ea2250a06861b57cbf691926f5399a +size 324662984 diff --git a/checkpoint-21600/training_args.bin b/checkpoint-21600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21700/config.json b/checkpoint-21700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21700/model.safetensors b/checkpoint-21700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8806d7ae714f571413153ed1949ef83a63dd053 --- /dev/null +++ b/checkpoint-21700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35184ae0581ad8f7a25bad1e5cd6d138680359fedd1a03e7b656d3b9cff69979 +size 
324662984 diff --git a/checkpoint-21700/training_args.bin b/checkpoint-21700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21800/config.json b/checkpoint-21800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21800/model.safetensors b/checkpoint-21800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..af8faa3a638d8daa0e5e5b6bfa742e630fe96aac --- /dev/null +++ b/checkpoint-21800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc80a7dc1961d5158353a56ffd43b8bc5aa2d3fd8b26f3506a900da062e8da09 +size 324662984 diff --git a/checkpoint-21800/training_args.bin b/checkpoint-21800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-21900/config.json b/checkpoint-21900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-21900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-21900/model.safetensors b/checkpoint-21900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9331afc33a25d3074923fe436d2205247250dac4 --- /dev/null +++ b/checkpoint-21900/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:3f734ef6284c17ef86b9ea7bf5df0ce4e6e51a2716410b88f1a1113881829af4 +size 324662984 diff --git a/checkpoint-21900/training_args.bin b/checkpoint-21900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-21900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2200/config.json b/checkpoint-2200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-2200/model.safetensors b/checkpoint-2200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bb1930010041722df52d8905058e30ba8f0c0f88 --- /dev/null +++ b/checkpoint-2200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19c003e5cc7170b452c313832b101d4c044b815c5b532f4afcbe9d2701dd67a7 +size 324662984 diff --git a/checkpoint-2200/training_args.bin b/checkpoint-2200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22000/config.json b/checkpoint-22000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-22000/model.safetensors b/checkpoint-22000/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..3bb9711c99752d141fdf7026a480c7ad7768266a --- /dev/null +++ b/checkpoint-22000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05345f9a52eafceb1fc9b6e0cda7064259774e9d2b656eca5b2379a662787d44 +size 324662984 diff --git a/checkpoint-22000/training_args.bin b/checkpoint-22000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22100/config.json b/checkpoint-22100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-22100/model.safetensors b/checkpoint-22100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c39cbac0939ef650ab803590883e73c03b77c4eb --- /dev/null +++ b/checkpoint-22100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3406178fd88dc19e1765bd3d52afc1c5d4c7a0efac80d9cda037b7313ff1a78 +size 324662984 diff --git a/checkpoint-22100/training_args.bin b/checkpoint-22100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22200/config.json b/checkpoint-22200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-22200/model.safetensors b/checkpoint-22200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b0e35f240bd0f5e47eeb8b15732d1c1f7e4bd54e --- /dev/null +++ b/checkpoint-22200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6787ef302d7248d83562013a7e61e28fef7d95017d2d42a6a6d77cfce66ae710 +size 324662984 diff --git a/checkpoint-22200/training_args.bin b/checkpoint-22200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22300/config.json b/checkpoint-22300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-22300/model.safetensors b/checkpoint-22300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9eb1309d5998a5f9f517bbce972c749ca28434a4 --- /dev/null +++ b/checkpoint-22300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45530d48043b2c2c3c3f35c378eb6c7d3d010e5aed0675ac0eb19d199e9f7458 +size 324662984 diff --git a/checkpoint-22300/training_args.bin b/checkpoint-22300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22400/config.json b/checkpoint-22400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-22400/model.safetensors b/checkpoint-22400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c93aa205d6ff4ac4984f44365f63ed2167a36552 --- /dev/null +++ b/checkpoint-22400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75b2f3eef7d5173b956401d75a6dd1bfb4dfc1a461300eb595912c8bde61bf80 +size 324662984 diff --git a/checkpoint-22400/training_args.bin b/checkpoint-22400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22500/config.json b/checkpoint-22500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-22500/model.safetensors b/checkpoint-22500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..79a5052c5be3c6b4b532cad58b6021c51978a554 --- /dev/null +++ b/checkpoint-22500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dba66c66dee5b0652e29bb73cf478ef921eb76609923f5dfab52fb1087832343 +size 324662984 diff --git a/checkpoint-22500/training_args.bin b/checkpoint-22500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22600/config.json b/checkpoint-22600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-22600/model.safetensors b/checkpoint-22600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..40f942bc9a7ec15f6020f9540200a399603fba3a --- /dev/null +++ b/checkpoint-22600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bab2fcb7e5cfb3533c66296daaddd34f20f92e65fb2619d3aa690baaafd37ff +size 324662984 diff --git a/checkpoint-22600/training_args.bin b/checkpoint-22600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22700/config.json b/checkpoint-22700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-22700/model.safetensors b/checkpoint-22700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6253d5a62fda9ddb85006b9916eb0ba10d565d97 --- /dev/null +++ b/checkpoint-22700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fd92a528e9c8869f2952071270745a39d6d7c6460cf6dcb8fa7669c05beff02 +size 324662984 diff --git a/checkpoint-22700/training_args.bin b/checkpoint-22700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22800/config.json b/checkpoint-22800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + 
"layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-22800/model.safetensors b/checkpoint-22800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0677188bff596caa5adc1b99cca2f04723547009 --- /dev/null +++ b/checkpoint-22800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f27ce981a8f6562e407a3daa1adcd68c9c955f90e69a86ab81dac10427d9b4 +size 324662984 diff --git a/checkpoint-22800/training_args.bin b/checkpoint-22800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-22900/config.json b/checkpoint-22900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-22900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-22900/model.safetensors b/checkpoint-22900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3bcc0393a45f93a80b2f00dc25fccb600ac4eee1 --- /dev/null +++ b/checkpoint-22900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dbf86ef4ac2e0df9b86b8affb92a833f3ec6f3eb16ee4cd1738f735bad08b28 +size 324662984 diff --git a/checkpoint-22900/training_args.bin b/checkpoint-22900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-22900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2300/config.json b/checkpoint-2300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + 
"hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-2300/model.safetensors b/checkpoint-2300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..81ace81971d84de139f298daa7b570669c7c3296 --- /dev/null +++ b/checkpoint-2300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5cb04342396d69ff3fae1d17b6114c239a560da07f716a339ce3eae2f691965 +size 324662984 diff --git a/checkpoint-2300/training_args.bin b/checkpoint-2300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23000/config.json b/checkpoint-23000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23000/model.safetensors b/checkpoint-23000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d3d5fc32ed7fde95c71ea716562f33a3d9f1a814 --- /dev/null +++ b/checkpoint-23000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8efaf3bf70d3c3367b9bd5fad8e688f172eaf3508d062c51cd1a2149e4a6522 +size 324662984 diff --git a/checkpoint-23000/training_args.bin b/checkpoint-23000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23100/config.json b/checkpoint-23100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + 
"attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23100/model.safetensors b/checkpoint-23100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..537ad8f6475b021f5d0b791b1da58d7df2cc0284 --- /dev/null +++ b/checkpoint-23100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2cb0eb18265b6ed6e7fb4fd329ff023ec58ef59acf19cfe71dfc0955c7da5d6 +size 324662984 diff --git a/checkpoint-23100/training_args.bin b/checkpoint-23100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23200/config.json b/checkpoint-23200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23200/model.safetensors b/checkpoint-23200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..69814d2f25f5697aff24269460e7f67494987554 --- /dev/null +++ b/checkpoint-23200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ba76ba87de566a8ef213422afe1e516cc2d14f03027c57ca74b11d54c471259 +size 324662984 diff --git a/checkpoint-23200/training_args.bin b/checkpoint-23200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23300/config.json b/checkpoint-23300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23300/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23300/model.safetensors b/checkpoint-23300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a422ca27a0795e603d6ad20ef22e129cf4e36ba3 --- /dev/null +++ b/checkpoint-23300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4de84ba70c76c002664b028754fefe192eb3dd868b56b82443f9be52b3031ab8 +size 324662984 diff --git a/checkpoint-23300/training_args.bin b/checkpoint-23300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23400/config.json b/checkpoint-23400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23400/model.safetensors b/checkpoint-23400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..877ea88d7a5523b2acbe0aaaacfe1a64dc3ffa79 --- /dev/null +++ b/checkpoint-23400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5804183f62ec9c370adfb85b11935340099e3884f5f64f7cc17b17429d7aa7df +size 324662984 diff --git a/checkpoint-23400/training_args.bin b/checkpoint-23400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23500/config.json b/checkpoint-23500/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23500/model.safetensors b/checkpoint-23500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c5d15ad288e3a977199085fd2955850e927d3fc8 --- /dev/null +++ b/checkpoint-23500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cda7f00ffc3eb3c1913fe7fbda5fe65bc269ce1c39d714f1e145838b66314749 +size 324662984 diff --git a/checkpoint-23500/training_args.bin b/checkpoint-23500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23600/config.json b/checkpoint-23600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23600/model.safetensors b/checkpoint-23600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cec53ae8b10c06606339b131033c87031b029d69 --- /dev/null +++ b/checkpoint-23600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbca2eafbc04e1a18a8706cd4b11ff8a353dee5517ee0d005121f111d304048b +size 324662984 diff --git a/checkpoint-23600/training_args.bin b/checkpoint-23600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23700/config.json b/checkpoint-23700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23700/model.safetensors b/checkpoint-23700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbbc9fa6bdebc0c60ec7be8adbabb775098eda96 --- /dev/null +++ b/checkpoint-23700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75f85c4c55bfdbc9d7d1fa5980219308ccefd83363e203f9358b8d59fd94972f +size 324662984 diff --git a/checkpoint-23700/training_args.bin b/checkpoint-23700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23800/config.json b/checkpoint-23800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23800/model.safetensors b/checkpoint-23800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4020e0e6d8f3d98c1792b9723dd16f57dcd47b23 --- /dev/null +++ b/checkpoint-23800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f83ad91ac554b94af6639e0b9f376e9a7c34608707bedf654a690c32967c0538 +size 324662984 diff --git a/checkpoint-23800/training_args.bin b/checkpoint-23800/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-23900/config.json b/checkpoint-23900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-23900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-23900/model.safetensors b/checkpoint-23900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5fe726e708e761994c833a4692f4d275a25ab5ce --- /dev/null +++ b/checkpoint-23900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a296dd74f0cea0437f5798b30c94fa5ee4e2807e30e89ad6bf3b45f3ed49037e +size 324662984 diff --git a/checkpoint-23900/training_args.bin b/checkpoint-23900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-23900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2400/config.json b/checkpoint-2400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-2400/model.safetensors b/checkpoint-2400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9c10bbf95abe6272e376b433ac2c65415fe66c15 --- /dev/null +++ b/checkpoint-2400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce928c1f1d7af7b04a4699cac51925eaf051c6bd496101723e26e2da34111d32 +size 
324662984 diff --git a/checkpoint-2400/training_args.bin b/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24000/config.json b/checkpoint-24000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-24000/model.safetensors b/checkpoint-24000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..94e624f9a1cae712bfc216208af97c0f5bd8647b --- /dev/null +++ b/checkpoint-24000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b00a661e4daaea97ed52a3e4e0fb2b780a56601c10ed7a7124c62d1dfedc1925 +size 324662984 diff --git a/checkpoint-24000/training_args.bin b/checkpoint-24000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24100/config.json b/checkpoint-24100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-24100/model.safetensors b/checkpoint-24100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8efe53ebd7870a1ab1e8713299dcd6319b134c0 --- /dev/null +++ b/checkpoint-24100/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:aa3c19f5cafd26458d68709fbd88405ea864f52f8403491bef4e8e81ca8b0f37 +size 324662984 diff --git a/checkpoint-24100/training_args.bin b/checkpoint-24100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24200/config.json b/checkpoint-24200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-24200/model.safetensors b/checkpoint-24200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7b0c3d9fff172860943f13b9d62ed776a5615a37 --- /dev/null +++ b/checkpoint-24200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f90d29c5fa1d74c7585833cf46b15c3978bc9c9af361eea8735b20098f7bbae +size 324662984 diff --git a/checkpoint-24200/training_args.bin b/checkpoint-24200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24300/config.json b/checkpoint-24300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-24300/model.safetensors b/checkpoint-24300/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..5cd831fbb8b701241bf1cff15aab1c2eae54c127 --- /dev/null +++ b/checkpoint-24300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c69b8a9bd6f13544ee9c4128929ad5943e29e2b985d5c3eabb55091409c71cc +size 324662984 diff --git a/checkpoint-24300/training_args.bin b/checkpoint-24300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24400/config.json b/checkpoint-24400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-24400/model.safetensors b/checkpoint-24400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fde618aba23dc2c2c0a7c80928ec6e3e8a060b14 --- /dev/null +++ b/checkpoint-24400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9b06ba7677cdb205232d22e6c0bd0c330089ed5d26729aaf80afd7693dc7329 +size 324662984 diff --git a/checkpoint-24400/training_args.bin b/checkpoint-24400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24500/config.json b/checkpoint-24500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-24500/model.safetensors b/checkpoint-24500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..200907733c006f1b1a9a5f4122a7482dedb9daaa --- /dev/null +++ b/checkpoint-24500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0118695f7d27fb9538aff2240e7366d09230f4ee97994668fcb174130eab5e64 +size 324662984 diff --git a/checkpoint-24500/training_args.bin b/checkpoint-24500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24600/config.json b/checkpoint-24600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-24600/model.safetensors b/checkpoint-24600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1d2f606ebdf88585641a0c007f9dc860e7815148 --- /dev/null +++ b/checkpoint-24600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69f18ad8e649e29136024a4598dee1e59a56d960c031e470c4fedd754cae5f64 +size 324662984 diff --git a/checkpoint-24600/training_args.bin b/checkpoint-24600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24700/config.json b/checkpoint-24700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-24700/model.safetensors b/checkpoint-24700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5ec18ea650ad0072dd6400079165d890e8eff008 --- /dev/null +++ b/checkpoint-24700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:842ac33bb17b0014ae43f1298d4523077ed4fb844891e390e28034a0d104f04c +size 324662984 diff --git a/checkpoint-24700/training_args.bin b/checkpoint-24700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24800/config.json b/checkpoint-24800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-24800/model.safetensors b/checkpoint-24800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c0549ffdea3fe44cb12c4db0da663a8ce9ad442d --- /dev/null +++ b/checkpoint-24800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d21fafd7bddec870d67285e1ef351151f81f78b88db062f3f7cfe6ca5ee61fea +size 324662984 diff --git a/checkpoint-24800/training_args.bin b/checkpoint-24800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-24900/config.json b/checkpoint-24900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-24900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-24900/model.safetensors b/checkpoint-24900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..06a7306b7337c9f109422ae24bbf207e6aec8f81 --- /dev/null +++ b/checkpoint-24900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47b7a9aa75289d88dd5d6f154b8341fd98067797cf6779a50e4d154ae092b056 +size 324662984 diff --git a/checkpoint-24900/training_args.bin b/checkpoint-24900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-24900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2500/config.json b/checkpoint-2500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-2500/model.safetensors b/checkpoint-2500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d4bad596a80417e38ade4a2274170f983366656 --- /dev/null +++ b/checkpoint-2500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5934f7e73534cc2a78459c6ea9cef44737c353070229337f92381c65a7cd8084 +size 324662984 diff --git a/checkpoint-2500/training_args.bin b/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25000/config.json b/checkpoint-25000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + 
"layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25000/model.safetensors b/checkpoint-25000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..25791fef7c6eebe2978d2b6982f5ba600e7f5b95 --- /dev/null +++ b/checkpoint-25000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f46c7ce1aa7cebc9573b2fc0c60b0364c426562539ca0b6797ff72daa7d7a87 +size 324662984 diff --git a/checkpoint-25000/training_args.bin b/checkpoint-25000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25100/config.json b/checkpoint-25100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25100/model.safetensors b/checkpoint-25100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cb0b8bb65ddcdfa67edb94f9f6f4386727586cda --- /dev/null +++ b/checkpoint-25100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ccd07cec0a748feb8c855e48ce7b5369a881b516bce6c43ffd09fe5580144f4 +size 324662984 diff --git a/checkpoint-25100/training_args.bin b/checkpoint-25100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25200/config.json b/checkpoint-25200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + 
"hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25200/model.safetensors b/checkpoint-25200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2f598c914c34f00f6d12cdb7c4dbfe7e694f231c --- /dev/null +++ b/checkpoint-25200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d51a1296b16ce770225fdd40362c85992503bd9630d47c5cbaa6616b3e0aa67f +size 324662984 diff --git a/checkpoint-25200/training_args.bin b/checkpoint-25200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25300/config.json b/checkpoint-25300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25300/model.safetensors b/checkpoint-25300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..50eeb7ea0d4e078c9f33130772181037b5acff99 --- /dev/null +++ b/checkpoint-25300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9374cf67600df560a3210234b0b803122d8b73d338d8216018b7fc08441adb58 +size 324662984 diff --git a/checkpoint-25300/training_args.bin b/checkpoint-25300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25400/config.json b/checkpoint-25400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + 
"attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25400/model.safetensors b/checkpoint-25400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6faa5e2baa96aae7435d78636b4901cd35f7b7bd --- /dev/null +++ b/checkpoint-25400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7412604e0b5c9a5cc07c930a616f9ddd103ed453158380445fe1b5bd7d1f55 +size 324662984 diff --git a/checkpoint-25400/training_args.bin b/checkpoint-25400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25500/config.json b/checkpoint-25500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25500/model.safetensors b/checkpoint-25500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f67df0d0d41800ecdfef43c485fb34ed026583a0 --- /dev/null +++ b/checkpoint-25500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9113292fc39bbeba33e1e3fb1793e0163fd0f26c3e3c24bb8fb60102e2b8c310 +size 324662984 diff --git a/checkpoint-25500/training_args.bin b/checkpoint-25500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25600/config.json b/checkpoint-25600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25600/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25600/model.safetensors b/checkpoint-25600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c6280bb759f72f17116e9eb8554b0b9d0d0563d9 --- /dev/null +++ b/checkpoint-25600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a58deb993ab4598b9fbfac976bb422a336214155445ffe4c6dee3a984424ff1 +size 324662984 diff --git a/checkpoint-25600/training_args.bin b/checkpoint-25600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25700/config.json b/checkpoint-25700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25700/model.safetensors b/checkpoint-25700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6755247d7c846995359da267e4600fd71ff61165 --- /dev/null +++ b/checkpoint-25700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b6a8c8bd9144682c98ef1d0e4a554f09f093b9a50abcf6634a8363e4a01b61e +size 324662984 diff --git a/checkpoint-25700/training_args.bin b/checkpoint-25700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25800/config.json b/checkpoint-25800/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25800/model.safetensors b/checkpoint-25800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ad9d205d546c36223e6812a73d8c1c63355e8e03 --- /dev/null +++ b/checkpoint-25800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b85da45181faa37f7d5ca51d02af43ca13066e81d283c45767a9219d288a23bb +size 324662984 diff --git a/checkpoint-25800/training_args.bin b/checkpoint-25800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-25900/config.json b/checkpoint-25900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-25900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-25900/model.safetensors b/checkpoint-25900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d4ac3c6c9badfe06c9df34ddb75c2949b52929e --- /dev/null +++ b/checkpoint-25900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c86ad9c61aa188b5abf043f1fa8d8e8bcaf16402523c377d53666a8bf2f79d7f +size 324662984 diff --git a/checkpoint-25900/training_args.bin b/checkpoint-25900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-25900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2600/config.json b/checkpoint-2600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-2600/model.safetensors b/checkpoint-2600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..02191eed128db7a3e450eef5e86ecf33a0776e34 --- /dev/null +++ b/checkpoint-2600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4200d83057e42b6dbf512f63a5c35f63c247189a19c9aa3e9024387d984cf1e1 +size 324662984 diff --git a/checkpoint-2600/training_args.bin b/checkpoint-2600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26000/config.json b/checkpoint-26000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-26000/model.safetensors b/checkpoint-26000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7485823924e5ce6078281fe02ef7ebd727695f9e --- /dev/null +++ b/checkpoint-26000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6a72ae164acdc2c901c2c1de0044ccfc6e4c62386cc16113510ac8a64961c2 +size 324662984 diff --git a/checkpoint-26000/training_args.bin b/checkpoint-26000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- 
/dev/null +++ b/checkpoint-26000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26100/config.json b/checkpoint-26100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-26100/model.safetensors b/checkpoint-26100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..402fccefc8bcb736b6818fa9b02aa15ef0198668 --- /dev/null +++ b/checkpoint-26100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16fb29e2645725791896af26f3b8488ec533796abe3ddaed238421293b360ab4 +size 324662984 diff --git a/checkpoint-26100/training_args.bin b/checkpoint-26100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-26100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26200/config.json b/checkpoint-26200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-26200/model.safetensors b/checkpoint-26200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f2743544b80bc7e377046cf7f00fcf48b41662d9 --- /dev/null +++ b/checkpoint-26200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa8487546e2aa1590cec3d1caca1bf4e6971da258767dd2de0fbbc46d2bd54a +size 324662984 diff --git a/checkpoint-26200/training_args.bin 
b/checkpoint-26200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-26200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26300/config.json b/checkpoint-26300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-26300/model.safetensors b/checkpoint-26300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bb11d3c058af8ce63bee85b7ae12b47ae6eecfc3 --- /dev/null +++ b/checkpoint-26300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b407a2ca5ca9bde66cff55c87d3b6d9253675744648289d8d3613a3ec8e41c9 +size 324662984 diff --git a/checkpoint-26300/training_args.bin b/checkpoint-26300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-26300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26400/config.json b/checkpoint-26400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-26400/model.safetensors b/checkpoint-26400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d7bcfc423fe32ab9d3991c15d9b59058ffb886b4 --- /dev/null +++ b/checkpoint-26400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3da6ad1f8c9500e17710190dd2b6670850f4a29015669939d982fa1019c04f01 +size 324662984 diff --git a/checkpoint-26400/training_args.bin b/checkpoint-26400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-26400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26500/config.json b/checkpoint-26500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-26500/model.safetensors b/checkpoint-26500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fe53743daf71099a8e54c2965d57fdcfb572b868 --- /dev/null +++ b/checkpoint-26500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd3e4ffd1d6fe9d34e346049fdf5684306c90f54ca730179d9a9d525c48af38 +size 324662984 diff --git a/checkpoint-26500/training_args.bin b/checkpoint-26500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-26500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26600/config.json b/checkpoint-26600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-26600/model.safetensors b/checkpoint-26600/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..929c8ce07307e8bca623305173aaf2e7820bc39e --- /dev/null +++ b/checkpoint-26600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e74718206a4a4ba0686a4d6cf4124e25f53f646e98844586ab6fbeaed5e52db8 +size 324662984 diff --git a/checkpoint-26600/training_args.bin b/checkpoint-26600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-26600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26700/config.json b/checkpoint-26700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-26700/model.safetensors b/checkpoint-26700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..54f14e4bd379e2b54e1f2d4d67bec79221d62f9e --- /dev/null +++ b/checkpoint-26700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cdf6c5c613824a89eed8e79006f7fb16acc33d2d9e06821abe1f7835b9a91de +size 324662984 diff --git a/checkpoint-26700/training_args.bin b/checkpoint-26700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-26700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26800/config.json b/checkpoint-26800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-26800/model.safetensors b/checkpoint-26800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..04679cde10fabde4b5f00335a645c271bbf324c6 --- /dev/null +++ b/checkpoint-26800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:135312af24c9ce9a6a709184904f3e3b18d4210491d1aef5b388dc52b5b36f54 +size 324662984 diff --git a/checkpoint-26800/training_args.bin b/checkpoint-26800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-26800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-26900/config.json b/checkpoint-26900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-26900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-26900/model.safetensors b/checkpoint-26900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..be4c610c2c47e10ce2f107fc8531a727a222efce --- /dev/null +++ b/checkpoint-26900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2487d58de970687aeb8c91b7018276907349d1779486858344b27ba3e35fa5b +size 324662984 diff --git a/checkpoint-26900/training_args.bin b/checkpoint-26900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-26900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2700/config.json b/checkpoint-2700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-2700/model.safetensors b/checkpoint-2700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e3bd37c6fb4a0e10667b103ab3022c48590a5c60 --- /dev/null +++ b/checkpoint-2700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ff3e56f9ede3a9961d2fb123799fef9e327c625dbc83d1507dc286448fe86c +size 324662984 diff --git a/checkpoint-2700/training_args.bin b/checkpoint-2700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27000/config.json b/checkpoint-27000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27000/model.safetensors b/checkpoint-27000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29788cbaa9101ec5d5bd5e7308286039dd8aa667 --- /dev/null +++ b/checkpoint-27000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b533d853bd2892f9ccf2f23380f291e3a710a199910aabacec547c554acc86c +size 324662984 diff --git a/checkpoint-27000/training_args.bin b/checkpoint-27000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27100/config.json b/checkpoint-27100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27100/model.safetensors b/checkpoint-27100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c6bf840b8cd05beffae1ae02714ad5feb69782b7 --- /dev/null +++ b/checkpoint-27100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c809a12d65350a16bf743728facf5540ae2a893c935a172fe8707439e84274 +size 324662984 diff --git a/checkpoint-27100/training_args.bin b/checkpoint-27100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27200/config.json b/checkpoint-27200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27200/model.safetensors b/checkpoint-27200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..07acfabc4dc665a9a1faf6222cd5b9c21d9dbd67 --- /dev/null +++ b/checkpoint-27200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f5fd8f476ee0e3d79480021f885e453ec76bc6e758f43f9e304f50073fd9909 +size 324662984 diff --git a/checkpoint-27200/training_args.bin b/checkpoint-27200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27300/config.json b/checkpoint-27300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27300/model.safetensors b/checkpoint-27300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..203a40dbe73e791524394aaf96203dd01752fe15 --- /dev/null +++ b/checkpoint-27300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f98d2c9c1c94cbee66185ea5745ac7489dd03f518b76bc117976c5c4b7f6c9f +size 324662984 diff --git a/checkpoint-27300/training_args.bin b/checkpoint-27300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27400/config.json b/checkpoint-27400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27400/model.safetensors b/checkpoint-27400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c08792515ff650f5f5ccca0acb78dad0d10e76cb --- /dev/null +++ b/checkpoint-27400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fccc1c62c52992cc11d49b9424de02eae490e731575f59cbf56ae16454da65a +size 324662984 diff --git a/checkpoint-27400/training_args.bin b/checkpoint-27400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27500/config.json b/checkpoint-27500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27500/model.safetensors b/checkpoint-27500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6f75a944f4d25d03417e824b60749bd92e97c551 --- /dev/null +++ b/checkpoint-27500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8fdb4017f6b7d59b5f803cb98eb5298b62585c2360eae93362de37d4bed02d5 +size 324662984 diff --git a/checkpoint-27500/training_args.bin b/checkpoint-27500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27600/config.json b/checkpoint-27600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27600/model.safetensors b/checkpoint-27600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6e1c719d2142d42d21da25d6b944a9b631f02ebe --- /dev/null +++ b/checkpoint-27600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9655f08861cf113ba13125051e2792353ea1a208a2b55072e2ef6d7905aaa6db +size 324662984 diff --git a/checkpoint-27600/training_args.bin b/checkpoint-27600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27700/config.json b/checkpoint-27700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + 
"bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27700/model.safetensors b/checkpoint-27700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1e4fbd27fa939b96c18bcd21f2ebe893776d5a78 --- /dev/null +++ b/checkpoint-27700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74a34dd884dfe5b0896db091b4e81e221ab239a384dd0cc46aa982685b82c5af +size 324662984 diff --git a/checkpoint-27700/training_args.bin b/checkpoint-27700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27800/config.json b/checkpoint-27800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27800/model.safetensors b/checkpoint-27800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d068c2197beb0fe170861426f35fc10fbc1e7b0f --- /dev/null +++ b/checkpoint-27800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64a4eb9106236b2250edefef17e3fbe097130a6023cc0f4498fa0f907295887 +size 324662984 diff --git a/checkpoint-27800/training_args.bin b/checkpoint-27800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-27900/config.json b/checkpoint-27900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-27900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-27900/model.safetensors b/checkpoint-27900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d59d352d62801f8e55ad505ccf3125d06e9456ea --- /dev/null +++ b/checkpoint-27900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7188577a507ddfd8e1592bf6e04dbeedbb514c02a0d3a74962af15f393f8707 +size 324662984 diff --git a/checkpoint-27900/training_args.bin b/checkpoint-27900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-27900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2800/config.json b/checkpoint-2800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-2800/model.safetensors b/checkpoint-2800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b386208c3bdd7f7a917548fd52ae24fe905bd3b1 --- /dev/null +++ b/checkpoint-2800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd0a9604f1ae0b86829ee7373d5d890cdb9695a87dc9c5cd81fae74838bb320a +size 324662984 diff --git a/checkpoint-2800/training_args.bin b/checkpoint-2800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28000/config.json b/checkpoint-28000/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28000/model.safetensors b/checkpoint-28000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5a139616a799f37f518fbc7af6a18ab3cb27ab7d --- /dev/null +++ b/checkpoint-28000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bba435a4500f249b6ef3fc71316eb3a71f8778b1fe94c87a5e978ece7a5dd11 +size 324662984 diff --git a/checkpoint-28000/training_args.bin b/checkpoint-28000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28100/config.json b/checkpoint-28100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28100/model.safetensors b/checkpoint-28100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bd37b65e2e1a637711bc40d86a3de9eeacd54d8f --- /dev/null +++ b/checkpoint-28100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8d52396660892e25a7de8ceae6aa20e799953a65575b54e0dbc58012a7e7574 +size 324662984 diff --git a/checkpoint-28100/training_args.bin b/checkpoint-28100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28200/config.json b/checkpoint-28200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28200/model.safetensors b/checkpoint-28200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..461a4daf05059379608e6f9a48e840cbc455a3aa --- /dev/null +++ b/checkpoint-28200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e22daac26adba0885d62d4c5a637b1b2671111db9c4972a20c9cf1d00932c94b +size 324662984 diff --git a/checkpoint-28200/training_args.bin b/checkpoint-28200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28300/config.json b/checkpoint-28300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28300/model.safetensors b/checkpoint-28300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6842055e6ff50c0713b6dc605d5a1ecc0962d58 --- /dev/null +++ b/checkpoint-28300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38b91d878051239a36365db77cf35a9367d2cf2ccd395d07da6e9b7007da9c3f +size 324662984 diff --git a/checkpoint-28300/training_args.bin b/checkpoint-28300/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28400/config.json b/checkpoint-28400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28400/model.safetensors b/checkpoint-28400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7479b56195271a8a086ef98ad1ba08aac6767aa --- /dev/null +++ b/checkpoint-28400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93c4a8a6efcce54b5c26defb91998c3052c073ba2d233d7e5400fb7d9a6527a6 +size 324662984 diff --git a/checkpoint-28400/training_args.bin b/checkpoint-28400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28500/config.json b/checkpoint-28500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28500/model.safetensors b/checkpoint-28500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ef28821f96983dd62cc310441ffb015d51f62ca --- /dev/null +++ b/checkpoint-28500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21dec4d883bc4de0254598d41aab19e49df11cf0154d742cc98807347b7ea3a7 +size 
324662984 diff --git a/checkpoint-28500/training_args.bin b/checkpoint-28500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28600/config.json b/checkpoint-28600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28600/model.safetensors b/checkpoint-28600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9867640eeac87b839f8858099b6f456bff8fc96f --- /dev/null +++ b/checkpoint-28600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d7f542a7a5607201396ca6119a6882dde8723bf386893aa9ee7304cbe18b73 +size 324662984 diff --git a/checkpoint-28600/training_args.bin b/checkpoint-28600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28700/config.json b/checkpoint-28700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28700/model.safetensors b/checkpoint-28700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1e6a7186be5cf2dd18465b7a44a0b22a9715ebff --- /dev/null +++ b/checkpoint-28700/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:7acdcfa8f74f33a5a38d1a77c5085baf267fded95dcdceabb32f84d8d87af36a +size 324662984 diff --git a/checkpoint-28700/training_args.bin b/checkpoint-28700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28800/config.json b/checkpoint-28800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28800/model.safetensors b/checkpoint-28800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..88d6ac16087e426849e5895ba9ce7ea1733f3b9e --- /dev/null +++ b/checkpoint-28800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e00832b6dc9a037adb7fc7bfb2a0a23370c6875556e22dbf7f8080b49969fe6f +size 324662984 diff --git a/checkpoint-28800/training_args.bin b/checkpoint-28800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-28900/config.json b/checkpoint-28900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-28900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-28900/model.safetensors b/checkpoint-28900/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..efc4046aae8734a1f7e618e71dd51582224044aa --- /dev/null +++ b/checkpoint-28900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a443bf467ae9f8b41bdcfa19a69f533a7224a5e10f43b30f9e2418b60c68bc +size 324662984 diff --git a/checkpoint-28900/training_args.bin b/checkpoint-28900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-28900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-2900/config.json b/checkpoint-2900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-2900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-2900/model.safetensors b/checkpoint-2900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c3bc3d341519b1ee3b619e1e3e7d5b67ed61d08c --- /dev/null +++ b/checkpoint-2900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d1ae42fbfdf13df6357152b656b8bf345cb4c08f8ea7c30df106bb10b4d2aa8 +size 324662984 diff --git a/checkpoint-2900/training_args.bin b/checkpoint-2900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-2900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29000/config.json b/checkpoint-29000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} 
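(Aside, not part of the diff: the config.json files added above all describe the same GPTNeoX architecture saved in bfloat16, so each checkpoint-* folder is a self-contained model directory. Below is a minimal sketch of how one of them could be loaded with Hugging Face transformers; the local path `checkpoint-29000` and the choice to pull the tokenizer from the base repo named in `_name_or_path` are assumptions for illustration, not something these files specify.)

```python
# Sketch: load one checkpoint folder from this repo with transformers.
# Assumes the folder has been downloaded locally; tokenizer source is an assumption,
# since the checkpoint folders shown in the diff contain no tokenizer files.
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

checkpoint_dir = "checkpoint-29000"  # hypothetical local path to one checkpoint folder

config = AutoConfig.from_pretrained(checkpoint_dir)
# Sanity checks against the fields visible in config.json above.
assert config.model_type == "gpt_neox"
assert config.vocab_size == 50304

model = AutoModelForCausalLM.from_pretrained(
    checkpoint_dir,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
)
model.eval()

# Assumption: tokenizer comes from the base model named in "_name_or_path".
tokenizer = AutoTokenizer.from_pretrained("georgeyw/gpt-2-small-init-seed-5")
```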
diff --git a/checkpoint-29000/model.safetensors b/checkpoint-29000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d91ca30d0e436670364c52492e173ccb71c253ab --- /dev/null +++ b/checkpoint-29000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aeb3c28c07f8ce84027f1f9e35277d05c17bfa9f114b1f474e8cc39ef52ec99 +size 324662984 diff --git a/checkpoint-29000/training_args.bin b/checkpoint-29000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29100/config.json b/checkpoint-29100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-29100/model.safetensors b/checkpoint-29100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9f0707b13aad31f3c9c606783cd88aeb9c2f58d --- /dev/null +++ b/checkpoint-29100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426c27df4826aecc001ba3b860044bc574e8387e87c50fae37e47470562656c7 +size 324662984 diff --git a/checkpoint-29100/training_args.bin b/checkpoint-29100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29200/config.json b/checkpoint-29200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-29200/model.safetensors b/checkpoint-29200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9c95f51e993c28807633216ed21fa819bf2d51e5 --- /dev/null +++ b/checkpoint-29200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86e6fefb67e8873624190da0ab43811ac20cc480c7f3c9693dc2bd9745ae66b4 +size 324662984 diff --git a/checkpoint-29200/training_args.bin b/checkpoint-29200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29300/config.json b/checkpoint-29300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-29300/model.safetensors b/checkpoint-29300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29b63ede08a9f9b6c4319544875f1b655a9d1373 --- /dev/null +++ b/checkpoint-29300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13723d5459f5568233b59f8efa65d12673a5f996575914adc834d5a6aad282eb +size 324662984 diff --git a/checkpoint-29300/training_args.bin b/checkpoint-29300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29400/config.json b/checkpoint-29400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-29400/model.safetensors b/checkpoint-29400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d9b262a326d99599272c9d584dae148ccc106df2 --- /dev/null +++ b/checkpoint-29400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b2388a5583b4f9923b520f57aeaa0e68c60720f455d794aef8466a4636329f1 +size 324662984 diff --git a/checkpoint-29400/training_args.bin b/checkpoint-29400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29500/config.json b/checkpoint-29500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-29500/model.safetensors b/checkpoint-29500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5480ed2dfd070849fe8bfbb61ede2df39d23f636 --- /dev/null +++ b/checkpoint-29500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c053a337706eeb143c949b0459d242bd4b5748f0ab959085bb182ee6601606d +size 324662984 diff --git a/checkpoint-29500/training_args.bin b/checkpoint-29500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29600/config.json b/checkpoint-29600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-29600/model.safetensors b/checkpoint-29600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9bd374403403f5508db21eb0e05403eef454463e --- /dev/null +++ b/checkpoint-29600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:242650da13ea31d36070fc0d8281212a0b058cd9d30722ff0e85a970f1385fe5 +size 324662984 diff --git a/checkpoint-29600/training_args.bin b/checkpoint-29600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29700/config.json b/checkpoint-29700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-29700/model.safetensors b/checkpoint-29700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7d54352a532cc6701f432776ce9086dce35371d --- /dev/null +++ b/checkpoint-29700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:209645d70a569e9900416496314c2411ef95d38cc0fbc70ffbf5208ef4ec1e46 +size 324662984 diff --git a/checkpoint-29700/training_args.bin b/checkpoint-29700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29800/config.json b/checkpoint-29800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-29800/model.safetensors b/checkpoint-29800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d5d536859d4d819ebdde8811846a41bf2325c5ca --- /dev/null +++ b/checkpoint-29800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb4ae3d27442e39f7bdb4bf136eed151f3acd747c30958e3f96cf52a81b6331d +size 324662984 diff --git a/checkpoint-29800/training_args.bin b/checkpoint-29800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-29900/config.json b/checkpoint-29900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-29900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-29900/model.safetensors b/checkpoint-29900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..379f54a9618054f79b1d92a25d2d0c43a4316398 --- /dev/null +++ b/checkpoint-29900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:314fb036f6275f1089a80c2cea55caba8aefcbfeac7b850036509cd75a99e164 +size 324662984 diff --git a/checkpoint-29900/training_args.bin b/checkpoint-29900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-29900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-300/config.json b/checkpoint-300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, 
+ "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-300/model.safetensors b/checkpoint-300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f21876866beab510325fc3b0514ada6c7813cfa9 --- /dev/null +++ b/checkpoint-300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36f18c740a272ca7e32993e375d9b59d0a23be68c1052f6cb99f88c68dd8e9d9 +size 324662984 diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3000/config.json b/checkpoint-3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-3000/model.safetensors b/checkpoint-3000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..222d9ac48450f4b7fc76584d6ee3f33e7e9b253f --- /dev/null +++ b/checkpoint-3000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e84d2d823cee440d2394979a4706de530de262c547ad3d748087f62f4f02951 +size 324662984 diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30000/config.json b/checkpoint-30000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-30000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + 
"GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30000/model.safetensors b/checkpoint-30000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1be520f997f0521b493fca1ddb79b48062e56364 --- /dev/null +++ b/checkpoint-30000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2edbfae7da47103d7718cd2b0cdc99a72966c5ca54b4c7c1e0cfbd66320237fb +size 324662984 diff --git a/checkpoint-30000/training_args.bin b/checkpoint-30000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30100/config.json b/checkpoint-30100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-30100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30100/model.safetensors b/checkpoint-30100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6558c4d3aa0983a7e27ce1e7e47ff772e4b96bd5 --- /dev/null +++ b/checkpoint-30100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd758f5ba870f01d58dce055609efedc917a74865a9f2645dffccd612d87ba2 +size 324662984 diff --git a/checkpoint-30100/training_args.bin b/checkpoint-30100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30200/config.json b/checkpoint-30200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ 
b/checkpoint-30200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30200/model.safetensors b/checkpoint-30200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..61fadeb079cac7936711ed5ffeaea6b8656c3a22 --- /dev/null +++ b/checkpoint-30200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96df9ad657ac672c43d0a090b924fb054e1283292ffc48842f277b355fd791c2 +size 324662984 diff --git a/checkpoint-30200/training_args.bin b/checkpoint-30200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30300/config.json b/checkpoint-30300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-30300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30300/model.safetensors b/checkpoint-30300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7a3b45ab82c0e2db219868a2ce4d5b27cb0134a4 --- /dev/null +++ b/checkpoint-30300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a71ac9b7130e016c19e9af15c1336a5295c35331eed77ebe9f7007ce5b6da523 +size 324662984 diff --git a/checkpoint-30300/training_args.bin b/checkpoint-30300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30400/config.json b/checkpoint-30400/config.json new 
file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-30400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30400/model.safetensors b/checkpoint-30400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..35a98c0a0b336bcab91115d127a634b6f6664f2a --- /dev/null +++ b/checkpoint-30400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:916ec83d89343366c94a35f85c2b44f4cf4bf75ac2ff6aabf8b8f19cb79fd2cb +size 324662984 diff --git a/checkpoint-30400/training_args.bin b/checkpoint-30400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30500/config.json b/checkpoint-30500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-30500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30500/model.safetensors b/checkpoint-30500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dff56c22b3054c6624b29e46037596a51604443f --- /dev/null +++ b/checkpoint-30500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0f3ee917b01b2c74b7284b22456befa71aa0c8077c3ca1d270cb20f46e59a0a +size 324662984 diff --git a/checkpoint-30500/training_args.bin b/checkpoint-30500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30600/config.json b/checkpoint-30600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-30600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30600/model.safetensors b/checkpoint-30600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8edb9eb5aa9092b0cdbd36dd478f33cb2265321b --- /dev/null +++ b/checkpoint-30600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f59337ba47c78c37f00d1e551be678f802efd37169c859469780522b3fd01964 +size 324662984 diff --git a/checkpoint-30600/training_args.bin b/checkpoint-30600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30700/config.json b/checkpoint-30700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-30700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30700/model.safetensors b/checkpoint-30700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3872cd88240ea45cf0ac3a8be329248774799797 --- /dev/null +++ b/checkpoint-30700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ee9ffb46dbe2d80a55ee07b0c1de2b1e323d52616c26e74f51ce7ad4bb232d +size 324662984 diff --git a/checkpoint-30700/training_args.bin b/checkpoint-30700/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30800/config.json b/checkpoint-30800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-30800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30800/model.safetensors b/checkpoint-30800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..45fdbe7a4cbaa22d861d37a464c8c77f122c6335 --- /dev/null +++ b/checkpoint-30800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ac25c31031a12a93c0b44a2c5076bed560813ea5febc7c2f7d4c39b5cc0053d +size 324662984 diff --git a/checkpoint-30800/training_args.bin b/checkpoint-30800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-30900/config.json b/checkpoint-30900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-30900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-30900/model.safetensors b/checkpoint-30900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ffc8a13b1c63ef232ad201c8c58c52870dec5638 --- /dev/null +++ b/checkpoint-30900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066fbf02eb9955cea370752525572bee1d3cd0c4bc316de1033c3c3d840cd7b1 +size 
324662984 diff --git a/checkpoint-30900/training_args.bin b/checkpoint-30900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-30900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3100/config.json b/checkpoint-3100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-3100/model.safetensors b/checkpoint-3100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5fc020504599348da927bdc844c68b152d2b2bd --- /dev/null +++ b/checkpoint-3100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6264ef9db178dcb3f8f02963253d01dfeb1a11c9ba2e100441f6bf0ca08aa0 +size 324662984 diff --git a/checkpoint-3100/training_args.bin b/checkpoint-3100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-3100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31000/config.json b/checkpoint-31000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-31000/model.safetensors b/checkpoint-31000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9090f5dcd4be8ea0e064d8fbf9aab69526d25b68 --- /dev/null +++ b/checkpoint-31000/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:796352667718ac000171d935f59b7923abef065e8adcfb655d1a3880b90c864e +size 324662984 diff --git a/checkpoint-31000/training_args.bin b/checkpoint-31000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31100/config.json b/checkpoint-31100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-31100/model.safetensors b/checkpoint-31100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d5d58a8bccb329bbfb720d7822d3b74e513b2ace --- /dev/null +++ b/checkpoint-31100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d40e15b4a83d5a70a593539c936e15f0c5e869053dba85fd6230a31cda08fcc1 +size 324662984 diff --git a/checkpoint-31100/training_args.bin b/checkpoint-31100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31200/config.json b/checkpoint-31200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-31200/model.safetensors b/checkpoint-31200/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..562309d0d2833b484f973da9774b575f0fa5ecf1 --- /dev/null +++ b/checkpoint-31200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:132c069983b17ecf394f01cb51e3b4924f8510660326ac5a148cacf366a078a1 +size 324662984 diff --git a/checkpoint-31200/training_args.bin b/checkpoint-31200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31300/config.json b/checkpoint-31300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-31300/model.safetensors b/checkpoint-31300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0443ee4caa7e6983d2e6d132f6fd01777bbab713 --- /dev/null +++ b/checkpoint-31300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:570c8b9244ac01655d034fbdb8d2063b801ae68e8b182d2907b3dc8682d98900 +size 324662984 diff --git a/checkpoint-31300/training_args.bin b/checkpoint-31300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31400/config.json b/checkpoint-31400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-31400/model.safetensors b/checkpoint-31400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..675523d777490391f3c634721129e47f7485fd8e --- /dev/null +++ b/checkpoint-31400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3410b3f3edbc4dc8dbd68bf7f2f7dde82b8f26685e05c2cdfc25646a981cc7e +size 324662984 diff --git a/checkpoint-31400/training_args.bin b/checkpoint-31400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31500/config.json b/checkpoint-31500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-31500/model.safetensors b/checkpoint-31500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b60cdbf0a531f3c139d07e21bf1e627c4120b71f --- /dev/null +++ b/checkpoint-31500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:435272bd1611e5bf70aaf85a9da7fb238289c56fbf1d10378baba7ba5d9ef32f +size 324662984 diff --git a/checkpoint-31500/training_args.bin b/checkpoint-31500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31600/config.json b/checkpoint-31600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-31600/model.safetensors b/checkpoint-31600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..00ac628bee5eecca57a81ddfa5edfbb5c0bf700f --- /dev/null +++ b/checkpoint-31600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baade67c90fd6af85bafe14452857e027535cd5fea8090990693ba1fe1a01584 +size 324662984 diff --git a/checkpoint-31600/training_args.bin b/checkpoint-31600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31700/config.json b/checkpoint-31700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-31700/model.safetensors b/checkpoint-31700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2dd7accd84a4006897d2271eac1a8324bcfb3896 --- /dev/null +++ b/checkpoint-31700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f3325b41e377f5edbde9386cc0315f486fabe10029306f20f74a82a2d577b07 +size 324662984 diff --git a/checkpoint-31700/training_args.bin b/checkpoint-31700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31800/config.json b/checkpoint-31800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-31800/model.safetensors b/checkpoint-31800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1ce930d4e5e9f3712ba1371ec31b721876737619 --- /dev/null +++ b/checkpoint-31800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41e47ecab26e5bfa1d1d51516aa5b5125d2081e05fedf245427df1b5bfb79d3d +size 324662984 diff --git a/checkpoint-31800/training_args.bin b/checkpoint-31800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-31900/config.json b/checkpoint-31900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-31900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-31900/model.safetensors b/checkpoint-31900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7eb813ff301a5ceef89781b2a8be9750c2e36dde --- /dev/null +++ b/checkpoint-31900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb6d5c706b76e79013283c452a8a2c1b821fceb67fab0fcf889873e6323da674 +size 324662984 diff --git a/checkpoint-31900/training_args.bin b/checkpoint-31900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-31900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3200/config.json b/checkpoint-3200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + 
"layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-3200/model.safetensors b/checkpoint-3200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a28967ab37641c82fa4bfcaffda62a434522db92 --- /dev/null +++ b/checkpoint-3200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e917151a35aa6b6cfc99d8a98ae42a32ef31ef0e3a635a30bbadf6ad3ecd1ebf +size 324662984 diff --git a/checkpoint-3200/training_args.bin b/checkpoint-3200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-3200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32000/config.json b/checkpoint-32000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32000/model.safetensors b/checkpoint-32000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa06c61d855f0f15f32295b34056c38623d5c48c --- /dev/null +++ b/checkpoint-32000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1ecb8a424cc802030ca5f7bdadb22d1db9a1158acb52ab5aef6e0d23c40a7d6 +size 324662984 diff --git a/checkpoint-32000/training_args.bin b/checkpoint-32000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32100/config.json b/checkpoint-32100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + 
"hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32100/model.safetensors b/checkpoint-32100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e68578adae68f90600f3dd5a44c2be3465bbf370 --- /dev/null +++ b/checkpoint-32100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23cda3280c49c10a2911439dd2915a51d150dfe02cca161ba20c5b14c21508c4 +size 324662984 diff --git a/checkpoint-32100/training_args.bin b/checkpoint-32100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32200/config.json b/checkpoint-32200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32200/model.safetensors b/checkpoint-32200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..32aa51925746247c418489e6a5ad0ad4b729f7b7 --- /dev/null +++ b/checkpoint-32200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0855d92f3c687b8e630f47ba8c2bd42aaf8be48cfc3a53a5767b96e66af6928c +size 324662984 diff --git a/checkpoint-32200/training_args.bin b/checkpoint-32200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32300/config.json b/checkpoint-32300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + 
"attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32300/model.safetensors b/checkpoint-32300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..78add577474a1802b0e1000397b12e8dca57955b --- /dev/null +++ b/checkpoint-32300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8385b9e61b0c43ded9bfe58c8f7ad0c1918c8cb10736ee28d3b5c06a4472884 +size 324662984 diff --git a/checkpoint-32300/training_args.bin b/checkpoint-32300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32400/config.json b/checkpoint-32400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32400/model.safetensors b/checkpoint-32400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ad84a01b4ff69c66fc85fcd672f84709862c3cb6 --- /dev/null +++ b/checkpoint-32400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85c878a7dee9dfe2a1bdd9085cba5952424f705860f0d3e6a98d2210f144e292 +size 324662984 diff --git a/checkpoint-32400/training_args.bin b/checkpoint-32400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32500/config.json b/checkpoint-32500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32500/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32500/model.safetensors b/checkpoint-32500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e8d7d31c41aa3926781ee7610b88c3d226e9a1c5 --- /dev/null +++ b/checkpoint-32500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e329afd47c71c4236d255ff667f4f4449fbe2af4690a0f05b8e047df5fe56d63 +size 324662984 diff --git a/checkpoint-32500/training_args.bin b/checkpoint-32500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32600/config.json b/checkpoint-32600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32600/model.safetensors b/checkpoint-32600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eefe51b28eae65777bd1a1b03aa3381d7c67ca10 --- /dev/null +++ b/checkpoint-32600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a98def87504f1759a466207b17c99b65da8a516ad7a50b2e825e565449f45664 +size 324662984 diff --git a/checkpoint-32600/training_args.bin b/checkpoint-32600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32700/config.json b/checkpoint-32700/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32700/model.safetensors b/checkpoint-32700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..66022be288b056f05a61270a59d0b8c57f62951d --- /dev/null +++ b/checkpoint-32700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9cd4880c815f1b0ecd279a1a9a067b350eaff8bd50e3a4b242e7b588244ffe2 +size 324662984 diff --git a/checkpoint-32700/training_args.bin b/checkpoint-32700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32800/config.json b/checkpoint-32800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32800/model.safetensors b/checkpoint-32800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e2ea87dc7d448197325a8919468a2e36a3bd0d54 --- /dev/null +++ b/checkpoint-32800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c905f9633595c436ebddd22d2c185f516767821bbc5ba0d8cd347887cbec7b85 +size 324662984 diff --git a/checkpoint-32800/training_args.bin b/checkpoint-32800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-32900/config.json b/checkpoint-32900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-32900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-32900/model.safetensors b/checkpoint-32900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8131b27630a0f0360ce74d9626d90155f6ccfbc2 --- /dev/null +++ b/checkpoint-32900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0bed1d92cdcbd30bf6e093ecb9037891e48083472ca2c93cbd1fdac023f427 +size 324662984 diff --git a/checkpoint-32900/training_args.bin b/checkpoint-32900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-32900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3300/config.json b/checkpoint-3300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-3300/model.safetensors b/checkpoint-3300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3456f10316341e298e39be16bd7c8c31431800dc --- /dev/null +++ b/checkpoint-3300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d38fa85ef205d8c92522417ff2ff2900eddc243bc07bc22334da548119bc44ba +size 324662984 diff --git a/checkpoint-3300/training_args.bin b/checkpoint-3300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- 
/dev/null +++ b/checkpoint-3300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33000/config.json b/checkpoint-33000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-33000/model.safetensors b/checkpoint-33000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..15915a4950efe7220b205072f6e18b50b883e5b6 --- /dev/null +++ b/checkpoint-33000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c9e02aea7ae486c7578b945732abe34f15f6881ddb0802f10f255facd5f8722 +size 324662984 diff --git a/checkpoint-33000/training_args.bin b/checkpoint-33000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33100/config.json b/checkpoint-33100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-33100/model.safetensors b/checkpoint-33100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6e5f97936438718a8b32571bc9e26f72aacadc2 --- /dev/null +++ b/checkpoint-33100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e07314a2d8a5715cf25bd5c1dbc8129886435994b0d10b0c1776aaf14a905242 +size 324662984 diff --git a/checkpoint-33100/training_args.bin 
b/checkpoint-33100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33200/config.json b/checkpoint-33200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-33200/model.safetensors b/checkpoint-33200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6b3c397c8ac3dd346ea12f351f0cdfb8499d8b68 --- /dev/null +++ b/checkpoint-33200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc45af5b74c583b1cb84268b6b1ab05b11c80f3e718c36754ea670ee9167f20c +size 324662984 diff --git a/checkpoint-33200/training_args.bin b/checkpoint-33200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33300/config.json b/checkpoint-33300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-33300/model.safetensors b/checkpoint-33300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9967afb9a815263f05127fccb70b1276cde6238d --- /dev/null +++ b/checkpoint-33300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:10ac31a9ab4e522addf9715263eb83b5a6821da10e799287d1ed6cbf5ec6b2e5 +size 324662984 diff --git a/checkpoint-33300/training_args.bin b/checkpoint-33300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33400/config.json b/checkpoint-33400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-33400/model.safetensors b/checkpoint-33400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7836c09bb2993b33387c7790a203194d05520deb --- /dev/null +++ b/checkpoint-33400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ea26b064d0ab83981430fb73af9c57fa42858b8e6d4fcd9fdc5c2d692abd87 +size 324662984 diff --git a/checkpoint-33400/training_args.bin b/checkpoint-33400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33500/config.json b/checkpoint-33500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-33500/model.safetensors b/checkpoint-33500/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..9bd75963bd1b2dca347a3ae0765b951554d593a2 --- /dev/null +++ b/checkpoint-33500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b74c071ac2c4ccd910ba16eee6222bc77af7e00438bec26da3393a2103512f5d +size 324662984 diff --git a/checkpoint-33500/training_args.bin b/checkpoint-33500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33600/config.json b/checkpoint-33600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-33600/model.safetensors b/checkpoint-33600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d95df4e13feddea917e5816d325a62cf0780523e --- /dev/null +++ b/checkpoint-33600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa4de1bfc363ce4cb520e100f5daf069d74986a8026fb465f55808da511da507 +size 324662984 diff --git a/checkpoint-33600/training_args.bin b/checkpoint-33600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33700/config.json b/checkpoint-33700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-33700/model.safetensors b/checkpoint-33700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eba4fd735998a760dd52782f864623ecf5f0de07 --- /dev/null +++ b/checkpoint-33700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92fdc5995ec32d43198bbd888b9d0e1ac083da29bdbc7049900a964fe0b6ae4b +size 324662984 diff --git a/checkpoint-33700/training_args.bin b/checkpoint-33700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33800/config.json b/checkpoint-33800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-33800/model.safetensors b/checkpoint-33800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e88d95df79db6622defd1006e795a6a3a89e1c38 --- /dev/null +++ b/checkpoint-33800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:207f81a03986645d5c180bc369d84a6a1664ed13846e02c617864f20041c2d98 +size 324662984 diff --git a/checkpoint-33800/training_args.bin b/checkpoint-33800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-33900/config.json b/checkpoint-33900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-33900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-33900/model.safetensors b/checkpoint-33900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3fe127d4d0bbf1339ed7bd2ee46f9a96d2f98348 --- /dev/null +++ b/checkpoint-33900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e081297265c9539ca42c3040c6ae809632e119e2e0b7f4bda010bafbf0c4969 +size 324662984 diff --git a/checkpoint-33900/training_args.bin b/checkpoint-33900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-33900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3400/config.json b/checkpoint-3400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-3400/model.safetensors b/checkpoint-3400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bafa7e19a68f4d8d899663d2bc03353b03b30d81 --- /dev/null +++ b/checkpoint-3400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f357877e3ece0799864924e07c86818e4a2bcf91449476f847b20fba725cdc5 +size 324662984 diff --git a/checkpoint-3400/training_args.bin b/checkpoint-3400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-3400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34000/config.json b/checkpoint-34000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 
12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34000/model.safetensors b/checkpoint-34000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5cdddb2c5047a70152a0a19f6567c98f33e29834 --- /dev/null +++ b/checkpoint-34000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82bc7b641491f7d3d23d74710a9a14b385749b6994f0aa5629d96f9142c8047 +size 324662984 diff --git a/checkpoint-34000/training_args.bin b/checkpoint-34000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34100/config.json b/checkpoint-34100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34100/model.safetensors b/checkpoint-34100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d97225c6f404c9a1a39cecc0321476a23599976 --- /dev/null +++ b/checkpoint-34100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a0c8c3f91db01f9a73c2655edbd5e41fbaf9be10df85fae9ce115833a48f2e1 +size 324662984 diff --git a/checkpoint-34100/training_args.bin b/checkpoint-34100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34200/config.json b/checkpoint-34200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 
1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34200/model.safetensors b/checkpoint-34200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d5a9829af6b0acbb3683ae05b3d067904f56a165 --- /dev/null +++ b/checkpoint-34200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97245b26da471ff9a4353ceb3aaa0fddff1587ef8683bfc3384939f312381af4 +size 324662984 diff --git a/checkpoint-34200/training_args.bin b/checkpoint-34200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34300/config.json b/checkpoint-34300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34300/model.safetensors b/checkpoint-34300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..58dc7816488d65fd3bf1bd33f8df98d10d9909e3 --- /dev/null +++ b/checkpoint-34300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40dc9d7796c8f32847e2e46c1b083d04627e0c68544edee518278fcdd986b762 +size 324662984 diff --git a/checkpoint-34300/training_args.bin b/checkpoint-34300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34400/config.json b/checkpoint-34400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + 
"hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34400/model.safetensors b/checkpoint-34400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99ebc3d444337e6efdda808cd767a15b4b52e97c --- /dev/null +++ b/checkpoint-34400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3129e11b6344af8126a4599eaa2b9ddba1b4236ec4f3c27eaa5d27a60316a2f +size 324662984 diff --git a/checkpoint-34400/training_args.bin b/checkpoint-34400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34500/config.json b/checkpoint-34500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34500/model.safetensors b/checkpoint-34500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..621dc6012a251efafdba4acf3f6052d0ff8eb330 --- /dev/null +++ b/checkpoint-34500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c971a024686562dba27bf88e4128eced27525a78b4f2b60cca1ab431fb6220b +size 324662984 diff --git a/checkpoint-34500/training_args.bin b/checkpoint-34500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34600/config.json b/checkpoint-34600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, 
+ "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34600/model.safetensors b/checkpoint-34600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3f9a3eb4d4caea7227fcd2be17887859bc67b529 --- /dev/null +++ b/checkpoint-34600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:867ce4cc442f729cb07f036e6d3f14b46bf4ee1da2df53681076755c52b11ab4 +size 324662984 diff --git a/checkpoint-34600/training_args.bin b/checkpoint-34600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34700/config.json b/checkpoint-34700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34700/model.safetensors b/checkpoint-34700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..11d0398c40f76f961966bbef521e5db8998c4c73 --- /dev/null +++ b/checkpoint-34700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:234521e89202b0f95397a85398a944cfe18ba8dbda1aaf6b6c8f877dd1876b9f +size 324662984 diff --git a/checkpoint-34700/training_args.bin b/checkpoint-34700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34800/config.json b/checkpoint-34800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34800/model.safetensors b/checkpoint-34800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..49052ce23f32bb9adb9fd72a414afe0b5baea476 --- /dev/null +++ b/checkpoint-34800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4e1060a44ad4c0caaa56dc79135a2682dd49bfcb09e2a4fc6101ea6e9f864b +size 324662984 diff --git a/checkpoint-34800/training_args.bin b/checkpoint-34800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-34900/config.json b/checkpoint-34900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-34900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-34900/model.safetensors b/checkpoint-34900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59c399fffacda704784ce1b9bcab8a9910d56e1a --- /dev/null +++ b/checkpoint-34900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0685cb28b38b77fcb14da37fcb3ea1d5b0016c5266c7f34ae480570ee4ac454e +size 324662984 diff --git a/checkpoint-34900/training_args.bin b/checkpoint-34900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-34900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3500/config.json b/checkpoint-3500/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-3500/model.safetensors b/checkpoint-3500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b879fd6d48fce9abf6662f5c7d6b8db88789e49 --- /dev/null +++ b/checkpoint-3500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:928db51ce3db276a09f627b57b0bd2d01fe8ea9d5dbd53de1c2c689a354e03cf +size 324662984 diff --git a/checkpoint-3500/training_args.bin b/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35000/config.json b/checkpoint-35000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35000/model.safetensors b/checkpoint-35000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e98edb65fb55b35775684ad5db2c22bc0c43bc10 --- /dev/null +++ b/checkpoint-35000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa1aa3231f39982c7b621c3db7ec8716901fdc740590c0ca736f1410e21b6ab7 +size 324662984 diff --git a/checkpoint-35000/training_args.bin b/checkpoint-35000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35100/config.json b/checkpoint-35100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35100/model.safetensors b/checkpoint-35100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..64a7f168c3b6a7f62db64fe6f5bfe31e4addc54f --- /dev/null +++ b/checkpoint-35100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ed9f2aecca95f588c694a620a612c66af861790c442bcc5208872095ea38afb +size 324662984 diff --git a/checkpoint-35100/training_args.bin b/checkpoint-35100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35200/config.json b/checkpoint-35200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35200/model.safetensors b/checkpoint-35200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a95e3a49e57ada380f4dfa85f8e0915051d35da2 --- /dev/null +++ b/checkpoint-35200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a87d90de34133dd52af2d2a01f7987a104bc0b445771e8b040b5bdc193f1586 +size 324662984 diff --git a/checkpoint-35200/training_args.bin b/checkpoint-35200/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35300/config.json b/checkpoint-35300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35300/model.safetensors b/checkpoint-35300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..25a40b6b2c311c9fb871c5b9aa3a35e090817862 --- /dev/null +++ b/checkpoint-35300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3927b029b222397da0044af573c49eeb64c1eeec4c5a3c142d9628e617a5e63 +size 324662984 diff --git a/checkpoint-35300/training_args.bin b/checkpoint-35300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35400/config.json b/checkpoint-35400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35400/model.safetensors b/checkpoint-35400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..506f760909531352c10c34848f65d1dfca9c89ff --- /dev/null +++ b/checkpoint-35400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef26307d603b4d85ba896918480fdcbb90fc329753d24d533a2897218e6df0b5 +size 
324662984 diff --git a/checkpoint-35400/training_args.bin b/checkpoint-35400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35500/config.json b/checkpoint-35500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35500/model.safetensors b/checkpoint-35500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..730c969fa9c34df7abd8dad181561a149f893c3b --- /dev/null +++ b/checkpoint-35500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cecf93392f73ddf1fa6b5ecc7d3abd42fa82bfdec976924e098928cfc8950c8d +size 324662984 diff --git a/checkpoint-35500/training_args.bin b/checkpoint-35500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35600/config.json b/checkpoint-35600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35600/model.safetensors b/checkpoint-35600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e9a5790f65014eff64672f902e783cd09347fbf2 --- /dev/null +++ b/checkpoint-35600/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:1925e7a995000e2acebf1680d0c6986474445d3b9c8a4d4229c96e659c80e137 +size 324662984 diff --git a/checkpoint-35600/training_args.bin b/checkpoint-35600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35700/config.json b/checkpoint-35700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35700/model.safetensors b/checkpoint-35700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..947b4614a82109f45c5f05f9e2588ba55356c40c --- /dev/null +++ b/checkpoint-35700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7629a88d00e1e05d524f2a4952804ba25af61e517cc49425aa0fffa198581a93 +size 324662984 diff --git a/checkpoint-35700/training_args.bin b/checkpoint-35700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35800/config.json b/checkpoint-35800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35800/model.safetensors b/checkpoint-35800/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..4253aa1f8e27ba28cb7202f01864017e39a16d20 --- /dev/null +++ b/checkpoint-35800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b67dce529721aec21cfb98e6cf2bb5c641bf9ea9abfe94aa1e9a03788322e89 +size 324662984 diff --git a/checkpoint-35800/training_args.bin b/checkpoint-35800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-35900/config.json b/checkpoint-35900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-35900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-35900/model.safetensors b/checkpoint-35900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e601ffa935fd7d7fd61999e82f62f65783dd4142 --- /dev/null +++ b/checkpoint-35900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9a5d6def3e6f9852971b06aff4e43ec505646d67d31e9702d6d0588049598d7 +size 324662984 diff --git a/checkpoint-35900/training_args.bin b/checkpoint-35900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-35900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3600/config.json b/checkpoint-3600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-3600/model.safetensors b/checkpoint-3600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ace042464fb9ed8788e0366f43a56ba582648273 --- /dev/null +++ b/checkpoint-3600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:819af980a103ba40116bd94997ef95897422e7f2b323ba862d19885f338dfbe9 +size 324662984 diff --git a/checkpoint-3600/training_args.bin b/checkpoint-3600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-3600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36000/config.json b/checkpoint-36000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36000/model.safetensors b/checkpoint-36000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92042e5b6c0e9fda044cd2bc62292a12bb373fcd --- /dev/null +++ b/checkpoint-36000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:397cf705877d21c6c59852839c9b86b1434196a62ee850b160ab35e32f1d9cd1 +size 324662984 diff --git a/checkpoint-36000/training_args.bin b/checkpoint-36000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36100/config.json b/checkpoint-36100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36100/model.safetensors b/checkpoint-36100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2cac48c501cfcbff4ed45249c9f1f36a2e5e7d65 --- /dev/null +++ b/checkpoint-36100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af04335a4aff16141c1ae0747aa03d23333c62e77d1feed7e6632e072af14de0 +size 324662984 diff --git a/checkpoint-36100/training_args.bin b/checkpoint-36100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36200/config.json b/checkpoint-36200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36200/model.safetensors b/checkpoint-36200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..41e8e7636799c715a28c043e51a898e43cfb9e73 --- /dev/null +++ b/checkpoint-36200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ff0c03b22b5e3c8794a11dbc4aab50f5389c213553244aa84e73179304e375 +size 324662984 diff --git a/checkpoint-36200/training_args.bin b/checkpoint-36200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36300/config.json b/checkpoint-36300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36300/model.safetensors b/checkpoint-36300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ab2a2e9e7833cc0f9d919804e02ee60113c6cc6 --- /dev/null +++ b/checkpoint-36300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd445573ca7f9d246972b31c1f513dde47263830d844f547f9990081471862a9 +size 324662984 diff --git a/checkpoint-36300/training_args.bin b/checkpoint-36300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36400/config.json b/checkpoint-36400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36400/model.safetensors b/checkpoint-36400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ced31657cf730d2b8db7b46a76de4e29a97f8b2 --- /dev/null +++ b/checkpoint-36400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790ae6ce9c9e066815d5e0ea609d9396959234e1b5439aa0439b5a90df841105 +size 324662984 diff --git a/checkpoint-36400/training_args.bin b/checkpoint-36400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36500/config.json b/checkpoint-36500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36500/model.safetensors b/checkpoint-36500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eca6c3f22e073eeea00d86c5008b49291640e6d3 --- /dev/null +++ b/checkpoint-36500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b94eadece2afc9f0b0cae594903145c02d6bb5401f5c32cea6aaca2f98ebc6fe +size 324662984 diff --git a/checkpoint-36500/training_args.bin b/checkpoint-36500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36600/config.json b/checkpoint-36600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36600/model.safetensors b/checkpoint-36600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cd64f77ccf683898cc51203d2cbb817c20dd2d89 --- /dev/null +++ b/checkpoint-36600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b48a6f8c8756c6c3f6c42a20e18f0646a10f7f342fe13e051e6aeabb970eb43 +size 324662984 diff --git a/checkpoint-36600/training_args.bin b/checkpoint-36600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36700/config.json b/checkpoint-36700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36700/model.safetensors b/checkpoint-36700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e463b3c69ea8a96754229970076dd7a2781c6bef --- /dev/null +++ b/checkpoint-36700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75f49a51d35205c688f24afeeb99700426431ec2ba171341e875f3edc6591695 +size 324662984 diff --git a/checkpoint-36700/training_args.bin b/checkpoint-36700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36800/config.json b/checkpoint-36800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36800/model.safetensors b/checkpoint-36800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6899ca8f252ac8e9ebfb8d15a3cff36dd62bd17b --- /dev/null +++ b/checkpoint-36800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea65665ae2ee4a81fb2629f74ee6a9a2b9b567d71c9b7f125fdbfeea6240a6fc +size 324662984 diff --git a/checkpoint-36800/training_args.bin b/checkpoint-36800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-36900/config.json b/checkpoint-36900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-36900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + 
"bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-36900/model.safetensors b/checkpoint-36900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7cffd3920251de52838df2b59c79557cbf14827c --- /dev/null +++ b/checkpoint-36900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78f9f50b16ed089e0c875c86829a26071fbce76a64c07fa8e7a2dd68bc445e65 +size 324662984 diff --git a/checkpoint-36900/training_args.bin b/checkpoint-36900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-36900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3700/config.json b/checkpoint-3700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-3700/model.safetensors b/checkpoint-3700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0a69c1329adfc3fa4fe01754b44ffc95fe5ffb13 --- /dev/null +++ b/checkpoint-3700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc177c7c1082c020829b15b724c7ce85b3fbc7967ebf872df026f4ea25416f6f +size 324662984 diff --git a/checkpoint-3700/training_args.bin b/checkpoint-3700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-3700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37000/config.json b/checkpoint-37000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37000/model.safetensors b/checkpoint-37000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c84bcdf815f93cce8f46f36e0df91965ce9e78be --- /dev/null +++ b/checkpoint-37000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f17143ce3c9673b322fac491c522f54fb6fbfe9da7d34fd85203a0df8bb6b2ec +size 324662984 diff --git a/checkpoint-37000/training_args.bin b/checkpoint-37000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37100/config.json b/checkpoint-37100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37100/model.safetensors b/checkpoint-37100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c65548b369623f62412ca4a87daae0096988160c --- /dev/null +++ b/checkpoint-37100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f58897245d45fd0d8676356f00ee804f0e80993bebcc7fea5c87b76f2e37ff +size 324662984 diff --git a/checkpoint-37100/training_args.bin b/checkpoint-37100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37200/config.json b/checkpoint-37200/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37200/model.safetensors b/checkpoint-37200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7e0d8715c6ba9ea6b018046e7ae0adf531ff42a0 --- /dev/null +++ b/checkpoint-37200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:620d35de0531433f32ec6caf2a89c55815b1742823a87e01df0058e4b9c9739f +size 324662984 diff --git a/checkpoint-37200/training_args.bin b/checkpoint-37200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37300/config.json b/checkpoint-37300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37300/model.safetensors b/checkpoint-37300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eb945c8555e49168192e3eab10fa2a66e9e0ccb9 --- /dev/null +++ b/checkpoint-37300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59ae58c1107e1c55dd587cf4e76d8001c765513bf53b1779b3990e1ede25785 +size 324662984 diff --git a/checkpoint-37300/training_args.bin b/checkpoint-37300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37400/config.json b/checkpoint-37400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37400/model.safetensors b/checkpoint-37400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac2aaa32e93339b74f54b56593ffcb3e12f66afa --- /dev/null +++ b/checkpoint-37400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce5324c182eb224b035d885e4c49873910cab1a836386888e818c717e690ad5c +size 324662984 diff --git a/checkpoint-37400/training_args.bin b/checkpoint-37400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37500/config.json b/checkpoint-37500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37500/model.safetensors b/checkpoint-37500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cab0b9c6a2f0204fe0c1ecef9066c0e0c9afbf4a --- /dev/null +++ b/checkpoint-37500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3142b2054a7f792bd9e88da7d863044735a42610567188ed27830d2f37fee2b9 +size 324662984 diff --git a/checkpoint-37500/training_args.bin b/checkpoint-37500/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37600/config.json b/checkpoint-37600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37600/model.safetensors b/checkpoint-37600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e1212a32d074bf5333ad18f77d4c573c49aafb4a --- /dev/null +++ b/checkpoint-37600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d87d1fdb176b8adc75a584e893e9a12e36359eb9a9be105341990e84a8e1c71e +size 324662984 diff --git a/checkpoint-37600/training_args.bin b/checkpoint-37600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37700/config.json b/checkpoint-37700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37700/model.safetensors b/checkpoint-37700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..25913df335af77e0651366993b24a431ec9d93a3 --- /dev/null +++ b/checkpoint-37700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4859be93ed809850ce963629f4b457e9b496976be0c59f04510ccb523ac9cd4 +size 
324662984 diff --git a/checkpoint-37700/training_args.bin b/checkpoint-37700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37800/config.json b/checkpoint-37800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37800/model.safetensors b/checkpoint-37800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..69c21b68ea1cd01d6e5be6a26081c788010f6c0e --- /dev/null +++ b/checkpoint-37800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00d5afc4efb4adc584f629358eeebe4711ece5202a5ab3715af4cf38f62aa3f0 +size 324662984 diff --git a/checkpoint-37800/training_args.bin b/checkpoint-37800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-37900/config.json b/checkpoint-37900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-37900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-37900/model.safetensors b/checkpoint-37900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..719f0e5ccc437db142b916f5b095acb9b89b3251 --- /dev/null +++ b/checkpoint-37900/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:6e5991343c362bac699b67ceb44d07a8f9c698f9294f831e2089582213dece98 +size 324662984 diff --git a/checkpoint-37900/training_args.bin b/checkpoint-37900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-37900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3800/config.json b/checkpoint-3800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-3800/model.safetensors b/checkpoint-3800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f61e4b775ee651580ebd2a04ea878c184fc9c18d --- /dev/null +++ b/checkpoint-3800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e385b5123488b4d641e598217fdc31cf9ed656803aae1b47dc98a3b319397fe +size 324662984 diff --git a/checkpoint-3800/training_args.bin b/checkpoint-3800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-3800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38000/config.json b/checkpoint-38000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-38000/model.safetensors b/checkpoint-38000/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..f35e5d7fd57dc00b1af1938af4d6621183bb0499 --- /dev/null +++ b/checkpoint-38000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01007bf7dcd9dbb129456554ffa71cb212fd194117af5b746b7b8e2da9950dde +size 324662984 diff --git a/checkpoint-38000/training_args.bin b/checkpoint-38000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38100/config.json b/checkpoint-38100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-38100/model.safetensors b/checkpoint-38100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..73227d60528812b21510b7f4b87a814585687d8b --- /dev/null +++ b/checkpoint-38100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a60537fe35c019312be82342b521f4222b45658d96728b65362c58c80e191b6 +size 324662984 diff --git a/checkpoint-38100/training_args.bin b/checkpoint-38100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38200/config.json b/checkpoint-38200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-38200/model.safetensors b/checkpoint-38200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6845e04d2e7095dea1ab9a8a8bc663f46dd3fcf9 --- /dev/null +++ b/checkpoint-38200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3870b73b8d0d47ed3a312faae6998dc989a70ce3964e9db0d7027af9257df169 +size 324662984 diff --git a/checkpoint-38200/training_args.bin b/checkpoint-38200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38300/config.json b/checkpoint-38300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-38300/model.safetensors b/checkpoint-38300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f40ecde7142973bf57d5e5ab587c20705e266bf4 --- /dev/null +++ b/checkpoint-38300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32aa32676c99f0fd3749c06dcb6060f0d6be88f19f3b3bd6a01a9bae68dcc7c2 +size 324662984 diff --git a/checkpoint-38300/training_args.bin b/checkpoint-38300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38400/config.json b/checkpoint-38400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-38400/model.safetensors b/checkpoint-38400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..da4414c955f957c49c00ce60081f130cd35768ab --- /dev/null +++ b/checkpoint-38400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e026cf9cf0916d6bd37fc49ea9b34860211c33006bf5a2a9d2a735ab2e5f5228 +size 324662984 diff --git a/checkpoint-38400/training_args.bin b/checkpoint-38400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38500/config.json b/checkpoint-38500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-38500/model.safetensors b/checkpoint-38500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..55451e1fc3163620d293148b30ced76be6970c52 --- /dev/null +++ b/checkpoint-38500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd3cf6fb4b2ee0249d8f0c70ab9fa044f1886adce2fb0da4e1bbb9ddcf33785b +size 324662984 diff --git a/checkpoint-38500/training_args.bin b/checkpoint-38500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38600/config.json b/checkpoint-38600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-38600/model.safetensors b/checkpoint-38600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..00a9940d6a5b50d7b88fdc200e1b8e2c44742375 --- /dev/null +++ b/checkpoint-38600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eb8222e35b90285566ad7a6f9f4def56a8cb1ff9824cf2e87264208808ec191 +size 324662984 diff --git a/checkpoint-38600/training_args.bin b/checkpoint-38600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38700/config.json b/checkpoint-38700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-38700/model.safetensors b/checkpoint-38700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4fa3369cbd2e52a77d8abb4714b36b0f9354e753 --- /dev/null +++ b/checkpoint-38700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe07d2d63909f3043f8c0ccb80b8dd52e239f9e8ec79ccb6b7f930551fa9e86e +size 324662984 diff --git a/checkpoint-38700/training_args.bin b/checkpoint-38700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38800/config.json b/checkpoint-38800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + 
"layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-38800/model.safetensors b/checkpoint-38800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bfd8b27cd5873d48ec2ab680c9064eb17b60b89c --- /dev/null +++ b/checkpoint-38800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f05281808e64de2d9bc21e79cb7fff5af3ab73d7d03383c542ca90d3191ea6 +size 324662984 diff --git a/checkpoint-38800/training_args.bin b/checkpoint-38800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-38900/config.json b/checkpoint-38900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-38900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-38900/model.safetensors b/checkpoint-38900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6804b471d6eb3a5d350670ce56efff916dae9e5f --- /dev/null +++ b/checkpoint-38900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52689055cbac556e23091fd0f1e4d4d8523e1acc886399b04f7ce8b4612662c6 +size 324662984 diff --git a/checkpoint-38900/training_args.bin b/checkpoint-38900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-38900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-3900/config.json b/checkpoint-3900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-3900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + 
"hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-3900/model.safetensors b/checkpoint-3900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93e1c74ad8b570b292f6c76388c0f35366f7586a --- /dev/null +++ b/checkpoint-3900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:611a9f3527977c2b1e1a39b3e7f5e75df8db7af0893d59ef50c9b866cacb4f2f +size 324662984 diff --git a/checkpoint-3900/training_args.bin b/checkpoint-3900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-3900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39000/config.json b/checkpoint-39000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39000/model.safetensors b/checkpoint-39000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dafd610c534be2858a9e8c0f1be19c48c906f2dc --- /dev/null +++ b/checkpoint-39000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:909c6df7b20793ad0d357a660d09c2befc15e0c0c1a30f5cf85d8571a7794259 +size 324662984 diff --git a/checkpoint-39000/training_args.bin b/checkpoint-39000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39100/config.json b/checkpoint-39100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + 
"attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39100/model.safetensors b/checkpoint-39100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..23fd2b09b780b9a96d36e5a30b3bab9ae96a671b --- /dev/null +++ b/checkpoint-39100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e246a01d1877076f5e37752e0b266c3ffd2dc0cb109e94f3035455a4072d8701 +size 324662984 diff --git a/checkpoint-39100/training_args.bin b/checkpoint-39100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39200/config.json b/checkpoint-39200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39200/model.safetensors b/checkpoint-39200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7da86292c44d0eb3e1641859dae38511df4554c --- /dev/null +++ b/checkpoint-39200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c24fd3a3b071aed7121cc048348e7153a15a66dae0c564f8a1f3c454c5ed49a +size 324662984 diff --git a/checkpoint-39200/training_args.bin b/checkpoint-39200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39300/config.json b/checkpoint-39300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39300/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39300/model.safetensors b/checkpoint-39300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4a44dd357d445cb641aaad26f9cc5f7bacdb96d3 --- /dev/null +++ b/checkpoint-39300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68041cf762678e42b7cb6b187637c968dbeaa57276c882a3067482d245e482c6 +size 324662984 diff --git a/checkpoint-39300/training_args.bin b/checkpoint-39300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39400/config.json b/checkpoint-39400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39400/model.safetensors b/checkpoint-39400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f9516294622fa01a36e591a5aae8aea9a867ef7d --- /dev/null +++ b/checkpoint-39400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b63343a6f5a128a03d83288d37da6e0b750d00bee5401446f329986ed49f0918 +size 324662984 diff --git a/checkpoint-39400/training_args.bin b/checkpoint-39400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39500/config.json b/checkpoint-39500/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39500/model.safetensors b/checkpoint-39500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b2ad4ab2139c2511a5ecb93814a644cab282390f --- /dev/null +++ b/checkpoint-39500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:989f1a19a42a0aafe147873a1488c498fb4516c38a79e1a1b0a1bea5172a3a27 +size 324662984 diff --git a/checkpoint-39500/training_args.bin b/checkpoint-39500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39600/config.json b/checkpoint-39600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39600/model.safetensors b/checkpoint-39600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6acf937b01dccbbf5f536310ea84c912fa49a94a --- /dev/null +++ b/checkpoint-39600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbacfe75d34077f41215e93349c9b3995581236ab147497dde89f08aa4173cc6 +size 324662984 diff --git a/checkpoint-39600/training_args.bin b/checkpoint-39600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39700/config.json b/checkpoint-39700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39700/model.safetensors b/checkpoint-39700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a8d0aca6f8cd30fc133de57fafb159e0b536008c --- /dev/null +++ b/checkpoint-39700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c27f07d878469f2a94daa253cd151821ccadbf466eb4421eba56a3ac2fda6da +size 324662984 diff --git a/checkpoint-39700/training_args.bin b/checkpoint-39700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39800/config.json b/checkpoint-39800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39800/model.safetensors b/checkpoint-39800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82b082fb9a4848a29c2c10dcf766726696c29d36 --- /dev/null +++ b/checkpoint-39800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4311cfed1345a4e94a6bcd63405bf7ebe5219e4c3192fa603e9ed26cde89f7f +size 324662984 diff --git a/checkpoint-39800/training_args.bin b/checkpoint-39800/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-39900/config.json b/checkpoint-39900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-39900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-39900/model.safetensors b/checkpoint-39900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9862ecb81a1e5faa5fdf488cc0719a20bed401fb --- /dev/null +++ b/checkpoint-39900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d81d73f68fe63c27636ce88b2b9fb598c41062eb35a61934b2ca4f9056d3b2ce +size 324662984 diff --git a/checkpoint-39900/training_args.bin b/checkpoint-39900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-39900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-400/config.json b/checkpoint-400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-400/model.safetensors b/checkpoint-400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..28f5bad9cd2fdacf6d38b9adbc0cf65cb38647e4 --- /dev/null +++ b/checkpoint-400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c52b1b0659f152c1779a66ada1c70277cd1169830e5defefe2ebb2b81bc1e54 +size 324662984 
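The config.json hunks above are identical across checkpoints and describe the same 12-layer, 768-hidden GPTNeoX model initialized from georgeyw/gpt-2-small-init-seed-5; only the model.safetensors weights differ between checkpoint directories. As a minimal sketch of how one of these checkpoints could be consumed (the repository id below is a placeholder for wherever this repo is hosted, and the tokenizer line assumes the init repo named in "_name_or_path" also hosts a compatible tokenizer, since the checkpoint folders themselves ship no tokenizer files):

```python
import torch
from transformers import AutoTokenizer, GPTNeoXForCausalLM

# Placeholder Hub repository id -- substitute the repository this diff was taken from.
REPO_ID = "your-org/your-checkpoint-repo"

# Each checkpoint-N directory holds a full config.json + model.safetensors,
# so it can be loaded directly as a subfolder of the repo.
model = GPTNeoXForCausalLM.from_pretrained(
    REPO_ID,
    subfolder="checkpoint-400",      # any checkpoint-* directory from the diff
    torch_dtype=torch.bfloat16,      # matches "torch_dtype": "bfloat16" in config.json
)

# Assumption: the init repo named in "_name_or_path" provides the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("georgeyw/gpt-2-small-init-seed-5")

inputs = tokenizer("Hello, world", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)  # torch.Size([1, seq_len, 50304]) per "vocab_size": 50304
```

Note that the model.safetensors entries in this diff are Git LFS pointer files (version / oid sha256 / size), so the actual ~324 MB weight files must be fetched through LFS (for example `git lfs pull`) or the Hub download machinery rather than read from the pointer text itself.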
diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4000/config.json b/checkpoint-4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-4000/model.safetensors b/checkpoint-4000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2c44de29ba372b8a550eb4c23dc822adac2d30ef --- /dev/null +++ b/checkpoint-4000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e65cb1c0967fc40a19d2c60a8cffc75831cf20bab4dbed47c32abb81ea934f9 +size 324662984 diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40000/config.json b/checkpoint-40000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-40000/model.safetensors b/checkpoint-40000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..72f23c7344bcfb77d0dacc1099c9ce37ac24a255 --- /dev/null +++ b/checkpoint-40000/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:fd1e6a80bdfbc0906875353210cc4adebdf1f8745089051206051c1458a0edd7 +size 324662984 diff --git a/checkpoint-40000/training_args.bin b/checkpoint-40000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40100/config.json b/checkpoint-40100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-40100/model.safetensors b/checkpoint-40100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59579f66f13a30daf28ba0aa4d4cd56123953767 --- /dev/null +++ b/checkpoint-40100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d370bffaf536669445dc348516b38b15ab1a9bb5b52e6786a4285ec5af000ed +size 324662984 diff --git a/checkpoint-40100/training_args.bin b/checkpoint-40100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40200/config.json b/checkpoint-40200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-40200/model.safetensors b/checkpoint-40200/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..d947beae527c54be318e63660e97f96a28550e4b --- /dev/null +++ b/checkpoint-40200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fabdb75d645899c9b3c5f156a3341283e5304d49129a8d16a90cccb4d471b97 +size 324662984 diff --git a/checkpoint-40200/training_args.bin b/checkpoint-40200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40300/config.json b/checkpoint-40300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-40300/model.safetensors b/checkpoint-40300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7438ab905400789dd560bbc32cd992995bbe671f --- /dev/null +++ b/checkpoint-40300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07c8c11e50f8c7ec46fc230f024361702d64ac41515b8e1b76254fda1ac6781d +size 324662984 diff --git a/checkpoint-40300/training_args.bin b/checkpoint-40300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40400/config.json b/checkpoint-40400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-40400/model.safetensors b/checkpoint-40400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e22e0343441ec5478c2b28e3f62c95680d19f0a3 --- /dev/null +++ b/checkpoint-40400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f9dbdc05a495f974b15ece6949cfba208508d29500cbf37e51d1bdd5d9c465b +size 324662984 diff --git a/checkpoint-40400/training_args.bin b/checkpoint-40400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40500/config.json b/checkpoint-40500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-40500/model.safetensors b/checkpoint-40500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa77e5072a9c4f67eb5a2e94ec60fd8dc288619b --- /dev/null +++ b/checkpoint-40500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e58238b13e95a8b3fb9e291d5d46e0a76921a1f2cdffc2eabde6fb04a26ab20 +size 324662984 diff --git a/checkpoint-40500/training_args.bin b/checkpoint-40500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40600/config.json b/checkpoint-40600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-40600/model.safetensors b/checkpoint-40600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5a12cdc5d91fe003c42485b349341a4b413960c6 --- /dev/null +++ b/checkpoint-40600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b3c340541a070a8e1e995a4615a1e2cfcdeac634e0f70576b8a9fb074858712 +size 324662984 diff --git a/checkpoint-40600/training_args.bin b/checkpoint-40600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40700/config.json b/checkpoint-40700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-40700/model.safetensors b/checkpoint-40700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..17d88b4592dd96770600973a0f78aba79c5ff78f --- /dev/null +++ b/checkpoint-40700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b9f188709472fa8edbe19c009f1a1d661614ba767eada687e4d4b4c09f664c1 +size 324662984 diff --git a/checkpoint-40700/training_args.bin b/checkpoint-40700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40800/config.json b/checkpoint-40800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-40800/model.safetensors b/checkpoint-40800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9cefdc2f1b2d2dbae546c6022daa253029547c4f --- /dev/null +++ b/checkpoint-40800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77e1c760c2b539df7ab4f6badd6db687ef1decee968132b48ed11370526a6c72 +size 324662984 diff --git a/checkpoint-40800/training_args.bin b/checkpoint-40800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-40900/config.json b/checkpoint-40900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-40900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-40900/model.safetensors b/checkpoint-40900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..31bf1f5eba8a84aa5b68dfb4f04cd1cda3e9f066 --- /dev/null +++ b/checkpoint-40900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3476d4e66366ef1e8fa2dfb66f236b9fc308e8aed2567df0fa801cdcbfec8472 +size 324662984 diff --git a/checkpoint-40900/training_args.bin b/checkpoint-40900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-40900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4100/config.json b/checkpoint-4100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + 
"layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-4100/model.safetensors b/checkpoint-4100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e63fc84343be02330560a2c3550bfc95bd663c80 --- /dev/null +++ b/checkpoint-4100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c59580b4eddbe4a9171c2180e3e722511dbc45a00da9524fd595a87341041fd +size 324662984 diff --git a/checkpoint-4100/training_args.bin b/checkpoint-4100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-4100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41000/config.json b/checkpoint-41000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41000/model.safetensors b/checkpoint-41000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0fb1a1227422d6d9fd999f189782d41cad4de958 --- /dev/null +++ b/checkpoint-41000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95a98719d51584c0911caa850d12ab82c5b2a52d462012c8e46e4912b219b977 +size 324662984 diff --git a/checkpoint-41000/training_args.bin b/checkpoint-41000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41100/config.json b/checkpoint-41100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + 
"hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41100/model.safetensors b/checkpoint-41100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..84571c9323131fcb2122f029af15b28b403956a5 --- /dev/null +++ b/checkpoint-41100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f790c79282d7071760a23d51ba4d138e3cdb18698237368e6c08782f33123901 +size 324662984 diff --git a/checkpoint-41100/training_args.bin b/checkpoint-41100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41200/config.json b/checkpoint-41200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41200/model.safetensors b/checkpoint-41200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cac97c4b031445a13aa3ab2afa37bbf613f7f11b --- /dev/null +++ b/checkpoint-41200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aecd276aeddac323fbea78497bd90305f224409bdb02960ebd31e5b4269b8d6d +size 324662984 diff --git a/checkpoint-41200/training_args.bin b/checkpoint-41200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41300/config.json b/checkpoint-41300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + 
"attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41300/model.safetensors b/checkpoint-41300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..109ca8e78139db05c0d6725c16eb667f4c8b38bf --- /dev/null +++ b/checkpoint-41300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561c3604200209474d36c29f29d3a9e342491f6568d1bfd4fa37a060c9d31bd6 +size 324662984 diff --git a/checkpoint-41300/training_args.bin b/checkpoint-41300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41400/config.json b/checkpoint-41400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41400/model.safetensors b/checkpoint-41400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d76d314e77d3cba57543154135af40ea6ef53449 --- /dev/null +++ b/checkpoint-41400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71af88ebb22208d773244b374772008b31e483829d55f7b15dcc3b251021eafe +size 324662984 diff --git a/checkpoint-41400/training_args.bin b/checkpoint-41400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41500/config.json b/checkpoint-41500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41500/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41500/model.safetensors b/checkpoint-41500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ddd98dc2221d802603af83c9c6b702e2bca8bbea --- /dev/null +++ b/checkpoint-41500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed0dc957536d6f4bb3b2cda0c90aca9545f08aa7bf57895fed5edf3a1f940fb1 +size 324662984 diff --git a/checkpoint-41500/training_args.bin b/checkpoint-41500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41600/config.json b/checkpoint-41600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41600/model.safetensors b/checkpoint-41600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f69cab882908084d796e4bdbe9753fab364c337b --- /dev/null +++ b/checkpoint-41600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6c10837bb802bb57645dc5a225dbd989b6172edc76fa5f7a55fbfccca8c1f1 +size 324662984 diff --git a/checkpoint-41600/training_args.bin b/checkpoint-41600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41700/config.json b/checkpoint-41700/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41700/model.safetensors b/checkpoint-41700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8f261b8707e7bda911d8cd51360c93147b0dedb3 --- /dev/null +++ b/checkpoint-41700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b653c1badc4e06ec0c40ee2954ed4ee51253acffc8c967f5632416651bab68e8 +size 324662984 diff --git a/checkpoint-41700/training_args.bin b/checkpoint-41700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41800/config.json b/checkpoint-41800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41800/model.safetensors b/checkpoint-41800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d45da8eeb05e9973b2043b9a60d036bcd6007159 --- /dev/null +++ b/checkpoint-41800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a3dac918aab5647a23b918ea698f3f1228b213ae077eb084b110986c31ba404 +size 324662984 diff --git a/checkpoint-41800/training_args.bin b/checkpoint-41800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-41900/config.json b/checkpoint-41900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-41900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-41900/model.safetensors b/checkpoint-41900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e118cfa606fc14f046717817baa95ea27daecffb --- /dev/null +++ b/checkpoint-41900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c653f5550d12714fedeeb1d76448e5f23fe3e5c30d0afa6eebff3344f0de341d +size 324662984 diff --git a/checkpoint-41900/training_args.bin b/checkpoint-41900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-41900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4200/config.json b/checkpoint-4200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-4200/model.safetensors b/checkpoint-4200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4a5b3a48d1b7aba82ff47a935fc392071a0d1bcb --- /dev/null +++ b/checkpoint-4200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2acbeb41aabb415eacdb1160a681258b9c21c744a752ffe73acf24bdb0afae76 +size 324662984 diff --git a/checkpoint-4200/training_args.bin b/checkpoint-4200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- 
/dev/null +++ b/checkpoint-4200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42000/config.json b/checkpoint-42000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-42000/model.safetensors b/checkpoint-42000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..77a928dfc1f18dadbfed7fc4f2ce26ef5e254d6f --- /dev/null +++ b/checkpoint-42000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e98c50886804aa54897a3f5d7c73fa94fce46f70a83f31c7650ec41e5ac6bbe +size 324662984 diff --git a/checkpoint-42000/training_args.bin b/checkpoint-42000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42100/config.json b/checkpoint-42100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-42100/model.safetensors b/checkpoint-42100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f4d310d46b908f7fb2e9fd6dabfb7eb00fd3778b --- /dev/null +++ b/checkpoint-42100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5451bfebbcf1b6066d86f907eaa20a9e3b9bb42cdd4f0edaf34ea10d77babcd +size 324662984 diff --git a/checkpoint-42100/training_args.bin 
b/checkpoint-42100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42200/config.json b/checkpoint-42200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-42200/model.safetensors b/checkpoint-42200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d8cab103d8c86772fd02bbce7750583da3435137 --- /dev/null +++ b/checkpoint-42200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b9cdaac4f5abdca6260a3ff7dc48aca761729f30bc3b55191dd6a1db46b7781 +size 324662984 diff --git a/checkpoint-42200/training_args.bin b/checkpoint-42200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42300/config.json b/checkpoint-42300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-42300/model.safetensors b/checkpoint-42300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..73304f3b79ec90648894aed015c439760ebff84b --- /dev/null +++ b/checkpoint-42300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bdc6d85daab250444dd553b335b292e5f2cf3d0adb28c4af84bfbdb6fe891a98 +size 324662984 diff --git a/checkpoint-42300/training_args.bin b/checkpoint-42300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42400/config.json b/checkpoint-42400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-42400/model.safetensors b/checkpoint-42400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..66ad71e2011147bf4284dd8cddf2d7bd18c68120 --- /dev/null +++ b/checkpoint-42400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae0858927f3e7670969c06c48391e2c619a8a21f064d5e3f9e269071b5514046 +size 324662984 diff --git a/checkpoint-42400/training_args.bin b/checkpoint-42400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42500/config.json b/checkpoint-42500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-42500/model.safetensors b/checkpoint-42500/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..7502e4274c7f1a02316dd0d9e4cfc7f64ad00cd8 --- /dev/null +++ b/checkpoint-42500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e35b64ba3a63ca71e9c8d65c01659ea64e7b8d226e343878597fd17d5de47779 +size 324662984 diff --git a/checkpoint-42500/training_args.bin b/checkpoint-42500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42600/config.json b/checkpoint-42600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-42600/model.safetensors b/checkpoint-42600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8ddc6392779969f6be914b38a38280ef2a3501e0 --- /dev/null +++ b/checkpoint-42600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07dcca7559ed98f11bb890dc2b9d1df3ca2e39b6ef9710f6f91325e1d4f87101 +size 324662984 diff --git a/checkpoint-42600/training_args.bin b/checkpoint-42600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42700/config.json b/checkpoint-42700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-42700/model.safetensors b/checkpoint-42700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..91aa13454d2b0bbe252f8809958ba0d3c807fba0 --- /dev/null +++ b/checkpoint-42700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69e851c26163f294f701a8fcab019850e67b4d82b95f90baa8657fbf8591bf85 +size 324662984 diff --git a/checkpoint-42700/training_args.bin b/checkpoint-42700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42800/config.json b/checkpoint-42800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-42800/model.safetensors b/checkpoint-42800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9feaa308eeba8450e573ba1fcc37623488324f87 --- /dev/null +++ b/checkpoint-42800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16997ddfceaf3ac9db7d0c0fdc1fcc9aa0e7b08b5cce1e15f7e85b37d24a0bd0 +size 324662984 diff --git a/checkpoint-42800/training_args.bin b/checkpoint-42800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-42900/config.json b/checkpoint-42900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-42900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-42900/model.safetensors b/checkpoint-42900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..610a12953f90a01b082afbe8a0af8073b9614805 --- /dev/null +++ b/checkpoint-42900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be53c8f9267dccdfdcfe1bdbbee7446980652bb120f59a2f514e6aa0a9be671f +size 324662984 diff --git a/checkpoint-42900/training_args.bin b/checkpoint-42900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-42900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4300/config.json b/checkpoint-4300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-4300/model.safetensors b/checkpoint-4300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3bbf5b44d07a9e47caf0abb70db64b13dd57d153 --- /dev/null +++ b/checkpoint-4300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea25c6eaee8cefe3536b626a729d32c3da3818557c66d0dc4b79cf05e2513e6 +size 324662984 diff --git a/checkpoint-4300/training_args.bin b/checkpoint-4300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-4300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43000/config.json b/checkpoint-43000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 
12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43000/model.safetensors b/checkpoint-43000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4f6b5d3d8551b7d012dd4854e7c1bec9da7e1b9f --- /dev/null +++ b/checkpoint-43000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:282a271e2524c27ab60b45f18eddf0d176e687ebc3501edd60020ab4fe3b29fa +size 324662984 diff --git a/checkpoint-43000/training_args.bin b/checkpoint-43000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43100/config.json b/checkpoint-43100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43100/model.safetensors b/checkpoint-43100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..28e842755d8fb8e44a938438c68aa805640dc58e --- /dev/null +++ b/checkpoint-43100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf56c75612fa4bb0c256f3f815e65b0ecc65b2c2295ba66ef38d9e5208c1634 +size 324662984 diff --git a/checkpoint-43100/training_args.bin b/checkpoint-43100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43200/config.json b/checkpoint-43200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 
1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43200/model.safetensors b/checkpoint-43200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2f1b912f2a449c812dd2e4b0974517788c7c6cf5 --- /dev/null +++ b/checkpoint-43200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d1afa0c6437fec63e16d1849b8ebe7aac671c29c8fd51da8f13a481f08b6c85 +size 324662984 diff --git a/checkpoint-43200/training_args.bin b/checkpoint-43200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43300/config.json b/checkpoint-43300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43300/model.safetensors b/checkpoint-43300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4ebe67203329d432c265786a893b73a1b76966aa --- /dev/null +++ b/checkpoint-43300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d88e85fc929b9c018581d32b5345c4f00deb38b31907a759836197bae58bbc96 +size 324662984 diff --git a/checkpoint-43300/training_args.bin b/checkpoint-43300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43400/config.json b/checkpoint-43400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + 
"hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43400/model.safetensors b/checkpoint-43400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f4e7e55adbe425a3e11d12918b13d75bfae1a66d --- /dev/null +++ b/checkpoint-43400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c946c017cb7ea4686b12bbc939072f0c043a11a78c9cc61bdb1259164e10a237 +size 324662984 diff --git a/checkpoint-43400/training_args.bin b/checkpoint-43400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43500/config.json b/checkpoint-43500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43500/model.safetensors b/checkpoint-43500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..27a5cfd1fc5af557a197e18c415ccaa6ff4ba1da --- /dev/null +++ b/checkpoint-43500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94f89b170482062bef82c3ac8652de01b6c6a5518fa0dd93d40f0c88f1bba7f2 +size 324662984 diff --git a/checkpoint-43500/training_args.bin b/checkpoint-43500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43600/config.json b/checkpoint-43600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, 
+ "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43600/model.safetensors b/checkpoint-43600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c4561af8d46f3442bf1b92ca2610e63b66935692 --- /dev/null +++ b/checkpoint-43600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c41ccd23f2504cf971e9b7517c0f3002dbd0fff8a401aec0ff2f3cd650970c5f +size 324662984 diff --git a/checkpoint-43600/training_args.bin b/checkpoint-43600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43700/config.json b/checkpoint-43700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43700/model.safetensors b/checkpoint-43700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..398e92647949643c17809ac90bd56b1e164ba20e --- /dev/null +++ b/checkpoint-43700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b2a37ebebbf2a385cc3ea8d81e326dae5b7225079e6001369e1e5c3e25d2ce0 +size 324662984 diff --git a/checkpoint-43700/training_args.bin b/checkpoint-43700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43800/config.json b/checkpoint-43800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43800/model.safetensors b/checkpoint-43800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..53df77676a287d2ed2672c64a37141b1a1bd34e8 --- /dev/null +++ b/checkpoint-43800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35aa39e26a38948203d860cc410d6eb28c44b667b421c3d312bed42dd28538bc +size 324662984 diff --git a/checkpoint-43800/training_args.bin b/checkpoint-43800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-43900/config.json b/checkpoint-43900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-43900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-43900/model.safetensors b/checkpoint-43900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..758ed7caae069b51f4288a779090afda0035a6d5 --- /dev/null +++ b/checkpoint-43900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b15f1e9bd78a1be4a78e24d319b8a099ab6061c68c627069e29d430411ae445 +size 324662984 diff --git a/checkpoint-43900/training_args.bin b/checkpoint-43900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-43900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4400/config.json b/checkpoint-4400/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-4400/model.safetensors b/checkpoint-4400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3acaeff7082539c302ae17b408f285dda2c35cbf --- /dev/null +++ b/checkpoint-4400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7c15356d4e01e9c065f5bd669aa257446d24f5953197cd91c27581361a8d0bb +size 324662984 diff --git a/checkpoint-4400/training_args.bin b/checkpoint-4400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-4400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44000/config.json b/checkpoint-44000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44000/model.safetensors b/checkpoint-44000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e7ebb0d0a35477cc0bc9d3cb2040fd62395afa32 --- /dev/null +++ b/checkpoint-44000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1acf74862bff91fdf9438c466b695e88f78a44b05cb082b07aefada82449fae4 +size 324662984 diff --git a/checkpoint-44000/training_args.bin b/checkpoint-44000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44100/config.json b/checkpoint-44100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44100/model.safetensors b/checkpoint-44100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d5dd9412735f6a36ca0840d9e560787941aacef --- /dev/null +++ b/checkpoint-44100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b90a0063efcfa31cb5d9c70dff362194f91462d38d36e1257253526c3e45b64 +size 324662984 diff --git a/checkpoint-44100/training_args.bin b/checkpoint-44100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44200/config.json b/checkpoint-44200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44200/model.safetensors b/checkpoint-44200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9e9f77e3b13ec831f8470f26fe2c6e48f1df0512 --- /dev/null +++ b/checkpoint-44200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78b1b287a540fdc7c132f8229bd058b38909a625bb5def9cb01b7e90f9037f4e +size 324662984 diff --git a/checkpoint-44200/training_args.bin b/checkpoint-44200/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44300/config.json b/checkpoint-44300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44300/model.safetensors b/checkpoint-44300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bef8ae5732951c5ff1b479363bc3d5bf14986594 --- /dev/null +++ b/checkpoint-44300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a9d702b001779f65a3b3bb2c43d21966379c9377515eca2a81df2bd46030295 +size 324662984 diff --git a/checkpoint-44300/training_args.bin b/checkpoint-44300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44400/config.json b/checkpoint-44400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44400/model.safetensors b/checkpoint-44400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cd3d4f1f08ba723dcf76f23f118fd9d6405deb7e --- /dev/null +++ b/checkpoint-44400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cae75238757581e7c87784cae6709ae44550c8cc9edd5c635b11633503d569a2 +size 
324662984 diff --git a/checkpoint-44400/training_args.bin b/checkpoint-44400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44500/config.json b/checkpoint-44500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44500/model.safetensors b/checkpoint-44500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..73d7da766c350c4d0b3103be1d3770aa3adb0926 --- /dev/null +++ b/checkpoint-44500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fcaa73eb13436686d1818eb530323398d0ae5e2038c8629f995b0f1eb11251c +size 324662984 diff --git a/checkpoint-44500/training_args.bin b/checkpoint-44500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44600/config.json b/checkpoint-44600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44600/model.safetensors b/checkpoint-44600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..49c17504b75ddd51dc959feda239a852b20ff40e --- /dev/null +++ b/checkpoint-44600/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:e235ba2a2ec6714cbdf8bc56e607445dcbbd933ac479517a7b83d6c997e5c3c4 +size 324662984 diff --git a/checkpoint-44600/training_args.bin b/checkpoint-44600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44700/config.json b/checkpoint-44700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44700/model.safetensors b/checkpoint-44700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d479b019fee65abb8d1c2c0de9b98c1e8697ccb --- /dev/null +++ b/checkpoint-44700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e62b7983d690369da0ee92dc6f854e9c075f0b792abc36404fa03c3516396255 +size 324662984 diff --git a/checkpoint-44700/training_args.bin b/checkpoint-44700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44800/config.json b/checkpoint-44800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44800/model.safetensors b/checkpoint-44800/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..2610725ad51452ab95c83ac1cf8cd45b61baaaf1 --- /dev/null +++ b/checkpoint-44800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:849dc05212f14a05b5527aa18827a919f0ff0f6fdab38f2f76c88aa309be6c7b +size 324662984 diff --git a/checkpoint-44800/training_args.bin b/checkpoint-44800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-44900/config.json b/checkpoint-44900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-44900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-44900/model.safetensors b/checkpoint-44900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d0886c33bb7cab2a7aea1463b9e83ac586e74b00 --- /dev/null +++ b/checkpoint-44900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52016892fc0db924bcaa8c4420dfc9d771eac55b4ff326c693e4ba1d7c86dc90 +size 324662984 diff --git a/checkpoint-44900/training_args.bin b/checkpoint-44900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-44900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4500/config.json b/checkpoint-4500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-4500/model.safetensors b/checkpoint-4500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80413087de9e688e17cef9331500b3eb5b20d3e9 --- /dev/null +++ b/checkpoint-4500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b9acb7c271b0562aadc1e8b5a3a97ef5cd7eb708e5dac66e736eb1b177ccfdd +size 324662984 diff --git a/checkpoint-4500/training_args.bin b/checkpoint-4500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-4500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45000/config.json b/checkpoint-45000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45000/model.safetensors b/checkpoint-45000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..45802f05d35a393b3a5d5fa0e967a63f0a462bac --- /dev/null +++ b/checkpoint-45000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3110b10bf8672beaa96f611fc9de7d12b40415d0a650b0096d7510dbd220549a +size 324662984 diff --git a/checkpoint-45000/training_args.bin b/checkpoint-45000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45100/config.json b/checkpoint-45100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45100/model.safetensors b/checkpoint-45100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0da3a06cb7747f4e60698b1735bf1abe0890d5b6 --- /dev/null +++ b/checkpoint-45100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:688a224ed68a6846746abaf7bec76b90bff6bc5541379a14821cbc347f0d906e +size 324662984 diff --git a/checkpoint-45100/training_args.bin b/checkpoint-45100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45200/config.json b/checkpoint-45200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45200/model.safetensors b/checkpoint-45200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c0ff45f654930b19108d5f9a42ffe22be59d90cc --- /dev/null +++ b/checkpoint-45200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de42838eb04b65e627da249192d9709a2d9f5db2d1cccf60e8c799199ecf668e +size 324662984 diff --git a/checkpoint-45200/training_args.bin b/checkpoint-45200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45300/config.json b/checkpoint-45300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45300/model.safetensors b/checkpoint-45300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d3cda871c6d652119bd2aaa394063f53b780bb46 --- /dev/null +++ b/checkpoint-45300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a6916fb07afd0c8914c7baa1530842d58c9cfd0ee5d2c9040bd34e4fcdc16c3 +size 324662984 diff --git a/checkpoint-45300/training_args.bin b/checkpoint-45300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45400/config.json b/checkpoint-45400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45400/model.safetensors b/checkpoint-45400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..545f13c9146b12374f432e02f3407b5626ed83b8 --- /dev/null +++ b/checkpoint-45400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00385e13e6027c86a1dd2871bcb25ea9d6f04c0731bf0fd34e106d705ce0746f +size 324662984 diff --git a/checkpoint-45400/training_args.bin b/checkpoint-45400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45500/config.json b/checkpoint-45500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45500/model.safetensors b/checkpoint-45500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..364eeb279550742bdfe1883d5394303d25c56b3a --- /dev/null +++ b/checkpoint-45500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df749e00491b624f2f10d7ae2970a34fb321cad97211726159dd757c687acb6c +size 324662984 diff --git a/checkpoint-45500/training_args.bin b/checkpoint-45500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45600/config.json b/checkpoint-45600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45600/model.safetensors b/checkpoint-45600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d7d9e425fc57eda1dac84a45c062106b07fd621 --- /dev/null +++ b/checkpoint-45600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e6b3e2652e45d35f312b5c928c46878dd9cd3a5f3e0034f72bb3bb7314a079 +size 324662984 diff --git a/checkpoint-45600/training_args.bin b/checkpoint-45600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45700/config.json b/checkpoint-45700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45700/model.safetensors b/checkpoint-45700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d64d55a7f8f4c302546cca242e39701697ed8fba --- /dev/null +++ b/checkpoint-45700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:660723f845c4aaeb0c40fbea9c79ed6934caf2561239cf4079c559ba14bd81d8 +size 324662984 diff --git a/checkpoint-45700/training_args.bin b/checkpoint-45700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45800/config.json b/checkpoint-45800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45800/model.safetensors b/checkpoint-45800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..daa8d9d40fd2a8be34971590e46f56ddc2614d5d --- /dev/null +++ b/checkpoint-45800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e2c089fd70187cfb5c01ae9de6614b3df5535cf655421ec77bd83b78004918e +size 324662984 diff --git a/checkpoint-45800/training_args.bin b/checkpoint-45800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-45900/config.json b/checkpoint-45900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-45900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + 
"bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-45900/model.safetensors b/checkpoint-45900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2e61b535ea6b0cbdc5e73a6dc355f2f20935a3d9 --- /dev/null +++ b/checkpoint-45900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd4d0fe1b92c762ee4ce4186d127bd66bf252caf28d6897a5884550c7e8dd146 +size 324662984 diff --git a/checkpoint-45900/training_args.bin b/checkpoint-45900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-45900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4600/config.json b/checkpoint-4600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-4600/model.safetensors b/checkpoint-4600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4a43567d53d0a2cc0a04f874d799f0aebf415eb0 --- /dev/null +++ b/checkpoint-4600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6556932fb43e9e26fba6128534f35285263167fee415556e2b69565e19b3f50 +size 324662984 diff --git a/checkpoint-4600/training_args.bin b/checkpoint-4600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-4600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46000/config.json b/checkpoint-46000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46000/model.safetensors b/checkpoint-46000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8072c7187ccc4016bddb4398927557aa5a3b3632 --- /dev/null +++ b/checkpoint-46000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c7514f271577c7c1b48653f2e4ca991e0a3a954b51cecb370c82d1110039018 +size 324662984 diff --git a/checkpoint-46000/training_args.bin b/checkpoint-46000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46100/config.json b/checkpoint-46100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46100/model.safetensors b/checkpoint-46100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..314c6bc03f565ce48d4128eb1bd0d31f082c8ffa --- /dev/null +++ b/checkpoint-46100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ee69e9ba68eebe565ad61798e07086ecffc23bd644350c886efdf8fe6b9d308 +size 324662984 diff --git a/checkpoint-46100/training_args.bin b/checkpoint-46100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46200/config.json b/checkpoint-46200/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46200/model.safetensors b/checkpoint-46200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2d9cb73a25a6a891f9cfef4acda482bf8b8640f9 --- /dev/null +++ b/checkpoint-46200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1df48787fddfd005275e516670847e419cb2ad20c74f5f2d37c8bad1f78f2d +size 324662984 diff --git a/checkpoint-46200/training_args.bin b/checkpoint-46200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46300/config.json b/checkpoint-46300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46300/model.safetensors b/checkpoint-46300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2abbcca967f55b6371b88864c2820ddd3b32a331 --- /dev/null +++ b/checkpoint-46300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c946fed72e2aefe040d3440782ae2b2bbc753390eba4e7db4ba744471b4acb7d +size 324662984 diff --git a/checkpoint-46300/training_args.bin b/checkpoint-46300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46400/config.json b/checkpoint-46400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46400/model.safetensors b/checkpoint-46400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bb69bd75c61b065fc183803a22c9c803d403cd0e --- /dev/null +++ b/checkpoint-46400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e451d40a1da85f1a9cde79072819ae907634c3dfea5f105656367a6ed760e32b +size 324662984 diff --git a/checkpoint-46400/training_args.bin b/checkpoint-46400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46500/config.json b/checkpoint-46500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46500/model.safetensors b/checkpoint-46500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d41d646b98e350614beb22b29aca9dcc95937567 --- /dev/null +++ b/checkpoint-46500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:739468959316bc452accca7b8817779bf4fa925740c4bff8bbfe978ac7bfc85a +size 324662984 diff --git a/checkpoint-46500/training_args.bin b/checkpoint-46500/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46600/config.json b/checkpoint-46600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46600/model.safetensors b/checkpoint-46600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a31de278a5dc7b7d69996a0c06a455784d5eda85 --- /dev/null +++ b/checkpoint-46600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa65f3d56e890b745154fb3009ee720f5b99bccfa88b740318e8b5798ec0ea1 +size 324662984 diff --git a/checkpoint-46600/training_args.bin b/checkpoint-46600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46700/config.json b/checkpoint-46700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46700/model.safetensors b/checkpoint-46700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80a21151fd0f66c56662bcbcb5848f21c45bc1e7 --- /dev/null +++ b/checkpoint-46700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:064dbb239b267f180fea9c49c7d797f71e3c2bc6105ea4e930c17deb7128baff +size 
324662984 diff --git a/checkpoint-46700/training_args.bin b/checkpoint-46700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46800/config.json b/checkpoint-46800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46800/model.safetensors b/checkpoint-46800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..97af9bbf2ab55878acb5d7c8ee11fe738fb24f7f --- /dev/null +++ b/checkpoint-46800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59d5dd29a52b3b0a54b26e3891f91c30f96afb251b40ea171fd87fe81624f155 +size 324662984 diff --git a/checkpoint-46800/training_args.bin b/checkpoint-46800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-46900/config.json b/checkpoint-46900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-46900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-46900/model.safetensors b/checkpoint-46900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d76a643e97eb5acba63171b8abb7c9503d303ff --- /dev/null +++ b/checkpoint-46900/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:1c82e70fdd44392adb6d069fc6700ab1eb45fc6b4d46cd2aa83bd6375c249e4b +size 324662984 diff --git a/checkpoint-46900/training_args.bin b/checkpoint-46900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-46900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4700/config.json b/checkpoint-4700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-4700/model.safetensors b/checkpoint-4700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..90a546fe2e3245a000cb0a06af7246fc35fc2520 --- /dev/null +++ b/checkpoint-4700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64cda82628ee13b900f1bdb5435599e5d303ca5220de33e16511cb1647ec3fad +size 324662984 diff --git a/checkpoint-4700/training_args.bin b/checkpoint-4700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-4700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47000/config.json b/checkpoint-47000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-47000/model.safetensors b/checkpoint-47000/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..27acd615cd384e82d0218264192dddff61bad95f --- /dev/null +++ b/checkpoint-47000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4adc41d1f99ce89d4c025551e1d0de55c0a7060a480361e2ff82ad1301c4ab29 +size 324662984 diff --git a/checkpoint-47000/training_args.bin b/checkpoint-47000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47100/config.json b/checkpoint-47100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-47100/model.safetensors b/checkpoint-47100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0a0bb2e67a1d83a44239338b393040b0c5295f13 --- /dev/null +++ b/checkpoint-47100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c25bd28bb3e7dad60306121200122fb4254b5f54385dcf9573493b4d8731270d +size 324662984 diff --git a/checkpoint-47100/training_args.bin b/checkpoint-47100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47200/config.json b/checkpoint-47200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-47200/model.safetensors b/checkpoint-47200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ff03b094c40bacf9696d539ca7bb5ada23f929b --- /dev/null +++ b/checkpoint-47200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7b161f3962dfd22e9748f8e0a083982935dae46ab1adf86c2574d08d85e0ae1 +size 324662984 diff --git a/checkpoint-47200/training_args.bin b/checkpoint-47200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47300/config.json b/checkpoint-47300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-47300/model.safetensors b/checkpoint-47300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e26a266288b64d801b3b0bf5aa5dee849d2cdaa1 --- /dev/null +++ b/checkpoint-47300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3427defd28c52550b27bfa8de02ff061eeb0414afc1bb5317c75029c5604ca24 +size 324662984 diff --git a/checkpoint-47300/training_args.bin b/checkpoint-47300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47400/config.json b/checkpoint-47400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-47400/model.safetensors b/checkpoint-47400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..efd9008520d486f5f5556d70ec0683242d85abde --- /dev/null +++ b/checkpoint-47400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3089a2fe4d93a8e4729d4ad28761c941a88a856c28dcf6fb42f46072e71cad53 +size 324662984 diff --git a/checkpoint-47400/training_args.bin b/checkpoint-47400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47500/config.json b/checkpoint-47500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-47500/model.safetensors b/checkpoint-47500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d40ed61df928064e39617e72bf7be05f0183dc0d --- /dev/null +++ b/checkpoint-47500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe02eca47ec9e488a7ce94a9d3014c0e9329b6578f29a659d29a217e2ea41613 +size 324662984 diff --git a/checkpoint-47500/training_args.bin b/checkpoint-47500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47600/config.json b/checkpoint-47600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-47600/model.safetensors b/checkpoint-47600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..65e1527df6bd002c3a321cfb65fec623c2a5f2c0 --- /dev/null +++ b/checkpoint-47600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c00573689e235b2a7b1c39c30950333438f2a26496b0201b61b5f8efc98547d7 +size 324662984 diff --git a/checkpoint-47600/training_args.bin b/checkpoint-47600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47700/config.json b/checkpoint-47700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-47700/model.safetensors b/checkpoint-47700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..76fb978e60f92ade6bc8a7430428b1bf530ed52e --- /dev/null +++ b/checkpoint-47700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2342da1fede19f58a45c4145af19934b861dc2da62e58bc6296404a0ea43f512 +size 324662984 diff --git a/checkpoint-47700/training_args.bin b/checkpoint-47700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47800/config.json b/checkpoint-47800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + 
"layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-47800/model.safetensors b/checkpoint-47800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..22b40a548898fd692ce08365aab90633ae0c1dd2 --- /dev/null +++ b/checkpoint-47800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:722a5f89325f0d06e2bb55b5406d85134d03939cb1632ef57128f6c02741d191 +size 324662984 diff --git a/checkpoint-47800/training_args.bin b/checkpoint-47800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-47900/config.json b/checkpoint-47900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-47900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-47900/model.safetensors b/checkpoint-47900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1904f4be7f9e7d47b2af3d08cf9844db50513d52 --- /dev/null +++ b/checkpoint-47900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99ce8855e4c0cd185813647ebb3e988a7d57c520e06e249e3eebd1ef5eef7ca1 +size 324662984 diff --git a/checkpoint-47900/training_args.bin b/checkpoint-47900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-47900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4800/config.json b/checkpoint-4800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + 
"hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-4800/model.safetensors b/checkpoint-4800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9deea88b29f1ea66c5414546bf320fe287d6ecbf --- /dev/null +++ b/checkpoint-4800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72a53702a0eaf1838f8cf88f70251cafd4525754344a43b6af9f28056027bfb8 +size 324662984 diff --git a/checkpoint-4800/training_args.bin b/checkpoint-4800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-4800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48000/config.json b/checkpoint-48000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48000/model.safetensors b/checkpoint-48000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8cdf4af9b7b1bcceb667bcd5cf90433cb662758c --- /dev/null +++ b/checkpoint-48000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04f5265229e9a3625f71d219d47b53c8e8f140d345bcb2ffeacb742b35c21430 +size 324662984 diff --git a/checkpoint-48000/training_args.bin b/checkpoint-48000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48100/config.json b/checkpoint-48100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + 
"attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48100/model.safetensors b/checkpoint-48100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a489bcdaf59adfe896ebd734ca142271cef867ba --- /dev/null +++ b/checkpoint-48100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7b6ef4feda8a4a9e2cca23a3b94f18981f1bd864bea8a05e43926cc6170ab59 +size 324662984 diff --git a/checkpoint-48100/training_args.bin b/checkpoint-48100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48200/config.json b/checkpoint-48200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48200/model.safetensors b/checkpoint-48200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2fe6fbcefb5cfc895edf699205dcc3f61115a659 --- /dev/null +++ b/checkpoint-48200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80e27e26191eae8204189e5ae3f32e8fe12107226acbb845133dd96d0e9aaf8b +size 324662984 diff --git a/checkpoint-48200/training_args.bin b/checkpoint-48200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48300/config.json b/checkpoint-48300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48300/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48300/model.safetensors b/checkpoint-48300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..169793ad900723dc87737d3e3bfa5eeadb5abc1b --- /dev/null +++ b/checkpoint-48300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d6c3a6a32162a5cd1060cf788854d2e5cc1dae7c8bc3c33b8a1ca603db86232 +size 324662984 diff --git a/checkpoint-48300/training_args.bin b/checkpoint-48300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48400/config.json b/checkpoint-48400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48400/model.safetensors b/checkpoint-48400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7add8115b263e3ec661358c06ffabb883490ab8e --- /dev/null +++ b/checkpoint-48400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a200f3937f04e9c041cf19d83ca3d294fcbb955e6993924930b55c21d348fba +size 324662984 diff --git a/checkpoint-48400/training_args.bin b/checkpoint-48400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48500/config.json b/checkpoint-48500/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48500/model.safetensors b/checkpoint-48500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c66e59437da6de99e33ef26befaa0229721a3d77 --- /dev/null +++ b/checkpoint-48500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4895f707302897ff241a6f97f85f7948a0286f2c51d04b4fcd949d1346e76a8f +size 324662984 diff --git a/checkpoint-48500/training_args.bin b/checkpoint-48500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48600/config.json b/checkpoint-48600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48600/model.safetensors b/checkpoint-48600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9acff89e7366f2d49afbcce36f76b3def9bc4eb2 --- /dev/null +++ b/checkpoint-48600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a52399d4831ea44e90d62dd9f3df27d4bcad81d40b2de69b5ff20329098b22e +size 324662984 diff --git a/checkpoint-48600/training_args.bin b/checkpoint-48600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48700/config.json b/checkpoint-48700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48700/model.safetensors b/checkpoint-48700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d3da98d8380793aa4810503a60cc7b52360c9b10 --- /dev/null +++ b/checkpoint-48700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0509e27a5d699a5e993620b3ddc8adb5f84d375454a7055b5686b81c8f56aaf5 +size 324662984 diff --git a/checkpoint-48700/training_args.bin b/checkpoint-48700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48800/config.json b/checkpoint-48800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48800/model.safetensors b/checkpoint-48800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f95452a1f6a7da5f9928ed869d7d5c38e5c57fdd --- /dev/null +++ b/checkpoint-48800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d83e05c2adcf8e580afb332d1b1ca88c79ef18eac68ccf06df7fe19cd224dea +size 324662984 diff --git a/checkpoint-48800/training_args.bin b/checkpoint-48800/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-48900/config.json b/checkpoint-48900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-48900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-48900/model.safetensors b/checkpoint-48900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ef97e58f33b71feb8b1f0c5f6b0f303c48a88081 --- /dev/null +++ b/checkpoint-48900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:287daf89b83a675cdc09d6d2f841dc23aded2137faafb0f28ee4b1f8d0d5149f +size 324662984 diff --git a/checkpoint-48900/training_args.bin b/checkpoint-48900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-48900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-4900/config.json b/checkpoint-4900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-4900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-4900/model.safetensors b/checkpoint-4900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b1841b3669c77961ee9be66a30a0f63e227e2213 --- /dev/null +++ b/checkpoint-4900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b42fbca8f8a7a2af4a657fdb594538d98fe1bd1a5aa3abd10098b6ed4f00d2b +size 
324662984 diff --git a/checkpoint-4900/training_args.bin b/checkpoint-4900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-4900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49000/config.json b/checkpoint-49000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-49000/model.safetensors b/checkpoint-49000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0d1fd81b8d5c6e2b9c52c86c0e05a98ba60e65fb --- /dev/null +++ b/checkpoint-49000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e1348d7fe5ad8813e36e6c9cd79fa40b9b03eaff2fc370cae1a45bedd69dbc0 +size 324662984 diff --git a/checkpoint-49000/training_args.bin b/checkpoint-49000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49100/config.json b/checkpoint-49100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-49100/model.safetensors b/checkpoint-49100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b0abf292bc51f322525347f7b368e317eff68a17 --- /dev/null +++ b/checkpoint-49100/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0cdda6f804cb015f2326e00f34092461749636a95d3f1ccdea567d31fd55f051 +size 324662984 diff --git a/checkpoint-49100/training_args.bin b/checkpoint-49100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49200/config.json b/checkpoint-49200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-49200/model.safetensors b/checkpoint-49200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..34879d8756debce3f58bf1056ac3ef7c787b96fb --- /dev/null +++ b/checkpoint-49200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:638b772209c4d6b4244ee587c88653f9c012809b229d1af3aaed8d9ab8614db3 +size 324662984 diff --git a/checkpoint-49200/training_args.bin b/checkpoint-49200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49300/config.json b/checkpoint-49300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-49300/model.safetensors b/checkpoint-49300/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..936e3ef37cba616a32e7f0184c40eb311c20ab8c --- /dev/null +++ b/checkpoint-49300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720303fa01f261b5da3420e7072f89bfc6fda5289612298b8310d0f59bdffc3c +size 324662984 diff --git a/checkpoint-49300/training_args.bin b/checkpoint-49300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49400/config.json b/checkpoint-49400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-49400/model.safetensors b/checkpoint-49400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c7863a3dad0dd78ecb466ad4584bbe0a987b31 --- /dev/null +++ b/checkpoint-49400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e439d57274cc813537d3c10ac73478071cc8273fafeaa20783838368178d6c +size 324662984 diff --git a/checkpoint-49400/training_args.bin b/checkpoint-49400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49500/config.json b/checkpoint-49500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-49500/model.safetensors b/checkpoint-49500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f92f4cda7848f360c6fc782eedbd7aba23a46576 --- /dev/null +++ b/checkpoint-49500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0ead1b0dd8a46c62ab1842bcece20bce49d2b27c1f6de0d372aed06ba3d5cd +size 324662984 diff --git a/checkpoint-49500/training_args.bin b/checkpoint-49500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49600/config.json b/checkpoint-49600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-49600/model.safetensors b/checkpoint-49600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0642716fabf0c52f07ad51b760cbf0da238b381f --- /dev/null +++ b/checkpoint-49600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0329d138ec3df404560f6fbd08082ca595b391dbce4b26708703133a891f9aa +size 324662984 diff --git a/checkpoint-49600/training_args.bin b/checkpoint-49600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49700/config.json b/checkpoint-49700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-49700/model.safetensors b/checkpoint-49700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e0f15c157f745f834ee48883f841657f818bc41c --- /dev/null +++ b/checkpoint-49700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bf882217f6f82ce8e50d2c61282bafdd6a0d068fad4d361d10d3a2063fb6637 +size 324662984 diff --git a/checkpoint-49700/training_args.bin b/checkpoint-49700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49800/config.json b/checkpoint-49800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-49800/model.safetensors b/checkpoint-49800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6f78e0d2bb3e4a477c84a4f15384c7fa9bcc5cc1 --- /dev/null +++ b/checkpoint-49800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1255743a31d88d70556c022478ce0f2ab837e6762d2604bd907203394d972955 +size 324662984 diff --git a/checkpoint-49800/training_args.bin b/checkpoint-49800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-49900/config.json b/checkpoint-49900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-49900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-49900/model.safetensors b/checkpoint-49900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..801757467b057cd2e62624465a838e616994f208 --- /dev/null +++ b/checkpoint-49900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc6b84adb7fb2e492cbd162bc87ef39b4885e675c9b1d7fa9bad2b5a640115e5 +size 324662984 diff --git a/checkpoint-49900/training_args.bin b/checkpoint-49900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-49900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-500/config.json b/checkpoint-500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-500/model.safetensors b/checkpoint-500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a1eb731b4eff255f60a7a07f9e182265e6fdf38a --- /dev/null +++ b/checkpoint-500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d46c2ed38a318af855fe6a7b7e3bd8d23f9c90968d76ea49c9c04a19a340fe6 +size 324662984 diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5000/config.json b/checkpoint-5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 
1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-5000/model.safetensors b/checkpoint-5000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..69ef03396d5619504a7b94d3fb0a636fb5323296 --- /dev/null +++ b/checkpoint-5000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47f82c7c80e959794f165aabfbb44cf7300145a37dd120f1e13116c538a43e0 +size 324662984 diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50000/config.json b/checkpoint-50000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50000/model.safetensors b/checkpoint-50000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..646871a16f164810648cbd0bd82606bca2822f45 --- /dev/null +++ b/checkpoint-50000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e646ba3dc6d21283828abdfd02b80d71c9daf5649c0dd4afa5380b95efebbaf +size 324662984 diff --git a/checkpoint-50000/training_args.bin b/checkpoint-50000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50100/config.json b/checkpoint-50100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + 
"hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50100/model.safetensors b/checkpoint-50100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1395e6a299ae542c25310f1159271363fcebfd34 --- /dev/null +++ b/checkpoint-50100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2733fdd692a4e1ad0b1a59acdeb3c13e2f24a74407ad1d8bbd2b1e91f2705067 +size 324662984 diff --git a/checkpoint-50100/training_args.bin b/checkpoint-50100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50200/config.json b/checkpoint-50200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50200/model.safetensors b/checkpoint-50200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e12eff8ddd610c8eb2b625d3f81efbc483ad8946 --- /dev/null +++ b/checkpoint-50200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97af7da2de54a981e52fa846fdf8b7ed41075e5ee514b4fa3d1f364a5c8469de +size 324662984 diff --git a/checkpoint-50200/training_args.bin b/checkpoint-50200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50300/config.json b/checkpoint-50300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, 
+ "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50300/model.safetensors b/checkpoint-50300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6f1407c2c3d6d8953edc65ac2f7d7d621ad9460 --- /dev/null +++ b/checkpoint-50300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1621a8193e5e70439793a7562dee230cd25a8220e1722c1e1d57136ade08122 +size 324662984 diff --git a/checkpoint-50300/training_args.bin b/checkpoint-50300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50400/config.json b/checkpoint-50400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50400/model.safetensors b/checkpoint-50400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..45b4a92026462d40f6b46e9b3880b6ba4ea4bd15 --- /dev/null +++ b/checkpoint-50400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca75c5218a290d7bea372ccaed5fddfc0d6cb0d4b18e8b52fe293c41747426c0 +size 324662984 diff --git a/checkpoint-50400/training_args.bin b/checkpoint-50400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50500/config.json b/checkpoint-50500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50500/model.safetensors b/checkpoint-50500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..73f653f9973d05a51baa1aa5751701e8fcdd46ef --- /dev/null +++ b/checkpoint-50500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37ef5eacbff80662136f811e99d8e0cebaa9265298c397da695324e9b0678fc0 +size 324662984 diff --git a/checkpoint-50500/training_args.bin b/checkpoint-50500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50600/config.json b/checkpoint-50600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50600/model.safetensors b/checkpoint-50600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..02e93d97ccb7015a4f179a075fb640fd914c1528 --- /dev/null +++ b/checkpoint-50600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b08550f41837f872d90a23a7bb77cf7a1dfeca9e4973b1ef8e424afba4d2373d +size 324662984 diff --git a/checkpoint-50600/training_args.bin b/checkpoint-50600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50700/config.json b/checkpoint-50700/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50700/model.safetensors b/checkpoint-50700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2cc73c02da86db2fe083ffc2abd3a971d1cd10a1 --- /dev/null +++ b/checkpoint-50700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b23e4a9648aa7e3256d3a195c0eb850312ae3bbd6e4ad71e62de1e08a61fa4 +size 324662984 diff --git a/checkpoint-50700/training_args.bin b/checkpoint-50700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50800/config.json b/checkpoint-50800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50800/model.safetensors b/checkpoint-50800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0986c9572fdc339c978a228ad82599ecc6866b70 --- /dev/null +++ b/checkpoint-50800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dc118d8de15604e20fa034b3718d20a26f2f0d92da08cf2941c70f445cc5fbc +size 324662984 diff --git a/checkpoint-50800/training_args.bin b/checkpoint-50800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-50900/config.json b/checkpoint-50900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-50900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-50900/model.safetensors b/checkpoint-50900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c5004374da3263cf1e391c0ea64f4c7dcf5d89d9 --- /dev/null +++ b/checkpoint-50900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8775a11c0994e1c129e63d0ec22be5e190492ff86b98ce822edb482d6458616d +size 324662984 diff --git a/checkpoint-50900/training_args.bin b/checkpoint-50900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-50900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5100/config.json b/checkpoint-5100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-5100/model.safetensors b/checkpoint-5100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..35bf5608f410502012f59da134def2f49d3e21e9 --- /dev/null +++ b/checkpoint-5100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3be97b7dd310dd3991c300ea0780c072a0294179d7c83206445a5ba06aef7740 +size 324662984 diff --git a/checkpoint-5100/training_args.bin b/checkpoint-5100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- 
/dev/null +++ b/checkpoint-5100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51000/config.json b/checkpoint-51000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-51000/model.safetensors b/checkpoint-51000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..85977e41596bca2894235d70bf60cc18bf600674 --- /dev/null +++ b/checkpoint-51000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955684de0e9d273f401bd2a3f5fd81855e0005f3aec9757eaf7f4f8b594fe3bb +size 324662984 diff --git a/checkpoint-51000/training_args.bin b/checkpoint-51000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51100/config.json b/checkpoint-51100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-51100/model.safetensors b/checkpoint-51100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..87016b4c96147f2c4f1479815a6ee883202293ea --- /dev/null +++ b/checkpoint-51100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e02115a0623919bc99019becbfb6858d3155cf5f2e2d777073d8b09f2a3b4753 +size 324662984 diff --git a/checkpoint-51100/training_args.bin 
b/checkpoint-51100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51200/config.json b/checkpoint-51200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-51200/model.safetensors b/checkpoint-51200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c1004c3a9df097878727a1206d83ca1850fc82ac --- /dev/null +++ b/checkpoint-51200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce9f57763e130f36f14f98237396fad1e1dc254e1013fb60159f1ff64f655669 +size 324662984 diff --git a/checkpoint-51200/training_args.bin b/checkpoint-51200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51300/config.json b/checkpoint-51300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-51300/model.safetensors b/checkpoint-51300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..79a899865d331b45b6c0ab94b8e4267442c55774 --- /dev/null +++ b/checkpoint-51300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:638b755f64737adfb40bb77fcb48b1763f210d814d32ff1409feceed40e8d301 +size 324662984 diff --git a/checkpoint-51300/training_args.bin b/checkpoint-51300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51400/config.json b/checkpoint-51400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-51400/model.safetensors b/checkpoint-51400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fe9fa6634cac10b0f0bc45f42e502285e583aaac --- /dev/null +++ b/checkpoint-51400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d0c230781b0787867e5c2c63e120cf2734eb6cc5d6037d6e518c8be20aea161 +size 324662984 diff --git a/checkpoint-51400/training_args.bin b/checkpoint-51400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51500/config.json b/checkpoint-51500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-51500/model.safetensors b/checkpoint-51500/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..e5e82d220b5a5fd880bc4f6181f5692d19151577 --- /dev/null +++ b/checkpoint-51500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b701041b1c9a0662fd4a77235cb65e86fdfe55c7a314ff598da52e710b2af04 +size 324662984 diff --git a/checkpoint-51500/training_args.bin b/checkpoint-51500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51600/config.json b/checkpoint-51600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-51600/model.safetensors b/checkpoint-51600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59a518752d3955e241afc401228316c8243a9114 --- /dev/null +++ b/checkpoint-51600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b9b6f1613597f4bc85584418fc00271d5466ef47f08e3cea97622ac93b7490 +size 324662984 diff --git a/checkpoint-51600/training_args.bin b/checkpoint-51600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51700/config.json b/checkpoint-51700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-51700/model.safetensors b/checkpoint-51700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52d69f0f025f7f72e170221bb8f31bb1ea4e4bf9 --- /dev/null +++ b/checkpoint-51700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bfb382b6ad7bb9ac6dbab302f6afb7c9d8c5af434b847805dbcc1fffe3dd647 +size 324662984 diff --git a/checkpoint-51700/training_args.bin b/checkpoint-51700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51800/config.json b/checkpoint-51800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-51800/model.safetensors b/checkpoint-51800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d1cfdf33011da6cd51b956a30ec436e4c1f27ab8 --- /dev/null +++ b/checkpoint-51800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90224a22d3848ada6cc4c0de359c72ef04c343a0cf5295f04ea9a1160da66018 +size 324662984 diff --git a/checkpoint-51800/training_args.bin b/checkpoint-51800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-51900/config.json b/checkpoint-51900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-51900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-51900/model.safetensors b/checkpoint-51900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..984a5d9f2f2bc7d6b1519f5815273fc6323a5ae8 --- /dev/null +++ b/checkpoint-51900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c958ab63d9ee10d9fc358dce0649776cd315d3ae7d4e18078ee7546034588908 +size 324662984 diff --git a/checkpoint-51900/training_args.bin b/checkpoint-51900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-51900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5200/config.json b/checkpoint-5200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-5200/model.safetensors b/checkpoint-5200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6635a8ae28250f0a81ac159e475723a70da7c05 --- /dev/null +++ b/checkpoint-5200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c957c8edab9ed7ddc292ed11c5a63f8930192e1f46723fdda1f7ccdb5073993 +size 324662984 diff --git a/checkpoint-5200/training_args.bin b/checkpoint-5200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-5200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52000/config.json b/checkpoint-52000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 
12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52000/model.safetensors b/checkpoint-52000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..405388ba2924cbd4ff0f06dfbe57b95a42e4c497 --- /dev/null +++ b/checkpoint-52000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f8b357a12bec64bc5a163da635d2c29db8113e660f4e60d851e6fdd3509a7e1 +size 324662984 diff --git a/checkpoint-52000/training_args.bin b/checkpoint-52000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52100/config.json b/checkpoint-52100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52100/model.safetensors b/checkpoint-52100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..df908a25200b16cc6d93d37f138bf35d5c7f1f2c --- /dev/null +++ b/checkpoint-52100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:515332484f3e62ed8fccb40864a5d00e04a16d6ec4b1f7bbee497cbaf015b71a +size 324662984 diff --git a/checkpoint-52100/training_args.bin b/checkpoint-52100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52200/config.json b/checkpoint-52200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 
1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52200/model.safetensors b/checkpoint-52200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c95a79e13896e6c4b94ef0e3cdf43c48b77e3418 --- /dev/null +++ b/checkpoint-52200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a50c46ab5712712bdd5e732008f3670786ced528124f0384b586bedfb10ef19 +size 324662984 diff --git a/checkpoint-52200/training_args.bin b/checkpoint-52200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52300/config.json b/checkpoint-52300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52300/model.safetensors b/checkpoint-52300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..be6d8c82f29ff699cff2dafd1389a6f7e6dfc9e2 --- /dev/null +++ b/checkpoint-52300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a016730663052a61b7a7ece34692bfe68418062f8a80dde48b07ceb6bc54eb40 +size 324662984 diff --git a/checkpoint-52300/training_args.bin b/checkpoint-52300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52400/config.json b/checkpoint-52400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + 
"hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52400/model.safetensors b/checkpoint-52400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf42c6395e4a099d8ffe517ef062d4e9cafd2fdc --- /dev/null +++ b/checkpoint-52400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42bf8dbeceb03039dce43e47fef342e6c59bb10d4422b0bc0cc8d82b45cdb933 +size 324662984 diff --git a/checkpoint-52400/training_args.bin b/checkpoint-52400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52500/config.json b/checkpoint-52500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52500/model.safetensors b/checkpoint-52500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a0315a238b65cbcfe4e6d4ebb1355cbb8ec0dd28 --- /dev/null +++ b/checkpoint-52500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38eab4ae2885bdb9c0a61a8c849c2cb476aa13346e806067beaf8de1a909c5b2 +size 324662984 diff --git a/checkpoint-52500/training_args.bin b/checkpoint-52500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52600/config.json b/checkpoint-52600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, 
+ "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52600/model.safetensors b/checkpoint-52600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2693d6fbcab39b569d111a1d9d6661c174764960 --- /dev/null +++ b/checkpoint-52600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7b1b13b17adf2f1ec82c2a51099892e3f28d480cd097c4611f8d2ac363b360 +size 324662984 diff --git a/checkpoint-52600/training_args.bin b/checkpoint-52600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52700/config.json b/checkpoint-52700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52700/model.safetensors b/checkpoint-52700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..302895bf90d748909a90dd96db038498236111fa --- /dev/null +++ b/checkpoint-52700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f65324e2e993705158a16668894e50ec474a4c0cb8f3b38853f76d75904a372c +size 324662984 diff --git a/checkpoint-52700/training_args.bin b/checkpoint-52700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52800/config.json b/checkpoint-52800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52800/model.safetensors b/checkpoint-52800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c227d6906a8cfdf35a445c726c62ee9efedc0785 --- /dev/null +++ b/checkpoint-52800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1981d3daa33288af5361345daaa99853bd36ae42864fe29794f1139874408136 +size 324662984 diff --git a/checkpoint-52800/training_args.bin b/checkpoint-52800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-52900/config.json b/checkpoint-52900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-52900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-52900/model.safetensors b/checkpoint-52900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2e36f7e474bd82a66fff57151d7da43fbfaa44c4 --- /dev/null +++ b/checkpoint-52900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a9b883615156ed2f4d8189b8b683fe1378095de89032d7d9aa7a5b7872cd77a +size 324662984 diff --git a/checkpoint-52900/training_args.bin b/checkpoint-52900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-52900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5300/config.json b/checkpoint-5300/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-5300/model.safetensors b/checkpoint-5300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..42e5bac4e12083f6e496a035ede7f5f04b46e908 --- /dev/null +++ b/checkpoint-5300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed111f8fc81e841b498a7219841ae59fa37b646e332897b07ce0f3f725631299 +size 324662984 diff --git a/checkpoint-5300/training_args.bin b/checkpoint-5300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-5300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53000/config.json b/checkpoint-53000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53000/model.safetensors b/checkpoint-53000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4b68be7abca79545d223a76070bd34c60ad73c3b --- /dev/null +++ b/checkpoint-53000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:830d3f3f244e2639f09efd12af7af28fced95e471e71a25a97d0deb96b7f5b23 +size 324662984 diff --git a/checkpoint-53000/training_args.bin b/checkpoint-53000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53100/config.json b/checkpoint-53100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53100/model.safetensors b/checkpoint-53100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6f37c89e2d3f4f25406d1056a1be90651b294a03 --- /dev/null +++ b/checkpoint-53100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c274bbe1aa329c07ccdbd3ee26aa5fcd6773090bf049b4780f6c3fa2db36c4cc +size 324662984 diff --git a/checkpoint-53100/training_args.bin b/checkpoint-53100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53200/config.json b/checkpoint-53200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53200/model.safetensors b/checkpoint-53200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ecf6a7b2a737f3d7de1f10e7868475d49dcf37a0 --- /dev/null +++ b/checkpoint-53200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a93a921c947b1a29765ac47b575518530f7505128dcd6da25e6c3162908b905 +size 324662984 diff --git a/checkpoint-53200/training_args.bin b/checkpoint-53200/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53300/config.json b/checkpoint-53300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53300/model.safetensors b/checkpoint-53300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0dbf98b5558c10a7a008c523c67c8c1602ae92af --- /dev/null +++ b/checkpoint-53300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22fd7c479266fa526780353034638a80108c9b8148b41e6c04c58632404147ab +size 324662984 diff --git a/checkpoint-53300/training_args.bin b/checkpoint-53300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53400/config.json b/checkpoint-53400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53400/model.safetensors b/checkpoint-53400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8c94da68cfd2cc44fd238948b7e85ab14dd13bb3 --- /dev/null +++ b/checkpoint-53400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc009919b019a3ae318251b2c3983b3383b9f3c6d3520f1f3f4521b636ba5d56 +size 
324662984 diff --git a/checkpoint-53400/training_args.bin b/checkpoint-53400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53500/config.json b/checkpoint-53500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53500/model.safetensors b/checkpoint-53500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fb99b68bd832241a1b244e0a6a91850c3c78a42e --- /dev/null +++ b/checkpoint-53500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106f68392411ca463d607905f68fcedea717f65d3717896ae1b8a68f4d6c3ea1 +size 324662984 diff --git a/checkpoint-53500/training_args.bin b/checkpoint-53500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53600/config.json b/checkpoint-53600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53600/model.safetensors b/checkpoint-53600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9e0323367a6af5a8faedd1f87df02ae7ad90d191 --- /dev/null +++ b/checkpoint-53600/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:b2bf24504a24a4ded83ec3a64b868b2600fe2a9597d723daa02611fc3c02699a +size 324662984 diff --git a/checkpoint-53600/training_args.bin b/checkpoint-53600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53700/config.json b/checkpoint-53700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53700/model.safetensors b/checkpoint-53700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4686863a353bb0c036edaaa583c794449e1d9c94 --- /dev/null +++ b/checkpoint-53700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9de2a4c7fb3c9171cc094a1395d0f313998786a19edd046859190afd4664f351 +size 324662984 diff --git a/checkpoint-53700/training_args.bin b/checkpoint-53700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53800/config.json b/checkpoint-53800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53800/model.safetensors b/checkpoint-53800/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..723d2e97c441b30c7b0d9ce84ae69ab0b1e13403 --- /dev/null +++ b/checkpoint-53800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30bb63ac1683786070d026509203044c7578c99baa2ea58b793bf87725ab9946 +size 324662984 diff --git a/checkpoint-53800/training_args.bin b/checkpoint-53800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-53900/config.json b/checkpoint-53900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-53900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-53900/model.safetensors b/checkpoint-53900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2125b1590c0535abe012242e974d22d17a140385 --- /dev/null +++ b/checkpoint-53900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b74bf7f3eef49685ed7fca3d915a6df88a593d8ac4722187f707b9ca5166db5 +size 324662984 diff --git a/checkpoint-53900/training_args.bin b/checkpoint-53900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-53900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5400/config.json b/checkpoint-5400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-5400/model.safetensors b/checkpoint-5400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4c8ee36a2c53c6478014bcb9ba90cd64b7541ac7 --- /dev/null +++ b/checkpoint-5400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6be4b9b1868647d9810a003a207c2a0a51a26e1d2d2adf867af5450154f21332 +size 324662984 diff --git a/checkpoint-5400/training_args.bin b/checkpoint-5400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-5400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54000/config.json b/checkpoint-54000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54000/model.safetensors b/checkpoint-54000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..97a3654d17bd2e2d6a919d3bcf42dfe5de3e8501 --- /dev/null +++ b/checkpoint-54000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82f0c48eec5108bbbe421db5fce6c277d90b8086044196a380f8624119065afa +size 324662984 diff --git a/checkpoint-54000/training_args.bin b/checkpoint-54000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54100/config.json b/checkpoint-54100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54100/model.safetensors b/checkpoint-54100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fad2dc73f7a5a68afef31ef406738b1e10ea8ebc --- /dev/null +++ b/checkpoint-54100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:253de3794472aec58c8fa9d0e05caaf51e8330ee78db3ede06af8eb785524df3 +size 324662984 diff --git a/checkpoint-54100/training_args.bin b/checkpoint-54100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54200/config.json b/checkpoint-54200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54200/model.safetensors b/checkpoint-54200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f791b70878e1450b39085fb2bdfe7e6707c26d61 --- /dev/null +++ b/checkpoint-54200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca9b9d440e53558ac1d36939716ad66042881c65a399198dfc8e78e82a45874e +size 324662984 diff --git a/checkpoint-54200/training_args.bin b/checkpoint-54200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54300/config.json b/checkpoint-54300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54300/model.safetensors b/checkpoint-54300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d5ef5913080d7b3529e04a68ed635933f683ef84 --- /dev/null +++ b/checkpoint-54300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f6a86f61c9b992bb2cb2fc9b71cd469bd86f394ad72f2b560a352a26732f423 +size 324662984 diff --git a/checkpoint-54300/training_args.bin b/checkpoint-54300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54400/config.json b/checkpoint-54400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54400/model.safetensors b/checkpoint-54400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..287072d039a5c94d3c44b4911f6986e0fe1deed8 --- /dev/null +++ b/checkpoint-54400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:554feb7e34f780ac2df059f682cb820f8e2fd09079bf71185c6ecb2782a9e94d +size 324662984 diff --git a/checkpoint-54400/training_args.bin b/checkpoint-54400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54500/config.json b/checkpoint-54500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54500/model.safetensors b/checkpoint-54500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e24bd7604b590af550d46714dd22e92b1d551f4b --- /dev/null +++ b/checkpoint-54500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ded828566961a7a626c254eb3872e112b93ace3b504b44b6f6888a600b542eb0 +size 324662984 diff --git a/checkpoint-54500/training_args.bin b/checkpoint-54500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54600/config.json b/checkpoint-54600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54600/model.safetensors b/checkpoint-54600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba37679184a940a0f73080a29bda7960d31516db --- /dev/null +++ b/checkpoint-54600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d7089f221620a1492774e0eda4f4e30ad8233040b3807ed2566f26d55696f7 +size 324662984 diff --git a/checkpoint-54600/training_args.bin b/checkpoint-54600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54700/config.json b/checkpoint-54700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54700/model.safetensors b/checkpoint-54700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..01afdb8842218d069a0dd9339ca510c46768367a --- /dev/null +++ b/checkpoint-54700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ac98ca84178e7f2f854244fb9fd13eadc384d33372a019f6b418fd22ba5e866 +size 324662984 diff --git a/checkpoint-54700/training_args.bin b/checkpoint-54700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54800/config.json b/checkpoint-54800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54800/model.safetensors b/checkpoint-54800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8e41cb79c0803942a8044efaab714dac6174a587 --- /dev/null +++ b/checkpoint-54800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:167d1f5eec4f6f1286059bb2f3e52786ce003fccdf432a18575b2829f9b4bf7b +size 324662984 diff --git a/checkpoint-54800/training_args.bin b/checkpoint-54800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-54900/config.json b/checkpoint-54900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-54900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + 
"bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-54900/model.safetensors b/checkpoint-54900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ec9737259e7651e8d716b01ed87889f314a9187 --- /dev/null +++ b/checkpoint-54900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e59bde1a5750cf1f4876d5bf4cf320de0936ce0fbbf19703a089abdc663f28f6 +size 324662984 diff --git a/checkpoint-54900/training_args.bin b/checkpoint-54900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-54900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5500/config.json b/checkpoint-5500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-5500/model.safetensors b/checkpoint-5500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e2da5e0ffd1d054aed0fb40f41123bdd99247b0d --- /dev/null +++ b/checkpoint-5500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12f23ce69781387319073950623452f1b024a9f011ca154460a64dd116db1dc3 +size 324662984 diff --git a/checkpoint-5500/training_args.bin b/checkpoint-5500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-5500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55000/config.json b/checkpoint-55000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55000/model.safetensors b/checkpoint-55000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..38661f4ae0b79d0978deeddf607169797a59761f --- /dev/null +++ b/checkpoint-55000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:914b0483268b844cd77ad4ce2743859621295bde0e7c64905924eabe638c9987 +size 324662984 diff --git a/checkpoint-55000/training_args.bin b/checkpoint-55000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55100/config.json b/checkpoint-55100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55100/model.safetensors b/checkpoint-55100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..94bff267a59d1a0daee33d81998a15d72154cf4c --- /dev/null +++ b/checkpoint-55100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cea7a843fc372c20fb7541f8a9d96646ca6fe1302d515cdd4c3065c11994ce4 +size 324662984 diff --git a/checkpoint-55100/training_args.bin b/checkpoint-55100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55200/config.json b/checkpoint-55200/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55200/model.safetensors b/checkpoint-55200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8c0cea8b839829aecc0ff0d23209ad1d62a0ad5c --- /dev/null +++ b/checkpoint-55200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d00311541d39a91da35ed494bada36eac2d8aefbca3dc2cc37e1d9c5bf5f937c +size 324662984 diff --git a/checkpoint-55200/training_args.bin b/checkpoint-55200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55300/config.json b/checkpoint-55300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55300/model.safetensors b/checkpoint-55300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9373648d5693cd9a3f44457ad700feb2facc4047 --- /dev/null +++ b/checkpoint-55300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cdad23595d40bad90751915154bb556e89e708df3d43a1c54e2dc863ceb584c +size 324662984 diff --git a/checkpoint-55300/training_args.bin b/checkpoint-55300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55400/config.json b/checkpoint-55400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55400/model.safetensors b/checkpoint-55400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e9184728dc4079797f73ee110ae3e9edf8fbda88 --- /dev/null +++ b/checkpoint-55400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a5aff87ece5fe8124715cf96197670e622008bbfb6035a644d3b9aeb4a9190 +size 324662984 diff --git a/checkpoint-55400/training_args.bin b/checkpoint-55400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55500/config.json b/checkpoint-55500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55500/model.safetensors b/checkpoint-55500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0dc68ce66401836db73c9678adf535cc6405d418 --- /dev/null +++ b/checkpoint-55500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82db3742ee0f7c7e3b6023e6628d322330dc1834b3cd0a44ff2b02b19e903678 +size 324662984 diff --git a/checkpoint-55500/training_args.bin b/checkpoint-55500/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55600/config.json b/checkpoint-55600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55600/model.safetensors b/checkpoint-55600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5115fbeb729a64aa89c5c367411150f8da612e6 --- /dev/null +++ b/checkpoint-55600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b13a59f1b48e24856548716885f6d5797ddcb5c41f02d8cbd9475a92b3a5266 +size 324662984 diff --git a/checkpoint-55600/training_args.bin b/checkpoint-55600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55700/config.json b/checkpoint-55700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55700/model.safetensors b/checkpoint-55700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7b55d6d44252676e47d539c2df0a3b9f29e36d8 --- /dev/null +++ b/checkpoint-55700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c28f6e9aa197a334b5acabce4908ad8e199c3053ae4195e2c15390983e3c23 +size 
324662984 diff --git a/checkpoint-55700/training_args.bin b/checkpoint-55700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55800/config.json b/checkpoint-55800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55800/model.safetensors b/checkpoint-55800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1dc1207971a37fe6b481d4bd08dd95e81c67191f --- /dev/null +++ b/checkpoint-55800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25d0617e91823da53d0fa2d719276d8bc8e23aeb11fc22ccc6a2e9acdc68da21 +size 324662984 diff --git a/checkpoint-55800/training_args.bin b/checkpoint-55800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-55900/config.json b/checkpoint-55900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-55900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-55900/model.safetensors b/checkpoint-55900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c24c45743c1034740c4a1cee5afcab4ef0726687 --- /dev/null +++ b/checkpoint-55900/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:8b27dc8eadf687ec6c3956902d5807ba526b4bbc6913de5a63f3a54f6d5e5744 +size 324662984 diff --git a/checkpoint-55900/training_args.bin b/checkpoint-55900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-55900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5600/config.json b/checkpoint-5600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-5600/model.safetensors b/checkpoint-5600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..399563edc5fbaad2d9b64744d728c6e24dbc1499 --- /dev/null +++ b/checkpoint-5600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:753e5526458ed85967f22e6ee444a347756ac99254206595e2ec88c160891e7d +size 324662984 diff --git a/checkpoint-5600/training_args.bin b/checkpoint-5600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-5600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56000/config.json b/checkpoint-56000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-56000/model.safetensors b/checkpoint-56000/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..39297491154a3ca6dfbbb65d7064a48488e9a758 --- /dev/null +++ b/checkpoint-56000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66bdced92b1aad40f4bb9c85b3ad8d42c24541563df59d1006a1ab456ce74c64 +size 324662984 diff --git a/checkpoint-56000/training_args.bin b/checkpoint-56000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56100/config.json b/checkpoint-56100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-56100/model.safetensors b/checkpoint-56100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..147ff1cdd5e1b77841b4f4a897ea7d1beedcc9f1 --- /dev/null +++ b/checkpoint-56100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2adca99faf689da90d0544dfd5b8854daefbb1ef629155632ed50c98e20351ca +size 324662984 diff --git a/checkpoint-56100/training_args.bin b/checkpoint-56100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56200/config.json b/checkpoint-56200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-56200/model.safetensors b/checkpoint-56200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..33d4551bea92f016acad6de889b16ba45acaed64 --- /dev/null +++ b/checkpoint-56200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ab7457d3432e0b265ab431d71caeb8fea5a1a14248b8f44d23c3b75b5772ed +size 324662984 diff --git a/checkpoint-56200/training_args.bin b/checkpoint-56200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56300/config.json b/checkpoint-56300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-56300/model.safetensors b/checkpoint-56300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7101f0ac673816232cf906dae3ed36141f7d7256 --- /dev/null +++ b/checkpoint-56300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:506c0c290e57f901305015f8cae67dd188cabb538976357b7c2b81766bc57ba9 +size 324662984 diff --git a/checkpoint-56300/training_args.bin b/checkpoint-56300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56400/config.json b/checkpoint-56400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-56400/model.safetensors b/checkpoint-56400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3cc84eb8b4e346b1fdfb8511a21fccd57ff94a8c --- /dev/null +++ b/checkpoint-56400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad688dacfc9312806b45edc5b7d41251b1a93cb79b8d661b9aa9ee407177ee40 +size 324662984 diff --git a/checkpoint-56400/training_args.bin b/checkpoint-56400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56500/config.json b/checkpoint-56500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-56500/model.safetensors b/checkpoint-56500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bed8f282fc4b254052bb116562e94a64f20370c2 --- /dev/null +++ b/checkpoint-56500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:045f9b8ea25681c991c8c24c181cc661c28cfc5beadfe107edaae415faa052c5 +size 324662984 diff --git a/checkpoint-56500/training_args.bin b/checkpoint-56500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56600/config.json b/checkpoint-56600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-56600/model.safetensors b/checkpoint-56600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4c9bd006a71b994a2783090771c9d58353a6edb3 --- /dev/null +++ b/checkpoint-56600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c40fbb5b55be16d52b8cf734f2540d9a3278ce48d94e8f87d3b7eec3a544d0c +size 324662984 diff --git a/checkpoint-56600/training_args.bin b/checkpoint-56600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56700/config.json b/checkpoint-56700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-56700/model.safetensors b/checkpoint-56700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2602d6656a5c5dbf1478c73461561e3c46eeb5d6 --- /dev/null +++ b/checkpoint-56700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b14e808955fbf097e089b42bed4687900e3f6575b9fe54adb9ac9a35d8fd395d +size 324662984 diff --git a/checkpoint-56700/training_args.bin b/checkpoint-56700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56800/config.json b/checkpoint-56800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + 
"layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-56800/model.safetensors b/checkpoint-56800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2fc650c3a4ffb8589930d135d7e74a7dd0f7633e --- /dev/null +++ b/checkpoint-56800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:545f2d420cf815d62dd5fbd515ea380f8aed21dd55370eb079e21caa4dba8947 +size 324662984 diff --git a/checkpoint-56800/training_args.bin b/checkpoint-56800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-56900/config.json b/checkpoint-56900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-56900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-56900/model.safetensors b/checkpoint-56900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f22b6bb9ef1c0b947d7f6e90c18e7dcd3b93fffb --- /dev/null +++ b/checkpoint-56900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d712bade3256aeb01154d1caffe23d7a04eece0252780878fae468f98589a4 +size 324662984 diff --git a/checkpoint-56900/training_args.bin b/checkpoint-56900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-56900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5700/config.json b/checkpoint-5700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + 
"hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-5700/model.safetensors b/checkpoint-5700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..28dcc84621188cd7c58cb6e01d7f8cebcc8f99fa --- /dev/null +++ b/checkpoint-5700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62ddb822d75681999621e974d3f69fcb8095d924f96fbc767e142e8bef0bfab4 +size 324662984 diff --git a/checkpoint-5700/training_args.bin b/checkpoint-5700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-5700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57000/config.json b/checkpoint-57000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57000/model.safetensors b/checkpoint-57000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea4af03ac22fcc401e2a364a2fad5505665560dc --- /dev/null +++ b/checkpoint-57000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77ea24c562f46505c020562a5aa59a25841f66d0f59ea910bcbc8e87adeb7ccf +size 324662984 diff --git a/checkpoint-57000/training_args.bin b/checkpoint-57000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57100/config.json b/checkpoint-57100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + 
"attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57100/model.safetensors b/checkpoint-57100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eeca69c1111a6d50080012b0b9d21d8d6fe51e76 --- /dev/null +++ b/checkpoint-57100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dd2716e2f5daae3a89c7708b61ebfd5a026e0bed594d3c329220102f383e6f1 +size 324662984 diff --git a/checkpoint-57100/training_args.bin b/checkpoint-57100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57200/config.json b/checkpoint-57200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57200/model.safetensors b/checkpoint-57200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b4a897e8b8e6e963c39870cb6d3bb863af9f3c56 --- /dev/null +++ b/checkpoint-57200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a99c368777522789eff170ef22efd310688d3b5736993a92d2c24bdff921cfe +size 324662984 diff --git a/checkpoint-57200/training_args.bin b/checkpoint-57200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57300/config.json b/checkpoint-57300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57300/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57300/model.safetensors b/checkpoint-57300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52375b02fca7154676b6c3614c25570cfceaa4e5 --- /dev/null +++ b/checkpoint-57300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f22a0024f4df99303e775b59a6ece61b7306f31a2f0c65fff7baf383b0dc6f +size 324662984 diff --git a/checkpoint-57300/training_args.bin b/checkpoint-57300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57400/config.json b/checkpoint-57400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57400/model.safetensors b/checkpoint-57400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..811a1bbd264a20e6a2594d84e5c9c88f557e7905 --- /dev/null +++ b/checkpoint-57400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb86482762a1fac4a17ac758672f08f999dac367ce0d8f44deb5b7c2d85aeb8a +size 324662984 diff --git a/checkpoint-57400/training_args.bin b/checkpoint-57400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57500/config.json b/checkpoint-57500/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57500/model.safetensors b/checkpoint-57500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7d12fe7dc59415661fe1dfd1d9d45b9bef29ecba --- /dev/null +++ b/checkpoint-57500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c261c8f05378365cc3728850a595dae49e7c86960cff0120b56f230248a17d4 +size 324662984 diff --git a/checkpoint-57500/training_args.bin b/checkpoint-57500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57600/config.json b/checkpoint-57600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57600/model.safetensors b/checkpoint-57600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f63fa629dc87d0b3434143a4b5bab65816122c02 --- /dev/null +++ b/checkpoint-57600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e47548ad13af0921cd2f79a55440509a7dc934d0fcd88a1bd22ba017fba5afc9 +size 324662984 diff --git a/checkpoint-57600/training_args.bin b/checkpoint-57600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57700/config.json b/checkpoint-57700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57700/model.safetensors b/checkpoint-57700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..08fc083834a8c7f2744217a512cfc5c87d918c2c --- /dev/null +++ b/checkpoint-57700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a2cc28d9a625a53ed49a4a1426716006364f7e94621bddc7228e51fdaadc593 +size 324662984 diff --git a/checkpoint-57700/training_args.bin b/checkpoint-57700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57800/config.json b/checkpoint-57800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57800/model.safetensors b/checkpoint-57800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..15f920f9bb4c8944c28a6a44c7bf8bc90eae02a8 --- /dev/null +++ b/checkpoint-57800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9db9ca958bf2a81f9acafb9f98a12f383d1d41bc8a7b21f5d89789564c48b2d +size 324662984 diff --git a/checkpoint-57800/training_args.bin b/checkpoint-57800/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-57900/config.json b/checkpoint-57900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-57900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-57900/model.safetensors b/checkpoint-57900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..05428f9aaf1841e79abd800904369d4ff137e921 --- /dev/null +++ b/checkpoint-57900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45307eae041cf638341ae7d75570f48936fd7866f8c841e417c224b67f169ff2 +size 324662984 diff --git a/checkpoint-57900/training_args.bin b/checkpoint-57900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-57900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5800/config.json b/checkpoint-5800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-5800/model.safetensors b/checkpoint-5800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..38d16e8ec76b68dee7b0580a102998d69fb13111 --- /dev/null +++ b/checkpoint-5800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e97ab267780a42aa734cbf3b60d30d92074eee6b23ce3166de41b8c669f4362f +size 
324662984 diff --git a/checkpoint-5800/training_args.bin b/checkpoint-5800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-5800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58000/config.json b/checkpoint-58000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-58000/model.safetensors b/checkpoint-58000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..54a1fdfcee8b1738a2f28dafa477a4b4a010a298 --- /dev/null +++ b/checkpoint-58000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff937d38bb3ab058967b55b4f77f0f6dcad0714b60032eb5ea530fc4db5a0f37 +size 324662984 diff --git a/checkpoint-58000/training_args.bin b/checkpoint-58000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58100/config.json b/checkpoint-58100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-58100/model.safetensors b/checkpoint-58100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..102e4441ad9fe7517c02d2eeceb90d272a3a108c --- /dev/null +++ b/checkpoint-58100/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9e5dddd5526262cc8fdcceecda186f2a031ae85ec2aad785fd0d4ff3568658e1 +size 324662984 diff --git a/checkpoint-58100/training_args.bin b/checkpoint-58100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58200/config.json b/checkpoint-58200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-58200/model.safetensors b/checkpoint-58200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b3336f2664d8606a95281c50109d86e6e91929bf --- /dev/null +++ b/checkpoint-58200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eea849e178f2954c9b860830f2a42595acf293f12384b7c7b3bfe119c9038c01 +size 324662984 diff --git a/checkpoint-58200/training_args.bin b/checkpoint-58200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58300/config.json b/checkpoint-58300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-58300/model.safetensors b/checkpoint-58300/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..75d202afedb3f096f0d9ebe301f61a275195c860 --- /dev/null +++ b/checkpoint-58300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9788c99d59ff155ab27457b7bb71e02a254e7f5c89bc852e82ccee36fd2c4e58 +size 324662984 diff --git a/checkpoint-58300/training_args.bin b/checkpoint-58300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58400/config.json b/checkpoint-58400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-58400/model.safetensors b/checkpoint-58400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..879ce7902324f94a602ae45ab0b33633292c230d --- /dev/null +++ b/checkpoint-58400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb49e3226f3d3eb79742b043b977b1a4153c30fc33b091db8ed0e643e250cbf9 +size 324662984 diff --git a/checkpoint-58400/training_args.bin b/checkpoint-58400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58500/config.json b/checkpoint-58500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-58500/model.safetensors b/checkpoint-58500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c5dd0ad00eff26eaa2d9b2444d440fffe53ebe35 --- /dev/null +++ b/checkpoint-58500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b79cd49a0593864f4a6ffbeb1c8125727f836a073196de3432e9c829772e2d1 +size 324662984 diff --git a/checkpoint-58500/training_args.bin b/checkpoint-58500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58600/config.json b/checkpoint-58600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-58600/model.safetensors b/checkpoint-58600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..221245faa6fddb175cba9674cc057a846f4f132a --- /dev/null +++ b/checkpoint-58600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5e94213949e6760bdc9be5b6685a2eaf5cbd8023561d346d82836019b492277 +size 324662984 diff --git a/checkpoint-58600/training_args.bin b/checkpoint-58600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58700/config.json b/checkpoint-58700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-58700/model.safetensors b/checkpoint-58700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..21e5f75851d4def5c6a9eb18efb2183fa0221178 --- /dev/null +++ b/checkpoint-58700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da39500086ca60005342ecf250c6f6fa09482b8efe7a0d3d7fbf7d3ae6ef3970 +size 324662984 diff --git a/checkpoint-58700/training_args.bin b/checkpoint-58700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58800/config.json b/checkpoint-58800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-58800/model.safetensors b/checkpoint-58800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7e24d6f97778dbbbcd97291e7c1528a0098a318 --- /dev/null +++ b/checkpoint-58800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c9fdf2684a6e19ff3f96a3561bd6a51c9fd52b075b9a95834f17ffd0144e54b +size 324662984 diff --git a/checkpoint-58800/training_args.bin b/checkpoint-58800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-58900/config.json b/checkpoint-58900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-58900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + 
"num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-58900/model.safetensors b/checkpoint-58900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ba78324283cc3a1ef0e5315d88aee2393aaf25b --- /dev/null +++ b/checkpoint-58900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb1851ab1e3c5854784417dca115bbb9387aebd632482e260c52821582eaa67 +size 324662984 diff --git a/checkpoint-58900/training_args.bin b/checkpoint-58900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-58900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-5900/config.json b/checkpoint-5900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-5900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-5900/model.safetensors b/checkpoint-5900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa78a64619a19ff8fa0c42ca19977e77798eeced --- /dev/null +++ b/checkpoint-5900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46b40b96c4ad67ff2529ad0fd3cbc8f9a6a0a75e7713b3152545181a7b5c609 +size 324662984 diff --git a/checkpoint-5900/training_args.bin b/checkpoint-5900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-5900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59000/config.json b/checkpoint-59000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + 
"layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59000/model.safetensors b/checkpoint-59000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..937894638e3a59d122023ba39a67473d413f82cc --- /dev/null +++ b/checkpoint-59000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2243fe0463695600d06197c52575a1bc04c736bd7806cedf860f63e9c261bad +size 324662984 diff --git a/checkpoint-59000/training_args.bin b/checkpoint-59000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59100/config.json b/checkpoint-59100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59100/model.safetensors b/checkpoint-59100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2e8d87199eb8d7d63b71bf4e4d3d7d2eb54d0bde --- /dev/null +++ b/checkpoint-59100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f84cb2fb7d60d582c21f0a5ceec34c84e10a2459dd7535d4023aa6b78d29bf2 +size 324662984 diff --git a/checkpoint-59100/training_args.bin b/checkpoint-59100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59200/config.json b/checkpoint-59200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + 
"hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59200/model.safetensors b/checkpoint-59200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..51564545f6ebf80c5163c73cffa759b23241c172 --- /dev/null +++ b/checkpoint-59200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d802c8b9ff01079e54379be684360e60599ef9af3c59c10726c6e1bb63f9beb +size 324662984 diff --git a/checkpoint-59200/training_args.bin b/checkpoint-59200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59300/config.json b/checkpoint-59300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59300/model.safetensors b/checkpoint-59300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..27d03cb1240a2c57b63cae4dd4a27acce09301fa --- /dev/null +++ b/checkpoint-59300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2b4c77d6eb6a1b69781a173b13e09b8631a277847ff720d65275ecbdef90376 +size 324662984 diff --git a/checkpoint-59300/training_args.bin b/checkpoint-59300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59400/config.json b/checkpoint-59400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + 
"attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59400/model.safetensors b/checkpoint-59400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..20024bcbb616b9e87ebeccda5c2022aa7368dc0e --- /dev/null +++ b/checkpoint-59400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab9f4389334fc7ad82b220983109ffc64687974507a4e5a31b47d53d5cdb634f +size 324662984 diff --git a/checkpoint-59400/training_args.bin b/checkpoint-59400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59500/config.json b/checkpoint-59500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59500/model.safetensors b/checkpoint-59500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5c77d5db25829ebbefb8e28698a560fa3a4cbeb2 --- /dev/null +++ b/checkpoint-59500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d70a40ece6176221fada518b3289c14638b86ae9beb1e6993f133caac2175d9c +size 324662984 diff --git a/checkpoint-59500/training_args.bin b/checkpoint-59500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59600/config.json b/checkpoint-59600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59600/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59600/model.safetensors b/checkpoint-59600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e6b3f820aaaa766a863eb0cc52d10e6f60cf56ae --- /dev/null +++ b/checkpoint-59600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9511c8eddac945157d3bc772d7561d4c7d4483a3410bae2e1459b6dc2480799 +size 324662984 diff --git a/checkpoint-59600/training_args.bin b/checkpoint-59600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59700/config.json b/checkpoint-59700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59700/model.safetensors b/checkpoint-59700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dfedccf49afdf357701f3d73ff3afe21c408d1e7 --- /dev/null +++ b/checkpoint-59700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8df6851f3405016460dcdb58832049982b6884c04345dabd22dd462d3301acf7 +size 324662984 diff --git a/checkpoint-59700/training_args.bin b/checkpoint-59700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59800/config.json b/checkpoint-59800/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59800/model.safetensors b/checkpoint-59800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..25c18121df0d2c505966163c3bc20f1f1e4f0f37 --- /dev/null +++ b/checkpoint-59800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de111983e45ead9ac461c3eb1e70332f58725846535a5ecd92db21b30451c4a7 +size 324662984 diff --git a/checkpoint-59800/training_args.bin b/checkpoint-59800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-59900/config.json b/checkpoint-59900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-59900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-59900/model.safetensors b/checkpoint-59900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..db2b16ab0ab51c0d73d45eb81917fb9d14d042d0 --- /dev/null +++ b/checkpoint-59900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe0fe01d3a4ae3d8da194f626303b250c87ae9f85ab5f60926c4518064380b7 +size 324662984 diff --git a/checkpoint-59900/training_args.bin b/checkpoint-59900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-59900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-600/config.json b/checkpoint-600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-600/model.safetensors b/checkpoint-600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7c858423c7f1c755bea7e2267e815249ae9abf18 --- /dev/null +++ b/checkpoint-600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aeacc2c448ac0651561ddccb1ae81763195f7fdf4bd769ac1edbbe4b2be2597 +size 324662984 diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6000/config.json b/checkpoint-6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-6000/model.safetensors b/checkpoint-6000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..24290506ad4f0c6f89386a36d0eb7989576f5689 --- /dev/null +++ b/checkpoint-6000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6af9ebf5c45281f642d0c41038773c7fd8c6f082f19357222a85b2c7bef604 +size 324662984 diff --git a/checkpoint-6000/training_args.bin b/checkpoint-6000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ 
b/checkpoint-6000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60000/config.json b/checkpoint-60000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-60000/model.safetensors b/checkpoint-60000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..35ea4da85f736840b2d4a66e7d7622fb853d9f93 --- /dev/null +++ b/checkpoint-60000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4682ad086e17041e485d3c40e5e79397a4460a1dc841dc8b3353b68f322b091b +size 324662984 diff --git a/checkpoint-60000/training_args.bin b/checkpoint-60000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60100/config.json b/checkpoint-60100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-60100/model.safetensors b/checkpoint-60100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fb50c49a6e3c90e53d67f7c774771e5ee67c4b7b --- /dev/null +++ b/checkpoint-60100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a5d3e87f1d0d6a9f4f62a00811c23b8556c9a410e226bfcafa6d3367b0aa02d +size 324662984 diff --git a/checkpoint-60100/training_args.bin b/checkpoint-60100/training_args.bin new 
file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60200/config.json b/checkpoint-60200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-60200/model.safetensors b/checkpoint-60200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0a2dab1740d58becb335aac9f5fba3e0efcc8d1b --- /dev/null +++ b/checkpoint-60200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3758630f3d0cc89d3eff10dd494c9c2971a8674017f13140a43774f2d8a0569 +size 324662984 diff --git a/checkpoint-60200/training_args.bin b/checkpoint-60200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60300/config.json b/checkpoint-60300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-60300/model.safetensors b/checkpoint-60300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9b3af6c9e0c9f3d50b0102012b91bed71aaa08f4 --- /dev/null +++ b/checkpoint-60300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9563e0e583999b32cbb160fe096b800eb9d019fe0b3d34be288e148d62622e64 +size 324662984 diff --git a/checkpoint-60300/training_args.bin b/checkpoint-60300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60400/config.json b/checkpoint-60400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-60400/model.safetensors b/checkpoint-60400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b6a62dcfe1ff891baf5b66c1b0224aca4023557f --- /dev/null +++ b/checkpoint-60400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9636d68acefc65980ae79cccf2fff5988da44bfde17bea9facddbe68c3c8a454 +size 324662984 diff --git a/checkpoint-60400/training_args.bin b/checkpoint-60400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60500/config.json b/checkpoint-60500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-60500/model.safetensors b/checkpoint-60500/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..d8b944ef545966ff89b02e856d7f5169966768ac --- /dev/null +++ b/checkpoint-60500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c08a152f56c419c8364cc373c56be9a8325ef3d5c85a41ee152d5aafb80a55b1 +size 324662984 diff --git a/checkpoint-60500/training_args.bin b/checkpoint-60500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60600/config.json b/checkpoint-60600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-60600/model.safetensors b/checkpoint-60600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2338cfa637ae6d17f1f615ae53f13dd86c158996 --- /dev/null +++ b/checkpoint-60600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b473a27f8441f54b660d3aa26041c323026df68df26fc78a02cb9e7fab7c83aa +size 324662984 diff --git a/checkpoint-60600/training_args.bin b/checkpoint-60600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60700/config.json b/checkpoint-60700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-60700/model.safetensors b/checkpoint-60700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d6361bc3be2f270076dc0bdf673566afd465816 --- /dev/null +++ b/checkpoint-60700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:169a302d098c1bd39f6e3c826362058075d5d2af9b4305375fcfa6e1cc5d1dd0 +size 324662984 diff --git a/checkpoint-60700/training_args.bin b/checkpoint-60700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60800/config.json b/checkpoint-60800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-60800/model.safetensors b/checkpoint-60800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1a25472885974df90d8a8f34ec464ed0b8b8bc94 --- /dev/null +++ b/checkpoint-60800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e8963fb350ba043c2a5f6bf022f5ea6a20cd92574a0ad13ce16ea200dae66ed +size 324662984 diff --git a/checkpoint-60800/training_args.bin b/checkpoint-60800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-60900/config.json b/checkpoint-60900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-60900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + 
"torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-60900/model.safetensors b/checkpoint-60900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0457900b2c5fe64ae88aee173ff723206d70a537 --- /dev/null +++ b/checkpoint-60900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6da09c994f1666849220bea89723c8529366c89e88cc8125544738f87d72e4a +size 324662984 diff --git a/checkpoint-60900/training_args.bin b/checkpoint-60900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-60900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6100/config.json b/checkpoint-6100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-6100/model.safetensors b/checkpoint-6100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e60a7b24ed3f3dc2e7cb7cfa388009cd4b26e0dd --- /dev/null +++ b/checkpoint-6100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68c2c74b4dc2faccf4f59642e5a9e3e1cd52a0a99989722096068f64d8c054a2 +size 324662984 diff --git a/checkpoint-6100/training_args.bin b/checkpoint-6100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-6100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61000/config.json b/checkpoint-61000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 
12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61000/model.safetensors b/checkpoint-61000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c0b4936fa1f70af4be8dab45ebd12e087251f993 --- /dev/null +++ b/checkpoint-61000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee43bcb14c4e12d9d69698c5851f290e81501b0d6784ae018a18aca169954c10 +size 324662984 diff --git a/checkpoint-61000/training_args.bin b/checkpoint-61000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61100/config.json b/checkpoint-61100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61100/model.safetensors b/checkpoint-61100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..af254e8322d6835853ed8032511ea32a201207e2 --- /dev/null +++ b/checkpoint-61100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342fe41e9417a7ab74d99ce274fbdf85a843c084d87ae246936f33eea870d493 +size 324662984 diff --git a/checkpoint-61100/training_args.bin b/checkpoint-61100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61200/config.json b/checkpoint-61200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 
1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61200/model.safetensors b/checkpoint-61200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e1453d80c24af8b59285ca15b1b10a9e1fa9d353 --- /dev/null +++ b/checkpoint-61200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00395e3db3a514c10bf2525a69a4062b7a161101b8416774dd7ca48fbc743022 +size 324662984 diff --git a/checkpoint-61200/training_args.bin b/checkpoint-61200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61300/config.json b/checkpoint-61300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61300/model.safetensors b/checkpoint-61300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a9acb2de171ccca83a785b1b548bd2bf67ff1e6 --- /dev/null +++ b/checkpoint-61300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:372ef5633f0282baf296c7476eaf59cff26656a343c32ab1ae30102ba49b79d9 +size 324662984 diff --git a/checkpoint-61300/training_args.bin b/checkpoint-61300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61400/config.json b/checkpoint-61400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + 
"hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61400/model.safetensors b/checkpoint-61400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..960e3a5934bd371a9baa7d7c65acfb531d50634c --- /dev/null +++ b/checkpoint-61400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9543ccbb118a2d0d7bd1f0f66bb02d53f544d85edd6cee9cb12f5fde4ecbebaa +size 324662984 diff --git a/checkpoint-61400/training_args.bin b/checkpoint-61400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61500/config.json b/checkpoint-61500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61500/model.safetensors b/checkpoint-61500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4b4d377f363d4f69696cb9d19afae55195b3f65c --- /dev/null +++ b/checkpoint-61500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7203b09741192de73184d653e52eaf6e997c22ce892d99fd93e4436a135afa9 +size 324662984 diff --git a/checkpoint-61500/training_args.bin b/checkpoint-61500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61600/config.json b/checkpoint-61600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, 
+ "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61600/model.safetensors b/checkpoint-61600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2450f851859417555de60899ba71654e146dc0e9 --- /dev/null +++ b/checkpoint-61600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a77a5f2c71fc3fce2dc0ec58c72090d51fe8ca7f516f03bb67f2f5fddb06d6ae +size 324662984 diff --git a/checkpoint-61600/training_args.bin b/checkpoint-61600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61700/config.json b/checkpoint-61700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61700/model.safetensors b/checkpoint-61700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..22173b89eb34cb139fe417a0233dcdbbbce023d5 --- /dev/null +++ b/checkpoint-61700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc9657e987f27b818370545e4d71bc16749940026f9b8ab38b2f789b078dd1eb +size 324662984 diff --git a/checkpoint-61700/training_args.bin b/checkpoint-61700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61800/config.json b/checkpoint-61800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61800/model.safetensors b/checkpoint-61800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b4f6af033c6ce4787b4fbf999be556c9b95d52e6 --- /dev/null +++ b/checkpoint-61800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728a97406b9d639b80419d1f970a6583f885b3720c8b0705c0044354b2e8b44e +size 324662984 diff --git a/checkpoint-61800/training_args.bin b/checkpoint-61800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-61900/config.json b/checkpoint-61900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-61900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-61900/model.safetensors b/checkpoint-61900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..204408303495c5d6767280fd77fccfcab150353e --- /dev/null +++ b/checkpoint-61900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7befbe582ee93cc3391f82280df65603a84bdad1bac30357fc4a1874d230a2e +size 324662984 diff --git a/checkpoint-61900/training_args.bin b/checkpoint-61900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-61900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6200/config.json b/checkpoint-6200/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-6200/model.safetensors b/checkpoint-6200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d06f03e7bbac6c0f601bd32a833a77a0efed8472 --- /dev/null +++ b/checkpoint-6200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:726ea7db783f0b840ff5230117f3d19ab82b278e4844606d69623bf4caf16059 +size 324662984 diff --git a/checkpoint-6200/training_args.bin b/checkpoint-6200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-6200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62000/config.json b/checkpoint-62000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62000/model.safetensors b/checkpoint-62000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..79bb8d2d7fdafd562bc93181a2f61b488a64346e --- /dev/null +++ b/checkpoint-62000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0990b41b6b2e15ffd481af42406ef2cd237f3370aa6a2f78c5631be262b8697 +size 324662984 diff --git a/checkpoint-62000/training_args.bin b/checkpoint-62000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62100/config.json b/checkpoint-62100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62100/model.safetensors b/checkpoint-62100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1a37dfde552000e3593d174f22bb2e26aab6dd68 --- /dev/null +++ b/checkpoint-62100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13c0aa0e0fb7916688c30d0ab3368b4abfc97664ad3ed32d75b075f9b7dcf408 +size 324662984 diff --git a/checkpoint-62100/training_args.bin b/checkpoint-62100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62200/config.json b/checkpoint-62200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62200/model.safetensors b/checkpoint-62200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aabc459a93a935a9429e0112f8db28e0378223bd --- /dev/null +++ b/checkpoint-62200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f177b6074198f5226192a44ced8e3c31fde3563e6c5975a763aa3560204cceb +size 324662984 diff --git a/checkpoint-62200/training_args.bin b/checkpoint-62200/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62300/config.json b/checkpoint-62300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62300/model.safetensors b/checkpoint-62300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..293d6ede31bcee4eb93513fd521cdb5314519b3c --- /dev/null +++ b/checkpoint-62300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:263a8d7f83f4ad08570aca18ca1f0d95704da86d7b04d968836f89c9d66db405 +size 324662984 diff --git a/checkpoint-62300/training_args.bin b/checkpoint-62300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62400/config.json b/checkpoint-62400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62400/model.safetensors b/checkpoint-62400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d843b14acc595eded63cde2a0607a307ce8028e --- /dev/null +++ b/checkpoint-62400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b6804e73db025b0846ed5e5497f9016e912a4831fd8f315abffe1492b1ca4e6 +size 
324662984 diff --git a/checkpoint-62400/training_args.bin b/checkpoint-62400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62500/config.json b/checkpoint-62500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62500/model.safetensors b/checkpoint-62500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c647d326e8a417f6bee2a0983728c63fae0e9db0 --- /dev/null +++ b/checkpoint-62500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f52291037e4405f24aa4f25c630d5b607a2bf38275efd944ba3687a08d4cd6e7 +size 324662984 diff --git a/checkpoint-62500/training_args.bin b/checkpoint-62500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62600/config.json b/checkpoint-62600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62600/model.safetensors b/checkpoint-62600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2134b1bb2ff9a14d5583d31a84318e73c4c66b54 --- /dev/null +++ b/checkpoint-62600/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:ece28c85ac4241799d7da1d7adfb0261d936c2f13aeb7911348d11440f8bffc7 +size 324662984 diff --git a/checkpoint-62600/training_args.bin b/checkpoint-62600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62700/config.json b/checkpoint-62700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62700/model.safetensors b/checkpoint-62700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f41e2cefbf330e12c2ea5e2c0c1c122f7bb4ee59 --- /dev/null +++ b/checkpoint-62700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fffb7b1271accd6bffe0339781ae8bd78d77010c866957d89a88c68eb2f5dc40 +size 324662984 diff --git a/checkpoint-62700/training_args.bin b/checkpoint-62700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62800/config.json b/checkpoint-62800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62800/model.safetensors b/checkpoint-62800/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..b2d19d9d43921b68e835c833b630deaeff29319c --- /dev/null +++ b/checkpoint-62800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048878fd2ecb93ec73d748cd80abbf597524b08c93c5e4411d176c7f1499a554 +size 324662984 diff --git a/checkpoint-62800/training_args.bin b/checkpoint-62800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-62900/config.json b/checkpoint-62900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-62900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-62900/model.safetensors b/checkpoint-62900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3946bf494714529fc6a862f6b73c8a60d5f87459 --- /dev/null +++ b/checkpoint-62900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe2715b202b3edd56eb848bcaffe7e36109b3820ef14ece18e9475bb9f41e43e +size 324662984 diff --git a/checkpoint-62900/training_args.bin b/checkpoint-62900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-62900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6300/config.json b/checkpoint-6300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 
50304 +} diff --git a/checkpoint-6300/model.safetensors b/checkpoint-6300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a640aa59e362735f02300b609be1447f79cf89d7 --- /dev/null +++ b/checkpoint-6300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab4d39ca22e8a2f488af79095aeb48f80c7887890eae1461a3c6e0c30a7d88d9 +size 324662984 diff --git a/checkpoint-6300/training_args.bin b/checkpoint-6300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-6300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63000/config.json b/checkpoint-63000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63000/model.safetensors b/checkpoint-63000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a1b406252f8e6a776cfea9078146ed1e9a1f9d99 --- /dev/null +++ b/checkpoint-63000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86f266e6e251d49bf99b9a57bf5d537acec676652270229ad22ccb44076ac3f5 +size 324662984 diff --git a/checkpoint-63000/training_args.bin b/checkpoint-63000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63100/config.json b/checkpoint-63100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63100/model.safetensors b/checkpoint-63100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4b5b5d60cc44c09b18183a4df824ee1793f6543e --- /dev/null +++ b/checkpoint-63100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80a407a89f21131a8db7de4f6dc21e81f46258e4d2f6d4962f4b0aa34c0011c7 +size 324662984 diff --git a/checkpoint-63100/training_args.bin b/checkpoint-63100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63200/config.json b/checkpoint-63200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63200/model.safetensors b/checkpoint-63200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ee0a9bc33059827a786a75d1dba71a7de53fd46d --- /dev/null +++ b/checkpoint-63200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ad66fe5dccf8f13dba096e55205565739b2711dfb56312792f56267a9e26ffc +size 324662984 diff --git a/checkpoint-63200/training_args.bin b/checkpoint-63200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63300/config.json b/checkpoint-63300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63300/model.safetensors b/checkpoint-63300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f6e46f67a6b220121f956a1c342b84755cc46403 --- /dev/null +++ b/checkpoint-63300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4dea942032a4e124dab1d413aea85b0c2a12eca07aac04b74f708e2336f8425 +size 324662984 diff --git a/checkpoint-63300/training_args.bin b/checkpoint-63300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63400/config.json b/checkpoint-63400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63400/model.safetensors b/checkpoint-63400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a95bc76e08b16055377a6fb6b37a52bc77191ea0 --- /dev/null +++ b/checkpoint-63400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:627733424c10b1d1f2a749c65fd14bdc817d3cbe1e3eaa22e8aa1f00a3e6d6f5 +size 324662984 diff --git a/checkpoint-63400/training_args.bin b/checkpoint-63400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63500/config.json b/checkpoint-63500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, 
+ "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63500/model.safetensors b/checkpoint-63500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6dc18b4b61c59d4a69f72af4f9e2ea8f5df98357 --- /dev/null +++ b/checkpoint-63500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99b8d3b4ef46c8d72ea6bda63befaabeb0ec2c7ca11e3ab5385c4695c47a112f +size 324662984 diff --git a/checkpoint-63500/training_args.bin b/checkpoint-63500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63600/config.json b/checkpoint-63600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63600/model.safetensors b/checkpoint-63600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aa29a3a0cbe00a8a528b6960e742143b034c31ca --- /dev/null +++ b/checkpoint-63600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a6f0231881a8b474a083b91ac07b2dba7e5c9ced6ecc31dca65431880a3842 +size 324662984 diff --git a/checkpoint-63600/training_args.bin b/checkpoint-63600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63700/config.json b/checkpoint-63700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 
768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63700/model.safetensors b/checkpoint-63700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a2d1481b93e136c115c63747a881243c808ec4f5 --- /dev/null +++ b/checkpoint-63700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703868ba40fdc0cb54553ab85b6249c5abad2c7e31ae4e1fb776787ee09de959 +size 324662984 diff --git a/checkpoint-63700/training_args.bin b/checkpoint-63700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63800/config.json b/checkpoint-63800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63800/model.safetensors b/checkpoint-63800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1da0adc97f0b7eeaf7d3441901aaced7aa0bc118 --- /dev/null +++ b/checkpoint-63800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f97a919dd671cf224a60f3ff1431efb0c8772b59bf84ea7c347c8e90d91eeaeb +size 324662984 diff --git a/checkpoint-63800/training_args.bin b/checkpoint-63800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-63900/config.json b/checkpoint-63900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-63900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + 
"bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-63900/model.safetensors b/checkpoint-63900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..791680723bea82c3b7bcbafaddd6b1d70e99105d --- /dev/null +++ b/checkpoint-63900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43f2607aeec3b9aaf94b0f02f081f5926aedd00aaef34b0d82af8bc9c92bf3da +size 324662984 diff --git a/checkpoint-63900/training_args.bin b/checkpoint-63900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-63900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6400/config.json b/checkpoint-6400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-6400/model.safetensors b/checkpoint-6400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..821e10ffd9389a9e833471f25171688ce1ddab9c --- /dev/null +++ b/checkpoint-6400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d09acff479c4bf9114de2f737464e593781add1c920ab0fe58e382d164ccb9e9 +size 324662984 diff --git a/checkpoint-6400/training_args.bin b/checkpoint-6400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-6400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64000/config.json b/checkpoint-64000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": 
"georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64000/model.safetensors b/checkpoint-64000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cbe77848b3b62cb198435e6bad2d904a85e62cf4 --- /dev/null +++ b/checkpoint-64000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d67d1c0abc62492bab34483bb8c7170eb979d111ea83e45e411b920ae94e314c +size 324662984 diff --git a/checkpoint-64000/training_args.bin b/checkpoint-64000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64100/config.json b/checkpoint-64100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64100/model.safetensors b/checkpoint-64100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c8d117e1fc50f658ac3218dc62eb1266551c817e --- /dev/null +++ b/checkpoint-64100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ea422d3e202949d0f3ad577729e34383bfcccdb922162fe06b2f9c207ca314c +size 324662984 diff --git a/checkpoint-64100/training_args.bin b/checkpoint-64100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64200/config.json b/checkpoint-64200/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64200/model.safetensors b/checkpoint-64200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1eacd0cc1d597a3dd3aa43581321011b47202738 --- /dev/null +++ b/checkpoint-64200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fcc84afef5bc4672805c0200e6583b6fe1616b9dc61f36373e5715dd2653ff3 +size 324662984 diff --git a/checkpoint-64200/training_args.bin b/checkpoint-64200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64300/config.json b/checkpoint-64300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64300/model.safetensors b/checkpoint-64300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..46bbc2dd4c64081b43ca504daac645c7f4626a5f --- /dev/null +++ b/checkpoint-64300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b65818a67a7f5f22ff603066604b42796c7dba971d309640a20f64f175c1cbf4 +size 324662984 diff --git a/checkpoint-64300/training_args.bin b/checkpoint-64300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64400/config.json b/checkpoint-64400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64400/model.safetensors b/checkpoint-64400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..210153ee40a54a28e6c0dbdaeb14e782e9720333 --- /dev/null +++ b/checkpoint-64400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e3c4894bcc93bdb74da9c6e38fb23cb4f0a7d47023e7e9e477fa48ffe779565 +size 324662984 diff --git a/checkpoint-64400/training_args.bin b/checkpoint-64400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64500/config.json b/checkpoint-64500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64500/model.safetensors b/checkpoint-64500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5641fa7437a56131d72937b9cc8fbce830823165 --- /dev/null +++ b/checkpoint-64500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b60e0a19b1aa421bebb6b95a68737eab77a02c1ea798e4e4752ae14fb0e45f +size 324662984 diff --git a/checkpoint-64500/training_args.bin b/checkpoint-64500/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64600/config.json b/checkpoint-64600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64600/model.safetensors b/checkpoint-64600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b7594a3ec1859eab48fa3913013cfa04b225f673 --- /dev/null +++ b/checkpoint-64600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8500b8c3c1f6aedb0caf58ec47df844b363ac9911c2a0387797bb3d060a04416 +size 324662984 diff --git a/checkpoint-64600/training_args.bin b/checkpoint-64600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64700/config.json b/checkpoint-64700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64700/model.safetensors b/checkpoint-64700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ff355ef814c8783ef2fc2a3bd1e0d05deefed50 --- /dev/null +++ b/checkpoint-64700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb948b5470bce8fcf8e980402613045c5525be37f73586106101bc97da933263 +size 
324662984 diff --git a/checkpoint-64700/training_args.bin b/checkpoint-64700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64800/config.json b/checkpoint-64800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64800/model.safetensors b/checkpoint-64800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5baf9e495657a61d0d7ecbb6f8ce0b3b042960bb --- /dev/null +++ b/checkpoint-64800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2402bfe5069f9fbf499e1c6fb15de83f5e8f812208f5ec43c09eb2c4523cdd8 +size 324662984 diff --git a/checkpoint-64800/training_args.bin b/checkpoint-64800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-64900/config.json b/checkpoint-64900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-64900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-64900/model.safetensors b/checkpoint-64900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..891ea8d2a058bc8d395c5b3b41c8353ad8f7e2cc --- /dev/null +++ b/checkpoint-64900/model.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:0fb1e4cc02b94421bee872ea092f486618413a13c1d28a9b1c1bcab0e950bed4 +size 324662984 diff --git a/checkpoint-64900/training_args.bin b/checkpoint-64900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-64900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6500/config.json b/checkpoint-6500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-6500/model.safetensors b/checkpoint-6500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..42fbbc39e0c9d3342e5869db365278a76ad93442 --- /dev/null +++ b/checkpoint-6500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f52ac856700f8f303d97c23dc948a7bff13261c928a96b31c74d6c78860caf0 +size 324662984 diff --git a/checkpoint-6500/training_args.bin b/checkpoint-6500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-6500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-65000/config.json b/checkpoint-65000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-65000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-65000/model.safetensors b/checkpoint-65000/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..8fdf4484fda9be59093c26897e7a859ac2ed79d6 --- /dev/null +++ b/checkpoint-65000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dac5604b2a506c6f294df10995e3e349a7f9c1b3ea60bb3a85ccee3d140f57a +size 324662984 diff --git a/checkpoint-65000/training_args.bin b/checkpoint-65000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-65000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6600/config.json b/checkpoint-6600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-6600/model.safetensors b/checkpoint-6600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2bb8543f4f1fc8d171911adfc4e640f18a07b69a --- /dev/null +++ b/checkpoint-6600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7ff8b1b097a2f046e81746f800d046ccb8e60b89abfc8cdd560382dfd144a72 +size 324662984 diff --git a/checkpoint-6600/training_args.bin b/checkpoint-6600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-6600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6700/config.json b/checkpoint-6700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} 
diff --git a/checkpoint-6700/model.safetensors b/checkpoint-6700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7841f55eb7250d616bd1a0aaa3829a3dade07c06 --- /dev/null +++ b/checkpoint-6700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dd20ea0682ba734680cfd215f7347773f5ddabc2350f3be4b6f2e00a0589704 +size 324662984 diff --git a/checkpoint-6700/training_args.bin b/checkpoint-6700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-6700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6800/config.json b/checkpoint-6800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-6800/model.safetensors b/checkpoint-6800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa85f13878e0127b9a82063f6b996043c7f2c3a8 --- /dev/null +++ b/checkpoint-6800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2279140c1deb6e5833edd0c130a8d2cee0b85f5a4611c01cc1320d56167c65e9 +size 324662984 diff --git a/checkpoint-6800/training_args.bin b/checkpoint-6800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-6800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-6900/config.json b/checkpoint-6900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-6900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + 
"transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-6900/model.safetensors b/checkpoint-6900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4fee8683feec6eedfe33ad1146b4ec6550a8124d --- /dev/null +++ b/checkpoint-6900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01b5048bc1830d58bf27ae6ced5d10e49304421d32d7889024a7b10270d4c266 +size 324662984 diff --git a/checkpoint-6900/training_args.bin b/checkpoint-6900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-6900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-700/config.json b/checkpoint-700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-700/model.safetensors b/checkpoint-700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..49cc9551a394c1fe559501961cc377b7a3ce786c --- /dev/null +++ b/checkpoint-700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:724bb60a2c9fe997583beba8959b4f2c8e454faacea8001b80b930465e690fed +size 324662984 diff --git a/checkpoint-700/training_args.bin b/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7000/config.json b/checkpoint-7000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + 
"rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7000/model.safetensors b/checkpoint-7000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8a685cdd00c45a5e9b6581a275d5d6a0cecf8f91 --- /dev/null +++ b/checkpoint-7000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a241c65d3ae00ca4a538b9315c3bd9837ffefd79bb9d8e8ebf24988f01e6a3 +size 324662984 diff --git a/checkpoint-7000/training_args.bin b/checkpoint-7000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7100/config.json b/checkpoint-7100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7100/model.safetensors b/checkpoint-7100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..01f2a3a51a832ed1de1ffea51b8113452d7e24d2 --- /dev/null +++ b/checkpoint-7100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed1a58b0f30d6a349a87cdd008bc7e6bdcc0957e01a153f9e80f382cb2967858 +size 324662984 diff --git a/checkpoint-7100/training_args.bin b/checkpoint-7100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7200/config.json b/checkpoint-7200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + 
"model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7200/model.safetensors b/checkpoint-7200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d62fe3365fa34784966a1ef10fb215ad4e2cc19d --- /dev/null +++ b/checkpoint-7200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd36d4692630e8184ee30cf30f4b8a02033e9af7bf9837fbec1bb051240f92db +size 324662984 diff --git a/checkpoint-7200/training_args.bin b/checkpoint-7200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7300/config.json b/checkpoint-7300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7300/model.safetensors b/checkpoint-7300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..793f2ddef323c6af00b534345d5bcc7377130fe2 --- /dev/null +++ b/checkpoint-7300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5388658effd508e9be423f26e6183bd495af0b81ae18e8a121bffed82b6db34d +size 324662984 diff --git a/checkpoint-7300/training_args.bin b/checkpoint-7300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7400/config.json b/checkpoint-7400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + 
"intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7400/model.safetensors b/checkpoint-7400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c52c86405899baf10fc4414e4c949a0997d19cbc --- /dev/null +++ b/checkpoint-7400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44a012ce6e9f267000b4b36af46cfd138228ba69a49c1969d05918cce1a7d790 +size 324662984 diff --git a/checkpoint-7400/training_args.bin b/checkpoint-7400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7500/config.json b/checkpoint-7500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7500/model.safetensors b/checkpoint-7500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5a2327e438672a1c8cdf99717b7779372398b314 --- /dev/null +++ b/checkpoint-7500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:855fee1c8869ca35094faf8e4cb6e831211539c40f2373ed772c61ad2d5c08c2 +size 324662984 diff --git a/checkpoint-7500/training_args.bin b/checkpoint-7500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7600/config.json b/checkpoint-7600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 
2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7600/model.safetensors b/checkpoint-7600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..748d59204370a816bcf00121d379b514ce0e8a13 --- /dev/null +++ b/checkpoint-7600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6fcb59d8fa73f0f7473993046c6e69d6f8ea39c00404162d7a0495bd7959e34 +size 324662984 diff --git a/checkpoint-7600/training_args.bin b/checkpoint-7600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7700/config.json b/checkpoint-7700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7700/model.safetensors b/checkpoint-7700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f0d1783f9bb362e2c097d62d526334c381b32acb --- /dev/null +++ b/checkpoint-7700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a1601f3d28d0cc94e326935f65bee9d435f654957110d7cbec310a7b67d1ecf +size 324662984 diff --git a/checkpoint-7700/training_args.bin b/checkpoint-7700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7800/config.json b/checkpoint-7800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + 
"attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7800/model.safetensors b/checkpoint-7800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..84667ff94cb94ef8393610dac23aca63171e6a11 --- /dev/null +++ b/checkpoint-7800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c35876c9a45d48be675703af1b4af956306059b4d9af2e3e1c491efbe5605aa5 +size 324662984 diff --git a/checkpoint-7800/training_args.bin b/checkpoint-7800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-7900/config.json b/checkpoint-7900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-7900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-7900/model.safetensors b/checkpoint-7900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e0e04c4587b8d1e14c18561567195694e7417fa3 --- /dev/null +++ b/checkpoint-7900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b1ec52395be1c14301fdbfb88d8924e9b5a02ce3bcc900aa363ff1870b38e8 +size 324662984 diff --git a/checkpoint-7900/training_args.bin b/checkpoint-7900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-7900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-800/config.json b/checkpoint-800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-800/config.json @@ -0,0 +1,31 @@ +{ + 
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-800/model.safetensors b/checkpoint-800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b8fa7280294dda3e868ca31fbfdc4e8ff0a13b8 --- /dev/null +++ b/checkpoint-800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de87fc2b26d177e393555124766b0d1959f2ee409b61a092812128ab7e82fd73 +size 324662984 diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-8000/config.json b/checkpoint-8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8000/model.safetensors b/checkpoint-8000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5c61eeeafd3d9993fded5a03021b11077434c76f --- /dev/null +++ b/checkpoint-8000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e4241c188b8a772300ab415f4d9cd43193cd8802baa1b2ea24b07f5b5d7cb4d +size 324662984 diff --git a/checkpoint-8000/training_args.bin b/checkpoint-8000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-8100/config.json b/checkpoint-8100/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8100/model.safetensors b/checkpoint-8100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bcb4302bdf96c6380b435572e38ac8ebfd440a6b --- /dev/null +++ b/checkpoint-8100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f87200d4f5fc6fb8aa1c477ed0af2eec66bd290fb088098b0a1bebeb3920ac0e +size 324662984 diff --git a/checkpoint-8100/training_args.bin b/checkpoint-8100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-8200/config.json b/checkpoint-8200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8200/model.safetensors b/checkpoint-8200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..58b8e76eec44381f2e4f763a2763d4ffa0287b09 --- /dev/null +++ b/checkpoint-8200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f669b5851234a7b9bae603d87d6f04dd55cd988ea452f77110146f4d5d9cb846 +size 324662984 diff --git a/checkpoint-8200/training_args.bin b/checkpoint-8200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 
6520 diff --git a/checkpoint-8300/config.json b/checkpoint-8300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8300/model.safetensors b/checkpoint-8300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a43f5382e4c6fc74e9a22511bbae8bbd9310885a --- /dev/null +++ b/checkpoint-8300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62f2b00b9ab27bbb9dca9f661c702869fef069621e979a4d3971e8c19061b8b0 +size 324662984 diff --git a/checkpoint-8300/training_args.bin b/checkpoint-8300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-8400/config.json b/checkpoint-8400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8400/model.safetensors b/checkpoint-8400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c145bab06decb11fe7cf23f323089607a70a7549 --- /dev/null +++ b/checkpoint-8400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54cc59179d546cd497868862c468637e3f6d52de755c3e5d7d7df1456283cb97 +size 324662984 diff --git a/checkpoint-8400/training_args.bin b/checkpoint-8400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8400/training_args.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-8500/config.json b/checkpoint-8500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8500/model.safetensors b/checkpoint-8500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..215adacb0963f6c85413d6a44d890412e2b62974 --- /dev/null +++ b/checkpoint-8500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca0c5726343c42606559214afda2201196ec99878cf2a3a01833e2fd780a8a2e +size 324662984 diff --git a/checkpoint-8500/training_args.bin b/checkpoint-8500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-8600/config.json b/checkpoint-8600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8600/model.safetensors b/checkpoint-8600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b45693ea7419253ad1013548ac56903522109335 --- /dev/null +++ b/checkpoint-8600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:414b4c14ac16d2fb5a88fc4e8973c3c88ea9e5ce08db335174b15eb15b04de1c +size 324662984 diff --git a/checkpoint-8600/training_args.bin b/checkpoint-8600/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-8700/config.json b/checkpoint-8700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8700/model.safetensors b/checkpoint-8700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d62fb959d23f016396b872c79bdf95fdc8d3bc40 --- /dev/null +++ b/checkpoint-8700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:563400599d449a325f839ca444577e391414ff0d13948fcb720e2ed05a513c13 +size 324662984 diff --git a/checkpoint-8700/training_args.bin b/checkpoint-8700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-8800/config.json b/checkpoint-8800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8800/model.safetensors b/checkpoint-8800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ed5dec2e71da0640063c9a406e912705c861a0ca --- /dev/null +++ b/checkpoint-8800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:998d027e222fa8fd3135f9a1b69c094f52c2b7814f3e82cfb0cdf2b02c1d7b3e +size 324662984 diff 
--git a/checkpoint-8800/training_args.bin b/checkpoint-8800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-8900/config.json b/checkpoint-8900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-8900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-8900/model.safetensors b/checkpoint-8900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf3e94343fd27595bad5cd08c02a001c56aa6732 --- /dev/null +++ b/checkpoint-8900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7391feb0a3adf74968b22c39d92e956dc064486397997699eec3d9061e5bb03e +size 324662984 diff --git a/checkpoint-8900/training_args.bin b/checkpoint-8900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-8900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-900/config.json b/checkpoint-900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-900/model.safetensors b/checkpoint-900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f148ac062efc4ea3081b9a6f3b7515774e6b1377 --- /dev/null +++ b/checkpoint-900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:ef4d020683dcf2ce71ffebf9c5c3b3414691acb36632c56bf2296cd12a9d9ad4 +size 324662984 diff --git a/checkpoint-900/training_args.bin b/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9000/config.json b/checkpoint-9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9000/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-9000/model.safetensors b/checkpoint-9000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..874ff22ae43aed88208e3e44a52441d4cf484851 --- /dev/null +++ b/checkpoint-9000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab6c744f4e9ce44ae52384a846204a735fc943b613ffcf69e27ca36ec7b66fa6 +size 324662984 diff --git a/checkpoint-9000/training_args.bin b/checkpoint-9000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9100/config.json b/checkpoint-9100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9100/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-9100/model.safetensors b/checkpoint-9100/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a3a40729a57b52c5b8bdd8a2ef7aa6130cf69b55 --- /dev/null 
+++ b/checkpoint-9100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf18dbc11161ba6c7df98d461791b21be3f22144a769d5c848b326bd2dbf1a2 +size 324662984 diff --git a/checkpoint-9100/training_args.bin b/checkpoint-9100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9200/config.json b/checkpoint-9200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9200/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-9200/model.safetensors b/checkpoint-9200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a0788cc31d8cd7f1b6f936229a088dda5b940a2 --- /dev/null +++ b/checkpoint-9200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4547a55dfbb3f3468f4517eccab0b0b8cef88db65c07d0cf889954bd4f83590 +size 324662984 diff --git a/checkpoint-9200/training_args.bin b/checkpoint-9200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9300/config.json b/checkpoint-9300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9300/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-9300/model.safetensors b/checkpoint-9300/model.safetensors new file mode 100644 
index 0000000000000000000000000000000000000000..8f954e98f44beed4483c075ec0ef51afcff54803 --- /dev/null +++ b/checkpoint-9300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5959fd7cf7b63b7c9f67209e3f77e307af740f90d140fa624c098f8bdad14f6a +size 324662984 diff --git a/checkpoint-9300/training_args.bin b/checkpoint-9300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9400/config.json b/checkpoint-9400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9400/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-9400/model.safetensors b/checkpoint-9400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3c1b69fb682ae1b267c881e6a7242051e80b0cb8 --- /dev/null +++ b/checkpoint-9400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4c9848f50ae65fced71a6789066bbee4d4e80a65b573caec0eafc247e55b4e2 +size 324662984 diff --git a/checkpoint-9400/training_args.bin b/checkpoint-9400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9500/config.json b/checkpoint-9500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9500/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} 
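
Every checkpoint directory in this diff ships the same GPTNeoXForCausalLM config (bfloat16, 12 layers, hidden size 768) plus an LFS-tracked model.safetensors, so any of them can be loaded directly with the transformers library. The sketch below is a minimal, hedged example: the Hub repo id `georgeyw/gpt-2-small-seed-5` and the use of `subfolder` to pick a checkpoint directory are assumptions for illustration (the diff itself does not name the repo), and the tokenizer is borrowed from the `_name_or_path` repo listed in config.json on the assumption that it hosts one.

```python
# Minimal sketch: load one of the checkpoint directories added by this diff.
# Assumptions: the repo id below and the presence of a tokenizer in the
# `_name_or_path` repo are not stated in the diff itself.
import torch
from transformers import AutoTokenizer, GPTNeoXForCausalLM

REPO_ID = "georgeyw/gpt-2-small-seed-5"   # assumed Hub repo for this diff
CHECKPOINT = "checkpoint-9500"            # any directory added above

# config.json declares GPTNeoXForCausalLM with torch_dtype bfloat16,
# so load the safetensors weights in that dtype.
model = GPTNeoXForCausalLM.from_pretrained(
    REPO_ID,
    subfolder=CHECKPOINT,
    torch_dtype=torch.bfloat16,
)

# The checkpoints contain no tokenizer files; reuse the one from the
# initialization repo named in "_name_or_path" (assumed to exist there).
tokenizer = AutoTokenizer.from_pretrained("georgeyw/gpt-2-small-init-seed-5")

inputs = tokenizer("The quick brown fox", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(out[0]))
```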
diff --git a/checkpoint-9500/model.safetensors b/checkpoint-9500/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d3c5f417e7b9230e7e2595b570401194f5c37c38 --- /dev/null +++ b/checkpoint-9500/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed796406d86ff951fc0e8b4d1e4b9d010ce034b5da4aa1f7ee34fc43770c7063 +size 324662984 diff --git a/checkpoint-9500/training_args.bin b/checkpoint-9500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9600/config.json b/checkpoint-9600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9600/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-9600/model.safetensors b/checkpoint-9600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..af55b1f9396498e3139b36bd308cd2c026628876 --- /dev/null +++ b/checkpoint-9600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9af152577574773c13846f03a5fc80107f8430b06ce2cabef0900c93dbd9e0cc +size 324662984 diff --git a/checkpoint-9600/training_args.bin b/checkpoint-9600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9700/config.json b/checkpoint-9700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9700/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + 
"transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-9700/model.safetensors b/checkpoint-9700/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..baee4679768caf23ea7806191c5ad1a4e326ff44 --- /dev/null +++ b/checkpoint-9700/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff9e74601efc680a966275f676b3dd5afc8d2a3c45bd5be3114cd8b666053175 +size 324662984 diff --git a/checkpoint-9700/training_args.bin b/checkpoint-9700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9800/config.json b/checkpoint-9800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9800/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-9800/model.safetensors b/checkpoint-9800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1df4e9565f07f52083e9ad76c5d723b05a59e455 --- /dev/null +++ b/checkpoint-9800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:634b4b48e676a45b6e072509627a2fdcbf623cb009cb59a82d5f07c171f36191 +size 324662984 diff --git a/checkpoint-9800/training_args.bin b/checkpoint-9800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520 diff --git a/checkpoint-9900/config.json b/checkpoint-9900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b0b4c67b30baa7b62a3a87fc086e8dd1f8916 --- /dev/null +++ b/checkpoint-9900/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "georgeyw/gpt-2-small-init-seed-5", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layer_norm_epsilon": 1e-05, + "max_position_embeddings": 1024, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rope_scaling": null, + 
"rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/checkpoint-9900/model.safetensors b/checkpoint-9900/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9523fea81493e7651ca8959cf84a300c2ca1f6b0 --- /dev/null +++ b/checkpoint-9900/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a745e278d424fa9d2ce3448e99b01f0ef232ecccb1a92e46d863fe73a69c9b4 +size 324662984 diff --git a/checkpoint-9900/training_args.bin b/checkpoint-9900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..669444c12379431082710f84260b241424546f4f --- /dev/null +++ b/checkpoint-9900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779 +size 6520