Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

config.json +41 -0
generation_config.json +5 -0
latest +1 -0
model-00001-of-00003.safetensors +3 -0
model-00002-of-00003.safetensors +3 -0
model-00003-of-00003.safetensors +3 -0
model.safetensors.index.json +203 -0
scheduler.pt +3 -0
special_tokens_map.json +29 -0
tokenizer.json +0 -0
tokenizer_config.json +133 -0
trainer_state.json +1777 -0
training_args.bin +3 -0
zero_to_fp32.py +604 -0

config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "tiiuae/falcon-7b-instruct",
+  "activation": "gelu",
+  "alibi": false,
+  "apply_residual_connection_post_layernorm": false,
+  "architectures": [
+    "FalconForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "tiiuae/falcon-7b-instruct--configuration_falcon.FalconConfig",
+    "AutoModel": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconModel",
+    "AutoModelForCausalLM": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForCausalLM",
+    "AutoModelForQuestionAnswering": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForQuestionAnswering",
+    "AutoModelForSequenceClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForSequenceClassification",
+    "AutoModelForTokenClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForTokenClassification"
+  },
+  "bias": false,
+  "bos_token_id": 11,
+  "eos_token_id": 11,
+  "ffn_hidden_size": 18176,
+  "hidden_dropout": 0.0,
+  "hidden_size": 4544,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "falcon",
+  "multi_query": true,
+  "new_decoder_architecture": false,
+  "num_attention_heads": 71,
+  "num_hidden_layers": 32,
+  "num_kv_heads": 71,
+  "num_ln_in_parallel_attn": null,
+  "parallel_attn": true,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.1",
+  "use_cache": true,
+  "vocab_size": 65024
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 11,
+  "transformers_version": "4.45.1"
+}

latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step1328

model-00001-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4fff6537416c1213e7273fe5a6aebbf7ba5ac3e7445dcd5ab9e015641e13f5b7
+size 4981285848

model-00002-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b939f7584268d36be31b8cba930ce8a1afad66e99c67e89bb0606638f0714901
+size 4969690568

model-00003-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f2a9ee7fe38000d9cde569b851fdee296a7aaae87dc721ee8aebadf4ed26ef2
+size 4483426544

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,203 @@

+{
+  "metadata": {
+    "total_size": 14434379520
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.0.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.0.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.0.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.0.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.0.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.10.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.10.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.10.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.10.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.10.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.11.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.2.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.2.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.2.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.2.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.2.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.20.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.20.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.20.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.20.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.20.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.22.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.22.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.22.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.22.self_attention.dense.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.22.self_attention.query_key_value.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.23.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.self_attention.dense.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.self_attention.query_key_value.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.self_attention.dense.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.self_attention.query_key_value.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.self_attention.dense.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.self_attention.query_key_value.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.self_attention.dense.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.self_attention.query_key_value.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.self_attention.dense.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.self_attention.query_key_value.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.self_attention.dense.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.self_attention.query_key_value.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.self_attention.dense.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.self_attention.query_key_value.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.3.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.3.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.3.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.3.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.3.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.30.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.30.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.30.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.30.self_attention.dense.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.30.self_attention.query_key_value.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.self_attention.dense.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.self_attention.query_key_value.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.4.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.4.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.4.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.4.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.4.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.self_attention.dense.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.self_attention.query_key_value.weight": "model-00001-of-00003.safetensors",
+    "transformer.ln_f.bias": "model-00003-of-00003.safetensors",
+    "transformer.ln_f.weight": "model-00003-of-00003.safetensors",
+    "transformer.word_embeddings.weight": "model-00001-of-00003.safetensors"
+  }
+}

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:588414cfa807e959eb02647e1087517e824e0866509759a9b4d8f6719eadea68
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "additional_special_tokens": [
+    ">>TITLE<<",
+    ">>ABSTRACT<<",
+    ">>INTRODUCTION<<",
+    ">>SUMMARY<<",
+    ">>COMMENT<<",
+    ">>ANSWER<<",
+    ">>QUESTION<<",
+    ">>DOMAIN<<",
+    ">>PREFIX<<",
+    ">>SUFFIX<<",
+    ">>MIDDLE<<"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,133 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": ">>TITLE<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": ">>ABSTRACT<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": ">>INTRODUCTION<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": ">>SUMMARY<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": ">>COMMENT<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": ">>ANSWER<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": ">>QUESTION<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": ">>DOMAIN<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": ">>PREFIX<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": ">>SUFFIX<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": ">>MIDDLE<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "65024": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    ">>TITLE<<",
+    ">>ABSTRACT<<",
+    ">>INTRODUCTION<<",
+    ">>SUMMARY<<",
+    ">>COMMENT<<",
+    ">>ANSWER<<",
+    ">>QUESTION<<",
+    ">>DOMAIN<<",
+    ">>PREFIX<<",
+    ">>SUFFIX<<",
+    ">>MIDDLE<<"
+  ],
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ system_message.strip() }}{% endif %}{% if message['role'] == 'user' %}{{ '\n\nUser: ' + message['content'].strip().replace('\r\n', '\n').replace('\n\n', '\n') }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: ' + message['content'].strip().replace('\r\n', '\n').replace('\n\n', '\n') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant:' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 2048,
+  "pad_token": "[PAD]",
+  "padding_side": "left",
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1777 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "episode": 21248,
+  "epoch": 0.29143977944504645,
+  "eval_steps": 200.0,
+  "global_step": 415,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "episode": 256,
+      "epoch": 0.003511322643916222,
+      "eps": 6,
+      "loss/policy_avg": -0.07090990990400314,
+      "loss/value_avg": 0.0,
+      "lr": 3e-06,
+      "objective/entropy": 49.42120361328125,
+      "objective/kl": 0.006465356796979904,
+      "objective/non_score_reward": -0.000646535714622587,
+      "objective/rlhf_reward": -1.1137903928756714,
+      "objective/scores": -1.109375,
+      "policy/approxkl_avg": 27.096786499023438,
+      "policy/clipfrac_avg": 0.732421875,
+      "policy/entropy_avg": 0.92181396484375,
+      "step": 5,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 12,
+      "val/ratio": 1.0399832725524902,
+      "val/ratio_var": 0.010045886039733887
+    },
+    {
+      "episode": 512,
+      "epoch": 0.007022645287832444,
+      "eps": 6,
+      "loss/policy_avg": -0.06497187167406082,
+      "loss/value_avg": 0.0,
+      "lr": 2.9923273657289e-06,
+      "objective/entropy": 48.286014556884766,
+      "objective/kl": 0.8119473457336426,
+      "objective/non_score_reward": -0.08119472861289978,
+      "objective/rlhf_reward": -1.266162633895874,
+      "objective/scores": -1.1875,
+      "policy/approxkl_avg": 18.666072845458984,
+      "policy/clipfrac_avg": 0.7314453125,
+      "policy/entropy_avg": 0.912261962890625,
+      "step": 10,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.020957112312317,
+      "val/ratio_var": 0.00411860179156065
+    },
+    {
+      "episode": 768,
+      "epoch": 0.010533967931748666,
+      "eps": 6,
+      "loss/policy_avg": -0.0872286781668663,
+      "loss/value_avg": 0.0,
+      "lr": 2.9846547314578008e-06,
+      "objective/entropy": 49.34376525878906,
+      "objective/kl": 1.9591996669769287,
+      "objective/non_score_reward": -0.1959199756383896,
+      "objective/rlhf_reward": -1.2858657836914062,
+      "objective/scores": -1.09375,
+      "policy/approxkl_avg": 20.772502899169922,
+      "policy/clipfrac_avg": 0.73828125,
+      "policy/entropy_avg": 0.927978515625,
+      "step": 15,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 12,
+      "val/ratio": 1.0191609859466553,
+      "val/ratio_var": 0.00307083735242486
+    },
+    {
+      "episode": 1024,
+      "epoch": 0.014045290575664887,
+      "eps": 6,
+      "loss/policy_avg": -0.07566041499376297,
+      "loss/value_avg": 0.0,
+      "lr": 2.9769820971867007e-06,
+      "objective/entropy": 53.13662338256836,
+      "objective/kl": 2.4811532497406006,
+      "objective/non_score_reward": -0.24811533093452454,
+      "objective/rlhf_reward": -1.2548893690109253,
+      "objective/scores": -1.0078125,
+      "policy/approxkl_avg": 20.665164947509766,
+      "policy/clipfrac_avg": 0.7314453125,
+      "policy/entropy_avg": 0.989776611328125,
+      "step": 20,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 11,
+      "val/ratio": 1.011010766029358,
+      "val/ratio_var": 0.004201602190732956
+    },
+    {
+      "episode": 1280,
+      "epoch": 0.01755661321958111,
+      "eps": 6,
+      "loss/policy_avg": -0.08593496680259705,
+      "loss/value_avg": 0.0,
+      "lr": 2.9693094629156014e-06,
+      "objective/entropy": 53.72633743286133,
+      "objective/kl": 3.3111624717712402,
+      "objective/non_score_reward": -0.3311161994934082,
+      "objective/rlhf_reward": -1.339456558227539,
+      "objective/scores": -1.0078125,
+      "policy/approxkl_avg": 25.559288024902344,
+      "policy/clipfrac_avg": 0.7353515625,
+      "policy/entropy_avg": 0.997894287109375,
+      "step": 25,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 13,
+      "val/ratio": 1.0134021043777466,
+      "val/ratio_var": 0.0019979747012257576
+    },
+    {
+      "episode": 1536,
+      "epoch": 0.021067935863497332,
+      "eps": 6,
+      "loss/policy_avg": -0.09734417498111725,
+      "loss/value_avg": 0.0,
+      "lr": 2.9616368286445014e-06,
+      "objective/entropy": 51.259735107421875,
+      "objective/kl": 5.089182376861572,
+      "objective/non_score_reward": -0.5089181661605835,
+      "objective/rlhf_reward": -1.2202520370483398,
+      "objective/scores": -0.7109375,
+      "policy/approxkl_avg": 29.841636657714844,
+      "policy/clipfrac_avg": 0.736328125,
+      "policy/entropy_avg": 0.960479736328125,
+      "step": 30,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 26,
+      "val/ratio": 1.0178756713867188,
+      "val/ratio_var": 0.009866585955023766
+    },
+    {
+      "episode": 1792,
+      "epoch": 0.024579258507413555,
+      "eps": 6,
+      "loss/policy_avg": -0.06831618398427963,
+      "loss/value_avg": 0.0,
+      "lr": 2.9539641943734013e-06,
+      "objective/entropy": 40.643272399902344,
+      "objective/kl": 6.974010944366455,
+      "objective/non_score_reward": -0.6974011063575745,
+      "objective/rlhf_reward": -1.2684605121612549,
+      "objective/scores": -0.5703125,
+      "policy/approxkl_avg": 35.33942413330078,
+      "policy/clipfrac_avg": 0.6982421875,
+      "policy/entropy_avg": 0.7505035400390625,
+      "step": 35,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.00449800491333,
+      "val/ratio_var": 0.0022142010275274515
+    },
+    {
+      "episode": 2048,
+      "epoch": 0.028090581151329775,
+      "eps": 6,
+      "loss/policy_avg": -0.04068079590797424,
+      "loss/value_avg": 0.0,
+      "lr": 2.946291560102302e-06,
+      "objective/entropy": 23.142562866210938,
+      "objective/kl": 8.180486679077148,
+      "objective/non_score_reward": -0.8180487155914307,
+      "objective/rlhf_reward": -1.0729957818984985,
+      "objective/scores": -0.255859375,
+      "policy/approxkl_avg": 23.68307876586914,
+      "policy/clipfrac_avg": 0.5859375,
+      "policy/entropy_avg": 0.4361400604248047,
+      "step": 40,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 8,
+      "val/ratio": 1.0077030658721924,
+      "val/ratio_var": 0.0024766812566667795
+    },
+    {
+      "episode": 2304,
+      "epoch": 0.031601903795246,
+      "eps": 6,
+      "loss/policy_avg": -0.07307010889053345,
+      "loss/value_avg": 0.0,
+      "lr": 2.938618925831202e-06,
+      "objective/entropy": 19.376842498779297,
+      "objective/kl": 8.770210266113281,
+      "objective/non_score_reward": -0.8770210146903992,
+      "objective/rlhf_reward": -1.0002652406692505,
+      "objective/scores": -0.12353515625,
+      "policy/approxkl_avg": 31.00873565673828,
+      "policy/clipfrac_avg": 0.5302734375,
+      "policy/entropy_avg": 0.33237457275390625,
+      "step": 45,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 20,
+      "val/ratio": 0.996111273765564,
+      "val/ratio_var": 0.001100091845728457
+    },
+    {
+      "episode": 2560,
+      "epoch": 0.03511322643916222,
+      "eps": 6,
+      "loss/policy_avg": -0.04584116116166115,
+      "loss/value_avg": 0.0,
+      "lr": 2.9309462915601027e-06,
+      "objective/entropy": 11.984097480773926,
+      "objective/kl": 8.4966402053833,
+      "objective/non_score_reward": -0.849664032459259,
+      "objective/rlhf_reward": -0.8017911911010742,
+      "objective/scores": 0.0478515625,
+      "policy/approxkl_avg": 22.561037063598633,
+      "policy/clipfrac_avg": 0.451171875,
+      "policy/entropy_avg": 0.19393539428710938,
+      "step": 50,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 20,
+      "val/ratio": 0.9952375888824463,
+      "val/ratio_var": 0.000761833623982966
+    },
+    {
+      "episode": 2816,
+      "epoch": 0.03862454908307844,
+      "eps": 5,
+      "loss/policy_avg": -0.029720915481448174,
+      "loss/value_avg": 0.0,
+      "lr": 2.9232736572890026e-06,
+      "objective/entropy": 4.9489898681640625,
+      "objective/kl": 8.733837127685547,
+      "objective/non_score_reward": -0.8733837604522705,
+      "objective/rlhf_reward": -0.7492713928222656,
+      "objective/scores": 0.1240234375,
+      "policy/approxkl_avg": 16.253189086914062,
+      "policy/clipfrac_avg": 0.341796875,
+      "policy/entropy_avg": 0.07728099822998047,
+      "step": 55,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 18,
+      "val/ratio": 0.9972053170204163,
+      "val/ratio_var": 0.00032430028659291565
+    },
+    {
+      "episode": 3072,
+      "epoch": 0.042135871726994664,
+      "eps": 5,
+      "loss/policy_avg": -0.01298562902957201,
+      "loss/value_avg": 0.0,
+      "lr": 2.9156010230179026e-06,
+      "objective/entropy": 1.3101667165756226,
+      "objective/kl": 8.699792861938477,
+      "objective/non_score_reward": -0.8699792623519897,
+      "objective/rlhf_reward": -0.5752952098846436,
+      "objective/scores": 0.294921875,
+      "policy/approxkl_avg": 2.27925968170166,
+      "policy/clipfrac_avg": 0.236328125,
+      "policy/entropy_avg": 0.02513742446899414,
+      "step": 60,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 20,
+      "val/ratio": 1.0017118453979492,
+      "val/ratio_var": 0.00016639505338389426
+    },
+    {
+      "episode": 3328,
+      "epoch": 0.04564719437091089,
+      "eps": 5,
+      "loss/policy_avg": -0.02618303708732128,
+      "loss/value_avg": 0.0,
+      "lr": 2.9079283887468033e-06,
+      "objective/entropy": 2.3685269355773926,
+      "objective/kl": 9.208517074584961,
+      "objective/non_score_reward": -0.9208516478538513,
+      "objective/rlhf_reward": -0.5182289481163025,
+      "objective/scores": 0.40234375,
+      "policy/approxkl_avg": 2.6189699172973633,
+      "policy/clipfrac_avg": 0.310546875,
+      "policy/entropy_avg": 0.04020071029663086,
+      "step": 65,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 20,
+      "val/ratio": 1.003983497619629,
+      "val/ratio_var": 0.0009448421187698841
+    },
+    {
+      "episode": 3584,
+      "epoch": 0.04915851701482711,
+      "eps": 5,
+      "loss/policy_avg": -0.02327096462249756,
+      "loss/value_avg": 0.0,
+      "lr": 2.9002557544757032e-06,
+      "objective/entropy": 2.0416018962860107,
+      "objective/kl": 9.701976776123047,
+      "objective/non_score_reward": -0.9701976776123047,
+      "objective/rlhf_reward": -0.49486449360847473,
+      "objective/scores": 0.474609375,
+      "policy/approxkl_avg": 1.271956443786621,
+      "policy/clipfrac_avg": 0.2734375,
+      "policy/entropy_avg": 0.041253089904785156,
+      "step": 70,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.0039558410644531,
+      "val/ratio_var": 0.00041477559716440737
+    },
+    {
+      "episode": 3840,
+      "epoch": 0.052669839658743334,
+      "eps": 5,
+      "loss/policy_avg": -0.033096276223659515,
+      "loss/value_avg": 0.0,
+      "lr": 2.892583120204604e-06,
+      "objective/entropy": 2.7795495986938477,
+      "objective/kl": 10.028523445129395,
+      "objective/non_score_reward": -1.0028523206710815,
+      "objective/rlhf_reward": -0.46555712819099426,
+      "objective/scores": 0.5390625,
+      "policy/approxkl_avg": 3.055203676223755,
+      "policy/clipfrac_avg": 0.3427734375,
+      "policy/entropy_avg": 0.053270816802978516,
+      "step": 75,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 23,
+      "val/ratio": 1.0012407302856445,
+      "val/ratio_var": 0.00011274257121840492
+    },
+    {
+      "episode": 4096,
+      "epoch": 0.05618116230265955,
+      "eps": 5,
+      "loss/policy_avg": -0.01961323618888855,
+      "loss/value_avg": 0.0,
+      "lr": 2.884910485933504e-06,
+      "objective/entropy": 2.5525641441345215,
+      "objective/kl": 10.111019134521484,
+      "objective/non_score_reward": -1.0111019611358643,
+      "objective/rlhf_reward": -0.510233461856842,
+      "objective/scores": 0.5,
+      "policy/approxkl_avg": 1.331697940826416,
+      "policy/clipfrac_avg": 0.2861328125,
+      "policy/entropy_avg": 0.048857688903808594,
+      "step": 80,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 25,
+      "val/ratio": 1.011049509048462,
+      "val/ratio_var": 0.004252108279615641
+    },
+    {
+      "episode": 4352,
+      "epoch": 0.05969248494657577,
+      "eps": 5,
+      "loss/policy_avg": -0.009127877652645111,
+      "loss/value_avg": 0.0,
+      "lr": 2.877237851662404e-06,
+      "objective/entropy": 3.016789674758911,
+      "objective/kl": 11.257818222045898,
+      "objective/non_score_reward": -1.125781774520874,
+      "objective/rlhf_reward": -0.4276960492134094,
+      "objective/scores": 0.69921875,
+      "policy/approxkl_avg": 1.4772686958312988,
+      "policy/clipfrac_avg": 0.35546875,
+      "policy/entropy_avg": 0.053719520568847656,
+      "step": 85,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 6,
+      "val/ratio": 1.0042904615402222,
+      "val/ratio_var": 0.0008556774700991809
+    },
+    {
+      "episode": 4608,
+      "epoch": 0.063203807590492,
+      "eps": 5,
+      "loss/policy_avg": -0.025049656629562378,
+      "loss/value_avg": 0.0,
+      "lr": 2.8695652173913046e-06,
+      "objective/entropy": 2.5907459259033203,
+      "objective/kl": 10.457273483276367,
+      "objective/non_score_reward": -1.0457274913787842,
+      "objective/rlhf_reward": -0.3816419839859009,
+      "objective/scores": 0.6640625,
+      "policy/approxkl_avg": 2.3460922241210938,
+      "policy/clipfrac_avg": 0.322265625,
+      "policy/entropy_avg": 0.04626178741455078,
+      "step": 90,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 11,
+      "val/ratio": 1.0003862380981445,
+      "val/ratio_var": 7.93520302977413e-05
+    },
+    {
+      "episode": 4864,
+      "epoch": 0.06671513023440821,
+      "eps": 5,
+      "loss/policy_avg": -0.01828361675143242,
+      "loss/value_avg": 0.0,
+      "lr": 2.8618925831202045e-06,
+      "objective/entropy": 2.397810220718384,
+      "objective/kl": 10.732559204101562,
+      "objective/non_score_reward": -1.073256015777588,
+      "objective/rlhf_reward": -0.35966813564300537,
+      "objective/scores": 0.71484375,
+      "policy/approxkl_avg": 1.1093428134918213,
+      "policy/clipfrac_avg": 0.32421875,
+      "policy/entropy_avg": 0.041881561279296875,
+      "step": 95,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 15,
+      "val/ratio": 1.0054664611816406,
+      "val/ratio_var": 0.0017973663052543998
+    },
+    {
+      "episode": 5120,
+      "epoch": 0.07022645287832444,
+      "eps": 5,
+      "loss/policy_avg": -0.04088423401117325,
+      "loss/value_avg": 0.0,
+      "lr": 2.8542199488491053e-06,
+      "objective/entropy": 2.343449592590332,
+      "objective/kl": 11.780994415283203,
+      "objective/non_score_reward": -1.1780993938446045,
+      "objective/rlhf_reward": -0.4628324806690216,
+      "objective/scores": 0.71484375,
+      "policy/approxkl_avg": 0.894420325756073,
+      "policy/clipfrac_avg": 0.46875,
+      "policy/entropy_avg": 0.04486083984375,
+      "step": 100,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 11,
+      "val/ratio": 1.0009559392929077,
+      "val/ratio_var": 4.804596756002866e-05
+    },
+    {
+      "episode": 5376,
+      "epoch": 0.07373777552224066,
+      "eps": 5,
+      "loss/policy_avg": -0.020697183907032013,
+      "loss/value_avg": 0.0,
+      "lr": 2.846547314578005e-06,
+      "objective/entropy": 1.9023351669311523,
+      "objective/kl": 10.29288101196289,
+      "objective/non_score_reward": -1.0292882919311523,
+      "objective/rlhf_reward": -0.29047834873199463,
+      "objective/scores": 0.73828125,
+      "policy/approxkl_avg": 0.9143690466880798,
+      "policy/clipfrac_avg": 0.373046875,
+      "policy/entropy_avg": 0.028568267822265625,
+      "step": 105,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 10,
+      "val/ratio": 1.000715732574463,
+      "val/ratio_var": 4.201457340968773e-05
+    },
+    {
+      "episode": 5632,
+      "epoch": 0.07724909816615688,
+      "eps": 5,
+      "loss/policy_avg": -0.012633640319108963,
+      "loss/value_avg": 0.0,
+      "lr": 2.8388746803069055e-06,
+      "objective/entropy": 1.3839142322540283,
+      "objective/kl": 10.57151985168457,
+      "objective/non_score_reward": -1.0571520328521729,
+      "objective/rlhf_reward": -0.2935946583747864,
+      "objective/scores": 0.765625,
+      "policy/approxkl_avg": 0.6525547504425049,
+      "policy/clipfrac_avg": 0.2646484375,
+      "policy/entropy_avg": 0.0345916748046875,
+      "step": 110,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 10,
+      "val/ratio": 0.9999199509620667,
+      "val/ratio_var": 2.6978697860613465e-05
+    },
+    {
+      "episode": 5888,
+      "epoch": 0.0807604208100731,
+      "eps": 5,
+      "loss/policy_avg": -0.026668714359402657,
+      "loss/value_avg": 0.0,
+      "lr": 2.831202046035806e-06,
+      "objective/entropy": 2.17741322517395,
+      "objective/kl": 11.39688491821289,
+      "objective/non_score_reward": -1.139688491821289,
+      "objective/rlhf_reward": -0.3027456998825073,
+      "objective/scores": 0.8359375,
+      "policy/approxkl_avg": 8.829752922058105,
+      "policy/clipfrac_avg": 0.35546875,
+      "policy/entropy_avg": 0.034277915954589844,
+      "step": 115,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 8,
+      "val/ratio": 1.0012441873550415,
+      "val/ratio_var": 9.009366476675496e-05
+    },
+    {
+      "episode": 6144,
+      "epoch": 0.08427174345398933,
+      "eps": 5,
+      "loss/policy_avg": -0.011602860875427723,
+      "loss/value_avg": 0.0,
+      "lr": 2.823529411764706e-06,
+      "objective/entropy": 1.418602466583252,
+      "objective/kl": 10.246469497680664,
+      "objective/non_score_reward": -1.0246469974517822,
+      "objective/rlhf_reward": -0.22599510848522186,
+      "objective/scores": 0.796875,
+      "policy/approxkl_avg": 0.31790149211883545,
+      "policy/clipfrac_avg": 0.2314453125,
+      "policy/entropy_avg": 0.028847694396972656,
+      "step": 120,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 9,
+      "val/ratio": 1.0009679794311523,
+      "val/ratio_var": 3.900106457876973e-05
+    },
+    {
+      "episode": 6400,
+      "epoch": 0.08778306609790555,
+      "eps": 5,
+      "loss/policy_avg": -0.0157505851238966,
+      "loss/value_avg": 0.0,
+      "lr": 2.8158567774936066e-06,
+      "objective/entropy": 1.936393141746521,
+      "objective/kl": 10.550077438354492,
+      "objective/non_score_reward": -1.0550076961517334,
+      "objective/rlhf_reward": -0.252943217754364,
+      "objective/scores": 0.80078125,
+      "policy/approxkl_avg": 6.545133113861084,
+      "policy/clipfrac_avg": 0.341796875,
+      "policy/entropy_avg": 0.039971351623535156,
+      "step": 125,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 12,
+      "val/ratio": 1.0001187324523926,
+      "val/ratio_var": 0.00011527155584190041
+    },
+    {
+      "episode": 6656,
+      "epoch": 0.09129438874182177,
+      "eps": 5,
+      "loss/policy_avg": -0.00908716581761837,
+      "loss/value_avg": 0.0,
+      "lr": 2.8081841432225065e-06,
+      "objective/entropy": 1.9167767763137817,
+      "objective/kl": 10.831771850585938,
+      "objective/non_score_reward": -1.0831772089004517,
+      "objective/rlhf_reward": -0.24270595610141754,
+      "objective/scores": 0.83984375,
+      "policy/approxkl_avg": 13.507976531982422,
+      "policy/clipfrac_avg": 0.25,
+      "policy/entropy_avg": 0.034499168395996094,
+      "step": 130,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 7,
+      "val/ratio": 1.0004911422729492,
+      "val/ratio_var": 0.00018595268193166703
+    },
+    {
+      "episode": 6912,
+      "epoch": 0.094805711385738,
+      "eps": 5,
+      "loss/policy_avg": -0.017197387292981148,
+      "loss/value_avg": 0.0,
+      "lr": 2.800511508951407e-06,
+      "objective/entropy": 1.7237651348114014,
+      "objective/kl": 11.095592498779297,
+      "objective/non_score_reward": -1.1095592975616455,
+      "objective/rlhf_reward": -0.21057555079460144,
+      "objective/scores": 0.8984375,
+      "policy/approxkl_avg": 2.7560040950775146,
+      "policy/clipfrac_avg": 0.2841796875,
+      "policy/entropy_avg": 0.032952308654785156,
+      "step": 135,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 2,
+      "val/ratio": 0.9994020462036133,
+      "val/ratio_var": 3.074964843108319e-05
+    },
+    {
+      "episode": 7168,
+      "epoch": 0.09831703402965422,
+      "eps": 5,
+      "loss/policy_avg": -0.012010859325528145,
+      "loss/value_avg": 0.0,
+      "lr": 2.792838874680307e-06,
+      "objective/entropy": 1.5862581729888916,
+      "objective/kl": 10.674396514892578,
+      "objective/non_score_reward": -1.0674396753311157,
+      "objective/rlhf_reward": -0.14433012902736664,
+      "objective/scores": 0.921875,
+      "policy/approxkl_avg": 1.1186727285385132,
+      "policy/clipfrac_avg": 0.2783203125,
+      "policy/entropy_avg": 0.0295562744140625,
+      "step": 140,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 13,
+      "val/ratio": 1.0007727146148682,
+      "val/ratio_var": 4.557183274300769e-05
+    },
+    {
+      "episode": 7424,
+      "epoch": 0.10182835667357044,
+      "eps": 5,
+      "loss/policy_avg": -0.013728385791182518,
+      "loss/value_avg": 0.0,
+      "lr": 2.785166240409207e-06,
+      "objective/entropy": 1.5388869047164917,
+      "objective/kl": 10.359582901000977,
+      "objective/non_score_reward": -1.035958170890808,
+      "objective/rlhf_reward": -0.14511710405349731,
+      "objective/scores": 0.890625,
+      "policy/approxkl_avg": 0.5204602479934692,
+      "policy/clipfrac_avg": 0.283203125,
+      "policy/entropy_avg": 0.028924942016601562,
+      "step": 145,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 14,
+      "val/ratio": 1.056097149848938,
+      "val/ratio_var": 0.13372056186199188
+    },
+    {
+      "episode": 7680,
+      "epoch": 0.10533967931748667,
+      "eps": 5,
+      "loss/policy_avg": -0.014945434406399727,
+      "loss/value_avg": 0.0,
+      "lr": 2.7774936061381074e-06,
+      "objective/entropy": 2.0769755840301514,
+      "objective/kl": 11.147063255310059,
+      "objective/non_score_reward": -1.11470627784729,
+      "objective/rlhf_reward": -0.08940108120441437,
+      "objective/scores": 1.0234375,
+      "policy/approxkl_avg": 0.5961493253707886,
+      "policy/clipfrac_avg": 0.3681640625,
+      "policy/entropy_avg": 0.037804603576660156,
+      "step": 150,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 13,
+      "val/ratio": 1.0033739805221558,
+      "val/ratio_var": 0.00030022990540601313
+    },
+    {
+      "episode": 7936,
+      "epoch": 0.10885100196140288,
+      "eps": 5,
+      "loss/policy_avg": -0.02276831492781639,
+      "loss/value_avg": 0.0,
+      "lr": 2.7698209718670078e-06,
+      "objective/entropy": 2.1412830352783203,
+      "objective/kl": 11.697949409484863,
+      "objective/non_score_reward": -1.169795036315918,
+      "objective/rlhf_reward": -0.13582009077072144,
+      "objective/scores": 1.03125,
+      "policy/approxkl_avg": 0.7155288457870483,
+      "policy/clipfrac_avg": 0.3193359375,
+      "policy/entropy_avg": 0.037835121154785156,
+      "step": 155,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 13,
+      "val/ratio": 1.0014090538024902,
+      "val/ratio_var": 5.2470270020421594e-05
+    },
+    {
+      "episode": 8192,
+      "epoch": 0.1123623246053191,
+      "eps": 5,
+      "loss/policy_avg": -0.013076605275273323,
+      "loss/value_avg": 0.0,
+      "lr": 2.762148337595908e-06,
+      "objective/entropy": 1.634714126586914,
+      "objective/kl": 11.629154205322266,
+      "objective/non_score_reward": -1.1629154682159424,
+      "objective/rlhf_reward": -0.28488799929618835,
+      "objective/scores": 0.87890625,
+      "policy/approxkl_avg": 0.4181188941001892,
+      "policy/clipfrac_avg": 0.3037109375,
+      "policy/entropy_avg": 0.029273509979248047,
+      "step": 160,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 15,
+      "val/ratio": 1.0008339881896973,
+      "val/ratio_var": 1.4662801731901709e-05
+    },
+    {
+      "episode": 8448,
+      "epoch": 0.11587364724923532,
+      "eps": 5,
+      "loss/policy_avg": -0.01651182770729065,
+      "loss/value_avg": 0.0,
+      "lr": 2.7544757033248085e-06,
+      "objective/entropy": 1.9540742635726929,
+      "objective/kl": 11.4830322265625,
+      "objective/non_score_reward": -1.1483032703399658,
+      "objective/rlhf_reward": -0.05983233451843262,
+      "objective/scores": 1.0859375,
+      "policy/approxkl_avg": 18.791297912597656,
+      "policy/clipfrac_avg": 0.2880859375,
+      "policy/entropy_avg": 0.03601264953613281,
+      "step": 165,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.0220942497253418,
+      "val/ratio_var": 0.02208283357322216
+    },
+    {
+      "episode": 8704,
+      "epoch": 0.11938496989315155,
+      "eps": 5,
+      "loss/policy_avg": -0.013821810483932495,
+      "loss/value_avg": 0.0,
+      "lr": 2.7468030690537084e-06,
+      "objective/entropy": 1.6243339776992798,
+      "objective/kl": 11.435280799865723,
+      "objective/non_score_reward": -1.1435281038284302,
+      "objective/rlhf_reward": -0.12443088740110397,
+      "objective/scores": 1.015625,
+      "policy/approxkl_avg": 0.29013216495513916,
+      "policy/clipfrac_avg": 0.28125,
+      "policy/entropy_avg": 0.03498649597167969,
+      "step": 170,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 15,
+      "val/ratio": 1.0027971267700195,
+      "val/ratio_var": 0.0002298366161994636
+    },
+    {
+      "episode": 8960,
+      "epoch": 0.12289629253706777,
+      "eps": 5,
+      "loss/policy_avg": -0.011003649793565273,
+      "loss/value_avg": 0.0,
+      "lr": 2.7391304347826087e-06,
+      "objective/entropy": 2.000375986099243,
+      "objective/kl": 11.78514575958252,
+      "objective/non_score_reward": -1.1785145998001099,
+      "objective/rlhf_reward": -0.2609584331512451,
+      "objective/scores": 0.91796875,
+      "policy/approxkl_avg": 0.8603074550628662,
+      "policy/clipfrac_avg": 0.2998046875,
+      "policy/entropy_avg": 0.034775733947753906,
+      "step": 175,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 19,
+      "val/ratio": 1.0012288093566895,
+      "val/ratio_var": 3.532394111971371e-05
+    },
+    {
+      "episode": 9216,
+      "epoch": 0.126407615180984,
+      "eps": 5,
+      "loss/policy_avg": -0.010885423980653286,
+      "loss/value_avg": 0.0,
+      "lr": 2.731457800511509e-06,
+      "objective/entropy": 1.5240473747253418,
+      "objective/kl": 12.420597076416016,
+      "objective/non_score_reward": -1.2420598268508911,
+      "objective/rlhf_reward": -0.16641265153884888,
+      "objective/scores": 1.078125,
+      "policy/approxkl_avg": 0.46217110753059387,
+      "policy/clipfrac_avg": 0.2783203125,
+      "policy/entropy_avg": 0.029424667358398438,
+      "step": 180,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 20,
+      "val/ratio": 1.0007582902908325,
+      "val/ratio_var": 2.4759892767178826e-05
+    },
+    {
+      "episode": 9472,
+      "epoch": 0.12991893782490022,
+      "eps": 5,
+      "loss/policy_avg": -0.01097183395177126,
+      "loss/value_avg": 0.0,
+      "lr": 2.7237851662404094e-06,
+      "objective/entropy": 1.6292238235473633,
+      "objective/kl": 12.73173713684082,
+      "objective/non_score_reward": -1.2731736898422241,
+      "objective/rlhf_reward": -0.10916168242692947,
+      "objective/scores": 1.1640625,
+      "policy/approxkl_avg": 0.5525862574577332,
+      "policy/clipfrac_avg": 0.310546875,
+      "policy/entropy_avg": 0.031815528869628906,
+      "step": 185,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 22,
+      "val/ratio": 1.0027148723602295,
+      "val/ratio_var": 0.00016600274830125272
+    },
+    {
+      "episode": 9728,
+      "epoch": 0.13343026046881643,
+      "eps": 5,
+      "loss/policy_avg": -0.010572239756584167,
+      "loss/value_avg": 0.0,
+      "lr": 2.7161125319693097e-06,
+      "objective/entropy": 2.028618335723877,
+      "objective/kl": 12.439943313598633,
+      "objective/non_score_reward": -1.2439942359924316,
+      "objective/rlhf_reward": -0.06748821586370468,
+      "objective/scores": 1.171875,
+      "policy/approxkl_avg": 0.4930054843425751,
+      "policy/clipfrac_avg": 0.2841796875,
+      "policy/entropy_avg": 0.03688812255859375,
+      "step": 190,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 22,
+      "val/ratio": 1.001340627670288,
+      "val/ratio_var": 4.4035481550963596e-05
+    },
+    {
+      "episode": 9984,
+      "epoch": 0.13694158311273266,
+      "eps": 5,
+      "loss/policy_avg": -0.019254155457019806,
+      "loss/value_avg": 0.0,
+      "lr": 2.7084398976982097e-06,
+      "objective/entropy": 2.295351266860962,
+      "objective/kl": 13.32223892211914,
+      "objective/non_score_reward": -1.332223892211914,
+      "objective/rlhf_reward": -0.1836824268102646,
+      "objective/scores": 1.1484375,
+      "policy/approxkl_avg": 3.1426281929016113,
+      "policy/clipfrac_avg": 0.3251953125,
+      "policy/entropy_avg": 0.03939247131347656,
+      "step": 195,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 17,
+      "val/ratio": 1.0032271146774292,
+      "val/ratio_var": 0.00019827872165478766
+    },
+    {
+      "episode": 10240,
+      "epoch": 0.14045290575664887,
+      "eps": 5,
+      "loss/policy_avg": -0.018122296780347824,
+      "loss/value_avg": 0.0,
+      "lr": 2.70076726342711e-06,
+      "objective/entropy": 2.345075845718384,
+      "objective/kl": 12.536066055297852,
+      "objective/non_score_reward": -1.2536065578460693,
+      "objective/rlhf_reward": -0.056986674666404724,
+      "objective/scores": 1.1953125,
+      "policy/approxkl_avg": 27.5201473236084,
+      "policy/clipfrac_avg": 0.3046875,
+      "policy/entropy_avg": 0.04156017303466797,
+      "step": 200,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 20,
+      "val/ratio": 0.9993807077407837,
+      "val/ratio_var": 0.00011275127326371148
+    },
+    {
+      "episode": 10496,
+      "epoch": 0.1439642284005651,
+      "eps": 5,
+      "loss/policy_avg": -0.019295353442430496,
+      "loss/value_avg": 0.0,
+      "lr": 2.6930946291560103e-06,
+      "objective/entropy": 2.091012477874756,
+      "objective/kl": 12.746508598327637,
+      "objective/non_score_reward": -1.2746508121490479,
+      "objective/rlhf_reward": -0.09065462648868561,
+      "objective/scores": 1.1875,
+      "policy/approxkl_avg": 0.5554059743881226,
+      "policy/clipfrac_avg": 0.2998046875,
+      "policy/entropy_avg": 0.03620719909667969,
+      "step": 205,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 17,
+      "val/ratio": 1.001387119293213,
+      "val/ratio_var": 3.4958156902575865e-05
+    },
+    {
+      "episode": 10752,
+      "epoch": 0.14747555104448132,
+      "eps": 5,
+      "loss/policy_avg": -0.010203800164163113,
+      "loss/value_avg": 0.0,
+      "lr": 2.6854219948849107e-06,
+      "objective/entropy": 2.1808600425720215,
+      "objective/kl": 12.404802322387695,
+      "objective/non_score_reward": -1.2404803037643433,
+      "objective/rlhf_reward": -0.059675075113773346,
+      "objective/scores": 1.1796875,
+      "policy/approxkl_avg": 0.5876989364624023,
+      "policy/clipfrac_avg": 0.27734375,
+      "policy/entropy_avg": 0.041385650634765625,
+      "step": 210,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 14,
+      "val/ratio": 1.0035128593444824,
+      "val/ratio_var": 0.0004931605653837323
+    },
+    {
+      "episode": 11008,
+      "epoch": 0.15098687368839755,
+      "eps": 5,
+      "loss/policy_avg": -0.018955286592245102,
+      "loss/value_avg": 0.0,
+      "lr": 2.677749360613811e-06,
+      "objective/entropy": 1.968322992324829,
+      "objective/kl": 13.322561264038086,
+      "objective/non_score_reward": -1.3322560787200928,
+      "objective/rlhf_reward": -0.0670965313911438,
+      "objective/scores": 1.265625,
+      "policy/approxkl_avg": 0.39782679080963135,
+      "policy/clipfrac_avg": 0.373046875,
+      "policy/entropy_avg": 0.03279399871826172,
+      "step": 215,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 14,
+      "val/ratio": 1.0017969608306885,
+      "val/ratio_var": 6.843745359219611e-05
+    },
+    {
+      "episode": 11264,
+      "epoch": 0.15449819633231376,
+      "eps": 5,
+      "loss/policy_avg": -0.014947709627449512,
+      "loss/value_avg": 0.0,
+      "lr": 2.670076726342711e-06,
+      "objective/entropy": 1.7985560894012451,
+      "objective/kl": 12.856376647949219,
+      "objective/non_score_reward": -1.285637617111206,
+      "objective/rlhf_reward": -0.03251491114497185,
+      "objective/scores": 1.25,
+      "policy/approxkl_avg": 0.4516296982765198,
+      "policy/clipfrac_avg": 0.3671875,
+      "policy/entropy_avg": 0.031859397888183594,
+      "step": 220,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 13,
+      "val/ratio": 1.0009992122650146,
+      "val/ratio_var": 4.1194460209226236e-05
+    },
+    {
+      "episode": 11520,
+      "epoch": 0.15800951897623,
+      "eps": 5,
+      "loss/policy_avg": -0.019819077104330063,
+      "loss/value_avg": 0.0,
+      "lr": 2.6624040920716113e-06,
+      "objective/entropy": 1.5284242630004883,
+      "objective/kl": 14.283391952514648,
+      "objective/non_score_reward": -1.4283392429351807,
+      "objective/rlhf_reward": -0.014965277165174484,
+      "objective/scores": 1.4140625,
+      "policy/approxkl_avg": 1.5518393516540527,
+      "policy/clipfrac_avg": 0.2744140625,
+      "policy/entropy_avg": 0.026048660278320312,
+      "step": 225,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 8,
+      "val/ratio": 1.0007925033569336,
+      "val/ratio_var": 3.721785105881281e-05
+    },
+    {
+      "episode": 11776,
+      "epoch": 0.1615208416201462,
+      "eps": 5,
+      "loss/policy_avg": -0.015632648020982742,
+      "loss/value_avg": 0.0,
+      "lr": 2.6547314578005116e-06,
+      "objective/entropy": 1.5101430416107178,
+      "objective/kl": 13.435927391052246,
+      "objective/non_score_reward": -1.3435927629470825,
+      "objective/rlhf_reward": 0.017792798578739166,
+      "objective/scores": 1.359375,
+      "policy/approxkl_avg": 0.22922199964523315,
+      "policy/clipfrac_avg": 0.271484375,
+      "policy/entropy_avg": 0.025536060333251953,
+      "step": 230,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 11,
+      "val/ratio": 1.0355898141860962,
+      "val/ratio_var": 0.09009659290313721
+    },
+    {
+      "episode": 12032,
+      "epoch": 0.16503216426406245,
+      "eps": 5,
+      "loss/policy_avg": -0.014460040256381035,
+      "loss/value_avg": 0.0,
+      "lr": 2.647058823529412e-06,
+      "objective/entropy": 1.412046194076538,
+      "objective/kl": 13.981653213500977,
+      "objective/non_score_reward": -1.3981653451919556,
+      "objective/rlhf_reward": -0.19434592127799988,
+      "objective/scores": 1.203125,
+      "policy/approxkl_avg": 0.48358476161956787,
+      "policy/clipfrac_avg": 0.287109375,
+      "policy/entropy_avg": 0.027116775512695312,
+      "step": 235,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 19,
+      "val/ratio": 1.0002431869506836,
+      "val/ratio_var": 1.317531587119447e-05
+    },
+    {
+      "episode": 12288,
+      "epoch": 0.16854348690797866,
+      "eps": 5,
+      "loss/policy_avg": -0.0144148338586092,
+      "loss/value_avg": 0.0,
+      "lr": 2.6393861892583123e-06,
+      "objective/entropy": 1.5728825330734253,
+      "objective/kl": 13.091099739074707,
+      "objective/non_score_reward": -1.3091099262237549,
+      "objective/rlhf_reward": -0.12438549101352692,
+      "objective/scores": 1.1875,
+      "policy/approxkl_avg": 0.5084937810897827,
+      "policy/clipfrac_avg": 0.2587890625,
+      "policy/entropy_avg": 0.028881072998046875,
+      "step": 240,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 14,
+      "val/ratio": 1.004534125328064,
+      "val/ratio_var": 0.001037073670886457
+    },
+    {
+      "episode": 12544,
+      "epoch": 0.17205480955189487,
+      "eps": 5,
+      "loss/policy_avg": -0.02535724639892578,
+      "loss/value_avg": 0.0,
+      "lr": 2.6317135549872122e-06,
+      "objective/entropy": 1.6895666122436523,
+      "objective/kl": 13.036446571350098,
+      "objective/non_score_reward": -1.3036446571350098,
+      "objective/rlhf_reward": -0.08131173253059387,
+      "objective/scores": 1.21875,
+      "policy/approxkl_avg": 1.397173285484314,
+      "policy/clipfrac_avg": 0.296875,
+      "policy/entropy_avg": 0.025877952575683594,
+      "step": 245,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 11,
+      "val/ratio": 0.9993641972541809,
+      "val/ratio_var": 3.554957584128715e-05
+    },
+    {
+      "episode": 12800,
+      "epoch": 0.1755661321958111,
+      "eps": 5,
+      "loss/policy_avg": -0.013989413157105446,
+      "loss/value_avg": 0.0,
+      "lr": 2.6240409207161126e-06,
+      "objective/entropy": 1.4321318864822388,
+      "objective/kl": 13.751260757446289,
+      "objective/non_score_reward": -1.3751261234283447,
+      "objective/rlhf_reward": 0.024946460500359535,
+      "objective/scores": 1.3984375,
+      "policy/approxkl_avg": 0.3265579044818878,
+      "policy/clipfrac_avg": 0.3095703125,
+      "policy/entropy_avg": 0.02507495880126953,
+      "step": 250,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 14,
+      "val/ratio": 1.0016669034957886,
+      "val/ratio_var": 3.951354665332474e-05
+    },
+    {
+      "episode": 13056,
+      "epoch": 0.1790774548397273,
+      "eps": 5,
+      "loss/policy_avg": -0.01614242233335972,
+      "loss/value_avg": 0.0,
+      "lr": 2.616368286445013e-06,
+      "objective/entropy": 1.2477443218231201,
+      "objective/kl": 14.385757446289062,
+      "objective/non_score_reward": -1.4385757446289062,
+      "objective/rlhf_reward": -0.048571567982435226,
+      "objective/scores": 1.390625,
+      "policy/approxkl_avg": 0.38643181324005127,
+      "policy/clipfrac_avg": 0.33203125,
+      "policy/entropy_avg": 0.02417755126953125,
+      "step": 255,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 19,
+      "val/ratio": 1.0020815134048462,
+      "val/ratio_var": 0.00012469623470678926
+    },
+    {
+      "episode": 13312,
+      "epoch": 0.18258877748364355,
+      "eps": 5,
+      "loss/policy_avg": -0.013632966205477715,
+      "loss/value_avg": 0.0,
+      "lr": 2.6086956521739132e-06,
+      "objective/entropy": 1.5228471755981445,
+      "objective/kl": 14.862211227416992,
+      "objective/non_score_reward": -1.486221194267273,
+      "objective/rlhf_reward": -0.07545565813779831,
+      "objective/scores": 1.40625,
+      "policy/approxkl_avg": 2.011383056640625,
+      "policy/clipfrac_avg": 0.3359375,
+      "policy/entropy_avg": 0.027433395385742188,
+      "step": 260,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 17,
+      "val/ratio": 0.9998003244400024,
+      "val/ratio_var": 2.738163857429754e-05
+    },
+    {
+      "episode": 13568,
+      "epoch": 0.18610010012755976,
+      "eps": 5,
+      "loss/policy_avg": -0.020372817292809486,
+      "loss/value_avg": 0.0,
+      "lr": 2.6010230179028136e-06,
+      "objective/entropy": 1.633180856704712,
+      "objective/kl": 14.094629287719727,
+      "objective/non_score_reward": -1.4094629287719727,
+      "objective/rlhf_reward": -0.015713702887296677,
+      "objective/scores": 1.390625,
+      "policy/approxkl_avg": 0.47778478264808655,
+      "policy/clipfrac_avg": 0.3701171875,
+      "policy/entropy_avg": 0.02643442153930664,
+      "step": 265,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 14,
+      "val/ratio": 1.0079278945922852,
+      "val/ratio_var": 0.0023215855471789837
+    },
+    {
+      "episode": 13824,
+      "epoch": 0.189611422771476,
+      "eps": 5,
+      "loss/policy_avg": -0.01413625106215477,
+      "loss/value_avg": 0.0,
+      "lr": 2.5933503836317135e-06,
+      "objective/entropy": 1.2899070978164673,
+      "objective/kl": 14.59975528717041,
+      "objective/non_score_reward": -1.4599756002426147,
+      "objective/rlhf_reward": -0.09675531834363937,
+      "objective/scores": 1.359375,
+      "policy/approxkl_avg": 0.4568091630935669,
+      "policy/clipfrac_avg": 0.3076171875,
+      "policy/entropy_avg": 0.02667713165283203,
+      "step": 270,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 19,
+      "val/ratio": 1.003631830215454,
+      "val/ratio_var": 0.00041694415267556906
+    },
+    {
+      "episode": 14080,
+      "epoch": 0.1931227454153922,
+      "eps": 5,
+      "loss/policy_avg": -0.018304049968719482,
+      "loss/value_avg": 0.0,
+      "lr": 2.585677749360614e-06,
+      "objective/entropy": 1.3464603424072266,
+      "objective/kl": 14.915502548217773,
+      "objective/non_score_reward": -1.491550326347351,
+      "objective/rlhf_reward": -0.013601185753941536,
+      "objective/scores": 1.4765625,
+      "policy/approxkl_avg": 0.5154660940170288,
+      "policy/clipfrac_avg": 0.3251953125,
+      "policy/entropy_avg": 0.024927139282226562,
+      "step": 275,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 20,
+      "val/ratio": 1.0014865398406982,
+      "val/ratio_var": 8.263294148491696e-05
+    },
+    {
+      "episode": 14336,
+      "epoch": 0.19663406805930844,
+      "eps": 5,
+      "loss/policy_avg": -0.009162629023194313,
+      "loss/value_avg": 0.0,
+      "lr": 2.578005115089514e-06,
+      "objective/entropy": 1.3251242637634277,
+      "objective/kl": 14.600137710571289,
+      "objective/non_score_reward": -1.460013747215271,
+      "objective/rlhf_reward": -0.10571230947971344,
+      "objective/scores": 1.3515625,
+      "policy/approxkl_avg": 0.4187917411327362,
+      "policy/clipfrac_avg": 0.3046875,
+      "policy/entropy_avg": 0.023657798767089844,
+      "step": 280,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.008554458618164,
+      "val/ratio_var": 0.001467635971494019
+    },
+    {
+      "episode": 14592,
+      "epoch": 0.20014539070322465,
+      "eps": 5,
+      "loss/policy_avg": -0.01609072834253311,
+      "loss/value_avg": 0.0,
+      "lr": 2.5703324808184145e-06,
+      "objective/entropy": 1.3078004121780396,
+      "objective/kl": 14.999523162841797,
+      "objective/non_score_reward": -1.4999523162841797,
+      "objective/rlhf_reward": -0.15238332748413086,
+      "objective/scores": 1.3515625,
+      "policy/approxkl_avg": 0.3968128561973572,
+      "policy/clipfrac_avg": 0.36328125,
+      "policy/entropy_avg": 0.023943424224853516,
+      "step": 285,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 12,
+      "val/ratio": 1.0019521713256836,
+      "val/ratio_var": 8.228437945945188e-05
+    },
+    {
+      "episode": 14848,
+      "epoch": 0.2036567133471409,
+      "eps": 5,
+      "loss/policy_avg": -0.014186807908117771,
+      "loss/value_avg": 0.0,
+      "lr": 2.562659846547315e-06,
+      "objective/entropy": 1.2583755254745483,
+      "objective/kl": 15.623100280761719,
+      "objective/non_score_reward": -1.5623100996017456,
+      "objective/rlhf_reward": -0.09625323116779327,
+      "objective/scores": 1.46875,
+      "policy/approxkl_avg": 0.5678977370262146,
+      "policy/clipfrac_avg": 0.3076171875,
+      "policy/entropy_avg": 0.024990558624267578,
+      "step": 290,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 13,
+      "val/ratio": 1.0014848709106445,
+      "val/ratio_var": 4.219371476210654e-05
+    },
+    {
+      "episode": 15104,
+      "epoch": 0.2071680359910571,
+      "eps": 5,
+      "loss/policy_avg": -0.013804701156914234,
+      "loss/value_avg": 0.0,
+      "lr": 2.5549872122762148e-06,
+      "objective/entropy": 1.568720817565918,
+      "objective/kl": 14.687668800354004,
+      "objective/non_score_reward": -1.4687669277191162,
+      "objective/rlhf_reward": -0.17009752988815308,
+      "objective/scores": 1.296875,
+      "policy/approxkl_avg": 0.3046334981918335,
+      "policy/clipfrac_avg": 0.26953125,
+      "policy/entropy_avg": 0.027862548828125,
+      "step": 295,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.0005717277526855,
+      "val/ratio_var": 1.1324932529532816e-05
+    },
+    {
+      "episode": 15360,
+      "epoch": 0.21067935863497333,
+      "eps": 5,
+      "loss/policy_avg": -0.018133502453565598,
+      "loss/value_avg": 0.0,
+      "lr": 2.547314578005115e-06,
+      "objective/entropy": 1.2987349033355713,
+      "objective/kl": 13.89183235168457,
+      "objective/non_score_reward": -1.3891831636428833,
+      "objective/rlhf_reward": -0.12500587105751038,
+      "objective/scores": 1.265625,
+      "policy/approxkl_avg": 0.31936001777648926,
+      "policy/clipfrac_avg": 0.33984375,
+      "policy/entropy_avg": 0.02469015121459961,
+      "step": 300,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 18,
+      "val/ratio": 1.0020601749420166,
+      "val/ratio_var": 3.1293042411562055e-05
+    },
+    {
+      "episode": 15616,
+      "epoch": 0.21419068127888954,
+      "eps": 5,
+      "loss/policy_avg": -0.01710616797208786,
+      "loss/value_avg": 0.0,
+      "lr": 2.5396419437340155e-06,
+      "objective/entropy": 1.4288297891616821,
+      "objective/kl": 14.952780723571777,
+      "objective/non_score_reward": -1.4952781200408936,
+      "objective/rlhf_reward": -0.15792769193649292,
+      "objective/scores": 1.3359375,
+      "policy/approxkl_avg": 0.6461950540542603,
+      "policy/clipfrac_avg": 0.3125,
+      "policy/entropy_avg": 0.02637958526611328,
+      "step": 305,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 19,
+      "val/ratio": 1.0013606548309326,
+      "val/ratio_var": 4.203971548122354e-05
+    },
+    {
+      "episode": 15872,
+      "epoch": 0.21770200392280575,
+      "eps": 5,
+      "loss/policy_avg": -0.016128187999129295,
+      "loss/value_avg": 0.0,
+      "lr": 2.531969309462916e-06,
+      "objective/entropy": 1.3288850784301758,
+      "objective/kl": 15.583921432495117,
+      "objective/non_score_reward": -1.55839204788208,
+      "objective/rlhf_reward": -0.07665687799453735,
+      "objective/scores": 1.484375,
+      "policy/approxkl_avg": 0.3284182548522949,
+      "policy/clipfrac_avg": 0.328125,
+      "policy/entropy_avg": 0.02404165267944336,
+      "step": 310,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.0064442157745361,
+      "val/ratio_var": 0.0015344778075814247
+    },
+    {
+      "episode": 16128,
+      "epoch": 0.221213326566722,
+      "eps": 5,
+      "loss/policy_avg": -0.015278931707143784,
+      "loss/value_avg": 0.0,
+      "lr": 2.524296675191816e-06,
+      "objective/entropy": 1.3236112594604492,
+      "objective/kl": 14.773448944091797,
+      "objective/non_score_reward": -1.4773449897766113,
+      "objective/rlhf_reward": -0.08708612620830536,
+      "objective/scores": 1.390625,
+      "policy/approxkl_avg": 0.2980467975139618,
+      "policy/clipfrac_avg": 0.34765625,
+      "policy/entropy_avg": 0.02494335174560547,
+      "step": 315,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 7,
+      "val/ratio": 1.0008249282836914,
+      "val/ratio_var": 1.927867742779199e-05
+    },
+    {
+      "episode": 16384,
+      "epoch": 0.2247246492106382,
+      "eps": 5,
+      "loss/policy_avg": -0.020951703190803528,
+      "loss/value_avg": 0.0,
+      "lr": 2.516624040920716e-06,
+      "objective/entropy": 1.2670817375183105,
+      "objective/kl": 14.8348970413208,
+      "objective/non_score_reward": -1.483489751815796,
+      "objective/rlhf_reward": 0.03382519632577896,
+      "objective/scores": 1.515625,
+      "policy/approxkl_avg": 1.0973663330078125,
+      "policy/clipfrac_avg": 0.361328125,
+      "policy/entropy_avg": 0.02091073989868164,
+      "step": 320,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 15,
+      "val/ratio": 1.003138542175293,
+      "val/ratio_var": 0.0001205340595333837
+    },
+    {
+      "episode": 16640,
+      "epoch": 0.22823597185455444,
+      "eps": 5,
+      "loss/policy_avg": -0.006676271557807922,
+      "loss/value_avg": 0.0,
+      "lr": 2.5089514066496164e-06,
+      "objective/entropy": 1.191056728363037,
+      "objective/kl": 16.46404457092285,
+      "objective/non_score_reward": -1.646404504776001,
+      "objective/rlhf_reward": -0.14990828931331635,
+      "objective/scores": 1.5,
+      "policy/approxkl_avg": 0.31475046277046204,
+      "policy/clipfrac_avg": 0.2578125,
+      "policy/entropy_avg": 0.021608352661132812,
+      "step": 325,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 25,
+      "val/ratio": 1.000575065612793,
+      "val/ratio_var": 1.735856494633481e-05
+    },
+    {
+      "episode": 16896,
+      "epoch": 0.23174729449847065,
+      "eps": 5,
+      "loss/policy_avg": -0.02127978205680847,
+      "loss/value_avg": 0.0,
+      "lr": 2.5012787723785167e-06,
+      "objective/entropy": 1.490570068359375,
+      "objective/kl": 16.22044563293457,
+      "objective/non_score_reward": -1.622044563293457,
+      "objective/rlhf_reward": -0.015120631083846092,
+      "objective/scores": 1.609375,
+      "policy/approxkl_avg": 2.5871665477752686,
+      "policy/clipfrac_avg": 0.35546875,
+      "policy/entropy_avg": 0.027353286743164062,
+      "step": 330,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 25,
+      "val/ratio": 1.003063440322876,
+      "val/ratio_var": 6.509448576252908e-05
+    },
+    {
+      "episode": 17152,
+      "epoch": 0.23525861714238688,
+      "eps": 5,
+      "loss/policy_avg": -0.013108542189002037,
+      "loss/value_avg": 0.0,
+      "lr": 2.493606138107417e-06,
+      "objective/entropy": 1.2718842029571533,
+      "objective/kl": 16.047882080078125,
+      "objective/non_score_reward": -1.604788064956665,
+      "objective/rlhf_reward": -0.12415145337581635,
+      "objective/scores": 1.484375,
+      "policy/approxkl_avg": 0.2758824825286865,
+      "policy/clipfrac_avg": 0.287109375,
+      "policy/entropy_avg": 0.024268627166748047,
+      "step": 335,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.0036962032318115,
+      "val/ratio_var": 0.0003905180492438376
+    },
+    {
+      "episode": 17408,
+      "epoch": 0.2387699397863031,
+      "eps": 5,
+      "loss/policy_avg": -0.014837839640676975,
+      "loss/value_avg": 0.0,
+      "lr": 2.4859335038363174e-06,
+      "objective/entropy": 1.3406567573547363,
+      "objective/kl": 16.428348541259766,
+      "objective/non_score_reward": -1.642835021018982,
+      "objective/rlhf_reward": 0.018702151253819466,
+      "objective/scores": 1.6640625,
+      "policy/approxkl_avg": 0.192110076546669,
+      "policy/clipfrac_avg": 0.3388671875,
+      "policy/entropy_avg": 0.025295734405517578,
+      "step": 340,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 19,
+      "val/ratio": 1.0008959770202637,
+      "val/ratio_var": 1.205760781886056e-05
+    },
+    {
+      "episode": 17664,
+      "epoch": 0.24228126243021933,
+      "eps": 5,
+      "loss/policy_avg": -0.01899782381951809,
+      "loss/value_avg": 0.0,
+      "lr": 2.4782608695652173e-06,
+      "objective/entropy": 1.5880205631256104,
+      "objective/kl": 15.775943756103516,
+      "objective/non_score_reward": -1.577594518661499,
+      "objective/rlhf_reward": -0.08363974094390869,
+      "objective/scores": 1.4921875,
+      "policy/approxkl_avg": 0.9254180192947388,
+      "policy/clipfrac_avg": 0.3251953125,
+      "policy/entropy_avg": 0.024907588958740234,
+      "step": 345,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 15,
+      "val/ratio": 1.000749111175537,
+      "val/ratio_var": 2.8957187168998644e-05
+    },
+    {
+      "episode": 17920,
+      "epoch": 0.24579258507413554,
+      "eps": 5,
+      "loss/policy_avg": -0.014669202268123627,
+      "loss/value_avg": 0.0,
+      "lr": 2.4705882352941177e-06,
+      "objective/entropy": 1.2656543254852295,
+      "objective/kl": 16.034730911254883,
+      "objective/non_score_reward": -1.60347318649292,
+      "objective/rlhf_reward": -0.011744961142539978,
+      "objective/scores": 1.59375,
+      "policy/approxkl_avg": 1.004683017730713,
+      "policy/clipfrac_avg": 0.2646484375,
+      "policy/entropy_avg": 0.023741722106933594,
+      "step": 350,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 19,
+      "val/ratio": 1.0009608268737793,
+      "val/ratio_var": 4.649764014175162e-05
+    },
+    {
+      "episode": 18176,
+      "epoch": 0.24930390771805178,
+      "eps": 5,
+      "loss/policy_avg": -0.014812990091741085,
+      "loss/value_avg": 0.0,
+      "lr": 2.462915601023018e-06,
+      "objective/entropy": 1.2658121585845947,
+      "objective/kl": 15.927343368530273,
+      "objective/non_score_reward": -1.5927343368530273,
+      "objective/rlhf_reward": -0.12128548324108124,
+      "objective/scores": 1.46875,
+      "policy/approxkl_avg": 0.32618167996406555,
+      "policy/clipfrac_avg": 0.31640625,
+      "policy/entropy_avg": 0.0256500244140625,
+      "step": 355,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 19,
+      "val/ratio": 1.0012333393096924,
+      "val/ratio_var": 3.0146764402161352e-05
+    },
+    {
+      "episode": 18432,
+      "epoch": 0.252815230361968,
+      "eps": 5,
+      "loss/policy_avg": -0.015332316979765892,
+      "loss/value_avg": 0.0,
+      "lr": 2.4552429667519184e-06,
+      "objective/entropy": 1.3316707611083984,
+      "objective/kl": 15.554301261901855,
+      "objective/non_score_reward": -1.5554301738739014,
+      "objective/rlhf_reward": -0.007791273295879364,
+      "objective/scores": 1.546875,
+      "policy/approxkl_avg": 4.210527420043945,
+      "policy/clipfrac_avg": 0.283203125,
+      "policy/entropy_avg": 0.023929595947265625,
+      "step": 360,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.0024126768112183,
+      "val/ratio_var": 9.787842282094061e-05
+    },
+    {
+      "episode": 18688,
+      "epoch": 0.2563265530058842,
+      "eps": 5,
+      "loss/policy_avg": -0.01625869981944561,
+      "loss/value_avg": 0.0,
+      "lr": 2.4475703324808187e-06,
+      "objective/entropy": 1.4523942470550537,
+      "objective/kl": 15.260530471801758,
+      "objective/non_score_reward": -1.5260531902313232,
+      "objective/rlhf_reward": -0.06101101636886597,
+      "objective/scores": 1.46875,
+      "policy/approxkl_avg": 0.9305495023727417,
+      "policy/clipfrac_avg": 0.298828125,
+      "policy/entropy_avg": 0.02731466293334961,
+      "step": 365,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 21,
+      "val/ratio": 1.0011459589004517,
+      "val/ratio_var": 4.8135185352293774e-05
+    },
+    {
+      "episode": 18944,
+      "epoch": 0.25983787564980043,
+      "eps": 5,
+      "loss/policy_avg": -0.019823966547846794,
+      "loss/value_avg": 0.0,
+      "lr": 2.4398976982097186e-06,
+      "objective/entropy": 1.2733044624328613,
+      "objective/kl": 14.911139488220215,
+      "objective/non_score_reward": -1.4911139011383057,
+      "objective/rlhf_reward": -0.10889418423175812,
+      "objective/scores": 1.3828125,
+      "policy/approxkl_avg": 0.2764221429824829,
+      "policy/clipfrac_avg": 0.3681640625,
+      "policy/entropy_avg": 0.02543497085571289,
+      "step": 370,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 21,
+      "val/ratio": 1.0017691850662231,
+      "val/ratio_var": 4.75883498438634e-05
+    },
+    {
+      "episode": 19200,
+      "epoch": 0.26334919829371667,
+      "eps": 5,
+      "loss/policy_avg": -0.016569480299949646,
+      "loss/value_avg": 0.0,
+      "lr": 2.432225063938619e-06,
+      "objective/entropy": 1.3932634592056274,
+      "objective/kl": 14.846328735351562,
+      "objective/non_score_reward": -1.4846327304840088,
+      "objective/rlhf_reward": -0.05459916219115257,
+      "objective/scores": 1.4296875,
+      "policy/approxkl_avg": 0.6026493906974792,
+      "policy/clipfrac_avg": 0.314453125,
+      "policy/entropy_avg": 0.02507472038269043,
+      "step": 375,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 17,
+      "val/ratio": 1.0026881694793701,
+      "val/ratio_var": 9.921380114974454e-05
+    },
+    {
+      "episode": 19456,
+      "epoch": 0.26686052093763285,
+      "eps": 5,
+      "loss/policy_avg": -0.016383569687604904,
+      "loss/value_avg": 0.0,
+      "lr": 2.4245524296675193e-06,
+      "objective/entropy": 1.2150591611862183,
+      "objective/kl": 15.154396057128906,
+      "objective/non_score_reward": -1.5154396295547485,
+      "objective/rlhf_reward": -0.002263203263282776,
+      "objective/scores": 1.515625,
+      "policy/approxkl_avg": 0.2193431854248047,
+      "policy/clipfrac_avg": 0.310546875,
+      "policy/entropy_avg": 0.02501058578491211,
+      "step": 380,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 21,
+      "val/ratio": 1.002253532409668,
+      "val/ratio_var": 6.509936065413058e-05
+    },
+    {
+      "episode": 19712,
+      "epoch": 0.2703718435815491,
+      "eps": 5,
+      "loss/policy_avg": -0.012824717909097672,
+      "loss/value_avg": 0.0,
+      "lr": 2.4168797953964196e-06,
+      "objective/entropy": 1.1273529529571533,
+      "objective/kl": 15.169496536254883,
+      "objective/non_score_reward": -1.5169496536254883,
+      "objective/rlhf_reward": -0.077918142080307,
+      "objective/scores": 1.4375,
+      "policy/approxkl_avg": 0.936165452003479,
+      "policy/clipfrac_avg": 0.2626953125,
+      "policy/entropy_avg": 0.025212764739990234,
+      "step": 385,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 16,
+      "val/ratio": 1.0019395351409912,
+      "val/ratio_var": 9.75406655925326e-05
+    },
+    {
+      "episode": 19968,
+      "epoch": 0.2738831662254653,
+      "eps": 5,
+      "loss/policy_avg": -0.013698047958314419,
+      "loss/value_avg": 0.0,
+      "lr": 2.40920716112532e-06,
+      "objective/entropy": 1.491709589958191,
+      "objective/kl": 14.444531440734863,
+      "objective/non_score_reward": -1.4444531202316284,
+      "objective/rlhf_reward": -0.07102194428443909,
+      "objective/scores": 1.375,
+      "policy/approxkl_avg": 0.33337581157684326,
+      "policy/clipfrac_avg": 0.2841796875,
+      "policy/entropy_avg": 0.026212692260742188,
+      "step": 390,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 18,
+      "val/ratio": 1.0005760192871094,
+      "val/ratio_var": 1.5471163351321593e-05
+    },
+    {
+      "episode": 20224,
+      "epoch": 0.27739448886938156,
+      "eps": 5,
+      "loss/policy_avg": -0.015457297675311565,
+      "loss/value_avg": 0.0,
+      "lr": 2.40153452685422e-06,
+      "objective/entropy": 1.2903845310211182,
+      "objective/kl": 15.261798858642578,
+      "objective/non_score_reward": -1.5261797904968262,
+      "objective/rlhf_reward": -0.037545278668403625,
+      "objective/scores": 1.484375,
+      "policy/approxkl_avg": 0.1323377788066864,
+      "policy/clipfrac_avg": 0.302734375,
+      "policy/entropy_avg": 0.02473735809326172,
+      "step": 395,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 17,
+      "val/ratio": 1.001684546470642,
+      "val/ratio_var": 4.836953667108901e-05
+    },
+    {
+      "episode": 20480,
+      "epoch": 0.28090581151329774,
+      "eps": 5,
+      "loss/policy_avg": -0.010248646140098572,
+      "loss/value_avg": 0.0,
+      "lr": 2.3938618925831202e-06,
+      "objective/entropy": 1.4937005043029785,
+      "objective/kl": 15.048827171325684,
+      "objective/non_score_reward": -1.5048828125,
+      "objective/rlhf_reward": -0.09699033945798874,
+      "objective/scores": 1.40625,
+      "policy/approxkl_avg": 0.19970840215682983,
+      "policy/clipfrac_avg": 0.2958984375,
+      "policy/entropy_avg": 0.028090476989746094,
+      "step": 400,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 17,
+      "val/ratio": 1.0016024112701416,
+      "val/ratio_var": 9.803808643482625e-05
+    },
+    {
+      "episode": 20736,
+      "epoch": 0.284417134157214,
+      "eps": 5,
+      "loss/policy_avg": -0.017358586192131042,
+      "loss/value_avg": 0.0,
+      "lr": 2.3861892583120206e-06,
+      "objective/entropy": 1.4205702543258667,
+      "objective/kl": 15.169395446777344,
+      "objective/non_score_reward": -1.516939640045166,
+      "objective/rlhf_reward": -0.07302428036928177,
+      "objective/scores": 1.4453125,
+      "policy/approxkl_avg": 0.1988021731376648,
+      "policy/clipfrac_avg": 0.3017578125,
+      "policy/entropy_avg": 0.028011322021484375,
+      "step": 405,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 14,
+      "val/ratio": 1.0035557746887207,
+      "val/ratio_var": 0.0002127099724020809
+    },
+    {
+      "episode": 20992,
+      "epoch": 0.2879284568011302,
+      "eps": 5,
+      "loss/policy_avg": -0.01577112078666687,
+      "loss/value_avg": 0.0,
+      "lr": 2.378516624040921e-06,
+      "objective/entropy": 1.4795770645141602,
+      "objective/kl": 14.837481498718262,
+      "objective/non_score_reward": -1.483748197555542,
+      "objective/rlhf_reward": -0.06763182580471039,
+      "objective/scores": 1.4140625,
+      "policy/approxkl_avg": 0.193942129611969,
+      "policy/clipfrac_avg": 0.267578125,
+      "policy/entropy_avg": 0.02856159210205078,
+      "step": 410,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 17,
+      "val/ratio": 1.0132761001586914,
+      "val/ratio_var": 0.009631224907934666
+    },
+    {
+      "episode": 21248,
+      "epoch": 0.29143977944504645,
+      "eps": 5,
+      "loss/policy_avg": -0.01689928211271763,
+      "loss/value_avg": 0.0,
+      "lr": 2.3708439897698213e-06,
+      "objective/entropy": 1.6031843423843384,
+      "objective/kl": 13.909997940063477,
+      "objective/non_score_reward": -1.3909997940063477,
+      "objective/rlhf_reward": 0.009201999753713608,
+      "objective/scores": 1.3984375,
+      "policy/approxkl_avg": 0.5280826091766357,
+      "policy/clipfrac_avg": 0.357421875,
+      "policy/entropy_avg": 0.029265880584716797,
+      "step": 415,
+      "val/clipfrac_avg": 0.0,
+      "val/num_eos_tokens": 22,
+      "val/ratio": 1.0006372928619385,
+      "val/ratio_var": 1.6514597518835217e-05
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 391,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1.3716104077797742,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0,
+  "train_batch_size": null,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f69fedf04484e314c878c0562873de1761b9262a2545636c76d80eb9a5506163
+size 6840

zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,604 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# diagnostics switch: the parsing helpers print extra information when this is truthy
+debug = 0
+# load to cpu - this device is handed to torch.load as map_location
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_file,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)