minghaowu commited on
Commit
0aaab9b
·
verified ·
1 Parent(s): 122a0f1

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0c01378f8626fb764b1495c44118898606b1d391b167805b17044f1db6ff1a1
3
+ size 7414895232
last-checkpoint/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e9950b7c884295d43031c0038a354d2a02f6da24e738b6a5ee500898b0ea02c
3
+ size 7414897472
last-checkpoint/global_step200/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:771b4f36ea5b1328ed87b99b2613cbf24ab04a0c9fd1823de5718e90d396651f
3
+ size 2471673464
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step100
 
1
+ global_step200
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed1516c096229ae80466370a26fe3de858486b117055f47a2e2cab489139a242
3
  size 2996982344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39b72fde5d7858e6afce8db4b0d0fdeca00d87ecb6c744603b12d525c98ccf36
3
  size 2996982344
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba02109c842441e127a3fad7657cf1bc583c3017b8b805f4eae546c0cb6bd6bf
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66c5a0df19b9c4c7f9628533d87b5e767121c5b1c20697fb2cfc5c745c752a6a
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8ee95e644e7f1780b1e0be4d9a67f79e7d8f95071be212bc1c77e329b261b6d
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59aedc175259cd5edebf90ddc9a5fcde025d3ccce2f0eca359a9ff56cba98147
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96a1388548298b291bf715880376a3f7056129f4c4c3e07823de336de496cdc4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e40935651363f2a1063f3f036a3600d22c7ab6431c4f31c42b100e6e12d0544e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7178106774338269,
5
  "eval_steps": 500,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -77,6 +77,76 @@
77
  "learning_rate": 8.453333333333334e-06,
78
  "loss": 0.5187,
79
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  }
81
  ],
82
  "logging_steps": 10,
@@ -96,7 +166,7 @@
96
  "attributes": {}
97
  }
98
  },
99
- "total_flos": 2.6785962973462528e+17,
100
  "train_batch_size": 14,
101
  "trial_name": null,
102
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.4365186182144458,
5
  "eval_steps": 500,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
77
  "learning_rate": 8.453333333333334e-06,
78
  "loss": 0.5187,
79
  "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.7895917451772095,
83
+ "grad_norm": 0.022063592448830605,
84
+ "learning_rate": 8.186666666666667e-06,
85
+ "loss": 0.51,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.8613728129205922,
90
+ "grad_norm": 0.021037070080637932,
91
+ "learning_rate": 7.92e-06,
92
+ "loss": 0.5043,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.9331538806639749,
97
+ "grad_norm": 0.01985151134431362,
98
+ "learning_rate": 7.653333333333333e-06,
99
+ "loss": 0.4985,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 1.0058322117541498,
104
+ "grad_norm": 0.04843816161155701,
105
+ "learning_rate": 7.386666666666667e-06,
106
+ "loss": 0.5345,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 1.0776132794975326,
111
+ "grad_norm": 0.02092103101313114,
112
+ "learning_rate": 7.1200000000000004e-06,
113
+ "loss": 0.4855,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 1.1493943472409152,
118
+ "grad_norm": 0.020286045968532562,
119
+ "learning_rate": 6.853333333333334e-06,
120
+ "loss": 0.4794,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 1.221175414984298,
125
+ "grad_norm": 0.020133651793003082,
126
+ "learning_rate": 6.5866666666666666e-06,
127
+ "loss": 0.4779,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 1.2929564827276807,
132
+ "grad_norm": 0.020597418770194054,
133
+ "learning_rate": 6.3200000000000005e-06,
134
+ "loss": 0.4738,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 1.3647375504710633,
139
+ "grad_norm": 0.020543133839964867,
140
+ "learning_rate": 6.0533333333333335e-06,
141
+ "loss": 0.4713,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 1.4365186182144458,
146
+ "grad_norm": 0.0203793253749609,
147
+ "learning_rate": 5.7866666666666674e-06,
148
+ "loss": 0.4665,
149
+ "step": 200
150
  }
151
  ],
152
  "logging_steps": 10,
 
166
  "attributes": {}
167
  }
168
  },
169
+ "total_flos": 5.3571925946925056e+17,
170
  "train_batch_size": 14,
171
  "trial_name": null,
172
  "trial_params": null