kkish committed (verified)
Commit 2d303ff · 1 Parent(s): d243bda

Model save

README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/kifish-k/huggingface/runs/ez3kxa2t)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/kifish-k/huggingface/runs/xot29f9k)
 
 
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
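The updated model card keeps the TRL-style note that the model was trained with GRPO and points the badge at the new Weights & Biases run. Purely as an illustration (the training script is not part of this commit), a GRPO run that logs `rewards/accuracy_reward` and `rewards/format_reward` as in the `trainer_state.json` diff below could be set up with TRL's `GRPOTrainer` roughly like this; the base model, dataset, and reward implementations here are placeholders, not the author's actual setup.

```python
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer


def accuracy_reward(completions, **kwargs):
    # Placeholder reward: a real run would compare the extracted answer
    # against the gold label; here we just reward non-empty completions.
    return [1.0 if c.strip() else 0.0 for c in completions]


def format_reward(completions, **kwargs):
    # Placeholder reward for the expected answer format.
    return [1.0 if "\\boxed{" in c else 0.0 for c in completions]


dataset = load_dataset("trl-lib/tldr", split="train")  # stand-in dataset

args = GRPOConfig(
    output_dir="grpo-model",
    learning_rate=3e-6,
    logging_steps=5,
    report_to="wandb",  # produces a W&B run like the one linked in the badge
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-7B-Instruct",  # stand-in; the base model is not stated in this diff
    reward_funcs=[accuracy_reward, format_reward],
    args=args,
    train_dataset=dataset,
)
trainer.train()
```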
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.08129200656737245,
-    "train_runtime": 12851.1722,
+    "train_loss": 0.01571170037346621,
+    "train_runtime": 7483.8014,
     "train_samples": 7500,
-    "train_samples_per_second": 0.584,
-    "train_steps_per_second": 0.005
+    "train_samples_per_second": 1.002,
+    "train_steps_per_second": 0.008
 }
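As a quick sanity check on the new numbers (assuming the 58 optimizer steps recorded in `trainer_state.json` below), the reported throughput is simply samples and steps divided by the runtime:

```python
# Values copied from the updated all_results.json / trainer_state.json.
train_samples = 7500
train_runtime = 7483.8014  # seconds
train_steps = 58

print(round(train_samples / train_runtime, 3))  # ~1.002, matches train_samples_per_second
print(round(train_steps / train_runtime, 3))    # ~0.008, matches train_steps_per_second
```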
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:24a93abebdeb133d4ec524e1aa55cdda9ade6d1994ab1707e3982dffca6c0051
+oid sha256:fdb632cde9539e3209be92868df94c5e5b4bd1a2474dadd18204c4a1afba3bc8
 size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:00597f114878a6eaff7d2af7891a1ca57a344de7a63c93eae3cb7b71377121f0
+oid sha256:6030d136e478d0aa2020a1c3931dd8050c750caada245d98f1147a7a154cd1c3
 size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8a5af5fe712c2c2abf81fef39eba2383fbd77fd8c89ff3ad221b1d5137a2e64f
+oid sha256:a7f6b4e94ff3267dbbb54bb1e83ef4891ff7cea80d7e0d407414b4654a7df050
 size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4003c925ff951f7281814aeffc391620e3cd828437b268222c2660bdfad9c12d
+oid sha256:3d88e1ceab35f6614fbf7731a012e90c34f88b13c3cdc4b72457cf17539d3690
 size 1089994880
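The four shard entries above are Git LFS pointer files, so only the sha256 digest changes in each diff (the shard sizes are unchanged). A small sketch for checking that locally downloaded shards match the new pointers; the local filenames are assumed to mirror the repository layout:

```python
import hashlib

# sha256 digests copied from the updated LFS pointers above.
EXPECTED = {
    "model-00001-of-00004.safetensors": "fdb632cde9539e3209be92868df94c5e5b4bd1a2474dadd18204c4a1afba3bc8",
    "model-00002-of-00004.safetensors": "6030d136e478d0aa2020a1c3931dd8050c750caada245d98f1147a7a154cd1c3",
    "model-00003-of-00004.safetensors": "a7f6b4e94ff3267dbbb54bb1e83ef4891ff7cea80d7e0d407414b4654a7df050",
    "model-00004-of-00004.safetensors": "3d88e1ceab35f6614fbf7731a012e90c34f88b13c3cdc4b72457cf17539d3690",
}


def sha256_of(path, chunk_size=1 << 20):
    # Hash the file in 1 MiB chunks to avoid loading multi-GB shards into memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()


for name, expected in EXPECTED.items():
    print(name, "OK" if sha256_of(name) == expected else "MISMATCH")
```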
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.08129200656737245,
-    "train_runtime": 12851.1722,
+    "train_loss": 0.01571170037346621,
+    "train_runtime": 7483.8014,
     "train_samples": 7500,
-    "train_samples_per_second": 0.584,
-    "train_steps_per_second": 0.005
+    "train_samples_per_second": 1.002,
+    "train_steps_per_second": 0.008
 }
trainer_state.json CHANGED
@@ -9,162 +9,162 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "completion_length": 613.2511436462403,
+      "completion_length": 613.9678833007813,
       "epoch": 0.08528784648187633,
-      "grad_norm": 1.4833499193191528,
-      "kl": 0.00018868446350097656,
+      "grad_norm": 1.6863154172897339,
+      "kl": 0.0001811981201171875,
       "learning_rate": 2.5e-06,
       "loss": 0.0,
-      "reward": 0.633258955925703,
-      "reward_std": 0.32811579667031765,
-      "rewards/accuracy_reward": 0.6328125290572644,
-      "rewards/format_reward": 0.00044642859138548373,
+      "reward": 0.6475446730852127,
+      "reward_std": 0.32297179140150545,
+      "rewards/accuracy_reward": 0.646651816368103,
+      "rewards/format_reward": 0.0008928571827709675,
       "step": 5
     },
     {
-      "completion_length": 622.7888648986816,
+      "completion_length": 623.497127532959,
       "epoch": 0.17057569296375266,
-      "grad_norm": 5.005911350250244,
-      "kl": 23.51236982345581,
+      "grad_norm": 5.361064910888672,
+      "kl": 4.185117244720459,
       "learning_rate": 2.956412726139078e-06,
-      "loss": 0.9434,
-      "reward": 0.7042411074042321,
-      "reward_std": 0.28605092857033015,
-      "rewards/accuracy_reward": 0.7037946790456772,
-      "rewards/format_reward": 0.00044642859138548373,
+      "loss": 0.1668,
+      "reward": 0.695758955180645,
+      "reward_std": 0.2824364464730024,
+      "rewards/accuracy_reward": 0.6948660999536515,
+      "rewards/format_reward": 0.0008928571827709675,
       "step": 10
     },
     {
-      "completion_length": 622.0384216308594,
+      "completion_length": 621.0348518371582,
       "epoch": 0.255863539445629,
-      "grad_norm": 0.3033592402935028,
-      "kl": 0.011827850341796875,
+      "grad_norm": 0.34819459915161133,
+      "kl": 0.006930732727050781,
       "learning_rate": 2.7836719084521715e-06,
-      "loss": 0.0005,
-      "reward": 0.7473214641213417,
-      "reward_std": 0.24078086167573928,
-      "rewards/accuracy_reward": 0.7473214641213417,
+      "loss": 0.0003,
+      "reward": 0.7484375357627868,
+      "reward_std": 0.23940655626356602,
+      "rewards/accuracy_reward": 0.7484375357627868,
       "rewards/format_reward": 0.0,
       "step": 15
     },
     {
-      "completion_length": 617.8234657287597,
+      "completion_length": 617.9591751098633,
      "epoch": 0.3411513859275053,
-      "grad_norm": 0.11973369121551514,
-      "kl": 0.13522472381591796,
+      "grad_norm": 5.891634941101074,
+      "kl": 0.007678604125976563,
       "learning_rate": 2.4946839873611927e-06,
-      "loss": 0.0054,
-      "reward": 0.7578125327825547,
-      "reward_std": 0.21702875867486,
-      "rewards/accuracy_reward": 0.7578125327825547,
+      "loss": 0.0003,
+      "reward": 0.7546875327825546,
+      "reward_std": 0.23243394643068313,
+      "rewards/accuracy_reward": 0.7546875327825546,
       "rewards/format_reward": 0.0,
       "step": 20
     },
     {
-      "completion_length": 612.831502532959,
+      "completion_length": 625.6734642028808,
       "epoch": 0.42643923240938164,
-      "grad_norm": 0.2234116494655609,
-      "kl": 0.0034147262573242187,
+      "grad_norm": 0.4985528588294983,
+      "kl": 0.06998424530029297,
       "learning_rate": 2.1156192081791355e-06,
-      "loss": 0.0001,
-      "reward": 0.747544676065445,
-      "reward_std": 0.20008923448622226,
-      "rewards/accuracy_reward": 0.747544676065445,
+      "loss": 0.0028,
+      "reward": 0.7497768148779869,
+      "reward_std": 0.20579003393650055,
+      "rewards/accuracy_reward": 0.7497768148779869,
       "rewards/format_reward": 0.0,
       "step": 25
     },
     {
-      "completion_length": 611.4951187133789,
+      "completion_length": 621.3915466308594,
       "epoch": 0.511727078891258,
-      "grad_norm": 0.23166193068027496,
-      "kl": 0.0030916213989257814,
+      "grad_norm": 0.4441167712211609,
+      "kl": 0.0034616470336914064,
       "learning_rate": 1.6808050203829845e-06,
       "loss": 0.0001,
-      "reward": 0.7486607506871223,
-      "reward_std": 0.19252096004784108,
-      "rewards/accuracy_reward": 0.7486607506871223,
+      "reward": 0.7517857521772384,
+      "reward_std": 0.19103028811514378,
+      "rewards/accuracy_reward": 0.7517857521772384,
       "rewards/format_reward": 0.0,
       "step": 30
     },
     {
-      "completion_length": 599.3102935791015,
+      "completion_length": 611.6134216308594,
       "epoch": 0.5970149253731343,
-      "grad_norm": 0.34214457869529724,
-      "kl": 0.003481292724609375,
+      "grad_norm": 0.45489344000816345,
+      "kl": 0.003511810302734375,
       "learning_rate": 1.2296174432791415e-06,
       "loss": 0.0001,
-      "reward": 0.7500000327825547,
-      "reward_std": 0.17259779190644622,
-      "rewards/accuracy_reward": 0.7500000327825547,
+      "reward": 0.7323660954833031,
+      "reward_std": 0.19216552414000035,
+      "rewards/accuracy_reward": 0.7323660954833031,
       "rewards/format_reward": 0.0,
       "step": 35
     },
     {
-      "completion_length": 592.0377464294434,
+      "completion_length": 594.996004486084,
       "epoch": 0.6823027718550106,
-      "grad_norm": 0.126151442527771,
-      "kl": 0.004676437377929688,
+      "grad_norm": 0.38299915194511414,
+      "kl": 0.0036407470703125,
       "learning_rate": 8.029152419343472e-07,
-      "loss": 0.0002,
-      "reward": 0.7714286059141159,
-      "reward_std": 0.18749583773314954,
-      "rewards/accuracy_reward": 0.7714286059141159,
+      "loss": 0.0001,
+      "reward": 0.7618303939700126,
+      "reward_std": 0.182381122559309,
+      "rewards/accuracy_reward": 0.7618303939700126,
       "rewards/format_reward": 0.0,
       "step": 40
     },
     {
-      "completion_length": 608.4870819091797,
+      "completion_length": 607.5631973266602,
       "epoch": 0.767590618336887,
-      "grad_norm": 0.07366354763507843,
-      "kl": 0.0032756805419921877,
+      "grad_norm": 0.1558612883090973,
+      "kl": 0.003850555419921875,
       "learning_rate": 4.3933982822017883e-07,
-      "loss": 0.0001,
-      "reward": 0.7595982506871224,
-      "reward_std": 0.16678393790498375,
-      "rewards/accuracy_reward": 0.7595982506871224,
+      "loss": 0.0002,
+      "reward": 0.7508928924798965,
+      "reward_std": 0.19622449725866317,
+      "rewards/accuracy_reward": 0.7508928924798965,
       "rewards/format_reward": 0.0,
       "step": 45
     },
     {
-      "completion_length": 609.9491363525391,
+      "completion_length": 610.1696670532226,
       "epoch": 0.8528784648187633,
-      "grad_norm": 0.08962756395339966,
-      "kl": 0.0033018112182617186,
+      "grad_norm": 0.11370094120502472,
+      "kl": 0.004032135009765625,
       "learning_rate": 1.718159615201853e-07,
-      "loss": 0.0001,
-      "reward": 0.7435268208384513,
-      "reward_std": 0.18068666788749396,
-      "rewards/accuracy_reward": 0.7435268208384513,
+      "loss": 0.0002,
+      "reward": 0.7486607491970062,
+      "reward_std": 0.17984699215739966,
+      "rewards/accuracy_reward": 0.7486607491970062,
       "rewards/format_reward": 0.0,
       "step": 50
     },
     {
-      "completion_length": 596.2395317077637,
+      "completion_length": 603.7953384399414,
       "epoch": 0.9381663113006397,
-      "grad_norm": 0.08790023624897003,
-      "kl": 0.00357818603515625,
+      "grad_norm": 0.09367287904024124,
+      "kl": 0.0048553466796875,
       "learning_rate": 2.4570139579284723e-08,
-      "loss": 0.0001,
-      "reward": 0.7866071850061417,
-      "reward_std": 0.1857817579060793,
-      "rewards/accuracy_reward": 0.7866071850061417,
+      "loss": 0.0002,
+      "reward": 0.7750000357627869,
+      "reward_std": 0.18225797163322568,
+      "rewards/accuracy_reward": 0.7750000357627869,
       "rewards/format_reward": 0.0,
       "step": 55
     },
     {
-      "completion_length": 594.5562032063802,
+      "completion_length": 598.5974960327148,
       "epoch": 0.9893390191897654,
-      "kl": 0.003570556640625,
-      "reward": 0.7671131292978922,
-      "reward_std": 0.18057130742818117,
-      "rewards/accuracy_reward": 0.7671131292978922,
+      "kl": 0.003758112589518229,
+      "reward": 0.7678571765621504,
+      "reward_std": 0.18772160820662975,
+      "rewards/accuracy_reward": 0.7678571765621504,
       "rewards/format_reward": 0.0,
       "step": 58,
       "total_flos": 0.0,
-      "train_loss": 0.08129200656737245,
-      "train_runtime": 12851.1722,
-      "train_samples_per_second": 0.584,
-      "train_steps_per_second": 0.005
+      "train_loss": 0.01571170037346621,
+      "train_runtime": 7483.8014,
+      "train_samples_per_second": 1.002,
+      "train_steps_per_second": 0.008
     }
   ],
   "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:03a0cf36012692bc8f8d495df3134fbd53fbdbf21083651acc2f91d15b60f19a
+oid sha256:c9997f26134981ab163e535c40b79bb04034a38f546fd59ac1605c593626e8e0
 size 7544
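`training_args.bin` is the pickled training-arguments object that the `transformers` Trainer saves alongside checkpoints, so the diff only shows a new LFS pointer. A hedged sketch for inspecting it locally; `weights_only=False` is needed on recent PyTorch because the file is a pickle (not a tensor file), and unpickling requires the same libraries that were used for training:

```python
import torch

# Load the saved training arguments and print a few fields for inspection.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.logging_steps)
```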