susumuota committed on
Commit
c6076ce
·
verified ·
1 Parent(s): ecd8c4f

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-7B-Instruct
3
- datasets: DigitalLearningGmbH/MATH-lighteval
4
  library_name: transformers
5
  model_name: Qwen2.5-7B-Instruct-GRPO
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen2.5-7B-Instruct-GRPO
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the [DigitalLearningGmbH/MATH-lighteval](https://huggingface.co/datasets/DigitalLearningGmbH/MATH-lighteval) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/llm-m_wandb-weblab/Qwen2.5-7B-Instruct-GRPO/runs/potnc7q9)
33
 
34
 
35
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
  base_model: Qwen/Qwen2.5-7B-Instruct
 
3
  library_name: transformers
4
  model_name: Qwen2.5-7B-Instruct-GRPO
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for Qwen2.5-7B-Instruct-GRPO
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/llm-m_wandb-weblab/Qwen2.5-7B-Instruct-GRPO/runs/g31l17lf)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.020674003962555837,
4
- "train_runtime": 6417.2756,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 1.169,
7
  "train_steps_per_second": 0.009
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.10446281573767292,
4
+ "train_runtime": 6557.3192,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 1.144,
7
  "train_steps_per_second": 0.009
8
  }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0",
26
- "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 152064
29
  }
 
23
  "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0",
26
+ "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 152064
29
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:247735adddbbd9944bf2dc1cc35ccc9b4cfd5e79ef3d8c3ab7340c7f26a0955e
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6874b107a5edb2eb4e9ee32fef98e0a76195ecbd6de7bb231de9c6b5a41fc0a
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b9ef086092554c80dab98bd0bfbaa398b21a3d2e367bf94ad86f1e05c6ce509
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46bd31973c9ac5df46fd509623c7eef1bb6b0fa42cd0dde2de93ae7124403ef3
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e80ee5064fdd08d096850130cfbb55ce4521b8c8f8f019740bac43298665148f
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26c84a8977158b0268f6d10ac89c52a39796d5e4e1802b698aeadb8873034f4c
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe67c93ef45e03e5b117ec79ebbbcc459cf2fe2e878531f0a57c2da06c2fc0ef
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c651953533fcd55d348f1205d59a4230bd223834ba89b7689bb7b54bb0f32bf0
3
  size 1089994880
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.020674003962555837,
4
- "train_runtime": 6417.2756,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 1.169,
7
  "train_steps_per_second": 0.009
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.10446281573767292,
4
+ "train_runtime": 6557.3192,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 1.144,
7
  "train_steps_per_second": 0.009
8
  }
trainer_state.json CHANGED
@@ -12,7 +12,7 @@
12
  "clip_ratio": 0.0,
13
  "completion_length": 498.510066986084,
14
  "epoch": 0.017057569296375266,
15
- "grad_norm": 1.1215301752090454,
16
  "kl": 0.0,
17
  "learning_rate": 5e-07,
18
  "loss": 0.0115,
@@ -24,172 +24,172 @@
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
- "completion_length": 487.4813299179077,
28
  "epoch": 0.08528784648187633,
29
- "grad_norm": 476.3103332519531,
30
- "kl": 1.7723130583763123,
31
  "learning_rate": 2.5e-06,
32
- "loss": 0.0822,
33
- "reward": 0.7912946743890643,
34
- "reward_std": 0.3657265743240714,
35
- "rewards/accuracy_reward": 0.1872209922876209,
36
- "rewards/format_reward": 0.6040736874565482,
37
  "step": 5
38
  },
39
  {
40
  "clip_ratio": 0.0,
41
- "completion_length": 466.69912643432616,
42
  "epoch": 0.17057569296375266,
43
- "grad_norm": 0.41791579127311707,
44
- "kl": 0.012205886840820312,
45
  "learning_rate": 2.956412726139078e-06,
46
- "loss": 0.0203,
47
- "reward": 1.2589286297559739,
48
- "reward_std": 0.32850122936069964,
49
- "rewards/accuracy_reward": 0.3185267999768257,
50
- "rewards/format_reward": 0.9404018238186836,
51
  "step": 10
52
  },
53
  {
54
  "clip_ratio": 0.0,
55
- "completion_length": 444.567431640625,
56
  "epoch": 0.255863539445629,
57
- "grad_norm": 0.6771596670150757,
58
- "kl": 0.02176055908203125,
59
  "learning_rate": 2.7836719084521715e-06,
60
- "loss": 0.0059,
61
- "reward": 1.4082589864730835,
62
- "reward_std": 0.3335796441882849,
63
- "rewards/accuracy_reward": 0.441294664144516,
64
- "rewards/format_reward": 0.9669643238186836,
65
  "step": 15
66
  },
67
  {
68
  "clip_ratio": 0.0,
69
- "completion_length": 422.7493499755859,
70
  "epoch": 0.3411513859275053,
71
- "grad_norm": 0.23769104480743408,
72
- "kl": 0.026959228515625,
73
  "learning_rate": 2.4946839873611927e-06,
74
- "loss": 0.01,
75
- "reward": 1.4937500715255738,
76
- "reward_std": 0.3376178216189146,
77
- "rewards/accuracy_reward": 0.5156250238418579,
78
- "rewards/format_reward": 0.9781250357627869,
79
  "step": 20
80
  },
81
  {
82
  "clip_ratio": 0.0,
83
- "completion_length": 433.67725372314453,
84
  "epoch": 0.42643923240938164,
85
- "grad_norm": 0.1763259768486023,
86
- "kl": 0.0300628662109375,
87
  "learning_rate": 2.1156192081791355e-06,
88
- "loss": 0.0157,
89
- "reward": 1.5979911386966705,
90
- "reward_std": 0.2915887963026762,
91
- "rewards/accuracy_reward": 0.6209821671247482,
92
- "rewards/format_reward": 0.9770089671015739,
93
  "step": 25
94
  },
95
  {
96
  "clip_ratio": 0.0,
97
- "completion_length": 461.1984573364258,
98
  "epoch": 0.511727078891258,
99
- "grad_norm": 0.1527547538280487,
100
- "kl": 0.0370269775390625,
101
  "learning_rate": 1.6808050203829845e-06,
102
- "loss": 0.0155,
103
- "reward": 1.6714286535978318,
104
- "reward_std": 0.2001216158270836,
105
- "rewards/accuracy_reward": 0.6986607477068901,
106
- "rewards/format_reward": 0.9727678924798966,
107
  "step": 30
108
  },
109
  {
110
  "clip_ratio": 0.0,
111
- "completion_length": 458.88328018188474,
112
  "epoch": 0.5970149253731343,
113
- "grad_norm": 0.13492096960544586,
114
- "kl": 0.03984375,
115
  "learning_rate": 1.2296174432791415e-06,
116
- "loss": 0.0205,
117
- "reward": 1.6863840162754058,
118
- "reward_std": 0.19958442291244866,
119
- "rewards/accuracy_reward": 0.7189732484519482,
120
- "rewards/format_reward": 0.9674107521772385,
121
  "step": 35
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
- "completion_length": 435.2419822692871,
126
  "epoch": 0.6823027718550106,
127
- "grad_norm": 1.2669559717178345,
128
- "kl": 0.0412078857421875,
129
  "learning_rate": 8.029152419343472e-07,
130
- "loss": 0.0147,
131
- "reward": 1.7008929342031478,
132
- "reward_std": 0.18920395569875836,
133
- "rewards/accuracy_reward": 0.7238839611411094,
134
- "rewards/format_reward": 0.9770089611411095,
135
  "step": 40
136
  },
137
  {
138
  "clip_ratio": 0.0,
139
- "completion_length": 436.82925872802736,
140
  "epoch": 0.767590618336887,
141
- "grad_norm": 0.38188719749450684,
142
- "kl": 0.047900390625,
143
  "learning_rate": 4.3933982822017883e-07,
144
- "loss": 0.0174,
145
- "reward": 1.6915179312229156,
146
- "reward_std": 0.19770997650921346,
147
- "rewards/accuracy_reward": 0.712276816368103,
148
- "rewards/format_reward": 0.9792410984635354,
149
  "step": 45
150
  },
151
  {
152
  "clip_ratio": 0.0,
153
- "completion_length": 426.6221176147461,
154
  "epoch": 0.8528784648187633,
155
- "grad_norm": 1.376158356666565,
156
- "kl": 0.202972412109375,
157
  "learning_rate": 1.718159615201853e-07,
158
- "loss": 0.0264,
159
- "reward": 1.6868304401636123,
160
- "reward_std": 0.19814990404993296,
161
- "rewards/accuracy_reward": 0.7000000312924385,
162
- "rewards/format_reward": 0.986830385029316,
163
  "step": 50
164
  },
165
  {
166
  "clip_ratio": 0.0,
167
- "completion_length": 429.37256622314453,
168
  "epoch": 0.9381663113006397,
169
- "grad_norm": 2.8599460124969482,
170
- "kl": 0.0670989990234375,
171
  "learning_rate": 2.4570139579284723e-08,
172
- "loss": 0.0186,
173
- "reward": 1.722991144657135,
174
- "reward_std": 0.19592140736058355,
175
- "rewards/accuracy_reward": 0.7395089641213417,
176
- "rewards/format_reward": 0.9834821745753288,
177
  "step": 55
178
  },
179
  {
180
  "clip_ratio": 0.0,
181
- "completion_length": 415.95802815755206,
182
  "epoch": 0.9893390191897654,
183
- "kl": 0.0960235595703125,
184
- "reward": 1.7020090073347092,
185
- "reward_std": 0.19463430003573498,
186
- "rewards/accuracy_reward": 0.7127976529300213,
187
- "rewards/format_reward": 0.9892113382617632,
188
  "step": 58,
189
  "total_flos": 0.0,
190
- "train_loss": 0.020674003962555837,
191
- "train_runtime": 6417.2756,
192
- "train_samples_per_second": 1.169,
193
  "train_steps_per_second": 0.009
194
  }
195
  ],
 
12
  "clip_ratio": 0.0,
13
  "completion_length": 498.510066986084,
14
  "epoch": 0.017057569296375266,
15
+ "grad_norm": 1.116628856964355,
16
  "kl": 0.0,
17
  "learning_rate": 5e-07,
18
  "loss": 0.0115,
 
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
+ "completion_length": 487.5962829589844,
28
  "epoch": 0.08528784648187633,
29
+ "grad_norm": 329.581274808389,
30
+ "kl": 1.5817211270332336,
31
  "learning_rate": 2.5e-06,
32
+ "loss": 0.0633,
33
+ "reward": 0.7832031613215804,
34
+ "reward_std": 0.3882951531559229,
35
+ "rewards/accuracy_reward": 0.17801340064033866,
36
+ "rewards/format_reward": 0.6051897583529353,
37
  "step": 5
38
  },
39
  {
40
  "clip_ratio": 0.0,
41
+ "completion_length": 471.0462287902832,
42
  "epoch": 0.17057569296375266,
43
+ "grad_norm": 0.22688607924492205,
44
+ "kl": 0.0134246826171875,
45
  "learning_rate": 2.956412726139078e-06,
46
+ "loss": 0.0205,
47
+ "reward": 1.2479911297559738,
48
+ "reward_std": 0.33266205713152885,
49
+ "rewards/accuracy_reward": 0.3131696585565805,
50
+ "rewards/format_reward": 0.9348214715719223,
51
  "step": 10
52
  },
53
  {
54
  "clip_ratio": 0.0,
55
+ "completion_length": 441.45515899658204,
56
  "epoch": 0.255863539445629,
57
+ "grad_norm": 0.3322143554658319,
58
+ "kl": 0.01697235107421875,
59
  "learning_rate": 2.7836719084521715e-06,
60
+ "loss": 0.012,
61
+ "reward": 1.4209821969270706,
62
+ "reward_std": 0.33965970352292063,
63
+ "rewards/accuracy_reward": 0.45714287683367727,
64
+ "rewards/format_reward": 0.9638393223285675,
65
  "step": 15
66
  },
67
  {
68
  "clip_ratio": 0.0,
69
+ "completion_length": 421.6131874084473,
70
  "epoch": 0.3411513859275053,
71
+ "grad_norm": 0.20995700940835846,
72
+ "kl": 0.037835693359375,
73
  "learning_rate": 2.4946839873611927e-06,
74
+ "loss": 0.0117,
75
+ "reward": 1.646428656578064,
76
+ "reward_std": 0.2756752146407962,
77
+ "rewards/accuracy_reward": 0.6725446745753288,
78
+ "rewards/format_reward": 0.973883967101574,
79
  "step": 20
80
  },
81
  {
82
  "clip_ratio": 0.0,
83
+ "completion_length": 423.54466247558594,
84
  "epoch": 0.42643923240938164,
85
+ "grad_norm": 0.2012153869368259,
86
+ "kl": 0.037371826171875,
87
  "learning_rate": 2.1156192081791355e-06,
88
+ "loss": 0.0144,
89
+ "reward": 1.6178572177886963,
90
+ "reward_std": 0.27076737955212593,
91
+ "rewards/accuracy_reward": 0.645758955180645,
92
+ "rewards/format_reward": 0.9720982536673546,
93
  "step": 25
94
  },
95
  {
96
  "clip_ratio": 0.0,
97
+ "completion_length": 439.773233795166,
98
  "epoch": 0.511727078891258,
99
+ "grad_norm": 1.0763397890575324,
100
+ "kl": 0.0529998779296875,
101
  "learning_rate": 1.6808050203829845e-06,
102
+ "loss": 0.0176,
103
+ "reward": 1.668526867032051,
104
+ "reward_std": 0.21837877184152604,
105
+ "rewards/accuracy_reward": 0.6995536059141159,
106
+ "rewards/format_reward": 0.9689732551574707,
107
  "step": 30
108
  },
109
  {
110
  "clip_ratio": 0.0,
111
+ "completion_length": 446.2640869140625,
112
  "epoch": 0.5970149253731343,
113
+ "grad_norm": 0.15301980687370056,
114
+ "kl": 8.364321899414062,
115
  "learning_rate": 1.2296174432791415e-06,
116
+ "loss": 0.8027,
117
+ "reward": 1.6857143670320511,
118
+ "reward_std": 0.2085555238649249,
119
+ "rewards/accuracy_reward": 0.7165178850293159,
120
+ "rewards/format_reward": 0.9691964700818062,
121
  "step": 35
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
+ "completion_length": 428.8069389343262,
126
  "epoch": 0.6823027718550106,
127
+ "grad_norm": 0.16199556630311315,
128
+ "kl": 0.0596527099609375,
129
  "learning_rate": 8.029152419343472e-07,
130
+ "loss": 0.0204,
131
+ "reward": 1.6937500774860381,
132
+ "reward_std": 0.1925298016052693,
133
+ "rewards/accuracy_reward": 0.7194196820259094,
134
+ "rewards/format_reward": 0.9743303924798965,
135
  "step": 40
136
  },
137
  {
138
  "clip_ratio": 0.0,
139
+ "completion_length": 430.70425872802736,
140
  "epoch": 0.767590618336887,
141
+ "grad_norm": 1.6843827012960852,
142
+ "kl": 0.0576385498046875,
143
  "learning_rate": 4.3933982822017883e-07,
144
+ "loss": 0.0227,
145
+ "reward": 1.6881697207689286,
146
+ "reward_std": 0.2063945535570383,
147
+ "rewards/accuracy_reward": 0.7098214611411094,
148
+ "rewards/format_reward": 0.9783482521772384,
149
  "step": 45
150
  },
151
  {
152
  "clip_ratio": 0.0,
153
+ "completion_length": 423.26809921264646,
154
  "epoch": 0.8528784648187633,
155
+ "grad_norm": 5.03664692812433,
156
+ "kl": 2.7302520751953123,
157
  "learning_rate": 1.718159615201853e-07,
158
+ "loss": 0.1931,
159
+ "reward": 1.6832590103149414,
160
+ "reward_std": 0.20804516496136785,
161
+ "rewards/accuracy_reward": 0.7024553880095482,
162
+ "rewards/format_reward": 0.9808036014437675,
163
  "step": 50
164
  },
165
  {
166
  "clip_ratio": 0.0,
167
+ "completion_length": 423.15091094970705,
168
  "epoch": 0.9381663113006397,
169
+ "grad_norm": 2.620417003259849,
170
+ "kl": 0.3049896240234375,
171
  "learning_rate": 2.4570139579284723e-08,
172
+ "loss": 0.0318,
173
+ "reward": 1.7154018610715867,
174
+ "reward_std": 0.2086696395650506,
175
+ "rewards/accuracy_reward": 0.7316964611411094,
176
+ "rewards/format_reward": 0.9837053924798965,
177
  "step": 55
178
  },
179
  {
180
  "clip_ratio": 0.0,
181
+ "completion_length": 413.19120534261066,
182
  "epoch": 0.9893390191897654,
183
+ "kl": 0.060206095377604164,
184
+ "reward": 1.6845238904158275,
185
+ "reward_std": 0.19806722179055214,
186
+ "rewards/accuracy_reward": 0.6994047885139784,
187
+ "rewards/format_reward": 0.985119087000688,
188
  "step": 58,
189
  "total_flos": 0.0,
190
+ "train_loss": 0.10446281573767292,
191
+ "train_runtime": 6557.3192,
192
+ "train_samples_per_second": 1.144,
193
  "train_steps_per_second": 0.009
194
  }
195
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2b52bc508eabf11f85b23f9777010f62ee53072d9454f224c22b6871df1b509
3
  size 7992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c75d1a2753b530396acfc94ef1440941ec9e81cc653e6b5f5422469864e2bccb
3
  size 7992