Model save

Browse files

Files changed (10) hide show

README.md +2 -4
all_results.json +3 -3
config.json +1 -1
model-00001-of-00004.safetensors +1 -1
model-00002-of-00004.safetensors +1 -1
model-00003-of-00004.safetensors +1 -1
model-00004-of-00004.safetensors +1 -1
train_results.json +3 -3
trainer_state.json +98 -98
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -1,11 +1,9 @@
 ---
 base_model: Qwen/Qwen2.5-7B-Instruct
-datasets: DigitalLearningGmbH/MATH-lighteval
 library_name: transformers
 model_name: Qwen2.5-7B-Instruct-GRPO
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - grpo
 licence: license
@@ -13,7 +11,7 @@ licence: license
 # Model Card for Qwen2.5-7B-Instruct-GRPO
-This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the [DigitalLearningGmbH/MATH-lighteval](https://huggingface.co/datasets/DigitalLearningGmbH/MATH-lighteval) dataset.
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/llm-m_wandb-weblab/Qwen2.5-7B-Instruct-GRPO/runs/potnc7q9)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

 ---
 base_model: Qwen/Qwen2.5-7B-Instruct
 library_name: transformers
 model_name: Qwen2.5-7B-Instruct-GRPO
 tags:
 - generated_from_trainer
 - trl
 - grpo
 licence: license
 # Model Card for Qwen2.5-7B-Instruct-GRPO
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/llm-m_wandb-weblab/Qwen2.5-7B-Instruct-GRPO/runs/g31l17lf)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.020674003962555837,
-    "train_runtime": 6417.2756,
     "train_samples": 7500,
-    "train_samples_per_second": 1.169,
     "train_steps_per_second": 0.009
 }

 {
     "total_flos": 0.0,
+    "train_loss": 0.10446281573767292,
+    "train_runtime": 6557.3192,
     "train_samples": 7500,
+    "train_samples_per_second": 1.144,
     "train_steps_per_second": 0.009
 }

config.json CHANGED Viewed

@@ -23,7 +23,7 @@
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.49.0",
-  "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 152064
 }

   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.49.0",
+  "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 152064
 }

model-00001-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:247735adddbbd9944bf2dc1cc35ccc9b4cfd5e79ef3d8c3ab7340c7f26a0955e
 size 4877660776

 version https://git-lfs.github.com/spec/v1
+oid sha256:b6874b107a5edb2eb4e9ee32fef98e0a76195ecbd6de7bb231de9c6b5a41fc0a
 size 4877660776

model-00002-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3b9ef086092554c80dab98bd0bfbaa398b21a3d2e367bf94ad86f1e05c6ce509
 size 4932751008

 version https://git-lfs.github.com/spec/v1
+oid sha256:46bd31973c9ac5df46fd509623c7eef1bb6b0fa42cd0dde2de93ae7124403ef3
 size 4932751008

model-00003-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e80ee5064fdd08d096850130cfbb55ce4521b8c8f8f019740bac43298665148f
 size 4330865200

 version https://git-lfs.github.com/spec/v1
+oid sha256:26c84a8977158b0268f6d10ac89c52a39796d5e4e1802b698aeadb8873034f4c
 size 4330865200

model-00004-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fe67c93ef45e03e5b117ec79ebbbcc459cf2fe2e878531f0a57c2da06c2fc0ef
 size 1089994880

 version https://git-lfs.github.com/spec/v1
+oid sha256:c651953533fcd55d348f1205d59a4230bd223834ba89b7689bb7b54bb0f32bf0
 size 1089994880

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.020674003962555837,
-    "train_runtime": 6417.2756,
     "train_samples": 7500,
-    "train_samples_per_second": 1.169,
     "train_steps_per_second": 0.009
 }

 {
     "total_flos": 0.0,
+    "train_loss": 0.10446281573767292,
+    "train_runtime": 6557.3192,
     "train_samples": 7500,
+    "train_samples_per_second": 1.144,
     "train_steps_per_second": 0.009
 }

trainer_state.json CHANGED Viewed

@@ -12,7 +12,7 @@
       "clip_ratio": 0.0,
       "completion_length": 498.510066986084,
       "epoch": 0.017057569296375266,
-      "grad_norm": 1.1215301752090454,
       "kl": 0.0,
       "learning_rate": 5e-07,
       "loss": 0.0115,
@@ -24,172 +24,172 @@
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 487.4813299179077,
       "epoch": 0.08528784648187633,
-      "grad_norm": 476.3103332519531,
-      "kl": 1.7723130583763123,
       "learning_rate": 2.5e-06,
-      "loss": 0.0822,
-      "reward": 0.7912946743890643,
-      "reward_std": 0.3657265743240714,
-      "rewards/accuracy_reward": 0.1872209922876209,
-      "rewards/format_reward": 0.6040736874565482,
       "step": 5
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 466.69912643432616,
       "epoch": 0.17057569296375266,
-      "grad_norm": 0.41791579127311707,
-      "kl": 0.012205886840820312,
       "learning_rate": 2.956412726139078e-06,
-      "loss": 0.0203,
-      "reward": 1.2589286297559739,
-      "reward_std": 0.32850122936069964,
-      "rewards/accuracy_reward": 0.3185267999768257,
-      "rewards/format_reward": 0.9404018238186836,
       "step": 10
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 444.567431640625,
       "epoch": 0.255863539445629,
-      "grad_norm": 0.6771596670150757,
-      "kl": 0.02176055908203125,
       "learning_rate": 2.7836719084521715e-06,
-      "loss": 0.0059,
-      "reward": 1.4082589864730835,
-      "reward_std": 0.3335796441882849,
-      "rewards/accuracy_reward": 0.441294664144516,
-      "rewards/format_reward": 0.9669643238186836,
       "step": 15
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 422.7493499755859,
       "epoch": 0.3411513859275053,
-      "grad_norm": 0.23769104480743408,
-      "kl": 0.026959228515625,
       "learning_rate": 2.4946839873611927e-06,
-      "loss": 0.01,
-      "reward": 1.4937500715255738,
-      "reward_std": 0.3376178216189146,
-      "rewards/accuracy_reward": 0.5156250238418579,
-      "rewards/format_reward": 0.9781250357627869,
       "step": 20
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 433.67725372314453,
       "epoch": 0.42643923240938164,
-      "grad_norm": 0.1763259768486023,
-      "kl": 0.0300628662109375,
       "learning_rate": 2.1156192081791355e-06,
-      "loss": 0.0157,
-      "reward": 1.5979911386966705,
-      "reward_std": 0.2915887963026762,
-      "rewards/accuracy_reward": 0.6209821671247482,
-      "rewards/format_reward": 0.9770089671015739,
       "step": 25
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 461.1984573364258,
       "epoch": 0.511727078891258,
-      "grad_norm": 0.1527547538280487,
-      "kl": 0.0370269775390625,
       "learning_rate": 1.6808050203829845e-06,
-      "loss": 0.0155,
-      "reward": 1.6714286535978318,
-      "reward_std": 0.2001216158270836,
-      "rewards/accuracy_reward": 0.6986607477068901,
-      "rewards/format_reward": 0.9727678924798966,
       "step": 30
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 458.88328018188474,
       "epoch": 0.5970149253731343,
-      "grad_norm": 0.13492096960544586,
-      "kl": 0.03984375,
       "learning_rate": 1.2296174432791415e-06,
-      "loss": 0.0205,
-      "reward": 1.6863840162754058,
-      "reward_std": 0.19958442291244866,
-      "rewards/accuracy_reward": 0.7189732484519482,
-      "rewards/format_reward": 0.9674107521772385,
       "step": 35
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 435.2419822692871,
       "epoch": 0.6823027718550106,
-      "grad_norm": 1.2669559717178345,
-      "kl": 0.0412078857421875,
       "learning_rate": 8.029152419343472e-07,
-      "loss": 0.0147,
-      "reward": 1.7008929342031478,
-      "reward_std": 0.18920395569875836,
-      "rewards/accuracy_reward": 0.7238839611411094,
-      "rewards/format_reward": 0.9770089611411095,
       "step": 40
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 436.82925872802736,
       "epoch": 0.767590618336887,
-      "grad_norm": 0.38188719749450684,
-      "kl": 0.047900390625,
       "learning_rate": 4.3933982822017883e-07,
-      "loss": 0.0174,
-      "reward": 1.6915179312229156,
-      "reward_std": 0.19770997650921346,
-      "rewards/accuracy_reward": 0.712276816368103,
-      "rewards/format_reward": 0.9792410984635354,
       "step": 45
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 426.6221176147461,
       "epoch": 0.8528784648187633,
-      "grad_norm": 1.376158356666565,
-      "kl": 0.202972412109375,
       "learning_rate": 1.718159615201853e-07,
-      "loss": 0.0264,
-      "reward": 1.6868304401636123,
-      "reward_std": 0.19814990404993296,
-      "rewards/accuracy_reward": 0.7000000312924385,
-      "rewards/format_reward": 0.986830385029316,
       "step": 50
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 429.37256622314453,
       "epoch": 0.9381663113006397,
-      "grad_norm": 2.8599460124969482,
-      "kl": 0.0670989990234375,
       "learning_rate": 2.4570139579284723e-08,
-      "loss": 0.0186,
-      "reward": 1.722991144657135,
-      "reward_std": 0.19592140736058355,
-      "rewards/accuracy_reward": 0.7395089641213417,
-      "rewards/format_reward": 0.9834821745753288,
       "step": 55
     },
     {
       "clip_ratio": 0.0,
-      "completion_length": 415.95802815755206,
       "epoch": 0.9893390191897654,
-      "kl": 0.0960235595703125,
-      "reward": 1.7020090073347092,
-      "reward_std": 0.19463430003573498,
-      "rewards/accuracy_reward": 0.7127976529300213,
-      "rewards/format_reward": 0.9892113382617632,
       "step": 58,
       "total_flos": 0.0,
-      "train_loss": 0.020674003962555837,
-      "train_runtime": 6417.2756,
-      "train_samples_per_second": 1.169,
       "train_steps_per_second": 0.009
     }
   ],

       "clip_ratio": 0.0,
       "completion_length": 498.510066986084,
       "epoch": 0.017057569296375266,
+      "grad_norm": 1.116628856964355,
       "kl": 0.0,
       "learning_rate": 5e-07,
       "loss": 0.0115,
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 487.5962829589844,
       "epoch": 0.08528784648187633,
+      "grad_norm": 329.581274808389,
+      "kl": 1.5817211270332336,
       "learning_rate": 2.5e-06,
+      "loss": 0.0633,
+      "reward": 0.7832031613215804,
+      "reward_std": 0.3882951531559229,
+      "rewards/accuracy_reward": 0.17801340064033866,
+      "rewards/format_reward": 0.6051897583529353,
       "step": 5
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 471.0462287902832,
       "epoch": 0.17057569296375266,
+      "grad_norm": 0.22688607924492205,
+      "kl": 0.0134246826171875,
       "learning_rate": 2.956412726139078e-06,
+      "loss": 0.0205,
+      "reward": 1.2479911297559738,
+      "reward_std": 0.33266205713152885,
+      "rewards/accuracy_reward": 0.3131696585565805,
+      "rewards/format_reward": 0.9348214715719223,
       "step": 10
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 441.45515899658204,
       "epoch": 0.255863539445629,
+      "grad_norm": 0.3322143554658319,
+      "kl": 0.01697235107421875,
       "learning_rate": 2.7836719084521715e-06,
+      "loss": 0.012,
+      "reward": 1.4209821969270706,
+      "reward_std": 0.33965970352292063,
+      "rewards/accuracy_reward": 0.45714287683367727,
+      "rewards/format_reward": 0.9638393223285675,
       "step": 15
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 421.6131874084473,
       "epoch": 0.3411513859275053,
+      "grad_norm": 0.20995700940835846,
+      "kl": 0.037835693359375,
       "learning_rate": 2.4946839873611927e-06,
+      "loss": 0.0117,
+      "reward": 1.646428656578064,
+      "reward_std": 0.2756752146407962,
+      "rewards/accuracy_reward": 0.6725446745753288,
+      "rewards/format_reward": 0.973883967101574,
       "step": 20
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 423.54466247558594,
       "epoch": 0.42643923240938164,
+      "grad_norm": 0.2012153869368259,
+      "kl": 0.037371826171875,
       "learning_rate": 2.1156192081791355e-06,
+      "loss": 0.0144,
+      "reward": 1.6178572177886963,
+      "reward_std": 0.27076737955212593,
+      "rewards/accuracy_reward": 0.645758955180645,
+      "rewards/format_reward": 0.9720982536673546,
       "step": 25
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 439.773233795166,
       "epoch": 0.511727078891258,
+      "grad_norm": 1.0763397890575324,
+      "kl": 0.0529998779296875,
       "learning_rate": 1.6808050203829845e-06,
+      "loss": 0.0176,
+      "reward": 1.668526867032051,
+      "reward_std": 0.21837877184152604,
+      "rewards/accuracy_reward": 0.6995536059141159,
+      "rewards/format_reward": 0.9689732551574707,
       "step": 30
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 446.2640869140625,
       "epoch": 0.5970149253731343,
+      "grad_norm": 0.15301980687370056,
+      "kl": 8.364321899414062,
       "learning_rate": 1.2296174432791415e-06,
+      "loss": 0.8027,
+      "reward": 1.6857143670320511,
+      "reward_std": 0.2085555238649249,
+      "rewards/accuracy_reward": 0.7165178850293159,
+      "rewards/format_reward": 0.9691964700818062,
       "step": 35
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 428.8069389343262,
       "epoch": 0.6823027718550106,
+      "grad_norm": 0.16199556630311315,
+      "kl": 0.0596527099609375,
       "learning_rate": 8.029152419343472e-07,
+      "loss": 0.0204,
+      "reward": 1.6937500774860381,
+      "reward_std": 0.1925298016052693,
+      "rewards/accuracy_reward": 0.7194196820259094,
+      "rewards/format_reward": 0.9743303924798965,
       "step": 40
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 430.70425872802736,
       "epoch": 0.767590618336887,
+      "grad_norm": 1.6843827012960852,
+      "kl": 0.0576385498046875,
       "learning_rate": 4.3933982822017883e-07,
+      "loss": 0.0227,
+      "reward": 1.6881697207689286,
+      "reward_std": 0.2063945535570383,
+      "rewards/accuracy_reward": 0.7098214611411094,
+      "rewards/format_reward": 0.9783482521772384,
       "step": 45
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 423.26809921264646,
       "epoch": 0.8528784648187633,
+      "grad_norm": 5.03664692812433,
+      "kl": 2.7302520751953123,
       "learning_rate": 1.718159615201853e-07,
+      "loss": 0.1931,
+      "reward": 1.6832590103149414,
+      "reward_std": 0.20804516496136785,
+      "rewards/accuracy_reward": 0.7024553880095482,
+      "rewards/format_reward": 0.9808036014437675,
       "step": 50
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 423.15091094970705,
       "epoch": 0.9381663113006397,
+      "grad_norm": 2.620417003259849,
+      "kl": 0.3049896240234375,
       "learning_rate": 2.4570139579284723e-08,
+      "loss": 0.0318,
+      "reward": 1.7154018610715867,
+      "reward_std": 0.2086696395650506,
+      "rewards/accuracy_reward": 0.7316964611411094,
+      "rewards/format_reward": 0.9837053924798965,
       "step": 55
     },
     {
       "clip_ratio": 0.0,
+      "completion_length": 413.19120534261066,
       "epoch": 0.9893390191897654,
+      "kl": 0.060206095377604164,
+      "reward": 1.6845238904158275,
+      "reward_std": 0.19806722179055214,
+      "rewards/accuracy_reward": 0.6994047885139784,
+      "rewards/format_reward": 0.985119087000688,
       "step": 58,
       "total_flos": 0.0,
+      "train_loss": 0.10446281573767292,
+      "train_runtime": 6557.3192,
+      "train_samples_per_second": 1.144,
       "train_steps_per_second": 0.009
     }
   ],

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d2b52bc508eabf11f85b23f9777010f62ee53072d9454f224c22b6871df1b509
 size 7992

 version https://git-lfs.github.com/spec/v1
+oid sha256:c75d1a2753b530396acfc94ef1440941ec9e81cc653e6b5f5422469864e2bccb
 size 7992