rs545837 committed on
Commit bd5e9d3 · verified · 1 Parent(s): ac39a6f

Upload folder using huggingface_hub
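The commit message indicates the checkpoint folder was pushed with huggingface_hub. A minimal sketch of the kind of call that produces such a commit is given below; the local folder path and repo id are placeholders, not values taken from this commit.

```python
# Hedged sketch: pushing a local checkpoint folder with huggingface_hub.
# folder_path and repo_id are placeholders; only the commit message matches the commit above.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./checkpoint-13908",   # local checkpoint directory (assumed)
    repo_id="rs545837/<model-repo>",    # destination repo (placeholder)
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```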
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c9b2c4db3a2d556ee001eb8fdf128d644b789b40b2f7cf64684b6fe78989053b
+ oid sha256:b0f1a637ff5efd3a742cf0adbf6c58e6934d9a4a6215f862074742126fcf08c2
  size 213625344
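Each of the changed files is stored as a Git LFS pointer, so the diff only shows the pointer's `oid sha256:` and `size` fields changing. A minimal sketch for checking that a locally downloaded blob matches the new pointer (oid and size copied from the hunk above; the local path is assumed):

```python
# Sketch: verify a downloaded model.safetensors against the LFS pointer above.
import hashlib
from pathlib import Path

def lfs_oid_and_size(path):
    """Return (sha256 hex digest, byte size) of a local file, streamed in 1 MiB chunks."""
    h = hashlib.sha256()
    p = Path(path)
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest(), p.stat().st_size

oid, size = lfs_oid_and_size("model.safetensors")  # assumed local copy of the resolved blob
assert size == 213625344
assert oid == "b0f1a637ff5efd3a742cf0adbf6c58e6934d9a4a6215f862074742126fcf08c2"
```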
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:129003e0eae631ea59961c42baa93d6ece566b523e5b57d228356535ea34946d
+ oid sha256:bf290f7d31c224d60ac5c8ed732a0ecc556301b4c61d776c8545f9109d532312
  size 427334458
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:593cfa780b4b09ba583a139eb81c2eae72992c19fc5f8f38c81bd37ea47dbe04
+ oid sha256:435089f27ea78cb1b5ffb371da67f47c7a3dea92ab07479122e62b4e8dbeed97
  size 16433
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9560f8c460a856a55828494146d2d52ecf0d95a3dec5919d8f29a972450cec34
+ oid sha256:35c3205fe632396691980ab13bb747592db3b39a8f9bc42c6b4bce2ebc4e86d2
  size 16433
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3e7b7bce88125710e1c78de933cf62b48d6ec5a97b36fc43a09d7f70aebd0307
+ oid sha256:bdd18dda129c8617269378af8c4207e690d6ed4efdbda0ce1aa5947221052d4a
  size 16433
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4cb51f987fa17a879db4d368b0564564ae49379d5c5ce803d79d24b4b5a43c13
+ oid sha256:7d14c531768fd1b817e6ad83f2878c07af9aa939a5372ffc3020b84164720063
  size 16433
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:33ed1a64882ec192e7bdbd0b9dda7c3dd977bc8ef889d26ddca3e2380d9debae
+ oid sha256:6343b5ecd40e08336f425dc8913f4b43aad1a7465797446cfa1892e3cb3133f3
  size 16433
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4dd0f015bb5215ff40f1555f5c47c89a7bd89b00e7ef4568ca045dc1c2b5514a
+ oid sha256:0492bd561ad6444b11560f52c5a570d34e2dcc461489aae4bd2703fa55d13f47
  size 16433
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ac5bc7ba9f4a9e405864d41e902bc7509a5b6fa554a6cf09f24491e00dac06fb
+ oid sha256:7a181670929d07bdff88356f8cc3404c6ef87b1e788fc5b1dc6f473ac9b2bc12
  size 16433
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:77e384968f192bdc497600d0108b82695a12247413143c0c9bd4e09fbb718212
+ oid sha256:23b0154dd6c58d1151b932c0d3de52258209b2d646a7e6e34de818d4dfa12a13
  size 16433
rng_state_8.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cc46c9518ec829f507eb5833115c977024d13a12bc4e0ecff2238d818e6eb6dc
+ oid sha256:a0c53426dcd44d43fe238422e433fb361c04df5585664ed42f08da1365fc112f
  size 16433
rng_state_9.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:56cb5d3bf2f9602568c76013ebf1c626061418df7f27c779c448921d362d5232
+ oid sha256:268b7b1ca55cb9d2d96542e3789b74164ec07561d7774458ecef281e2a3ad163
  size 16433
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:83a9c85c7b2c29125f99f000e54e900b05be0859260af7e4a0abf634beb2c469
+ oid sha256:7fc0c0b23c1fbf47eda082a36b0831feacf86e3aa80c09efc720587a7e961c90
  size 1064
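The ten rng_state_0.pth … rng_state_9.pth files suggest this checkpoint was written by a run with ten data-parallel processes, with one RNG snapshot per rank (as the Hugging Face Trainer typically does for multi-process runs). A small, hedged sanity check over a local copy of the checkpoint folder (the directory name is a placeholder):

```python
# Sketch: confirm there is one per-rank RNG file for each of the 10 ranks seen in this commit.
from pathlib import Path

ckpt = Path("./checkpoint-13908")  # placeholder local checkpoint directory
ranks = sorted(int(p.stem.rsplit("_", 1)[1]) for p in ckpt.glob("rng_state_*.pth"))
assert ranks == list(range(10)), ranks
```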
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.3998792426790874,
+ "epoch": 0.5998188640186312,
  "eval_steps": 2318,
- "global_step": 9272,
+ "global_step": 13908,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -2856,6 +2856,1427 @@
  "eval_samples_per_second": 610.045,
  "eval_steps_per_second": 7.626,
  "step": 9272
2859
+ },
2860
+ {
2861
+ "epoch": 0.4007417949713201,
2862
+ "grad_norm": 0.408203125,
2863
+ "learning_rate": 0.0005998532205145915,
2864
+ "loss": 1.0191,
2865
+ "step": 9292
2866
+ },
2867
+ {
2868
+ "epoch": 0.4017337301073878,
2869
+ "grad_norm": 0.46484375,
2870
+ "learning_rate": 0.0005988603004662408,
2871
+ "loss": 1.0074,
2872
+ "step": 9315
2873
+ },
2874
+ {
2875
+ "epoch": 0.4027256652434554,
2876
+ "grad_norm": 0.353515625,
2877
+ "learning_rate": 0.0005978673804178898,
2878
+ "loss": 1.0052,
2879
+ "step": 9338
2880
+ },
2881
+ {
2882
+ "epoch": 0.40371760037952303,
2883
+ "grad_norm": 0.361328125,
2884
+ "learning_rate": 0.000596874460369539,
2885
+ "loss": 1.0248,
2886
+ "step": 9361
2887
+ },
2888
+ {
2889
+ "epoch": 0.40470953551559063,
2890
+ "grad_norm": 0.376953125,
2891
+ "learning_rate": 0.000595881540321188,
2892
+ "loss": 1.0328,
2893
+ "step": 9384
2894
+ },
2895
+ {
2896
+ "epoch": 0.40570147065165824,
2897
+ "grad_norm": 0.4453125,
2898
+ "learning_rate": 0.0005948886202728372,
2899
+ "loss": 1.0254,
2900
+ "step": 9407
2901
+ },
2902
+ {
2903
+ "epoch": 0.4066934057877259,
2904
+ "grad_norm": 0.302734375,
2905
+ "learning_rate": 0.0005938957002244862,
2906
+ "loss": 1.0285,
2907
+ "step": 9430
2908
+ },
2909
+ {
2910
+ "epoch": 0.4076853409237935,
2911
+ "grad_norm": 0.439453125,
2912
+ "learning_rate": 0.0005929027801761354,
2913
+ "loss": 1.0077,
2914
+ "step": 9453
2915
+ },
2916
+ {
2917
+ "epoch": 0.40867727605986115,
2918
+ "grad_norm": 0.3515625,
2919
+ "learning_rate": 0.0005919098601277845,
2920
+ "loss": 1.0181,
2921
+ "step": 9476
2922
+ },
2923
+ {
2924
+ "epoch": 0.40966921119592875,
2925
+ "grad_norm": 0.33203125,
2926
+ "learning_rate": 0.0005909169400794336,
2927
+ "loss": 1.0427,
2928
+ "step": 9499
2929
+ },
2930
+ {
2931
+ "epoch": 0.41066114633199635,
2932
+ "grad_norm": 0.380859375,
2933
+ "learning_rate": 0.0005899240200310827,
2934
+ "loss": 1.0133,
2935
+ "step": 9522
2936
+ },
2937
+ {
2938
+ "epoch": 0.411653081468064,
2939
+ "grad_norm": 0.40234375,
2940
+ "learning_rate": 0.0005889310999827318,
2941
+ "loss": 1.0187,
2942
+ "step": 9545
2943
+ },
2944
+ {
2945
+ "epoch": 0.4126450166041316,
2946
+ "grad_norm": 0.44921875,
2947
+ "learning_rate": 0.000587938179934381,
2948
+ "loss": 1.0091,
2949
+ "step": 9568
2950
+ },
2951
+ {
2952
+ "epoch": 0.41363695174019927,
2953
+ "grad_norm": 0.37109375,
2954
+ "learning_rate": 0.0005869452598860301,
2955
+ "loss": 1.002,
2956
+ "step": 9591
2957
+ },
2958
+ {
2959
+ "epoch": 0.41462888687626687,
2960
+ "grad_norm": 0.47265625,
2961
+ "learning_rate": 0.0005859523398376792,
2962
+ "loss": 1.0321,
2963
+ "step": 9614
2964
+ },
2965
+ {
2966
+ "epoch": 0.4156208220123345,
2967
+ "grad_norm": 0.4453125,
2968
+ "learning_rate": 0.0005849594197893283,
2969
+ "loss": 1.0244,
2970
+ "step": 9637
2971
+ },
2972
+ {
2973
+ "epoch": 0.4166127571484021,
2974
+ "grad_norm": 0.4375,
2975
+ "learning_rate": 0.0005839664997409773,
2976
+ "loss": 1.0345,
2977
+ "step": 9660
2978
+ },
2979
+ {
2980
+ "epoch": 0.4176046922844697,
2981
+ "grad_norm": 0.357421875,
2982
+ "learning_rate": 0.0005829735796926266,
2983
+ "loss": 1.0157,
2984
+ "step": 9683
2985
+ },
2986
+ {
2987
+ "epoch": 0.4185966274205374,
2988
+ "grad_norm": 0.3515625,
2989
+ "learning_rate": 0.0005819806596442756,
2990
+ "loss": 0.9774,
2991
+ "step": 9706
2992
+ },
2993
+ {
2994
+ "epoch": 0.419588562556605,
2995
+ "grad_norm": 0.408203125,
2996
+ "learning_rate": 0.0005809877395959248,
2997
+ "loss": 1.0011,
2998
+ "step": 9729
2999
+ },
3000
+ {
3001
+ "epoch": 0.42058049769267264,
3002
+ "grad_norm": 0.41015625,
3003
+ "learning_rate": 0.0005799948195475738,
3004
+ "loss": 1.0148,
3005
+ "step": 9752
3006
+ },
3007
+ {
3008
+ "epoch": 0.42157243282874024,
3009
+ "grad_norm": 0.53515625,
3010
+ "learning_rate": 0.000579001899499223,
3011
+ "loss": 1.0058,
3012
+ "step": 9775
3013
+ },
3014
+ {
3015
+ "epoch": 0.42256436796480784,
3016
+ "grad_norm": 0.37890625,
3017
+ "learning_rate": 0.000578008979450872,
3018
+ "loss": 1.0241,
3019
+ "step": 9798
3020
+ },
3021
+ {
3022
+ "epoch": 0.4235563031008755,
3023
+ "grad_norm": 0.38671875,
3024
+ "learning_rate": 0.0005770160594025212,
3025
+ "loss": 1.0067,
3026
+ "step": 9821
3027
+ },
3028
+ {
3029
+ "epoch": 0.4245482382369431,
3030
+ "grad_norm": 0.3359375,
3031
+ "learning_rate": 0.0005760231393541703,
3032
+ "loss": 1.0105,
3033
+ "step": 9844
3034
+ },
3035
+ {
3036
+ "epoch": 0.42554017337301075,
3037
+ "grad_norm": 0.40625,
3038
+ "learning_rate": 0.0005750302193058193,
3039
+ "loss": 1.0226,
3040
+ "step": 9867
3041
+ },
3042
+ {
3043
+ "epoch": 0.42653210850907836,
3044
+ "grad_norm": 0.392578125,
3045
+ "learning_rate": 0.0005740372992574685,
3046
+ "loss": 1.0243,
3047
+ "step": 9890
3048
+ },
3049
+ {
3050
+ "epoch": 0.427524043645146,
3051
+ "grad_norm": 0.419921875,
3052
+ "learning_rate": 0.0005730443792091175,
3053
+ "loss": 1.0128,
3054
+ "step": 9913
3055
+ },
3056
+ {
3057
+ "epoch": 0.4285159787812136,
3058
+ "grad_norm": 0.373046875,
3059
+ "learning_rate": 0.0005720514591607668,
3060
+ "loss": 0.9993,
3061
+ "step": 9936
3062
+ },
3063
+ {
3064
+ "epoch": 0.4295079139172812,
3065
+ "grad_norm": 0.365234375,
3066
+ "learning_rate": 0.0005710585391124158,
3067
+ "loss": 1.012,
3068
+ "step": 9959
3069
+ },
3070
+ {
3071
+ "epoch": 0.43049984905334887,
3072
+ "grad_norm": 0.302734375,
3073
+ "learning_rate": 0.000570065619064065,
3074
+ "loss": 1.0178,
3075
+ "step": 9982
3076
+ },
3077
+ {
3078
+ "epoch": 0.43149178418941647,
3079
+ "grad_norm": 0.3671875,
3080
+ "learning_rate": 0.000569072699015714,
3081
+ "loss": 1.0192,
3082
+ "step": 10005
3083
+ },
3084
+ {
3085
+ "epoch": 0.43248371932548413,
3086
+ "grad_norm": 0.345703125,
3087
+ "learning_rate": 0.0005680797789673631,
3088
+ "loss": 0.9992,
3089
+ "step": 10028
3090
+ },
3091
+ {
3092
+ "epoch": 0.43347565446155173,
3093
+ "grad_norm": 0.4140625,
3094
+ "learning_rate": 0.0005670868589190123,
3095
+ "loss": 1.0033,
3096
+ "step": 10051
3097
+ },
3098
+ {
3099
+ "epoch": 0.43446758959761933,
3100
+ "grad_norm": 0.359375,
3101
+ "learning_rate": 0.0005660939388706614,
3102
+ "loss": 1.0153,
3103
+ "step": 10074
3104
+ },
3105
+ {
3106
+ "epoch": 0.435459524733687,
3107
+ "grad_norm": 0.36328125,
3108
+ "learning_rate": 0.0005651010188223105,
3109
+ "loss": 0.997,
3110
+ "step": 10097
3111
+ },
3112
+ {
3113
+ "epoch": 0.4364514598697546,
3114
+ "grad_norm": 0.3671875,
3115
+ "learning_rate": 0.0005641080987739596,
3116
+ "loss": 1.0147,
3117
+ "step": 10120
3118
+ },
3119
+ {
3120
+ "epoch": 0.43744339500582224,
3121
+ "grad_norm": 0.3515625,
3122
+ "learning_rate": 0.0005631151787256087,
3123
+ "loss": 1.0046,
3124
+ "step": 10143
3125
+ },
3126
+ {
3127
+ "epoch": 0.43843533014188985,
3128
+ "grad_norm": 0.37890625,
3129
+ "learning_rate": 0.0005621222586772579,
3130
+ "loss": 1.0008,
3131
+ "step": 10166
3132
+ },
3133
+ {
3134
+ "epoch": 0.4394272652779575,
3135
+ "grad_norm": 0.41796875,
3136
+ "learning_rate": 0.0005611293386289069,
3137
+ "loss": 1.0315,
3138
+ "step": 10189
3139
+ },
3140
+ {
3141
+ "epoch": 0.4404192004140251,
3142
+ "grad_norm": 0.365234375,
3143
+ "learning_rate": 0.0005601364185805561,
3144
+ "loss": 1.0164,
3145
+ "step": 10212
3146
+ },
3147
+ {
3148
+ "epoch": 0.4414111355500927,
3149
+ "grad_norm": 0.361328125,
3150
+ "learning_rate": 0.0005591434985322051,
3151
+ "loss": 1.0044,
3152
+ "step": 10235
3153
+ },
3154
+ {
3155
+ "epoch": 0.44240307068616036,
3156
+ "grad_norm": 0.439453125,
3157
+ "learning_rate": 0.0005581505784838543,
3158
+ "loss": 1.0236,
3159
+ "step": 10258
3160
+ },
3161
+ {
3162
+ "epoch": 0.44339500582222796,
3163
+ "grad_norm": 0.341796875,
3164
+ "learning_rate": 0.0005571576584355034,
3165
+ "loss": 1.0156,
3166
+ "step": 10281
3167
+ },
3168
+ {
3169
+ "epoch": 0.4443869409582956,
3170
+ "grad_norm": 0.373046875,
3171
+ "learning_rate": 0.0005561647383871526,
3172
+ "loss": 0.9916,
3173
+ "step": 10304
3174
+ },
3175
+ {
3176
+ "epoch": 0.4453788760943632,
3177
+ "grad_norm": 0.33203125,
3178
+ "learning_rate": 0.0005551718183388016,
3179
+ "loss": 0.9961,
3180
+ "step": 10327
3181
+ },
3182
+ {
3183
+ "epoch": 0.4463708112304308,
3184
+ "grad_norm": 0.392578125,
3185
+ "learning_rate": 0.0005541788982904507,
3186
+ "loss": 1.0021,
3187
+ "step": 10350
3188
+ },
3189
+ {
3190
+ "epoch": 0.4473627463664985,
3191
+ "grad_norm": 0.375,
3192
+ "learning_rate": 0.0005531859782420998,
3193
+ "loss": 1.0219,
3194
+ "step": 10373
3195
+ },
3196
+ {
3197
+ "epoch": 0.4483546815025661,
3198
+ "grad_norm": 0.4140625,
3199
+ "learning_rate": 0.000552193058193749,
3200
+ "loss": 0.9982,
3201
+ "step": 10396
3202
+ },
3203
+ {
3204
+ "epoch": 0.44934661663863373,
3205
+ "grad_norm": 0.392578125,
3206
+ "learning_rate": 0.0005512001381453981,
3207
+ "loss": 0.994,
3208
+ "step": 10419
3209
+ },
3210
+ {
3211
+ "epoch": 0.45033855177470133,
3212
+ "grad_norm": 0.34765625,
3213
+ "learning_rate": 0.0005502072180970471,
3214
+ "loss": 0.9899,
3215
+ "step": 10442
3216
+ },
3217
+ {
3218
+ "epoch": 0.451330486910769,
3219
+ "grad_norm": 0.3828125,
3220
+ "learning_rate": 0.0005492142980486963,
3221
+ "loss": 1.0096,
3222
+ "step": 10465
3223
+ },
3224
+ {
3225
+ "epoch": 0.4523224220468366,
3226
+ "grad_norm": 0.3984375,
3227
+ "learning_rate": 0.0005482213780003453,
3228
+ "loss": 0.9882,
3229
+ "step": 10488
3230
+ },
3231
+ {
3232
+ "epoch": 0.4533143571829042,
3233
+ "grad_norm": 0.390625,
3234
+ "learning_rate": 0.0005472284579519945,
3235
+ "loss": 0.999,
3236
+ "step": 10511
3237
+ },
3238
+ {
3239
+ "epoch": 0.45430629231897185,
3240
+ "grad_norm": 0.3984375,
3241
+ "learning_rate": 0.0005462355379036436,
3242
+ "loss": 1.0087,
3243
+ "step": 10534
3244
+ },
3245
+ {
3246
+ "epoch": 0.45529822745503945,
3247
+ "grad_norm": 0.388671875,
3248
+ "learning_rate": 0.0005452426178552927,
3249
+ "loss": 0.9985,
3250
+ "step": 10557
3251
+ },
3252
+ {
3253
+ "epoch": 0.4562901625911071,
3254
+ "grad_norm": 0.455078125,
3255
+ "learning_rate": 0.0005442496978069418,
3256
+ "loss": 1.0104,
3257
+ "step": 10580
3258
+ },
3259
+ {
3260
+ "epoch": 0.4572820977271747,
3261
+ "grad_norm": 0.61328125,
3262
+ "learning_rate": 0.0005432567777585909,
3263
+ "loss": 1.0056,
3264
+ "step": 10603
3265
+ },
3266
+ {
3267
+ "epoch": 0.4582740328632423,
3268
+ "grad_norm": 0.3359375,
3269
+ "learning_rate": 0.00054226385771024,
3270
+ "loss": 1.0115,
3271
+ "step": 10626
3272
+ },
3273
+ {
3274
+ "epoch": 0.45926596799930997,
3275
+ "grad_norm": 0.3515625,
3276
+ "learning_rate": 0.0005412709376618892,
3277
+ "loss": 1.0143,
3278
+ "step": 10649
3279
+ },
3280
+ {
3281
+ "epoch": 0.46025790313537757,
3282
+ "grad_norm": 0.388671875,
3283
+ "learning_rate": 0.0005402780176135383,
3284
+ "loss": 0.9916,
3285
+ "step": 10672
3286
+ },
3287
+ {
3288
+ "epoch": 0.4612498382714452,
3289
+ "grad_norm": 0.396484375,
3290
+ "learning_rate": 0.0005392850975651874,
3291
+ "loss": 0.9967,
3292
+ "step": 10695
3293
+ },
3294
+ {
3295
+ "epoch": 0.4622417734075128,
3296
+ "grad_norm": 0.41796875,
3297
+ "learning_rate": 0.0005382921775168364,
3298
+ "loss": 1.0009,
3299
+ "step": 10718
3300
+ },
3301
+ {
3302
+ "epoch": 0.4632337085435805,
3303
+ "grad_norm": 0.34765625,
3304
+ "learning_rate": 0.0005372992574684856,
3305
+ "loss": 0.9919,
3306
+ "step": 10741
3307
+ },
3308
+ {
3309
+ "epoch": 0.4642256436796481,
3310
+ "grad_norm": 0.3515625,
3311
+ "learning_rate": 0.0005363063374201347,
3312
+ "loss": 1.0128,
3313
+ "step": 10764
3314
+ },
3315
+ {
3316
+ "epoch": 0.4652175788157157,
3317
+ "grad_norm": 0.400390625,
3318
+ "learning_rate": 0.0005353134173717839,
3319
+ "loss": 0.9982,
3320
+ "step": 10787
3321
+ },
3322
+ {
3323
+ "epoch": 0.46620951395178334,
3324
+ "grad_norm": 0.3515625,
3325
+ "learning_rate": 0.0005343204973234329,
3326
+ "loss": 0.9998,
3327
+ "step": 10810
3328
+ },
3329
+ {
3330
+ "epoch": 0.46720144908785094,
3331
+ "grad_norm": 0.5390625,
3332
+ "learning_rate": 0.0005333275772750821,
3333
+ "loss": 1.0177,
3334
+ "step": 10833
3335
+ },
3336
+ {
3337
+ "epoch": 0.4681933842239186,
3338
+ "grad_norm": 0.37890625,
3339
+ "learning_rate": 0.0005323346572267311,
3340
+ "loss": 0.9899,
3341
+ "step": 10856
3342
+ },
3343
+ {
3344
+ "epoch": 0.4691853193599862,
3345
+ "grad_norm": 0.38671875,
3346
+ "learning_rate": 0.0005313417371783802,
3347
+ "loss": 1.0052,
3348
+ "step": 10879
3349
+ },
3350
+ {
3351
+ "epoch": 0.4701772544960538,
3352
+ "grad_norm": 0.36328125,
3353
+ "learning_rate": 0.0005303488171300294,
3354
+ "loss": 0.9741,
3355
+ "step": 10902
3356
+ },
3357
+ {
3358
+ "epoch": 0.47116918963212145,
3359
+ "grad_norm": 0.4453125,
3360
+ "learning_rate": 0.0005293558970816785,
3361
+ "loss": 1.0021,
3362
+ "step": 10925
3363
+ },
3364
+ {
3365
+ "epoch": 0.47216112476818906,
3366
+ "grad_norm": 0.322265625,
3367
+ "learning_rate": 0.0005283629770333276,
3368
+ "loss": 0.9896,
3369
+ "step": 10948
3370
+ },
3371
+ {
3372
+ "epoch": 0.4731530599042567,
3373
+ "grad_norm": 0.36328125,
3374
+ "learning_rate": 0.0005273700569849767,
3375
+ "loss": 1.0046,
3376
+ "step": 10971
3377
+ },
3378
+ {
3379
+ "epoch": 0.4741449950403243,
3380
+ "grad_norm": 0.345703125,
3381
+ "learning_rate": 0.0005263771369366258,
3382
+ "loss": 1.0004,
3383
+ "step": 10994
3384
+ },
3385
+ {
3386
+ "epoch": 0.47513693017639197,
3387
+ "grad_norm": 0.357421875,
3388
+ "learning_rate": 0.0005253842168882749,
3389
+ "loss": 1.0031,
3390
+ "step": 11017
3391
+ },
3392
+ {
3393
+ "epoch": 0.47612886531245957,
3394
+ "grad_norm": 0.359375,
3395
+ "learning_rate": 0.0005243912968399241,
3396
+ "loss": 1.007,
3397
+ "step": 11040
3398
+ },
3399
+ {
3400
+ "epoch": 0.47712080044852717,
3401
+ "grad_norm": 0.38671875,
3402
+ "learning_rate": 0.0005233983767915731,
3403
+ "loss": 1.0046,
3404
+ "step": 11063
3405
+ },
3406
+ {
3407
+ "epoch": 0.47811273558459483,
3408
+ "grad_norm": 0.341796875,
3409
+ "learning_rate": 0.0005224054567432222,
3410
+ "loss": 0.9956,
3411
+ "step": 11086
3412
+ },
3413
+ {
3414
+ "epoch": 0.47910467072066243,
3415
+ "grad_norm": 0.3515625,
3416
+ "learning_rate": 0.0005214125366948713,
3417
+ "loss": 1.01,
3418
+ "step": 11109
3419
+ },
3420
+ {
3421
+ "epoch": 0.4800966058567301,
3422
+ "grad_norm": 0.431640625,
3423
+ "learning_rate": 0.0005204196166465205,
3424
+ "loss": 1.0211,
3425
+ "step": 11132
3426
+ },
3427
+ {
3428
+ "epoch": 0.4810885409927977,
3429
+ "grad_norm": 0.375,
3430
+ "learning_rate": 0.0005194266965981696,
3431
+ "loss": 1.0039,
3432
+ "step": 11155
3433
+ },
3434
+ {
3435
+ "epoch": 0.4820804761288653,
3436
+ "grad_norm": 0.392578125,
3437
+ "learning_rate": 0.0005184337765498187,
3438
+ "loss": 0.9886,
3439
+ "step": 11178
3440
+ },
3441
+ {
3442
+ "epoch": 0.48307241126493294,
3443
+ "grad_norm": 0.515625,
3444
+ "learning_rate": 0.0005174408565014678,
3445
+ "loss": 0.9973,
3446
+ "step": 11201
3447
+ },
3448
+ {
3449
+ "epoch": 0.48406434640100054,
3450
+ "grad_norm": 0.396484375,
3451
+ "learning_rate": 0.0005164479364531169,
3452
+ "loss": 1.013,
3453
+ "step": 11224
3454
+ },
3455
+ {
3456
+ "epoch": 0.4850562815370682,
3457
+ "grad_norm": 0.451171875,
3458
+ "learning_rate": 0.000515455016404766,
3459
+ "loss": 0.9876,
3460
+ "step": 11247
3461
+ },
3462
+ {
3463
+ "epoch": 0.4860482166731358,
3464
+ "grad_norm": 0.375,
3465
+ "learning_rate": 0.0005144620963564152,
3466
+ "loss": 0.9984,
3467
+ "step": 11270
3468
+ },
3469
+ {
3470
+ "epoch": 0.48704015180920346,
3471
+ "grad_norm": 0.416015625,
3472
+ "learning_rate": 0.0005134691763080642,
3473
+ "loss": 0.9875,
3474
+ "step": 11293
3475
+ },
3476
+ {
3477
+ "epoch": 0.48803208694527106,
3478
+ "grad_norm": 0.369140625,
3479
+ "learning_rate": 0.0005124762562597134,
3480
+ "loss": 0.9954,
3481
+ "step": 11316
3482
+ },
3483
+ {
3484
+ "epoch": 0.48902402208133866,
3485
+ "grad_norm": 0.337890625,
3486
+ "learning_rate": 0.0005114833362113624,
3487
+ "loss": 0.9825,
3488
+ "step": 11339
3489
+ },
3490
+ {
3491
+ "epoch": 0.4900159572174063,
3492
+ "grad_norm": 0.37890625,
3493
+ "learning_rate": 0.0005104904161630117,
3494
+ "loss": 0.9983,
3495
+ "step": 11362
3496
+ },
3497
+ {
3498
+ "epoch": 0.4910078923534739,
3499
+ "grad_norm": 0.328125,
3500
+ "learning_rate": 0.0005094974961146607,
3501
+ "loss": 0.9818,
3502
+ "step": 11385
3503
+ },
3504
+ {
3505
+ "epoch": 0.4919998274895416,
3506
+ "grad_norm": 0.357421875,
3507
+ "learning_rate": 0.0005085045760663098,
3508
+ "loss": 0.9928,
3509
+ "step": 11408
3510
+ },
3511
+ {
3512
+ "epoch": 0.4929917626256092,
3513
+ "grad_norm": 0.4921875,
3514
+ "learning_rate": 0.0005075116560179589,
3515
+ "loss": 0.9771,
3516
+ "step": 11431
3517
+ },
3518
+ {
3519
+ "epoch": 0.4939836977616768,
3520
+ "grad_norm": 0.341796875,
3521
+ "learning_rate": 0.000506518735969608,
3522
+ "loss": 1.0059,
3523
+ "step": 11454
3524
+ },
3525
+ {
3526
+ "epoch": 0.49497563289774443,
3527
+ "grad_norm": 0.400390625,
3528
+ "learning_rate": 0.0005055258159212571,
3529
+ "loss": 1.0058,
3530
+ "step": 11477
3531
+ },
3532
+ {
3533
+ "epoch": 0.49596756803381203,
3534
+ "grad_norm": 0.328125,
3535
+ "learning_rate": 0.0005045328958729063,
3536
+ "loss": 0.9962,
3537
+ "step": 11500
3538
+ },
3539
+ {
3540
+ "epoch": 0.4969595031698797,
3541
+ "grad_norm": 0.326171875,
3542
+ "learning_rate": 0.0005035399758245554,
3543
+ "loss": 0.9828,
3544
+ "step": 11523
3545
+ },
3546
+ {
3547
+ "epoch": 0.4979514383059473,
3548
+ "grad_norm": 0.5078125,
3549
+ "learning_rate": 0.0005025470557762045,
3550
+ "loss": 0.9881,
3551
+ "step": 11546
3552
+ },
3553
+ {
3554
+ "epoch": 0.49894337344201495,
3555
+ "grad_norm": 0.43359375,
3556
+ "learning_rate": 0.0005015541357278536,
3557
+ "loss": 0.9863,
3558
+ "step": 11569
3559
+ },
3560
+ {
3561
+ "epoch": 0.49984905334885926,
3562
+ "eval_runtime": 163.9862,
3563
+ "eval_samples_per_second": 609.807,
3564
+ "eval_steps_per_second": 7.623,
3565
+ "step": 11590
3566
+ },
3567
+ {
3568
+ "epoch": 0.49993530857808255,
3569
+ "grad_norm": 0.353515625,
3570
+ "learning_rate": 0.0005005612156795026,
3571
+ "loss": 0.9764,
3572
+ "step": 11592
3573
+ },
3574
+ {
3575
+ "epoch": 0.5009272437141502,
3576
+ "grad_norm": 0.36328125,
3577
+ "learning_rate": 0.0004995682956311518,
3578
+ "loss": 0.9923,
3579
+ "step": 11615
3580
+ },
3581
+ {
3582
+ "epoch": 0.5019191788502178,
3583
+ "grad_norm": 0.39453125,
3584
+ "learning_rate": 0.0004985753755828009,
3585
+ "loss": 0.9738,
3586
+ "step": 11638
3587
+ },
3588
+ {
3589
+ "epoch": 0.5029111139862854,
3590
+ "grad_norm": 0.48828125,
3591
+ "learning_rate": 0.00049758245553445,
3592
+ "loss": 0.973,
3593
+ "step": 11661
3594
+ },
3595
+ {
3596
+ "epoch": 0.5039030491223531,
3597
+ "grad_norm": 0.384765625,
3598
+ "learning_rate": 0.0004965895354860991,
3599
+ "loss": 0.9741,
3600
+ "step": 11684
3601
+ },
3602
+ {
3603
+ "epoch": 0.5048949842584207,
3604
+ "grad_norm": 0.359375,
3605
+ "learning_rate": 0.0004955966154377482,
3606
+ "loss": 0.9842,
3607
+ "step": 11707
3608
+ },
3609
+ {
3610
+ "epoch": 0.5058869193944883,
3611
+ "grad_norm": 0.443359375,
3612
+ "learning_rate": 0.0004946036953893974,
3613
+ "loss": 0.9927,
3614
+ "step": 11730
3615
+ },
3616
+ {
3617
+ "epoch": 0.5068788545305559,
3618
+ "grad_norm": 0.44921875,
3619
+ "learning_rate": 0.0004936107753410465,
3620
+ "loss": 0.9921,
3621
+ "step": 11753
3622
+ },
3623
+ {
3624
+ "epoch": 0.5078707896666236,
3625
+ "grad_norm": 0.40625,
3626
+ "learning_rate": 0.0004926178552926956,
3627
+ "loss": 0.9827,
3628
+ "step": 11776
3629
+ },
3630
+ {
3631
+ "epoch": 0.5088627248026911,
3632
+ "grad_norm": 0.416015625,
3633
+ "learning_rate": 0.0004916249352443447,
3634
+ "loss": 0.9836,
3635
+ "step": 11799
3636
+ },
3637
+ {
3638
+ "epoch": 0.5098546599387588,
3639
+ "grad_norm": 0.36328125,
3640
+ "learning_rate": 0.0004906320151959938,
3641
+ "loss": 0.9783,
3642
+ "step": 11822
3643
+ },
3644
+ {
3645
+ "epoch": 0.5108465950748264,
3646
+ "grad_norm": 0.357421875,
3647
+ "learning_rate": 0.0004896390951476428,
3648
+ "loss": 1.0003,
3649
+ "step": 11845
3650
+ },
3651
+ {
3652
+ "epoch": 0.511838530210894,
3653
+ "grad_norm": 0.39453125,
3654
+ "learning_rate": 0.000488646175099292,
3655
+ "loss": 0.995,
3656
+ "step": 11868
3657
+ },
3658
+ {
3659
+ "epoch": 0.5128304653469616,
3660
+ "grad_norm": 0.376953125,
3661
+ "learning_rate": 0.0004876532550509411,
3662
+ "loss": 0.9952,
3663
+ "step": 11891
3664
+ },
3665
+ {
3666
+ "epoch": 0.5138224004830293,
3667
+ "grad_norm": 0.3828125,
3668
+ "learning_rate": 0.0004866603350025902,
3669
+ "loss": 0.9912,
3670
+ "step": 11914
3671
+ },
3672
+ {
3673
+ "epoch": 0.514814335619097,
3674
+ "grad_norm": 0.34375,
3675
+ "learning_rate": 0.00048566741495423933,
3676
+ "loss": 0.995,
3677
+ "step": 11937
3678
+ },
3679
+ {
3680
+ "epoch": 0.5158062707551645,
3681
+ "grad_norm": 0.408203125,
3682
+ "learning_rate": 0.00048467449490588845,
3683
+ "loss": 0.9856,
3684
+ "step": 11960
3685
+ },
3686
+ {
3687
+ "epoch": 0.5167982058912322,
3688
+ "grad_norm": 0.427734375,
3689
+ "learning_rate": 0.00048368157485753757,
3690
+ "loss": 0.9887,
3691
+ "step": 11983
3692
+ },
3693
+ {
3694
+ "epoch": 0.5177901410272998,
3695
+ "grad_norm": 0.376953125,
3696
+ "learning_rate": 0.0004826886548091867,
3697
+ "loss": 0.9815,
3698
+ "step": 12006
3699
+ },
3700
+ {
3701
+ "epoch": 0.5187820761633674,
3702
+ "grad_norm": 0.369140625,
3703
+ "learning_rate": 0.00048169573476083575,
3704
+ "loss": 0.9944,
3705
+ "step": 12029
3706
+ },
3707
+ {
3708
+ "epoch": 0.519774011299435,
3709
+ "grad_norm": 0.427734375,
3710
+ "learning_rate": 0.0004807028147124849,
3711
+ "loss": 0.9714,
3712
+ "step": 12052
3713
+ },
3714
+ {
3715
+ "epoch": 0.5207659464355027,
3716
+ "grad_norm": 0.326171875,
3717
+ "learning_rate": 0.000479709894664134,
3718
+ "loss": 0.9849,
3719
+ "step": 12075
3720
+ },
3721
+ {
3722
+ "epoch": 0.5217578815715703,
3723
+ "grad_norm": 0.427734375,
3724
+ "learning_rate": 0.0004787169746157831,
3725
+ "loss": 0.9861,
3726
+ "step": 12098
3727
+ },
3728
+ {
3729
+ "epoch": 0.5227498167076379,
3730
+ "grad_norm": 0.4765625,
3731
+ "learning_rate": 0.00047772405456743223,
3732
+ "loss": 1.0009,
3733
+ "step": 12121
3734
+ },
3735
+ {
3736
+ "epoch": 0.5237417518437055,
3737
+ "grad_norm": 0.345703125,
3738
+ "learning_rate": 0.00047673113451908135,
3739
+ "loss": 0.9892,
3740
+ "step": 12144
3741
+ },
3742
+ {
3743
+ "epoch": 0.5247336869797732,
3744
+ "grad_norm": 0.345703125,
3745
+ "learning_rate": 0.00047573821447073047,
3746
+ "loss": 0.9843,
3747
+ "step": 12167
3748
+ },
3749
+ {
3750
+ "epoch": 0.5257256221158407,
3751
+ "grad_norm": 0.40234375,
3752
+ "learning_rate": 0.0004747452944223796,
3753
+ "loss": 0.9767,
3754
+ "step": 12190
3755
+ },
3756
+ {
3757
+ "epoch": 0.5267175572519084,
3758
+ "grad_norm": 0.359375,
3759
+ "learning_rate": 0.00047375237437402866,
3760
+ "loss": 0.9599,
3761
+ "step": 12213
3762
+ },
3763
+ {
3764
+ "epoch": 0.527709492387976,
3765
+ "grad_norm": 0.388671875,
3766
+ "learning_rate": 0.0004727594543256778,
3767
+ "loss": 0.9797,
3768
+ "step": 12236
3769
+ },
3770
+ {
3771
+ "epoch": 0.5287014275240437,
3772
+ "grad_norm": 0.3359375,
3773
+ "learning_rate": 0.0004717665342773269,
3774
+ "loss": 0.9952,
3775
+ "step": 12259
3776
+ },
3777
+ {
3778
+ "epoch": 0.5296933626601112,
3779
+ "grad_norm": 0.359375,
3780
+ "learning_rate": 0.000470773614228976,
3781
+ "loss": 0.9851,
3782
+ "step": 12282
3783
+ },
3784
+ {
3785
+ "epoch": 0.5306852977961789,
3786
+ "grad_norm": 0.4140625,
3787
+ "learning_rate": 0.00046978069418062514,
3788
+ "loss": 0.9728,
3789
+ "step": 12305
3790
+ },
3791
+ {
3792
+ "epoch": 0.5316772329322466,
3793
+ "grad_norm": 0.376953125,
3794
+ "learning_rate": 0.00046878777413227426,
3795
+ "loss": 0.9813,
3796
+ "step": 12328
3797
+ },
3798
+ {
3799
+ "epoch": 0.5326691680683141,
3800
+ "grad_norm": 0.326171875,
3801
+ "learning_rate": 0.0004677948540839234,
3802
+ "loss": 0.9729,
3803
+ "step": 12351
3804
+ },
3805
+ {
3806
+ "epoch": 0.5336611032043818,
3807
+ "grad_norm": 0.33203125,
3808
+ "learning_rate": 0.0004668019340355725,
3809
+ "loss": 0.969,
3810
+ "step": 12374
3811
+ },
3812
+ {
3813
+ "epoch": 0.5346530383404494,
3814
+ "grad_norm": 0.43359375,
3815
+ "learning_rate": 0.00046580901398722156,
3816
+ "loss": 0.9786,
3817
+ "step": 12397
3818
+ },
3819
+ {
3820
+ "epoch": 0.535644973476517,
3821
+ "grad_norm": 0.388671875,
3822
+ "learning_rate": 0.00046481609393887063,
3823
+ "loss": 0.9773,
3824
+ "step": 12420
3825
+ },
3826
+ {
3827
+ "epoch": 0.5366369086125846,
3828
+ "grad_norm": 0.451171875,
3829
+ "learning_rate": 0.00046382317389051975,
3830
+ "loss": 0.9972,
3831
+ "step": 12443
3832
+ },
3833
+ {
3834
+ "epoch": 0.5376288437486523,
3835
+ "grad_norm": 0.408203125,
3836
+ "learning_rate": 0.00046283025384216887,
3837
+ "loss": 0.9893,
3838
+ "step": 12466
3839
+ },
3840
+ {
3841
+ "epoch": 0.5386207788847199,
3842
+ "grad_norm": 0.400390625,
3843
+ "learning_rate": 0.000461837333793818,
3844
+ "loss": 0.9747,
3845
+ "step": 12489
3846
+ },
3847
+ {
3848
+ "epoch": 0.5396127140207875,
3849
+ "grad_norm": 0.4921875,
3850
+ "learning_rate": 0.0004608444137454671,
3851
+ "loss": 0.9795,
3852
+ "step": 12512
3853
+ },
3854
+ {
3855
+ "epoch": 0.5406046491568551,
3856
+ "grad_norm": 0.37109375,
3857
+ "learning_rate": 0.00045985149369711623,
3858
+ "loss": 0.9608,
3859
+ "step": 12535
3860
+ },
3861
+ {
3862
+ "epoch": 0.5415965842929228,
3863
+ "grad_norm": 0.3515625,
3864
+ "learning_rate": 0.00045885857364876535,
3865
+ "loss": 0.966,
3866
+ "step": 12558
3867
+ },
3868
+ {
3869
+ "epoch": 0.5425885194289903,
3870
+ "grad_norm": 0.361328125,
3871
+ "learning_rate": 0.0004578656536004144,
3872
+ "loss": 0.9689,
3873
+ "step": 12581
3874
+ },
3875
+ {
3876
+ "epoch": 0.543580454565058,
3877
+ "grad_norm": 0.404296875,
3878
+ "learning_rate": 0.00045687273355206353,
3879
+ "loss": 0.9655,
3880
+ "step": 12604
3881
+ },
3882
+ {
3883
+ "epoch": 0.5445723897011256,
3884
+ "grad_norm": 0.37890625,
3885
+ "learning_rate": 0.00045587981350371265,
3886
+ "loss": 0.9693,
3887
+ "step": 12627
3888
+ },
3889
+ {
3890
+ "epoch": 0.5455643248371933,
3891
+ "grad_norm": 0.41015625,
3892
+ "learning_rate": 0.00045488689345536177,
3893
+ "loss": 0.9986,
3894
+ "step": 12650
3895
+ },
3896
+ {
3897
+ "epoch": 0.5465562599732608,
3898
+ "grad_norm": 0.345703125,
3899
+ "learning_rate": 0.0004538939734070109,
3900
+ "loss": 0.9715,
3901
+ "step": 12673
3902
+ },
3903
+ {
3904
+ "epoch": 0.5475481951093285,
3905
+ "grad_norm": 0.37890625,
3906
+ "learning_rate": 0.00045290105335866,
3907
+ "loss": 0.9781,
3908
+ "step": 12696
3909
+ },
3910
+ {
3911
+ "epoch": 0.5485401302453962,
3912
+ "grad_norm": 0.42578125,
3913
+ "learning_rate": 0.00045190813331030913,
3914
+ "loss": 1.0001,
3915
+ "step": 12719
3916
+ },
3917
+ {
3918
+ "epoch": 0.5495320653814637,
3919
+ "grad_norm": 0.43359375,
3920
+ "learning_rate": 0.0004509152132619582,
3921
+ "loss": 0.9811,
3922
+ "step": 12742
3923
+ },
3924
+ {
3925
+ "epoch": 0.5505240005175314,
3926
+ "grad_norm": 0.341796875,
3927
+ "learning_rate": 0.0004499222932136073,
3928
+ "loss": 0.9584,
3929
+ "step": 12765
3930
+ },
3931
+ {
3932
+ "epoch": 0.551515935653599,
3933
+ "grad_norm": 0.419921875,
3934
+ "learning_rate": 0.00044892937316525644,
3935
+ "loss": 0.977,
3936
+ "step": 12788
3937
+ },
3938
+ {
3939
+ "epoch": 0.5525078707896667,
3940
+ "grad_norm": 0.416015625,
3941
+ "learning_rate": 0.00044793645311690556,
3942
+ "loss": 0.9746,
3943
+ "step": 12811
3944
+ },
3945
+ {
3946
+ "epoch": 0.5534998059257342,
3947
+ "grad_norm": 0.390625,
3948
+ "learning_rate": 0.0004469435330685547,
3949
+ "loss": 0.9811,
3950
+ "step": 12834
3951
+ },
3952
+ {
3953
+ "epoch": 0.5544917410618019,
3954
+ "grad_norm": 0.35546875,
3955
+ "learning_rate": 0.0004459506130202038,
3956
+ "loss": 0.9523,
3957
+ "step": 12857
3958
+ },
3959
+ {
3960
+ "epoch": 0.5554836761978695,
3961
+ "grad_norm": 0.37890625,
3962
+ "learning_rate": 0.0004449576929718529,
3963
+ "loss": 0.9641,
3964
+ "step": 12880
3965
+ },
3966
+ {
3967
+ "epoch": 0.5564756113339371,
3968
+ "grad_norm": 0.36328125,
3969
+ "learning_rate": 0.00044396477292350204,
3970
+ "loss": 0.9845,
3971
+ "step": 12903
3972
+ },
3973
+ {
3974
+ "epoch": 0.5574675464700047,
3975
+ "grad_norm": 0.365234375,
3976
+ "learning_rate": 0.0004429718528751511,
3977
+ "loss": 0.9788,
3978
+ "step": 12926
3979
+ },
3980
+ {
3981
+ "epoch": 0.5584594816060724,
3982
+ "grad_norm": 0.390625,
3983
+ "learning_rate": 0.0004419789328268002,
3984
+ "loss": 0.9795,
3985
+ "step": 12949
3986
+ },
3987
+ {
3988
+ "epoch": 0.5594514167421399,
3989
+ "grad_norm": 0.37109375,
3990
+ "learning_rate": 0.00044098601277844934,
3991
+ "loss": 0.9716,
3992
+ "step": 12972
3993
+ },
3994
+ {
3995
+ "epoch": 0.5604433518782076,
3996
+ "grad_norm": 0.38671875,
3997
+ "learning_rate": 0.00043999309273009846,
3998
+ "loss": 0.9814,
3999
+ "step": 12995
4000
+ },
4001
+ {
4002
+ "epoch": 0.5614352870142753,
4003
+ "grad_norm": 0.34765625,
4004
+ "learning_rate": 0.00043900017268174753,
4005
+ "loss": 0.9724,
4006
+ "step": 13018
4007
+ },
4008
+ {
4009
+ "epoch": 0.5624272221503429,
4010
+ "grad_norm": 0.44921875,
4011
+ "learning_rate": 0.00043800725263339665,
4012
+ "loss": 0.9538,
4013
+ "step": 13041
4014
+ },
4015
+ {
4016
+ "epoch": 0.5634191572864105,
4017
+ "grad_norm": 0.3828125,
4018
+ "learning_rate": 0.00043701433258504577,
4019
+ "loss": 0.9744,
4020
+ "step": 13064
4021
+ },
4022
+ {
4023
+ "epoch": 0.5644110924224781,
4024
+ "grad_norm": 0.423828125,
4025
+ "learning_rate": 0.0004360214125366949,
4026
+ "loss": 0.9777,
4027
+ "step": 13087
4028
+ },
4029
+ {
4030
+ "epoch": 0.5654030275585458,
4031
+ "grad_norm": 0.365234375,
4032
+ "learning_rate": 0.00043502849248834395,
4033
+ "loss": 0.9688,
4034
+ "step": 13110
4035
+ },
4036
+ {
4037
+ "epoch": 0.5663949626946133,
4038
+ "grad_norm": 0.470703125,
4039
+ "learning_rate": 0.00043403557243999307,
4040
+ "loss": 0.988,
4041
+ "step": 13133
4042
+ },
4043
+ {
4044
+ "epoch": 0.567386897830681,
4045
+ "grad_norm": 0.341796875,
4046
+ "learning_rate": 0.0004330426523916422,
4047
+ "loss": 0.9678,
4048
+ "step": 13156
4049
+ },
4050
+ {
4051
+ "epoch": 0.5683788329667486,
4052
+ "grad_norm": 0.345703125,
4053
+ "learning_rate": 0.0004320497323432913,
4054
+ "loss": 0.9735,
4055
+ "step": 13179
4056
+ },
4057
+ {
4058
+ "epoch": 0.5693707681028163,
4059
+ "grad_norm": 0.416015625,
4060
+ "learning_rate": 0.00043105681229494043,
4061
+ "loss": 0.9612,
4062
+ "step": 13202
4063
+ },
4064
+ {
4065
+ "epoch": 0.5703627032388838,
4066
+ "grad_norm": 0.375,
4067
+ "learning_rate": 0.00043006389224658955,
4068
+ "loss": 0.9428,
4069
+ "step": 13225
4070
+ },
4071
+ {
4072
+ "epoch": 0.5713546383749515,
4073
+ "grad_norm": 0.4296875,
4074
+ "learning_rate": 0.00042907097219823867,
4075
+ "loss": 0.9654,
4076
+ "step": 13248
4077
+ },
4078
+ {
4079
+ "epoch": 0.5723465735110191,
4080
+ "grad_norm": 0.353515625,
4081
+ "learning_rate": 0.0004280780521498878,
4082
+ "loss": 0.9739,
4083
+ "step": 13271
4084
+ },
4085
+ {
4086
+ "epoch": 0.5733385086470867,
4087
+ "grad_norm": 0.380859375,
4088
+ "learning_rate": 0.00042708513210153686,
4089
+ "loss": 0.9755,
4090
+ "step": 13294
4091
+ },
4092
+ {
4093
+ "epoch": 0.5743304437831543,
4094
+ "grad_norm": 0.357421875,
4095
+ "learning_rate": 0.000426092212053186,
4096
+ "loss": 0.9784,
4097
+ "step": 13317
4098
+ },
4099
+ {
4100
+ "epoch": 0.575322378919222,
4101
+ "grad_norm": 0.3125,
4102
+ "learning_rate": 0.0004250992920048351,
4103
+ "loss": 0.9625,
4104
+ "step": 13340
4105
+ },
4106
+ {
4107
+ "epoch": 0.5763143140552897,
4108
+ "grad_norm": 0.345703125,
4109
+ "learning_rate": 0.0004241063719564842,
4110
+ "loss": 0.9521,
4111
+ "step": 13363
4112
+ },
4113
+ {
4114
+ "epoch": 0.5773062491913572,
4115
+ "grad_norm": 0.333984375,
4116
+ "learning_rate": 0.00042311345190813334,
4117
+ "loss": 0.984,
4118
+ "step": 13386
4119
+ },
4120
+ {
4121
+ "epoch": 0.5782981843274249,
4122
+ "grad_norm": 0.45703125,
4123
+ "learning_rate": 0.00042212053185978246,
4124
+ "loss": 0.9794,
4125
+ "step": 13409
4126
+ },
4127
+ {
4128
+ "epoch": 0.5792901194634925,
4129
+ "grad_norm": 0.396484375,
4130
+ "learning_rate": 0.0004211276118114316,
4131
+ "loss": 0.9705,
4132
+ "step": 13432
4133
+ },
4134
+ {
4135
+ "epoch": 0.5802820545995601,
4136
+ "grad_norm": 0.400390625,
4137
+ "learning_rate": 0.00042013469176308064,
4138
+ "loss": 0.97,
4139
+ "step": 13455
4140
+ },
4141
+ {
4142
+ "epoch": 0.5812739897356277,
4143
+ "grad_norm": 0.37890625,
4144
+ "learning_rate": 0.00041914177171472976,
4145
+ "loss": 0.968,
4146
+ "step": 13478
4147
+ },
4148
+ {
4149
+ "epoch": 0.5822659248716954,
4150
+ "grad_norm": 0.365234375,
4151
+ "learning_rate": 0.0004181488516663789,
4152
+ "loss": 0.9664,
4153
+ "step": 13501
4154
+ },
4155
+ {
4156
+ "epoch": 0.5832578600077629,
4157
+ "grad_norm": 0.361328125,
4158
+ "learning_rate": 0.000417155931618028,
4159
+ "loss": 0.9722,
4160
+ "step": 13524
4161
+ },
4162
+ {
4163
+ "epoch": 0.5842497951438306,
4164
+ "grad_norm": 0.369140625,
4165
+ "learning_rate": 0.0004161630115696771,
4166
+ "loss": 0.9695,
4167
+ "step": 13547
4168
+ },
4169
+ {
4170
+ "epoch": 0.5852417302798982,
4171
+ "grad_norm": 0.337890625,
4172
+ "learning_rate": 0.00041517009152132624,
4173
+ "loss": 0.9628,
4174
+ "step": 13570
4175
+ },
4176
+ {
4177
+ "epoch": 0.5862336654159659,
4178
+ "grad_norm": 0.330078125,
4179
+ "learning_rate": 0.0004141771714729753,
4180
+ "loss": 0.9515,
4181
+ "step": 13593
4182
+ },
4183
+ {
4184
+ "epoch": 0.5872256005520334,
4185
+ "grad_norm": 0.359375,
4186
+ "learning_rate": 0.0004131842514246244,
4187
+ "loss": 0.965,
4188
+ "step": 13616
4189
+ },
4190
+ {
4191
+ "epoch": 0.5882175356881011,
4192
+ "grad_norm": 0.392578125,
4193
+ "learning_rate": 0.0004121913313762735,
4194
+ "loss": 0.9598,
4195
+ "step": 13639
4196
+ },
4197
+ {
4198
+ "epoch": 0.5892094708241687,
4199
+ "grad_norm": 0.41796875,
4200
+ "learning_rate": 0.0004111984113279226,
4201
+ "loss": 0.9575,
4202
+ "step": 13662
4203
+ },
4204
+ {
4205
+ "epoch": 0.5902014059602363,
4206
+ "grad_norm": 0.5234375,
4207
+ "learning_rate": 0.00041020549127957173,
4208
+ "loss": 0.9933,
4209
+ "step": 13685
4210
+ },
4211
+ {
4212
+ "epoch": 0.591193341096304,
4213
+ "grad_norm": 0.423828125,
4214
+ "learning_rate": 0.00040921257123122085,
4215
+ "loss": 0.9621,
4216
+ "step": 13708
4217
+ },
4218
+ {
4219
+ "epoch": 0.5921852762323716,
4220
+ "grad_norm": 0.33203125,
4221
+ "learning_rate": 0.00040821965118286997,
4222
+ "loss": 0.964,
4223
+ "step": 13731
4224
+ },
4225
+ {
4226
+ "epoch": 0.5931772113684393,
4227
+ "grad_norm": 0.423828125,
4228
+ "learning_rate": 0.0004072267311345191,
4229
+ "loss": 0.9854,
4230
+ "step": 13754
4231
+ },
4232
+ {
4233
+ "epoch": 0.5941691465045068,
4234
+ "grad_norm": 0.3515625,
4235
+ "learning_rate": 0.0004062338110861682,
4236
+ "loss": 0.9883,
4237
+ "step": 13777
4238
+ },
4239
+ {
4240
+ "epoch": 0.5951610816405745,
4241
+ "grad_norm": 0.408203125,
4242
+ "learning_rate": 0.00040524089103781733,
4243
+ "loss": 0.9853,
4244
+ "step": 13800
4245
+ },
4246
+ {
4247
+ "epoch": 0.5961530167766421,
4248
+ "grad_norm": 0.408203125,
4249
+ "learning_rate": 0.0004042479709894664,
4250
+ "loss": 0.9557,
4251
+ "step": 13823
4252
+ },
4253
+ {
4254
+ "epoch": 0.5971449519127097,
4255
+ "grad_norm": 0.42578125,
4256
+ "learning_rate": 0.0004032550509411155,
4257
+ "loss": 0.9587,
4258
+ "step": 13846
4259
+ },
4260
+ {
4261
+ "epoch": 0.5981368870487773,
4262
+ "grad_norm": 0.44921875,
4263
+ "learning_rate": 0.00040226213089276464,
4264
+ "loss": 0.9771,
4265
+ "step": 13869
4266
+ },
4267
+ {
4268
+ "epoch": 0.599128822184845,
4269
+ "grad_norm": 0.431640625,
4270
+ "learning_rate": 0.00040126921084441376,
4271
+ "loss": 0.9661,
4272
+ "step": 13892
4273
+ },
4274
+ {
4275
+ "epoch": 0.5998188640186312,
4276
+ "eval_runtime": 163.7921,
4277
+ "eval_samples_per_second": 610.53,
4278
+ "eval_steps_per_second": 7.632,
4279
+ "step": 13908
4280
  }
4281
  ],
4282
  "logging_steps": 23,
 
4296
  "attributes": {}
4297
  }
4298
  },
4299
+ "total_flos": 1.0167159364234772e+18,
4300
  "train_batch_size": 8,
4301
  "trial_name": null,
4302
  "trial_params": null