mtzig commited on
Commit
9256123
·
verified ·
1 Parent(s): 3aecb61

Training in progress, step 1100, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:258fa13d0dd3a9484ec2b8bba17c7c79f8fae592c54988db0060f30bed4fd479
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd4094f5600490ddf6d9dc86706a89c258972627eabb482c09db8601aaa408b6
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5ddb62a05e33303e25df1c1b59bf783af13705be97b8587921a6631bbf41f12
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a5f19e172755f2ce57b999b6bf91cc0cd71f655dfb983069bc0cf1f20c1a06d
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5fb6a1a5c2377031003c7d16853c28d7a0326058f17d16ddaa590879db829783
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c2c51b674c05b19e09a6f9dc112d8aed01c92bba25c7ff3c02cc7e583e58316
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a518b3cdba9c599ba3497e050cfb494de6051c337793cf768f8446e88e0ef4ba
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87060ce519920d72bc5688fe4b87ba053fe5674703e0cfb88414391c60a767ad
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1aebed5961971ac0185b5d1edd913501186a04d8cb3949bbd870af297dec990
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:223e910b7b1616737b5bc86e1ebfb716e2a5b926a5993ed4e39db2e7651a4478
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4034df41a1c0d3738bcd73e59e7e4471dae0dae7adcc4ea761a25657d8d77fa1
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43b45ba64474b3b8076d79d336bf19cbfbf47a3077e59ccccda7247f6abf0ebd
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7294b9b4f0bbf7be3fd83aa7bd5f3c71e8668b2c2e36b427b4e526da02d8b63
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02d133caa8a7c8c9f20ecb1f747d463913eb8e7adea3e916057db45aee893c68
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c060ae75b35787edb9e656fa359b5e8baf41200c17d6d0b344faaeac0c182233
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81cb5bfebcce1a4979fbe0cfe517ce8ed3410829b5d415fc3687e8cd5e5c8a63
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46d64aa80154edb0becdb4e60fed57a8aac14a4ebe5ad47164b8308b04d1a370
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:435d97755e037c527a64ed888b5fccf61252600460ddb0a957d40f8cf8984322
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2af1c350e67f963da31a05dc6edc793160cea95a04ea98f5a78d582b25ac08c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e4a7ce1edd4170f6d4eb155e5e19998fd066ff3bfebd60f589551b3e6deedd6
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99c03513e72c4db6bdeb7d74baec0b63e12465bd1567437e670ad367c25edffc
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7b6f523269f600825123123e93e374bafdb4065da7c3500423ba2da40982a17
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2416af50204c614cf7f740b07f5654dd4e7764bc2fe0987af0691cc96765362c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36347d57510ac50d1215fd7cfb5a25f5354d812e876333fa5409094c79836493
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13782dda1786550739f5e5b9748c9c3674e47c6e87347dbff2f82a544e235cb5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30bfb6fa4fd21ac286df0550c82cdbf8a597994647e5b3f5b958394e3a125a12
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7654037504783774,
5
  "eval_steps": 20,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -7619,6 +7619,766 @@
7619
  "eval_samples_per_second": 6.985,
7620
  "eval_steps_per_second": 0.232,
7621
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7622
  }
7623
  ],
7624
  "logging_steps": 1,
@@ -7638,7 +8398,7 @@
7638
  "attributes": {}
7639
  }
7640
  },
7641
- "total_flos": 1.536494547196969e+17,
7642
  "train_batch_size": 8,
7643
  "trial_name": null,
7644
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8419441255262151,
5
  "eval_steps": 20,
6
+ "global_step": 1100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
7619
  "eval_samples_per_second": 6.985,
7620
  "eval_steps_per_second": 0.232,
7621
  "step": 1000
7622
+ },
7623
+ {
7624
+ "epoch": 0.7661691542288557,
7625
+ "grad_norm": 5.599081993103027,
7626
+ "learning_rate": 3.1447898585370386e-06,
7627
+ "loss": 0.2121,
7628
+ "step": 1001
7629
+ },
7630
+ {
7631
+ "epoch": 0.7669345579793341,
7632
+ "grad_norm": 4.840198516845703,
7633
+ "learning_rate": 3.125348465684439e-06,
7634
+ "loss": 0.1907,
7635
+ "step": 1002
7636
+ },
7637
+ {
7638
+ "epoch": 0.7676999617298125,
7639
+ "grad_norm": 7.6106858253479,
7640
+ "learning_rate": 3.105956217277738e-06,
7641
+ "loss": 0.3576,
7642
+ "step": 1003
7643
+ },
7644
+ {
7645
+ "epoch": 0.7684653654802909,
7646
+ "grad_norm": 6.351868629455566,
7647
+ "learning_rate": 3.086613251945246e-06,
7648
+ "loss": 0.2508,
7649
+ "step": 1004
7650
+ },
7651
+ {
7652
+ "epoch": 0.7692307692307693,
7653
+ "grad_norm": 9.808284759521484,
7654
+ "learning_rate": 3.067319707962957e-06,
7655
+ "loss": 0.4001,
7656
+ "step": 1005
7657
+ },
7658
+ {
7659
+ "epoch": 0.7699961729812476,
7660
+ "grad_norm": 7.302840709686279,
7661
+ "learning_rate": 3.0480757232535773e-06,
7662
+ "loss": 0.3344,
7663
+ "step": 1006
7664
+ },
7665
+ {
7666
+ "epoch": 0.770761576731726,
7667
+ "grad_norm": 7.3297438621521,
7668
+ "learning_rate": 3.02888143538553e-06,
7669
+ "loss": 0.2596,
7670
+ "step": 1007
7671
+ },
7672
+ {
7673
+ "epoch": 0.7715269804822044,
7674
+ "grad_norm": 6.303321838378906,
7675
+ "learning_rate": 3.0097369815719746e-06,
7676
+ "loss": 0.2743,
7677
+ "step": 1008
7678
+ },
7679
+ {
7680
+ "epoch": 0.7722923842326828,
7681
+ "grad_norm": 7.253098011016846,
7682
+ "learning_rate": 2.990642498669816e-06,
7683
+ "loss": 0.3192,
7684
+ "step": 1009
7685
+ },
7686
+ {
7687
+ "epoch": 0.7730577879831612,
7688
+ "grad_norm": 6.633822441101074,
7689
+ "learning_rate": 2.971598123178744e-06,
7690
+ "loss": 0.2047,
7691
+ "step": 1010
7692
+ },
7693
+ {
7694
+ "epoch": 0.7738231917336394,
7695
+ "grad_norm": 6.165762901306152,
7696
+ "learning_rate": 2.9526039912402504e-06,
7697
+ "loss": 0.3222,
7698
+ "step": 1011
7699
+ },
7700
+ {
7701
+ "epoch": 0.7745885954841178,
7702
+ "grad_norm": 4.74859094619751,
7703
+ "learning_rate": 2.9336602386366396e-06,
7704
+ "loss": 0.209,
7705
+ "step": 1012
7706
+ },
7707
+ {
7708
+ "epoch": 0.7753539992345962,
7709
+ "grad_norm": 7.679808616638184,
7710
+ "learning_rate": 2.9147670007900875e-06,
7711
+ "loss": 0.2636,
7712
+ "step": 1013
7713
+ },
7714
+ {
7715
+ "epoch": 0.7761194029850746,
7716
+ "grad_norm": 6.208285331726074,
7717
+ "learning_rate": 2.8959244127616483e-06,
7718
+ "loss": 0.3115,
7719
+ "step": 1014
7720
+ },
7721
+ {
7722
+ "epoch": 0.776884806735553,
7723
+ "grad_norm": 7.4330902099609375,
7724
+ "learning_rate": 2.877132609250303e-06,
7725
+ "loss": 0.2613,
7726
+ "step": 1015
7727
+ },
7728
+ {
7729
+ "epoch": 0.7776502104860313,
7730
+ "grad_norm": 7.014687538146973,
7731
+ "learning_rate": 2.8583917245919944e-06,
7732
+ "loss": 0.2428,
7733
+ "step": 1016
7734
+ },
7735
+ {
7736
+ "epoch": 0.7784156142365097,
7737
+ "grad_norm": 6.197044372558594,
7738
+ "learning_rate": 2.839701892758655e-06,
7739
+ "loss": 0.3242,
7740
+ "step": 1017
7741
+ },
7742
+ {
7743
+ "epoch": 0.7791810179869881,
7744
+ "grad_norm": 5.864729404449463,
7745
+ "learning_rate": 2.8210632473572664e-06,
7746
+ "loss": 0.3934,
7747
+ "step": 1018
7748
+ },
7749
+ {
7750
+ "epoch": 0.7799464217374665,
7751
+ "grad_norm": 9.904959678649902,
7752
+ "learning_rate": 2.8024759216288953e-06,
7753
+ "loss": 0.4493,
7754
+ "step": 1019
7755
+ },
7756
+ {
7757
+ "epoch": 0.7807118254879449,
7758
+ "grad_norm": 7.564253330230713,
7759
+ "learning_rate": 2.783940048447743e-06,
7760
+ "loss": 0.3237,
7761
+ "step": 1020
7762
+ },
7763
+ {
7764
+ "epoch": 0.7807118254879449,
7765
+ "eval_accuracy": 0.8898916967509025,
7766
+ "eval_f1": 0.8431876606683805,
7767
+ "eval_loss": 0.3033747375011444,
7768
+ "eval_precision": 0.8723404255319149,
7769
+ "eval_recall": 0.8159203980099502,
7770
+ "eval_runtime": 44.2093,
7771
+ "eval_samples_per_second": 6.809,
7772
+ "eval_steps_per_second": 0.226,
7773
+ "step": 1020
7774
+ },
7775
+ {
7776
+ "epoch": 0.7814772292384232,
7777
+ "grad_norm": 7.01906156539917,
7778
+ "learning_rate": 2.765455760320196e-06,
7779
+ "loss": 0.2956,
7780
+ "step": 1021
7781
+ },
7782
+ {
7783
+ "epoch": 0.7822426329889016,
7784
+ "grad_norm": 5.770253658294678,
7785
+ "learning_rate": 2.7470231893838684e-06,
7786
+ "loss": 0.2521,
7787
+ "step": 1022
7788
+ },
7789
+ {
7790
+ "epoch": 0.78300803673938,
7791
+ "grad_norm": 5.1977410316467285,
7792
+ "learning_rate": 2.728642467406679e-06,
7793
+ "loss": 0.2315,
7794
+ "step": 1023
7795
+ },
7796
+ {
7797
+ "epoch": 0.7837734404898584,
7798
+ "grad_norm": 6.585521697998047,
7799
+ "learning_rate": 2.7103137257858867e-06,
7800
+ "loss": 0.266,
7801
+ "step": 1024
7802
+ },
7803
+ {
7804
+ "epoch": 0.7845388442403368,
7805
+ "grad_norm": 4.882285118103027,
7806
+ "learning_rate": 2.692037095547164e-06,
7807
+ "loss": 0.2697,
7808
+ "step": 1025
7809
+ },
7810
+ {
7811
+ "epoch": 0.7853042479908151,
7812
+ "grad_norm": 5.839199542999268,
7813
+ "learning_rate": 2.6738127073436694e-06,
7814
+ "loss": 0.2287,
7815
+ "step": 1026
7816
+ },
7817
+ {
7818
+ "epoch": 0.7860696517412935,
7819
+ "grad_norm": 6.01020622253418,
7820
+ "learning_rate": 2.6556406914550803e-06,
7821
+ "loss": 0.2481,
7822
+ "step": 1027
7823
+ },
7824
+ {
7825
+ "epoch": 0.7868350554917719,
7826
+ "grad_norm": 6.746147632598877,
7827
+ "learning_rate": 2.6375211777867015e-06,
7828
+ "loss": 0.307,
7829
+ "step": 1028
7830
+ },
7831
+ {
7832
+ "epoch": 0.7876004592422503,
7833
+ "grad_norm": 7.418403148651123,
7834
+ "learning_rate": 2.6194542958685052e-06,
7835
+ "loss": 0.3297,
7836
+ "step": 1029
7837
+ },
7838
+ {
7839
+ "epoch": 0.7883658629927287,
7840
+ "grad_norm": 5.111098766326904,
7841
+ "learning_rate": 2.601440174854225e-06,
7842
+ "loss": 0.25,
7843
+ "step": 1030
7844
+ },
7845
+ {
7846
+ "epoch": 0.789131266743207,
7847
+ "grad_norm": 5.072177410125732,
7848
+ "learning_rate": 2.5834789435204245e-06,
7849
+ "loss": 0.217,
7850
+ "step": 1031
7851
+ },
7852
+ {
7853
+ "epoch": 0.7898966704936854,
7854
+ "grad_norm": 9.598026275634766,
7855
+ "learning_rate": 2.5655707302655766e-06,
7856
+ "loss": 0.2256,
7857
+ "step": 1032
7858
+ },
7859
+ {
7860
+ "epoch": 0.7906620742441638,
7861
+ "grad_norm": 5.46431303024292,
7862
+ "learning_rate": 2.5477156631091503e-06,
7863
+ "loss": 0.2236,
7864
+ "step": 1033
7865
+ },
7866
+ {
7867
+ "epoch": 0.7914274779946422,
7868
+ "grad_norm": 6.440191268920898,
7869
+ "learning_rate": 2.5299138696906833e-06,
7870
+ "loss": 0.2711,
7871
+ "step": 1034
7872
+ },
7873
+ {
7874
+ "epoch": 0.7921928817451206,
7875
+ "grad_norm": 6.173571586608887,
7876
+ "learning_rate": 2.512165477268889e-06,
7877
+ "loss": 0.2715,
7878
+ "step": 1035
7879
+ },
7880
+ {
7881
+ "epoch": 0.7929582854955989,
7882
+ "grad_norm": 9.205805778503418,
7883
+ "learning_rate": 2.4944706127207252e-06,
7884
+ "loss": 0.3408,
7885
+ "step": 1036
7886
+ },
7887
+ {
7888
+ "epoch": 0.7937236892460773,
7889
+ "grad_norm": 6.677053451538086,
7890
+ "learning_rate": 2.476829402540504e-06,
7891
+ "loss": 0.1972,
7892
+ "step": 1037
7893
+ },
7894
+ {
7895
+ "epoch": 0.7944890929965557,
7896
+ "grad_norm": 9.463765144348145,
7897
+ "learning_rate": 2.459241972838988e-06,
7898
+ "loss": 0.2984,
7899
+ "step": 1038
7900
+ },
7901
+ {
7902
+ "epoch": 0.7952544967470341,
7903
+ "grad_norm": 6.406791687011719,
7904
+ "learning_rate": 2.4417084493424693e-06,
7905
+ "loss": 0.2565,
7906
+ "step": 1039
7907
+ },
7908
+ {
7909
+ "epoch": 0.7960199004975125,
7910
+ "grad_norm": 8.168065071105957,
7911
+ "learning_rate": 2.4242289573918933e-06,
7912
+ "loss": 0.4034,
7913
+ "step": 1040
7914
+ },
7915
+ {
7916
+ "epoch": 0.7960199004975125,
7917
+ "eval_accuracy": 0.8898916967509025,
7918
+ "eval_f1": 0.8381962864721485,
7919
+ "eval_loss": 0.2988373935222626,
7920
+ "eval_precision": 0.8977272727272727,
7921
+ "eval_recall": 0.7860696517412935,
7922
+ "eval_runtime": 43.2157,
7923
+ "eval_samples_per_second": 6.965,
7924
+ "eval_steps_per_second": 0.231,
7925
+ "step": 1040
7926
+ },
7927
+ {
7928
+ "epoch": 0.7967853042479908,
7929
+ "grad_norm": 5.31250524520874,
7930
+ "learning_rate": 2.4068036219419433e-06,
7931
+ "loss": 0.2661,
7932
+ "step": 1041
7933
+ },
7934
+ {
7935
+ "epoch": 0.7975507079984692,
7936
+ "grad_norm": 15.13749885559082,
7937
+ "learning_rate": 2.3894325675601683e-06,
7938
+ "loss": 0.3992,
7939
+ "step": 1042
7940
+ },
7941
+ {
7942
+ "epoch": 0.7983161117489476,
7943
+ "grad_norm": 7.951029300689697,
7944
+ "learning_rate": 2.3721159184260733e-06,
7945
+ "loss": 0.3912,
7946
+ "step": 1043
7947
+ },
7948
+ {
7949
+ "epoch": 0.799081515499426,
7950
+ "grad_norm": 7.845716953277588,
7951
+ "learning_rate": 2.354853798330242e-06,
7952
+ "loss": 0.2465,
7953
+ "step": 1044
7954
+ },
7955
+ {
7956
+ "epoch": 0.7998469192499044,
7957
+ "grad_norm": 10.499722480773926,
7958
+ "learning_rate": 2.3376463306734543e-06,
7959
+ "loss": 0.4171,
7960
+ "step": 1045
7961
+ },
7962
+ {
7963
+ "epoch": 0.8006123230003827,
7964
+ "grad_norm": 7.074059009552002,
7965
+ "learning_rate": 2.3204936384657873e-06,
7966
+ "loss": 0.333,
7967
+ "step": 1046
7968
+ },
7969
+ {
7970
+ "epoch": 0.801377726750861,
7971
+ "grad_norm": 5.240701198577881,
7972
+ "learning_rate": 2.303395844325761e-06,
7973
+ "loss": 0.2089,
7974
+ "step": 1047
7975
+ },
7976
+ {
7977
+ "epoch": 0.8021431305013395,
7978
+ "grad_norm": 6.376049041748047,
7979
+ "learning_rate": 2.2863530704794334e-06,
7980
+ "loss": 0.2528,
7981
+ "step": 1048
7982
+ },
7983
+ {
7984
+ "epoch": 0.8029085342518179,
7985
+ "grad_norm": 6.376869201660156,
7986
+ "learning_rate": 2.26936543875956e-06,
7987
+ "loss": 0.2357,
7988
+ "step": 1049
7989
+ },
7990
+ {
7991
+ "epoch": 0.8036739380022963,
7992
+ "grad_norm": 7.007830619812012,
7993
+ "learning_rate": 2.252433070604695e-06,
7994
+ "loss": 0.3034,
7995
+ "step": 1050
7996
+ },
7997
+ {
7998
+ "epoch": 0.8044393417527745,
7999
+ "grad_norm": 5.573482990264893,
8000
+ "learning_rate": 2.2355560870583283e-06,
8001
+ "loss": 0.2589,
8002
+ "step": 1051
8003
+ },
8004
+ {
8005
+ "epoch": 0.8052047455032529,
8006
+ "grad_norm": 7.182730674743652,
8007
+ "learning_rate": 2.2187346087680363e-06,
8008
+ "loss": 0.2601,
8009
+ "step": 1052
8010
+ },
8011
+ {
8012
+ "epoch": 0.8059701492537313,
8013
+ "grad_norm": 6.265502452850342,
8014
+ "learning_rate": 2.201968755984596e-06,
8015
+ "loss": 0.2131,
8016
+ "step": 1053
8017
+ },
8018
+ {
8019
+ "epoch": 0.8067355530042097,
8020
+ "grad_norm": 6.296632289886475,
8021
+ "learning_rate": 2.185258648561147e-06,
8022
+ "loss": 0.2377,
8023
+ "step": 1054
8024
+ },
8025
+ {
8026
+ "epoch": 0.8075009567546881,
8027
+ "grad_norm": 6.393189430236816,
8028
+ "learning_rate": 2.1686044059523192e-06,
8029
+ "loss": 0.3424,
8030
+ "step": 1055
8031
+ },
8032
+ {
8033
+ "epoch": 0.8082663605051664,
8034
+ "grad_norm": 5.018173694610596,
8035
+ "learning_rate": 2.1520061472133903e-06,
8036
+ "loss": 0.2154,
8037
+ "step": 1056
8038
+ },
8039
+ {
8040
+ "epoch": 0.8090317642556448,
8041
+ "grad_norm": 6.2258687019348145,
8042
+ "learning_rate": 2.1354639909994258e-06,
8043
+ "loss": 0.2585,
8044
+ "step": 1057
8045
+ },
8046
+ {
8047
+ "epoch": 0.8097971680061232,
8048
+ "grad_norm": 6.394412040710449,
8049
+ "learning_rate": 2.1189780555644302e-06,
8050
+ "loss": 0.3103,
8051
+ "step": 1058
8052
+ },
8053
+ {
8054
+ "epoch": 0.8105625717566016,
8055
+ "grad_norm": 6.619604110717773,
8056
+ "learning_rate": 2.1025484587605115e-06,
8057
+ "loss": 0.4228,
8058
+ "step": 1059
8059
+ },
8060
+ {
8061
+ "epoch": 0.81132797550708,
8062
+ "grad_norm": 6.956901550292969,
8063
+ "learning_rate": 2.0861753180370324e-06,
8064
+ "loss": 0.2682,
8065
+ "step": 1060
8066
+ },
8067
+ {
8068
+ "epoch": 0.81132797550708,
8069
+ "eval_accuracy": 0.8844765342960289,
8070
+ "eval_f1": 0.8350515463917526,
8071
+ "eval_loss": 0.3001127541065216,
8072
+ "eval_precision": 0.8663101604278075,
8073
+ "eval_recall": 0.8059701492537313,
8074
+ "eval_runtime": 43.5926,
8075
+ "eval_samples_per_second": 6.905,
8076
+ "eval_steps_per_second": 0.229,
8077
+ "step": 1060
8078
+ },
8079
+ {
8080
+ "epoch": 0.8120933792575583,
8081
+ "grad_norm": 7.195816993713379,
8082
+ "learning_rate": 2.0698587504397684e-06,
8083
+ "loss": 0.3363,
8084
+ "step": 1061
8085
+ },
8086
+ {
8087
+ "epoch": 0.8128587830080367,
8088
+ "grad_norm": 7.6192498207092285,
8089
+ "learning_rate": 2.0535988726100774e-06,
8090
+ "loss": 0.3632,
8091
+ "step": 1062
8092
+ },
8093
+ {
8094
+ "epoch": 0.8136241867585151,
8095
+ "grad_norm": 8.436617851257324,
8096
+ "learning_rate": 2.0373958007840545e-06,
8097
+ "loss": 0.3416,
8098
+ "step": 1063
8099
+ },
8100
+ {
8101
+ "epoch": 0.8143895905089935,
8102
+ "grad_norm": 8.453132629394531,
8103
+ "learning_rate": 2.0212496507917214e-06,
8104
+ "loss": 0.2447,
8105
+ "step": 1064
8106
+ },
8107
+ {
8108
+ "epoch": 0.8151549942594719,
8109
+ "grad_norm": 11.446274757385254,
8110
+ "learning_rate": 2.0051605380561702e-06,
8111
+ "loss": 0.3519,
8112
+ "step": 1065
8113
+ },
8114
+ {
8115
+ "epoch": 0.8159203980099502,
8116
+ "grad_norm": 5.6378254890441895,
8117
+ "learning_rate": 1.9891285775927684e-06,
8118
+ "loss": 0.2777,
8119
+ "step": 1066
8120
+ },
8121
+ {
8122
+ "epoch": 0.8166858017604286,
8123
+ "grad_norm": 11.645222663879395,
8124
+ "learning_rate": 1.973153884008312e-06,
8125
+ "loss": 0.2887,
8126
+ "step": 1067
8127
+ },
8128
+ {
8129
+ "epoch": 0.817451205510907,
8130
+ "grad_norm": 5.4502434730529785,
8131
+ "learning_rate": 1.957236571500224e-06,
8132
+ "loss": 0.2295,
8133
+ "step": 1068
8134
+ },
8135
+ {
8136
+ "epoch": 0.8182166092613854,
8137
+ "grad_norm": 5.929233074188232,
8138
+ "learning_rate": 1.941376753855728e-06,
8139
+ "loss": 0.2573,
8140
+ "step": 1069
8141
+ },
8142
+ {
8143
+ "epoch": 0.8189820130118638,
8144
+ "grad_norm": 7.02168083190918,
8145
+ "learning_rate": 1.925574544451031e-06,
8146
+ "loss": 0.3507,
8147
+ "step": 1070
8148
+ },
8149
+ {
8150
+ "epoch": 0.8197474167623421,
8151
+ "grad_norm": 6.753659248352051,
8152
+ "learning_rate": 1.9098300562505266e-06,
8153
+ "loss": 0.3882,
8154
+ "step": 1071
8155
+ },
8156
+ {
8157
+ "epoch": 0.8205128205128205,
8158
+ "grad_norm": 7.252670764923096,
8159
+ "learning_rate": 1.8941434018059779e-06,
8160
+ "loss": 0.3117,
8161
+ "step": 1072
8162
+ },
8163
+ {
8164
+ "epoch": 0.8212782242632989,
8165
+ "grad_norm": 4.297603130340576,
8166
+ "learning_rate": 1.878514693255714e-06,
8167
+ "loss": 0.2392,
8168
+ "step": 1073
8169
+ },
8170
+ {
8171
+ "epoch": 0.8220436280137773,
8172
+ "grad_norm": 8.13119125366211,
8173
+ "learning_rate": 1.8629440423238333e-06,
8174
+ "loss": 0.3269,
8175
+ "step": 1074
8176
+ },
8177
+ {
8178
+ "epoch": 0.8228090317642557,
8179
+ "grad_norm": 8.53504753112793,
8180
+ "learning_rate": 1.8474315603193916e-06,
8181
+ "loss": 0.3209,
8182
+ "step": 1075
8183
+ },
8184
+ {
8185
+ "epoch": 0.823574435514734,
8186
+ "grad_norm": 7.265506267547607,
8187
+ "learning_rate": 1.8319773581356248e-06,
8188
+ "loss": 0.3279,
8189
+ "step": 1076
8190
+ },
8191
+ {
8192
+ "epoch": 0.8243398392652124,
8193
+ "grad_norm": 6.86147928237915,
8194
+ "learning_rate": 1.8165815462491466e-06,
8195
+ "loss": 0.2692,
8196
+ "step": 1077
8197
+ },
8198
+ {
8199
+ "epoch": 0.8251052430156908,
8200
+ "grad_norm": 7.1608686447143555,
8201
+ "learning_rate": 1.8012442347191483e-06,
8202
+ "loss": 0.3593,
8203
+ "step": 1078
8204
+ },
8205
+ {
8206
+ "epoch": 0.8258706467661692,
8207
+ "grad_norm": 5.432136535644531,
8208
+ "learning_rate": 1.7859655331866422e-06,
8209
+ "loss": 0.3488,
8210
+ "step": 1079
8211
+ },
8212
+ {
8213
+ "epoch": 0.8266360505166476,
8214
+ "grad_norm": 8.375764846801758,
8215
+ "learning_rate": 1.7707455508736381e-06,
8216
+ "loss": 0.2921,
8217
+ "step": 1080
8218
+ },
8219
+ {
8220
+ "epoch": 0.8266360505166476,
8221
+ "eval_accuracy": 0.8844765342960289,
8222
+ "eval_f1": 0.8324607329842932,
8223
+ "eval_loss": 0.29819196462631226,
8224
+ "eval_precision": 0.8784530386740331,
8225
+ "eval_recall": 0.7910447761194029,
8226
+ "eval_runtime": 43.9709,
8227
+ "eval_samples_per_second": 6.845,
8228
+ "eval_steps_per_second": 0.227,
8229
+ "step": 1080
8230
+ },
8231
+ {
8232
+ "epoch": 0.8274014542671259,
8233
+ "grad_norm": 5.469040870666504,
8234
+ "learning_rate": 1.7555843965823992e-06,
8235
+ "loss": 0.2931,
8236
+ "step": 1081
8237
+ },
8238
+ {
8239
+ "epoch": 0.8281668580176043,
8240
+ "grad_norm": 6.6713409423828125,
8241
+ "learning_rate": 1.7404821786946346e-06,
8242
+ "loss": 0.3362,
8243
+ "step": 1082
8244
+ },
8245
+ {
8246
+ "epoch": 0.8289322617680827,
8247
+ "grad_norm": 7.332741737365723,
8248
+ "learning_rate": 1.725439005170747e-06,
8249
+ "loss": 0.2507,
8250
+ "step": 1083
8251
+ },
8252
+ {
8253
+ "epoch": 0.8296976655185611,
8254
+ "grad_norm": 8.208507537841797,
8255
+ "learning_rate": 1.7104549835490491e-06,
8256
+ "loss": 0.3192,
8257
+ "step": 1084
8258
+ },
8259
+ {
8260
+ "epoch": 0.8304630692690395,
8261
+ "grad_norm": 6.0084967613220215,
8262
+ "learning_rate": 1.6955302209449987e-06,
8263
+ "loss": 0.3366,
8264
+ "step": 1085
8265
+ },
8266
+ {
8267
+ "epoch": 0.8312284730195177,
8268
+ "grad_norm": 6.15051794052124,
8269
+ "learning_rate": 1.680664824050432e-06,
8270
+ "loss": 0.3023,
8271
+ "step": 1086
8272
+ },
8273
+ {
8274
+ "epoch": 0.8319938767699961,
8275
+ "grad_norm": 8.824700355529785,
8276
+ "learning_rate": 1.6658588991327962e-06,
8277
+ "loss": 0.2097,
8278
+ "step": 1087
8279
+ },
8280
+ {
8281
+ "epoch": 0.8327592805204745,
8282
+ "grad_norm": 4.843833923339844,
8283
+ "learning_rate": 1.6511125520344007e-06,
8284
+ "loss": 0.2767,
8285
+ "step": 1088
8286
+ },
8287
+ {
8288
+ "epoch": 0.8335246842709529,
8289
+ "grad_norm": 4.750216007232666,
8290
+ "learning_rate": 1.636425888171652e-06,
8291
+ "loss": 0.2911,
8292
+ "step": 1089
8293
+ },
8294
+ {
8295
+ "epoch": 0.8342900880214313,
8296
+ "grad_norm": 4.159714698791504,
8297
+ "learning_rate": 1.6217990125342964e-06,
8298
+ "loss": 0.2666,
8299
+ "step": 1090
8300
+ },
8301
+ {
8302
+ "epoch": 0.8350554917719096,
8303
+ "grad_norm": 9.31843090057373,
8304
+ "learning_rate": 1.6072320296846898e-06,
8305
+ "loss": 0.2472,
8306
+ "step": 1091
8307
+ },
8308
+ {
8309
+ "epoch": 0.835820895522388,
8310
+ "grad_norm": 11.698112487792969,
8311
+ "learning_rate": 1.5927250437570197e-06,
8312
+ "loss": 0.2629,
8313
+ "step": 1092
8314
+ },
8315
+ {
8316
+ "epoch": 0.8365862992728664,
8317
+ "grad_norm": 6.662525653839111,
8318
+ "learning_rate": 1.5782781584565854e-06,
8319
+ "loss": 0.3005,
8320
+ "step": 1093
8321
+ },
8322
+ {
8323
+ "epoch": 0.8373517030233448,
8324
+ "grad_norm": 6.270053386688232,
8325
+ "learning_rate": 1.5638914770590508e-06,
8326
+ "loss": 0.2998,
8327
+ "step": 1094
8328
+ },
8329
+ {
8330
+ "epoch": 0.8381171067738232,
8331
+ "grad_norm": 8.17238712310791,
8332
+ "learning_rate": 1.5495651024096925e-06,
8333
+ "loss": 0.4168,
8334
+ "step": 1095
8335
+ },
8336
+ {
8337
+ "epoch": 0.8388825105243015,
8338
+ "grad_norm": 7.660524845123291,
8339
+ "learning_rate": 1.5352991369226865e-06,
8340
+ "loss": 0.2949,
8341
+ "step": 1096
8342
+ },
8343
+ {
8344
+ "epoch": 0.8396479142747799,
8345
+ "grad_norm": 6.679647922515869,
8346
+ "learning_rate": 1.5210936825803602e-06,
8347
+ "loss": 0.3089,
8348
+ "step": 1097
8349
+ },
8350
+ {
8351
+ "epoch": 0.8404133180252583,
8352
+ "grad_norm": 6.309601306915283,
8353
+ "learning_rate": 1.5069488409324696e-06,
8354
+ "loss": 0.2447,
8355
+ "step": 1098
8356
+ },
8357
+ {
8358
+ "epoch": 0.8411787217757367,
8359
+ "grad_norm": 6.660057544708252,
8360
+ "learning_rate": 1.4928647130954743e-06,
8361
+ "loss": 0.2332,
8362
+ "step": 1099
8363
+ },
8364
+ {
8365
+ "epoch": 0.8419441255262151,
8366
+ "grad_norm": 7.898063659667969,
8367
+ "learning_rate": 1.4788413997518026e-06,
8368
+ "loss": 0.3732,
8369
+ "step": 1100
8370
+ },
8371
+ {
8372
+ "epoch": 0.8419441255262151,
8373
+ "eval_accuracy": 0.8790613718411552,
8374
+ "eval_f1": 0.8277634961439588,
8375
+ "eval_loss": 0.3002900779247284,
8376
+ "eval_precision": 0.8563829787234043,
8377
+ "eval_recall": 0.8009950248756219,
8378
+ "eval_runtime": 42.9938,
8379
+ "eval_samples_per_second": 7.001,
8380
+ "eval_steps_per_second": 0.233,
8381
+ "step": 1100
8382
  }
8383
  ],
8384
  "logging_steps": 1,
 
8398
  "attributes": {}
8399
  }
8400
  },
8401
+ "total_flos": 1.6907983391188582e+17,
8402
  "train_batch_size": 8,
8403
  "trial_name": null,
8404
  "trial_params": null