fats-fme commited on
Commit
0fffa4d
1 Parent(s): 2aa689d

Training in progress, step 166, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:183bec216f162855d8196fd7cf94fded4640ff06d48effb208c6796a7c31450f
3
  size 216151256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9e7aff42b36fe14e95ece06193160112474b8a29fc3680ce273c922ca5686f6
3
  size 216151256
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37f0da2f48deae95a8eac8e2ea0c5f79ebb6089ae60fae8a45e2a45959193cab
3
  size 432640054
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e21f976a284dd81c64396ec6b6206079943029f7c09ac486e503562b06e47e6
3
  size 432640054
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da219fe1bf032ad9359b76003d71096a223611f94927e798bf577253282a8180
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb3607b5839cda7054779e8f957cbf2db3456879873cc4e34eac04cbf33f5db8
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23f39055b0ed21e2804d587dafb8a5710bb91a89aeb68f9ee9a9bdecb4f6c223
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd635f6ad590a43a7a075b3fb4377adaa95cf2d835f115014607cf181d2b6449
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2300eb0a85a826f84a38ebe148c80a476986dcda0381a18dc5644264e1ec5bb
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3070c5337425657c2fec031251a5e4e8042c43dd7a5d4d7f77fa453b02282be
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.25009416195856876,
5
  "eval_steps": 83,
6
- "global_step": 83,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -604,6 +604,595 @@
604
  "eval_samples_per_second": 6.015,
605
  "eval_steps_per_second": 1.506,
606
  "step": 83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
607
  }
608
  ],
609
  "logging_steps": 1,
@@ -623,7 +1212,7 @@
623
  "attributes": {}
624
  }
625
  },
626
- "total_flos": 2.73439358382506e+17,
627
  "train_batch_size": 2,
628
  "trial_name": null,
629
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5001883239171375,
5
  "eval_steps": 83,
6
+ "global_step": 166,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
604
  "eval_samples_per_second": 6.015,
605
  "eval_steps_per_second": 1.506,
606
  "step": 83
607
+ },
608
+ {
609
+ "epoch": 0.25310734463276835,
610
+ "grad_norm": 1.4990551471710205,
611
+ "learning_rate": 9.643097751841854e-05,
612
+ "loss": 0.5189,
613
+ "step": 84
614
+ },
615
+ {
616
+ "epoch": 0.256120527306968,
617
+ "grad_norm": 1.4080356359481812,
618
+ "learning_rate": 9.622067162048112e-05,
619
+ "loss": 0.5112,
620
+ "step": 85
621
+ },
622
+ {
623
+ "epoch": 0.2591337099811676,
624
+ "grad_norm": 1.3661057949066162,
625
+ "learning_rate": 9.600458850350588e-05,
626
+ "loss": 0.4688,
627
+ "step": 86
628
+ },
629
+ {
630
+ "epoch": 0.2621468926553672,
631
+ "grad_norm": 1.570552945137024,
632
+ "learning_rate": 9.578275517617645e-05,
633
+ "loss": 0.5058,
634
+ "step": 87
635
+ },
636
+ {
637
+ "epoch": 0.26516007532956687,
638
+ "grad_norm": 1.6037708520889282,
639
+ "learning_rate": 9.555519936590738e-05,
640
+ "loss": 0.5201,
641
+ "step": 88
642
+ },
643
+ {
644
+ "epoch": 0.26817325800376646,
645
+ "grad_norm": 1.5268930196762085,
646
+ "learning_rate": 9.532194951537838e-05,
647
+ "loss": 0.4661,
648
+ "step": 89
649
+ },
650
+ {
651
+ "epoch": 0.2711864406779661,
652
+ "grad_norm": 1.7837523221969604,
653
+ "learning_rate": 9.508303477897924e-05,
654
+ "loss": 0.5005,
655
+ "step": 90
656
+ },
657
+ {
658
+ "epoch": 0.27419962335216574,
659
+ "grad_norm": 1.3590326309204102,
660
+ "learning_rate": 9.483848501916578e-05,
661
+ "loss": 0.3866,
662
+ "step": 91
663
+ },
664
+ {
665
+ "epoch": 0.27721280602636533,
666
+ "grad_norm": 1.5031671524047852,
667
+ "learning_rate": 9.458833080272722e-05,
668
+ "loss": 0.3559,
669
+ "step": 92
670
+ },
671
+ {
672
+ "epoch": 0.280225988700565,
673
+ "grad_norm": 1.2212880849838257,
674
+ "learning_rate": 9.433260339696563e-05,
675
+ "loss": 0.3586,
676
+ "step": 93
677
+ },
678
+ {
679
+ "epoch": 0.2832391713747646,
680
+ "grad_norm": 1.8385019302368164,
681
+ "learning_rate": 9.407133476578778e-05,
682
+ "loss": 0.4775,
683
+ "step": 94
684
+ },
685
+ {
686
+ "epoch": 0.2862523540489642,
687
+ "grad_norm": 2.6899161338806152,
688
+ "learning_rate": 9.38045575657098e-05,
689
+ "loss": 0.6809,
690
+ "step": 95
691
+ },
692
+ {
693
+ "epoch": 0.28926553672316385,
694
+ "grad_norm": 3.9981398582458496,
695
+ "learning_rate": 9.353230514177552e-05,
696
+ "loss": 0.8967,
697
+ "step": 96
698
+ },
699
+ {
700
+ "epoch": 0.29227871939736344,
701
+ "grad_norm": 3.7616143226623535,
702
+ "learning_rate": 9.325461152338846e-05,
703
+ "loss": 0.9173,
704
+ "step": 97
705
+ },
706
+ {
707
+ "epoch": 0.2952919020715631,
708
+ "grad_norm": 3.3938989639282227,
709
+ "learning_rate": 9.297151142005851e-05,
710
+ "loss": 0.7849,
711
+ "step": 98
712
+ },
713
+ {
714
+ "epoch": 0.2983050847457627,
715
+ "grad_norm": 3.3373446464538574,
716
+ "learning_rate": 9.268304021706349e-05,
717
+ "loss": 0.6619,
718
+ "step": 99
719
+ },
720
+ {
721
+ "epoch": 0.3013182674199623,
722
+ "grad_norm": 4.476459503173828,
723
+ "learning_rate": 9.23892339710263e-05,
724
+ "loss": 0.7758,
725
+ "step": 100
726
+ },
727
+ {
728
+ "epoch": 0.30433145009416196,
729
+ "grad_norm": 2.75358510017395,
730
+ "learning_rate": 9.209012940540805e-05,
731
+ "loss": 0.7565,
732
+ "step": 101
733
+ },
734
+ {
735
+ "epoch": 0.3073446327683616,
736
+ "grad_norm": 2.192662000656128,
737
+ "learning_rate": 9.178576390591802e-05,
738
+ "loss": 0.6634,
739
+ "step": 102
740
+ },
741
+ {
742
+ "epoch": 0.3103578154425612,
743
+ "grad_norm": 2.3334836959838867,
744
+ "learning_rate": 9.147617551584066e-05,
745
+ "loss": 0.6961,
746
+ "step": 103
747
+ },
748
+ {
749
+ "epoch": 0.31337099811676083,
750
+ "grad_norm": 1.9057625532150269,
751
+ "learning_rate": 9.116140293128051e-05,
752
+ "loss": 0.5762,
753
+ "step": 104
754
+ },
755
+ {
756
+ "epoch": 0.3163841807909605,
757
+ "grad_norm": 1.5543274879455566,
758
+ "learning_rate": 9.084148549632547e-05,
759
+ "loss": 0.5249,
760
+ "step": 105
761
+ },
762
+ {
763
+ "epoch": 0.31939736346516007,
764
+ "grad_norm": 1.3116902112960815,
765
+ "learning_rate": 9.051646319812918e-05,
766
+ "loss": 0.4895,
767
+ "step": 106
768
+ },
769
+ {
770
+ "epoch": 0.3224105461393597,
771
+ "grad_norm": 1.6137094497680664,
772
+ "learning_rate": 9.018637666191283e-05,
773
+ "loss": 0.5036,
774
+ "step": 107
775
+ },
776
+ {
777
+ "epoch": 0.3254237288135593,
778
+ "grad_norm": 1.4955766201019287,
779
+ "learning_rate": 8.985126714588738e-05,
780
+ "loss": 0.4571,
781
+ "step": 108
782
+ },
783
+ {
784
+ "epoch": 0.32843691148775894,
785
+ "grad_norm": 1.5371748208999634,
786
+ "learning_rate": 8.951117653609666e-05,
787
+ "loss": 0.4958,
788
+ "step": 109
789
+ },
790
+ {
791
+ "epoch": 0.3314500941619586,
792
+ "grad_norm": 1.2266839742660522,
793
+ "learning_rate": 8.916614734118184e-05,
794
+ "loss": 0.4171,
795
+ "step": 110
796
+ },
797
+ {
798
+ "epoch": 0.3344632768361582,
799
+ "grad_norm": 1.21657133102417,
800
+ "learning_rate": 8.881622268706825e-05,
801
+ "loss": 0.421,
802
+ "step": 111
803
+ },
804
+ {
805
+ "epoch": 0.3374764595103578,
806
+ "grad_norm": 1.2184901237487793,
807
+ "learning_rate": 8.8461446311575e-05,
808
+ "loss": 0.4307,
809
+ "step": 112
810
+ },
811
+ {
812
+ "epoch": 0.34048964218455746,
813
+ "grad_norm": 1.5124021768569946,
814
+ "learning_rate": 8.810186255894803e-05,
815
+ "loss": 0.4865,
816
+ "step": 113
817
+ },
818
+ {
819
+ "epoch": 0.34350282485875705,
820
+ "grad_norm": 1.078994870185852,
821
+ "learning_rate": 8.773751637431748e-05,
822
+ "loss": 0.3592,
823
+ "step": 114
824
+ },
825
+ {
826
+ "epoch": 0.3465160075329567,
827
+ "grad_norm": 1.2173560857772827,
828
+ "learning_rate": 8.736845329807993e-05,
829
+ "loss": 0.3757,
830
+ "step": 115
831
+ },
832
+ {
833
+ "epoch": 0.3495291902071563,
834
+ "grad_norm": 1.4223103523254395,
835
+ "learning_rate": 8.69947194602061e-05,
836
+ "loss": 0.4002,
837
+ "step": 116
838
+ },
839
+ {
840
+ "epoch": 0.3525423728813559,
841
+ "grad_norm": 1.2369580268859863,
842
+ "learning_rate": 8.66163615744751e-05,
843
+ "loss": 0.3891,
844
+ "step": 117
845
+ },
846
+ {
847
+ "epoch": 0.35555555555555557,
848
+ "grad_norm": 1.2306034564971924,
849
+ "learning_rate": 8.623342693263548e-05,
850
+ "loss": 0.3176,
851
+ "step": 118
852
+ },
853
+ {
854
+ "epoch": 0.35856873822975516,
855
+ "grad_norm": 1.20809805393219,
856
+ "learning_rate": 8.584596339849417e-05,
857
+ "loss": 0.3715,
858
+ "step": 119
859
+ },
860
+ {
861
+ "epoch": 0.3615819209039548,
862
+ "grad_norm": 1.59524405002594,
863
+ "learning_rate": 8.545401940193392e-05,
864
+ "loss": 0.4539,
865
+ "step": 120
866
+ },
867
+ {
868
+ "epoch": 0.36459510357815444,
869
+ "grad_norm": 2.4288361072540283,
870
+ "learning_rate": 8.505764393285984e-05,
871
+ "loss": 0.7094,
872
+ "step": 121
873
+ },
874
+ {
875
+ "epoch": 0.36760828625235403,
876
+ "grad_norm": 2.587125778198242,
877
+ "learning_rate": 8.46568865350762e-05,
878
+ "loss": 0.7052,
879
+ "step": 122
880
+ },
881
+ {
882
+ "epoch": 0.3706214689265537,
883
+ "grad_norm": 3.610764980316162,
884
+ "learning_rate": 8.425179730009368e-05,
885
+ "loss": 0.6835,
886
+ "step": 123
887
+ },
888
+ {
889
+ "epoch": 0.3736346516007533,
890
+ "grad_norm": 2.254451274871826,
891
+ "learning_rate": 8.384242686086848e-05,
892
+ "loss": 0.5733,
893
+ "step": 124
894
+ },
895
+ {
896
+ "epoch": 0.3766478342749529,
897
+ "grad_norm": 3.2182092666625977,
898
+ "learning_rate": 8.342882638547351e-05,
899
+ "loss": 0.7416,
900
+ "step": 125
901
+ },
902
+ {
903
+ "epoch": 0.37966101694915255,
904
+ "grad_norm": 2.0895962715148926,
905
+ "learning_rate": 8.301104757070274e-05,
906
+ "loss": 0.611,
907
+ "step": 126
908
+ },
909
+ {
910
+ "epoch": 0.38267419962335214,
911
+ "grad_norm": 1.9307582378387451,
912
+ "learning_rate": 8.258914263560971e-05,
913
+ "loss": 0.6099,
914
+ "step": 127
915
+ },
916
+ {
917
+ "epoch": 0.3856873822975518,
918
+ "grad_norm": 1.7885206937789917,
919
+ "learning_rate": 8.216316431498028e-05,
920
+ "loss": 0.4832,
921
+ "step": 128
922
+ },
923
+ {
924
+ "epoch": 0.3887005649717514,
925
+ "grad_norm": 1.2265185117721558,
926
+ "learning_rate": 8.173316585274145e-05,
927
+ "loss": 0.4042,
928
+ "step": 129
929
+ },
930
+ {
931
+ "epoch": 0.391713747645951,
932
+ "grad_norm": 1.369534969329834,
933
+ "learning_rate": 8.129920099530607e-05,
934
+ "loss": 0.4681,
935
+ "step": 130
936
+ },
937
+ {
938
+ "epoch": 0.39472693032015066,
939
+ "grad_norm": 1.340951681137085,
940
+ "learning_rate": 8.086132398485524e-05,
941
+ "loss": 0.4775,
942
+ "step": 131
943
+ },
944
+ {
945
+ "epoch": 0.3977401129943503,
946
+ "grad_norm": 1.1047234535217285,
947
+ "learning_rate": 8.041958955255814e-05,
948
+ "loss": 0.4508,
949
+ "step": 132
950
+ },
951
+ {
952
+ "epoch": 0.4007532956685499,
953
+ "grad_norm": 1.0403156280517578,
954
+ "learning_rate": 7.99740529117313e-05,
955
+ "loss": 0.4217,
956
+ "step": 133
957
+ },
958
+ {
959
+ "epoch": 0.40376647834274954,
960
+ "grad_norm": 0.9500618577003479,
961
+ "learning_rate": 7.952476975093729e-05,
962
+ "loss": 0.34,
963
+ "step": 134
964
+ },
965
+ {
966
+ "epoch": 0.4067796610169492,
967
+ "grad_norm": 1.1021428108215332,
968
+ "learning_rate": 7.907179622702408e-05,
969
+ "loss": 0.392,
970
+ "step": 135
971
+ },
972
+ {
973
+ "epoch": 0.40979284369114877,
974
+ "grad_norm": 1.2623156309127808,
975
+ "learning_rate": 7.861518895810596e-05,
976
+ "loss": 0.4238,
977
+ "step": 136
978
+ },
979
+ {
980
+ "epoch": 0.4128060263653484,
981
+ "grad_norm": 1.395652413368225,
982
+ "learning_rate": 7.815500501648653e-05,
983
+ "loss": 0.4211,
984
+ "step": 137
985
+ },
986
+ {
987
+ "epoch": 0.415819209039548,
988
+ "grad_norm": 1.3175368309020996,
989
+ "learning_rate": 7.769130192152538e-05,
990
+ "loss": 0.415,
991
+ "step": 138
992
+ },
993
+ {
994
+ "epoch": 0.41883239171374764,
995
+ "grad_norm": 1.3882197141647339,
996
+ "learning_rate": 7.722413763244838e-05,
997
+ "loss": 0.422,
998
+ "step": 139
999
+ },
1000
+ {
1001
+ "epoch": 0.4218455743879473,
1002
+ "grad_norm": 1.396023154258728,
1003
+ "learning_rate": 7.675357054110336e-05,
1004
+ "loss": 0.466,
1005
+ "step": 140
1006
+ },
1007
+ {
1008
+ "epoch": 0.4248587570621469,
1009
+ "grad_norm": 1.0779083967208862,
1010
+ "learning_rate": 7.627965946466166e-05,
1011
+ "loss": 0.3576,
1012
+ "step": 141
1013
+ },
1014
+ {
1015
+ "epoch": 0.4278719397363465,
1016
+ "grad_norm": 1.2511008977890015,
1017
+ "learning_rate": 7.580246363826621e-05,
1018
+ "loss": 0.301,
1019
+ "step": 142
1020
+ },
1021
+ {
1022
+ "epoch": 0.43088512241054616,
1023
+ "grad_norm": 1.13119375705719,
1024
+ "learning_rate": 7.532204270762786e-05,
1025
+ "loss": 0.3332,
1026
+ "step": 143
1027
+ },
1028
+ {
1029
+ "epoch": 0.43389830508474575,
1030
+ "grad_norm": 2.0195682048797607,
1031
+ "learning_rate": 7.483845672156998e-05,
1032
+ "loss": 0.6475,
1033
+ "step": 144
1034
+ },
1035
+ {
1036
+ "epoch": 0.4369114877589454,
1037
+ "grad_norm": 2.429945230484009,
1038
+ "learning_rate": 7.435176612452286e-05,
1039
+ "loss": 0.7177,
1040
+ "step": 145
1041
+ },
1042
+ {
1043
+ "epoch": 0.439924670433145,
1044
+ "grad_norm": 3.0756828784942627,
1045
+ "learning_rate": 7.386203174896872e-05,
1046
+ "loss": 0.741,
1047
+ "step": 146
1048
+ },
1049
+ {
1050
+ "epoch": 0.4429378531073446,
1051
+ "grad_norm": 3.7236998081207275,
1052
+ "learning_rate": 7.336931480783801e-05,
1053
+ "loss": 0.7999,
1054
+ "step": 147
1055
+ },
1056
+ {
1057
+ "epoch": 0.44595103578154427,
1058
+ "grad_norm": 2.7121517658233643,
1059
+ "learning_rate": 7.287367688685835e-05,
1060
+ "loss": 0.6044,
1061
+ "step": 148
1062
+ },
1063
+ {
1064
+ "epoch": 0.44896421845574386,
1065
+ "grad_norm": 3.661588668823242,
1066
+ "learning_rate": 7.237517993685678e-05,
1067
+ "loss": 0.5553,
1068
+ "step": 149
1069
+ },
1070
+ {
1071
+ "epoch": 0.4519774011299435,
1072
+ "grad_norm": 4.68520975112915,
1073
+ "learning_rate": 7.187388626601637e-05,
1074
+ "loss": 0.411,
1075
+ "step": 150
1076
+ },
1077
+ {
1078
+ "epoch": 0.45499058380414314,
1079
+ "grad_norm": 1.866217017173767,
1080
+ "learning_rate": 7.136985853208824e-05,
1081
+ "loss": 0.5442,
1082
+ "step": 151
1083
+ },
1084
+ {
1085
+ "epoch": 0.45800376647834273,
1086
+ "grad_norm": 1.6526014804840088,
1087
+ "learning_rate": 7.086315973455981e-05,
1088
+ "loss": 0.5071,
1089
+ "step": 152
1090
+ },
1091
+ {
1092
+ "epoch": 0.4610169491525424,
1093
+ "grad_norm": 1.3213937282562256,
1094
+ "learning_rate": 7.035385320678036e-05,
1095
+ "loss": 0.4598,
1096
+ "step": 153
1097
+ },
1098
+ {
1099
+ "epoch": 0.464030131826742,
1100
+ "grad_norm": 0.959452211856842,
1101
+ "learning_rate": 6.984200260804484e-05,
1102
+ "loss": 0.3485,
1103
+ "step": 154
1104
+ },
1105
+ {
1106
+ "epoch": 0.4670433145009416,
1107
+ "grad_norm": 1.0355703830718994,
1108
+ "learning_rate": 6.932767191563703e-05,
1109
+ "loss": 0.3648,
1110
+ "step": 155
1111
+ },
1112
+ {
1113
+ "epoch": 0.47005649717514125,
1114
+ "grad_norm": 0.9991386532783508,
1115
+ "learning_rate": 6.881092541683278e-05,
1116
+ "loss": 0.3535,
1117
+ "step": 156
1118
+ },
1119
+ {
1120
+ "epoch": 0.47306967984934084,
1121
+ "grad_norm": 1.0915963649749756,
1122
+ "learning_rate": 6.829182770086474e-05,
1123
+ "loss": 0.3682,
1124
+ "step": 157
1125
+ },
1126
+ {
1127
+ "epoch": 0.4760828625235405,
1128
+ "grad_norm": 0.9837580323219299,
1129
+ "learning_rate": 6.777044365084907e-05,
1130
+ "loss": 0.3703,
1131
+ "step": 158
1132
+ },
1133
+ {
1134
+ "epoch": 0.47909604519774013,
1135
+ "grad_norm": 1.258581280708313,
1136
+ "learning_rate": 6.724683843567568e-05,
1137
+ "loss": 0.4104,
1138
+ "step": 159
1139
+ },
1140
+ {
1141
+ "epoch": 0.4821092278719397,
1142
+ "grad_norm": 0.832224428653717,
1143
+ "learning_rate": 6.672107750186255e-05,
1144
+ "loss": 0.2934,
1145
+ "step": 160
1146
+ },
1147
+ {
1148
+ "epoch": 0.48512241054613936,
1149
+ "grad_norm": 0.881106436252594,
1150
+ "learning_rate": 6.619322656537552e-05,
1151
+ "loss": 0.3127,
1152
+ "step": 161
1153
+ },
1154
+ {
1155
+ "epoch": 0.488135593220339,
1156
+ "grad_norm": 1.257350206375122,
1157
+ "learning_rate": 6.566335160341424e-05,
1158
+ "loss": 0.3804,
1159
+ "step": 162
1160
+ },
1161
+ {
1162
+ "epoch": 0.4911487758945386,
1163
+ "grad_norm": 1.6826411485671997,
1164
+ "learning_rate": 6.513151884616556e-05,
1165
+ "loss": 0.4807,
1166
+ "step": 163
1167
+ },
1168
+ {
1169
+ "epoch": 0.49416195856873824,
1170
+ "grad_norm": 1.31766676902771,
1171
+ "learning_rate": 6.459779476852528e-05,
1172
+ "loss": 0.3872,
1173
+ "step": 164
1174
+ },
1175
+ {
1176
+ "epoch": 0.4971751412429379,
1177
+ "grad_norm": 1.438594102859497,
1178
+ "learning_rate": 6.406224608178932e-05,
1179
+ "loss": 0.3868,
1180
+ "step": 165
1181
+ },
1182
+ {
1183
+ "epoch": 0.5001883239171375,
1184
+ "grad_norm": 1.198364496231079,
1185
+ "learning_rate": 6.352493972531534e-05,
1186
+ "loss": 0.3361,
1187
+ "step": 166
1188
+ },
1189
+ {
1190
+ "epoch": 0.5001883239171375,
1191
+ "eval_loss": NaN,
1192
+ "eval_runtime": 93.1419,
1193
+ "eval_samples_per_second": 6.002,
1194
+ "eval_steps_per_second": 1.503,
1195
+ "step": 166
1196
  }
1197
  ],
1198
  "logging_steps": 1,
 
1212
  "attributes": {}
1213
  }
1214
  },
1215
+ "total_flos": 5.46878716765012e+17,
1216
  "train_batch_size": 2,
1217
  "trial_name": null,
1218
  "trial_params": null