lesso11 committed (verified)
Commit 1a3d79b · Parent: 876c796

Training in progress, step 100, checkpoint
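
The files below can be pulled at exactly this revision. A minimal sketch using huggingface_hub; the repository id is a placeholder, since it is not shown on this page:

```python
# Minimal sketch: download only the checkpoint files touched by this commit,
# pinned to revision 1a3d79b. REPO_ID is hypothetical; replace it with the
# actual repository this commit belongs to.
from huggingface_hub import snapshot_download

REPO_ID = "your-namespace/your-model"  # placeholder, not shown on this page

local_dir = snapshot_download(
    repo_id=REPO_ID,
    revision="1a3d79b",                 # the commit reviewed here
    allow_patterns=["last-checkpoint/*"],
)
print(local_dir)
```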

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ba2c0cca31f098f614c5e2899a67ea75c9baabf478efe85dde1a62cedbf440dd
+ oid sha256:e014bafd26aa060f607da8eef483b6e7e82be9550547c1d471fd7b3ab0e10947
  size 50503544
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8f5327a9bfdffedd78d6f803093679e8171964a0829b0a80d61c7be379a13991
+ oid sha256:6d8acbf7d94fd8f2b136a5c18026b3b053386588e0323db70d945c591b1f9139
  size 101184122
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:13c5c39f9b6daa1d7f3d7a5d83c285676ff1d110f0ea0172ff09b0c46c9806cc
+ oid sha256:70fe1072043eb74154c0f5a2511bc78205c8ae7abae96348cc35ee1cc6003593
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a89ffc445067fef9d6d02bb3ff9e61d5e3209e6fa67c7259b3b364b90dbaa2cd
+ oid sha256:49d60a69e2379be2053e816cbaff31e6c931b5922dd86c71c9eaf473299cbf62
  size 1064
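
The four binary files above are stored through Git LFS, so the diff only touches their pointer files: the oid sha256 line records the hash of the new blob while the size stays the same. A minimal sketch, assuming the checkpoint has already been downloaded into a local last-checkpoint/ directory, of checking one file against its pointer:

```python
# Minimal sketch: verify a downloaded checkpoint file against the sha256 and
# size recorded in its Git LFS pointer (values taken from this commit).
# The local path is an assumption; adjust it to wherever the repo is cloned.
import hashlib
import os

path = "last-checkpoint/adapter_model.safetensors"
expected_oid = "e014bafd26aa060f607da8eef483b6e7e82be9550547c1d471fd7b3ab0e10947"
expected_size = 50503544

assert os.path.getsize(path) == expected_size, "size mismatch"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("pointer matches the downloaded file")
```
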
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.02313743637204998,
+ "epoch": 0.04627487274409996,
  "eval_steps": 9,
- "global_step": 50,
+ "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -405,6 +405,404 @@
  "learning_rate": 5.868240888334653e-05,
  "loss": 0.0647,
  "step": 50
+ },
+ {
+ "epoch": 0.023600185099490977,
+ "grad_norm": 2.2488627433776855,
+ "learning_rate": 5.695865504800327e-05,
+ "loss": 0.1892,
+ "step": 51
+ },
+ {
+ "epoch": 0.024062933826931976,
+ "grad_norm": 3.109647750854492,
+ "learning_rate": 5.522642316338268e-05,
+ "loss": 0.2971,
+ "step": 52
+ },
+ {
+ "epoch": 0.024525682554372975,
+ "grad_norm": 2.9143030643463135,
+ "learning_rate": 5.348782368720626e-05,
+ "loss": 0.4799,
+ "step": 53
+ },
+ {
+ "epoch": 0.024988431281813973,
+ "grad_norm": 3.5207865238189697,
+ "learning_rate": 5.174497483512506e-05,
+ "loss": 0.232,
+ "step": 54
+ },
+ {
+ "epoch": 0.024988431281813973,
+ "eval_loss": 0.19303588569164276,
+ "eval_runtime": 108.8496,
+ "eval_samples_per_second": 16.72,
+ "eval_steps_per_second": 2.095,
+ "step": 54
+ },
+ {
+ "epoch": 0.025451180009254976,
+ "grad_norm": 2.1500442028045654,
+ "learning_rate": 5e-05,
+ "loss": 0.2456,
+ "step": 55
+ },
+ {
+ "epoch": 0.025913928736695974,
+ "grad_norm": 5.131832122802734,
+ "learning_rate": 4.825502516487497e-05,
+ "loss": 0.6216,
+ "step": 56
+ },
+ {
+ "epoch": 0.026376677464136973,
+ "grad_norm": 1.655394434928894,
+ "learning_rate": 4.6512176312793736e-05,
+ "loss": 0.3206,
+ "step": 57
+ },
+ {
+ "epoch": 0.026839426191577972,
+ "grad_norm": 2.8779218196868896,
+ "learning_rate": 4.477357683661734e-05,
+ "loss": 0.3797,
+ "step": 58
+ },
+ {
+ "epoch": 0.027302174919018974,
+ "grad_norm": 1.5078076124191284,
+ "learning_rate": 4.3041344951996746e-05,
+ "loss": 0.0985,
+ "step": 59
+ },
+ {
+ "epoch": 0.027764923646459973,
+ "grad_norm": 1.3938862085342407,
+ "learning_rate": 4.131759111665349e-05,
+ "loss": 0.0939,
+ "step": 60
+ },
+ {
+ "epoch": 0.02822767237390097,
+ "grad_norm": 3.5782883167266846,
+ "learning_rate": 3.960441545911204e-05,
+ "loss": 0.2676,
+ "step": 61
+ },
+ {
+ "epoch": 0.02869042110134197,
+ "grad_norm": 1.3503000736236572,
+ "learning_rate": 3.790390522001662e-05,
+ "loss": 0.1315,
+ "step": 62
+ },
+ {
+ "epoch": 0.029153169828782972,
+ "grad_norm": 5.648515701293945,
+ "learning_rate": 3.6218132209150045e-05,
+ "loss": 0.1448,
+ "step": 63
+ },
+ {
+ "epoch": 0.029153169828782972,
+ "eval_loss": 0.17772692441940308,
+ "eval_runtime": 109.2089,
+ "eval_samples_per_second": 16.665,
+ "eval_steps_per_second": 2.088,
+ "step": 63
+ },
+ {
+ "epoch": 0.02961591855622397,
+ "grad_norm": 0.6387403011322021,
+ "learning_rate": 3.4549150281252636e-05,
+ "loss": 0.0981,
+ "step": 64
+ },
+ {
+ "epoch": 0.03007866728366497,
+ "grad_norm": 5.184665679931641,
+ "learning_rate": 3.289899283371657e-05,
+ "loss": 0.3126,
+ "step": 65
+ },
+ {
+ "epoch": 0.03054141601110597,
+ "grad_norm": 1.2718273401260376,
+ "learning_rate": 3.12696703292044e-05,
+ "loss": 0.2393,
+ "step": 66
+ },
+ {
+ "epoch": 0.031004164738546967,
+ "grad_norm": 0.9245361685752869,
+ "learning_rate": 2.9663167846209998e-05,
+ "loss": 0.2016,
+ "step": 67
+ },
+ {
+ "epoch": 0.03146691346598797,
+ "grad_norm": 3.493025541305542,
+ "learning_rate": 2.8081442660546125e-05,
+ "loss": 0.21,
+ "step": 68
+ },
+ {
+ "epoch": 0.031929662193428965,
+ "grad_norm": 3.8758113384246826,
+ "learning_rate": 2.6526421860705473e-05,
+ "loss": 0.2169,
+ "step": 69
+ },
+ {
+ "epoch": 0.03239241092086997,
+ "grad_norm": 4.775798797607422,
+ "learning_rate": 2.500000000000001e-05,
+ "loss": 0.3688,
+ "step": 70
+ },
+ {
+ "epoch": 0.03285515964831097,
+ "grad_norm": 1.524680495262146,
+ "learning_rate": 2.350403678833976e-05,
+ "loss": 0.1639,
+ "step": 71
+ },
+ {
+ "epoch": 0.033317908375751965,
+ "grad_norm": 1.8387486934661865,
+ "learning_rate": 2.2040354826462668e-05,
+ "loss": 0.224,
+ "step": 72
+ },
+ {
+ "epoch": 0.033317908375751965,
+ "eval_loss": 0.18191145360469818,
+ "eval_runtime": 108.8432,
+ "eval_samples_per_second": 16.721,
+ "eval_steps_per_second": 2.095,
+ "step": 72
+ },
+ {
+ "epoch": 0.03378065710319297,
+ "grad_norm": 0.8230963945388794,
+ "learning_rate": 2.061073738537635e-05,
+ "loss": 0.1038,
+ "step": 73
+ },
+ {
+ "epoch": 0.03424340583063397,
+ "grad_norm": 4.8260722160339355,
+ "learning_rate": 1.9216926233717085e-05,
+ "loss": 0.2824,
+ "step": 74
+ },
+ {
+ "epoch": 0.034706154558074964,
+ "grad_norm": 2.352128267288208,
+ "learning_rate": 1.7860619515673033e-05,
+ "loss": 0.2273,
+ "step": 75
+ },
+ {
+ "epoch": 0.035168903285515966,
+ "grad_norm": 1.7466846704483032,
+ "learning_rate": 1.6543469682057106e-05,
+ "loss": 0.2123,
+ "step": 76
+ },
+ {
+ "epoch": 0.03563165201295696,
+ "grad_norm": 1.6248657703399658,
+ "learning_rate": 1.526708147705013e-05,
+ "loss": 0.1353,
+ "step": 77
+ },
+ {
+ "epoch": 0.036094400740397964,
+ "grad_norm": 1.3933769464492798,
+ "learning_rate": 1.4033009983067452e-05,
+ "loss": 0.2277,
+ "step": 78
+ },
+ {
+ "epoch": 0.036557149467838966,
+ "grad_norm": 1.7838780879974365,
+ "learning_rate": 1.2842758726130283e-05,
+ "loss": 0.2165,
+ "step": 79
+ },
+ {
+ "epoch": 0.03701989819527996,
+ "grad_norm": 1.778045415878296,
+ "learning_rate": 1.1697777844051105e-05,
+ "loss": 0.1446,
+ "step": 80
+ },
+ {
+ "epoch": 0.037482646922720964,
+ "grad_norm": 2.422689199447632,
+ "learning_rate": 1.0599462319663905e-05,
+ "loss": 0.1707,
+ "step": 81
+ },
+ {
+ "epoch": 0.037482646922720964,
+ "eval_loss": 0.1773979812860489,
+ "eval_runtime": 108.8294,
+ "eval_samples_per_second": 16.723,
+ "eval_steps_per_second": 2.095,
+ "step": 81
+ },
+ {
+ "epoch": 0.03794539565016196,
+ "grad_norm": 2.397000312805176,
+ "learning_rate": 9.549150281252633e-06,
+ "loss": 0.2127,
+ "step": 82
+ },
+ {
+ "epoch": 0.03840814437760296,
+ "grad_norm": 2.0644094944000244,
+ "learning_rate": 8.548121372247918e-06,
+ "loss": 0.2302,
+ "step": 83
+ },
+ {
+ "epoch": 0.03887089310504396,
+ "grad_norm": 1.9861756563186646,
+ "learning_rate": 7.597595192178702e-06,
+ "loss": 0.2531,
+ "step": 84
+ },
+ {
+ "epoch": 0.03933364183248496,
+ "grad_norm": 1.7267298698425293,
+ "learning_rate": 6.698729810778065e-06,
+ "loss": 0.2176,
+ "step": 85
+ },
+ {
+ "epoch": 0.03979639055992596,
+ "grad_norm": 0.7792419791221619,
+ "learning_rate": 5.852620357053651e-06,
+ "loss": 0.0763,
+ "step": 86
+ },
+ {
+ "epoch": 0.04025913928736696,
+ "grad_norm": 1.2674682140350342,
+ "learning_rate": 5.060297685041659e-06,
+ "loss": 0.1173,
+ "step": 87
+ },
+ {
+ "epoch": 0.04072188801480796,
+ "grad_norm": 1.3142675161361694,
+ "learning_rate": 4.322727117869951e-06,
+ "loss": 0.1326,
+ "step": 88
+ },
+ {
+ "epoch": 0.04118463674224896,
+ "grad_norm": 3.0400171279907227,
+ "learning_rate": 3.6408072716606346e-06,
+ "loss": 0.3707,
+ "step": 89
+ },
+ {
+ "epoch": 0.041647385469689956,
+ "grad_norm": 1.8144176006317139,
+ "learning_rate": 3.0153689607045845e-06,
+ "loss": 0.1973,
+ "step": 90
+ },
+ {
+ "epoch": 0.041647385469689956,
+ "eval_loss": 0.1692516952753067,
+ "eval_runtime": 108.9085,
+ "eval_samples_per_second": 16.711,
+ "eval_steps_per_second": 2.094,
+ "step": 90
+ },
+ {
+ "epoch": 0.04211013419713096,
+ "grad_norm": 0.7678631544113159,
+ "learning_rate": 2.4471741852423237e-06,
+ "loss": 0.0723,
+ "step": 91
+ },
+ {
+ "epoch": 0.04257288292457196,
+ "grad_norm": 7.679274559020996,
+ "learning_rate": 1.9369152030840556e-06,
+ "loss": 0.385,
+ "step": 92
+ },
+ {
+ "epoch": 0.043035631652012955,
+ "grad_norm": 1.0919711589813232,
+ "learning_rate": 1.4852136862001764e-06,
+ "loss": 0.2062,
+ "step": 93
+ },
+ {
+ "epoch": 0.04349838037945396,
+ "grad_norm": 0.8858644366264343,
+ "learning_rate": 1.0926199633097157e-06,
+ "loss": 0.0903,
+ "step": 94
+ },
+ {
+ "epoch": 0.04396112910689495,
+ "grad_norm": 1.427178978919983,
+ "learning_rate": 7.596123493895991e-07,
+ "loss": 0.2173,
+ "step": 95
+ },
+ {
+ "epoch": 0.044423877834335955,
+ "grad_norm": 2.144537925720215,
+ "learning_rate": 4.865965629214819e-07,
+ "loss": 0.221,
+ "step": 96
+ },
+ {
+ "epoch": 0.04488662656177696,
+ "grad_norm": 2.2111377716064453,
+ "learning_rate": 2.7390523158633554e-07,
+ "loss": 0.1411,
+ "step": 97
+ },
+ {
+ "epoch": 0.04534937528921795,
+ "grad_norm": 1.809673547744751,
+ "learning_rate": 1.2179748700879012e-07,
+ "loss": 0.1562,
+ "step": 98
+ },
+ {
+ "epoch": 0.045812124016658955,
+ "grad_norm": 1.1327333450317383,
+ "learning_rate": 3.04586490452119e-08,
+ "loss": 0.0806,
+ "step": 99
+ },
+ {
+ "epoch": 0.045812124016658955,
+ "eval_loss": 0.16798387467861176,
+ "eval_runtime": 109.2031,
+ "eval_samples_per_second": 16.666,
+ "eval_steps_per_second": 2.088,
+ "step": 99
+ },
+ {
+ "epoch": 0.04627487274409996,
+ "grad_norm": 2.2550415992736816,
+ "learning_rate": 0.0,
+ "loss": 0.4077,
+ "step": 100
  }
  ],
  "logging_steps": 1,
@@ -419,12 +817,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 9779003334328320.0,
+ "total_flos": 1.9969754177470464e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null