beamaia committed on
Commit
f98097a
1 Parent(s): fbd8ef1

Training in progress, step 100, checkpoint

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-100/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-100/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-100/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
checkpoint-100/optimizer_0/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a74ba8fcf2d857e573ced9e8ccd472ece612ef1ca47c4379e8bbc05bf43f4fa8
3
+ size 2108254
checkpoint-100/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:279aeb412860ec2411b607ca1d224edd0a200f94e95a9639539bf22fd73bce77
3
+ size 13256787644
checkpoint-100/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc6d1fb26db4d101efafcaa7e7bcfde898dd2f243ac8061e688295e25d7d7adf
3
+ size 13257964260
checkpoint-100/pytorch_model_fsdp_0/.metadata ADDED
Binary file (734 kB). View file
 
checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:428599b87893bf2cc5bdcc4df3e7518cb37f0349bd5eeeba36aac64acce595b7
3
+ size 6628321920
checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af2bc1196dddd4042dcd616bc03c9fbf4c68315c342234e1b9b5126135d7b70b
3
+ size 6628321920
checkpoint-100/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d54c5084f33c509ac45b338b11639cfe7bc84d2e04580e41f2e890393612f49
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9933a1be37579869b948857aa6f00c9819fe4f66ee68e2a29ef74a9f27e2737
3
  size 14512
checkpoint-100/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a8085e6062dc7d5d06565ca1225c048c0553237aea788b0c04267e7be7d323f
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb2b9f73f6c726d541e2b15cf5cb6c566ea0512dd15613e1c2878f1d4825318
3
  size 14512
checkpoint-100/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": 0.6374139189720154,
3
- "best_model_checkpoint": "./llama3/28-08-24-Weni-Pipeline_test_Experiment with SFT and Llama3 70b-2_max_steps-1362_batch_16_2024-08-28/checkpoint-100",
4
- "epoch": 0.2202036884117809,
5
  "eval_steps": 100,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
@@ -9,98 +9,98 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02202036884117809,
13
- "grad_norm": 0.6965782642364502,
14
  "learning_rate": 7.5e-05,
15
- "loss": 2.1201,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.04404073768235618,
20
- "grad_norm": 0.7504029870033264,
21
  "learning_rate": 0.00015,
22
- "loss": 0.9392,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.06606110652353427,
27
- "grad_norm": 0.7269854545593262,
28
  "learning_rate": 0.000225,
29
- "loss": 0.7958,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.08808147536471236,
34
- "grad_norm": 0.15891791880130768,
35
  "learning_rate": 0.0003,
36
- "loss": 0.7251,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.11010184420589045,
41
- "grad_norm": 0.14764881134033203,
42
  "learning_rate": 0.00029995764763563235,
43
- "loss": 0.6941,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.13212221304706853,
48
- "grad_norm": 0.11882930248975754,
49
  "learning_rate": 0.00029983061445883305,
50
- "loss": 0.673,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.15414258188824662,
55
- "grad_norm": 0.15152081847190857,
56
  "learning_rate": 0.0002996189722050073,
57
- "loss": 0.6428,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.17616295072942473,
62
- "grad_norm": 0.1619480848312378,
63
  "learning_rate": 0.0002993228403881531,
64
- "loss": 0.6465,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.1981833195706028,
69
- "grad_norm": 0.15019242465496063,
70
  "learning_rate": 0.00029894238623337174,
71
- "loss": 0.6308,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.2202036884117809,
76
- "grad_norm": 0.14553773403167725,
77
  "learning_rate": 0.00029847782458243663,
78
- "loss": 0.6314,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.2202036884117809,
83
  "eval_accuracy": 1.0,
84
  "eval_f1": 1.0,
85
  "eval_f1_macro": 1.0,
86
  "eval_f1_micro": 1.0,
87
- "eval_loss": 0.6374139189720154,
88
  "eval_precision": 1.0,
89
  "eval_precision_macro": 1.0,
90
  "eval_precision_micro": 1.0,
91
  "eval_recall": 1.0,
92
  "eval_recall_macro": 1.0,
93
  "eval_recall_micro": 1.0,
94
- "eval_runtime": 1957.1676,
95
- "eval_samples_per_second": 0.413,
96
- "eval_steps_per_second": 0.206,
97
  "step": 100
98
  }
99
  ],
100
  "logging_steps": 10,
101
  "max_steps": 1362,
102
  "num_input_tokens_seen": 0,
103
- "num_train_epochs": 3,
104
  "save_steps": 100,
105
  "stateful_callbacks": {
106
  "TrainerControl": {
@@ -114,7 +114,7 @@
114
  "attributes": {}
115
  }
116
  },
117
- "total_flos": 4711855325184.0,
118
  "train_batch_size": 2,
119
  "trial_name": null,
120
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5742923021316528,
3
+ "best_model_checkpoint": "./llama3/30-08-24-Weni-Pipeline_test_Experiment with SFT and Llama3 70b-2_max_steps-1362_batch_8_2024-08-30/checkpoint-100",
4
+ "epoch": 0.44004400440044006,
5
  "eval_steps": 100,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.04400440044004401,
13
+ "grad_norm": 0.5568628907203674,
14
  "learning_rate": 7.5e-05,
15
+ "loss": 2.0875,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.08800880088008801,
20
+ "grad_norm": 0.2537558972835541,
21
  "learning_rate": 0.00015,
22
+ "loss": 0.9378,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.132013201320132,
27
+ "grad_norm": 0.24558919668197632,
28
  "learning_rate": 0.000225,
29
+ "loss": 0.7,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.17601760176017603,
34
+ "grad_norm": 0.13937097787857056,
35
  "learning_rate": 0.0003,
36
+ "loss": 0.6298,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.22002200220022003,
41
+ "grad_norm": 0.1871194988489151,
42
  "learning_rate": 0.00029995764763563235,
43
+ "loss": 0.6321,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.264026402640264,
48
+ "grad_norm": 0.14626263082027435,
49
  "learning_rate": 0.00029983061445883305,
50
+ "loss": 0.6403,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.30803080308030806,
55
+ "grad_norm": 0.12049665302038193,
56
  "learning_rate": 0.0002996189722050073,
57
+ "loss": 0.5998,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.35203520352035206,
62
+ "grad_norm": 0.13617923855781555,
63
  "learning_rate": 0.0002993228403881531,
64
+ "loss": 0.5942,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.39603960396039606,
69
+ "grad_norm": 0.1271793246269226,
70
  "learning_rate": 0.00029894238623337174,
71
+ "loss": 0.5647,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.44004400440044006,
76
+ "grad_norm": 0.18757876753807068,
77
  "learning_rate": 0.00029847782458243663,
78
+ "loss": 0.5619,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.44004400440044006,
83
  "eval_accuracy": 1.0,
84
  "eval_f1": 1.0,
85
  "eval_f1_macro": 1.0,
86
  "eval_f1_micro": 1.0,
87
+ "eval_loss": 0.5742923021316528,
88
  "eval_precision": 1.0,
89
  "eval_precision_macro": 1.0,
90
  "eval_precision_micro": 1.0,
91
  "eval_recall": 1.0,
92
  "eval_recall_macro": 1.0,
93
  "eval_recall_micro": 1.0,
94
+ "eval_runtime": 90.5857,
95
+ "eval_samples_per_second": 4.46,
96
+ "eval_steps_per_second": 1.115,
97
  "step": 100
98
  }
99
  ],
100
  "logging_steps": 10,
101
  "max_steps": 1362,
102
  "num_input_tokens_seen": 0,
103
+ "num_train_epochs": 6,
104
  "save_steps": 100,
105
  "stateful_callbacks": {
106
  "TrainerControl": {
 
114
  "attributes": {}
115
  }
116
  },
117
+ "total_flos": 1.3925298981778227e+17,
118
  "train_batch_size": 2,
119
  "trial_name": null,
120
  "trial_params": null