sedrickkeh committed on
Commit
6274054
·
verified ·
1 Parent(s): ac38def

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Meta-Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: stackexchange_codereview
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # stackexchange_codereview
17
 
18
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.7186
21
 
 
4
  base_model: meta-llama/Meta-Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: stackexchange_codereview
 
16
 
17
  # stackexchange_codereview
18
 
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/stackexchange_codereview dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.7186
22
 
all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9774436090225564,
3
+ "eval_loss": 0.7185549139976501,
4
+ "eval_runtime": 70.7881,
5
+ "eval_samples_per_second": 25.273,
6
+ "eval_steps_per_second": 0.396,
7
+ "total_flos": 331447994941440.0,
8
+ "train_loss": 0.7033214352347634,
9
+ "train_runtime": 12042.9211,
10
+ "train_samples_per_second": 8.467,
11
+ "train_steps_per_second": 0.016
12
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9774436090225564,
3
+ "eval_loss": 0.7185549139976501,
4
+ "eval_runtime": 70.7881,
5
+ "eval_samples_per_second": 25.273,
6
+ "eval_steps_per_second": 0.396
7
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9774436090225564,
3
+ "total_flos": 331447994941440.0,
4
+ "train_loss": 0.7033214352347634,
5
+ "train_runtime": 12042.9211,
6
+ "train_samples_per_second": 8.467,
7
+ "train_steps_per_second": 0.016
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9774436090225564,
5
+ "eval_steps": 500,
6
+ "global_step": 198,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.15037593984962405,
13
+ "grad_norm": 10.319588848192522,
14
+ "learning_rate": 5e-06,
15
+ "loss": 0.9599,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.3007518796992481,
20
+ "grad_norm": 1.0338807806874135,
21
+ "learning_rate": 5e-06,
22
+ "loss": 0.8336,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.45112781954887216,
27
+ "grad_norm": 0.6910926553714657,
28
+ "learning_rate": 5e-06,
29
+ "loss": 0.7791,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.6015037593984962,
34
+ "grad_norm": 1.0761644360414062,
35
+ "learning_rate": 5e-06,
36
+ "loss": 0.7583,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.7518796992481203,
41
+ "grad_norm": 0.6947583128003021,
42
+ "learning_rate": 5e-06,
43
+ "loss": 0.7441,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.9022556390977443,
48
+ "grad_norm": 0.6992165317159396,
49
+ "learning_rate": 5e-06,
50
+ "loss": 0.7377,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.9924812030075187,
55
+ "eval_loss": 0.7360510230064392,
56
+ "eval_runtime": 71.1419,
57
+ "eval_samples_per_second": 25.147,
58
+ "eval_steps_per_second": 0.394,
59
+ "step": 66
60
+ },
61
+ {
62
+ "epoch": 1.0526315789473684,
63
+ "grad_norm": 0.8078315357639781,
64
+ "learning_rate": 5e-06,
65
+ "loss": 0.7494,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 1.2030075187969924,
70
+ "grad_norm": 0.7080326935985787,
71
+ "learning_rate": 5e-06,
72
+ "loss": 0.6823,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 1.3533834586466165,
77
+ "grad_norm": 0.9139892615025267,
78
+ "learning_rate": 5e-06,
79
+ "loss": 0.6835,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 1.5037593984962405,
84
+ "grad_norm": 0.7719225630624682,
85
+ "learning_rate": 5e-06,
86
+ "loss": 0.6778,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 1.6541353383458648,
91
+ "grad_norm": 0.690505807116012,
92
+ "learning_rate": 5e-06,
93
+ "loss": 0.6741,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 1.8045112781954886,
98
+ "grad_norm": 0.8408786966441595,
99
+ "learning_rate": 5e-06,
100
+ "loss": 0.6786,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 1.954887218045113,
105
+ "grad_norm": 0.9105349697754412,
106
+ "learning_rate": 5e-06,
107
+ "loss": 0.6668,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 2.0,
112
+ "eval_loss": 0.7167445421218872,
113
+ "eval_runtime": 70.8575,
114
+ "eval_samples_per_second": 25.248,
115
+ "eval_steps_per_second": 0.395,
116
+ "step": 133
117
+ },
118
+ {
119
+ "epoch": 2.1052631578947367,
120
+ "grad_norm": 0.7611289966467718,
121
+ "learning_rate": 5e-06,
122
+ "loss": 0.6702,
123
+ "step": 140
124
+ },
125
+ {
126
+ "epoch": 2.255639097744361,
127
+ "grad_norm": 0.7215253637975946,
128
+ "learning_rate": 5e-06,
129
+ "loss": 0.6221,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 2.406015037593985,
134
+ "grad_norm": 0.7012758019701782,
135
+ "learning_rate": 5e-06,
136
+ "loss": 0.6227,
137
+ "step": 160
138
+ },
139
+ {
140
+ "epoch": 2.556390977443609,
141
+ "grad_norm": 0.8277361299197685,
142
+ "learning_rate": 5e-06,
143
+ "loss": 0.625,
144
+ "step": 170
145
+ },
146
+ {
147
+ "epoch": 2.706766917293233,
148
+ "grad_norm": 0.6173084956333096,
149
+ "learning_rate": 5e-06,
150
+ "loss": 0.6189,
151
+ "step": 180
152
+ },
153
+ {
154
+ "epoch": 2.857142857142857,
155
+ "grad_norm": 0.7220685374712632,
156
+ "learning_rate": 5e-06,
157
+ "loss": 0.6284,
158
+ "step": 190
159
+ },
160
+ {
161
+ "epoch": 2.9774436090225564,
162
+ "eval_loss": 0.7185549139976501,
163
+ "eval_runtime": 70.2334,
164
+ "eval_samples_per_second": 25.472,
165
+ "eval_steps_per_second": 0.399,
166
+ "step": 198
167
+ },
168
+ {
169
+ "epoch": 2.9774436090225564,
170
+ "step": 198,
171
+ "total_flos": 331447994941440.0,
172
+ "train_loss": 0.7033214352347634,
173
+ "train_runtime": 12042.9211,
174
+ "train_samples_per_second": 8.467,
175
+ "train_steps_per_second": 0.016
176
+ }
177
+ ],
178
+ "logging_steps": 10,
179
+ "max_steps": 198,
180
+ "num_input_tokens_seen": 0,
181
+ "num_train_epochs": 3,
182
+ "save_steps": 500,
183
+ "stateful_callbacks": {
184
+ "TrainerControl": {
185
+ "args": {
186
+ "should_epoch_stop": false,
187
+ "should_evaluate": false,
188
+ "should_log": false,
189
+ "should_save": true,
190
+ "should_training_stop": true
191
+ },
192
+ "attributes": {}
193
+ }
194
+ },
195
+ "total_flos": 331447994941440.0,
196
+ "train_batch_size": 8,
197
+ "trial_name": null,
198
+ "trial_params": null
199
+ }
training_eval_loss.png ADDED
training_loss.png ADDED