DorinSht committed on
Commit
837d036
·
verified ·
1 Parent(s): 4feb707

End of training

Browse files
README.md CHANGED
@@ -15,10 +15,10 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # recreate_llama_68M_vanilla
17
 
18
- This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 1.6571
21
- - Accuracy: 0.6944
22
 
23
  ## Model description
24
 
@@ -37,7 +37,7 @@ More information needed
37
  ### Training hyperparameters
38
 
39
  The following hyperparameters were used during training:
40
- - learning_rate: 0.0005
41
  - train_batch_size: 32
42
  - eval_batch_size: 16
43
  - seed: 42
@@ -49,15 +49,15 @@ The following hyperparameters were used during training:
49
 
50
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
51
  |:-------------:|:------:|:----:|:---------------:|:--------:|
52
- | No log | 0.3125 | 10 | 2.1108 | 0.6368 |
53
- | No log | 0.625 | 20 | 1.8348 | 0.6696 |
54
- | No log | 0.9375 | 30 | 1.7489 | 0.6794 |
55
- | No log | 1.25 | 40 | 1.7315 | 0.6781 |
56
- | No log | 1.5625 | 50 | 1.6956 | 0.6846 |
57
- | No log | 1.875 | 60 | 1.6770 | 0.6873 |
58
- | No log | 2.1875 | 70 | 1.6866 | 0.6903 |
59
- | No log | 2.5 | 80 | 1.6705 | 0.6911 |
60
- | No log | 2.8125 | 90 | 1.6571 | 0.6944 |
61
 
62
 
63
  ### Framework versions
 
15
 
16
  # recreate_llama_68M_vanilla
17
 
18
+ This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on the anon8231489123/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 9.5494
21
+ - Accuracy: 0.3512
22
 
23
  ## Model description
24
 
 
37
  ### Training hyperparameters
38
 
39
  The following hyperparameters were used during training:
40
+ - learning_rate: 0.005
41
  - train_batch_size: 32
42
  - eval_batch_size: 16
43
  - seed: 42
 
49
 
50
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
51
  |:-------------:|:------:|:----:|:---------------:|:--------:|
52
+ | No log | 0.3125 | 10 | 7.9370 | 0.3676 |
53
+ | No log | 0.625 | 20 | 8.6808 | 0.3478 |
54
+ | No log | 0.9375 | 30 | 10.9798 | 0.1029 |
55
+ | No log | 1.25 | 40 | 10.3023 | 0.2493 |
56
+ | No log | 1.5625 | 50 | 9.7688 | 0.3501 |
57
+ | No log | 1.875 | 60 | 9.6190 | 0.3510 |
58
+ | No log | 2.1875 | 70 | 9.5617 | 0.3510 |
59
+ | No log | 2.5 | 80 | 9.5470 | 0.3511 |
60
+ | No log | 2.8125 | 90 | 9.5487 | 0.3511 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.6942843185148998,
4
- "eval_loss": 1.6576473712921143,
5
- "eval_runtime": 2.7252,
6
  "eval_samples": 10,
7
- "eval_samples_per_second": 3.669,
8
- "eval_steps_per_second": 0.367,
9
- "perplexity": 5.246952182142712,
10
  "total_flos": 1601895923712000.0,
11
- "train_loss": 1.7650197347005208,
12
- "train_runtime": 162.5074,
13
  "train_samples": 1000,
14
- "train_samples_per_second": 18.461,
15
- "train_steps_per_second": 0.591
16
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.3512457254518808,
4
+ "eval_loss": 9.549426078796387,
5
+ "eval_runtime": 2.7507,
6
  "eval_samples": 10,
7
+ "eval_samples_per_second": 3.635,
8
+ "eval_steps_per_second": 0.364,
9
+ "perplexity": 14036.636436049854,
10
  "total_flos": 1601895923712000.0,
11
+ "train_loss": 11.46164576212565,
12
+ "train_runtime": 150.2617,
13
  "train_samples": 1000,
14
+ "train_samples_per_second": 19.965,
15
+ "train_steps_per_second": 0.639
16
  }
args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e481c33f0c155084ebf57a6ceaca65e925a3ba4ee05104ef931038f26a2eb0bf
3
  size 5908
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7565f26b6b3335b4c74c77cbb2af2017cb5dcb6983320e6a9a66a11f8e0ac811
3
  size 5908
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.6942843185148998,
4
- "eval_loss": 1.6576473712921143,
5
- "eval_runtime": 2.7252,
6
  "eval_samples": 10,
7
- "eval_samples_per_second": 3.669,
8
- "eval_steps_per_second": 0.367,
9
- "perplexity": 5.246952182142712
10
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.3512457254518808,
4
+ "eval_loss": 9.549426078796387,
5
+ "eval_runtime": 2.7507,
6
  "eval_samples": 10,
7
+ "eval_samples_per_second": 3.635,
8
+ "eval_steps_per_second": 0.364,
9
+ "perplexity": 14036.636436049854
10
  }
events.out.tfevents.1716139017.isl-gpu35.3557179.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98c3477ac61c16a0d1cca6ea301fafb4dc74245b003f3b412dce32f2a0503a48
3
+ size 405
events.out.tfevents.1716139017.isl-gpu35.3557181.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0568c4a11760460bb7c389a5ab134111369723d834d41545005aa2ee1e7a7313
3
+ size 405
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6e148b15d7f63a647f7d322707d37a3ac6f623a58f6691f421eef2c22c92c7a
3
  size 272123144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99e0aa1f57b9d3b412748068c5f9dd3e0251c942e88329be1c63e7a93fe20583
3
  size 272123144
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 1601895923712000.0,
4
- "train_loss": 1.7650197347005208,
5
- "train_runtime": 162.5074,
6
  "train_samples": 1000,
7
- "train_samples_per_second": 18.461,
8
- "train_steps_per_second": 0.591
9
  }
 
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 1601895923712000.0,
4
+ "train_loss": 11.46164576212565,
5
+ "train_runtime": 150.2617,
6
  "train_samples": 1000,
7
+ "train_samples_per_second": 19.965,
8
+ "train_steps_per_second": 0.639
9
  }
trainer_state.json CHANGED
@@ -10,93 +10,93 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.3125,
13
- "eval_accuracy": 0.6367855398143625,
14
- "eval_loss": 2.1108450889587402,
15
- "eval_runtime": 2.8362,
16
- "eval_samples_per_second": 3.526,
17
- "eval_steps_per_second": 0.353,
18
  "step": 10
19
  },
20
  {
21
  "epoch": 0.625,
22
- "eval_accuracy": 0.6696140693698095,
23
- "eval_loss": 1.8348472118377686,
24
- "eval_runtime": 2.9089,
25
- "eval_samples_per_second": 3.438,
26
- "eval_steps_per_second": 0.344,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.9375,
31
- "eval_accuracy": 0.6793844650708354,
32
- "eval_loss": 1.7488861083984375,
33
- "eval_runtime": 2.8499,
34
- "eval_samples_per_second": 3.509,
35
- "eval_steps_per_second": 0.351,
36
  "step": 30
37
  },
38
  {
39
  "epoch": 1.25,
40
- "eval_accuracy": 0.678114313629702,
41
- "eval_loss": 1.7314647436141968,
42
- "eval_runtime": 2.7552,
43
- "eval_samples_per_second": 3.63,
44
- "eval_steps_per_second": 0.363,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 1.5625,
49
- "eval_accuracy": 0.6846116267708843,
50
- "eval_loss": 1.6956342458724976,
51
- "eval_runtime": 2.7968,
52
- "eval_samples_per_second": 3.575,
53
- "eval_steps_per_second": 0.358,
54
  "step": 50
55
  },
56
  {
57
  "epoch": 1.875,
58
- "eval_accuracy": 0.6872984855886664,
59
- "eval_loss": 1.6769542694091797,
60
- "eval_runtime": 2.7447,
61
- "eval_samples_per_second": 3.643,
62
- "eval_steps_per_second": 0.364,
63
  "step": 60
64
  },
65
  {
66
  "epoch": 2.1875,
67
- "eval_accuracy": 0.6902784562774792,
68
- "eval_loss": 1.6866405010223389,
69
- "eval_runtime": 2.7321,
70
- "eval_samples_per_second": 3.66,
71
- "eval_steps_per_second": 0.366,
72
  "step": 70
73
  },
74
  {
75
  "epoch": 2.5,
76
- "eval_accuracy": 0.6911089399120665,
77
- "eval_loss": 1.6704612970352173,
78
- "eval_runtime": 2.8666,
79
- "eval_samples_per_second": 3.488,
80
- "eval_steps_per_second": 0.349,
81
  "step": 80
82
  },
83
  {
84
  "epoch": 2.8125,
85
- "eval_accuracy": 0.6943820224719102,
86
- "eval_loss": 1.6571474075317383,
87
- "eval_runtime": 2.9068,
88
- "eval_samples_per_second": 3.44,
89
- "eval_steps_per_second": 0.344,
90
  "step": 90
91
  },
92
  {
93
  "epoch": 3.0,
94
  "step": 96,
95
  "total_flos": 1601895923712000.0,
96
- "train_loss": 1.7650197347005208,
97
- "train_runtime": 162.5074,
98
- "train_samples_per_second": 18.461,
99
- "train_steps_per_second": 0.591
100
  }
101
  ],
102
  "logging_steps": 100,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.3125,
13
+ "eval_accuracy": 0.367562286272594,
14
+ "eval_loss": 7.9370436668396,
15
+ "eval_runtime": 2.8708,
16
+ "eval_samples_per_second": 3.483,
17
+ "eval_steps_per_second": 0.348,
18
  "step": 10
19
  },
20
  {
21
  "epoch": 0.625,
22
+ "eval_accuracy": 0.34782608695652173,
23
+ "eval_loss": 8.680798530578613,
24
+ "eval_runtime": 2.66,
25
+ "eval_samples_per_second": 3.759,
26
+ "eval_steps_per_second": 0.376,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.9375,
31
+ "eval_accuracy": 0.10293111871030777,
32
+ "eval_loss": 10.979837417602539,
33
+ "eval_runtime": 2.7406,
34
+ "eval_samples_per_second": 3.649,
35
+ "eval_steps_per_second": 0.365,
36
  "step": 30
37
  },
38
  {
39
  "epoch": 1.25,
40
+ "eval_accuracy": 0.24934049829018076,
41
+ "eval_loss": 10.302330017089844,
42
+ "eval_runtime": 2.7648,
43
+ "eval_samples_per_second": 3.617,
44
+ "eval_steps_per_second": 0.362,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 1.5625,
49
+ "eval_accuracy": 0.35012212994626285,
50
+ "eval_loss": 9.768780708312988,
51
+ "eval_runtime": 2.7184,
52
+ "eval_samples_per_second": 3.679,
53
+ "eval_steps_per_second": 0.368,
54
  "step": 50
55
  },
56
  {
57
  "epoch": 1.875,
58
+ "eval_accuracy": 0.35100146555935513,
59
+ "eval_loss": 9.619012832641602,
60
+ "eval_runtime": 2.8506,
61
+ "eval_samples_per_second": 3.508,
62
+ "eval_steps_per_second": 0.351,
63
  "step": 60
64
  },
65
  {
66
  "epoch": 2.1875,
67
+ "eval_accuracy": 0.35095261358085,
68
+ "eval_loss": 9.561655044555664,
69
+ "eval_runtime": 2.7142,
70
+ "eval_samples_per_second": 3.684,
71
+ "eval_steps_per_second": 0.368,
72
  "step": 70
73
  },
74
  {
75
  "epoch": 2.5,
76
+ "eval_accuracy": 0.35109916951636544,
77
+ "eval_loss": 9.547042846679688,
78
+ "eval_runtime": 2.8217,
79
+ "eval_samples_per_second": 3.544,
80
+ "eval_steps_per_second": 0.354,
81
  "step": 80
82
  },
83
  {
84
  "epoch": 2.8125,
85
+ "eval_accuracy": 0.35109916951636544,
86
+ "eval_loss": 9.548730850219727,
87
+ "eval_runtime": 2.742,
88
+ "eval_samples_per_second": 3.647,
89
+ "eval_steps_per_second": 0.365,
90
  "step": 90
91
  },
92
  {
93
  "epoch": 3.0,
94
  "step": 96,
95
  "total_flos": 1601895923712000.0,
96
+ "train_loss": 11.46164576212565,
97
+ "train_runtime": 150.2617,
98
+ "train_samples_per_second": 19.965,
99
+ "train_steps_per_second": 0.639
100
  }
101
  ],
102
  "logging_steps": 100,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec9cb7f17c2d28d6943c3b741d8f0971acf4de82635294d24e7b0d070d306f0a
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7e736d0a0ca3bfb4692ce9f2e011bee553b4b200689ef7c6910f21f466d39b9
3
  size 5112