End of training
Browse files
- README.md +49 -36
- config.json +2 -2
- generation_config.json +7 -0
- model.safetensors +2 -2
- training_args.bin +3 -0
- training_log.json +1 -0
README.md
CHANGED
@@ -1,37 +1,50 @@
|
|
1 |
---
|
2 |
-
|
3 |
-
-
|
4 |
-
|
5 |
-
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
tags:
|
3 |
+
- trl
|
4 |
+
- sft
|
5 |
+
- generated_from_trainer
|
6 |
+
model-index:
|
7 |
+
- name: ytu_doktor_gpt2-medium
|
8 |
+
results: []
|
9 |
+
---
|
10 |
+
|
11 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
12 |
+
should probably proofread and complete it, then remove this comment. -->
|
13 |
+
|
14 |
+
# ytu_doktor_gpt2-medium
|
15 |
+
|
16 |
+
This model was trained from scratch on an unknown dataset.
|
17 |
+
|
18 |
+
## Model description
|
19 |
+
|
20 |
+
More information needed
|
21 |
+
|
22 |
+
## Intended uses & limitations
|
23 |
+
|
24 |
+
More information needed
|
25 |
+
|
26 |
+
## Training and evaluation data
|
27 |
+
|
28 |
+
More information needed
|
29 |
+
|
30 |
+
## Training procedure
|
31 |
+
|
32 |
+
### Training hyperparameters
|
33 |
+
|
34 |
+
The following hyperparameters were used during training:
|
35 |
+
- learning_rate: 5e-05
|
36 |
+
- train_batch_size: 8
|
37 |
+
- eval_batch_size: 8
|
38 |
+
- seed: 42
|
39 |
+
- gradient_accumulation_steps: 8
|
40 |
+
- total_train_batch_size: 64
|
41 |
+
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
42 |
+
- lr_scheduler_type: linear
|
43 |
+
- num_epochs: 20
|
44 |
+
|
45 |
+
### Framework versions
|
46 |
+
|
47 |
+
- Transformers 4.41.2
|
48 |
+
- PyTorch 2.3.0+cu121
|
49 |
+
- Datasets 2.20.0
|
50 |
+
- Tokenizers 0.19.1
|
config.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "/content/drive/My Drive/yl_tez/
|
3 |
"activation_function": "gelu_new",
|
4 |
"architectures": [
|
5 |
-
"
|
6 |
],
|
7 |
"attn_pdrop": 0.1,
|
8 |
"bos_token_id": 0,
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "/content/drive/My Drive/yl_tez/bestmodel",
|
3 |
"activation_function": "gelu_new",
|
4 |
"architectures": [
|
5 |
+
"GPT2LMHeadModel"
|
6 |
],
|
7 |
"attn_pdrop": 0.1,
|
8 |
"bos_token_id": 0,
|
generation_config.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bos_token_id": 0,
|
4 |
+
"eos_token_id": 0,
|
5 |
+
"pad_token_id": 0,
|
6 |
+
"transformers_version": "4.41.2"
|
7 |
+
}
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1558e2be8468593763031a129198c7023a5d6585ed9fe4a01e51b7a3a7c7296b
|
3 |
+
size 1419343360
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:62a0373ff6ae33846adc2849067e799c88acfb2c620f90cf5843be5adab64829
|
3 |
+
size 5368
|
training_log.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[{"epoch": 0.7631403224267862, "grad_norm": 1.257724642753601, "learning_rate": 4.809160305343512e-05, "loss": 2.8522, "step": 2000}, {"epoch": 0.7631403224267862, "eval_loss": 2.5629115104675293, "eval_runtime": 295.1424, "eval_samples_per_second": 56.834, "eval_steps_per_second": 7.105, "step": 2000}, {"epoch": 1.5262806448535724, "grad_norm": 1.275610089302063, "learning_rate": 4.618320610687023e-05, "loss": 2.62, "step": 4000}, {"epoch": 1.5262806448535724, "eval_loss": 2.397400379180908, "eval_runtime": 294.9588, "eval_samples_per_second": 56.869, "eval_steps_per_second": 7.109, "step": 4000}, {"epoch": 2.2894209672803587, "grad_norm": 1.3073314428329468, "learning_rate": 4.4274809160305345e-05, "loss": 2.505, "step": 6000}, {"epoch": 2.2894209672803587, "eval_loss": 2.2722864151000977, "eval_runtime": 295.0007, "eval_samples_per_second": 56.861, "eval_steps_per_second": 7.108, "step": 6000}, {"epoch": 3.052561289707145, "grad_norm": 1.3305290937423706, "learning_rate": 4.236641221374046e-05, "loss": 2.4148, "step": 8000}, {"epoch": 3.052561289707145, "eval_loss": 2.160352945327759, "eval_runtime": 294.7538, "eval_samples_per_second": 56.909, "eval_steps_per_second": 7.114, "step": 8000}, {"epoch": 3.815701612133931, "grad_norm": 1.262954831123352, "learning_rate": 4.0458015267175576e-05, "loss": 2.3052, "step": 10000}, {"epoch": 3.815701612133931, "eval_loss": 2.0768959522247314, "eval_runtime": 294.7672, "eval_samples_per_second": 56.906, "eval_steps_per_second": 7.114, "step": 10000}, {"epoch": 4.579700467423447, "grad_norm": 1.2735399007797241, "learning_rate": 3.854961832061069e-05, "loss": 2.2227, "step": 12000}, {"epoch": 4.579700467423447, "eval_loss": 1.9794929027557373, "eval_runtime": 294.5056, "eval_samples_per_second": 56.956, "eval_steps_per_second": 7.12, "step": 12000}, {"epoch": 5.342840789850234, "grad_norm": 1.4666306972503662, "learning_rate": 3.66412213740458e-05, "loss": 2.1621, "step": 14000}, {"epoch": 5.342840789850234, "eval_loss": 
1.8896831274032593, "eval_runtime": 294.4865, "eval_samples_per_second": 56.96, "eval_steps_per_second": 7.121, "step": 14000}, {"epoch": 6.10598111227702, "grad_norm": 1.38246750831604, "learning_rate": 3.473282442748092e-05, "loss": 2.1021, "step": 16000}, {"epoch": 6.10598111227702, "eval_loss": 1.806349515914917, "eval_runtime": 294.3881, "eval_samples_per_second": 56.979, "eval_steps_per_second": 7.123, "step": 16000}, {"epoch": 6.869121434703806, "grad_norm": 1.3253750801086426, "learning_rate": 3.282442748091603e-05, "loss": 2.0297, "step": 18000}, {"epoch": 6.869121434703806, "eval_loss": 1.743876576423645, "eval_runtime": 294.3768, "eval_samples_per_second": 56.981, "eval_steps_per_second": 7.124, "step": 18000}, {"epoch": 7.632261757130593, "grad_norm": 1.5019088983535767, "learning_rate": 3.091603053435115e-05, "loss": 1.9588, "step": 20000}, {"epoch": 7.632261757130593, "eval_loss": 1.67059326171875, "eval_runtime": 294.351, "eval_samples_per_second": 56.986, "eval_steps_per_second": 7.124, "step": 20000}, {"epoch": 8.395402079557378, "grad_norm": 1.5486341714859009, "learning_rate": 2.900763358778626e-05, "loss": 1.9108, "step": 22000}, {"epoch": 8.395402079557378, "eval_loss": 1.6049710512161255, "eval_runtime": 294.1072, "eval_samples_per_second": 57.034, "eval_steps_per_second": 7.13, "step": 22000}, {"epoch": 9.158542401984166, "grad_norm": 1.5598807334899902, "learning_rate": 2.7099236641221375e-05, "loss": 1.8736, "step": 24000}, {"epoch": 9.158542401984166, "eval_loss": 1.5380265712738037, "eval_runtime": 294.3331, "eval_samples_per_second": 56.99, "eval_steps_per_second": 7.125, "step": 24000}, {"epoch": 9.921682724410951, "grad_norm": 1.614626169204712, "learning_rate": 2.5190839694656487e-05, "loss": 1.8195, "step": 26000}, {"epoch": 9.921682724410951, "eval_loss": 1.4939802885055542, "eval_runtime": 294.2619, "eval_samples_per_second": 57.004, "eval_steps_per_second": 7.126, "step": 26000}, {"epoch": 10.684823046837737, "grad_norm": 
1.627020001411438, "learning_rate": 2.3282442748091605e-05, "loss": 1.7592, "step": 28000}, {"epoch": 10.684823046837737, "eval_loss": 1.43486487865448, "eval_runtime": 294.3221, "eval_samples_per_second": 56.992, "eval_steps_per_second": 7.125, "step": 28000}, {"epoch": 11.447963369264524, "grad_norm": 1.5647703409194946, "learning_rate": 2.1374045801526718e-05, "loss": 1.7219, "step": 30000}, {"epoch": 11.447963369264524, "eval_loss": 1.3888773918151855, "eval_runtime": 294.6832, "eval_samples_per_second": 56.922, "eval_steps_per_second": 7.116, "step": 30000}, {"epoch": 12.21110369169131, "grad_norm": 1.6487740278244019, "learning_rate": 1.9465648854961833e-05, "loss": 1.6906, "step": 32000}, {"epoch": 12.21110369169131, "eval_loss": 1.3379969596862793, "eval_runtime": 294.5375, "eval_samples_per_second": 56.95, "eval_steps_per_second": 7.12, "step": 32000}, {"epoch": 12.974244014118096, "grad_norm": 1.5964055061340332, "learning_rate": 1.7557251908396945e-05, "loss": 1.6591, "step": 34000}, {"epoch": 12.974244014118096, "eval_loss": 1.307100534439087, "eval_runtime": 294.488, "eval_samples_per_second": 56.96, "eval_steps_per_second": 7.121, "step": 34000}, {"epoch": 13.739959935133072, "grad_norm": 1.768680214881897, "learning_rate": 1.5648854961832064e-05, "loss": 1.6052, "step": 36000}, {"epoch": 13.739959935133072, "eval_loss": 1.2658464908599854, "eval_runtime": 294.9507, "eval_samples_per_second": 56.871, "eval_steps_per_second": 7.11, "step": 36000}, {"epoch": 14.50310025755986, "grad_norm": 1.681711196899414, "learning_rate": 1.3740458015267178e-05, "loss": 1.5787, "step": 38000}, {"epoch": 14.50310025755986, "eval_loss": 1.233357548713684, "eval_runtime": 294.9872, "eval_samples_per_second": 56.863, "eval_steps_per_second": 7.109, "step": 38000}, {"epoch": 15.266240579986645, "grad_norm": 1.7337956428527832, "learning_rate": 1.1832061068702292e-05, "loss": 1.5556, "step": 40000}, {"epoch": 15.266240579986645, "eval_loss": 1.1987721920013428, 
"eval_runtime": 294.7218, "eval_samples_per_second": 56.915, "eval_steps_per_second": 7.115, "step": 40000}, {"loss": 1.5353, "grad_norm": 1.7468492984771729, "learning_rate": 9.923664122137405e-06, "epoch": 16.03023943527616, "step": 42000}, {"eval_loss": 1.169547438621521, "eval_runtime": 295.0094, "eval_samples_per_second": 56.859, "eval_steps_per_second": 7.108, "epoch": 16.03023943527616, "step": 42000}, {"loss": 1.4952, "grad_norm": 1.7336703538894653, "learning_rate": 8.015267175572519e-06, "epoch": 16.793379757702947, "step": 44000}, {"eval_loss": 1.1509828567504883, "eval_runtime": 295.2883, "eval_samples_per_second": 56.806, "eval_steps_per_second": 7.102, "epoch": 16.793379757702947, "step": 44000}, {"loss": 1.479, "grad_norm": 1.8032091856002808, "learning_rate": 6.106870229007634e-06, "epoch": 17.556520080129733, "step": 46000}, {"eval_loss": 1.1298787593841553, "eval_runtime": 295.287, "eval_samples_per_second": 56.806, "eval_steps_per_second": 7.102, "epoch": 17.556520080129733, "step": 46000}, {"loss": 1.4641, "grad_norm": 1.8275638818740845, "learning_rate": 4.198473282442748e-06, "epoch": 18.31966040255652, "step": 48000}, {"eval_loss": 1.113271713256836, "eval_runtime": 295.0087, "eval_samples_per_second": 56.859, "eval_steps_per_second": 7.108, "epoch": 18.31966040255652, "step": 48000}, {"loss": 1.4478, "grad_norm": 1.71540367603302, "learning_rate": 2.2900763358778625e-06, "epoch": 19.082800724983308, "step": 50000}, {"eval_loss": 1.1037347316741943, "eval_runtime": 295.0761, "eval_samples_per_second": 56.846, "eval_steps_per_second": 7.107, "epoch": 19.082800724983308, "step": 50000}, {"loss": 1.4305, "grad_norm": 1.7618632316589355, "learning_rate": 3.816793893129771e-07, "epoch": 19.845941047410093, "step": 52000}, {"eval_loss": 1.0987213850021362, "eval_runtime": 295.367, "eval_samples_per_second": 56.79, "eval_steps_per_second": 7.1, "epoch": 19.845941047410093, "step": 52000}, {"train_runtime": 43957.4906, "train_samples_per_second": 
76.316, "train_steps_per_second": 1.192, "total_flos": 1.8627399959765975e+18, "train_loss": 0.34880192647453484, "epoch": 19.99856911189545, "step": 52400}]
|