liujqian committed on
Commit
9a4c48e
·
1 Parent(s): 8a5f250

Upload 5 files

Browse files
config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2-medium",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 1024,
16
+ "n_head": 16,
17
+ "n_inner": null,
18
+ "n_layer": 24,
19
+ "n_positions": 1024,
20
+ "n_special": 0,
21
+ "predict_special_tokens": true,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "task_specific_params": {
32
+ "text-generation": {
33
+ "do_sample": true,
34
+ "max_length": 50
35
+ }
36
+ },
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.26.1",
39
+ "use_cache": true,
40
+ "vocab_size": 50257
41
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.26.1"
6
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08d130260118893300dedaaad6a9e4fb238d261f45fa067d1debb018c69e86d2
3
+ size 1444569373
training-log.txt ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Downloading (…)lve/main/config.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 718/718 [00:00<00:00, 180kB/s]
2
+ Downloading (…)olve/main/vocab.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1.04M/1.04M [00:00<00:00, 1.34MB/s]
3
+ Downloading (…)olve/main/merges.txt: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 456k/456k [00:01<00:00, 283kB/s]
4
+ Downloading (…)/main/tokenizer.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1.36M/1.36M [00:03<00:00, 386kB/s]
5
+ Downloading (…)"pytorch_model.bin";: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1.52G/1.52G [06:17<00:00, 4.03MB/s]
6
+ Downloading (…)neration_config.json: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 124/124 [00:00<00:00, 24.8kB/s]
7
+ Found cached dataset common_gen (C:/Users/Jingqian/.cache/huggingface/datasets/common_gen/default/2020.5.30/1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23)
8
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 3/3 [00:00<00:00, 749.61it/s]
9
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 68/68 [00:01<00:00, 65.32ba/s]
10
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [00:00<00:00, 84.73ba/s]
11
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 133.33ba/s]
12
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 68/68 [00:03<00:00, 22.30ba/s]
13
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [00:00<00:00, 25.37ba/s]
14
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 76.81ba/s]
15
+ C:\Users\Jingqian\anaconda3\lib\site-packages\transformers\optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
16
+ warnings.warn(
17
+ ***** Running training *****
18
+ Num examples = 4592
19
+ Num Epochs = 5
20
+ Instantaneous batch size per device = 8
21
+ Total train batch size (w. parallel, distributed & accumulation) = 8
22
+ Gradient Accumulation steps = 1
23
+ Total optimization steps = 2870
24
+ Number of trainable parameters = 354823168
25
+ 17%|β–ˆβ–‹ | 500/2870 [03:25<16:23, 2.41it/s]{'loss': 2.4535, 'learning_rate': 4.128919860627178e-05, 'epoch': 0.87}
26
+ 20%|β–ˆβ–ˆ | 574/2870 [03:55<15:39, 2.44it/s]***** Running Evaluation *****
27
+ Num examples = 297
28
+ Batch size = 8
29
+
30
+ 0%| | 0/38 [00:00<?, ?it/s]
31
+ 5%|β–Œ | 2/38 [00:00<00:02, 17.24it/s]
32
+ 11%|β–ˆ | 4/38 [00:00<00:03, 10.30it/s]
33
+ 16%|β–ˆβ–Œ | 6/38 [00:00<00:03, 9.32it/s]
34
+ 21%|β–ˆβ–ˆ | 8/38 [00:00<00:03, 8.95it/s]
35
+ 24%|β–ˆβ–ˆβ–Ž | 9/38 [00:00<00:03, 8.82it/s]
36
+ 26%|β–ˆβ–ˆβ–‹ | 10/38 [00:01<00:03, 8.74it/s]
37
+ 29%|β–ˆβ–ˆβ–‰ | 11/38 [00:01<00:03, 8.61it/s]
38
+ 32%|β–ˆβ–ˆβ–ˆβ– | 12/38 [00:01<00:03, 8.52it/s]
39
+ 34%|β–ˆβ–ˆβ–ˆβ– | 13/38 [00:01<00:02, 8.41it/s]
40
+ 37%|β–ˆβ–ˆβ–ˆβ–‹ | 14/38 [00:01<00:02, 8.45it/s]
41
+ 39%|β–ˆβ–ˆβ–ˆβ–‰ | 15/38 [00:01<00:02, 8.33it/s]
42
+ 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 16/38 [00:01<00:02, 8.27it/s]
43
+ 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 17/38 [00:01<00:02, 8.31it/s]
44
+ 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 18/38 [00:02<00:02, 8.32it/s]
45
+ 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 19/38 [00:02<00:02, 8.28it/s]
46
+ 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 20/38 [00:02<00:02, 8.26it/s]
47
+ 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 21/38 [00:02<00:02, 8.24it/s]
48
+ 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/38 [00:02<00:01, 8.17it/s]
49
+ 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 23/38 [00:02<00:01, 8.24it/s]
50
+ 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 24/38 [00:02<00:01, 8.18it/s]
51
+ 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 25/38 [00:02<00:01, 8.19it/s]
52
+ 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 26/38 [00:03<00:01, 8.17it/s]
53
+ 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 27/38 [00:03<00:01, 8.24it/s]
54
+ 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 28/38 [00:03<00:01, 8.25it/s]
55
+ 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 29/38 [00:03<00:01, 8.29it/s]
56
+ 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 30/38 [00:03<00:00, 8.24it/s]
57
+ 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31/38 [00:03<00:00, 8.31it/s]
58
+ 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 32/38 [00:03<00:00, 8.38it/s]
59
+ 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 33/38 [00:03<00:00, 8.28it/s]
60
+ 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 34/38 [00:03<00:00, 8.24it/s]
61
+ 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 35/38 [00:04<00:00, 8.29it/s]
62
+ 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 36/38 [00:04<00:00, 8.34it/s]
63
+
64
+ 20%|β–ˆβ–ˆ | 574/2870 [04:00<15:39, 2.44it/s]
65
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 38/38 [00:04<00:00, 8.26it/s]
66
+ Saving model checkpoint to gpt2-medium-finetuned-commongen\checkpoint-574
67
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-574\config.json
68
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-574\generation_config.json
69
+ {'eval_loss': 2.1204423904418945, 'eval_runtime': 4.495, 'eval_samples_per_second': 66.073, 'eval_steps_per_second': 8.454, 'epoch': 1.0}
70
+ Model weights saved in gpt2-medium-finetuned-commongen\checkpoint-574\pytorch_model.bin
71
+ 35%|β–ˆβ–ˆβ–ˆβ– | 1000/2870 [06:59<12:28, 2.50it/s]{'loss': 2.1332, 'learning_rate': 3.2578397212543556e-05, 'epoch': 1.74}
72
+ 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 1148/2870 [07:58<11:29, 2.50it/s]***** Running Evaluation *****
73
+ Num examples = 297
74
+ Batch size = 8
75
+
76
+ 0%| | 0/38 [00:00<?, ?it/s]
77
+ 5%|β–Œ | 2/38 [00:00<00:02, 15.27it/s]
78
+ 11%|β–ˆ | 4/38 [00:00<00:03, 9.98it/s]
79
+ 16%|β–ˆβ–Œ | 6/38 [00:00<00:03, 9.21it/s]
80
+ 18%|β–ˆβ–Š | 7/38 [00:00<00:03, 9.04it/s]
81
+ 21%|β–ˆβ–ˆ | 8/38 [00:00<00:03, 8.76it/s]
82
+ 24%|β–ˆβ–ˆβ–Ž | 9/38 [00:00<00:03, 8.64it/s]
83
+ 26%|β–ˆβ–ˆβ–‹ | 10/38 [00:01<00:03, 8.37it/s]
84
+ 29%|β–ˆβ–ˆβ–‰ | 11/38 [00:01<00:03, 8.44it/s]
85
+ 32%|β–ˆβ–ˆβ–ˆβ– | 12/38 [00:01<00:03, 8.21it/s]
86
+ 34%|β–ˆβ–ˆβ–ˆβ– | 13/38 [00:01<00:03, 8.33it/s]
87
+ 37%|β–ˆβ–ˆβ–ˆβ–‹ | 14/38 [00:01<00:02, 8.15it/s]
88
+ 39%|β–ˆβ–ˆβ–ˆβ–‰ | 15/38 [00:01<00:02, 8.28it/s]
89
+ 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 16/38 [00:01<00:02, 8.06it/s]
90
+ 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 17/38 [00:01<00:02, 8.22it/s]
91
+ 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 18/38 [00:02<00:02, 8.11it/s]
92
+ 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 19/38 [00:02<00:02, 8.20it/s]
93
+ 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 20/38 [00:02<00:02, 8.08it/s]
94
+ 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 21/38 [00:02<00:02, 8.25it/s]
95
+ 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/38 [00:02<00:01, 8.02it/s]
96
+ 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 23/38 [00:02<00:01, 8.19it/s]
97
+ 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 24/38 [00:02<00:01, 8.16it/s]
98
+ 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 25/38 [00:02<00:01, 8.17it/s]
99
+ 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 26/38 [00:03<00:01, 8.00it/s]
100
+ 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 27/38 [00:03<00:01, 8.16it/s]
101
+ 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 28/38 [00:03<00:01, 8.13it/s]
102
+ 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 29/38 [00:03<00:01, 8.17it/s]
103
+ 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 30/38 [00:03<00:00, 8.00it/s]
104
+ 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31/38 [00:03<00:00, 8.16it/s]
105
+ 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 32/38 [00:03<00:00, 8.01it/s]
106
+ 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 33/38 [00:03<00:00, 8.21it/s]
107
+ 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 34/38 [00:04<00:00, 8.14it/s]
108
+ 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 35/38 [00:04<00:00, 8.14it/s]
109
+ 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 36/38 [00:04<00:00, 8.00it/s]
110
+
111
+ 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 1148/2870 [08:03<11:29, 2.50it/s]
112
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 38/38 [00:04<00:00, 8.18it/s]
113
+ Saving model checkpoint to gpt2-medium-finetuned-commongen\checkpoint-1148
114
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-1148\config.json
115
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-1148\generation_config.json
116
+ {'eval_loss': 2.131284475326538, 'eval_runtime': 4.5627, 'eval_samples_per_second': 65.093, 'eval_steps_per_second': 8.328, 'epoch': 2.0}
117
+ Model weights saved in gpt2-medium-finetuned-commongen\checkpoint-1148\pytorch_model.bin
118
+ 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1500/2870 [10:29<09:09, 2.49it/s]{'loss': 1.9859, 'learning_rate': 2.3867595818815333e-05, 'epoch': 2.61}
119
+ 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1722/2870 [11:58<07:39, 2.50it/s]***** Running Evaluation *****
120
+ Num examples = 297
121
+ Batch size = 8
122
+
123
+ 0%| | 0/38 [00:00<?, ?it/s]
124
+ 5%|β–Œ | 2/38 [00:00<00:02, 15.26it/s]
125
+ 11%|β–ˆ | 4/38 [00:00<00:03, 10.01it/s]
126
+ 16%|β–ˆβ–Œ | 6/38 [00:00<00:03, 9.30it/s]
127
+ 18%|β–ˆβ–Š | 7/38 [00:00<00:03, 9.04it/s]
128
+ 21%|β–ˆβ–ˆ | 8/38 [00:00<00:03, 8.65it/s]
129
+ 24%|β–ˆβ–ˆβ–Ž | 9/38 [00:00<00:03, 8.64it/s]
130
+ 26%|β–ˆβ–ˆβ–‹ | 10/38 [00:01<00:03, 8.33it/s]
131
+ 29%|β–ˆβ–ˆβ–‰ | 11/38 [00:01<00:03, 8.39it/s]
132
+ 32%|β–ˆβ–ˆβ–ˆβ– | 12/38 [00:01<00:03, 8.18it/s]
133
+ 34%|β–ˆβ–ˆβ–ˆβ– | 13/38 [00:01<00:03, 8.30it/s]
134
+ 37%|β–ˆβ–ˆβ–ˆβ–‹ | 14/38 [00:01<00:02, 8.19it/s]
135
+ 39%|β–ˆβ–ˆβ–ˆβ–‰ | 15/38 [00:01<00:02, 8.31it/s]
136
+ 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 16/38 [00:01<00:02, 8.20it/s]
137
+ 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 17/38 [00:01<00:02, 8.26it/s]
138
+ 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 18/38 [00:02<00:02, 8.14it/s]
139
+ 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 19/38 [00:02<00:02, 8.27it/s]
140
+ 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 20/38 [00:02<00:02, 8.09it/s]
141
+ 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 21/38 [00:02<00:02, 8.22it/s]
142
+ 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/38 [00:02<00:01, 8.05it/s]
143
+ 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 23/38 [00:02<00:01, 8.19it/s]
144
+ 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 24/38 [00:02<00:01, 8.06it/s]
145
+ 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 25/38 [00:02<00:01, 8.20it/s]
146
+ 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 26/38 [00:03<00:01, 8.04it/s]
147
+ 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 27/38 [00:03<00:01, 8.20it/s]
148
+ 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 28/38 [00:03<00:01, 8.16it/s]
149
+ 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 29/38 [00:03<00:01, 8.15it/s]
150
+ 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 30/38 [00:03<00:00, 8.01it/s]
151
+ 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31/38 [00:03<00:00, 8.19it/s]
152
+ 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 32/38 [00:03<00:00, 8.03it/s]
153
+ 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 33/38 [00:03<00:00, 8.18it/s]
154
+ 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 34/38 [00:04<00:00, 8.01it/s]
155
+ 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 35/38 [00:04<00:00, 8.18it/s]
156
+ 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 36/38 [00:04<00:00, 8.03it/s]
157
+
158
+ {'eval_loss': 2.1727805137634277, 'eval_runtime': 4.5585, 'eval_samples_per_second': 65.152, 'eval_steps_per_second': 8.336, 'epoch': 3.0}
159
+ 60%|██████ | 1722/2870 [12:03<07:39, 2.50it/s]
160
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 38/38 [00:04<00:00, 8.18it/s]
161
+ Saving model checkpoint to gpt2-medium-finetuned-commongen\checkpoint-1722
162
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-1722\config.json
163
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-1722\generation_config.json
164
+ Model weights saved in gpt2-medium-finetuned-commongen\checkpoint-1722\pytorch_model.bin
165
+ 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 2000/2870 [13:59<05:47, 2.51it/s]{'loss': 1.885, 'learning_rate': 1.5156794425087109e-05, 'epoch': 3.48}
166
+ 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2296/2870 [15:57<03:48, 2.51it/s]***** Running Evaluation *****
167
+ Num examples = 297
168
+ Batch size = 8
169
+
170
+ 0%| | 0/38 [00:00<?, ?it/s]
171
+ 5%|β–Œ | 2/38 [00:00<00:02, 14.39it/s]
172
+ 11%|β–ˆ | 4/38 [00:00<00:03, 10.02it/s]
173
+ 16%|β–ˆβ–Œ | 6/38 [00:00<00:03, 9.21it/s]
174
+ 18%|β–ˆβ–Š | 7/38 [00:00<00:03, 9.04it/s]
175
+ 21%|β–ˆβ–ˆ | 8/38 [00:00<00:03, 8.78it/s]
176
+ 24%|β–ˆβ–ˆβ–Ž | 9/38 [00:00<00:03, 8.62it/s]
177
+ 26%|β–ˆβ–ˆβ–‹ | 10/38 [00:01<00:03, 8.37it/s]
178
+ 29%|β–ˆβ–ˆβ–‰ | 11/38 [00:01<00:03, 8.42it/s]
179
+ 32%|β–ˆβ–ˆβ–ˆβ– | 12/38 [00:01<00:03, 8.20it/s]
180
+ 34%|β–ˆβ–ˆβ–ˆβ– | 13/38 [00:01<00:02, 8.34it/s]
181
+ 37%|β–ˆβ–ˆβ–ˆβ–‹ | 14/38 [00:01<00:02, 8.20it/s]
182
+ 39%|β–ˆβ–ˆβ–ˆβ–‰ | 15/38 [00:01<00:02, 8.26it/s]
183
+ 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 16/38 [00:01<00:02, 8.06it/s]
184
+ 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 17/38 [00:01<00:02, 8.22it/s]
185
+ 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 18/38 [00:02<00:02, 8.11it/s]
186
+ 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 19/38 [00:02<00:02, 8.26it/s]
187
+ 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 20/38 [00:02<00:02, 8.14it/s]
188
+ 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 21/38 [00:02<00:02, 8.28it/s]
189
+ 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/38 [00:02<00:01, 8.15it/s]
190
+ 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 23/38 [00:02<00:01, 8.29it/s]
191
+ 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 24/38 [00:02<00:01, 8.16it/s]
192
+ 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 25/38 [00:02<00:01, 8.31it/s]
193
+ 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 26/38 [00:03<00:01, 8.10it/s]
194
+ 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 27/38 [00:03<00:01, 8.27it/s]
195
+ 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 28/38 [00:03<00:01, 8.07it/s]
196
+ 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 29/38 [00:03<00:01, 8.22it/s]
197
+ 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 30/38 [00:03<00:00, 8.04it/s]
198
+ 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31/38 [00:03<00:00, 8.20it/s]
199
+ 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 32/38 [00:03<00:00, 8.04it/s]
200
+ 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 33/38 [00:03<00:00, 8.21it/s]
201
+ 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 34/38 [00:04<00:00, 8.03it/s]
202
+ 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 35/38 [00:04<00:00, 8.20it/s]
203
+ 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 36/38 [00:04<00:00, 8.04it/s]
204
+
205
+ {'eval_loss': 2.2184627056121826, 'eval_runtime': 4.5511, 'eval_samples_per_second': 65.259, 'eval_steps_per_second': 8.35, 'epoch': 4.0}
206
+ 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2296/2870 [16:01<03:48, 2.51it/s]
207
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 38/38 [00:04<00:00, 8.20it/s]
208
+ Saving model checkpoint to gpt2-medium-finetuned-commongen\checkpoint-2296
209
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-2296\config.json
210
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-2296\generation_config.json
211
+ Model weights saved in gpt2-medium-finetuned-commongen\checkpoint-2296\pytorch_model.bin
212
+ 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2500/2870 [17:28<02:28, 2.50it/s]{'loss': 1.8157, 'learning_rate': 6.445993031358885e-06, 'epoch': 4.36}
213
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2870/2870 [19:56<00:00, 2.49it/s]***** Running Evaluation *****
214
+ Num examples = 297
215
+ Batch size = 8
216
+
217
+ 0%| | 0/38 [00:00<?, ?it/s]
218
+ 5%|β–Œ | 2/38 [00:00<00:02, 14.49it/s]
219
+ 11%|β–ˆ | 4/38 [00:00<00:03, 10.04it/s]
220
+ 16%|β–ˆβ–Œ | 6/38 [00:00<00:03, 9.32it/s]
221
+ 18%|β–ˆβ–Š | 7/38 [00:00<00:03, 9.08it/s]
222
+ 21%|β–ˆβ–ˆ | 8/38 [00:00<00:03, 8.63it/s]
223
+ 24%|β–ˆβ–ˆβ–Ž | 9/38 [00:00<00:03, 8.63it/s]
224
+ 26%|β–ˆβ–ˆβ–‹ | 10/38 [00:01<00:03, 8.33it/s]
225
+ 29%|β–ˆβ–ˆβ–‰ | 11/38 [00:01<00:03, 8.41it/s]
226
+ 32%|β–ˆβ–ˆβ–ˆβ– | 12/38 [00:01<00:03, 8.19it/s]
227
+ 34%|β–ˆβ–ˆβ–ˆβ– | 13/38 [00:01<00:03, 8.31it/s]
228
+ 37%|β–ˆβ–ˆβ–ˆβ–‹ | 14/38 [00:01<00:02, 8.12it/s]
229
+ 39%|β–ˆβ–ˆβ–ˆβ–‰ | 15/38 [00:01<00:02, 8.26it/s]
230
+ 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 16/38 [00:01<00:02, 8.08it/s]
231
+ 45%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 17/38 [00:01<00:02, 8.24it/s]
232
+ 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 18/38 [00:02<00:02, 8.07it/s]
233
+ 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 19/38 [00:02<00:02, 8.22it/s]
234
+ 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 20/38 [00:02<00:02, 8.04it/s]
235
+ 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 21/38 [00:02<00:02, 8.20it/s]
236
+ 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/38 [00:02<00:01, 8.02it/s]
237
+ 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 23/38 [00:02<00:01, 8.19it/s]
238
+ 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 24/38 [00:02<00:01, 8.17it/s]
239
+ 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 25/38 [00:02<00:01, 8.18it/s]
240
+ 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 26/38 [00:03<00:01, 8.03it/s]
241
+ 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 27/38 [00:03<00:01, 8.18it/s]
242
+ 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 28/38 [00:03<00:01, 8.03it/s]
243
+ 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 29/38 [00:03<00:01, 8.20it/s]
244
+ 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 30/38 [00:03<00:00, 8.18it/s]
245
+ 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31/38 [00:03<00:00, 8.18it/s]
246
+ 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 32/38 [00:03<00:00, 8.01it/s]
247
+ 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 33/38 [00:03<00:00, 8.16it/s]
248
+ 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 34/38 [00:04<00:00, 8.02it/s]
249
+ 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 35/38 [00:04<00:00, 8.19it/s]
250
+ 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 36/38 [00:04<00:00, 8.03it/s]
251
+
252
+ {'eval_loss': 2.2336463928222656, 'eval_runtime': 4.5651, 'eval_samples_per_second': 65.059, 'eval_steps_per_second': 8.324, 'epoch': 5.0}
253
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2870/2870 [20:01<00:00, 2.49it/s]
254
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 38/38 [00:04<00:00, 8.20it/s]
255
+ Saving model checkpoint to gpt2-medium-finetuned-commongen\checkpoint-2870
256
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-2870\config.json
257
+ Configuration saved in gpt2-medium-finetuned-commongen\checkpoint-2870\generation_config.json
258
+ Model weights saved in gpt2-medium-finetuned-commongen\checkpoint-2870\pytorch_model.bin
259
+
260
+
261
+ Training completed. Do not forget to share your model on huggingface.co/models =)
262
+
263
+
264
+ Loading best model from gpt2-medium-finetuned-commongen\checkpoint-574 (score: 2.1204423904418945).
265
+ {'train_runtime': 1208.3557, 'train_samples_per_second': 19.001, 'train_steps_per_second': 2.375, 'train_loss': 2.0178993145347888, 'epoch': 5.0}
266
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2870/2870 [20:08<00:00, 2.38it/s]
267
+ Saving model checkpoint to gpt2-medium-finetuned-commongen
268
+ Configuration saved in gpt2-medium-finetuned-commongen\config.json
269
+ Configuration saved in gpt2-medium-finetuned-commongen\generation_config.json
270
+ Model weights saved in gpt2-medium-finetuned-commongen\pytorch_model.bin
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b137bf1edc916528945b557b930e16ce5b38fdb15382f6071cfe065758708add
3
+ size 3579