XeTute committed on
Commit 4e5b716 · verified · 1 Parent(s): 3127633

Upload 17 files

all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 12.0,
+   "num_input_tokens_seen": 620593344,
+   "total_flos": 3.16711124803584e+17,
+   "train_loss": 0.07982336108915004,
+   "train_runtime": 2874.0318,
+   "train_samples_per_second": 212.069,
+   "train_steps_per_second": 212.069
+ }
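
A quick sanity check on these numbers (a hypothetical sketch, not one of the uploaded files): dividing num_input_tokens_seen by train_runtime gives the average token throughput, which lands close to the ~2.2e5 tokens/s figures logged near the end of running_log.txt further down this commit. The file path below is an assumption.

# Hypothetical check against all_results.json; assumes the file sits in the working directory.
import json

with open("all_results.json") as f:
    results = json.load(f)

tokens_per_sec = results["num_input_tokens_seen"] / results["train_runtime"]
print(f"{tokens_per_sec:,.0f} tokens/s")  # ~215,900, in line with the final throughput in running_log.txt
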
config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "_name_or_path": "XeTute/Phantasor_V0.2-137M",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.2",
+   "use_cache": false,
+   "vocab_size": 50257
+ }
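
The config above is a stock 12-layer, 768-dimensional GPT-2 (GPT2LMHeadModel) with a 1,024-token context and the 50,257-entry GPT-2 vocabulary. A minimal loading sketch follows, assuming these files end up in a Hub repository; the repo id is copied from _name_or_path and may need to be swapped for the actual upload target.

# Minimal sketch: load the model described by config.json and sample with its task_specific_params.
# The repo id is an assumption taken from "_name_or_path"; adjust it to the repository these files live in.
from transformers import AutoTokenizer, GPT2LMHeadModel

repo = "XeTute/Phantasor_V0.2-137M"
tokenizer = AutoTokenizer.from_pretrained(repo)
model = GPT2LMHeadModel.from_pretrained(repo)  # n_layer=12, n_embd=768, n_ctx=1024

inputs = tokenizer("Once upon a time", return_tensors="pt")
out = model.generate(**inputs, do_sample=True, max_length=50)  # mirrors task_specific_params["text-generation"]
print(tokenizer.decode(out[0], skip_special_tokens=True))
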
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 50256,
+   "eos_token_id": 50256,
+   "transformers_version": "4.48.2"
+ }
llamaboard_config.yaml ADDED
@@ -0,0 +1,80 @@
+ top.booster: auto
+ top.checkpoint_path: null
+ top.finetuning_type: full
+ top.model_name: GPT-2-Small
+ top.quantization_bit: none
+ top.quantization_method: bitsandbytes
+ top.rope_scaling: none
+ top.template: alpaca
+ train.additional_target: ''
+ train.apollo_rank: 16
+ train.apollo_scale: 32
+ train.apollo_target: all
+ train.apollo_update_interval: 200
+ train.badam_mode: layer
+ train.badam_switch_interval: 50
+ train.badam_switch_mode: ascending
+ train.badam_update_ratio: 0.05
+ train.batch_size: 1
+ train.compute_type: bf16
+ train.create_new_adapter: false
+ train.cutoff_len: 1024
+ train.dataset:
+ - XeTute/SStory-Gen-EN_ZH
+ - MatanP/emotion_mapped_story_dataset
+ - webnovel
+ - jaydenccc/AI_Storyteller_Dataset
+ train.dataset_dir: data
+ train.ds_offload: false
+ train.ds_stage: none
+ train.extra_args: '{"optim": "sgd"}'
+ train.freeze_extra_modules: ''
+ train.freeze_trainable_layers: 2
+ train.freeze_trainable_modules: all
+ train.galore_rank: 16
+ train.galore_scale: 2
+ train.galore_target: all
+ train.galore_update_interval: 200
+ train.gradient_accumulation_steps: 1
+ train.learning_rate: 1e-6
+ train.logging_steps: 100
+ train.lora_alpha: 16
+ train.lora_dropout: 0
+ train.lora_rank: 8
+ train.lora_target: ''
+ train.loraplus_lr_ratio: 0
+ train.lr_scheduler_type: cosine
+ train.mask_history: false
+ train.max_grad_norm: '1.0'
+ train.max_samples: '1000000000'
+ train.neat_packing: false
+ train.neftune_alpha: 0
+ train.num_train_epochs: '12.0'
+ train.packing: false
+ train.ppo_score_norm: false
+ train.ppo_whiten_rewards: false
+ train.pref_beta: 0.1
+ train.pref_ftx: 0
+ train.pref_loss: sigmoid
+ train.report_to:
+ - none
+ train.resize_vocab: false
+ train.reward_model: []
+ train.save_steps: 5000
+ train.swanlab_api_key: ''
+ train.swanlab_mode: cloud
+ train.swanlab_project: llamafactory
+ train.swanlab_run_name: ''
+ train.swanlab_workspace: ''
+ train.train_on_prompt: false
+ train.training_stage: Supervised Fine-Tuning
+ train.use_apollo: false
+ train.use_badam: false
+ train.use_dora: false
+ train.use_galore: false
+ train.use_llama_pro: false
+ train.use_pissa: false
+ train.use_rslora: false
+ train.use_swanlab: false
+ train.val_size: 0
+ train.warmup_steps: 10
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:002933714d638e62f3a7c2796c1075d9aaf99119481284246c9012400fc8eee7
+ size 497774208
running_log.txt ADDED
@@ -0,0 +1,803 @@
1
+ [INFO|2025-02-11 17:41:48] configuration_utils.py:696 >> loading configuration file config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\config.json
2
+
3
+ [INFO|2025-02-11 17:41:48] configuration_utils.py:768 >> Model config GPT2Config {
4
+ "_name_or_path": "XeTute/Phantasor_V0.2-137M",
5
+ "activation_function": "gelu_new",
6
+ "architectures": [
7
+ "GPT2LMHeadModel"
8
+ ],
9
+ "attn_pdrop": 0.1,
10
+ "bos_token_id": 50256,
11
+ "embd_pdrop": 0.1,
12
+ "eos_token_id": 50256,
13
+ "initializer_range": 0.02,
14
+ "layer_norm_epsilon": 1e-05,
15
+ "model_type": "gpt2",
16
+ "n_ctx": 1024,
17
+ "n_embd": 768,
18
+ "n_head": 12,
19
+ "n_inner": null,
20
+ "n_layer": 12,
21
+ "n_positions": 1024,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "task_specific_params": {
32
+ "text-generation": {
33
+ "do_sample": true,
34
+ "max_length": 50
35
+ }
36
+ },
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.48.2",
39
+ "use_cache": false,
40
+ "vocab_size": 50257
41
+ }
42
+
43
+
44
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file vocab.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\vocab.json
45
+
46
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file merges.txt from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\merges.txt
47
+
48
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer.json
49
+
50
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
51
+
52
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\special_tokens_map.json
53
+
54
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer_config.json
55
+
56
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
57
+
58
+ [INFO|2025-02-11 17:41:50] configuration_utils.py:696 >> loading configuration file config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\config.json
59
+
60
+ [INFO|2025-02-11 17:41:50] configuration_utils.py:768 >> Model config GPT2Config {
61
+ "_name_or_path": "XeTute/Phantasor_V0.2-137M",
62
+ "activation_function": "gelu_new",
63
+ "architectures": [
64
+ "GPT2LMHeadModel"
65
+ ],
66
+ "attn_pdrop": 0.1,
67
+ "bos_token_id": 50256,
68
+ "embd_pdrop": 0.1,
69
+ "eos_token_id": 50256,
70
+ "initializer_range": 0.02,
71
+ "layer_norm_epsilon": 1e-05,
72
+ "model_type": "gpt2",
73
+ "n_ctx": 1024,
74
+ "n_embd": 768,
75
+ "n_head": 12,
76
+ "n_inner": null,
77
+ "n_layer": 12,
78
+ "n_positions": 1024,
79
+ "reorder_and_upcast_attn": false,
80
+ "resid_pdrop": 0.1,
81
+ "scale_attn_by_inverse_layer_idx": false,
82
+ "scale_attn_weights": true,
83
+ "summary_activation": null,
84
+ "summary_first_dropout": 0.1,
85
+ "summary_proj_to_labels": true,
86
+ "summary_type": "cls_index",
87
+ "summary_use_proj": true,
88
+ "task_specific_params": {
89
+ "text-generation": {
90
+ "do_sample": true,
91
+ "max_length": 50
92
+ }
93
+ },
94
+ "torch_dtype": "float32",
95
+ "transformers_version": "4.48.2",
96
+ "use_cache": false,
97
+ "vocab_size": 50257
98
+ }
99
+
100
+
101
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file vocab.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\vocab.json
102
+
103
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file merges.txt from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\merges.txt
104
+
105
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer.json
106
+
107
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
108
+
109
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\special_tokens_map.json
110
+
111
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer_config.json
112
+
113
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
114
+
115
+ [INFO|2025-02-11 17:41:50] logging.py:157 >> Loading dataset XeTute/SStory-Gen-EN_ZH...
116
+
117
+ [INFO|2025-02-11 17:41:59] logging.py:157 >> Loading dataset MatanP/emotion_mapped_story_dataset...
118
+
119
+ [INFO|2025-02-11 17:42:03] logging.py:157 >> Loading dataset zxbsmk/webnovel_cn...
120
+
121
+ [INFO|2025-02-11 17:42:10] logging.py:157 >> Loading dataset jaydenccc/AI_Storyteller_Dataset...
122
+
123
+ [INFO|2025-02-11 17:42:13] configuration_utils.py:696 >> loading configuration file config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\config.json
124
+
125
+ [INFO|2025-02-11 17:42:13] configuration_utils.py:768 >> Model config GPT2Config {
126
+ "_name_or_path": "XeTute/Phantasor_V0.2-137M",
127
+ "activation_function": "gelu_new",
128
+ "architectures": [
129
+ "GPT2LMHeadModel"
130
+ ],
131
+ "attn_pdrop": 0.1,
132
+ "bos_token_id": 50256,
133
+ "embd_pdrop": 0.1,
134
+ "eos_token_id": 50256,
135
+ "initializer_range": 0.02,
136
+ "layer_norm_epsilon": 1e-05,
137
+ "model_type": "gpt2",
138
+ "n_ctx": 1024,
139
+ "n_embd": 768,
140
+ "n_head": 12,
141
+ "n_inner": null,
142
+ "n_layer": 12,
143
+ "n_positions": 1024,
144
+ "reorder_and_upcast_attn": false,
145
+ "resid_pdrop": 0.1,
146
+ "scale_attn_by_inverse_layer_idx": false,
147
+ "scale_attn_weights": true,
148
+ "summary_activation": null,
149
+ "summary_first_dropout": 0.1,
150
+ "summary_proj_to_labels": true,
151
+ "summary_type": "cls_index",
152
+ "summary_use_proj": true,
153
+ "task_specific_params": {
154
+ "text-generation": {
155
+ "do_sample": true,
156
+ "max_length": 50
157
+ }
158
+ },
159
+ "torch_dtype": "float32",
160
+ "transformers_version": "4.48.2",
161
+ "use_cache": false,
162
+ "vocab_size": 50257
163
+ }
164
+
165
+
166
+ [INFO|2025-02-11 17:42:13] modeling_utils.py:3904 >> loading weights file model.safetensors from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\model.safetensors
167
+
168
+ [INFO|2025-02-11 17:42:13] modeling_utils.py:1582 >> Instantiating GPT2LMHeadModel model under default dtype torch.bfloat16.
169
+
170
+ [INFO|2025-02-11 17:42:13] configuration_utils.py:1140 >> Generate config GenerationConfig {
171
+ "bos_token_id": 50256,
172
+ "eos_token_id": 50256,
173
+ "use_cache": false
174
+ }
175
+
176
+
177
+ [INFO|2025-02-11 17:42:14] modeling_utils.py:4888 >> All model checkpoint weights were used when initializing GPT2LMHeadModel.
178
+
179
+
180
+ [INFO|2025-02-11 17:42:14] modeling_utils.py:4896 >> All the weights of GPT2LMHeadModel were initialized from the model checkpoint at XeTute/Phantasor_V0.2-137M.
181
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.
182
+
183
+ [INFO|2025-02-11 17:42:14] configuration_utils.py:1095 >> loading configuration file generation_config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\generation_config.json
184
+
185
+ [INFO|2025-02-11 17:42:14] configuration_utils.py:1140 >> Generate config GenerationConfig {
186
+ "bos_token_id": 50256,
187
+ "eos_token_id": 50256
188
+ }
189
+
190
+
191
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> Gradient checkpointing enabled.
192
+
193
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> Using torch SDPA for faster training and inference.
194
+
195
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> Upcasting trainable params to float32.
196
+
197
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> Fine-tuning method: Full
198
+
199
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> trainable params: 124,439,808 || all params: 124,439,808 || trainable%: 100.0000
200
+
201
+ [INFO|2025-02-11 17:42:14] trainer.py:741 >> Using auto half precision backend
202
+
203
+ [INFO|2025-02-11 17:42:14] trainer.py:2775 >> Loading model from saves\GPT-2-Small\full\10-02-2025\checkpoint-585000.
204
+
205
+ [WARNING|2025-02-11 17:42:14] trainer.py:3018 >> There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
206
+
207
+ [INFO|2025-02-11 17:42:14] trainer.py:2369 >> ***** Running training *****
208
+
209
+ [INFO|2025-02-11 17:42:14] trainer.py:2370 >> Num examples = 50,791
210
+
211
+ [INFO|2025-02-11 17:42:14] trainer.py:2371 >> Num Epochs = 12
212
+
213
+ [INFO|2025-02-11 17:42:14] trainer.py:2372 >> Instantaneous batch size per device = 1
214
+
215
+ [INFO|2025-02-11 17:42:14] trainer.py:2375 >> Total train batch size (w. parallel, distributed & accumulation) = 1
216
+
217
+ [INFO|2025-02-11 17:42:14] trainer.py:2376 >> Gradient Accumulation steps = 1
218
+
219
+ [INFO|2025-02-11 17:42:14] trainer.py:2377 >> Total optimization steps = 609,492
220
+
221
+ [INFO|2025-02-11 17:42:14] trainer.py:2378 >> Number of trainable parameters = 124,439,808
222
+
223
+ [INFO|2025-02-11 17:42:14] trainer.py:2400 >> Continuing training from checkpoint, will skip to saved global_step
224
+
225
+ [INFO|2025-02-11 17:42:14] trainer.py:2401 >> Continuing training from epoch 11
226
+
227
+ [INFO|2025-02-11 17:42:14] trainer.py:2402 >> Continuing training from global step 585000
228
+
229
+ [INFO|2025-02-11 17:42:14] trainer.py:2404 >> Will skip the first 11 epochs then the first 26299 batches in the first epoch.
230
+
231
+ [INFO|2025-02-11 17:42:26] logging.py:157 >> {'loss': 1.9812, 'learning_rate': 3.9468e-09, 'epoch': 11.52, 'throughput': 50003805.20}
232
+
233
+ [INFO|2025-02-11 17:42:38] logging.py:157 >> {'loss': 2.0162, 'learning_rate': 3.9145e-09, 'epoch': 11.52, 'throughput': 25202148.23}
234
+
235
+ [INFO|2025-02-11 17:42:50] logging.py:157 >> {'loss': 1.9694, 'learning_rate': 3.8824e-09, 'epoch': 11.52, 'throughput': 16752405.16}
236
+
237
+ [INFO|2025-02-11 17:43:02] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 3.8504e-09, 'epoch': 11.53, 'throughput': 12548387.20}
238
+
239
+ [INFO|2025-02-11 17:43:14] logging.py:157 >> {'loss': 1.9634, 'learning_rate': 3.8185e-09, 'epoch': 11.53, 'throughput': 10008058.95}
240
+
241
+ [INFO|2025-02-11 17:43:26] logging.py:157 >> {'loss': 2.0249, 'learning_rate': 3.7868e-09, 'epoch': 11.53, 'throughput': 8363752.91}
242
+
243
+ [INFO|2025-02-11 17:43:38] logging.py:157 >> {'loss': 1.9759, 'learning_rate': 3.7552e-09, 'epoch': 11.53, 'throughput': 7170473.74}
244
+
245
+ [INFO|2025-02-11 17:43:49] logging.py:157 >> {'loss': 2.0019, 'learning_rate': 3.7238e-09, 'epoch': 11.53, 'throughput': 6291290.77}
246
+
247
+ [INFO|2025-02-11 17:44:01] logging.py:157 >> {'loss': 1.9800, 'learning_rate': 3.6924e-09, 'epoch': 11.54, 'throughput': 5603383.58}
248
+
249
+ [INFO|2025-02-11 17:44:12] logging.py:157 >> {'loss': 1.9993, 'learning_rate': 3.6612e-09, 'epoch': 11.54, 'throughput': 5058860.90}
250
+
251
+ [INFO|2025-02-11 17:44:24] logging.py:157 >> {'loss': 1.9438, 'learning_rate': 3.6302e-09, 'epoch': 11.54, 'throughput': 4612411.90}
252
+
253
+ [INFO|2025-02-11 17:44:35] logging.py:157 >> {'loss': 2.0000, 'learning_rate': 3.5992e-09, 'epoch': 11.54, 'throughput': 4237182.77}
254
+
255
+ [INFO|2025-02-11 17:44:47] logging.py:157 >> {'loss': 2.0493, 'learning_rate': 3.5684e-09, 'epoch': 11.54, 'throughput': 3919609.73}
256
+
257
+ [INFO|2025-02-11 17:44:58] logging.py:157 >> {'loss': 1.9846, 'learning_rate': 3.5378e-09, 'epoch': 11.55, 'throughput': 3644083.96}
258
+
259
+ [INFO|2025-02-11 17:45:10] logging.py:157 >> {'loss': 1.9597, 'learning_rate': 3.5072e-09, 'epoch': 11.55, 'throughput': 3405057.81}
260
+
261
+ [INFO|2025-02-11 17:45:21] logging.py:157 >> {'loss': 2.0429, 'learning_rate': 3.4768e-09, 'epoch': 11.55, 'throughput': 3196451.63}
262
+
263
+ [INFO|2025-02-11 17:45:33] logging.py:157 >> {'loss': 1.9481, 'learning_rate': 3.4465e-09, 'epoch': 11.55, 'throughput': 3011092.35}
264
+
265
+ [INFO|2025-02-11 17:45:45] logging.py:157 >> {'loss': 1.9765, 'learning_rate': 3.4164e-09, 'epoch': 11.55, 'throughput': 2844004.86}
266
+
267
+ [INFO|2025-02-11 17:45:56] logging.py:157 >> {'loss': 2.0500, 'learning_rate': 3.3864e-09, 'epoch': 11.56, 'throughput': 2695596.64}
268
+
269
+ [INFO|2025-02-11 17:46:08] logging.py:157 >> {'loss': 2.0157, 'learning_rate': 3.3565e-09, 'epoch': 11.56, 'throughput': 2563562.80}
270
+
271
+ [INFO|2025-02-11 17:46:19] logging.py:157 >> {'loss': 2.0172, 'learning_rate': 3.3268e-09, 'epoch': 11.56, 'throughput': 2443655.33}
272
+
273
+ [INFO|2025-02-11 17:46:31] logging.py:157 >> {'loss': 1.9731, 'learning_rate': 3.2971e-09, 'epoch': 11.56, 'throughput': 2330850.00}
274
+
275
+ [INFO|2025-02-11 17:46:43] logging.py:157 >> {'loss': 2.0210, 'learning_rate': 3.2677e-09, 'epoch': 11.56, 'throughput': 2228412.35}
276
+
277
+ [INFO|2025-02-11 17:46:55] logging.py:157 >> {'loss': 1.9729, 'learning_rate': 3.2383e-09, 'epoch': 11.57, 'throughput': 2135025.81}
278
+
279
+ [INFO|2025-02-11 17:47:06] logging.py:157 >> {'loss': 2.0153, 'learning_rate': 3.2091e-09, 'epoch': 11.57, 'throughput': 2049017.68}
280
+
281
+ [INFO|2025-02-11 17:47:18] logging.py:157 >> {'loss': 1.9696, 'learning_rate': 3.1800e-09, 'epoch': 11.57, 'throughput': 1968648.47}
282
+
283
+ [INFO|2025-02-11 17:47:30] logging.py:157 >> {'loss': 1.9419, 'learning_rate': 3.1511e-09, 'epoch': 11.57, 'throughput': 1895060.01}
284
+
285
+ [INFO|2025-02-11 17:47:42] logging.py:157 >> {'loss': 2.0142, 'learning_rate': 3.1222e-09, 'epoch': 11.57, 'throughput': 1827059.79}
286
+
287
+ [INFO|2025-02-11 17:47:54] logging.py:157 >> {'loss': 2.0028, 'learning_rate': 3.0935e-09, 'epoch': 11.57, 'throughput': 1763616.14}
288
+
289
+ [INFO|2025-02-11 17:48:06] logging.py:157 >> {'loss': 1.9800, 'learning_rate': 3.0650e-09, 'epoch': 11.58, 'throughput': 1704997.68}
290
+
291
+ [INFO|2025-02-11 17:48:17] logging.py:157 >> {'loss': 2.0216, 'learning_rate': 3.0366e-09, 'epoch': 11.58, 'throughput': 1650129.04}
292
+
293
+ [INFO|2025-02-11 17:48:29] logging.py:157 >> {'loss': 1.9755, 'learning_rate': 3.0083e-09, 'epoch': 11.58, 'throughput': 1598132.07}
294
+
295
+ [INFO|2025-02-11 17:48:41] logging.py:157 >> {'loss': 2.0083, 'learning_rate': 2.9801e-09, 'epoch': 11.58, 'throughput': 1550207.97}
296
+
297
+ [INFO|2025-02-11 17:48:53] logging.py:157 >> {'loss': 2.0161, 'learning_rate': 2.9521e-09, 'epoch': 11.58, 'throughput': 1504244.14}
298
+
299
+ [INFO|2025-02-11 17:49:04] logging.py:157 >> {'loss': 2.0157, 'learning_rate': 2.9242e-09, 'epoch': 11.59, 'throughput': 1461629.99}
300
+
301
+ [INFO|2025-02-11 17:49:16] logging.py:157 >> {'loss': 1.9792, 'learning_rate': 2.8964e-09, 'epoch': 11.59, 'throughput': 1421289.44}
302
+
303
+ [INFO|2025-02-11 17:49:28] logging.py:157 >> {'loss': 1.9482, 'learning_rate': 2.8688e-09, 'epoch': 11.59, 'throughput': 1382339.41}
304
+
305
+ [INFO|2025-02-11 17:49:40] logging.py:157 >> {'loss': 2.0295, 'learning_rate': 2.8413e-09, 'epoch': 11.59, 'throughput': 1346070.98}
306
+
307
+ [INFO|2025-02-11 17:49:52] logging.py:157 >> {'loss': 1.9778, 'learning_rate': 2.8139e-09, 'epoch': 11.59, 'throughput': 1311592.72}
308
+
309
+ [INFO|2025-02-11 17:50:03] logging.py:157 >> {'loss': 2.0103, 'learning_rate': 2.7867e-09, 'epoch': 11.60, 'throughput': 1279350.97}
310
+
311
+ [INFO|2025-02-11 17:50:15] logging.py:157 >> {'loss': 1.9449, 'learning_rate': 2.7595e-09, 'epoch': 11.60, 'throughput': 1248312.09}
312
+
313
+ [INFO|2025-02-11 17:50:27] logging.py:157 >> {'loss': 2.0110, 'learning_rate': 2.7326e-09, 'epoch': 11.60, 'throughput': 1218768.16}
314
+
315
+ [INFO|2025-02-11 17:50:39] logging.py:157 >> {'loss': 1.9304, 'learning_rate': 2.7057e-09, 'epoch': 11.60, 'throughput': 1190297.15}
316
+
317
+ [INFO|2025-02-11 17:50:51] logging.py:157 >> {'loss': 1.9709, 'learning_rate': 2.6790e-09, 'epoch': 11.60, 'throughput': 1162183.03}
318
+
319
+ [INFO|2025-02-11 17:51:03] logging.py:157 >> {'loss': 1.9486, 'learning_rate': 2.6524e-09, 'epoch': 11.61, 'throughput': 1136347.30}
320
+
321
+ [INFO|2025-02-11 17:51:14] logging.py:157 >> {'loss': 1.9941, 'learning_rate': 2.6260e-09, 'epoch': 11.61, 'throughput': 1111856.81}
322
+
323
+ [INFO|2025-02-11 17:51:26] logging.py:157 >> {'loss': 1.9774, 'learning_rate': 2.5997e-09, 'epoch': 11.61, 'throughput': 1088182.32}
324
+
325
+ [INFO|2025-02-11 17:51:38] logging.py:157 >> {'loss': 1.9954, 'learning_rate': 2.5735e-09, 'epoch': 11.61, 'throughput': 1065718.96}
326
+
327
+ [INFO|2025-02-11 17:51:50] logging.py:157 >> {'loss': 2.0438, 'learning_rate': 2.5475e-09, 'epoch': 11.61, 'throughput': 1044277.87}
328
+
329
+ [INFO|2025-02-11 17:52:01] logging.py:157 >> {'loss': 2.0343, 'learning_rate': 2.5215e-09, 'epoch': 11.62, 'throughput': 1023489.66}
330
+
331
+ [INFO|2025-02-11 17:52:01] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-590000
332
+
333
+ [INFO|2025-02-11 17:52:01] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\config.json
334
+
335
+ [INFO|2025-02-11 17:52:01] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\generation_config.json
336
+
337
+ [INFO|2025-02-11 17:52:02] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\model.safetensors
338
+
339
+ [INFO|2025-02-11 17:52:02] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\tokenizer_config.json
340
+
341
+ [INFO|2025-02-11 17:52:02] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\special_tokens_map.json
342
+
343
+ [INFO|2025-02-11 17:52:14] logging.py:157 >> {'loss': 2.0148, 'learning_rate': 2.4958e-09, 'epoch': 11.62, 'throughput': 1002469.52}
344
+
345
+ [INFO|2025-02-11 17:52:26] logging.py:157 >> {'loss': 1.9398, 'learning_rate': 2.4701e-09, 'epoch': 11.62, 'throughput': 983195.98}
346
+
347
+ [INFO|2025-02-11 17:52:37] logging.py:157 >> {'loss': 2.0115, 'learning_rate': 2.4446e-09, 'epoch': 11.62, 'throughput': 964942.63}
348
+
349
+ [INFO|2025-02-11 17:52:49] logging.py:157 >> {'loss': 1.9977, 'learning_rate': 2.4192e-09, 'epoch': 11.62, 'throughput': 946799.71}
350
+
351
+ [INFO|2025-02-11 17:53:01] logging.py:157 >> {'loss': 1.9790, 'learning_rate': 2.3939e-09, 'epoch': 11.63, 'throughput': 929612.66}
352
+
353
+ [INFO|2025-02-11 17:53:13] logging.py:157 >> {'loss': 2.0179, 'learning_rate': 2.3688e-09, 'epoch': 11.63, 'throughput': 912840.69}
354
+
355
+ [INFO|2025-02-11 17:53:25] logging.py:157 >> {'loss': 1.9422, 'learning_rate': 2.3438e-09, 'epoch': 11.63, 'throughput': 897250.33}
356
+
357
+ [INFO|2025-02-11 17:53:36] logging.py:157 >> {'loss': 1.9710, 'learning_rate': 2.3190e-09, 'epoch': 11.63, 'throughput': 882243.27}
358
+
359
+ [INFO|2025-02-11 17:53:48] logging.py:157 >> {'loss': 1.9900, 'learning_rate': 2.2942e-09, 'epoch': 11.63, 'throughput': 867690.03}
360
+
361
+ [INFO|2025-02-11 17:53:59] logging.py:157 >> {'loss': 2.0106, 'learning_rate': 2.2696e-09, 'epoch': 11.64, 'throughput': 853674.20}
362
+
363
+ [INFO|2025-02-11 17:54:11] logging.py:157 >> {'loss': 1.9897, 'learning_rate': 2.2452e-09, 'epoch': 11.64, 'throughput': 840110.77}
364
+
365
+ [INFO|2025-02-11 17:54:22] logging.py:157 >> {'loss': 2.0037, 'learning_rate': 2.2208e-09, 'epoch': 11.64, 'throughput': 826914.64}
366
+
367
+ [INFO|2025-02-11 17:54:34] logging.py:157 >> {'loss': 1.9609, 'learning_rate': 2.1966e-09, 'epoch': 11.64, 'throughput': 814139.50}
368
+
369
+ [INFO|2025-02-11 17:54:45] logging.py:157 >> {'loss': 2.0072, 'learning_rate': 2.1726e-09, 'epoch': 11.64, 'throughput': 801848.40}
370
+
371
+ [INFO|2025-02-11 17:54:57] logging.py:157 >> {'loss': 1.9598, 'learning_rate': 2.1486e-09, 'epoch': 11.65, 'throughput': 789459.13}
372
+
373
+ [INFO|2025-02-11 17:55:09] logging.py:157 >> {'loss': 1.9845, 'learning_rate': 2.1248e-09, 'epoch': 11.65, 'throughput': 777598.46}
374
+
375
+ [INFO|2025-02-11 17:55:21] logging.py:157 >> {'loss': 1.9423, 'learning_rate': 2.1012e-09, 'epoch': 11.65, 'throughput': 766045.00}
376
+
377
+ [INFO|2025-02-11 17:55:33] logging.py:157 >> {'loss': 1.9774, 'learning_rate': 2.0776e-09, 'epoch': 11.65, 'throughput': 754823.16}
378
+
379
+ [INFO|2025-02-11 17:55:45] logging.py:157 >> {'loss': 2.0093, 'learning_rate': 2.0542e-09, 'epoch': 11.65, 'throughput': 743976.59}
380
+
381
+ [INFO|2025-02-11 17:55:56] logging.py:157 >> {'loss': 1.9717, 'learning_rate': 2.0310e-09, 'epoch': 11.66, 'throughput': 733371.47}
382
+
383
+ [INFO|2025-02-11 17:56:08] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 2.0078e-09, 'epoch': 11.66, 'throughput': 723085.26}
384
+
385
+ [INFO|2025-02-11 17:56:20] logging.py:157 >> {'loss': 1.9593, 'learning_rate': 1.9848e-09, 'epoch': 11.66, 'throughput': 713131.55}
386
+
387
+ [INFO|2025-02-11 17:56:32] logging.py:157 >> {'loss': 1.9501, 'learning_rate': 1.9619e-09, 'epoch': 11.66, 'throughput': 703409.15}
388
+
389
+ [INFO|2025-02-11 17:56:44] logging.py:157 >> {'loss': 1.9638, 'learning_rate': 1.9392e-09, 'epoch': 11.66, 'throughput': 693950.69}
390
+
391
+ [INFO|2025-02-11 17:56:55] logging.py:157 >> {'loss': 1.9802, 'learning_rate': 1.9166e-09, 'epoch': 11.67, 'throughput': 684783.60}
392
+
393
+ [INFO|2025-02-11 17:57:08] logging.py:157 >> {'loss': 1.9731, 'learning_rate': 1.8941e-09, 'epoch': 11.67, 'throughput': 675622.49}
394
+
395
+ [INFO|2025-02-11 17:57:20] logging.py:157 >> {'loss': 1.9767, 'learning_rate': 1.8718e-09, 'epoch': 11.67, 'throughput': 666580.81}
396
+
397
+ [INFO|2025-02-11 17:57:32] logging.py:157 >> {'loss': 1.9927, 'learning_rate': 1.8496e-09, 'epoch': 11.67, 'throughput': 658156.99}
398
+
399
+ [INFO|2025-02-11 17:57:43] logging.py:157 >> {'loss': 1.9931, 'learning_rate': 1.8275e-09, 'epoch': 11.67, 'throughput': 649934.04}
400
+
401
+ [INFO|2025-02-11 17:57:55] logging.py:157 >> {'loss': 1.9789, 'learning_rate': 1.8055e-09, 'epoch': 11.68, 'throughput': 641903.03}
402
+
403
+ [INFO|2025-02-11 17:58:07] logging.py:157 >> {'loss': 2.0209, 'learning_rate': 1.7837e-09, 'epoch': 11.68, 'throughput': 634098.47}
404
+
405
+ [INFO|2025-02-11 17:58:19] logging.py:157 >> {'loss': 1.9245, 'learning_rate': 1.7620e-09, 'epoch': 11.68, 'throughput': 626445.98}
406
+
407
+ [INFO|2025-02-11 17:58:30] logging.py:157 >> {'loss': 2.0252, 'learning_rate': 1.7405e-09, 'epoch': 11.68, 'throughput': 618981.54}
408
+
409
+ [INFO|2025-02-11 17:58:42] logging.py:157 >> {'loss': 2.0151, 'learning_rate': 1.7191e-09, 'epoch': 11.68, 'throughput': 611668.79}
410
+
411
+ [INFO|2025-02-11 17:58:54] logging.py:157 >> {'loss': 1.9501, 'learning_rate': 1.6978e-09, 'epoch': 11.69, 'throughput': 604546.71}
412
+
413
+ [INFO|2025-02-11 17:59:06] logging.py:157 >> {'loss': 2.0443, 'learning_rate': 1.6766e-09, 'epoch': 11.69, 'throughput': 597713.96}
414
+
415
+ [INFO|2025-02-11 17:59:17] logging.py:157 >> {'loss': 1.9772, 'learning_rate': 1.6556e-09, 'epoch': 11.69, 'throughput': 591071.07}
416
+
417
+ [INFO|2025-02-11 17:59:29] logging.py:157 >> {'loss': 1.9930, 'learning_rate': 1.6347e-09, 'epoch': 11.69, 'throughput': 584451.24}
418
+
419
+ [INFO|2025-02-11 17:59:41] logging.py:157 >> {'loss': 2.0026, 'learning_rate': 1.6139e-09, 'epoch': 11.69, 'throughput': 577957.44}
420
+
421
+ [INFO|2025-02-11 17:59:52] logging.py:157 >> {'loss': 2.0245, 'learning_rate': 1.5933e-09, 'epoch': 11.69, 'throughput': 571700.05}
422
+
423
+ [INFO|2025-02-11 18:00:04] logging.py:157 >> {'loss': 2.0352, 'learning_rate': 1.5728e-09, 'epoch': 11.70, 'throughput': 565427.48}
424
+
425
+ [INFO|2025-02-11 18:00:16] logging.py:157 >> {'loss': 1.9875, 'learning_rate': 1.5525e-09, 'epoch': 11.70, 'throughput': 559389.83}
426
+
427
+ [INFO|2025-02-11 18:00:28] logging.py:157 >> {'loss': 1.9973, 'learning_rate': 1.5322e-09, 'epoch': 11.70, 'throughput': 553408.92}
428
+
429
+ [INFO|2025-02-11 18:00:40] logging.py:157 >> {'loss': 2.0091, 'learning_rate': 1.5121e-09, 'epoch': 11.70, 'throughput': 547663.70}
430
+
431
+ [INFO|2025-02-11 18:00:51] logging.py:157 >> {'loss': 2.0294, 'learning_rate': 1.4922e-09, 'epoch': 11.70, 'throughput': 542034.94}
432
+
433
+ [INFO|2025-02-11 18:01:03] logging.py:157 >> {'loss': 1.9653, 'learning_rate': 1.4723e-09, 'epoch': 11.71, 'throughput': 536571.61}
434
+
435
+ [INFO|2025-02-11 18:01:14] logging.py:157 >> {'loss': 1.9635, 'learning_rate': 1.4527e-09, 'epoch': 11.71, 'throughput': 531161.51}
436
+
437
+ [INFO|2025-02-11 18:01:26] logging.py:157 >> {'loss': 1.9859, 'learning_rate': 1.4331e-09, 'epoch': 11.71, 'throughput': 525822.29}
438
+
439
+ [INFO|2025-02-11 18:01:38] logging.py:157 >> {'loss': 1.9933, 'learning_rate': 1.4137e-09, 'epoch': 11.71, 'throughput': 520580.98}
440
+
441
+ [INFO|2025-02-11 18:01:50] logging.py:157 >> {'loss': 1.9638, 'learning_rate': 1.3944e-09, 'epoch': 11.71, 'throughput': 515438.63}
442
+
443
+ [INFO|2025-02-11 18:01:50] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-595000
444
+
445
+ [INFO|2025-02-11 18:01:50] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\config.json
446
+
447
+ [INFO|2025-02-11 18:01:50] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\generation_config.json
448
+
449
+ [INFO|2025-02-11 18:01:50] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\model.safetensors
450
+
451
+ [INFO|2025-02-11 18:01:50] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\tokenizer_config.json
452
+
453
+ [INFO|2025-02-11 18:01:50] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\special_tokens_map.json
454
+
455
+ [INFO|2025-02-11 18:02:02] logging.py:157 >> {'loss': 2.0220, 'learning_rate': 1.3752e-09, 'epoch': 11.72, 'throughput': 510131.56}
456
+
457
+ [INFO|2025-02-11 18:02:14] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 1.3561e-09, 'epoch': 11.72, 'throughput': 505271.09}
458
+
459
+ [INFO|2025-02-11 18:02:25] logging.py:157 >> {'loss': 1.9671, 'learning_rate': 1.3372e-09, 'epoch': 11.72, 'throughput': 500534.56}
460
+
461
+ [INFO|2025-02-11 18:02:37] logging.py:157 >> {'loss': 1.9840, 'learning_rate': 1.3185e-09, 'epoch': 11.72, 'throughput': 495856.27}
462
+
463
+ [INFO|2025-02-11 18:02:49] logging.py:157 >> {'loss': 1.9937, 'learning_rate': 1.2998e-09, 'epoch': 11.72, 'throughput': 491327.18}
464
+
465
+ [INFO|2025-02-11 18:03:00] logging.py:157 >> {'loss': 1.9848, 'learning_rate': 1.2813e-09, 'epoch': 11.73, 'throughput': 486779.60}
466
+
467
+ [INFO|2025-02-11 18:03:12] logging.py:157 >> {'loss': 1.9859, 'learning_rate': 1.2630e-09, 'epoch': 11.73, 'throughput': 482340.06}
468
+
469
+ [INFO|2025-02-11 18:03:23] logging.py:157 >> {'loss': 2.0021, 'learning_rate': 1.2447e-09, 'epoch': 11.73, 'throughput': 478029.97}
470
+
471
+ [INFO|2025-02-11 18:03:35] logging.py:157 >> {'loss': 1.9589, 'learning_rate': 1.2266e-09, 'epoch': 11.73, 'throughput': 473763.66}
472
+
473
+ [INFO|2025-02-11 18:03:47] logging.py:157 >> {'loss': 2.0034, 'learning_rate': 1.2086e-09, 'epoch': 11.73, 'throughput': 469588.35}
474
+
475
+ [INFO|2025-02-11 18:03:58] logging.py:157 >> {'loss': 1.9454, 'learning_rate': 1.1908e-09, 'epoch': 11.74, 'throughput': 465484.85}
476
+
477
+ [INFO|2025-02-11 18:04:10] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 1.1731e-09, 'epoch': 11.74, 'throughput': 461443.54}
478
+
479
+ [INFO|2025-02-11 18:04:22] logging.py:157 >> {'loss': 1.9869, 'learning_rate': 1.1555e-09, 'epoch': 11.74, 'throughput': 457476.90}
480
+
481
+ [INFO|2025-02-11 18:04:33] logging.py:157 >> {'loss': 1.9897, 'learning_rate': 1.1381e-09, 'epoch': 11.74, 'throughput': 453577.26}
482
+
483
+ [INFO|2025-02-11 18:04:45] logging.py:157 >> {'loss': 1.9508, 'learning_rate': 1.1207e-09, 'epoch': 11.74, 'throughput': 449756.40}
484
+
485
+ [INFO|2025-02-11 18:04:57] logging.py:157 >> {'loss': 2.0472, 'learning_rate': 1.1036e-09, 'epoch': 11.75, 'throughput': 445933.85}
486
+
487
+ [INFO|2025-02-11 18:05:08] logging.py:157 >> {'loss': 2.0183, 'learning_rate': 1.0865e-09, 'epoch': 11.75, 'throughput': 442211.33}
488
+
489
+ [INFO|2025-02-11 18:05:20] logging.py:157 >> {'loss': 1.9619, 'learning_rate': 1.0696e-09, 'epoch': 11.75, 'throughput': 438547.56}
490
+
491
+ [INFO|2025-02-11 18:05:32] logging.py:157 >> {'loss': 1.9705, 'learning_rate': 1.0528e-09, 'epoch': 11.75, 'throughput': 434931.99}
492
+
493
+ [INFO|2025-02-11 18:05:44] logging.py:157 >> {'loss': 1.9639, 'learning_rate': 1.0362e-09, 'epoch': 11.75, 'throughput': 431369.53}
494
+
495
+ [INFO|2025-02-11 18:05:55] logging.py:157 >> {'loss': 2.0180, 'learning_rate': 1.0197e-09, 'epoch': 11.76, 'throughput': 427895.84}
496
+
497
+ [INFO|2025-02-11 18:06:07] logging.py:157 >> {'loss': 1.9725, 'learning_rate': 1.0033e-09, 'epoch': 11.76, 'throughput': 424536.52}
498
+
499
+ [INFO|2025-02-11 18:06:18] logging.py:157 >> {'loss': 1.9720, 'learning_rate': 9.8702e-10, 'epoch': 11.76, 'throughput': 421155.65}
500
+
501
+ [INFO|2025-02-11 18:06:30] logging.py:157 >> {'loss': 1.9493, 'learning_rate': 9.7090e-10, 'epoch': 11.76, 'throughput': 417872.57}
502
+
503
+ [INFO|2025-02-11 18:06:42] logging.py:157 >> {'loss': 1.9629, 'learning_rate': 9.5491e-10, 'epoch': 11.76, 'throughput': 414591.42}
504
+
505
+ [INFO|2025-02-11 18:06:53] logging.py:157 >> {'loss': 2.0116, 'learning_rate': 9.3906e-10, 'epoch': 11.77, 'throughput': 411412.15}
506
+
507
+ [INFO|2025-02-11 18:07:05] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 9.2333e-10, 'epoch': 11.77, 'throughput': 408307.17}
508
+
509
+ [INFO|2025-02-11 18:07:16] logging.py:157 >> {'loss': 2.0005, 'learning_rate': 9.0775e-10, 'epoch': 11.77, 'throughput': 405234.40}
510
+
511
+ [INFO|2025-02-11 18:07:28] logging.py:157 >> {'loss': 2.0201, 'learning_rate': 8.9229e-10, 'epoch': 11.77, 'throughput': 402202.34}
512
+
513
+ [INFO|2025-02-11 18:07:40] logging.py:157 >> {'loss': 2.0112, 'learning_rate': 8.7696e-10, 'epoch': 11.77, 'throughput': 399205.39}
514
+
515
+ [INFO|2025-02-11 18:07:51] logging.py:157 >> {'loss': 1.9830, 'learning_rate': 8.6177e-10, 'epoch': 11.78, 'throughput': 396210.92}
516
+
517
+ [INFO|2025-02-11 18:08:03] logging.py:157 >> {'loss': 1.9361, 'learning_rate': 8.4671e-10, 'epoch': 11.78, 'throughput': 393320.81}
518
+
519
+ [INFO|2025-02-11 18:08:15] logging.py:157 >> {'loss': 1.9500, 'learning_rate': 8.3179e-10, 'epoch': 11.78, 'throughput': 390469.11}
520
+
521
+ [INFO|2025-02-11 18:08:26] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 8.1700e-10, 'epoch': 11.78, 'throughput': 387669.28}
522
+
523
+ [INFO|2025-02-11 18:08:38] logging.py:157 >> {'loss': 1.9561, 'learning_rate': 8.0233e-10, 'epoch': 11.78, 'throughput': 384887.83}
524
+
525
+ [INFO|2025-02-11 18:08:49] logging.py:157 >> {'loss': 1.9669, 'learning_rate': 7.8781e-10, 'epoch': 11.79, 'throughput': 382152.22}
526
+
527
+ [INFO|2025-02-11 18:09:01] logging.py:157 >> {'loss': 2.0171, 'learning_rate': 7.7341e-10, 'epoch': 11.79, 'throughput': 379451.83}
528
+
529
+ [INFO|2025-02-11 18:09:13] logging.py:157 >> {'loss': 1.9596, 'learning_rate': 7.5915e-10, 'epoch': 11.79, 'throughput': 376779.36}
530
+
531
+ [INFO|2025-02-11 18:09:24] logging.py:157 >> {'loss': 1.9839, 'learning_rate': 7.4502e-10, 'epoch': 11.79, 'throughput': 374121.06}
532
+
533
+ [INFO|2025-02-11 18:09:36] logging.py:157 >> {'loss': 1.9670, 'learning_rate': 7.3102e-10, 'epoch': 11.79, 'throughput': 371467.09}
534
+
535
+ [INFO|2025-02-11 18:09:48] logging.py:157 >> {'loss': 1.9590, 'learning_rate': 7.1715e-10, 'epoch': 11.80, 'throughput': 368908.16}
536
+
537
+ [INFO|2025-02-11 18:10:00] logging.py:157 >> {'loss': 1.9685, 'learning_rate': 7.0342e-10, 'epoch': 11.80, 'throughput': 366412.85}
538
+
539
+ [INFO|2025-02-11 18:10:11] logging.py:157 >> {'loss': 1.9602, 'learning_rate': 6.8982e-10, 'epoch': 11.80, 'throughput': 363886.09}
540
+
541
+ [INFO|2025-02-11 18:10:23] logging.py:157 >> {'loss': 1.9667, 'learning_rate': 6.7635e-10, 'epoch': 11.80, 'throughput': 361409.68}
542
+
543
+ [INFO|2025-02-11 18:10:35] logging.py:157 >> {'loss': 1.9997, 'learning_rate': 6.6302e-10, 'epoch': 11.80, 'throughput': 359002.79}
544
+
545
+ [INFO|2025-02-11 18:10:46] logging.py:157 >> {'loss': 1.9776, 'learning_rate': 6.4982e-10, 'epoch': 11.81, 'throughput': 356673.52}
546
+
547
+ [INFO|2025-02-11 18:10:58] logging.py:157 >> {'loss': 1.9370, 'learning_rate': 6.3675e-10, 'epoch': 11.81, 'throughput': 354316.74}
548
+
549
+ [INFO|2025-02-11 18:11:10] logging.py:157 >> {'loss': 1.9884, 'learning_rate': 6.2381e-10, 'epoch': 11.81, 'throughput': 351970.04}
550
+
551
+ [INFO|2025-02-11 18:11:21] logging.py:157 >> {'loss': 1.9506, 'learning_rate': 6.1101e-10, 'epoch': 11.81, 'throughput': 349703.67}
552
+
553
+ [INFO|2025-02-11 18:11:33] logging.py:157 >> {'loss': 1.9925, 'learning_rate': 5.9834e-10, 'epoch': 11.81, 'throughput': 347455.50}
554
+
555
+ [INFO|2025-02-11 18:11:33] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-600000
556
+
557
+ [INFO|2025-02-11 18:11:33] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\config.json
558
+
559
+ [INFO|2025-02-11 18:11:33] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\generation_config.json
560
+
561
+ [INFO|2025-02-11 18:11:33] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\model.safetensors
562
+
563
+ [INFO|2025-02-11 18:11:33] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\tokenizer_config.json
564
+
565
+ [INFO|2025-02-11 18:11:33] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\special_tokens_map.json
566
+
567
+ [INFO|2025-02-11 18:11:45] logging.py:157 >> {'loss': 2.0176, 'learning_rate': 5.8580e-10, 'epoch': 11.82, 'throughput': 345137.06}
568
+
569
+ [INFO|2025-02-11 18:11:56] logging.py:157 >> {'loss': 1.9770, 'learning_rate': 5.7339e-10, 'epoch': 11.82, 'throughput': 342956.52}
570
+
571
+ [INFO|2025-02-11 18:12:08] logging.py:157 >> {'loss': 2.0058, 'learning_rate': 5.6112e-10, 'epoch': 11.82, 'throughput': 340798.55}
572
+
573
+ [INFO|2025-02-11 18:12:20] logging.py:157 >> {'loss': 1.9480, 'learning_rate': 5.4898e-10, 'epoch': 11.82, 'throughput': 338660.27}
574
+
575
+ [INFO|2025-02-11 18:12:31] logging.py:157 >> {'loss': 1.9882, 'learning_rate': 5.3697e-10, 'epoch': 11.82, 'throughput': 336520.05}
576
+
577
+ [INFO|2025-02-11 18:12:44] logging.py:157 >> {'loss': 1.9543, 'learning_rate': 5.2510e-10, 'epoch': 11.82, 'throughput': 334342.07}
578
+
579
+ [INFO|2025-02-11 18:12:55] logging.py:157 >> {'loss': 2.0162, 'learning_rate': 5.1336e-10, 'epoch': 11.83, 'throughput': 332223.27}
580
+
581
+ [INFO|2025-02-11 18:13:07] logging.py:157 >> {'loss': 1.9700, 'learning_rate': 5.0175e-10, 'epoch': 11.83, 'throughput': 330148.95}
582
+
583
+ [INFO|2025-02-11 18:13:19] logging.py:157 >> {'loss': 1.9556, 'learning_rate': 4.9027e-10, 'epoch': 11.83, 'throughput': 328120.14}
584
+
585
+ [INFO|2025-02-11 18:13:31] logging.py:157 >> {'loss': 1.9164, 'learning_rate': 4.7893e-10, 'epoch': 11.83, 'throughput': 326103.62}
586
+
587
+ [INFO|2025-02-11 18:13:43] logging.py:157 >> {'loss': 2.0214, 'learning_rate': 4.6771e-10, 'epoch': 11.83, 'throughput': 324137.85}
588
+
589
+ [INFO|2025-02-11 18:13:54] logging.py:157 >> {'loss': 1.9946, 'learning_rate': 4.5664e-10, 'epoch': 11.84, 'throughput': 322178.13}
590
+
591
+ [INFO|2025-02-11 18:14:06] logging.py:157 >> {'loss': 1.9869, 'learning_rate': 4.4569e-10, 'epoch': 11.84, 'throughput': 320240.19}
592
+
593
+ [INFO|2025-02-11 18:14:18] logging.py:157 >> {'loss': 1.9422, 'learning_rate': 4.3488e-10, 'epoch': 11.84, 'throughput': 318336.19}
594
+
595
+ [INFO|2025-02-11 18:14:30] logging.py:157 >> {'loss': 2.0274, 'learning_rate': 4.2420e-10, 'epoch': 11.84, 'throughput': 316426.29}
596
+
597
+ [INFO|2025-02-11 18:14:42] logging.py:157 >> {'loss': 1.9705, 'learning_rate': 4.1365e-10, 'epoch': 11.84, 'throughput': 314563.35}
598
+
599
+ [INFO|2025-02-11 18:14:53] logging.py:157 >> {'loss': 2.0292, 'learning_rate': 4.0323e-10, 'epoch': 11.85, 'throughput': 312747.74}
600
+
601
+ [INFO|2025-02-11 18:15:05] logging.py:157 >> {'loss': 1.9715, 'learning_rate': 3.9295e-10, 'epoch': 11.85, 'throughput': 310944.31}
602
+
603
+ [INFO|2025-02-11 18:15:17] logging.py:157 >> {'loss': 1.9632, 'learning_rate': 3.8280e-10, 'epoch': 11.85, 'throughput': 309151.82}
604
+
605
+ [INFO|2025-02-11 18:15:29] logging.py:157 >> {'loss': 1.9986, 'learning_rate': 3.7279e-10, 'epoch': 11.85, 'throughput': 307389.38}
606
+
607
+ [INFO|2025-02-11 18:15:40] logging.py:157 >> {'loss': 1.9875, 'learning_rate': 3.6290e-10, 'epoch': 11.85, 'throughput': 305642.32}
608
+
609
+ [INFO|2025-02-11 18:15:52] logging.py:157 >> {'loss': 1.9834, 'learning_rate': 3.5315e-10, 'epoch': 11.86, 'throughput': 303915.22}
610
+
611
+ [INFO|2025-02-11 18:16:04] logging.py:157 >> {'loss': 1.9761, 'learning_rate': 3.4353e-10, 'epoch': 11.86, 'throughput': 302186.17}
612
+
613
+ [INFO|2025-02-11 18:16:16] logging.py:157 >> {'loss': 1.9958, 'learning_rate': 3.3405e-10, 'epoch': 11.86, 'throughput': 300490.49}
614
+
615
+ [INFO|2025-02-11 18:16:27] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 3.2469e-10, 'epoch': 11.86, 'throughput': 298820.94}
616
+
617
+ [INFO|2025-02-11 18:16:39] logging.py:157 >> {'loss': 1.9959, 'learning_rate': 3.1547e-10, 'epoch': 11.86, 'throughput': 297188.72}
618
+
619
+ [INFO|2025-02-11 18:16:51] logging.py:157 >> {'loss': 2.0291, 'learning_rate': 3.0639e-10, 'epoch': 11.87, 'throughput': 295566.17}
620
+
621
+ [INFO|2025-02-11 18:17:02] logging.py:157 >> {'loss': 1.9732, 'learning_rate': 2.9743e-10, 'epoch': 11.87, 'throughput': 293965.39}
622
+
623
+ [INFO|2025-02-11 18:17:14] logging.py:157 >> {'loss': 1.9883, 'learning_rate': 2.8861e-10, 'epoch': 11.87, 'throughput': 292397.39}
624
+
625
+ [INFO|2025-02-11 18:17:25] logging.py:157 >> {'loss': 2.0485, 'learning_rate': 2.7992e-10, 'epoch': 11.87, 'throughput': 290839.54}
626
+
627
+ [INFO|2025-02-11 18:17:37] logging.py:157 >> {'loss': 1.9821, 'learning_rate': 2.7136e-10, 'epoch': 11.87, 'throughput': 289296.97}
628
+
629
+ [INFO|2025-02-11 18:17:49] logging.py:157 >> {'loss': 1.9260, 'learning_rate': 2.6294e-10, 'epoch': 11.88, 'throughput': 287758.88}
630
+
631
+ [INFO|2025-02-11 18:18:00] logging.py:157 >> {'loss': 1.9753, 'learning_rate': 2.5465e-10, 'epoch': 11.88, 'throughput': 286236.56}
632
+
633
+ [INFO|2025-02-11 18:18:12] logging.py:157 >> {'loss': 1.9688, 'learning_rate': 2.4649e-10, 'epoch': 11.88, 'throughput': 284738.27}
634
+
635
+ [INFO|2025-02-11 18:18:24] logging.py:157 >> {'loss': 2.0239, 'learning_rate': 2.3847e-10, 'epoch': 11.88, 'throughput': 283255.23}
636
+
637
+ [INFO|2025-02-11 18:18:35] logging.py:157 >> {'loss': 1.9905, 'learning_rate': 2.3057e-10, 'epoch': 11.88, 'throughput': 281788.13}
638
+
639
+ [INFO|2025-02-11 18:18:47] logging.py:157 >> {'loss': 1.9786, 'learning_rate': 2.2281e-10, 'epoch': 11.89, 'throughput': 280336.59}
640
+
641
+ [INFO|2025-02-11 18:18:59] logging.py:157 >> {'loss': 1.9951, 'learning_rate': 2.1519e-10, 'epoch': 11.89, 'throughput': 278902.58}
642
+
643
+ [INFO|2025-02-11 18:19:10] logging.py:157 >> {'loss': 1.9598, 'learning_rate': 2.0769e-10, 'epoch': 11.89, 'throughput': 277472.95}
644
+
645
+ [INFO|2025-02-11 18:19:22] logging.py:157 >> {'loss': 2.0105, 'learning_rate': 2.0033e-10, 'epoch': 11.89, 'throughput': 276052.37}
646
+
647
+ [INFO|2025-02-11 18:19:34] logging.py:157 >> {'loss': 1.9820, 'learning_rate': 1.9310e-10, 'epoch': 11.89, 'throughput': 274651.96}
648
+
649
+ [INFO|2025-02-11 18:19:46] logging.py:157 >> {'loss': 1.9581, 'learning_rate': 1.8601e-10, 'epoch': 11.90, 'throughput': 273266.57}
650
+
651
+ [INFO|2025-02-11 18:19:57] logging.py:157 >> {'loss': 1.9834, 'learning_rate': 1.7904e-10, 'epoch': 11.90, 'throughput': 271897.28}
652
+
653
+ [INFO|2025-02-11 18:20:09] logging.py:157 >> {'loss': 2.0045, 'learning_rate': 1.7221e-10, 'epoch': 11.90, 'throughput': 270528.26}
654
+
655
+ [INFO|2025-02-11 18:20:21] logging.py:157 >> {'loss': 1.9594, 'learning_rate': 1.6552e-10, 'epoch': 11.90, 'throughput': 269176.81}
656
+
657
+ [INFO|2025-02-11 18:20:33] logging.py:157 >> {'loss': 1.9738, 'learning_rate': 1.5895e-10, 'epoch': 11.90, 'throughput': 267831.20}
658
+
659
+ [INFO|2025-02-11 18:20:45] logging.py:157 >> {'loss': 2.0208, 'learning_rate': 1.5252e-10, 'epoch': 11.91, 'throughput': 266498.59}
660
+
661
+ [INFO|2025-02-11 18:20:57] logging.py:157 >> {'loss': 1.9997, 'learning_rate': 1.4622e-10, 'epoch': 11.91, 'throughput': 265185.54}
662
+
663
+ [INFO|2025-02-11 18:21:09] logging.py:157 >> {'loss': 2.0192, 'learning_rate': 1.4006e-10, 'epoch': 11.91, 'throughput': 263875.74}
664
+
665
+ [INFO|2025-02-11 18:21:21] logging.py:157 >> {'loss': 2.0046, 'learning_rate': 1.3402e-10, 'epoch': 11.91, 'throughput': 262555.68}
666
+
667
+ [INFO|2025-02-11 18:21:21] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-605000
668
+
669
+ [INFO|2025-02-11 18:21:21] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\config.json
670
+
671
+ [INFO|2025-02-11 18:21:21] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\generation_config.json
672
+
673
+ [INFO|2025-02-11 18:21:21] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\model.safetensors
674
+
675
+ [INFO|2025-02-11 18:21:21] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\tokenizer_config.json
676
+
677
+ [INFO|2025-02-11 18:21:21] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\special_tokens_map.json
678
+
679
+ [INFO|2025-02-11 18:21:33] logging.py:157 >> {'loss': 1.9669, 'learning_rate': 1.2812e-10, 'epoch': 11.91, 'throughput': 261169.48}
680
+
681
+ [INFO|2025-02-11 18:21:45] logging.py:157 >> {'loss': 1.9956, 'learning_rate': 1.2235e-10, 'epoch': 11.92, 'throughput': 259898.90}
682
+
683
+ [INFO|2025-02-11 18:21:57] logging.py:157 >> {'loss': 2.0026, 'learning_rate': 1.1672e-10, 'epoch': 11.92, 'throughput': 258652.69}
684
+
685
+ [INFO|2025-02-11 18:22:09] logging.py:157 >> {'loss': 1.9802, 'learning_rate': 1.1122e-10, 'epoch': 11.92, 'throughput': 257444.84}
686
+
687
+ [INFO|2025-02-11 18:22:20] logging.py:157 >> {'loss': 1.9955, 'learning_rate': 1.0585e-10, 'epoch': 11.92, 'throughput': 256244.85}
688
+
689
+ [INFO|2025-02-11 18:22:32] logging.py:157 >> {'loss': 2.0080, 'learning_rate': 1.0061e-10, 'epoch': 11.92, 'throughput': 255055.64}
690
+
691
+ [INFO|2025-02-11 18:22:44] logging.py:157 >> {'loss': 2.0313, 'learning_rate': 9.5508e-11, 'epoch': 11.93, 'throughput': 253878.46}
692
+
693
+ [INFO|2025-02-11 18:22:55] logging.py:157 >> {'loss': 2.0100, 'learning_rate': 9.0537e-11, 'epoch': 11.93, 'throughput': 252726.02}
694
+
695
+ [INFO|2025-02-11 18:23:07] logging.py:157 >> {'loss': 2.0303, 'learning_rate': 8.5699e-11, 'epoch': 11.93, 'throughput': 251573.33}
696
+
697
+ [INFO|2025-02-11 18:23:18] logging.py:157 >> {'loss': 1.9680, 'learning_rate': 8.0994e-11, 'epoch': 11.93, 'throughput': 250425.41}
698
+
699
+ [INFO|2025-02-11 18:23:30] logging.py:157 >> {'loss': 2.0075, 'learning_rate': 7.6422e-11, 'epoch': 11.93, 'throughput': 249294.78}
700
+
701
+ [INFO|2025-02-11 18:23:42] logging.py:157 >> {'loss': 1.9790, 'learning_rate': 7.1983e-11, 'epoch': 11.94, 'throughput': 248167.19}
702
+
703
+ [INFO|2025-02-11 18:23:53] logging.py:157 >> {'loss': 2.0086, 'learning_rate': 6.7676e-11, 'epoch': 11.94, 'throughput': 247047.41}
704
+
705
+ [INFO|2025-02-11 18:24:05] logging.py:157 >> {'loss': 1.9897, 'learning_rate': 6.3502e-11, 'epoch': 11.94, 'throughput': 245938.79}
706
+
707
+ [INFO|2025-02-11 18:24:17] logging.py:157 >> {'loss': 2.0329, 'learning_rate': 5.9461e-11, 'epoch': 11.94, 'throughput': 244853.80}
708
+
709
+ [INFO|2025-02-11 18:24:28] logging.py:157 >> {'loss': 1.9813, 'learning_rate': 5.5553e-11, 'epoch': 11.94, 'throughput': 243763.46}
710
+
711
+ [INFO|2025-02-11 18:24:40] logging.py:157 >> {'loss': 1.9675, 'learning_rate': 5.1778e-11, 'epoch': 11.95, 'throughput': 242684.81}
712
+
713
+ [INFO|2025-02-11 18:24:52] logging.py:157 >> {'loss': 2.0206, 'learning_rate': 4.8135e-11, 'epoch': 11.95, 'throughput': 241619.87}
714
+
715
+ [INFO|2025-02-11 18:25:03] logging.py:157 >> {'loss': 1.9695, 'learning_rate': 4.4625e-11, 'epoch': 11.95, 'throughput': 240551.71}
716
+
717
+ [INFO|2025-02-11 18:25:15] logging.py:157 >> {'loss': 1.9561, 'learning_rate': 4.1248e-11, 'epoch': 11.95, 'throughput': 239500.87}
718
+
719
+ [INFO|2025-02-11 18:25:27] logging.py:157 >> {'loss': 1.9766, 'learning_rate': 3.8004e-11, 'epoch': 11.95, 'throughput': 238446.21}
720
+
721
+ [INFO|2025-02-11 18:25:39] logging.py:157 >> {'loss': 2.0330, 'learning_rate': 3.4893e-11, 'epoch': 11.95, 'throughput': 237392.63}
722
+
723
+ [INFO|2025-02-11 18:25:51] logging.py:157 >> {'loss': 1.9738, 'learning_rate': 3.1915e-11, 'epoch': 11.96, 'throughput': 236352.43}
724
+
725
+ [INFO|2025-02-11 18:26:03] logging.py:157 >> {'loss': 2.0233, 'learning_rate': 2.9069e-11, 'epoch': 11.96, 'throughput': 235318.62}
726
+
727
+ [INFO|2025-02-11 18:26:14] logging.py:157 >> {'loss': 1.9637, 'learning_rate': 2.6357e-11, 'epoch': 11.96, 'throughput': 234313.34}
728
+
729
+ [INFO|2025-02-11 18:26:26] logging.py:157 >> {'loss': 1.9714, 'learning_rate': 2.3777e-11, 'epoch': 11.96, 'throughput': 233322.02}
730
+
731
+ [INFO|2025-02-11 18:26:38] logging.py:157 >> {'loss': 1.9451, 'learning_rate': 2.1330e-11, 'epoch': 11.96, 'throughput': 232339.87}
732
+
733
+ [INFO|2025-02-11 18:26:49] logging.py:157 >> {'loss': 2.0128, 'learning_rate': 1.9016e-11, 'epoch': 11.97, 'throughput': 231362.97}
734
+
735
+ [INFO|2025-02-11 18:27:01] logging.py:157 >> {'loss': 1.9903, 'learning_rate': 1.6835e-11, 'epoch': 11.97, 'throughput': 230391.77}
736
+
737
+ [INFO|2025-02-11 18:27:13] logging.py:157 >> {'loss': 2.0050, 'learning_rate': 1.4786e-11, 'epoch': 11.97, 'throughput': 229432.14}
738
+
739
+ [INFO|2025-02-11 18:27:24] logging.py:157 >> {'loss': 1.9584, 'learning_rate': 1.2870e-11, 'epoch': 11.97, 'throughput': 228474.82}
740
+
741
+ [INFO|2025-02-11 18:27:36] logging.py:157 >> {'loss': 1.9683, 'learning_rate': 1.1088e-11, 'epoch': 11.97, 'throughput': 227535.44}
742
+
743
+ [INFO|2025-02-11 18:27:48] logging.py:157 >> {'loss': 1.9758, 'learning_rate': 9.4378e-12, 'epoch': 11.98, 'throughput': 226600.82}
744
+
745
+ [INFO|2025-02-11 18:27:59] logging.py:157 >> {'loss': 2.0262, 'learning_rate': 7.9207e-12, 'epoch': 11.98, 'throughput': 225680.27}
746
+
747
+ [INFO|2025-02-11 18:28:11] logging.py:157 >> {'loss': 1.9921, 'learning_rate': 6.5364e-12, 'epoch': 11.98, 'throughput': 224757.71}
748
+
749
+ [INFO|2025-02-11 18:28:23] logging.py:157 >> {'loss': 1.9910, 'learning_rate': 5.2850e-12, 'epoch': 11.98, 'throughput': 223841.89}
750
+
751
+ [INFO|2025-02-11 18:28:35] logging.py:157 >> {'loss': 1.9481, 'learning_rate': 4.1665e-12, 'epoch': 11.98, 'throughput': 222924.90}
752
+
753
+ [INFO|2025-02-11 18:28:46] logging.py:157 >> {'loss': 1.9444, 'learning_rate': 3.1808e-12, 'epoch': 11.99, 'throughput': 222033.51}
754
+
755
+ [INFO|2025-02-11 18:28:58] logging.py:157 >> {'loss': 1.9570, 'learning_rate': 2.3279e-12, 'epoch': 11.99, 'throughput': 221145.11}
756
+
757
+ [INFO|2025-02-11 18:29:10] logging.py:157 >> {'loss': 2.0438, 'learning_rate': 1.6079e-12, 'epoch': 11.99, 'throughput': 220258.60}
758
+
759
+ [INFO|2025-02-11 18:29:22] logging.py:157 >> {'loss': 1.9305, 'learning_rate': 1.0207e-12, 'epoch': 11.99, 'throughput': 219369.78}
760
+
761
+ [INFO|2025-02-11 18:29:33] logging.py:157 >> {'loss': 1.9854, 'learning_rate': 5.6635e-13, 'epoch': 11.99, 'throughput': 218488.66}
762
+
763
+ [INFO|2025-02-11 18:29:45] logging.py:157 >> {'loss': 1.9765, 'learning_rate': 2.4486e-13, 'epoch': 12.00, 'throughput': 217621.92}
764
+
765
+ [INFO|2025-02-11 18:29:57] logging.py:157 >> {'loss': 1.9388, 'learning_rate': 5.6220e-14, 'epoch': 12.00, 'throughput': 216764.99}
766
+
767
+ [INFO|2025-02-11 18:30:08] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-609492
768
+
769
+ [INFO|2025-02-11 18:30:08] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\config.json
770
+
771
+ [INFO|2025-02-11 18:30:08] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\generation_config.json
772
+
773
+ [INFO|2025-02-11 18:30:08] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\model.safetensors
774
+
775
+ [INFO|2025-02-11 18:30:08] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\tokenizer_config.json
776
+
777
+ [INFO|2025-02-11 18:30:08] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\special_tokens_map.json
778
+
779
+ [INFO|2025-02-11 18:30:08] trainer.py:2643 >>
780
+
781
+ Training completed. Do not forget to share your model on huggingface.co/models =)
782
+
783
+
784
+
785
+ [INFO|2025-02-11 18:30:08] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025
786
+
787
+ [INFO|2025-02-11 18:30:08] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\config.json
788
+
789
+ [INFO|2025-02-11 18:30:08] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\generation_config.json
790
+
791
+ [INFO|2025-02-11 18:30:09] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\model.safetensors
792
+
793
+ [INFO|2025-02-11 18:30:09] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\tokenizer_config.json
794
+
795
+ [INFO|2025-02-11 18:30:09] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\special_tokens_map.json
796
+
797
+ [WARNING|2025-02-11 18:30:09] logging.py:162 >> No metric eval_loss to plot.
798
+
799
+ [WARNING|2025-02-11 18:30:09] logging.py:162 >> No metric eval_accuracy to plot.
800
+
801
+ [INFO|2025-02-11 18:30:09] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
802
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
803
+
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "chat_template": "{% set system_message = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + content + '\n\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|endoftext|>' + '\n\n' }}{% endif %}{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": {},
+   "model_max_length": 1024,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
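
The chat_template above is the Alpaca-style prompt format (matching template: alpaca in the training configs): a system preamble, then "### Instruction:" / "### Response:" blocks, with <|endoftext|> closing each assistant turn. A small rendering sketch, under the same repo-id assumption as earlier:

# Sketch: render a conversation with the Alpaca-style chat_template from tokenizer_config.json.
# The repo id is assumed; point it at the repository that actually holds these files.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("XeTute/Phantasor_V0.2-137M")
messages = [{"role": "user", "content": "Write a short story about a lighthouse keeper."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)  # system preamble, then "### Instruction:\n...\n\n### Response:\n"
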
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 12.0,
+   "num_input_tokens_seen": 620593344,
+   "total_flos": 3.16711124803584e+17,
+   "train_loss": 0.07982336108915004,
+   "train_runtime": 2874.0318,
+   "train_samples_per_second": 212.069,
+   "train_steps_per_second": 212.069
+ }
trainer_log.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff9a7d93fc153a1403128fcff75f60e26e16253da1fa185ac9859426dec960b5
+ size 5688
training_args.yaml ADDED
@@ -0,0 +1,29 @@
+ bf16: true
+ cutoff_len: 1024
+ dataset: XeTute/SStory-Gen-EN_ZH,MatanP/emotion_mapped_story_dataset,webnovel,jaydenccc/AI_Storyteller_Dataset
+ dataset_dir: data
+ ddp_timeout: 180000000
+ do_train: true
+ finetuning_type: full
+ flash_attn: auto
+ gradient_accumulation_steps: 1
+ include_num_input_tokens_seen: true
+ learning_rate: 1.0e-06
+ logging_steps: 100
+ lr_scheduler_type: cosine
+ max_grad_norm: 1.0
+ max_samples: 1000000000
+ model_name_or_path: XeTute/Phantasor_V0.2-137M
+ num_train_epochs: 12.0
+ optim: sgd
+ output_dir: saves\GPT-2-Small\full\10-02-2025
+ packing: false
+ per_device_train_batch_size: 1
+ plot_loss: true
+ preprocessing_num_workers: 16
+ report_to: none
+ save_steps: 5000
+ stage: sft
+ template: alpaca
+ trust_remote_code: true
+ warmup_steps: 10
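
training_args.yaml captures the effective LLaMA-Factory run: full-parameter SFT with plain SGD, learning rate 1e-6 on a cosine schedule, bf16, batch size 1 with no gradient accumulation, and a 1,024-token cutoff. A small arithmetic check, taking the 50,791-example count from running_log.txt above, reproduces the step total reported there:

# Reproduce the step count in running_log.txt from the hyperparameters in training_args.yaml.
num_examples = 50_791   # "Num examples = 50,791" in running_log.txt
epochs = 12             # num_train_epochs
effective_batch = 1     # per_device_train_batch_size * gradient_accumulation_steps
print(num_examples * epochs // effective_batch)  # 609492, matching "Total optimization steps = 609,492"
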
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff