Upload 17 files
- all_results.json +9 -0
- config.json +39 -0
- generation_config.json +6 -0
- llamaboard_config.yaml +80 -0
- merges.txt +0 -0
- model.safetensors +3 -0
- running_log.txt +803 -0
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer_config.json +24 -0
- train_results.json +9 -0
- trainer_log.jsonl +0 -0
- trainer_state.json +0 -0
- training_args.bin +3 -0
- training_args.yaml +29 -0
- training_loss.png +0 -0
- vocab.json +0 -0
all_results.json
ADDED
@@ -0,0 +1,9 @@
{
    "epoch": 12.0,
    "num_input_tokens_seen": 620593344,
    "total_flos": 3.16711124803584e+17,
    "train_loss": 0.07982336108915004,
    "train_runtime": 2874.0318,
    "train_samples_per_second": 212.069,
    "train_steps_per_second": 212.069
}
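A quick sketch of reading these metrics back in Python; note that train_samples_per_second equals train_steps_per_second because the run used batch size 1 (see llamaboard_config.yaml below). It assumes all_results.json sits in the working directory, as in this upload.

import json

# Sketch: read the aggregate training metrics back.
with open("all_results.json") as f:
    results = json.load(f)

tokens_per_sec = results["num_input_tokens_seen"] / results["train_runtime"]
print(f"epoch {results['epoch']}, mean loss {results['train_loss']:.4f}")
print(f"~{tokens_per_sec:,.0f} input tokens/sec over the recorded runtime")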
config.json
ADDED
@@ -0,0 +1,39 @@
{
  "_name_or_path": "XeTute/Phantasor_V0.2-137M",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.48.2",
  "use_cache": false,
  "vocab_size": 50257
}
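This is a standard 12-layer, 12-head, 768-dim GPT-2 with a 50257-token vocabulary. A short sketch of loading it back with transformers; the repo id is the one recorded in "_name_or_path" (swap in a local path if working from this commit).

from transformers import AutoConfig, AutoModelForCausalLM

# Sketch: reload the uploaded checkpoint and sanity-check its shape.
config = AutoConfig.from_pretrained("XeTute/Phantasor_V0.2-137M")
print(config.model_type, config.n_layer, config.n_head, config.n_embd)

model = AutoModelForCausalLM.from_pretrained("XeTute/Phantasor_V0.2-137M")
print(f"{model.num_parameters():,} parameters")  # matches the log's 124,439,808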
generation_config.json
ADDED
@@ -0,0 +1,6 @@
{
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.48.2"
}
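Since generation_config.json only pins the BOS/EOS ids, sampling behaviour comes from config.json's task_specific_params. A minimal generation sketch using the transformers pipeline, with the repo id taken from the training log below:

from transformers import pipeline

# Sketch: generate with the defaults baked into task_specific_params
# (do_sample=True, max_length=50).
generator = pipeline("text-generation", model="XeTute/Phantasor_V0.2-137M")
print(generator("Once upon a time", do_sample=True, max_length=50)[0]["generated_text"])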
llamaboard_config.yaml
ADDED
@@ -0,0 +1,80 @@
top.booster: auto
top.checkpoint_path: null
top.finetuning_type: full
top.model_name: GPT-2-Small
top.quantization_bit: none
top.quantization_method: bitsandbytes
top.rope_scaling: none
top.template: alpaca
train.additional_target: ''
train.apollo_rank: 16
train.apollo_scale: 32
train.apollo_target: all
train.apollo_update_interval: 200
train.badam_mode: layer
train.badam_switch_interval: 50
train.badam_switch_mode: ascending
train.badam_update_ratio: 0.05
train.batch_size: 1
train.compute_type: bf16
train.create_new_adapter: false
train.cutoff_len: 1024
train.dataset:
- XeTute/SStory-Gen-EN_ZH
- MatanP/emotion_mapped_story_dataset
- webnovel
- jaydenccc/AI_Storyteller_Dataset
train.dataset_dir: data
train.ds_offload: false
train.ds_stage: none
train.extra_args: '{"optim": "sgd"}'
train.freeze_extra_modules: ''
train.freeze_trainable_layers: 2
train.freeze_trainable_modules: all
train.galore_rank: 16
train.galore_scale: 2
train.galore_target: all
train.galore_update_interval: 200
train.gradient_accumulation_steps: 1
train.learning_rate: 1e-6
train.logging_steps: 100
train.lora_alpha: 16
train.lora_dropout: 0
train.lora_rank: 8
train.lora_target: ''
train.loraplus_lr_ratio: 0
train.lr_scheduler_type: cosine
train.mask_history: false
train.max_grad_norm: '1.0'
train.max_samples: '1000000000'
train.neat_packing: false
train.neftune_alpha: 0
train.num_train_epochs: '12.0'
train.packing: false
train.ppo_score_norm: false
train.ppo_whiten_rewards: false
train.pref_beta: 0.1
train.pref_ftx: 0
train.pref_loss: sigmoid
train.report_to:
- none
train.resize_vocab: false
train.reward_model: []
train.save_steps: 5000
train.swanlab_api_key: ''
train.swanlab_mode: cloud
train.swanlab_project: llamafactory
train.swanlab_run_name: ''
train.swanlab_workspace: ''
train.train_on_prompt: false
train.training_stage: Supervised Fine-Tuning
train.use_apollo: false
train.use_badam: false
train.use_dora: false
train.use_galore: false
train.use_llama_pro: false
train.use_pissa: false
train.use_rslora: false
train.use_swanlab: false
train.val_size: 0
train.warmup_steps: 10
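This file is the saved state of the LLaMA Board web UI rather than a config LLaMA-Factory consumes directly (the resolved arguments live in training_args.yaml from the same upload). A minimal auditing sketch, assuming only the flat dotted-key layout shown above; requires PyYAML.

import yaml  # PyYAML

# Sketch: the UI export is a flat mapping whose keys are literally
# "top.*" (model panel) and "train.*" (training panel).
with open("llamaboard_config.yaml") as f:
    board = yaml.safe_load(f)

print(board["top.model_name"], board["top.finetuning_type"])  # GPT-2-Small full
print(board["train.dataset"])      # the four story datasets listed above
print(board["train.extra_args"])   # '{"optim": "sgd"}' -> plain SGD at lr 1e-6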
merges.txt
ADDED
The diff for this file is too large to render.
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:002933714d638e62f3a7c2796c1075d9aaf99119481284246c9012400fc8eee7
size 497774208
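The three lines above are a Git LFS pointer: the repository stores only the hash and size, and the ~498 MB weights file is fetched separately on checkout. Once the real file is present locally, a sketch like this (using the safetensors package) lists tensors without loading them all into memory:

from safetensors import safe_open

# Sketch: lazily inspect the checkpoint. framework="pt" maps tensors to torch.
with safe_open("model.safetensors", framework="pt") as f:
    for name in sorted(f.keys())[:5]:
        print(name, f.get_slice(name).get_shape())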
running_log.txt
ADDED
@@ -0,0 +1,803 @@
[INFO|2025-02-11 17:41:48] configuration_utils.py:696 >> loading configuration file config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\config.json
[INFO|2025-02-11 17:41:48] configuration_utils.py:768 >> Model config GPT2Config {
  "_name_or_path": "XeTute/Phantasor_V0.2-137M",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.48.2",
  "use_cache": false,
  "vocab_size": 50257
}
[INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file vocab.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\vocab.json
[INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file merges.txt from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\merges.txt
[INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer.json
[INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
[INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\special_tokens_map.json
[INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer_config.json
[INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
[INFO|2025-02-11 17:41:50] configuration_utils.py:696 >> loading configuration file config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\config.json
[INFO|2025-02-11 17:41:50] configuration_utils.py:768 >> Model config GPT2Config {
  "_name_or_path": "XeTute/Phantasor_V0.2-137M",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.48.2",
  "use_cache": false,
  "vocab_size": 50257
}
[INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file vocab.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\vocab.json
[INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file merges.txt from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\merges.txt
[INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer.json
[INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
[INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\special_tokens_map.json
[INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer_config.json
[INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
[INFO|2025-02-11 17:41:50] logging.py:157 >> Loading dataset XeTute/SStory-Gen-EN_ZH...
[INFO|2025-02-11 17:41:59] logging.py:157 >> Loading dataset MatanP/emotion_mapped_story_dataset...
[INFO|2025-02-11 17:42:03] logging.py:157 >> Loading dataset zxbsmk/webnovel_cn...
[INFO|2025-02-11 17:42:10] logging.py:157 >> Loading dataset jaydenccc/AI_Storyteller_Dataset...
[INFO|2025-02-11 17:42:13] configuration_utils.py:696 >> loading configuration file config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\config.json
[INFO|2025-02-11 17:42:13] configuration_utils.py:768 >> Model config GPT2Config {
  "_name_or_path": "XeTute/Phantasor_V0.2-137M",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.48.2",
  "use_cache": false,
  "vocab_size": 50257
}
[INFO|2025-02-11 17:42:13] modeling_utils.py:3904 >> loading weights file model.safetensors from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\model.safetensors
[INFO|2025-02-11 17:42:13] modeling_utils.py:1582 >> Instantiating GPT2LMHeadModel model under default dtype torch.bfloat16.
[INFO|2025-02-11 17:42:13] configuration_utils.py:1140 >> Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "use_cache": false
}
[INFO|2025-02-11 17:42:14] modeling_utils.py:4888 >> All model checkpoint weights were used when initializing GPT2LMHeadModel.
[INFO|2025-02-11 17:42:14] modeling_utils.py:4896 >> All the weights of GPT2LMHeadModel were initialized from the model checkpoint at XeTute/Phantasor_V0.2-137M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.
[INFO|2025-02-11 17:42:14] configuration_utils.py:1095 >> loading configuration file generation_config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\generation_config.json
[INFO|2025-02-11 17:42:14] configuration_utils.py:1140 >> Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}
[INFO|2025-02-11 17:42:14] logging.py:157 >> Gradient checkpointing enabled.
[INFO|2025-02-11 17:42:14] logging.py:157 >> Using torch SDPA for faster training and inference.
[INFO|2025-02-11 17:42:14] logging.py:157 >> Upcasting trainable params to float32.
[INFO|2025-02-11 17:42:14] logging.py:157 >> Fine-tuning method: Full
[INFO|2025-02-11 17:42:14] logging.py:157 >> trainable params: 124,439,808 || all params: 124,439,808 || trainable%: 100.0000
[INFO|2025-02-11 17:42:14] trainer.py:741 >> Using auto half precision backend
[INFO|2025-02-11 17:42:14] trainer.py:2775 >> Loading model from saves\GPT-2-Small\full\10-02-2025\checkpoint-585000.
[WARNING|2025-02-11 17:42:14] trainer.py:3018 >> There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
[INFO|2025-02-11 17:42:14] trainer.py:2369 >> ***** Running training *****
[INFO|2025-02-11 17:42:14] trainer.py:2370 >> Num examples = 50,791
[INFO|2025-02-11 17:42:14] trainer.py:2371 >> Num Epochs = 12
[INFO|2025-02-11 17:42:14] trainer.py:2372 >> Instantaneous batch size per device = 1
[INFO|2025-02-11 17:42:14] trainer.py:2375 >> Total train batch size (w. parallel, distributed & accumulation) = 1
[INFO|2025-02-11 17:42:14] trainer.py:2376 >> Gradient Accumulation steps = 1
[INFO|2025-02-11 17:42:14] trainer.py:2377 >> Total optimization steps = 609,492
[INFO|2025-02-11 17:42:14] trainer.py:2378 >> Number of trainable parameters = 124,439,808
[INFO|2025-02-11 17:42:14] trainer.py:2400 >> Continuing training from checkpoint, will skip to saved global_step
[INFO|2025-02-11 17:42:14] trainer.py:2401 >> Continuing training from epoch 11
[INFO|2025-02-11 17:42:14] trainer.py:2402 >> Continuing training from global step 585000
[INFO|2025-02-11 17:42:14] trainer.py:2404 >> Will skip the first 11 epochs then the first 26299 batches in the first epoch.
[INFO|2025-02-11 17:42:26] logging.py:157 >> {'loss': 1.9812, 'learning_rate': 3.9468e-09, 'epoch': 11.52, 'throughput': 50003805.20}
[INFO|2025-02-11 17:42:38] logging.py:157 >> {'loss': 2.0162, 'learning_rate': 3.9145e-09, 'epoch': 11.52, 'throughput': 25202148.23}
[INFO|2025-02-11 17:42:50] logging.py:157 >> {'loss': 1.9694, 'learning_rate': 3.8824e-09, 'epoch': 11.52, 'throughput': 16752405.16}
[INFO|2025-02-11 17:43:02] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 3.8504e-09, 'epoch': 11.53, 'throughput': 12548387.20}
[INFO|2025-02-11 17:43:14] logging.py:157 >> {'loss': 1.9634, 'learning_rate': 3.8185e-09, 'epoch': 11.53, 'throughput': 10008058.95}
[INFO|2025-02-11 17:43:26] logging.py:157 >> {'loss': 2.0249, 'learning_rate': 3.7868e-09, 'epoch': 11.53, 'throughput': 8363752.91}
[INFO|2025-02-11 17:43:38] logging.py:157 >> {'loss': 1.9759, 'learning_rate': 3.7552e-09, 'epoch': 11.53, 'throughput': 7170473.74}
[INFO|2025-02-11 17:43:49] logging.py:157 >> {'loss': 2.0019, 'learning_rate': 3.7238e-09, 'epoch': 11.53, 'throughput': 6291290.77}
[INFO|2025-02-11 17:44:01] logging.py:157 >> {'loss': 1.9800, 'learning_rate': 3.6924e-09, 'epoch': 11.54, 'throughput': 5603383.58}
[INFO|2025-02-11 17:44:12] logging.py:157 >> {'loss': 1.9993, 'learning_rate': 3.6612e-09, 'epoch': 11.54, 'throughput': 5058860.90}
[INFO|2025-02-11 17:44:24] logging.py:157 >> {'loss': 1.9438, 'learning_rate': 3.6302e-09, 'epoch': 11.54, 'throughput': 4612411.90}
[INFO|2025-02-11 17:44:35] logging.py:157 >> {'loss': 2.0000, 'learning_rate': 3.5992e-09, 'epoch': 11.54, 'throughput': 4237182.77}
[INFO|2025-02-11 17:44:47] logging.py:157 >> {'loss': 2.0493, 'learning_rate': 3.5684e-09, 'epoch': 11.54, 'throughput': 3919609.73}
[INFO|2025-02-11 17:44:58] logging.py:157 >> {'loss': 1.9846, 'learning_rate': 3.5378e-09, 'epoch': 11.55, 'throughput': 3644083.96}
[INFO|2025-02-11 17:45:10] logging.py:157 >> {'loss': 1.9597, 'learning_rate': 3.5072e-09, 'epoch': 11.55, 'throughput': 3405057.81}
[INFO|2025-02-11 17:45:21] logging.py:157 >> {'loss': 2.0429, 'learning_rate': 3.4768e-09, 'epoch': 11.55, 'throughput': 3196451.63}
[INFO|2025-02-11 17:45:33] logging.py:157 >> {'loss': 1.9481, 'learning_rate': 3.4465e-09, 'epoch': 11.55, 'throughput': 3011092.35}
[INFO|2025-02-11 17:45:45] logging.py:157 >> {'loss': 1.9765, 'learning_rate': 3.4164e-09, 'epoch': 11.55, 'throughput': 2844004.86}
[INFO|2025-02-11 17:45:56] logging.py:157 >> {'loss': 2.0500, 'learning_rate': 3.3864e-09, 'epoch': 11.56, 'throughput': 2695596.64}
[INFO|2025-02-11 17:46:08] logging.py:157 >> {'loss': 2.0157, 'learning_rate': 3.3565e-09, 'epoch': 11.56, 'throughput': 2563562.80}
[INFO|2025-02-11 17:46:19] logging.py:157 >> {'loss': 2.0172, 'learning_rate': 3.3268e-09, 'epoch': 11.56, 'throughput': 2443655.33}
[INFO|2025-02-11 17:46:31] logging.py:157 >> {'loss': 1.9731, 'learning_rate': 3.2971e-09, 'epoch': 11.56, 'throughput': 2330850.00}
[INFO|2025-02-11 17:46:43] logging.py:157 >> {'loss': 2.0210, 'learning_rate': 3.2677e-09, 'epoch': 11.56, 'throughput': 2228412.35}
[INFO|2025-02-11 17:46:55] logging.py:157 >> {'loss': 1.9729, 'learning_rate': 3.2383e-09, 'epoch': 11.57, 'throughput': 2135025.81}
[INFO|2025-02-11 17:47:06] logging.py:157 >> {'loss': 2.0153, 'learning_rate': 3.2091e-09, 'epoch': 11.57, 'throughput': 2049017.68}
[INFO|2025-02-11 17:47:18] logging.py:157 >> {'loss': 1.9696, 'learning_rate': 3.1800e-09, 'epoch': 11.57, 'throughput': 1968648.47}
[INFO|2025-02-11 17:47:30] logging.py:157 >> {'loss': 1.9419, 'learning_rate': 3.1511e-09, 'epoch': 11.57, 'throughput': 1895060.01}
[INFO|2025-02-11 17:47:42] logging.py:157 >> {'loss': 2.0142, 'learning_rate': 3.1222e-09, 'epoch': 11.57, 'throughput': 1827059.79}
[INFO|2025-02-11 17:47:54] logging.py:157 >> {'loss': 2.0028, 'learning_rate': 3.0935e-09, 'epoch': 11.57, 'throughput': 1763616.14}
[INFO|2025-02-11 17:48:06] logging.py:157 >> {'loss': 1.9800, 'learning_rate': 3.0650e-09, 'epoch': 11.58, 'throughput': 1704997.68}
[INFO|2025-02-11 17:48:17] logging.py:157 >> {'loss': 2.0216, 'learning_rate': 3.0366e-09, 'epoch': 11.58, 'throughput': 1650129.04}
[INFO|2025-02-11 17:48:29] logging.py:157 >> {'loss': 1.9755, 'learning_rate': 3.0083e-09, 'epoch': 11.58, 'throughput': 1598132.07}
[INFO|2025-02-11 17:48:41] logging.py:157 >> {'loss': 2.0083, 'learning_rate': 2.9801e-09, 'epoch': 11.58, 'throughput': 1550207.97}
[INFO|2025-02-11 17:48:53] logging.py:157 >> {'loss': 2.0161, 'learning_rate': 2.9521e-09, 'epoch': 11.58, 'throughput': 1504244.14}
[INFO|2025-02-11 17:49:04] logging.py:157 >> {'loss': 2.0157, 'learning_rate': 2.9242e-09, 'epoch': 11.59, 'throughput': 1461629.99}
[INFO|2025-02-11 17:49:16] logging.py:157 >> {'loss': 1.9792, 'learning_rate': 2.8964e-09, 'epoch': 11.59, 'throughput': 1421289.44}
[INFO|2025-02-11 17:49:28] logging.py:157 >> {'loss': 1.9482, 'learning_rate': 2.8688e-09, 'epoch': 11.59, 'throughput': 1382339.41}
[INFO|2025-02-11 17:49:40] logging.py:157 >> {'loss': 2.0295, 'learning_rate': 2.8413e-09, 'epoch': 11.59, 'throughput': 1346070.98}
[INFO|2025-02-11 17:49:52] logging.py:157 >> {'loss': 1.9778, 'learning_rate': 2.8139e-09, 'epoch': 11.59, 'throughput': 1311592.72}
[INFO|2025-02-11 17:50:03] logging.py:157 >> {'loss': 2.0103, 'learning_rate': 2.7867e-09, 'epoch': 11.60, 'throughput': 1279350.97}
[INFO|2025-02-11 17:50:15] logging.py:157 >> {'loss': 1.9449, 'learning_rate': 2.7595e-09, 'epoch': 11.60, 'throughput': 1248312.09}
[INFO|2025-02-11 17:50:27] logging.py:157 >> {'loss': 2.0110, 'learning_rate': 2.7326e-09, 'epoch': 11.60, 'throughput': 1218768.16}
[INFO|2025-02-11 17:50:39] logging.py:157 >> {'loss': 1.9304, 'learning_rate': 2.7057e-09, 'epoch': 11.60, 'throughput': 1190297.15}
[INFO|2025-02-11 17:50:51] logging.py:157 >> {'loss': 1.9709, 'learning_rate': 2.6790e-09, 'epoch': 11.60, 'throughput': 1162183.03}
[INFO|2025-02-11 17:51:03] logging.py:157 >> {'loss': 1.9486, 'learning_rate': 2.6524e-09, 'epoch': 11.61, 'throughput': 1136347.30}
[INFO|2025-02-11 17:51:14] logging.py:157 >> {'loss': 1.9941, 'learning_rate': 2.6260e-09, 'epoch': 11.61, 'throughput': 1111856.81}
[INFO|2025-02-11 17:51:26] logging.py:157 >> {'loss': 1.9774, 'learning_rate': 2.5997e-09, 'epoch': 11.61, 'throughput': 1088182.32}
[INFO|2025-02-11 17:51:38] logging.py:157 >> {'loss': 1.9954, 'learning_rate': 2.5735e-09, 'epoch': 11.61, 'throughput': 1065718.96}
[INFO|2025-02-11 17:51:50] logging.py:157 >> {'loss': 2.0438, 'learning_rate': 2.5475e-09, 'epoch': 11.61, 'throughput': 1044277.87}
[INFO|2025-02-11 17:52:01] logging.py:157 >> {'loss': 2.0343, 'learning_rate': 2.5215e-09, 'epoch': 11.62, 'throughput': 1023489.66}
[INFO|2025-02-11 17:52:01] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-590000
[INFO|2025-02-11 17:52:01] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\config.json
[INFO|2025-02-11 17:52:01] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\generation_config.json
[INFO|2025-02-11 17:52:02] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\model.safetensors
[INFO|2025-02-11 17:52:02] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\tokenizer_config.json
[INFO|2025-02-11 17:52:02] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\special_tokens_map.json
[INFO|2025-02-11 17:52:14] logging.py:157 >> {'loss': 2.0148, 'learning_rate': 2.4958e-09, 'epoch': 11.62, 'throughput': 1002469.52}
[INFO|2025-02-11 17:52:26] logging.py:157 >> {'loss': 1.9398, 'learning_rate': 2.4701e-09, 'epoch': 11.62, 'throughput': 983195.98}
[INFO|2025-02-11 17:52:37] logging.py:157 >> {'loss': 2.0115, 'learning_rate': 2.4446e-09, 'epoch': 11.62, 'throughput': 964942.63}
[INFO|2025-02-11 17:52:49] logging.py:157 >> {'loss': 1.9977, 'learning_rate': 2.4192e-09, 'epoch': 11.62, 'throughput': 946799.71}
[INFO|2025-02-11 17:53:01] logging.py:157 >> {'loss': 1.9790, 'learning_rate': 2.3939e-09, 'epoch': 11.63, 'throughput': 929612.66}
[INFO|2025-02-11 17:53:13] logging.py:157 >> {'loss': 2.0179, 'learning_rate': 2.3688e-09, 'epoch': 11.63, 'throughput': 912840.69}
[INFO|2025-02-11 17:53:25] logging.py:157 >> {'loss': 1.9422, 'learning_rate': 2.3438e-09, 'epoch': 11.63, 'throughput': 897250.33}
[INFO|2025-02-11 17:53:36] logging.py:157 >> {'loss': 1.9710, 'learning_rate': 2.3190e-09, 'epoch': 11.63, 'throughput': 882243.27}
[INFO|2025-02-11 17:53:48] logging.py:157 >> {'loss': 1.9900, 'learning_rate': 2.2942e-09, 'epoch': 11.63, 'throughput': 867690.03}
[INFO|2025-02-11 17:53:59] logging.py:157 >> {'loss': 2.0106, 'learning_rate': 2.2696e-09, 'epoch': 11.64, 'throughput': 853674.20}
[INFO|2025-02-11 17:54:11] logging.py:157 >> {'loss': 1.9897, 'learning_rate': 2.2452e-09, 'epoch': 11.64, 'throughput': 840110.77}
[INFO|2025-02-11 17:54:22] logging.py:157 >> {'loss': 2.0037, 'learning_rate': 2.2208e-09, 'epoch': 11.64, 'throughput': 826914.64}
[INFO|2025-02-11 17:54:34] logging.py:157 >> {'loss': 1.9609, 'learning_rate': 2.1966e-09, 'epoch': 11.64, 'throughput': 814139.50}
[INFO|2025-02-11 17:54:45] logging.py:157 >> {'loss': 2.0072, 'learning_rate': 2.1726e-09, 'epoch': 11.64, 'throughput': 801848.40}
[INFO|2025-02-11 17:54:57] logging.py:157 >> {'loss': 1.9598, 'learning_rate': 2.1486e-09, 'epoch': 11.65, 'throughput': 789459.13}
[INFO|2025-02-11 17:55:09] logging.py:157 >> {'loss': 1.9845, 'learning_rate': 2.1248e-09, 'epoch': 11.65, 'throughput': 777598.46}
[INFO|2025-02-11 17:55:21] logging.py:157 >> {'loss': 1.9423, 'learning_rate': 2.1012e-09, 'epoch': 11.65, 'throughput': 766045.00}
[INFO|2025-02-11 17:55:33] logging.py:157 >> {'loss': 1.9774, 'learning_rate': 2.0776e-09, 'epoch': 11.65, 'throughput': 754823.16}
[INFO|2025-02-11 17:55:45] logging.py:157 >> {'loss': 2.0093, 'learning_rate': 2.0542e-09, 'epoch': 11.65, 'throughput': 743976.59}
[INFO|2025-02-11 17:55:56] logging.py:157 >> {'loss': 1.9717, 'learning_rate': 2.0310e-09, 'epoch': 11.66, 'throughput': 733371.47}
[INFO|2025-02-11 17:56:08] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 2.0078e-09, 'epoch': 11.66, 'throughput': 723085.26}
[INFO|2025-02-11 17:56:20] logging.py:157 >> {'loss': 1.9593, 'learning_rate': 1.9848e-09, 'epoch': 11.66, 'throughput': 713131.55}
[INFO|2025-02-11 17:56:32] logging.py:157 >> {'loss': 1.9501, 'learning_rate': 1.9619e-09, 'epoch': 11.66, 'throughput': 703409.15}
[INFO|2025-02-11 17:56:44] logging.py:157 >> {'loss': 1.9638, 'learning_rate': 1.9392e-09, 'epoch': 11.66, 'throughput': 693950.69}
[INFO|2025-02-11 17:56:55] logging.py:157 >> {'loss': 1.9802, 'learning_rate': 1.9166e-09, 'epoch': 11.67, 'throughput': 684783.60}
[INFO|2025-02-11 17:57:08] logging.py:157 >> {'loss': 1.9731, 'learning_rate': 1.8941e-09, 'epoch': 11.67, 'throughput': 675622.49}
[INFO|2025-02-11 17:57:20] logging.py:157 >> {'loss': 1.9767, 'learning_rate': 1.8718e-09, 'epoch': 11.67, 'throughput': 666580.81}
[INFO|2025-02-11 17:57:32] logging.py:157 >> {'loss': 1.9927, 'learning_rate': 1.8496e-09, 'epoch': 11.67, 'throughput': 658156.99}
[INFO|2025-02-11 17:57:43] logging.py:157 >> {'loss': 1.9931, 'learning_rate': 1.8275e-09, 'epoch': 11.67, 'throughput': 649934.04}
[INFO|2025-02-11 17:57:55] logging.py:157 >> {'loss': 1.9789, 'learning_rate': 1.8055e-09, 'epoch': 11.68, 'throughput': 641903.03}
[INFO|2025-02-11 17:58:07] logging.py:157 >> {'loss': 2.0209, 'learning_rate': 1.7837e-09, 'epoch': 11.68, 'throughput': 634098.47}
[INFO|2025-02-11 17:58:19] logging.py:157 >> {'loss': 1.9245, 'learning_rate': 1.7620e-09, 'epoch': 11.68, 'throughput': 626445.98}
[INFO|2025-02-11 17:58:30] logging.py:157 >> {'loss': 2.0252, 'learning_rate': 1.7405e-09, 'epoch': 11.68, 'throughput': 618981.54}
[INFO|2025-02-11 17:58:42] logging.py:157 >> {'loss': 2.0151, 'learning_rate': 1.7191e-09, 'epoch': 11.68, 'throughput': 611668.79}
[INFO|2025-02-11 17:58:54] logging.py:157 >> {'loss': 1.9501, 'learning_rate': 1.6978e-09, 'epoch': 11.69, 'throughput': 604546.71}
[INFO|2025-02-11 17:59:06] logging.py:157 >> {'loss': 2.0443, 'learning_rate': 1.6766e-09, 'epoch': 11.69, 'throughput': 597713.96}
[INFO|2025-02-11 17:59:17] logging.py:157 >> {'loss': 1.9772, 'learning_rate': 1.6556e-09, 'epoch': 11.69, 'throughput': 591071.07}
[INFO|2025-02-11 17:59:29] logging.py:157 >> {'loss': 1.9930, 'learning_rate': 1.6347e-09, 'epoch': 11.69, 'throughput': 584451.24}
[INFO|2025-02-11 17:59:41] logging.py:157 >> {'loss': 2.0026, 'learning_rate': 1.6139e-09, 'epoch': 11.69, 'throughput': 577957.44}
[INFO|2025-02-11 17:59:52] logging.py:157 >> {'loss': 2.0245, 'learning_rate': 1.5933e-09, 'epoch': 11.69, 'throughput': 571700.05}
[INFO|2025-02-11 18:00:04] logging.py:157 >> {'loss': 2.0352, 'learning_rate': 1.5728e-09, 'epoch': 11.70, 'throughput': 565427.48}
[INFO|2025-02-11 18:00:16] logging.py:157 >> {'loss': 1.9875, 'learning_rate': 1.5525e-09, 'epoch': 11.70, 'throughput': 559389.83}
[INFO|2025-02-11 18:00:28] logging.py:157 >> {'loss': 1.9973, 'learning_rate': 1.5322e-09, 'epoch': 11.70, 'throughput': 553408.92}
[INFO|2025-02-11 18:00:40] logging.py:157 >> {'loss': 2.0091, 'learning_rate': 1.5121e-09, 'epoch': 11.70, 'throughput': 547663.70}
[INFO|2025-02-11 18:00:51] logging.py:157 >> {'loss': 2.0294, 'learning_rate': 1.4922e-09, 'epoch': 11.70, 'throughput': 542034.94}
[INFO|2025-02-11 18:01:03] logging.py:157 >> {'loss': 1.9653, 'learning_rate': 1.4723e-09, 'epoch': 11.71, 'throughput': 536571.61}
[INFO|2025-02-11 18:01:14] logging.py:157 >> {'loss': 1.9635, 'learning_rate': 1.4527e-09, 'epoch': 11.71, 'throughput': 531161.51}
[INFO|2025-02-11 18:01:26] logging.py:157 >> {'loss': 1.9859, 'learning_rate': 1.4331e-09, 'epoch': 11.71, 'throughput': 525822.29}
[INFO|2025-02-11 18:01:38] logging.py:157 >> {'loss': 1.9933, 'learning_rate': 1.4137e-09, 'epoch': 11.71, 'throughput': 520580.98}
[INFO|2025-02-11 18:01:50] logging.py:157 >> {'loss': 1.9638, 'learning_rate': 1.3944e-09, 'epoch': 11.71, 'throughput': 515438.63}
[INFO|2025-02-11 18:01:50] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-595000
[INFO|2025-02-11 18:01:50] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\config.json
[INFO|2025-02-11 18:01:50] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\generation_config.json
[INFO|2025-02-11 18:01:50] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\model.safetensors
[INFO|2025-02-11 18:01:50] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\tokenizer_config.json
[INFO|2025-02-11 18:01:50] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\special_tokens_map.json
[INFO|2025-02-11 18:02:02] logging.py:157 >> {'loss': 2.0220, 'learning_rate': 1.3752e-09, 'epoch': 11.72, 'throughput': 510131.56}
[INFO|2025-02-11 18:02:14] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 1.3561e-09, 'epoch': 11.72, 'throughput': 505271.09}
[INFO|2025-02-11 18:02:25] logging.py:157 >> {'loss': 1.9671, 'learning_rate': 1.3372e-09, 'epoch': 11.72, 'throughput': 500534.56}
[INFO|2025-02-11 18:02:37] logging.py:157 >> {'loss': 1.9840, 'learning_rate': 1.3185e-09, 'epoch': 11.72, 'throughput': 495856.27}
[INFO|2025-02-11 18:02:49] logging.py:157 >> {'loss': 1.9937, 'learning_rate': 1.2998e-09, 'epoch': 11.72, 'throughput': 491327.18}
[INFO|2025-02-11 18:03:00] logging.py:157 >> {'loss': 1.9848, 'learning_rate': 1.2813e-09, 'epoch': 11.73, 'throughput': 486779.60}
[INFO|2025-02-11 18:03:12] logging.py:157 >> {'loss': 1.9859, 'learning_rate': 1.2630e-09, 'epoch': 11.73, 'throughput': 482340.06}
[INFO|2025-02-11 18:03:23] logging.py:157 >> {'loss': 2.0021, 'learning_rate': 1.2447e-09, 'epoch': 11.73, 'throughput': 478029.97}
[INFO|2025-02-11 18:03:35] logging.py:157 >> {'loss': 1.9589, 'learning_rate': 1.2266e-09, 'epoch': 11.73, 'throughput': 473763.66}
[INFO|2025-02-11 18:03:47] logging.py:157 >> {'loss': 2.0034, 'learning_rate': 1.2086e-09, 'epoch': 11.73, 'throughput': 469588.35}
[INFO|2025-02-11 18:03:58] logging.py:157 >> {'loss': 1.9454, 'learning_rate': 1.1908e-09, 'epoch': 11.74, 'throughput': 465484.85}
[INFO|2025-02-11 18:04:10] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 1.1731e-09, 'epoch': 11.74, 'throughput': 461443.54}
[INFO|2025-02-11 18:04:22] logging.py:157 >> {'loss': 1.9869, 'learning_rate': 1.1555e-09, 'epoch': 11.74, 'throughput': 457476.90}
[INFO|2025-02-11 18:04:33] logging.py:157 >> {'loss': 1.9897, 'learning_rate': 1.1381e-09, 'epoch': 11.74, 'throughput': 453577.26}
[INFO|2025-02-11 18:04:45] logging.py:157 >> {'loss': 1.9508, 'learning_rate': 1.1207e-09, 'epoch': 11.74, 'throughput': 449756.40}
[INFO|2025-02-11 18:04:57] logging.py:157 >> {'loss': 2.0472, 'learning_rate': 1.1036e-09, 'epoch': 11.75, 'throughput': 445933.85}
[INFO|2025-02-11 18:05:08] logging.py:157 >> {'loss': 2.0183, 'learning_rate': 1.0865e-09, 'epoch': 11.75, 'throughput': 442211.33}
[INFO|2025-02-11 18:05:20] logging.py:157 >> {'loss': 1.9619, 'learning_rate': 1.0696e-09, 'epoch': 11.75, 'throughput': 438547.56}
[INFO|2025-02-11 18:05:32] logging.py:157 >> {'loss': 1.9705, 'learning_rate': 1.0528e-09, 'epoch': 11.75, 'throughput': 434931.99}
[INFO|2025-02-11 18:05:44] logging.py:157 >> {'loss': 1.9639, 'learning_rate': 1.0362e-09, 'epoch': 11.75, 'throughput': 431369.53}
[INFO|2025-02-11 18:05:55] logging.py:157 >> {'loss': 2.0180, 'learning_rate': 1.0197e-09, 'epoch': 11.76, 'throughput': 427895.84}
[INFO|2025-02-11 18:06:07] logging.py:157 >> {'loss': 1.9725, 'learning_rate': 1.0033e-09, 'epoch': 11.76, 'throughput': 424536.52}
[INFO|2025-02-11 18:06:18] logging.py:157 >> {'loss': 1.9720, 'learning_rate': 9.8702e-10, 'epoch': 11.76, 'throughput': 421155.65}
[INFO|2025-02-11 18:06:30] logging.py:157 >> {'loss': 1.9493, 'learning_rate': 9.7090e-10, 'epoch': 11.76, 'throughput': 417872.57}
[INFO|2025-02-11 18:06:42] logging.py:157 >> {'loss': 1.9629, 'learning_rate': 9.5491e-10, 'epoch': 11.76, 'throughput': 414591.42}
[INFO|2025-02-11 18:06:53] logging.py:157 >> {'loss': 2.0116, 'learning_rate': 9.3906e-10, 'epoch': 11.77, 'throughput': 411412.15}
[INFO|2025-02-11 18:07:05] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 9.2333e-10, 'epoch': 11.77, 'throughput': 408307.17}
[INFO|2025-02-11 18:07:16] logging.py:157 >> {'loss': 2.0005, 'learning_rate': 9.0775e-10, 'epoch': 11.77, 'throughput': 405234.40}
[INFO|2025-02-11 18:07:28] logging.py:157 >> {'loss': 2.0201, 'learning_rate': 8.9229e-10, 'epoch': 11.77, 'throughput': 402202.34}
[INFO|2025-02-11 18:07:40] logging.py:157 >> {'loss': 2.0112, 'learning_rate': 8.7696e-10, 'epoch': 11.77, 'throughput': 399205.39}
[INFO|2025-02-11 18:07:51] logging.py:157 >> {'loss': 1.9830, 'learning_rate': 8.6177e-10, 'epoch': 11.78, 'throughput': 396210.92}
[INFO|2025-02-11 18:08:03] logging.py:157 >> {'loss': 1.9361, 'learning_rate': 8.4671e-10, 'epoch': 11.78, 'throughput': 393320.81}
[INFO|2025-02-11 18:08:15] logging.py:157 >> {'loss': 1.9500, 'learning_rate': 8.3179e-10, 'epoch': 11.78, 'throughput': 390469.11}
[INFO|2025-02-11 18:08:26] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 8.1700e-10, 'epoch': 11.78, 'throughput': 387669.28}
[INFO|2025-02-11 18:08:38] logging.py:157 >> {'loss': 1.9561, 'learning_rate': 8.0233e-10, 'epoch': 11.78, 'throughput': 384887.83}
[INFO|2025-02-11 18:08:49] logging.py:157 >> {'loss': 1.9669, 'learning_rate': 7.8781e-10, 'epoch': 11.79, 'throughput': 382152.22}
[INFO|2025-02-11 18:09:01] logging.py:157 >> {'loss': 2.0171, 'learning_rate': 7.7341e-10, 'epoch': 11.79, 'throughput': 379451.83}
[INFO|2025-02-11 18:09:13] logging.py:157 >> {'loss': 1.9596, 'learning_rate': 7.5915e-10, 'epoch': 11.79, 'throughput': 376779.36}
[INFO|2025-02-11 18:09:24] logging.py:157 >> {'loss': 1.9839, 'learning_rate': 7.4502e-10, 'epoch': 11.79, 'throughput': 374121.06}
[INFO|2025-02-11 18:09:36] logging.py:157 >> {'loss': 1.9670, 'learning_rate': 7.3102e-10, 'epoch': 11.79, 'throughput': 371467.09}
[INFO|2025-02-11 18:09:48] logging.py:157 >> {'loss': 1.9590, 'learning_rate': 7.1715e-10, 'epoch': 11.80, 'throughput': 368908.16}
[INFO|2025-02-11 18:10:00] logging.py:157 >> {'loss': 1.9685, 'learning_rate': 7.0342e-10, 'epoch': 11.80, 'throughput': 366412.85}
[INFO|2025-02-11 18:10:11] logging.py:157 >> {'loss': 1.9602, 'learning_rate': 6.8982e-10, 'epoch': 11.80, 'throughput': 363886.09}
[INFO|2025-02-11 18:10:23] logging.py:157 >> {'loss': 1.9667, 'learning_rate': 6.7635e-10, 'epoch': 11.80, 'throughput': 361409.68}
[INFO|2025-02-11 18:10:35] logging.py:157 >> {'loss': 1.9997, 'learning_rate': 6.6302e-10, 'epoch': 11.80, 'throughput': 359002.79}
[INFO|2025-02-11 18:10:46] logging.py:157 >> {'loss': 1.9776, 'learning_rate': 6.4982e-10, 'epoch': 11.81, 'throughput': 356673.52}
[INFO|2025-02-11 18:10:58] logging.py:157 >> {'loss': 1.9370, 'learning_rate': 6.3675e-10, 'epoch': 11.81, 'throughput': 354316.74}
[INFO|2025-02-11 18:11:10] logging.py:157 >> {'loss': 1.9884, 'learning_rate': 6.2381e-10, 'epoch': 11.81, 'throughput': 351970.04}
[INFO|2025-02-11 18:11:21] logging.py:157 >> {'loss': 1.9506, 'learning_rate': 6.1101e-10, 'epoch': 11.81, 'throughput': 349703.67}
[INFO|2025-02-11 18:11:33] logging.py:157 >> {'loss': 1.9925, 'learning_rate': 5.9834e-10, 'epoch': 11.81, 'throughput': 347455.50}
[INFO|2025-02-11 18:11:33] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-600000
[INFO|2025-02-11 18:11:33] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\config.json
[INFO|2025-02-11 18:11:33] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\generation_config.json
[INFO|2025-02-11 18:11:33] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\model.safetensors
[INFO|2025-02-11 18:11:33] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\tokenizer_config.json
[INFO|2025-02-11 18:11:33] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\special_tokens_map.json
[INFO|2025-02-11 18:11:45] logging.py:157 >> {'loss': 2.0176, 'learning_rate': 5.8580e-10, 'epoch': 11.82, 'throughput': 345137.06}
[INFO|2025-02-11 18:11:56] logging.py:157 >> {'loss': 1.9770, 'learning_rate': 5.7339e-10, 'epoch': 11.82, 'throughput': 342956.52}
[INFO|2025-02-11 18:12:08] logging.py:157 >> {'loss': 2.0058, 'learning_rate': 5.6112e-10, 'epoch': 11.82, 'throughput': 340798.55}
[INFO|2025-02-11 18:12:20] logging.py:157 >> {'loss': 1.9480, 'learning_rate': 5.4898e-10, 'epoch': 11.82, 'throughput': 338660.27}
[INFO|2025-02-11 18:12:31] logging.py:157 >> {'loss': 1.9882, 'learning_rate': 5.3697e-10, 'epoch': 11.82, 'throughput': 336520.05}
[INFO|2025-02-11 18:12:44] logging.py:157 >> {'loss': 1.9543, 'learning_rate': 5.2510e-10, 'epoch': 11.82, 'throughput': 334342.07}
[INFO|2025-02-11 18:12:55] logging.py:157 >> {'loss': 2.0162, 'learning_rate': 5.1336e-10, 'epoch': 11.83, 'throughput': 332223.27}
[INFO|2025-02-11 18:13:07] logging.py:157 >> {'loss': 1.9700, 'learning_rate': 5.0175e-10, 'epoch': 11.83, 'throughput': 330148.95}
[INFO|2025-02-11 18:13:19] logging.py:157 >> {'loss': 1.9556, 'learning_rate': 4.9027e-10, 'epoch': 11.83, 'throughput': 328120.14}
[INFO|2025-02-11 18:13:31] logging.py:157 >> {'loss': 1.9164, 'learning_rate': 4.7893e-10, 'epoch': 11.83, 'throughput': 326103.62}
[INFO|2025-02-11 18:13:43] logging.py:157 >> {'loss': 2.0214, 'learning_rate': 4.6771e-10, 'epoch': 11.83, 'throughput': 324137.85}
[INFO|2025-02-11 18:13:54] logging.py:157 >> {'loss': 1.9946, 'learning_rate': 4.5664e-10, 'epoch': 11.84, 'throughput': 322178.13}
[INFO|2025-02-11 18:14:06] logging.py:157 >> {'loss': 1.9869, 'learning_rate': 4.4569e-10, 'epoch': 11.84, 'throughput': 320240.19}
[INFO|2025-02-11 18:14:18] logging.py:157 >> {'loss': 1.9422, 'learning_rate': 4.3488e-10, 'epoch': 11.84, 'throughput': 318336.19}
[INFO|2025-02-11 18:14:30] logging.py:157 >> {'loss': 2.0274, 'learning_rate': 4.2420e-10, 'epoch': 11.84, 'throughput': 316426.29}
[INFO|2025-02-11 18:14:42] logging.py:157 >> {'loss': 1.9705, 'learning_rate': 4.1365e-10, 'epoch': 11.84, 'throughput': 314563.35}
[INFO|2025-02-11 18:14:53] logging.py:157 >> {'loss': 2.0292, 'learning_rate': 4.0323e-10, 'epoch': 11.85, 'throughput': 312747.74}
[INFO|2025-02-11 18:15:05] logging.py:157 >> {'loss': 1.9715, 'learning_rate': 3.9295e-10, 'epoch': 11.85, 'throughput': 310944.31}
[INFO|2025-02-11 18:15:17] logging.py:157 >> {'loss': 1.9632, 'learning_rate': 3.8280e-10, 'epoch': 11.85, 'throughput': 309151.82}
[INFO|2025-02-11 18:15:29] logging.py:157 >> {'loss': 1.9986, 'learning_rate': 3.7279e-10, 'epoch': 11.85, 'throughput': 307389.38}
[INFO|2025-02-11 18:15:40] logging.py:157 >> {'loss': 1.9875, 'learning_rate': 3.6290e-10, 'epoch': 11.85, 'throughput': 305642.32}
[INFO|2025-02-11 18:15:52] logging.py:157 >> {'loss': 1.9834, 'learning_rate': 3.5315e-10, 'epoch': 11.86, 'throughput': 303915.22}
[INFO|2025-02-11 18:16:04] logging.py:157 >> {'loss': 1.9761, 'learning_rate': 3.4353e-10, 'epoch': 11.86, 'throughput': 302186.17}
[INFO|2025-02-11 18:16:16] logging.py:157 >> {'loss': 1.9958, 'learning_rate': 3.3405e-10, 'epoch': 11.86, 'throughput': 300490.49}
[INFO|2025-02-11 18:16:27] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 3.2469e-10, 'epoch': 11.86, 'throughput': 298820.94}
[INFO|2025-02-11 18:16:39] logging.py:157 >> {'loss': 1.9959, 'learning_rate': 3.1547e-10, 'epoch': 11.86, 'throughput': 297188.72}
[INFO|2025-02-11 18:16:51] logging.py:157 >> {'loss': 2.0291, 'learning_rate': 3.0639e-10, 'epoch': 11.87, 'throughput': 295566.17}
[INFO|2025-02-11 18:17:02] logging.py:157 >> {'loss': 1.9732, 'learning_rate': 2.9743e-10, 'epoch': 11.87, 'throughput': 293965.39}
[INFO|2025-02-11 18:17:14] logging.py:157 >> {'loss': 1.9883, 'learning_rate': 2.8861e-10, 'epoch': 11.87, 'throughput': 292397.39}
[INFO|2025-02-11 18:17:25] logging.py:157 >> {'loss': 2.0485, 'learning_rate': 2.7992e-10, 'epoch': 11.87, 'throughput': 290839.54}
[INFO|2025-02-11 18:17:37] logging.py:157 >> {'loss': 1.9821, 'learning_rate': 2.7136e-10, 'epoch': 11.87, 'throughput': 289296.97}
[INFO|2025-02-11 18:17:49] logging.py:157 >> {'loss': 1.9260, 'learning_rate': 2.6294e-10, 'epoch': 11.88, 'throughput': 287758.88}
[INFO|2025-02-11 18:18:00] logging.py:157 >> {'loss': 1.9753, 'learning_rate': 2.5465e-10, 'epoch': 11.88, 'throughput': 286236.56}
[INFO|2025-02-11 18:18:12] logging.py:157 >> {'loss': 1.9688, 'learning_rate': 2.4649e-10, 'epoch': 11.88, 'throughput': 284738.27}
[INFO|2025-02-11 18:18:24] logging.py:157 >> {'loss': 2.0239, 'learning_rate': 2.3847e-10, 'epoch': 11.88, 'throughput': 283255.23}
[INFO|2025-02-11 18:18:35] logging.py:157 >> {'loss': 1.9905, 'learning_rate': 2.3057e-10, 'epoch': 11.88, 'throughput': 281788.13}
[INFO|2025-02-11 18:18:47] logging.py:157 >> {'loss': 1.9786, 'learning_rate': 2.2281e-10, 'epoch': 11.89, 'throughput': 280336.59}
[INFO|2025-02-11 18:18:59] logging.py:157 >> {'loss': 1.9951, 'learning_rate': 2.1519e-10, 'epoch': 11.89, 'throughput': 278902.58}
[INFO|2025-02-11 18:19:10] logging.py:157 >> {'loss': 1.9598, 'learning_rate': 2.0769e-10, 'epoch': 11.89, 'throughput': 277472.95}
[INFO|2025-02-11 18:19:22] logging.py:157 >> {'loss': 2.0105, 'learning_rate': 2.0033e-10, 'epoch': 11.89, 'throughput': 276052.37}
[INFO|2025-02-11 18:19:34] logging.py:157 >> {'loss': 1.9820, 'learning_rate': 1.9310e-10, 'epoch': 11.89, 'throughput': 274651.96}
[INFO|2025-02-11 18:19:46] logging.py:157 >> {'loss': 1.9581, 'learning_rate': 1.8601e-10, 'epoch': 11.90, 'throughput': 273266.57}
[INFO|2025-02-11 18:19:57] logging.py:157 >> {'loss': 1.9834, 'learning_rate': 1.7904e-10, 'epoch': 11.90, 'throughput': 271897.28}
[INFO|2025-02-11 18:20:09] logging.py:157 >> {'loss': 2.0045, 'learning_rate': 1.7221e-10, 'epoch': 11.90, 'throughput': 270528.26}
[INFO|2025-02-11 18:20:21] logging.py:157 >> {'loss': 1.9594, 'learning_rate': 1.6552e-10, 'epoch': 11.90, 'throughput': 269176.81}
[INFO|2025-02-11 18:20:33] logging.py:157 >> {'loss': 1.9738, 'learning_rate': 1.5895e-10, 'epoch': 11.90, 'throughput': 267831.20}
[INFO|2025-02-11 18:20:45] logging.py:157 >> {'loss': 2.0208, 'learning_rate': 1.5252e-10, 'epoch': 11.91, 'throughput': 266498.59}
[INFO|2025-02-11 18:20:57] logging.py:157 >> {'loss': 1.9997, 'learning_rate': 1.4622e-10, 'epoch': 11.91, 'throughput': 265185.54}
[INFO|2025-02-11 18:21:09] logging.py:157 >> {'loss': 2.0192, 'learning_rate': 1.4006e-10, 'epoch': 11.91, 'throughput': 263875.74}
[INFO|2025-02-11 18:21:21] logging.py:157 >> {'loss': 2.0046, 'learning_rate': 1.3402e-10, 'epoch': 11.91, 'throughput': 262555.68}
[INFO|2025-02-11 18:21:21] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-605000
[INFO|2025-02-11 18:21:21] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\config.json
[INFO|2025-02-11 18:21:21] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\generation_config.json
[INFO|2025-02-11 18:21:21] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\model.safetensors
[INFO|2025-02-11 18:21:21] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\tokenizer_config.json
[INFO|2025-02-11 18:21:21] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\special_tokens_map.json
[INFO|2025-02-11 18:21:33] logging.py:157 >> {'loss': 1.9669, 'learning_rate': 1.2812e-10, 'epoch': 11.91, 'throughput': 261169.48}
[INFO|2025-02-11 18:21:45] logging.py:157 >> {'loss': 1.9956, 'learning_rate': 1.2235e-10, 'epoch': 11.92, 'throughput': 259898.90}
[INFO|2025-02-11 18:21:57] logging.py:157 >> {'loss': 2.0026, 'learning_rate': 1.1672e-10, 'epoch': 11.92, 'throughput': 258652.69}
[INFO|2025-02-11 18:22:09] logging.py:157 >> {'loss': 1.9802, 'learning_rate': 1.1122e-10, 'epoch': 11.92, 'throughput': 257444.84}
[INFO|2025-02-11 18:22:20] logging.py:157 >> {'loss': 1.9955, 'learning_rate': 1.0585e-10, 'epoch': 11.92, 'throughput': 256244.85}
[INFO|2025-02-11 18:22:32] logging.py:157 >> {'loss': 2.0080, 'learning_rate': 1.0061e-10, 'epoch': 11.92, 'throughput': 255055.64}
[INFO|2025-02-11 18:22:44] logging.py:157 >> {'loss': 2.0313, 'learning_rate': 9.5508e-11, 'epoch': 11.93, 'throughput': 253878.46}
[INFO|2025-02-11 18:22:55] logging.py:157 >> {'loss': 2.0100, 'learning_rate': 9.0537e-11, 'epoch': 11.93, 'throughput': 252726.02}
[INFO|2025-02-11 18:23:07] logging.py:157 >> {'loss': 2.0303, 'learning_rate': 8.5699e-11, 'epoch': 11.93, 'throughput': 251573.33}
[INFO|2025-02-11 18:23:18] logging.py:157 >> {'loss': 1.9680, 'learning_rate': 8.0994e-11, 'epoch': 11.93, 'throughput': 250425.41}
[INFO|2025-02-11 18:23:30] logging.py:157 >> {'loss': 2.0075, 'learning_rate': 7.6422e-11, 'epoch': 11.93, 'throughput': 249294.78}
[INFO|2025-02-11 18:23:42] logging.py:157 >> {'loss': 1.9790, 'learning_rate': 7.1983e-11, 'epoch': 11.94, 'throughput': 248167.19}
[INFO|2025-02-11 18:23:53] logging.py:157 >> {'loss': 2.0086, 'learning_rate': 6.7676e-11, 'epoch': 11.94, 'throughput': 247047.41}
[INFO|2025-02-11 18:24:05] logging.py:157 >> {'loss': 1.9897, 'learning_rate': 6.3502e-11, 'epoch': 11.94, 'throughput': 245938.79}
[INFO|2025-02-11 18:24:17] logging.py:157 >> {'loss': 2.0329, 'learning_rate': 5.9461e-11, 'epoch': 11.94, 'throughput': 244853.80}
[INFO|2025-02-11 18:24:28] logging.py:157 >> {'loss': 1.9813, 'learning_rate': 5.5553e-11, 'epoch': 11.94, 'throughput': 243763.46}
[INFO|2025-02-11 18:24:40] logging.py:157 >> {'loss': 1.9675, 'learning_rate': 5.1778e-11, 'epoch': 11.95, 'throughput': 242684.81}
[INFO|2025-02-11 18:24:52] logging.py:157 >> {'loss': 2.0206, 'learning_rate': 4.8135e-11, 'epoch': 11.95, 'throughput': 241619.87}
[INFO|2025-02-11 18:25:03] logging.py:157 >> {'loss': 1.9695, 'learning_rate': 4.4625e-11, 'epoch': 11.95, 'throughput': 240551.71}
[INFO|2025-02-11 18:25:15] logging.py:157 >> {'loss': 1.9561, 'learning_rate': 4.1248e-11, 'epoch': 11.95, 'throughput': 239500.87}
[INFO|2025-02-11 18:25:27] logging.py:157 >> {'loss': 1.9766, 'learning_rate': 3.8004e-11, 'epoch': 11.95, 'throughput': 238446.21}
[INFO|2025-02-11 18:25:39] logging.py:157 >> {'loss': 2.0330, 'learning_rate': 3.4893e-11, 'epoch': 11.95, 'throughput': 237392.63}
[INFO|2025-02-11 18:25:51] logging.py:157 >> {'loss': 1.9738, 'learning_rate': 3.1915e-11, 'epoch': 11.96, 'throughput': 236352.43}
[INFO|2025-02-11 18:26:03] logging.py:157 >> {'loss': 2.0233, 'learning_rate': 2.9069e-11, 'epoch': 11.96, 'throughput': 235318.62}
[INFO|2025-02-11 18:26:14] logging.py:157 >> {'loss': 1.9637, 'learning_rate': 2.6357e-11, 'epoch': 11.96, 'throughput': 234313.34}
[INFO|2025-02-11 18:26:26] logging.py:157 >> {'loss': 1.9714, 'learning_rate': 2.3777e-11, 'epoch': 11.96, 'throughput': 233322.02}
[INFO|2025-02-11 18:26:38] logging.py:157 >> {'loss': 1.9451, 'learning_rate': 2.1330e-11, 'epoch': 11.96, 'throughput': 232339.87}
[INFO|2025-02-11 18:26:49] logging.py:157 >> {'loss': 2.0128, 'learning_rate': 1.9016e-11, 'epoch': 11.97, 'throughput': 231362.97}
[INFO|2025-02-11 18:27:01] logging.py:157 >> {'loss': 1.9903, 'learning_rate': 1.6835e-11, 'epoch': 11.97, 'throughput': 230391.77}
[INFO|2025-02-11 18:27:13] logging.py:157 >> {'loss': 2.0050, 'learning_rate': 1.4786e-11, 'epoch': 11.97, 'throughput': 229432.14}
[INFO|2025-02-11 18:27:24] logging.py:157 >> {'loss': 1.9584, 'learning_rate': 1.2870e-11, 'epoch': 11.97, 'throughput': 228474.82}
|
740 |
+
|
741 |
+
[INFO|2025-02-11 18:27:36] logging.py:157 >> {'loss': 1.9683, 'learning_rate': 1.1088e-11, 'epoch': 11.97, 'throughput': 227535.44}
|
742 |
+
|
743 |
+
[INFO|2025-02-11 18:27:48] logging.py:157 >> {'loss': 1.9758, 'learning_rate': 9.4378e-12, 'epoch': 11.98, 'throughput': 226600.82}
|
744 |
+
|
745 |
+
[INFO|2025-02-11 18:27:59] logging.py:157 >> {'loss': 2.0262, 'learning_rate': 7.9207e-12, 'epoch': 11.98, 'throughput': 225680.27}
|
746 |
+
|
747 |
+
[INFO|2025-02-11 18:28:11] logging.py:157 >> {'loss': 1.9921, 'learning_rate': 6.5364e-12, 'epoch': 11.98, 'throughput': 224757.71}
|
748 |
+
|
749 |
+
[INFO|2025-02-11 18:28:23] logging.py:157 >> {'loss': 1.9910, 'learning_rate': 5.2850e-12, 'epoch': 11.98, 'throughput': 223841.89}
|
750 |
+
|
751 |
+
[INFO|2025-02-11 18:28:35] logging.py:157 >> {'loss': 1.9481, 'learning_rate': 4.1665e-12, 'epoch': 11.98, 'throughput': 222924.90}
|
752 |
+
|
753 |
+
[INFO|2025-02-11 18:28:46] logging.py:157 >> {'loss': 1.9444, 'learning_rate': 3.1808e-12, 'epoch': 11.99, 'throughput': 222033.51}
|
754 |
+
|
755 |
+
[INFO|2025-02-11 18:28:58] logging.py:157 >> {'loss': 1.9570, 'learning_rate': 2.3279e-12, 'epoch': 11.99, 'throughput': 221145.11}
|
756 |
+
|
757 |
+
[INFO|2025-02-11 18:29:10] logging.py:157 >> {'loss': 2.0438, 'learning_rate': 1.6079e-12, 'epoch': 11.99, 'throughput': 220258.60}
|
758 |
+
|
759 |
+
[INFO|2025-02-11 18:29:22] logging.py:157 >> {'loss': 1.9305, 'learning_rate': 1.0207e-12, 'epoch': 11.99, 'throughput': 219369.78}
|
760 |
+
|
761 |
+
[INFO|2025-02-11 18:29:33] logging.py:157 >> {'loss': 1.9854, 'learning_rate': 5.6635e-13, 'epoch': 11.99, 'throughput': 218488.66}
|
762 |
+
|
763 |
+
[INFO|2025-02-11 18:29:45] logging.py:157 >> {'loss': 1.9765, 'learning_rate': 2.4486e-13, 'epoch': 12.00, 'throughput': 217621.92}
|
764 |
+
|
765 |
+
[INFO|2025-02-11 18:29:57] logging.py:157 >> {'loss': 1.9388, 'learning_rate': 5.6220e-14, 'epoch': 12.00, 'throughput': 216764.99}
|
766 |
+
|
767 |
+
[INFO|2025-02-11 18:30:08] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-609492
|
768 |
+
|
769 |
+
[INFO|2025-02-11 18:30:08] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\config.json
|
770 |
+
|
771 |
+
[INFO|2025-02-11 18:30:08] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\generation_config.json
|
772 |
+
|
773 |
+
[INFO|2025-02-11 18:30:08] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\model.safetensors
|
774 |
+
|
775 |
+
[INFO|2025-02-11 18:30:08] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\tokenizer_config.json
|
776 |
+
|
777 |
+
[INFO|2025-02-11 18:30:08] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\special_tokens_map.json
|
778 |
+
|
779 |
+
[INFO|2025-02-11 18:30:08] trainer.py:2643 >>
|
780 |
+
|
781 |
+
Training completed. Do not forget to share your model on huggingface.co/models =)
|
782 |
+
|
783 |
+
|
784 |
+
|
785 |
+
[INFO|2025-02-11 18:30:08] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025
|
786 |
+
|
787 |
+
[INFO|2025-02-11 18:30:08] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\config.json
|
788 |
+
|
789 |
+
[INFO|2025-02-11 18:30:08] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\generation_config.json
|
790 |
+
|
791 |
+
[INFO|2025-02-11 18:30:09] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\model.safetensors
|
792 |
+
|
793 |
+
[INFO|2025-02-11 18:30:09] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\tokenizer_config.json
|
794 |
+
|
795 |
+
[INFO|2025-02-11 18:30:09] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\special_tokens_map.json
|
796 |
+
|
797 |
+
[WARNING|2025-02-11 18:30:09] logging.py:162 >> No metric eval_loss to plot.
|
798 |
+
|
799 |
+
[WARNING|2025-02-11 18:30:09] logging.py:162 >> No metric eval_accuracy to plot.
|
800 |
+
|
801 |
+
[INFO|2025-02-11 18:30:09] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
|
802 |
+
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
|
803 |
+
|
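The metric rows above share a fixed shape ({'loss': ..., 'learning_rate': ..., 'epoch': ..., 'throughput': ...}), so the tail of running_log.txt can be parsed mechanically, e.g. to replot the loss curve. A minimal sketch, assuming the log format shown above and a local copy of the file:

# Parse the metric dicts out of running_log.txt (sketch; format as shown above).
import ast
import re

metrics = []
with open("running_log.txt", encoding="utf-8") as f:
    for line in f:
        m = re.search(r">> (\{.*\})\s*$", line)
        if m:
            try:
                payload = ast.literal_eval(m.group(1))  # the dicts are plain Python literals
            except (ValueError, SyntaxError):
                continue
            if isinstance(payload, dict) and "loss" in payload:  # skips non-metric payloads like the modelcard dict
                metrics.append(payload)

print(len(metrics), "metric rows; final loss", metrics[-1]["loss"])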
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
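special_tokens_map.json points all four roles (bos/eos/pad/unk) at the single GPT-2 token <|endoftext|>, id 50256 per the added_tokens_decoder in tokenizer_config.json below. A quick check with transformers; the directory path is a placeholder for a local copy of this upload:

# Confirm the special-token wiring described above (sketch).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this/upload")
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)  # all <|endoftext|>
print(tok.convert_tokens_to_ids("<|endoftext|>"))                  # expected: 50256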
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{% set system_message = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + content + '\n\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|endoftext|>' + '\n\n' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}
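The chat_template above is the Alpaca layout (matching template: alpaca in training_args.yaml further down): an optional system preamble, then ### Instruction:/### Response: blocks, with <|endoftext|> closing each assistant turn. A sketch of rendering it, again assuming a local copy of the tokenizer files:

# Render the Alpaca-style chat template defined in tokenizer_config.json (sketch).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this/upload")
messages = [{"role": "user", "content": "Write a short story about a lighthouse."}]
print(tok.apply_chat_template(messages, tokenize=False))
# Expected shape, per the template: the default system preamble, then
# "### Instruction:\nWrite a short story about a lighthouse.\n\n### Response:\n"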
train_results.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "epoch": 12.0,
+  "num_input_tokens_seen": 620593344,
+  "total_flos": 3.16711124803584e+17,
+  "train_loss": 0.07982336108915004,
+  "train_runtime": 2874.0318,
+  "train_samples_per_second": 212.069,
+  "train_steps_per_second": 212.069
+}
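These figures are internally consistent with the log tail: with per_device_train_batch_size 1 and gradient_accumulation_steps 1 (see training_args.yaml below), one step is one sample, so samples/second equals steps/second, and tokens seen divided by runtime lands near the final logged throughput. A quick arithmetic check:

# Sanity-check train_results.json against the running log (values copied from above).
num_tokens = 620_593_344
runtime_s = 2874.0318
steps_per_s = 212.069

print(num_tokens / runtime_s)   # ~215_931 tokens/s, near the final logged throughput of ~216_765
print(steps_per_s * runtime_s)  # ~609_493 steps, matching checkpoint-609492 up to rounding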
trainer_log.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
trainer_state.json
ADDED
The diff for this file is too large to render.
See raw diff
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff9a7d93fc153a1403128fcff75f60e26e16253da1fa185ac9859426dec960b5
+size 5688
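training_args.bin is stored through git-lfs, so only the pointer file appears in the diff. A downloaded copy can be checked against the pointer's sha256 (sketch; assumes the binary is present locally):

# Verify training_args.bin against the git-lfs pointer above.
import hashlib

with open("training_args.bin", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
print(digest == "ff9a7d93fc153a1403128fcff75f60e26e16253da1fa185ac9859426dec960b5")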
training_args.yaml
ADDED
@@ -0,0 +1,29 @@
+bf16: true
+cutoff_len: 1024
+dataset: XeTute/SStory-Gen-EN_ZH,MatanP/emotion_mapped_story_dataset,webnovel,jaydenccc/AI_Storyteller_Dataset
+dataset_dir: data
+ddp_timeout: 180000000
+do_train: true
+finetuning_type: full
+flash_attn: auto
+gradient_accumulation_steps: 1
+include_num_input_tokens_seen: true
+learning_rate: 1.0e-06
+logging_steps: 100
+lr_scheduler_type: cosine
+max_grad_norm: 1.0
+max_samples: 1000000000
+model_name_or_path: XeTute/Phantasor_V0.2-137M
+num_train_epochs: 12.0
+optim: sgd
+output_dir: saves\GPT-2-Small\full\10-02-2025
+packing: false
+per_device_train_batch_size: 1
+plot_loss: true
+preprocessing_num_workers: 16
+report_to: none
+save_steps: 5000
+stage: sft
+template: alpaca
+trust_remote_code: true
+warmup_steps: 10
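This is the LLaMA-Factory run config: full-parameter SFT of the 137M checkpoint with plain SGD at 1e-6 under a cosine schedule, which is why the learning rates in the log tail decay toward zero by epoch 12. With a matching LLaMA-Factory install the run should be replayable via llamafactory-cli train training_args.yaml. A small sketch for inspecting the config programmatically (requires PyYAML):

# Load and inspect the run config (sketch).
import yaml

with open("training_args.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(cfg["optim"], cfg["learning_rate"], cfg["lr_scheduler_type"], cfg["num_train_epochs"])
# -> sgd 1e-06 cosine 12.0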
training_loss.png
ADDED
[image: training loss curve]
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
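With all 17 files in place, the upload can be smoke-tested end to end. A minimal sketch, assuming the files sit in a local directory (placeholder path):

# Load the uploaded GPT-2-Small weights and sample a continuation (sketch).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "path/to/this/upload"
tok = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float32)
model.eval()

inputs = tok("Once upon a time", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=50, do_sample=True, pad_token_id=tok.eos_token_id)
print(tok.decode(out[0], skip_special_tokens=True))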