XeTute committed on
Commit 4e5b716 · verified · 1 Parent(s): 3127633

Upload 17 files

all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 12.0,
+   "num_input_tokens_seen": 620593344,
+   "total_flos": 3.16711124803584e+17,
+   "train_loss": 0.07982336108915004,
+   "train_runtime": 2874.0318,
+   "train_samples_per_second": 212.069,
+   "train_steps_per_second": 212.069
+ }
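
A quick sanity check on these numbers (a hypothetical sketch, not one of the uploaded files): dividing num_input_tokens_seen by train_runtime gives the average token throughput, which lands close to the ~2.2e5 tokens/s figures logged near the end of running_log.txt further down this commit. The file path below is an assumption.

# Hypothetical check against all_results.json; assumes the file sits in the working directory.
import json

with open("all_results.json") as f:
    results = json.load(f)

tokens_per_sec = results["num_input_tokens_seen"] / results["train_runtime"]
print(f"{tokens_per_sec:,.0f} tokens/s")  # ~215,900, in line with the final throughput in running_log.txt
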
config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "_name_or_path": "XeTute/Phantasor_V0.2-137M",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.2",
+   "use_cache": false,
+   "vocab_size": 50257
+ }
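
The config above is a stock 12-layer, 768-dimensional GPT-2 (GPT2LMHeadModel) with a 1,024-token context and the 50,257-entry GPT-2 vocabulary. A minimal loading sketch follows, assuming these files end up in a Hub repository; the repo id is copied from _name_or_path and may need to be swapped for the actual upload target.

# Minimal sketch: load the model described by config.json and sample with its task_specific_params.
# The repo id is an assumption taken from "_name_or_path"; adjust it to the repository these files live in.
from transformers import AutoTokenizer, GPT2LMHeadModel

repo = "XeTute/Phantasor_V0.2-137M"
tokenizer = AutoTokenizer.from_pretrained(repo)
model = GPT2LMHeadModel.from_pretrained(repo)  # n_layer=12, n_embd=768, n_ctx=1024

inputs = tokenizer("Once upon a time", return_tensors="pt")
out = model.generate(**inputs, do_sample=True, max_length=50)  # mirrors task_specific_params["text-generation"]
print(tokenizer.decode(out[0], skip_special_tokens=True))
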
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 50256,
+   "eos_token_id": 50256,
+   "transformers_version": "4.48.2"
+ }
llamaboard_config.yaml ADDED
@@ -0,0 +1,80 @@
+ top.booster: auto
+ top.checkpoint_path: null
+ top.finetuning_type: full
+ top.model_name: GPT-2-Small
+ top.quantization_bit: none
+ top.quantization_method: bitsandbytes
+ top.rope_scaling: none
+ top.template: alpaca
+ train.additional_target: ''
+ train.apollo_rank: 16
+ train.apollo_scale: 32
+ train.apollo_target: all
+ train.apollo_update_interval: 200
+ train.badam_mode: layer
+ train.badam_switch_interval: 50
+ train.badam_switch_mode: ascending
+ train.badam_update_ratio: 0.05
+ train.batch_size: 1
+ train.compute_type: bf16
+ train.create_new_adapter: false
+ train.cutoff_len: 1024
+ train.dataset:
+ - XeTute/SStory-Gen-EN_ZH
+ - MatanP/emotion_mapped_story_dataset
+ - webnovel
+ - jaydenccc/AI_Storyteller_Dataset
+ train.dataset_dir: data
+ train.ds_offload: false
+ train.ds_stage: none
+ train.extra_args: '{"optim": "sgd"}'
+ train.freeze_extra_modules: ''
+ train.freeze_trainable_layers: 2
+ train.freeze_trainable_modules: all
+ train.galore_rank: 16
+ train.galore_scale: 2
+ train.galore_target: all
+ train.galore_update_interval: 200
+ train.gradient_accumulation_steps: 1
+ train.learning_rate: 1e-6
+ train.logging_steps: 100
+ train.lora_alpha: 16
+ train.lora_dropout: 0
+ train.lora_rank: 8
+ train.lora_target: ''
+ train.loraplus_lr_ratio: 0
+ train.lr_scheduler_type: cosine
+ train.mask_history: false
+ train.max_grad_norm: '1.0'
+ train.max_samples: '1000000000'
+ train.neat_packing: false
+ train.neftune_alpha: 0
+ train.num_train_epochs: '12.0'
+ train.packing: false
+ train.ppo_score_norm: false
+ train.ppo_whiten_rewards: false
+ train.pref_beta: 0.1
+ train.pref_ftx: 0
+ train.pref_loss: sigmoid
+ train.report_to:
+ - none
+ train.resize_vocab: false
+ train.reward_model: []
+ train.save_steps: 5000
+ train.swanlab_api_key: ''
+ train.swanlab_mode: cloud
+ train.swanlab_project: llamafactory
+ train.swanlab_run_name: ''
+ train.swanlab_workspace: ''
+ train.train_on_prompt: false
+ train.training_stage: Supervised Fine-Tuning
+ train.use_apollo: false
+ train.use_badam: false
+ train.use_dora: false
+ train.use_galore: false
+ train.use_llama_pro: false
+ train.use_pissa: false
+ train.use_rslora: false
+ train.use_swanlab: false
+ train.val_size: 0
+ train.warmup_steps: 10
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:002933714d638e62f3a7c2796c1075d9aaf99119481284246c9012400fc8eee7
+ size 497774208
running_log.txt ADDED
@@ -0,0 +1,803 @@
1
+ [INFO|2025-02-11 17:41:48] configuration_utils.py:696 >> loading configuration file config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\config.json
2
+
3
+ [INFO|2025-02-11 17:41:48] configuration_utils.py:768 >> Model config GPT2Config {
4
+ "_name_or_path": "XeTute/Phantasor_V0.2-137M",
5
+ "activation_function": "gelu_new",
6
+ "architectures": [
7
+ "GPT2LMHeadModel"
8
+ ],
9
+ "attn_pdrop": 0.1,
10
+ "bos_token_id": 50256,
11
+ "embd_pdrop": 0.1,
12
+ "eos_token_id": 50256,
13
+ "initializer_range": 0.02,
14
+ "layer_norm_epsilon": 1e-05,
15
+ "model_type": "gpt2",
16
+ "n_ctx": 1024,
17
+ "n_embd": 768,
18
+ "n_head": 12,
19
+ "n_inner": null,
20
+ "n_layer": 12,
21
+ "n_positions": 1024,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "task_specific_params": {
32
+ "text-generation": {
33
+ "do_sample": true,
34
+ "max_length": 50
35
+ }
36
+ },
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.48.2",
39
+ "use_cache": false,
40
+ "vocab_size": 50257
41
+ }
42
+
43
+
44
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file vocab.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\vocab.json
45
+
46
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file merges.txt from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\merges.txt
47
+
48
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer.json
49
+
50
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
51
+
52
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\special_tokens_map.json
53
+
54
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer_config.json
55
+
56
+ [INFO|2025-02-11 17:41:49] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
57
+
58
+ [INFO|2025-02-11 17:41:50] configuration_utils.py:696 >> loading configuration file config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\config.json
59
+
60
+ [INFO|2025-02-11 17:41:50] configuration_utils.py:768 >> Model config GPT2Config {
61
+ "_name_or_path": "XeTute/Phantasor_V0.2-137M",
62
+ "activation_function": "gelu_new",
63
+ "architectures": [
64
+ "GPT2LMHeadModel"
65
+ ],
66
+ "attn_pdrop": 0.1,
67
+ "bos_token_id": 50256,
68
+ "embd_pdrop": 0.1,
69
+ "eos_token_id": 50256,
70
+ "initializer_range": 0.02,
71
+ "layer_norm_epsilon": 1e-05,
72
+ "model_type": "gpt2",
73
+ "n_ctx": 1024,
74
+ "n_embd": 768,
75
+ "n_head": 12,
76
+ "n_inner": null,
77
+ "n_layer": 12,
78
+ "n_positions": 1024,
79
+ "reorder_and_upcast_attn": false,
80
+ "resid_pdrop": 0.1,
81
+ "scale_attn_by_inverse_layer_idx": false,
82
+ "scale_attn_weights": true,
83
+ "summary_activation": null,
84
+ "summary_first_dropout": 0.1,
85
+ "summary_proj_to_labels": true,
86
+ "summary_type": "cls_index",
87
+ "summary_use_proj": true,
88
+ "task_specific_params": {
89
+ "text-generation": {
90
+ "do_sample": true,
91
+ "max_length": 50
92
+ }
93
+ },
94
+ "torch_dtype": "float32",
95
+ "transformers_version": "4.48.2",
96
+ "use_cache": false,
97
+ "vocab_size": 50257
98
+ }
99
+
100
+
101
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file vocab.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\vocab.json
102
+
103
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file merges.txt from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\merges.txt
104
+
105
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer.json
106
+
107
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at None
108
+
109
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\special_tokens_map.json
110
+
111
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\tokenizer_config.json
112
+
113
+ [INFO|2025-02-11 17:41:50] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
114
+
115
+ [INFO|2025-02-11 17:41:50] logging.py:157 >> Loading dataset XeTute/SStory-Gen-EN_ZH...
116
+
117
+ [INFO|2025-02-11 17:41:59] logging.py:157 >> Loading dataset MatanP/emotion_mapped_story_dataset...
118
+
119
+ [INFO|2025-02-11 17:42:03] logging.py:157 >> Loading dataset zxbsmk/webnovel_cn...
120
+
121
+ [INFO|2025-02-11 17:42:10] logging.py:157 >> Loading dataset jaydenccc/AI_Storyteller_Dataset...
122
+
123
+ [INFO|2025-02-11 17:42:13] configuration_utils.py:696 >> loading configuration file config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\config.json
124
+
125
+ [INFO|2025-02-11 17:42:13] configuration_utils.py:768 >> Model config GPT2Config {
126
+ "_name_or_path": "XeTute/Phantasor_V0.2-137M",
127
+ "activation_function": "gelu_new",
128
+ "architectures": [
129
+ "GPT2LMHeadModel"
130
+ ],
131
+ "attn_pdrop": 0.1,
132
+ "bos_token_id": 50256,
133
+ "embd_pdrop": 0.1,
134
+ "eos_token_id": 50256,
135
+ "initializer_range": 0.02,
136
+ "layer_norm_epsilon": 1e-05,
137
+ "model_type": "gpt2",
138
+ "n_ctx": 1024,
139
+ "n_embd": 768,
140
+ "n_head": 12,
141
+ "n_inner": null,
142
+ "n_layer": 12,
143
+ "n_positions": 1024,
144
+ "reorder_and_upcast_attn": false,
145
+ "resid_pdrop": 0.1,
146
+ "scale_attn_by_inverse_layer_idx": false,
147
+ "scale_attn_weights": true,
148
+ "summary_activation": null,
149
+ "summary_first_dropout": 0.1,
150
+ "summary_proj_to_labels": true,
151
+ "summary_type": "cls_index",
152
+ "summary_use_proj": true,
153
+ "task_specific_params": {
154
+ "text-generation": {
155
+ "do_sample": true,
156
+ "max_length": 50
157
+ }
158
+ },
159
+ "torch_dtype": "float32",
160
+ "transformers_version": "4.48.2",
161
+ "use_cache": false,
162
+ "vocab_size": 50257
163
+ }
164
+
165
+
166
+ [INFO|2025-02-11 17:42:13] modeling_utils.py:3904 >> loading weights file model.safetensors from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\model.safetensors
167
+
168
+ [INFO|2025-02-11 17:42:13] modeling_utils.py:1582 >> Instantiating GPT2LMHeadModel model under default dtype torch.bfloat16.
169
+
170
+ [INFO|2025-02-11 17:42:13] configuration_utils.py:1140 >> Generate config GenerationConfig {
171
+ "bos_token_id": 50256,
172
+ "eos_token_id": 50256,
173
+ "use_cache": false
174
+ }
175
+
176
+
177
+ [INFO|2025-02-11 17:42:14] modeling_utils.py:4888 >> All model checkpoint weights were used when initializing GPT2LMHeadModel.
178
+
179
+
180
+ [INFO|2025-02-11 17:42:14] modeling_utils.py:4896 >> All the weights of GPT2LMHeadModel were initialized from the model checkpoint at XeTute/Phantasor_V0.2-137M.
181
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.
182
+
183
+ [INFO|2025-02-11 17:42:14] configuration_utils.py:1095 >> loading configuration file generation_config.json from cache at C:\Users\Asad_\.cache\huggingface\hub\models--XeTute--Phantasor_V0.2-137M\snapshots\5399a9b18d0e7bab4c00cf5704b4ee1caceda6d5\generation_config.json
184
+
185
+ [INFO|2025-02-11 17:42:14] configuration_utils.py:1140 >> Generate config GenerationConfig {
186
+ "bos_token_id": 50256,
187
+ "eos_token_id": 50256
188
+ }
189
+
190
+
191
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> Gradient checkpointing enabled.
192
+
193
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> Using torch SDPA for faster training and inference.
194
+
195
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> Upcasting trainable params to float32.
196
+
197
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> Fine-tuning method: Full
198
+
199
+ [INFO|2025-02-11 17:42:14] logging.py:157 >> trainable params: 124,439,808 || all params: 124,439,808 || trainable%: 100.0000
200
+
201
+ [INFO|2025-02-11 17:42:14] trainer.py:741 >> Using auto half precision backend
202
+
203
+ [INFO|2025-02-11 17:42:14] trainer.py:2775 >> Loading model from saves\GPT-2-Small\full\10-02-2025\checkpoint-585000.
204
+
205
+ [WARNING|2025-02-11 17:42:14] trainer.py:3018 >> There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
206
+
207
+ [INFO|2025-02-11 17:42:14] trainer.py:2369 >> ***** Running training *****
208
+
209
+ [INFO|2025-02-11 17:42:14] trainer.py:2370 >> Num examples = 50,791
210
+
211
+ [INFO|2025-02-11 17:42:14] trainer.py:2371 >> Num Epochs = 12
212
+
213
+ [INFO|2025-02-11 17:42:14] trainer.py:2372 >> Instantaneous batch size per device = 1
214
+
215
+ [INFO|2025-02-11 17:42:14] trainer.py:2375 >> Total train batch size (w. parallel, distributed & accumulation) = 1
216
+
217
+ [INFO|2025-02-11 17:42:14] trainer.py:2376 >> Gradient Accumulation steps = 1
218
+
219
+ [INFO|2025-02-11 17:42:14] trainer.py:2377 >> Total optimization steps = 609,492
220
+
221
+ [INFO|2025-02-11 17:42:14] trainer.py:2378 >> Number of trainable parameters = 124,439,808
222
+
223
+ [INFO|2025-02-11 17:42:14] trainer.py:2400 >> Continuing training from checkpoint, will skip to saved global_step
224
+
225
+ [INFO|2025-02-11 17:42:14] trainer.py:2401 >> Continuing training from epoch 11
226
+
227
+ [INFO|2025-02-11 17:42:14] trainer.py:2402 >> Continuing training from global step 585000
228
+
229
+ [INFO|2025-02-11 17:42:14] trainer.py:2404 >> Will skip the first 11 epochs then the first 26299 batches in the first epoch.
230
+
231
+ [INFO|2025-02-11 17:42:26] logging.py:157 >> {'loss': 1.9812, 'learning_rate': 3.9468e-09, 'epoch': 11.52, 'throughput': 50003805.20}
232
+
233
+ [INFO|2025-02-11 17:42:38] logging.py:157 >> {'loss': 2.0162, 'learning_rate': 3.9145e-09, 'epoch': 11.52, 'throughput': 25202148.23}
234
+
235
+ [INFO|2025-02-11 17:42:50] logging.py:157 >> {'loss': 1.9694, 'learning_rate': 3.8824e-09, 'epoch': 11.52, 'throughput': 16752405.16}
236
+
237
+ [INFO|2025-02-11 17:43:02] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 3.8504e-09, 'epoch': 11.53, 'throughput': 12548387.20}
238
+
239
+ [INFO|2025-02-11 17:43:14] logging.py:157 >> {'loss': 1.9634, 'learning_rate': 3.8185e-09, 'epoch': 11.53, 'throughput': 10008058.95}
240
+
241
+ [INFO|2025-02-11 17:43:26] logging.py:157 >> {'loss': 2.0249, 'learning_rate': 3.7868e-09, 'epoch': 11.53, 'throughput': 8363752.91}
242
+
243
+ [INFO|2025-02-11 17:43:38] logging.py:157 >> {'loss': 1.9759, 'learning_rate': 3.7552e-09, 'epoch': 11.53, 'throughput': 7170473.74}
244
+
245
+ [INFO|2025-02-11 17:43:49] logging.py:157 >> {'loss': 2.0019, 'learning_rate': 3.7238e-09, 'epoch': 11.53, 'throughput': 6291290.77}
246
+
247
+ [INFO|2025-02-11 17:44:01] logging.py:157 >> {'loss': 1.9800, 'learning_rate': 3.6924e-09, 'epoch': 11.54, 'throughput': 5603383.58}
248
+
249
+ [INFO|2025-02-11 17:44:12] logging.py:157 >> {'loss': 1.9993, 'learning_rate': 3.6612e-09, 'epoch': 11.54, 'throughput': 5058860.90}
250
+
251
+ [INFO|2025-02-11 17:44:24] logging.py:157 >> {'loss': 1.9438, 'learning_rate': 3.6302e-09, 'epoch': 11.54, 'throughput': 4612411.90}
252
+
253
+ [INFO|2025-02-11 17:44:35] logging.py:157 >> {'loss': 2.0000, 'learning_rate': 3.5992e-09, 'epoch': 11.54, 'throughput': 4237182.77}
254
+
255
+ [INFO|2025-02-11 17:44:47] logging.py:157 >> {'loss': 2.0493, 'learning_rate': 3.5684e-09, 'epoch': 11.54, 'throughput': 3919609.73}
256
+
257
+ [INFO|2025-02-11 17:44:58] logging.py:157 >> {'loss': 1.9846, 'learning_rate': 3.5378e-09, 'epoch': 11.55, 'throughput': 3644083.96}
258
+
259
+ [INFO|2025-02-11 17:45:10] logging.py:157 >> {'loss': 1.9597, 'learning_rate': 3.5072e-09, 'epoch': 11.55, 'throughput': 3405057.81}
260
+
261
+ [INFO|2025-02-11 17:45:21] logging.py:157 >> {'loss': 2.0429, 'learning_rate': 3.4768e-09, 'epoch': 11.55, 'throughput': 3196451.63}
262
+
263
+ [INFO|2025-02-11 17:45:33] logging.py:157 >> {'loss': 1.9481, 'learning_rate': 3.4465e-09, 'epoch': 11.55, 'throughput': 3011092.35}
264
+
265
+ [INFO|2025-02-11 17:45:45] logging.py:157 >> {'loss': 1.9765, 'learning_rate': 3.4164e-09, 'epoch': 11.55, 'throughput': 2844004.86}
266
+
267
+ [INFO|2025-02-11 17:45:56] logging.py:157 >> {'loss': 2.0500, 'learning_rate': 3.3864e-09, 'epoch': 11.56, 'throughput': 2695596.64}
268
+
269
+ [INFO|2025-02-11 17:46:08] logging.py:157 >> {'loss': 2.0157, 'learning_rate': 3.3565e-09, 'epoch': 11.56, 'throughput': 2563562.80}
270
+
271
+ [INFO|2025-02-11 17:46:19] logging.py:157 >> {'loss': 2.0172, 'learning_rate': 3.3268e-09, 'epoch': 11.56, 'throughput': 2443655.33}
272
+
273
+ [INFO|2025-02-11 17:46:31] logging.py:157 >> {'loss': 1.9731, 'learning_rate': 3.2971e-09, 'epoch': 11.56, 'throughput': 2330850.00}
274
+
275
+ [INFO|2025-02-11 17:46:43] logging.py:157 >> {'loss': 2.0210, 'learning_rate': 3.2677e-09, 'epoch': 11.56, 'throughput': 2228412.35}
276
+
277
+ [INFO|2025-02-11 17:46:55] logging.py:157 >> {'loss': 1.9729, 'learning_rate': 3.2383e-09, 'epoch': 11.57, 'throughput': 2135025.81}
278
+
279
+ [INFO|2025-02-11 17:47:06] logging.py:157 >> {'loss': 2.0153, 'learning_rate': 3.2091e-09, 'epoch': 11.57, 'throughput': 2049017.68}
280
+
281
+ [INFO|2025-02-11 17:47:18] logging.py:157 >> {'loss': 1.9696, 'learning_rate': 3.1800e-09, 'epoch': 11.57, 'throughput': 1968648.47}
282
+
283
+ [INFO|2025-02-11 17:47:30] logging.py:157 >> {'loss': 1.9419, 'learning_rate': 3.1511e-09, 'epoch': 11.57, 'throughput': 1895060.01}
284
+
285
+ [INFO|2025-02-11 17:47:42] logging.py:157 >> {'loss': 2.0142, 'learning_rate': 3.1222e-09, 'epoch': 11.57, 'throughput': 1827059.79}
286
+
287
+ [INFO|2025-02-11 17:47:54] logging.py:157 >> {'loss': 2.0028, 'learning_rate': 3.0935e-09, 'epoch': 11.57, 'throughput': 1763616.14}
288
+
289
+ [INFO|2025-02-11 17:48:06] logging.py:157 >> {'loss': 1.9800, 'learning_rate': 3.0650e-09, 'epoch': 11.58, 'throughput': 1704997.68}
290
+
291
+ [INFO|2025-02-11 17:48:17] logging.py:157 >> {'loss': 2.0216, 'learning_rate': 3.0366e-09, 'epoch': 11.58, 'throughput': 1650129.04}
292
+
293
+ [INFO|2025-02-11 17:48:29] logging.py:157 >> {'loss': 1.9755, 'learning_rate': 3.0083e-09, 'epoch': 11.58, 'throughput': 1598132.07}
294
+
295
+ [INFO|2025-02-11 17:48:41] logging.py:157 >> {'loss': 2.0083, 'learning_rate': 2.9801e-09, 'epoch': 11.58, 'throughput': 1550207.97}
296
+
297
+ [INFO|2025-02-11 17:48:53] logging.py:157 >> {'loss': 2.0161, 'learning_rate': 2.9521e-09, 'epoch': 11.58, 'throughput': 1504244.14}
298
+
299
+ [INFO|2025-02-11 17:49:04] logging.py:157 >> {'loss': 2.0157, 'learning_rate': 2.9242e-09, 'epoch': 11.59, 'throughput': 1461629.99}
300
+
301
+ [INFO|2025-02-11 17:49:16] logging.py:157 >> {'loss': 1.9792, 'learning_rate': 2.8964e-09, 'epoch': 11.59, 'throughput': 1421289.44}
302
+
303
+ [INFO|2025-02-11 17:49:28] logging.py:157 >> {'loss': 1.9482, 'learning_rate': 2.8688e-09, 'epoch': 11.59, 'throughput': 1382339.41}
304
+
305
+ [INFO|2025-02-11 17:49:40] logging.py:157 >> {'loss': 2.0295, 'learning_rate': 2.8413e-09, 'epoch': 11.59, 'throughput': 1346070.98}
306
+
307
+ [INFO|2025-02-11 17:49:52] logging.py:157 >> {'loss': 1.9778, 'learning_rate': 2.8139e-09, 'epoch': 11.59, 'throughput': 1311592.72}
308
+
309
+ [INFO|2025-02-11 17:50:03] logging.py:157 >> {'loss': 2.0103, 'learning_rate': 2.7867e-09, 'epoch': 11.60, 'throughput': 1279350.97}
310
+
311
+ [INFO|2025-02-11 17:50:15] logging.py:157 >> {'loss': 1.9449, 'learning_rate': 2.7595e-09, 'epoch': 11.60, 'throughput': 1248312.09}
312
+
313
+ [INFO|2025-02-11 17:50:27] logging.py:157 >> {'loss': 2.0110, 'learning_rate': 2.7326e-09, 'epoch': 11.60, 'throughput': 1218768.16}
314
+
315
+ [INFO|2025-02-11 17:50:39] logging.py:157 >> {'loss': 1.9304, 'learning_rate': 2.7057e-09, 'epoch': 11.60, 'throughput': 1190297.15}
316
+
317
+ [INFO|2025-02-11 17:50:51] logging.py:157 >> {'loss': 1.9709, 'learning_rate': 2.6790e-09, 'epoch': 11.60, 'throughput': 1162183.03}
318
+
319
+ [INFO|2025-02-11 17:51:03] logging.py:157 >> {'loss': 1.9486, 'learning_rate': 2.6524e-09, 'epoch': 11.61, 'throughput': 1136347.30}
320
+
321
+ [INFO|2025-02-11 17:51:14] logging.py:157 >> {'loss': 1.9941, 'learning_rate': 2.6260e-09, 'epoch': 11.61, 'throughput': 1111856.81}
322
+
323
+ [INFO|2025-02-11 17:51:26] logging.py:157 >> {'loss': 1.9774, 'learning_rate': 2.5997e-09, 'epoch': 11.61, 'throughput': 1088182.32}
324
+
325
+ [INFO|2025-02-11 17:51:38] logging.py:157 >> {'loss': 1.9954, 'learning_rate': 2.5735e-09, 'epoch': 11.61, 'throughput': 1065718.96}
326
+
327
+ [INFO|2025-02-11 17:51:50] logging.py:157 >> {'loss': 2.0438, 'learning_rate': 2.5475e-09, 'epoch': 11.61, 'throughput': 1044277.87}
328
+
329
+ [INFO|2025-02-11 17:52:01] logging.py:157 >> {'loss': 2.0343, 'learning_rate': 2.5215e-09, 'epoch': 11.62, 'throughput': 1023489.66}
330
+
331
+ [INFO|2025-02-11 17:52:01] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-590000
332
+
333
+ [INFO|2025-02-11 17:52:01] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\config.json
334
+
335
+ [INFO|2025-02-11 17:52:01] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\generation_config.json
336
+
337
+ [INFO|2025-02-11 17:52:02] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\model.safetensors
338
+
339
+ [INFO|2025-02-11 17:52:02] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\tokenizer_config.json
340
+
341
+ [INFO|2025-02-11 17:52:02] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-590000\special_tokens_map.json
342
+
343
+ [INFO|2025-02-11 17:52:14] logging.py:157 >> {'loss': 2.0148, 'learning_rate': 2.4958e-09, 'epoch': 11.62, 'throughput': 1002469.52}
344
+
345
+ [INFO|2025-02-11 17:52:26] logging.py:157 >> {'loss': 1.9398, 'learning_rate': 2.4701e-09, 'epoch': 11.62, 'throughput': 983195.98}
346
+
347
+ [INFO|2025-02-11 17:52:37] logging.py:157 >> {'loss': 2.0115, 'learning_rate': 2.4446e-09, 'epoch': 11.62, 'throughput': 964942.63}
348
+
349
+ [INFO|2025-02-11 17:52:49] logging.py:157 >> {'loss': 1.9977, 'learning_rate': 2.4192e-09, 'epoch': 11.62, 'throughput': 946799.71}
350
+
351
+ [INFO|2025-02-11 17:53:01] logging.py:157 >> {'loss': 1.9790, 'learning_rate': 2.3939e-09, 'epoch': 11.63, 'throughput': 929612.66}
352
+
353
+ [INFO|2025-02-11 17:53:13] logging.py:157 >> {'loss': 2.0179, 'learning_rate': 2.3688e-09, 'epoch': 11.63, 'throughput': 912840.69}
354
+
355
+ [INFO|2025-02-11 17:53:25] logging.py:157 >> {'loss': 1.9422, 'learning_rate': 2.3438e-09, 'epoch': 11.63, 'throughput': 897250.33}
356
+
357
+ [INFO|2025-02-11 17:53:36] logging.py:157 >> {'loss': 1.9710, 'learning_rate': 2.3190e-09, 'epoch': 11.63, 'throughput': 882243.27}
358
+
359
+ [INFO|2025-02-11 17:53:48] logging.py:157 >> {'loss': 1.9900, 'learning_rate': 2.2942e-09, 'epoch': 11.63, 'throughput': 867690.03}
360
+
361
+ [INFO|2025-02-11 17:53:59] logging.py:157 >> {'loss': 2.0106, 'learning_rate': 2.2696e-09, 'epoch': 11.64, 'throughput': 853674.20}
362
+
363
+ [INFO|2025-02-11 17:54:11] logging.py:157 >> {'loss': 1.9897, 'learning_rate': 2.2452e-09, 'epoch': 11.64, 'throughput': 840110.77}
364
+
365
+ [INFO|2025-02-11 17:54:22] logging.py:157 >> {'loss': 2.0037, 'learning_rate': 2.2208e-09, 'epoch': 11.64, 'throughput': 826914.64}
366
+
367
+ [INFO|2025-02-11 17:54:34] logging.py:157 >> {'loss': 1.9609, 'learning_rate': 2.1966e-09, 'epoch': 11.64, 'throughput': 814139.50}
368
+
369
+ [INFO|2025-02-11 17:54:45] logging.py:157 >> {'loss': 2.0072, 'learning_rate': 2.1726e-09, 'epoch': 11.64, 'throughput': 801848.40}
370
+
371
+ [INFO|2025-02-11 17:54:57] logging.py:157 >> {'loss': 1.9598, 'learning_rate': 2.1486e-09, 'epoch': 11.65, 'throughput': 789459.13}
372
+
373
+ [INFO|2025-02-11 17:55:09] logging.py:157 >> {'loss': 1.9845, 'learning_rate': 2.1248e-09, 'epoch': 11.65, 'throughput': 777598.46}
374
+
375
+ [INFO|2025-02-11 17:55:21] logging.py:157 >> {'loss': 1.9423, 'learning_rate': 2.1012e-09, 'epoch': 11.65, 'throughput': 766045.00}
376
+
377
+ [INFO|2025-02-11 17:55:33] logging.py:157 >> {'loss': 1.9774, 'learning_rate': 2.0776e-09, 'epoch': 11.65, 'throughput': 754823.16}
378
+
379
+ [INFO|2025-02-11 17:55:45] logging.py:157 >> {'loss': 2.0093, 'learning_rate': 2.0542e-09, 'epoch': 11.65, 'throughput': 743976.59}
380
+
381
+ [INFO|2025-02-11 17:55:56] logging.py:157 >> {'loss': 1.9717, 'learning_rate': 2.0310e-09, 'epoch': 11.66, 'throughput': 733371.47}
382
+
383
+ [INFO|2025-02-11 17:56:08] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 2.0078e-09, 'epoch': 11.66, 'throughput': 723085.26}
384
+
385
+ [INFO|2025-02-11 17:56:20] logging.py:157 >> {'loss': 1.9593, 'learning_rate': 1.9848e-09, 'epoch': 11.66, 'throughput': 713131.55}
386
+
387
+ [INFO|2025-02-11 17:56:32] logging.py:157 >> {'loss': 1.9501, 'learning_rate': 1.9619e-09, 'epoch': 11.66, 'throughput': 703409.15}
388
+
389
+ [INFO|2025-02-11 17:56:44] logging.py:157 >> {'loss': 1.9638, 'learning_rate': 1.9392e-09, 'epoch': 11.66, 'throughput': 693950.69}
390
+
391
+ [INFO|2025-02-11 17:56:55] logging.py:157 >> {'loss': 1.9802, 'learning_rate': 1.9166e-09, 'epoch': 11.67, 'throughput': 684783.60}
392
+
393
+ [INFO|2025-02-11 17:57:08] logging.py:157 >> {'loss': 1.9731, 'learning_rate': 1.8941e-09, 'epoch': 11.67, 'throughput': 675622.49}
394
+
395
+ [INFO|2025-02-11 17:57:20] logging.py:157 >> {'loss': 1.9767, 'learning_rate': 1.8718e-09, 'epoch': 11.67, 'throughput': 666580.81}
396
+
397
+ [INFO|2025-02-11 17:57:32] logging.py:157 >> {'loss': 1.9927, 'learning_rate': 1.8496e-09, 'epoch': 11.67, 'throughput': 658156.99}
398
+
399
+ [INFO|2025-02-11 17:57:43] logging.py:157 >> {'loss': 1.9931, 'learning_rate': 1.8275e-09, 'epoch': 11.67, 'throughput': 649934.04}
400
+
401
+ [INFO|2025-02-11 17:57:55] logging.py:157 >> {'loss': 1.9789, 'learning_rate': 1.8055e-09, 'epoch': 11.68, 'throughput': 641903.03}
402
+
403
+ [INFO|2025-02-11 17:58:07] logging.py:157 >> {'loss': 2.0209, 'learning_rate': 1.7837e-09, 'epoch': 11.68, 'throughput': 634098.47}
404
+
405
+ [INFO|2025-02-11 17:58:19] logging.py:157 >> {'loss': 1.9245, 'learning_rate': 1.7620e-09, 'epoch': 11.68, 'throughput': 626445.98}
406
+
407
+ [INFO|2025-02-11 17:58:30] logging.py:157 >> {'loss': 2.0252, 'learning_rate': 1.7405e-09, 'epoch': 11.68, 'throughput': 618981.54}
408
+
409
+ [INFO|2025-02-11 17:58:42] logging.py:157 >> {'loss': 2.0151, 'learning_rate': 1.7191e-09, 'epoch': 11.68, 'throughput': 611668.79}
410
+
411
+ [INFO|2025-02-11 17:58:54] logging.py:157 >> {'loss': 1.9501, 'learning_rate': 1.6978e-09, 'epoch': 11.69, 'throughput': 604546.71}
412
+
413
+ [INFO|2025-02-11 17:59:06] logging.py:157 >> {'loss': 2.0443, 'learning_rate': 1.6766e-09, 'epoch': 11.69, 'throughput': 597713.96}
414
+
415
+ [INFO|2025-02-11 17:59:17] logging.py:157 >> {'loss': 1.9772, 'learning_rate': 1.6556e-09, 'epoch': 11.69, 'throughput': 591071.07}
416
+
417
+ [INFO|2025-02-11 17:59:29] logging.py:157 >> {'loss': 1.9930, 'learning_rate': 1.6347e-09, 'epoch': 11.69, 'throughput': 584451.24}
418
+
419
+ [INFO|2025-02-11 17:59:41] logging.py:157 >> {'loss': 2.0026, 'learning_rate': 1.6139e-09, 'epoch': 11.69, 'throughput': 577957.44}
420
+
421
+ [INFO|2025-02-11 17:59:52] logging.py:157 >> {'loss': 2.0245, 'learning_rate': 1.5933e-09, 'epoch': 11.69, 'throughput': 571700.05}
422
+
423
+ [INFO|2025-02-11 18:00:04] logging.py:157 >> {'loss': 2.0352, 'learning_rate': 1.5728e-09, 'epoch': 11.70, 'throughput': 565427.48}
424
+
425
+ [INFO|2025-02-11 18:00:16] logging.py:157 >> {'loss': 1.9875, 'learning_rate': 1.5525e-09, 'epoch': 11.70, 'throughput': 559389.83}
426
+
427
+ [INFO|2025-02-11 18:00:28] logging.py:157 >> {'loss': 1.9973, 'learning_rate': 1.5322e-09, 'epoch': 11.70, 'throughput': 553408.92}
428
+
429
+ [INFO|2025-02-11 18:00:40] logging.py:157 >> {'loss': 2.0091, 'learning_rate': 1.5121e-09, 'epoch': 11.70, 'throughput': 547663.70}
430
+
431
+ [INFO|2025-02-11 18:00:51] logging.py:157 >> {'loss': 2.0294, 'learning_rate': 1.4922e-09, 'epoch': 11.70, 'throughput': 542034.94}
432
+
433
+ [INFO|2025-02-11 18:01:03] logging.py:157 >> {'loss': 1.9653, 'learning_rate': 1.4723e-09, 'epoch': 11.71, 'throughput': 536571.61}
434
+
435
+ [INFO|2025-02-11 18:01:14] logging.py:157 >> {'loss': 1.9635, 'learning_rate': 1.4527e-09, 'epoch': 11.71, 'throughput': 531161.51}
436
+
437
+ [INFO|2025-02-11 18:01:26] logging.py:157 >> {'loss': 1.9859, 'learning_rate': 1.4331e-09, 'epoch': 11.71, 'throughput': 525822.29}
438
+
439
+ [INFO|2025-02-11 18:01:38] logging.py:157 >> {'loss': 1.9933, 'learning_rate': 1.4137e-09, 'epoch': 11.71, 'throughput': 520580.98}
440
+
441
+ [INFO|2025-02-11 18:01:50] logging.py:157 >> {'loss': 1.9638, 'learning_rate': 1.3944e-09, 'epoch': 11.71, 'throughput': 515438.63}
442
+
443
+ [INFO|2025-02-11 18:01:50] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-595000
444
+
445
+ [INFO|2025-02-11 18:01:50] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\config.json
446
+
447
+ [INFO|2025-02-11 18:01:50] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\generation_config.json
448
+
449
+ [INFO|2025-02-11 18:01:50] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\model.safetensors
450
+
451
+ [INFO|2025-02-11 18:01:50] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\tokenizer_config.json
452
+
453
+ [INFO|2025-02-11 18:01:50] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-595000\special_tokens_map.json
454
+
455
+ [INFO|2025-02-11 18:02:02] logging.py:157 >> {'loss': 2.0220, 'learning_rate': 1.3752e-09, 'epoch': 11.72, 'throughput': 510131.56}
456
+
457
+ [INFO|2025-02-11 18:02:14] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 1.3561e-09, 'epoch': 11.72, 'throughput': 505271.09}
458
+
459
+ [INFO|2025-02-11 18:02:25] logging.py:157 >> {'loss': 1.9671, 'learning_rate': 1.3372e-09, 'epoch': 11.72, 'throughput': 500534.56}
460
+
461
+ [INFO|2025-02-11 18:02:37] logging.py:157 >> {'loss': 1.9840, 'learning_rate': 1.3185e-09, 'epoch': 11.72, 'throughput': 495856.27}
462
+
463
+ [INFO|2025-02-11 18:02:49] logging.py:157 >> {'loss': 1.9937, 'learning_rate': 1.2998e-09, 'epoch': 11.72, 'throughput': 491327.18}
464
+
465
+ [INFO|2025-02-11 18:03:00] logging.py:157 >> {'loss': 1.9848, 'learning_rate': 1.2813e-09, 'epoch': 11.73, 'throughput': 486779.60}
466
+
467
+ [INFO|2025-02-11 18:03:12] logging.py:157 >> {'loss': 1.9859, 'learning_rate': 1.2630e-09, 'epoch': 11.73, 'throughput': 482340.06}
468
+
469
+ [INFO|2025-02-11 18:03:23] logging.py:157 >> {'loss': 2.0021, 'learning_rate': 1.2447e-09, 'epoch': 11.73, 'throughput': 478029.97}
470
+
471
+ [INFO|2025-02-11 18:03:35] logging.py:157 >> {'loss': 1.9589, 'learning_rate': 1.2266e-09, 'epoch': 11.73, 'throughput': 473763.66}
472
+
473
+ [INFO|2025-02-11 18:03:47] logging.py:157 >> {'loss': 2.0034, 'learning_rate': 1.2086e-09, 'epoch': 11.73, 'throughput': 469588.35}
474
+
475
+ [INFO|2025-02-11 18:03:58] logging.py:157 >> {'loss': 1.9454, 'learning_rate': 1.1908e-09, 'epoch': 11.74, 'throughput': 465484.85}
476
+
477
+ [INFO|2025-02-11 18:04:10] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 1.1731e-09, 'epoch': 11.74, 'throughput': 461443.54}
478
+
479
+ [INFO|2025-02-11 18:04:22] logging.py:157 >> {'loss': 1.9869, 'learning_rate': 1.1555e-09, 'epoch': 11.74, 'throughput': 457476.90}
480
+
481
+ [INFO|2025-02-11 18:04:33] logging.py:157 >> {'loss': 1.9897, 'learning_rate': 1.1381e-09, 'epoch': 11.74, 'throughput': 453577.26}
482
+
483
+ [INFO|2025-02-11 18:04:45] logging.py:157 >> {'loss': 1.9508, 'learning_rate': 1.1207e-09, 'epoch': 11.74, 'throughput': 449756.40}
484
+
485
+ [INFO|2025-02-11 18:04:57] logging.py:157 >> {'loss': 2.0472, 'learning_rate': 1.1036e-09, 'epoch': 11.75, 'throughput': 445933.85}
486
+
487
+ [INFO|2025-02-11 18:05:08] logging.py:157 >> {'loss': 2.0183, 'learning_rate': 1.0865e-09, 'epoch': 11.75, 'throughput': 442211.33}
488
+
489
+ [INFO|2025-02-11 18:05:20] logging.py:157 >> {'loss': 1.9619, 'learning_rate': 1.0696e-09, 'epoch': 11.75, 'throughput': 438547.56}
490
+
491
+ [INFO|2025-02-11 18:05:32] logging.py:157 >> {'loss': 1.9705, 'learning_rate': 1.0528e-09, 'epoch': 11.75, 'throughput': 434931.99}
492
+
493
+ [INFO|2025-02-11 18:05:44] logging.py:157 >> {'loss': 1.9639, 'learning_rate': 1.0362e-09, 'epoch': 11.75, 'throughput': 431369.53}
494
+
495
+ [INFO|2025-02-11 18:05:55] logging.py:157 >> {'loss': 2.0180, 'learning_rate': 1.0197e-09, 'epoch': 11.76, 'throughput': 427895.84}
496
+
497
+ [INFO|2025-02-11 18:06:07] logging.py:157 >> {'loss': 1.9725, 'learning_rate': 1.0033e-09, 'epoch': 11.76, 'throughput': 424536.52}
498
+
499
+ [INFO|2025-02-11 18:06:18] logging.py:157 >> {'loss': 1.9720, 'learning_rate': 9.8702e-10, 'epoch': 11.76, 'throughput': 421155.65}
500
+
501
+ [INFO|2025-02-11 18:06:30] logging.py:157 >> {'loss': 1.9493, 'learning_rate': 9.7090e-10, 'epoch': 11.76, 'throughput': 417872.57}
502
+
503
+ [INFO|2025-02-11 18:06:42] logging.py:157 >> {'loss': 1.9629, 'learning_rate': 9.5491e-10, 'epoch': 11.76, 'throughput': 414591.42}
504
+
505
+ [INFO|2025-02-11 18:06:53] logging.py:157 >> {'loss': 2.0116, 'learning_rate': 9.3906e-10, 'epoch': 11.77, 'throughput': 411412.15}
506
+
507
+ [INFO|2025-02-11 18:07:05] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 9.2333e-10, 'epoch': 11.77, 'throughput': 408307.17}
508
+
509
+ [INFO|2025-02-11 18:07:16] logging.py:157 >> {'loss': 2.0005, 'learning_rate': 9.0775e-10, 'epoch': 11.77, 'throughput': 405234.40}
510
+
511
+ [INFO|2025-02-11 18:07:28] logging.py:157 >> {'loss': 2.0201, 'learning_rate': 8.9229e-10, 'epoch': 11.77, 'throughput': 402202.34}
512
+
513
+ [INFO|2025-02-11 18:07:40] logging.py:157 >> {'loss': 2.0112, 'learning_rate': 8.7696e-10, 'epoch': 11.77, 'throughput': 399205.39}
514
+
515
+ [INFO|2025-02-11 18:07:51] logging.py:157 >> {'loss': 1.9830, 'learning_rate': 8.6177e-10, 'epoch': 11.78, 'throughput': 396210.92}
516
+
517
+ [INFO|2025-02-11 18:08:03] logging.py:157 >> {'loss': 1.9361, 'learning_rate': 8.4671e-10, 'epoch': 11.78, 'throughput': 393320.81}
518
+
519
+ [INFO|2025-02-11 18:08:15] logging.py:157 >> {'loss': 1.9500, 'learning_rate': 8.3179e-10, 'epoch': 11.78, 'throughput': 390469.11}
520
+
521
+ [INFO|2025-02-11 18:08:26] logging.py:157 >> {'loss': 1.9692, 'learning_rate': 8.1700e-10, 'epoch': 11.78, 'throughput': 387669.28}
522
+
523
+ [INFO|2025-02-11 18:08:38] logging.py:157 >> {'loss': 1.9561, 'learning_rate': 8.0233e-10, 'epoch': 11.78, 'throughput': 384887.83}
524
+
525
+ [INFO|2025-02-11 18:08:49] logging.py:157 >> {'loss': 1.9669, 'learning_rate': 7.8781e-10, 'epoch': 11.79, 'throughput': 382152.22}
526
+
527
+ [INFO|2025-02-11 18:09:01] logging.py:157 >> {'loss': 2.0171, 'learning_rate': 7.7341e-10, 'epoch': 11.79, 'throughput': 379451.83}
528
+
529
+ [INFO|2025-02-11 18:09:13] logging.py:157 >> {'loss': 1.9596, 'learning_rate': 7.5915e-10, 'epoch': 11.79, 'throughput': 376779.36}
530
+
531
+ [INFO|2025-02-11 18:09:24] logging.py:157 >> {'loss': 1.9839, 'learning_rate': 7.4502e-10, 'epoch': 11.79, 'throughput': 374121.06}
532
+
533
+ [INFO|2025-02-11 18:09:36] logging.py:157 >> {'loss': 1.9670, 'learning_rate': 7.3102e-10, 'epoch': 11.79, 'throughput': 371467.09}
534
+
535
+ [INFO|2025-02-11 18:09:48] logging.py:157 >> {'loss': 1.9590, 'learning_rate': 7.1715e-10, 'epoch': 11.80, 'throughput': 368908.16}
536
+
537
+ [INFO|2025-02-11 18:10:00] logging.py:157 >> {'loss': 1.9685, 'learning_rate': 7.0342e-10, 'epoch': 11.80, 'throughput': 366412.85}
538
+
539
+ [INFO|2025-02-11 18:10:11] logging.py:157 >> {'loss': 1.9602, 'learning_rate': 6.8982e-10, 'epoch': 11.80, 'throughput': 363886.09}
540
+
541
+ [INFO|2025-02-11 18:10:23] logging.py:157 >> {'loss': 1.9667, 'learning_rate': 6.7635e-10, 'epoch': 11.80, 'throughput': 361409.68}
542
+
543
+ [INFO|2025-02-11 18:10:35] logging.py:157 >> {'loss': 1.9997, 'learning_rate': 6.6302e-10, 'epoch': 11.80, 'throughput': 359002.79}
544
+
545
+ [INFO|2025-02-11 18:10:46] logging.py:157 >> {'loss': 1.9776, 'learning_rate': 6.4982e-10, 'epoch': 11.81, 'throughput': 356673.52}
546
+
547
+ [INFO|2025-02-11 18:10:58] logging.py:157 >> {'loss': 1.9370, 'learning_rate': 6.3675e-10, 'epoch': 11.81, 'throughput': 354316.74}
548
+
549
+ [INFO|2025-02-11 18:11:10] logging.py:157 >> {'loss': 1.9884, 'learning_rate': 6.2381e-10, 'epoch': 11.81, 'throughput': 351970.04}
550
+
551
+ [INFO|2025-02-11 18:11:21] logging.py:157 >> {'loss': 1.9506, 'learning_rate': 6.1101e-10, 'epoch': 11.81, 'throughput': 349703.67}
552
+
553
+ [INFO|2025-02-11 18:11:33] logging.py:157 >> {'loss': 1.9925, 'learning_rate': 5.9834e-10, 'epoch': 11.81, 'throughput': 347455.50}
554
+
555
+ [INFO|2025-02-11 18:11:33] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-600000
556
+
557
+ [INFO|2025-02-11 18:11:33] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\config.json
558
+
559
+ [INFO|2025-02-11 18:11:33] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\generation_config.json
560
+
561
+ [INFO|2025-02-11 18:11:33] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\model.safetensors
562
+
563
+ [INFO|2025-02-11 18:11:33] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\tokenizer_config.json
564
+
565
+ [INFO|2025-02-11 18:11:33] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-600000\special_tokens_map.json
566
+
567
+ [INFO|2025-02-11 18:11:45] logging.py:157 >> {'loss': 2.0176, 'learning_rate': 5.8580e-10, 'epoch': 11.82, 'throughput': 345137.06}
568
+
569
+ [INFO|2025-02-11 18:11:56] logging.py:157 >> {'loss': 1.9770, 'learning_rate': 5.7339e-10, 'epoch': 11.82, 'throughput': 342956.52}
570
+
571
+ [INFO|2025-02-11 18:12:08] logging.py:157 >> {'loss': 2.0058, 'learning_rate': 5.6112e-10, 'epoch': 11.82, 'throughput': 340798.55}
572
+
573
+ [INFO|2025-02-11 18:12:20] logging.py:157 >> {'loss': 1.9480, 'learning_rate': 5.4898e-10, 'epoch': 11.82, 'throughput': 338660.27}
574
+
575
+ [INFO|2025-02-11 18:12:31] logging.py:157 >> {'loss': 1.9882, 'learning_rate': 5.3697e-10, 'epoch': 11.82, 'throughput': 336520.05}
576
+
577
+ [INFO|2025-02-11 18:12:44] logging.py:157 >> {'loss': 1.9543, 'learning_rate': 5.2510e-10, 'epoch': 11.82, 'throughput': 334342.07}
578
+
579
+ [INFO|2025-02-11 18:12:55] logging.py:157 >> {'loss': 2.0162, 'learning_rate': 5.1336e-10, 'epoch': 11.83, 'throughput': 332223.27}
580
+
581
+ [INFO|2025-02-11 18:13:07] logging.py:157 >> {'loss': 1.9700, 'learning_rate': 5.0175e-10, 'epoch': 11.83, 'throughput': 330148.95}
582
+
583
+ [INFO|2025-02-11 18:13:19] logging.py:157 >> {'loss': 1.9556, 'learning_rate': 4.9027e-10, 'epoch': 11.83, 'throughput': 328120.14}
584
+
585
+ [INFO|2025-02-11 18:13:31] logging.py:157 >> {'loss': 1.9164, 'learning_rate': 4.7893e-10, 'epoch': 11.83, 'throughput': 326103.62}
586
+
587
+ [INFO|2025-02-11 18:13:43] logging.py:157 >> {'loss': 2.0214, 'learning_rate': 4.6771e-10, 'epoch': 11.83, 'throughput': 324137.85}
588
+
589
+ [INFO|2025-02-11 18:13:54] logging.py:157 >> {'loss': 1.9946, 'learning_rate': 4.5664e-10, 'epoch': 11.84, 'throughput': 322178.13}
590
+
591
+ [INFO|2025-02-11 18:14:06] logging.py:157 >> {'loss': 1.9869, 'learning_rate': 4.4569e-10, 'epoch': 11.84, 'throughput': 320240.19}
592
+
593
+ [INFO|2025-02-11 18:14:18] logging.py:157 >> {'loss': 1.9422, 'learning_rate': 4.3488e-10, 'epoch': 11.84, 'throughput': 318336.19}
594
+
595
+ [INFO|2025-02-11 18:14:30] logging.py:157 >> {'loss': 2.0274, 'learning_rate': 4.2420e-10, 'epoch': 11.84, 'throughput': 316426.29}
596
+
597
+ [INFO|2025-02-11 18:14:42] logging.py:157 >> {'loss': 1.9705, 'learning_rate': 4.1365e-10, 'epoch': 11.84, 'throughput': 314563.35}
598
+
599
+ [INFO|2025-02-11 18:14:53] logging.py:157 >> {'loss': 2.0292, 'learning_rate': 4.0323e-10, 'epoch': 11.85, 'throughput': 312747.74}
600
+
601
+ [INFO|2025-02-11 18:15:05] logging.py:157 >> {'loss': 1.9715, 'learning_rate': 3.9295e-10, 'epoch': 11.85, 'throughput': 310944.31}
602
+
603
+ [INFO|2025-02-11 18:15:17] logging.py:157 >> {'loss': 1.9632, 'learning_rate': 3.8280e-10, 'epoch': 11.85, 'throughput': 309151.82}
604
+
605
+ [INFO|2025-02-11 18:15:29] logging.py:157 >> {'loss': 1.9986, 'learning_rate': 3.7279e-10, 'epoch': 11.85, 'throughput': 307389.38}
606
+
607
+ [INFO|2025-02-11 18:15:40] logging.py:157 >> {'loss': 1.9875, 'learning_rate': 3.6290e-10, 'epoch': 11.85, 'throughput': 305642.32}
608
+
609
+ [INFO|2025-02-11 18:15:52] logging.py:157 >> {'loss': 1.9834, 'learning_rate': 3.5315e-10, 'epoch': 11.86, 'throughput': 303915.22}
610
+
611
+ [INFO|2025-02-11 18:16:04] logging.py:157 >> {'loss': 1.9761, 'learning_rate': 3.4353e-10, 'epoch': 11.86, 'throughput': 302186.17}
612
+
613
+ [INFO|2025-02-11 18:16:16] logging.py:157 >> {'loss': 1.9958, 'learning_rate': 3.3405e-10, 'epoch': 11.86, 'throughput': 300490.49}
614
+
615
+ [INFO|2025-02-11 18:16:27] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 3.2469e-10, 'epoch': 11.86, 'throughput': 298820.94}
616
+
617
+ [INFO|2025-02-11 18:16:39] logging.py:157 >> {'loss': 1.9959, 'learning_rate': 3.1547e-10, 'epoch': 11.86, 'throughput': 297188.72}
618
+
619
+ [INFO|2025-02-11 18:16:51] logging.py:157 >> {'loss': 2.0291, 'learning_rate': 3.0639e-10, 'epoch': 11.87, 'throughput': 295566.17}
620
+
621
+ [INFO|2025-02-11 18:17:02] logging.py:157 >> {'loss': 1.9732, 'learning_rate': 2.9743e-10, 'epoch': 11.87, 'throughput': 293965.39}
622
+
623
+ [INFO|2025-02-11 18:17:14] logging.py:157 >> {'loss': 1.9883, 'learning_rate': 2.8861e-10, 'epoch': 11.87, 'throughput': 292397.39}
624
+
625
+ [INFO|2025-02-11 18:17:25] logging.py:157 >> {'loss': 2.0485, 'learning_rate': 2.7992e-10, 'epoch': 11.87, 'throughput': 290839.54}
626
+
627
+ [INFO|2025-02-11 18:17:37] logging.py:157 >> {'loss': 1.9821, 'learning_rate': 2.7136e-10, 'epoch': 11.87, 'throughput': 289296.97}
628
+
629
+ [INFO|2025-02-11 18:17:49] logging.py:157 >> {'loss': 1.9260, 'learning_rate': 2.6294e-10, 'epoch': 11.88, 'throughput': 287758.88}
630
+
631
+ [INFO|2025-02-11 18:18:00] logging.py:157 >> {'loss': 1.9753, 'learning_rate': 2.5465e-10, 'epoch': 11.88, 'throughput': 286236.56}
632
+
633
+ [INFO|2025-02-11 18:18:12] logging.py:157 >> {'loss': 1.9688, 'learning_rate': 2.4649e-10, 'epoch': 11.88, 'throughput': 284738.27}
634
+
635
+ [INFO|2025-02-11 18:18:24] logging.py:157 >> {'loss': 2.0239, 'learning_rate': 2.3847e-10, 'epoch': 11.88, 'throughput': 283255.23}
636
+
637
+ [INFO|2025-02-11 18:18:35] logging.py:157 >> {'loss': 1.9905, 'learning_rate': 2.3057e-10, 'epoch': 11.88, 'throughput': 281788.13}
638
+
639
+ [INFO|2025-02-11 18:18:47] logging.py:157 >> {'loss': 1.9786, 'learning_rate': 2.2281e-10, 'epoch': 11.89, 'throughput': 280336.59}
640
+
641
+ [INFO|2025-02-11 18:18:59] logging.py:157 >> {'loss': 1.9951, 'learning_rate': 2.1519e-10, 'epoch': 11.89, 'throughput': 278902.58}
642
+
643
+ [INFO|2025-02-11 18:19:10] logging.py:157 >> {'loss': 1.9598, 'learning_rate': 2.0769e-10, 'epoch': 11.89, 'throughput': 277472.95}
644
+
645
+ [INFO|2025-02-11 18:19:22] logging.py:157 >> {'loss': 2.0105, 'learning_rate': 2.0033e-10, 'epoch': 11.89, 'throughput': 276052.37}
646
+
647
+ [INFO|2025-02-11 18:19:34] logging.py:157 >> {'loss': 1.9820, 'learning_rate': 1.9310e-10, 'epoch': 11.89, 'throughput': 274651.96}
648
+
649
+ [INFO|2025-02-11 18:19:46] logging.py:157 >> {'loss': 1.9581, 'learning_rate': 1.8601e-10, 'epoch': 11.90, 'throughput': 273266.57}
650
+
651
+ [INFO|2025-02-11 18:19:57] logging.py:157 >> {'loss': 1.9834, 'learning_rate': 1.7904e-10, 'epoch': 11.90, 'throughput': 271897.28}
652
+
653
+ [INFO|2025-02-11 18:20:09] logging.py:157 >> {'loss': 2.0045, 'learning_rate': 1.7221e-10, 'epoch': 11.90, 'throughput': 270528.26}
654
+
655
+ [INFO|2025-02-11 18:20:21] logging.py:157 >> {'loss': 1.9594, 'learning_rate': 1.6552e-10, 'epoch': 11.90, 'throughput': 269176.81}
656
+
657
+ [INFO|2025-02-11 18:20:33] logging.py:157 >> {'loss': 1.9738, 'learning_rate': 1.5895e-10, 'epoch': 11.90, 'throughput': 267831.20}
658
+
659
+ [INFO|2025-02-11 18:20:45] logging.py:157 >> {'loss': 2.0208, 'learning_rate': 1.5252e-10, 'epoch': 11.91, 'throughput': 266498.59}
660
+
661
+ [INFO|2025-02-11 18:20:57] logging.py:157 >> {'loss': 1.9997, 'learning_rate': 1.4622e-10, 'epoch': 11.91, 'throughput': 265185.54}
662
+
663
+ [INFO|2025-02-11 18:21:09] logging.py:157 >> {'loss': 2.0192, 'learning_rate': 1.4006e-10, 'epoch': 11.91, 'throughput': 263875.74}
664
+
665
+ [INFO|2025-02-11 18:21:21] logging.py:157 >> {'loss': 2.0046, 'learning_rate': 1.3402e-10, 'epoch': 11.91, 'throughput': 262555.68}
666
+
667
+ [INFO|2025-02-11 18:21:21] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-605000
668
+
669
+ [INFO|2025-02-11 18:21:21] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\config.json
670
+
671
+ [INFO|2025-02-11 18:21:21] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\generation_config.json
672
+
673
+ [INFO|2025-02-11 18:21:21] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\model.safetensors
674
+
675
+ [INFO|2025-02-11 18:21:21] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\tokenizer_config.json
676
+
677
+ [INFO|2025-02-11 18:21:21] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-605000\special_tokens_map.json
678
+
679
+ [INFO|2025-02-11 18:21:33] logging.py:157 >> {'loss': 1.9669, 'learning_rate': 1.2812e-10, 'epoch': 11.91, 'throughput': 261169.48}
680
+
681
+ [INFO|2025-02-11 18:21:45] logging.py:157 >> {'loss': 1.9956, 'learning_rate': 1.2235e-10, 'epoch': 11.92, 'throughput': 259898.90}
682
+
683
+ [INFO|2025-02-11 18:21:57] logging.py:157 >> {'loss': 2.0026, 'learning_rate': 1.1672e-10, 'epoch': 11.92, 'throughput': 258652.69}
684
+
685
+ [INFO|2025-02-11 18:22:09] logging.py:157 >> {'loss': 1.9802, 'learning_rate': 1.1122e-10, 'epoch': 11.92, 'throughput': 257444.84}
686
+
687
+ [INFO|2025-02-11 18:22:20] logging.py:157 >> {'loss': 1.9955, 'learning_rate': 1.0585e-10, 'epoch': 11.92, 'throughput': 256244.85}
688
+
689
+ [INFO|2025-02-11 18:22:32] logging.py:157 >> {'loss': 2.0080, 'learning_rate': 1.0061e-10, 'epoch': 11.92, 'throughput': 255055.64}
690
+
691
+ [INFO|2025-02-11 18:22:44] logging.py:157 >> {'loss': 2.0313, 'learning_rate': 9.5508e-11, 'epoch': 11.93, 'throughput': 253878.46}
692
+
693
+ [INFO|2025-02-11 18:22:55] logging.py:157 >> {'loss': 2.0100, 'learning_rate': 9.0537e-11, 'epoch': 11.93, 'throughput': 252726.02}
694
+
695
+ [INFO|2025-02-11 18:23:07] logging.py:157 >> {'loss': 2.0303, 'learning_rate': 8.5699e-11, 'epoch': 11.93, 'throughput': 251573.33}
696
+
697
+ [INFO|2025-02-11 18:23:18] logging.py:157 >> {'loss': 1.9680, 'learning_rate': 8.0994e-11, 'epoch': 11.93, 'throughput': 250425.41}
698
+
699
+ [INFO|2025-02-11 18:23:30] logging.py:157 >> {'loss': 2.0075, 'learning_rate': 7.6422e-11, 'epoch': 11.93, 'throughput': 249294.78}
700
+
701
+ [INFO|2025-02-11 18:23:42] logging.py:157 >> {'loss': 1.9790, 'learning_rate': 7.1983e-11, 'epoch': 11.94, 'throughput': 248167.19}
702
+
703
+ [INFO|2025-02-11 18:23:53] logging.py:157 >> {'loss': 2.0086, 'learning_rate': 6.7676e-11, 'epoch': 11.94, 'throughput': 247047.41}
704
+
705
+ [INFO|2025-02-11 18:24:05] logging.py:157 >> {'loss': 1.9897, 'learning_rate': 6.3502e-11, 'epoch': 11.94, 'throughput': 245938.79}
706
+
707
+ [INFO|2025-02-11 18:24:17] logging.py:157 >> {'loss': 2.0329, 'learning_rate': 5.9461e-11, 'epoch': 11.94, 'throughput': 244853.80}
708
+
709
+ [INFO|2025-02-11 18:24:28] logging.py:157 >> {'loss': 1.9813, 'learning_rate': 5.5553e-11, 'epoch': 11.94, 'throughput': 243763.46}
710
+
711
+ [INFO|2025-02-11 18:24:40] logging.py:157 >> {'loss': 1.9675, 'learning_rate': 5.1778e-11, 'epoch': 11.95, 'throughput': 242684.81}
712
+
713
+ [INFO|2025-02-11 18:24:52] logging.py:157 >> {'loss': 2.0206, 'learning_rate': 4.8135e-11, 'epoch': 11.95, 'throughput': 241619.87}
714
+
715
+ [INFO|2025-02-11 18:25:03] logging.py:157 >> {'loss': 1.9695, 'learning_rate': 4.4625e-11, 'epoch': 11.95, 'throughput': 240551.71}
716
+
717
+ [INFO|2025-02-11 18:25:15] logging.py:157 >> {'loss': 1.9561, 'learning_rate': 4.1248e-11, 'epoch': 11.95, 'throughput': 239500.87}
718
+
719
+ [INFO|2025-02-11 18:25:27] logging.py:157 >> {'loss': 1.9766, 'learning_rate': 3.8004e-11, 'epoch': 11.95, 'throughput': 238446.21}
720
+
721
+ [INFO|2025-02-11 18:25:39] logging.py:157 >> {'loss': 2.0330, 'learning_rate': 3.4893e-11, 'epoch': 11.95, 'throughput': 237392.63}
722
+
723
+ [INFO|2025-02-11 18:25:51] logging.py:157 >> {'loss': 1.9738, 'learning_rate': 3.1915e-11, 'epoch': 11.96, 'throughput': 236352.43}
724
+
725
+ [INFO|2025-02-11 18:26:03] logging.py:157 >> {'loss': 2.0233, 'learning_rate': 2.9069e-11, 'epoch': 11.96, 'throughput': 235318.62}
726
+
727
+ [INFO|2025-02-11 18:26:14] logging.py:157 >> {'loss': 1.9637, 'learning_rate': 2.6357e-11, 'epoch': 11.96, 'throughput': 234313.34}
728
+
729
+ [INFO|2025-02-11 18:26:26] logging.py:157 >> {'loss': 1.9714, 'learning_rate': 2.3777e-11, 'epoch': 11.96, 'throughput': 233322.02}
730
+
731
+ [INFO|2025-02-11 18:26:38] logging.py:157 >> {'loss': 1.9451, 'learning_rate': 2.1330e-11, 'epoch': 11.96, 'throughput': 232339.87}
732
+
733
+ [INFO|2025-02-11 18:26:49] logging.py:157 >> {'loss': 2.0128, 'learning_rate': 1.9016e-11, 'epoch': 11.97, 'throughput': 231362.97}
734
+
735
+ [INFO|2025-02-11 18:27:01] logging.py:157 >> {'loss': 1.9903, 'learning_rate': 1.6835e-11, 'epoch': 11.97, 'throughput': 230391.77}
736
+
737
+ [INFO|2025-02-11 18:27:13] logging.py:157 >> {'loss': 2.0050, 'learning_rate': 1.4786e-11, 'epoch': 11.97, 'throughput': 229432.14}
738
+
739
+ [INFO|2025-02-11 18:27:24] logging.py:157 >> {'loss': 1.9584, 'learning_rate': 1.2870e-11, 'epoch': 11.97, 'throughput': 228474.82}
740
+
741
+ [INFO|2025-02-11 18:27:36] logging.py:157 >> {'loss': 1.9683, 'learning_rate': 1.1088e-11, 'epoch': 11.97, 'throughput': 227535.44}
742
+
743
+ [INFO|2025-02-11 18:27:48] logging.py:157 >> {'loss': 1.9758, 'learning_rate': 9.4378e-12, 'epoch': 11.98, 'throughput': 226600.82}
744
+
745
+ [INFO|2025-02-11 18:27:59] logging.py:157 >> {'loss': 2.0262, 'learning_rate': 7.9207e-12, 'epoch': 11.98, 'throughput': 225680.27}
746
+
747
+ [INFO|2025-02-11 18:28:11] logging.py:157 >> {'loss': 1.9921, 'learning_rate': 6.5364e-12, 'epoch': 11.98, 'throughput': 224757.71}
748
+
749
+ [INFO|2025-02-11 18:28:23] logging.py:157 >> {'loss': 1.9910, 'learning_rate': 5.2850e-12, 'epoch': 11.98, 'throughput': 223841.89}
750
+
751
+ [INFO|2025-02-11 18:28:35] logging.py:157 >> {'loss': 1.9481, 'learning_rate': 4.1665e-12, 'epoch': 11.98, 'throughput': 222924.90}
752
+
753
+ [INFO|2025-02-11 18:28:46] logging.py:157 >> {'loss': 1.9444, 'learning_rate': 3.1808e-12, 'epoch': 11.99, 'throughput': 222033.51}
754
+
755
+ [INFO|2025-02-11 18:28:58] logging.py:157 >> {'loss': 1.9570, 'learning_rate': 2.3279e-12, 'epoch': 11.99, 'throughput': 221145.11}
756
+
757
+ [INFO|2025-02-11 18:29:10] logging.py:157 >> {'loss': 2.0438, 'learning_rate': 1.6079e-12, 'epoch': 11.99, 'throughput': 220258.60}
758
+
759
+ [INFO|2025-02-11 18:29:22] logging.py:157 >> {'loss': 1.9305, 'learning_rate': 1.0207e-12, 'epoch': 11.99, 'throughput': 219369.78}
760
+
761
+ [INFO|2025-02-11 18:29:33] logging.py:157 >> {'loss': 1.9854, 'learning_rate': 5.6635e-13, 'epoch': 11.99, 'throughput': 218488.66}
762
+
763
+ [INFO|2025-02-11 18:29:45] logging.py:157 >> {'loss': 1.9765, 'learning_rate': 2.4486e-13, 'epoch': 12.00, 'throughput': 217621.92}
764
+
765
+ [INFO|2025-02-11 18:29:57] logging.py:157 >> {'loss': 1.9388, 'learning_rate': 5.6220e-14, 'epoch': 12.00, 'throughput': 216764.99}
766
+
767
+ [INFO|2025-02-11 18:30:08] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025\checkpoint-609492
768
+
769
+ [INFO|2025-02-11 18:30:08] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\config.json
770
+
771
+ [INFO|2025-02-11 18:30:08] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\generation_config.json
772
+
773
+ [INFO|2025-02-11 18:30:08] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\model.safetensors
774
+
775
+ [INFO|2025-02-11 18:30:08] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\tokenizer_config.json
776
+
777
+ [INFO|2025-02-11 18:30:08] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\checkpoint-609492\special_tokens_map.json
778
+
779
+ [INFO|2025-02-11 18:30:08] trainer.py:2643 >>
780
+
781
+ Training completed. Do not forget to share your model on huggingface.co/models =)
782
+
783
+
784
+
785
+ [INFO|2025-02-11 18:30:08] trainer.py:3910 >> Saving model checkpoint to saves\GPT-2-Small\full\10-02-2025
786
+
787
+ [INFO|2025-02-11 18:30:08] configuration_utils.py:420 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\config.json
788
+
789
+ [INFO|2025-02-11 18:30:08] configuration_utils.py:909 >> Configuration saved in saves\GPT-2-Small\full\10-02-2025\generation_config.json
790
+
791
+ [INFO|2025-02-11 18:30:09] modeling_utils.py:2988 >> Model weights saved in saves\GPT-2-Small\full\10-02-2025\model.safetensors
792
+
793
+ [INFO|2025-02-11 18:30:09] tokenization_utils_base.py:2491 >> tokenizer config file saved in saves\GPT-2-Small\full\10-02-2025\tokenizer_config.json
794
+
795
+ [INFO|2025-02-11 18:30:09] tokenization_utils_base.py:2500 >> Special tokens file saved in saves\GPT-2-Small\full\10-02-2025\special_tokens_map.json
796
+
797
+ [WARNING|2025-02-11 18:30:09] logging.py:162 >> No metric eval_loss to plot.
798
+
799
+ [WARNING|2025-02-11 18:30:09] logging.py:162 >> No metric eval_accuracy to plot.
800
+
801
+ [INFO|2025-02-11 18:30:09] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
802
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
803
+
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "chat_template": "{% set system_message = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + content + '\n\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|endoftext|>' + '\n\n' }}{% endif %}{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": {},
+   "model_max_length": 1024,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
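
The chat_template above is the Alpaca-style prompt format (matching template: alpaca in the training configs): a system preamble, then "### Instruction:" / "### Response:" blocks, with <|endoftext|> closing each assistant turn. A small rendering sketch, under the same repo-id assumption as earlier:

# Sketch: render a conversation with the Alpaca-style chat_template from tokenizer_config.json.
# The repo id is assumed; point it at the repository that actually holds these files.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("XeTute/Phantasor_V0.2-137M")
messages = [{"role": "user", "content": "Write a short story about a lighthouse keeper."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)  # system preamble, then "### Instruction:\n...\n\n### Response:\n"
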
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 12.0,
+   "num_input_tokens_seen": 620593344,
+   "total_flos": 3.16711124803584e+17,
+   "train_loss": 0.07982336108915004,
+   "train_runtime": 2874.0318,
+   "train_samples_per_second": 212.069,
+   "train_steps_per_second": 212.069
+ }
trainer_log.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff9a7d93fc153a1403128fcff75f60e26e16253da1fa185ac9859426dec960b5
+ size 5688
training_args.yaml ADDED
@@ -0,0 +1,29 @@
+ bf16: true
+ cutoff_len: 1024
+ dataset: XeTute/SStory-Gen-EN_ZH,MatanP/emotion_mapped_story_dataset,webnovel,jaydenccc/AI_Storyteller_Dataset
+ dataset_dir: data
+ ddp_timeout: 180000000
+ do_train: true
+ finetuning_type: full
+ flash_attn: auto
+ gradient_accumulation_steps: 1
+ include_num_input_tokens_seen: true
+ learning_rate: 1.0e-06
+ logging_steps: 100
+ lr_scheduler_type: cosine
+ max_grad_norm: 1.0
+ max_samples: 1000000000
+ model_name_or_path: XeTute/Phantasor_V0.2-137M
+ num_train_epochs: 12.0
+ optim: sgd
+ output_dir: saves\GPT-2-Small\full\10-02-2025
+ packing: false
+ per_device_train_batch_size: 1
+ plot_loss: true
+ preprocessing_num_workers: 16
+ report_to: none
+ save_steps: 5000
+ stage: sft
+ template: alpaca
+ trust_remote_code: true
+ warmup_steps: 10
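
training_args.yaml captures the effective LLaMA-Factory run: full-parameter SFT with plain SGD, learning rate 1e-6 on a cosine schedule, bf16, batch size 1 with no gradient accumulation, and a 1,024-token cutoff. A small arithmetic check, taking the 50,791-example count from running_log.txt above, reproduces the step total reported there:

# Reproduce the step count in running_log.txt from the hyperparameters in training_args.yaml.
num_examples = 50_791   # "Num examples = 50,791" in running_log.txt
epochs = 12             # num_train_epochs
effective_batch = 1     # per_device_train_batch_size * gradient_accumulation_steps
print(num_examples * epochs // effective_batch)  # 609492, matching "Total optimization steps = 609,492"
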
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff