Amala3 committed on
Commit
3da4756
1 Parent(s): 3b80261

End of training

Browse files
Files changed (5) hide show
  1. README.md +37 -52
  2. config.json +48 -24
  3. generation_config.json +8 -4
  4. model.safetensors +2 -2
  5. training_args.bin +2 -2
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- license: apache-2.0
3
- base_model: google/byt5-small
4
  tags:
5
  - generated_from_trainer
6
  model-index:
@@ -13,9 +13,9 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # models
15
 
16
- This model is a fine-tuned version of [google/byt5-small](https://huggingface.co/google/byt5-small) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.1009
19
 
20
  ## Model description
21
 
@@ -35,8 +35,8 @@ More information needed
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 2e-05
38
- - train_batch_size: 2
39
- - eval_batch_size: 2
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
@@ -46,54 +46,39 @@ The following hyperparameters were used during training:
46
 
47
  | Training Loss | Epoch | Step | Validation Loss |
48
  |:-------------:|:------:|:-----:|:---------------:|
49
- | 0.9002 | 0.0175 | 500 | 0.2578 |
50
- | 0.311 | 0.0349 | 1000 | 0.1760 |
51
- | 0.2391 | 0.0524 | 1500 | 0.1591 |
52
- | 0.2133 | 0.0699 | 2000 | 0.1568 |
53
- | 0.1813 | 0.0874 | 2500 | 0.1529 |
54
- | 0.1908 | 0.1048 | 3000 | 0.1535 |
55
- | 0.1625 | 0.1223 | 3500 | 0.1467 |
56
- | 0.154 | 0.1398 | 4000 | 0.1416 |
57
- | 0.1596 | 0.1573 | 4500 | 0.1410 |
58
- | 0.1528 | 0.1747 | 5000 | 0.1347 |
59
- | 0.1541 | 0.1922 | 5500 | 0.1328 |
60
- | 0.1428 | 0.2097 | 6000 | 0.1303 |
61
- | 0.1438 | 0.2272 | 6500 | 0.1330 |
62
- | 0.1594 | 0.2446 | 7000 | 0.1211 |
63
- | 0.1404 | 0.2621 | 7500 | 0.1230 |
64
- | 0.1466 | 0.2796 | 8000 | 0.1195 |
65
- | 0.1448 | 0.2971 | 8500 | 0.1159 |
66
- | 0.1289 | 0.3145 | 9000 | 0.1199 |
67
- | 0.119 | 0.3320 | 9500 | 0.1182 |
68
- | 0.1333 | 0.3495 | 10000 | 0.1117 |
69
- | 0.1334 | 0.3670 | 10500 | 0.1151 |
70
- | 0.1321 | 0.3844 | 11000 | 0.1156 |
71
- | 0.1259 | 0.4019 | 11500 | 0.1107 |
72
- | 0.1269 | 0.4194 | 12000 | 0.1129 |
73
- | 0.1343 | 0.4369 | 12500 | 0.1068 |
74
- | 0.1269 | 0.4543 | 13000 | 0.1064 |
75
- | 0.1173 | 0.4718 | 13500 | 0.1103 |
76
- | 0.1146 | 0.4893 | 14000 | 0.1086 |
77
- | 0.1362 | 0.5068 | 14500 | 0.1047 |
78
- | 0.1252 | 0.5242 | 15000 | 0.1078 |
79
- | 0.1213 | 0.5417 | 15500 | 0.1085 |
80
- | 0.1314 | 0.5592 | 16000 | 0.1047 |
81
- | 0.114 | 0.5767 | 16500 | 0.1045 |
82
- | 0.1172 | 0.5941 | 17000 | 0.1044 |
83
- | 0.1034 | 0.6116 | 17500 | 0.1076 |
84
- | 0.1232 | 0.6291 | 18000 | 0.1030 |
85
- | 0.1261 | 0.6466 | 18500 | 0.1024 |
86
- | 0.123 | 0.6640 | 19000 | 0.1007 |
87
- | 0.1052 | 0.6815 | 19500 | 0.1026 |
88
- | 0.1194 | 0.6990 | 20000 | 0.1016 |
89
- | 0.1136 | 0.7165 | 20500 | 0.1015 |
90
- | 0.1115 | 0.7339 | 21000 | 0.1058 |
91
- | 0.1191 | 0.7514 | 21500 | 0.1009 |
92
 
93
 
94
  ### Framework versions
95
 
96
- - Transformers 4.43.4
97
- - Pytorch 2.2.0+cu121
98
- - Datasets 2.17.1
99
  - Tokenizers 0.19.1
 
1
  ---
2
+ license: mit
3
+ base_model: facebook/mbart-large-50
4
  tags:
5
  - generated_from_trainer
6
  model-index:
 
13
 
14
  # models
15
 
16
+ This model is a fine-tuned version of [facebook/mbart-large-50](https://huggingface.co/facebook/mbart-large-50) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0060
19
 
20
  ## Model description
21
 
 
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 2e-05
38
+ - train_batch_size: 8
39
+ - eval_batch_size: 8
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
 
46
 
47
  | Training Loss | Epoch | Step | Validation Loss |
48
  |:-------------:|:------:|:-----:|:---------------:|
49
+ | 1.4786 | 0.0699 | 500 | 0.0108 |
50
+ | 0.0542 | 0.1398 | 1000 | 0.0094 |
51
+ | 0.01 | 0.2097 | 1500 | 0.0086 |
52
+ | 0.0095 | 0.2796 | 2000 | 0.0080 |
53
+ | 0.0082 | 0.3495 | 2500 | 0.0076 |
54
+ | 0.0078 | 0.4193 | 3000 | 0.0075 |
55
+ | 0.0081 | 0.4892 | 3500 | 0.0074 |
56
+ | 0.0083 | 0.5591 | 4000 | 0.0071 |
57
+ | 0.0071 | 0.6290 | 4500 | 0.0070 |
58
+ | 0.0074 | 0.6989 | 5000 | 0.0069 |
59
+ | 0.0075 | 0.7688 | 5500 | 0.0067 |
60
+ | 0.0073 | 0.8387 | 6000 | 0.0066 |
61
+ | 0.007 | 0.9086 | 6500 | 0.0065 |
62
+ | 0.0075 | 0.9785 | 7000 | 0.0064 |
63
+ | 0.006 | 1.0484 | 7500 | 0.0063 |
64
+ | 0.0181 | 1.1183 | 8000 | 0.0077 |
65
+ | 0.0066 | 1.1881 | 8500 | 0.0064 |
66
+ | 0.0067 | 1.2580 | 9000 | 0.0107 |
67
+ | 0.0087 | 1.3279 | 9500 | 0.0091 |
68
+ | 0.0065 | 1.3978 | 10000 | 0.0062 |
69
+ | 0.0054 | 1.4677 | 10500 | 0.0061 |
70
+ | 0.0055 | 1.5376 | 11000 | 0.0061 |
71
+ | 0.0053 | 1.6075 | 11500 | 0.0060 |
72
+ | 0.0052 | 1.6774 | 12000 | 0.0060 |
73
+ | 0.0051 | 1.7473 | 12500 | 0.0060 |
74
+ | 0.0051 | 1.8172 | 13000 | 0.0060 |
75
+ | 0.0063 | 1.8871 | 13500 | 0.0063 |
76
+ | 0.0054 | 1.9569 | 14000 | 0.0060 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
 
79
  ### Framework versions
80
 
81
+ - Transformers 4.40.2
82
+ - Pytorch 2.1.2
83
+ - Datasets 2.18.0
84
  - Tokenizers 0.19.1
config.json CHANGED
@@ -1,34 +1,58 @@
1
  {
2
- "_name_or_path": "google/byt5-small",
 
 
 
 
 
3
  "architectures": [
4
- "T5ForConditionalGeneration"
5
  ],
 
 
 
6
  "classifier_dropout": 0.0,
7
- "d_ff": 3584,
8
- "d_kv": 64,
9
- "d_model": 1472,
10
- "decoder_start_token_id": 0,
11
- "dense_act_fn": "gelu_new",
12
- "dropout_rate": 0.1,
13
- "eos_token_id": 1,
14
- "feed_forward_proj": "gated-gelu",
 
 
 
 
 
 
15
  "gradient_checkpointing": false,
16
- "initializer_factor": 1.0,
 
 
 
 
 
17
  "is_encoder_decoder": true,
18
- "is_gated_act": true,
19
- "layer_norm_epsilon": 1e-06,
 
 
 
20
  "max_length": 1024,
21
- "model_type": "t5",
22
- "num_decoder_layers": 4,
23
- "num_heads": 6,
24
- "num_layers": 12,
25
- "pad_token_id": 0,
26
- "relative_attention_max_distance": 128,
27
- "relative_attention_num_buckets": 32,
28
- "tie_word_embeddings": false,
29
- "tokenizer_class": "ByT5Tokenizer",
 
 
30
  "torch_dtype": "float32",
31
- "transformers_version": "4.43.4",
32
  "use_cache": true,
33
- "vocab_size": 384
34
  }
 
1
  {
2
+ "_name_or_path": "facebook/mbart-large-50",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "gelu",
6
+ "add_bias_logits": false,
7
+ "add_final_layer_norm": true,
8
  "architectures": [
9
+ "MBartForConditionalGeneration"
10
  ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 0,
13
+ "classif_dropout": 0.0,
14
  "classifier_dropout": 0.0,
15
+ "d_model": 1024,
16
+ "decoder_attention_heads": 16,
17
+ "decoder_ffn_dim": 4096,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 12,
20
+ "decoder_start_token_id": 2,
21
+ "dropout": 0.1,
22
+ "early_stopping": true,
23
+ "encoder_attention_heads": 16,
24
+ "encoder_ffn_dim": 4096,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 12,
27
+ "eos_token_id": 2,
28
+ "forced_eos_token_id": 2,
29
  "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
  "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
  "max_length": 1024,
43
+ "max_position_embeddings": 1024,
44
+ "model_type": "mbart",
45
+ "normalize_before": true,
46
+ "normalize_embedding": true,
47
+ "num_beams": 5,
48
+ "num_hidden_layers": 12,
49
+ "output_past": true,
50
+ "pad_token_id": 1,
51
+ "scale_embedding": true,
52
+ "static_position_embeddings": false,
53
+ "tokenizer_class": "MBart50Tokenizer",
54
  "torch_dtype": "float32",
55
+ "transformers_version": "4.40.2",
56
  "use_cache": true,
57
+ "vocab_size": 250054
58
  }
generation_config.json CHANGED
@@ -1,8 +1,12 @@
1
  {
2
- "decoder_start_token_id": 0,
 
 
3
  "early_stopping": true,
4
- "eos_token_id": 1,
 
 
5
  "num_beams": 5,
6
- "pad_token": 0,
7
- "transformers_version": "4.43.4"
8
  }
 
1
  {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
  "early_stopping": true,
6
+ "eos_token_id": 2,
7
+ "forced_eos_token_id": 2,
8
+ "max_length": 200,
9
  "num_beams": 5,
10
+ "pad_token_id": 1,
11
+ "transformers_version": "4.40.2"
12
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a01400a4aff0de2efdbbc3690e6ba19bc5d348e456d55bd45a3b8af7ae66bb44
3
- size 1198571496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c272579e2bbd55aa3e3098c58166a8e3c5044f70bc6e4a02431666594892c6e
3
+ size 2444578688
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e593e7f842bd43dea021f2fd86db6af706a40f154f670ca2889481072cbad2ff
3
- size 7032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:587d6f3cf0717982586d9acf794092e95552916715a71880f17300c86e164e80
3
+ size 5112