Amala3 committed on
Commit
3da4756
1 Parent(s): 3b80261

End of training

Browse files
Files changed (5) hide show
  1. README.md +37 -52
  2. config.json +48 -24
  3. generation_config.json +8 -4
  4. model.safetensors +2 -2
  5. training_args.bin +2 -2
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- license: apache-2.0
3
- base_model: google/byt5-small
4
  tags:
5
  - generated_from_trainer
6
  model-index:
@@ -13,9 +13,9 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # models
15
 
16
- This model is a fine-tuned version of [google/byt5-small](https://huggingface.co/google/byt5-small) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.1009
19
 
20
  ## Model description
21
 
@@ -35,8 +35,8 @@ More information needed
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 2e-05
38
- - train_batch_size: 2
39
- - eval_batch_size: 2
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
@@ -46,54 +46,39 @@ The following hyperparameters were used during training:
46
 
47
  | Training Loss | Epoch | Step | Validation Loss |
48
  |:-------------:|:------:|:-----:|:---------------:|
49
- | 0.9002 | 0.0175 | 500 | 0.2578 |
50
- | 0.311 | 0.0349 | 1000 | 0.1760 |
51
- | 0.2391 | 0.0524 | 1500 | 0.1591 |
52
- | 0.2133 | 0.0699 | 2000 | 0.1568 |
53
- | 0.1813 | 0.0874 | 2500 | 0.1529 |
54
- | 0.1908 | 0.1048 | 3000 | 0.1535 |
55
- | 0.1625 | 0.1223 | 3500 | 0.1467 |
56
- | 0.154 | 0.1398 | 4000 | 0.1416 |
57
- | 0.1596 | 0.1573 | 4500 | 0.1410 |
58
- | 0.1528 | 0.1747 | 5000 | 0.1347 |
59
- | 0.1541 | 0.1922 | 5500 | 0.1328 |
60
- | 0.1428 | 0.2097 | 6000 | 0.1303 |
61
- | 0.1438 | 0.2272 | 6500 | 0.1330 |
62
- | 0.1594 | 0.2446 | 7000 | 0.1211 |
63
- | 0.1404 | 0.2621 | 7500 | 0.1230 |
64
- | 0.1466 | 0.2796 | 8000 | 0.1195 |
65
- | 0.1448 | 0.2971 | 8500 | 0.1159 |
66
- | 0.1289 | 0.3145 | 9000 | 0.1199 |
67
- | 0.119 | 0.3320 | 9500 | 0.1182 |
68
- | 0.1333 | 0.3495 | 10000 | 0.1117 |
69
- | 0.1334 | 0.3670 | 10500 | 0.1151 |
70
- | 0.1321 | 0.3844 | 11000 | 0.1156 |
71
- | 0.1259 | 0.4019 | 11500 | 0.1107 |
72
- | 0.1269 | 0.4194 | 12000 | 0.1129 |
73
- | 0.1343 | 0.4369 | 12500 | 0.1068 |
74
- | 0.1269 | 0.4543 | 13000 | 0.1064 |
75
- | 0.1173 | 0.4718 | 13500 | 0.1103 |
76
- | 0.1146 | 0.4893 | 14000 | 0.1086 |
77
- | 0.1362 | 0.5068 | 14500 | 0.1047 |
78
- | 0.1252 | 0.5242 | 15000 | 0.1078 |
79
- | 0.1213 | 0.5417 | 15500 | 0.1085 |
80
- | 0.1314 | 0.5592 | 16000 | 0.1047 |
81
- | 0.114 | 0.5767 | 16500 | 0.1045 |
82
- | 0.1172 | 0.5941 | 17000 | 0.1044 |
83
- | 0.1034 | 0.6116 | 17500 | 0.1076 |
84
- | 0.1232 | 0.6291 | 18000 | 0.1030 |
85
- | 0.1261 | 0.6466 | 18500 | 0.1024 |
86
- | 0.123 | 0.6640 | 19000 | 0.1007 |
87
- | 0.1052 | 0.6815 | 19500 | 0.1026 |
88
- | 0.1194 | 0.6990 | 20000 | 0.1016 |
89
- | 0.1136 | 0.7165 | 20500 | 0.1015 |
90
- | 0.1115 | 0.7339 | 21000 | 0.1058 |
91
- | 0.1191 | 0.7514 | 21500 | 0.1009 |
92
 
93
 
94
  ### Framework versions
95
 
96
- - Transformers 4.43.4
97
- - Pytorch 2.2.0+cu121
98
- - Datasets 2.17.1
99
  - Tokenizers 0.19.1
 
1
  ---
2
+ license: mit
3
+ base_model: facebook/mbart-large-50
4
  tags:
5
  - generated_from_trainer
6
  model-index:
 
13
 
14
  # models
15
 
16
+ This model is a fine-tuned version of [facebook/mbart-large-50](https://huggingface.co/facebook/mbart-large-50) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0060
19
 
20
  ## Model description
21
 
 
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 2e-05
38
+ - train_batch_size: 8
39
+ - eval_batch_size: 8
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
 
46
 
47
  | Training Loss | Epoch | Step | Validation Loss |
48
  |:-------------:|:------:|:-----:|:---------------:|
49
+ | 1.4786 | 0.0699 | 500 | 0.0108 |
50
+ | 0.0542 | 0.1398 | 1000 | 0.0094 |
51
+ | 0.01 | 0.2097 | 1500 | 0.0086 |
52
+ | 0.0095 | 0.2796 | 2000 | 0.0080 |
53
+ | 0.0082 | 0.3495 | 2500 | 0.0076 |
54
+ | 0.0078 | 0.4193 | 3000 | 0.0075 |
55
+ | 0.0081 | 0.4892 | 3500 | 0.0074 |
56
+ | 0.0083 | 0.5591 | 4000 | 0.0071 |
57
+ | 0.0071 | 0.6290 | 4500 | 0.0070 |
58
+ | 0.0074 | 0.6989 | 5000 | 0.0069 |
59
+ | 0.0075 | 0.7688 | 5500 | 0.0067 |
60
+ | 0.0073 | 0.8387 | 6000 | 0.0066 |
61
+ | 0.007 | 0.9086 | 6500 | 0.0065 |
62
+ | 0.0075 | 0.9785 | 7000 | 0.0064 |
63
+ | 0.006 | 1.0484 | 7500 | 0.0063 |
64
+ | 0.0181 | 1.1183 | 8000 | 0.0077 |
65
+ | 0.0066 | 1.1881 | 8500 | 0.0064 |
66
+ | 0.0067 | 1.2580 | 9000 | 0.0107 |
67
+ | 0.0087 | 1.3279 | 9500 | 0.0091 |
68
+ | 0.0065 | 1.3978 | 10000 | 0.0062 |
69
+ | 0.0054 | 1.4677 | 10500 | 0.0061 |
70
+ | 0.0055 | 1.5376 | 11000 | 0.0061 |
71
+ | 0.0053 | 1.6075 | 11500 | 0.0060 |
72
+ | 0.0052 | 1.6774 | 12000 | 0.0060 |
73
+ | 0.0051 | 1.7473 | 12500 | 0.0060 |
74
+ | 0.0051 | 1.8172 | 13000 | 0.0060 |
75
+ | 0.0063 | 1.8871 | 13500 | 0.0063 |
76
+ | 0.0054 | 1.9569 | 14000 | 0.0060 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
 
79
  ### Framework versions
80
 
81
+ - Transformers 4.40.2
82
+ - Pytorch 2.1.2
83
+ - Datasets 2.18.0
84
  - Tokenizers 0.19.1
config.json CHANGED
@@ -1,34 +1,58 @@
1
  {
2
- "_name_or_path": "google/byt5-small",
 
 
 
 
 
3
  "architectures": [
4
- "T5ForConditionalGeneration"
5
  ],
 
 
 
6
  "classifier_dropout": 0.0,
7
- "d_ff": 3584,
8
- "d_kv": 64,
9
- "d_model": 1472,
10
- "decoder_start_token_id": 0,
11
- "dense_act_fn": "gelu_new",
12
- "dropout_rate": 0.1,
13
- "eos_token_id": 1,
14
- "feed_forward_proj": "gated-gelu",
 
 
 
 
 
 
15
  "gradient_checkpointing": false,
16
- "initializer_factor": 1.0,
 
 
 
 
 
17
  "is_encoder_decoder": true,
18
- "is_gated_act": true,
19
- "layer_norm_epsilon": 1e-06,
 
 
 
20
  "max_length": 1024,
21
- "model_type": "t5",
22
- "num_decoder_layers": 4,
23
- "num_heads": 6,
24
- "num_layers": 12,
25
- "pad_token_id": 0,
26
- "relative_attention_max_distance": 128,
27
- "relative_attention_num_buckets": 32,
28
- "tie_word_embeddings": false,
29
- "tokenizer_class": "ByT5Tokenizer",
 
 
30
  "torch_dtype": "float32",
31
- "transformers_version": "4.43.4",
32
  "use_cache": true,
33
- "vocab_size": 384
34
  }
 
1
  {
2
+ "_name_or_path": "facebook/mbart-large-50",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "gelu",
6
+ "add_bias_logits": false,
7
+ "add_final_layer_norm": true,
8
  "architectures": [
9
+ "MBartForConditionalGeneration"
10
  ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 0,
13
+ "classif_dropout": 0.0,
14
  "classifier_dropout": 0.0,
15
+ "d_model": 1024,
16
+ "decoder_attention_heads": 16,
17
+ "decoder_ffn_dim": 4096,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 12,
20
+ "decoder_start_token_id": 2,
21
+ "dropout": 0.1,
22
+ "early_stopping": true,
23
+ "encoder_attention_heads": 16,
24
+ "encoder_ffn_dim": 4096,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 12,
27
+ "eos_token_id": 2,
28
+ "forced_eos_token_id": 2,
29
  "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
  "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
  "max_length": 1024,
43
+ "max_position_embeddings": 1024,
44
+ "model_type": "mbart",
45
+ "normalize_before": true,
46
+ "normalize_embedding": true,
47
+ "num_beams": 5,
48
+ "num_hidden_layers": 12,
49
+ "output_past": true,
50
+ "pad_token_id": 1,
51
+ "scale_embedding": true,
52
+ "static_position_embeddings": false,
53
+ "tokenizer_class": "MBart50Tokenizer",
54
  "torch_dtype": "float32",
55
+ "transformers_version": "4.40.2",
56
  "use_cache": true,
57
+ "vocab_size": 250054
58
  }
generation_config.json CHANGED
@@ -1,8 +1,12 @@
1
  {
2
- "decoder_start_token_id": 0,
 
 
3
  "early_stopping": true,
4
- "eos_token_id": 1,
 
 
5
  "num_beams": 5,
6
- "pad_token": 0,
7
- "transformers_version": "4.43.4"
8
  }
 
1
  {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
  "early_stopping": true,
6
+ "eos_token_id": 2,
7
+ "forced_eos_token_id": 2,
8
+ "max_length": 200,
9
  "num_beams": 5,
10
+ "pad_token_id": 1,
11
+ "transformers_version": "4.40.2"
12
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a01400a4aff0de2efdbbc3690e6ba19bc5d348e456d55bd45a3b8af7ae66bb44
3
- size 1198571496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c272579e2bbd55aa3e3098c58166a8e3c5044f70bc6e4a02431666594892c6e
3
+ size 2444578688
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e593e7f842bd43dea021f2fd86db6af706a40f154f670ca2889481072cbad2ff
3
- size 7032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:587d6f3cf0717982586d9acf794092e95552916715a71880f17300c86e164e80
3
+ size 5112