End of training

Browse files

Files changed (6) hide show

README.md +75 -0
added_tokens.json +0 -8
generation_config.json +7 -0
special_tokens_map.json +0 -56
tokenizer.json +6 -92
tokenizer_config.json +0 -72

README.md ADDED Viewed

	@@ -0,0 +1,75 @@

+---
+license: mit
+base_model: naver-clova-ix/donut-base
+tags:
+- generated_from_trainer
+metrics:
+- bleu
+- wer
+model-index:
+- name: donut-base-sroie-bayesian-optimization
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# donut-base-sroie-bayesian-optimization
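+This model is a fine-tuned version of [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) on an unspecified dataset (no dataset name was recorded by the Trainer; the model name suggests SROIE, but this should be confirmed).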
+It achieves the following results on the evaluation set:
+- Loss: 0.1396
+- Bleu: 0.0196
+- Precisions: [0.9883177570093458, 0.9724655819774718, 0.954177897574124, 0.9328467153284672]
+- Brevity Penalty: 0.0203
+- Length Ratio: 0.2043
+- Translation Length: 856
+- Reference Length: 4190
+- Cer: 0.8584
+- Wer: 1.0
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 1.2010406976282324e-05
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 2
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- num_epochs: 5
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch | Step | Validation Loss | Bleu   | Precisions                                                                       | Brevity Penalty | Length Ratio | Translation Length | Reference Length | Cer    | Wer |
+|:-------------:|:-----:|:----:|:---------------:|:------:|:--------------------------------------------------------------------------------:|:---------------:|:------------:|:------------------:|:----------------:|:------:|:---:|
+| 0.021         | 1.0   | 253  | 0.1656          | 0.0194 | [0.9848130841121495, 0.9649561952440551, 0.9420485175202157, 0.9153284671532846] | 0.0203          | 0.2043       | 856                | 4190             | 0.8596 | 1.0 |
+| 0.0353        | 2.0   | 506  | 0.1501          | 0.0195 | [0.9813736903376019, 0.9588528678304239, 0.9328859060402684, 0.9026162790697675] | 0.0207          | 0.2050       | 859                | 4190             | 0.8595 | 1.0 |
+| 0.0417        | 3.0   | 759  | 0.1423          | 0.0195 | [0.9871495327102804, 0.9699624530663329, 0.9501347708894878, 0.927007299270073]  | 0.0203          | 0.2043       | 856                | 4190             | 0.8586 | 1.0 |
+| 0.0308        | 4.0   | 1012 | 0.1403          | 0.0193 | [0.9859649122807017, 0.9674185463659147, 0.9460188933873145, 0.9210526315789473] | 0.0202          | 0.2041       | 855                | 4190             | 0.8593 | 1.0 |
+| 0.0464        | 5.0   | 1265 | 0.1396          | 0.0196 | [0.9883177570093458, 0.9724655819774718, 0.954177897574124, 0.9328467153284672]  | 0.0203          | 0.2043       | 856                | 4190             | 0.8584 | 1.0 |
+### Framework versions
+- Transformers 4.41.0.dev0
+- Pytorch 2.1.0
+- Datasets 2.19.0
+- Tokenizers 0.19.1

added_tokens.json CHANGED Viewed

@@ -1,13 +1,5 @@
 {
-  "</s_address>": 57532,
-  "</s_company>": 57530,
-  "</s_date>": 57528,
-  "</s_total>": 57526,
-  "<s_address>": 57531,
-  "<s_company>": 57529,
-  "<s_date>": 57527,
   "<s_iitcdip>": 57523,
   "<s_synthdog>": 57524,
-  "<s_total>": 57525,
   "<sep/>": 57522
 }

 {
   "<s_iitcdip>": 57523,
   "<s_synthdog>": 57524,
   "<sep/>": 57522
 }

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "forced_eos_token_id": 2,
+  "pad_token_id": 1,
+  "transformers_version": "4.41.0.dev0"
+}

special_tokens_map.json CHANGED Viewed

@@ -1,61 +1,5 @@
 {
   "additional_special_tokens": [
-    {
-      "content": "<s_total>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "</s_total>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "<s_date>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "</s_date>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "<s_company>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "</s_company>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "<s_address>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "</s_address>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
     {
       "content": "<s>",
       "lstrip": false,

 {
   "additional_special_tokens": [
     {
       "content": "<s>",
       "lstrip": false,

tokenizer.json CHANGED Viewed

@@ -1,21 +1,7 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 512,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
-  "padding": {
-    "strategy": {
-      "Fixed": 512
-    },
-    "direction": "Right",
-    "pad_to_multiple_of": null,
-    "pad_id": 1,
-    "pad_type_id": 0,
-    "pad_token": "<pad>"
-  },
   "added_tokens": [
     {
       "id": 0,
@@ -88,78 +74,6 @@
       "rstrip": false,
       "normalized": false,
       "special": true
-    },
-    {
-      "id": 57525,
-      "content": "<s_total>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 57526,
-      "content": "</s_total>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 57527,
-      "content": "<s_date>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 57528,
-      "content": "</s_date>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 57529,
-      "content": "<s_company>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 57530,
-      "content": "</s_company>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 57531,
-      "content": "<s_address>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 57532,
-      "content": "</s_address>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
     }
   ],
   "normalizer": {
@@ -186,8 +100,8 @@
   "pre_tokenizer": {
     "type": "Metaspace",
     "replacement": "▁",
-    "add_prefix_space": true,
-    "prepend_scheme": "always"
   },
   "post_processor": {
     "type": "TemplateProcessing",
@@ -273,8 +187,8 @@
   "decoder": {
     "type": "Metaspace",
     "replacement": "▁",
-    "add_prefix_space": true,
-    "prepend_scheme": "always"
   },
   "model": {
     "type": "Unigram",

 {
   "version": "1.0",
+  "truncation": null,
+  "padding": null,
   "added_tokens": [
     {
       "id": 0,
       "rstrip": false,
       "normalized": false,
       "special": true
     }
   ],
   "normalizer": {
   "pre_tokenizer": {
     "type": "Metaspace",
     "replacement": "▁",
+    "prepend_scheme": "always",
+    "split": true
   },
   "post_processor": {
     "type": "TemplateProcessing",
   "decoder": {
     "type": "Metaspace",
     "replacement": "▁",
+    "prepend_scheme": "always",
+    "split": true
   },
   "model": {
     "type": "Unigram",

tokenizer_config.json CHANGED Viewed

@@ -63,81 +63,9 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "57525": {
-      "content": "<s_total>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "57526": {
-      "content": "</s_total>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "57527": {
-      "content": "<s_date>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "57528": {
-      "content": "</s_date>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "57529": {
-      "content": "<s_company>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "57530": {
-      "content": "</s_company>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "57531": {
-      "content": "<s_address>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "57532": {
-      "content": "</s_address>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
   "additional_special_tokens": [
-    "<s_total>",
-    "</s_total>",
-    "<s_date>",
-    "</s_date>",
-    "<s_company>",
-    "</s_company>",
-    "<s_address>",
-    "</s_address>",
     "<s>",
     "</s>"
   ],

       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
   "additional_special_tokens": [
     "<s>",
     "</s>"
   ],