End of training

Browse files

Files changed (6) hide show

README.md +15 -15
added_tokens.json +8 -0
preprocessor_config.json +5 -5
special_tokens_map.json +70 -2
tokenizer.json +88 -2
tokenizer_config.json +74 -2

README.md CHANGED Viewed

@@ -17,17 +17,17 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) on an unspecified dataset (no dataset name was recorded when this model card was generated).
 It achieves the following results on the evaluation set:
-- Loss: 0.1662
-- Bleu score: 0.0215
-- Precisions: [0.9469914040114613, 0.9204368174726989, 0.8938356164383562, 0.872865275142315]
-- Brevity penalty: 0.0237
-- Length ratio: 0.2109
-- Translation length: 698
-- Reference length: 3310
-- Cer: 0.7917
-- Wer: 0.8253
-- Cer Hugging Face: 0.7954
-- Wer Hugging Face: 0.8274
 ## Model description
@@ -61,10 +61,10 @@ The following hyperparameters were used during training:
 | Training Loss | Epoch | Step | Validation Loss | Bleu score | Precisions                                                                       | Brevity penalty | Length ratio | Translation length | Reference length | Cer    | Wer    | Cer Hugging Face | Wer Hugging Face |
 |:-------------:|:-----:|:----:|:---------------:|:----------:|:--------------------------------------------------------------------------------:|:---------------:|:------------:|:------------------:|:----------------:|:------:|:------:|:----------------:|:----------------:|
-| 0.5956        | 1.0   | 253  | 0.2372          | 0.0231     | [0.9258741258741259, 0.8890577507598785, 0.8519134775374376, 0.8180147058823529] | 0.0265          | 0.2160       | 715                | 3310             | 0.7922 | 0.8383 | 0.7969           | 0.8412           |
-| 0.2509        | 2.0   | 506  | 0.1730          | 0.0213     | [0.9425287356321839, 0.9217527386541471, 0.8969072164948454, 0.88]               | 0.0234          | 0.2103       | 696                | 3310             | 0.7928 | 0.8285 | 0.7966           | 0.8306           |
-| 0.22          | 3.0   | 759  | 0.1777          | 0.0215     | [0.9469914040114613, 0.9188767550702028, 0.8921232876712328, 0.872865275142315]  | 0.0237          | 0.2109       | 698                | 3310             | 0.7914 | 0.8282 | 0.7948           | 0.8306           |
-| 0.1687        | 4.0   | 1012 | 0.1662          | 0.0215     | [0.9469914040114613, 0.9204368174726989, 0.8938356164383562, 0.872865275142315]  | 0.0237          | 0.2109       | 698                | 3310             | 0.7917 | 0.8253 | 0.7954           | 0.8274           |
 ### Framework versions

 This model is a fine-tuned version of [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) on an unspecified dataset (no dataset name was recorded when this model card was generated).
 It achieves the following results on the evaluation set:
+- Loss: 0.3400
+- Bleu score: 0.0856
+- Precisions: [0.8478260869565217, 0.8017817371937639, 0.7755102040816326, 0.755223880597015]
+- Brevity penalty: 0.1078
+- Length ratio: 0.3099
+- Translation length: 506
+- Reference length: 1633
+- Cer: 0.7597
+- Wer: 0.8305
+- Cer Hugging Face: 0.7664
+- Wer Hugging Face: 0.8347
 ## Model description
 | Training Loss | Epoch | Step | Validation Loss | Bleu score | Precisions                                                                       | Brevity penalty | Length ratio | Translation length | Reference length | Cer    | Wer    | Cer Hugging Face | Wer Hugging Face |
 |:-------------:|:-----:|:----:|:---------------:|:----------:|:--------------------------------------------------------------------------------:|:---------------:|:------------:|:------------------:|:----------------:|:------:|:------:|:----------------:|:----------------:|
+| 0.9692        | 1.0   | 253  | 0.4901          | 0.0746     | [0.8011928429423459, 0.726457399103139, 0.6760925449871465, 0.6295180722891566]  | 0.1058          | 0.3080       | 503                | 1633             | 0.7672 | 0.8440 | 0.7741           | 0.8478           |
+| 0.437         | 2.0   | 506  | 0.3906          | 0.0824     | [0.8382642998027613, 0.7755555555555556, 0.7353689567430025, 0.6964285714285714] | 0.1085          | 0.3105       | 507                | 1633             | 0.7611 | 0.8328 | 0.7675           | 0.8367           |
+| 0.2997        | 3.0   | 759  | 0.3565          | 0.0858     | [0.828125, 0.778021978021978, 0.7462311557788944, 0.718475073313783]             | 0.1120          | 0.3135       | 512                | 1633             | 0.7640 | 0.8363 | 0.7703           | 0.8397           |
+| 0.2168        | 4.0   | 1012 | 0.3400          | 0.0856     | [0.8478260869565217, 0.8017817371937639, 0.7755102040816326, 0.755223880597015]  | 0.1078          | 0.3099       | 506                | 1633             | 0.7597 | 0.8305 | 0.7664           | 0.8347           |
 ### Framework versions

added_tokens.json CHANGED Viewed

@@ -1,5 +1,13 @@
 {
   "<s_iitcdip>": 57523,
   "<s_synthdog>": 57524,
   "<sep/>": 57522
 }

 {
+  "</s_address>": 57532,
+  "</s_company>": 57530,
+  "</s_date>": 57528,
+  "</s_total>": 57526,
+  "<s_address>": 57531,
+  "<s_company>": 57529,
+  "<s_date>": 57527,
   "<s_iitcdip>": 57523,
   "<s_synthdog>": 57524,
+  "<s_total>": 57525,
   "<sep/>": 57522
 }

preprocessor_config.json CHANGED Viewed

@@ -17,7 +17,7 @@
     "data_format",
     "input_data_format"
   ],
-  "do_align_long_axis": true,
   "do_normalize": true,
   "do_pad": true,
   "do_rescale": true,
@@ -37,8 +37,8 @@
   "processor_class": "DonutProcessor",
   "resample": 2,
   "rescale_factor": 0.00392156862745098,
-  "size": {
-    "height": 2560,
-    "width": 1920
-  }
 }

     "data_format",
     "input_data_format"
   ],
+  "do_align_long_axis": false,
   "do_normalize": true,
   "do_pad": true,
   "do_rescale": true,
   "processor_class": "DonutProcessor",
   "resample": 2,
   "rescale_factor": 0.00392156862745098,
+  "size": [
+    720,
+    960
+  ]
 }

special_tokens_map.json CHANGED Viewed

@@ -1,7 +1,75 @@
 {
   "additional_special_tokens": [
-    "<s_iitcdip>",
-    "<s_synthdog>"
   ],
   "bos_token": {
     "content": "<s>",

 {
   "additional_special_tokens": [
+    {
+      "content": "<s_total>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</s_total>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<s_date>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</s_date>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<s_company>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</s_company>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<s_address>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</s_address>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
   ],
   "bos_token": {
     "content": "<s>",

tokenizer.json CHANGED Viewed

@@ -1,7 +1,21 @@
 {
   "version": "1.0",
-  "truncation": null,
-  "padding": null,
   "added_tokens": [
     {
       "id": 0,
@@ -74,6 +88,78 @@
       "rstrip": false,
       "normalized": false,
       "special": true
     }
   ],
   "normalizer": {

 {
   "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 512,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": {
+      "Fixed": 512
+    },
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 1,
+    "pad_type_id": 0,
+    "pad_token": "<pad>"
+  },
   "added_tokens": [
     {
       "id": 0,
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 57525,
+      "content": "<s_total>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 57526,
+      "content": "</s_total>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 57527,
+      "content": "<s_date>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 57528,
+      "content": "</s_date>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 57529,
+      "content": "<s_company>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 57530,
+      "content": "</s_company>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 57531,
+      "content": "<s_address>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 57532,
+      "content": "</s_address>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": {

tokenizer_config.json CHANGED Viewed

@@ -63,11 +63,83 @@
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
   "additional_special_tokens": [
-    "<s_iitcdip>",
-    "<s_synthdog>"
   ],
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,

       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "57525": {
+      "content": "<s_total>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57526": {
+      "content": "</s_total>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57527": {
+      "content": "<s_date>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57528": {
+      "content": "</s_date>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57529": {
+      "content": "<s_company>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57530": {
+      "content": "</s_company>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57531": {
+      "content": "<s_address>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57532": {
+      "content": "</s_address>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
+    "<s_total>",
+    "</s_total>",
+    "<s_date>",
+    "</s_date>",
+    "<s_company>",
+    "</s_company>",
+    "<s_address>",
+    "</s_address>",
+    "<s>",
+    "</s>"
   ],
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,