davelotito committed on
Commit
d64006e
1 Parent(s): cb19a17

End of training

Browse files
README.md CHANGED
@@ -17,17 +17,17 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) on an unspecified dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.1662
21
- - Bleu score: 0.0215
22
- - Precisions: [0.9469914040114613, 0.9204368174726989, 0.8938356164383562, 0.872865275142315]
23
- - Brevity penalty: 0.0237
24
- - Length ratio: 0.2109
25
- - Translation length: 698
26
- - Reference length: 3310
27
- - Cer: 0.7917
28
- - Wer: 0.8253
29
- - Cer Hugging Face: 0.7954
30
- - Wer Hugging Face: 0.8274
31
 
32
  ## Model description
33
 
@@ -61,10 +61,10 @@ The following hyperparameters were used during training:
61
 
62
  | Training Loss | Epoch | Step | Validation Loss | Bleu score | Precisions | Brevity penalty | Length ratio | Translation length | Reference length | Cer | Wer | Cer Hugging Face | Wer Hugging Face |
63
  |:-------------:|:-----:|:----:|:---------------:|:----------:|:--------------------------------------------------------------------------------:|:---------------:|:------------:|:------------------:|:----------------:|:------:|:------:|:----------------:|:----------------:|
64
- | 0.5956 | 1.0 | 253 | 0.2372 | 0.0231 | [0.9258741258741259, 0.8890577507598785, 0.8519134775374376, 0.8180147058823529] | 0.0265 | 0.2160 | 715 | 3310 | 0.7922 | 0.8383 | 0.7969 | 0.8412 |
65
- | 0.2509 | 2.0 | 506 | 0.1730 | 0.0213 | [0.9425287356321839, 0.9217527386541471, 0.8969072164948454, 0.88] | 0.0234 | 0.2103 | 696 | 3310 | 0.7928 | 0.8285 | 0.7966 | 0.8306 |
66
- | 0.22 | 3.0 | 759 | 0.1777 | 0.0215 | [0.9469914040114613, 0.9188767550702028, 0.8921232876712328, 0.872865275142315] | 0.0237 | 0.2109 | 698 | 3310 | 0.7914 | 0.8282 | 0.7948 | 0.8306 |
67
- | 0.1687 | 4.0 | 1012 | 0.1662 | 0.0215 | [0.9469914040114613, 0.9204368174726989, 0.8938356164383562, 0.872865275142315] | 0.0237 | 0.2109 | 698 | 3310 | 0.7917 | 0.8253 | 0.7954 | 0.8274 |
68
 
69
 
70
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) on an unspecified dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.3400
21
+ - Bleu score: 0.0856
22
+ - Precisions: [0.8478260869565217, 0.8017817371937639, 0.7755102040816326, 0.755223880597015]
23
+ - Brevity penalty: 0.1078
24
+ - Length ratio: 0.3099
25
+ - Translation length: 506
26
+ - Reference length: 1633
27
+ - Cer: 0.7597
28
+ - Wer: 0.8305
29
+ - Cer Hugging Face: 0.7664
30
+ - Wer Hugging Face: 0.8347
31
 
32
  ## Model description
33
 
 
61
 
62
  | Training Loss | Epoch | Step | Validation Loss | Bleu score | Precisions | Brevity penalty | Length ratio | Translation length | Reference length | Cer | Wer | Cer Hugging Face | Wer Hugging Face |
63
  |:-------------:|:-----:|:----:|:---------------:|:----------:|:--------------------------------------------------------------------------------:|:---------------:|:------------:|:------------------:|:----------------:|:------:|:------:|:----------------:|:----------------:|
64
+ | 0.9692 | 1.0 | 253 | 0.4901 | 0.0746 | [0.8011928429423459, 0.726457399103139, 0.6760925449871465, 0.6295180722891566] | 0.1058 | 0.3080 | 503 | 1633 | 0.7672 | 0.8440 | 0.7741 | 0.8478 |
65
+ | 0.437 | 2.0 | 506 | 0.3906 | 0.0824 | [0.8382642998027613, 0.7755555555555556, 0.7353689567430025, 0.6964285714285714] | 0.1085 | 0.3105 | 507 | 1633 | 0.7611 | 0.8328 | 0.7675 | 0.8367 |
66
+ | 0.2997 | 3.0 | 759 | 0.3565 | 0.0858 | [0.828125, 0.778021978021978, 0.7462311557788944, 0.718475073313783] | 0.1120 | 0.3135 | 512 | 1633 | 0.7640 | 0.8363 | 0.7703 | 0.8397 |
67
+ | 0.2168 | 4.0 | 1012 | 0.3400 | 0.0856 | [0.8478260869565217, 0.8017817371937639, 0.7755102040816326, 0.755223880597015] | 0.1078 | 0.3099 | 506 | 1633 | 0.7597 | 0.8305 | 0.7664 | 0.8347 |
68
 
69
 
70
  ### Framework versions
added_tokens.json CHANGED
@@ -1,5 +1,13 @@
1
  {
 
 
 
 
 
 
 
2
  "<s_iitcdip>": 57523,
3
  "<s_synthdog>": 57524,
 
4
  "<sep/>": 57522
5
  }
 
1
  {
2
+ "</s_address>": 57532,
3
+ "</s_company>": 57530,
4
+ "</s_date>": 57528,
5
+ "</s_total>": 57526,
6
+ "<s_address>": 57531,
7
+ "<s_company>": 57529,
8
+ "<s_date>": 57527,
9
  "<s_iitcdip>": 57523,
10
  "<s_synthdog>": 57524,
11
+ "<s_total>": 57525,
12
  "<sep/>": 57522
13
  }
preprocessor_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "data_format",
18
  "input_data_format"
19
  ],
20
- "do_align_long_axis": true,
21
  "do_normalize": true,
22
  "do_pad": true,
23
  "do_rescale": true,
@@ -37,8 +37,8 @@
37
  "processor_class": "DonutProcessor",
38
  "resample": 2,
39
  "rescale_factor": 0.00392156862745098,
40
- "size": {
41
- "height": 2560,
42
- "width": 1920
43
- }
44
  }
 
17
  "data_format",
18
  "input_data_format"
19
  ],
20
+ "do_align_long_axis": false,
21
  "do_normalize": true,
22
  "do_pad": true,
23
  "do_rescale": true,
 
37
  "processor_class": "DonutProcessor",
38
  "resample": 2,
39
  "rescale_factor": 0.00392156862745098,
40
+ "size": [
41
+ 720,
42
+ 960
43
+ ]
44
  }
special_tokens_map.json CHANGED
@@ -1,7 +1,75 @@
1
  {
2
  "additional_special_tokens": [
3
- "<s_iitcdip>",
4
- "<s_synthdog>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  ],
6
  "bos_token": {
7
  "content": "<s>",
 
1
  {
2
  "additional_special_tokens": [
3
+ {
4
+ "content": "<s_total>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s_total>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<s_date>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "</s_date>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<s_company>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "</s_company>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<s_address>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "</s_address>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<s>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "</s>",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ }
73
  ],
74
  "bos_token": {
75
  "content": "<s>",
tokenizer.json CHANGED
@@ -1,7 +1,21 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
@@ -74,6 +88,78 @@
74
  "rstrip": false,
75
  "normalized": false,
76
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  }
78
  ],
79
  "normalizer": {
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 512,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 512
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 1,
16
+ "pad_type_id": 0,
17
+ "pad_token": "<pad>"
18
+ },
19
  "added_tokens": [
20
  {
21
  "id": 0,
 
88
  "rstrip": false,
89
  "normalized": false,
90
  "special": true
91
+ },
92
+ {
93
+ "id": 57525,
94
+ "content": "<s_total>",
95
+ "single_word": false,
96
+ "lstrip": false,
97
+ "rstrip": false,
98
+ "normalized": false,
99
+ "special": true
100
+ },
101
+ {
102
+ "id": 57526,
103
+ "content": "</s_total>",
104
+ "single_word": false,
105
+ "lstrip": false,
106
+ "rstrip": false,
107
+ "normalized": false,
108
+ "special": true
109
+ },
110
+ {
111
+ "id": 57527,
112
+ "content": "<s_date>",
113
+ "single_word": false,
114
+ "lstrip": false,
115
+ "rstrip": false,
116
+ "normalized": false,
117
+ "special": true
118
+ },
119
+ {
120
+ "id": 57528,
121
+ "content": "</s_date>",
122
+ "single_word": false,
123
+ "lstrip": false,
124
+ "rstrip": false,
125
+ "normalized": false,
126
+ "special": true
127
+ },
128
+ {
129
+ "id": 57529,
130
+ "content": "<s_company>",
131
+ "single_word": false,
132
+ "lstrip": false,
133
+ "rstrip": false,
134
+ "normalized": false,
135
+ "special": true
136
+ },
137
+ {
138
+ "id": 57530,
139
+ "content": "</s_company>",
140
+ "single_word": false,
141
+ "lstrip": false,
142
+ "rstrip": false,
143
+ "normalized": false,
144
+ "special": true
145
+ },
146
+ {
147
+ "id": 57531,
148
+ "content": "<s_address>",
149
+ "single_word": false,
150
+ "lstrip": false,
151
+ "rstrip": false,
152
+ "normalized": false,
153
+ "special": true
154
+ },
155
+ {
156
+ "id": 57532,
157
+ "content": "</s_address>",
158
+ "single_word": false,
159
+ "lstrip": false,
160
+ "rstrip": false,
161
+ "normalized": false,
162
+ "special": true
163
  }
164
  ],
165
  "normalizer": {
tokenizer_config.json CHANGED
@@ -63,11 +63,83 @@
63
  "rstrip": false,
64
  "single_word": false,
65
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  }
67
  },
68
  "additional_special_tokens": [
69
- "<s_iitcdip>",
70
- "<s_synthdog>"
 
 
 
 
 
 
 
 
71
  ],
72
  "bos_token": "<s>",
73
  "clean_up_tokenization_spaces": true,
 
63
  "rstrip": false,
64
  "single_word": false,
65
  "special": true
66
+ },
67
+ "57525": {
68
+ "content": "<s_total>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "57526": {
76
+ "content": "</s_total>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "57527": {
84
+ "content": "<s_date>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "57528": {
92
+ "content": "</s_date>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "57529": {
100
+ "content": "<s_company>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "57530": {
108
+ "content": "</s_company>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "57531": {
116
+ "content": "<s_address>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "57532": {
124
+ "content": "</s_address>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
  }
131
  },
132
  "additional_special_tokens": [
133
+ "<s_total>",
134
+ "</s_total>",
135
+ "<s_date>",
136
+ "</s_date>",
137
+ "<s_company>",
138
+ "</s_company>",
139
+ "<s_address>",
140
+ "</s_address>",
141
+ "<s>",
142
+ "</s>"
143
  ],
144
  "bos_token": "<s>",
145
  "clean_up_tokenization_spaces": true,