AndreaHuang97 committed on
Commit
08d1b0e
1 Parent(s): 0e5a16c

Upload 10 files

Browse files

Add cloned files

README.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ ---
5
+
6
+ # MarkupLM
7
+
8
+ **Multimodal (text + markup language) pre-training for [Document AI](https://www.microsoft.com/en-us/research/project/document-ai/)**
9
+
10
+ ## Introduction
11
+
12
+ MarkupLM is a simple but effective multimodal pre-training method for text and markup language, targeting visually-rich document understanding and information extraction tasks such as webpage QA and webpage information extraction. MarkupLM achieves state-of-the-art (SOTA) results on multiple datasets. For more details, please refer to our paper:
13
+
14
+ [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei, ACL 2022
15
+
16
+ ## Usage
17
+
18
+ For usage instructions and examples, please refer to the [docs](https://huggingface.co/docs/transformers/main/en/model_doc/markuplm) and the [demo notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/MarkupLM).
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<end-of-node>": 50266,
3
+ "[empty-title]": 50265
4
+ }
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MarkupLMForPretraining"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_depth": 50,
17
+ "max_position_embeddings": 514,
18
+ "max_xpath_subs_unit_embeddings": 1024,
19
+ "max_xpath_tag_unit_embeddings": 256,
20
+ "model_type": "markuplm",
21
+ "num_attention_heads": 12,
22
+ "num_hidden_layers": 12,
23
+ "pad_token_id": 1,
24
+ "position_embedding_type": "absolute",
25
+ "torch_dtype": "float16",
26
+ "transformers_version": "4.10.2",
27
+ "type_vocab_size": 1,
28
+ "use_cache": true,
29
+ "vocab_size": 50267,
30
+ "xpath_unit_hidden_size": 32
31
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "feature_extractor_type": "MarkupLMFeatureExtractor",
3
+ "processor_class": "MarkupLMProcessor"
4
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:812da9c91ed2b8d807c15c610fad699893ae2e7163861a2675fc162bcd834185
3
+ size 276526721
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "cls_token": {
12
+ "__type": "AddedToken",
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eos_token": {
20
+ "__type": "AddedToken",
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "errors": "replace",
28
+ "from_slow": true,
29
+ "mask_token": {
30
+ "__type": "AddedToken",
31
+ "content": "<mask>",
32
+ "lstrip": true,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "max_depth": 50,
38
+ "max_width": 1000,
39
+ "model_max_length": 512,
40
+ "name_or_path": "microsoft/markuplm-base",
41
+ "only_label_first_subword": true,
42
+ "pad_token": {
43
+ "__type": "AddedToken",
44
+ "content": "<pad>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false
49
+ },
50
+ "pad_token_label": -100,
51
+ "pad_width": 1001,
52
+ "sep_token": {
53
+ "__type": "AddedToken",
54
+ "content": "</s>",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false
59
+ },
60
+ "special_tokens_map_file": null,
61
+ "tags_dict": {
62
+ "a": 0,
63
+ "abbr": 1,
64
+ "acronym": 2,
65
+ "address": 3,
66
+ "altGlyph": 4,
67
+ "altGlyphDef": 5,
68
+ "altGlyphItem": 6,
69
+ "animate": 7,
70
+ "animateColor": 8,
71
+ "animateMotion": 9,
72
+ "animateTransform": 10,
73
+ "applet": 11,
74
+ "area": 12,
75
+ "article": 13,
76
+ "aside": 14,
77
+ "audio": 15,
78
+ "b": 16,
79
+ "base": 17,
80
+ "basefont": 18,
81
+ "bdi": 19,
82
+ "bdo": 20,
83
+ "bgsound": 21,
84
+ "big": 22,
85
+ "blink": 23,
86
+ "blockquote": 24,
87
+ "body": 25,
88
+ "br": 26,
89
+ "button": 27,
90
+ "canvas": 28,
91
+ "caption": 29,
92
+ "center": 30,
93
+ "circle": 31,
94
+ "cite": 32,
95
+ "clipPath": 33,
96
+ "code": 34,
97
+ "col": 35,
98
+ "colgroup": 36,
99
+ "color-profile": 37,
100
+ "content": 38,
101
+ "cursor": 39,
102
+ "data": 40,
103
+ "datalist": 41,
104
+ "dd": 42,
105
+ "defs": 43,
106
+ "del": 44,
107
+ "desc": 45,
108
+ "details": 46,
109
+ "dfn": 47,
110
+ "dialog": 48,
111
+ "dir": 49,
112
+ "div": 50,
113
+ "dl": 51,
114
+ "dt": 52,
115
+ "ellipse": 53,
116
+ "em": 54,
117
+ "embed": 55,
118
+ "feBlend": 56,
119
+ "feColorMatrix": 57,
120
+ "feComponentTransfer": 58,
121
+ "feComposite": 59,
122
+ "feConvolveMatrix": 60,
123
+ "feDiffuseLighting": 61,
124
+ "feDisplacementMap": 62,
125
+ "feDistantLight": 63,
126
+ "feFlood": 64,
127
+ "feFuncA": 65,
128
+ "feFuncB": 66,
129
+ "feFuncG": 67,
130
+ "feFuncR": 68,
131
+ "feGaussianBlur": 69,
132
+ "feImage": 70,
133
+ "feMerge": 71,
134
+ "feMergeNode": 72,
135
+ "feMorphology": 73,
136
+ "feOffset": 74,
137
+ "fePointLight": 75,
138
+ "feSpecularLighting": 76,
139
+ "feSpotLight": 77,
140
+ "feTile": 78,
141
+ "feTurbulence": 79,
142
+ "fieldset": 80,
143
+ "figcaption": 81,
144
+ "figure": 82,
145
+ "filter": 83,
146
+ "font": 89,
147
+ "font-face": 88,
148
+ "font-face-format": 84,
149
+ "font-face-name": 85,
150
+ "font-face-src": 86,
151
+ "font-face-uri": 87,
152
+ "footer": 90,
153
+ "foreignObject": 91,
154
+ "form": 92,
155
+ "frame": 93,
156
+ "frameset": 94,
157
+ "g": 95,
158
+ "glyph": 96,
159
+ "glyphRef": 97,
160
+ "h1": 98,
161
+ "h2": 99,
162
+ "h3": 100,
163
+ "h4": 101,
164
+ "h5": 102,
165
+ "h6": 103,
166
+ "head": 104,
167
+ "header": 105,
168
+ "hgroup": 106,
169
+ "hkern": 107,
170
+ "hr": 108,
171
+ "html": 109,
172
+ "i": 110,
173
+ "iframe": 111,
174
+ "image": 112,
175
+ "img": 113,
176
+ "input": 114,
177
+ "ins": 115,
178
+ "kbd": 116,
179
+ "keygen": 117,
180
+ "label": 118,
181
+ "legend": 119,
182
+ "li": 120,
183
+ "line": 121,
184
+ "linearGradient": 122,
185
+ "link": 123,
186
+ "main": 124,
187
+ "map": 125,
188
+ "mark": 126,
189
+ "marker": 127,
190
+ "marquee": 128,
191
+ "mask": 129,
192
+ "math": 130,
193
+ "menu": 131,
194
+ "menuitem": 132,
195
+ "meta": 133,
196
+ "metadata": 134,
197
+ "meter": 135,
198
+ "missing-glyph": 136,
199
+ "mpath": 137,
200
+ "nav": 138,
201
+ "nobr": 139,
202
+ "noembed": 140,
203
+ "noframes": 141,
204
+ "noscript": 142,
205
+ "object": 143,
206
+ "ol": 144,
207
+ "optgroup": 145,
208
+ "option": 146,
209
+ "output": 147,
210
+ "p": 148,
211
+ "param": 149,
212
+ "path": 150,
213
+ "pattern": 151,
214
+ "picture": 152,
215
+ "plaintext": 153,
216
+ "polygon": 154,
217
+ "polyline": 155,
218
+ "portal": 156,
219
+ "pre": 157,
220
+ "progress": 158,
221
+ "q": 159,
222
+ "radialGradient": 160,
223
+ "rb": 161,
224
+ "rect": 162,
225
+ "rp": 163,
226
+ "rt": 164,
227
+ "rtc": 165,
228
+ "ruby": 166,
229
+ "s": 167,
230
+ "samp": 168,
231
+ "script": 169,
232
+ "section": 170,
233
+ "select": 171,
234
+ "set": 172,
235
+ "shadow": 173,
236
+ "slot": 174,
237
+ "small": 175,
238
+ "source": 176,
239
+ "spacer": 177,
240
+ "span": 178,
241
+ "stop": 179,
242
+ "strike": 180,
243
+ "strong": 181,
244
+ "style": 182,
245
+ "sub": 183,
246
+ "summary": 184,
247
+ "sup": 185,
248
+ "svg": 186,
249
+ "switch": 187,
250
+ "symbol": 188,
251
+ "table": 189,
252
+ "tbody": 190,
253
+ "td": 191,
254
+ "template": 192,
255
+ "text": 193,
256
+ "textPath": 194,
257
+ "textarea": 195,
258
+ "tfoot": 196,
259
+ "th": 197,
260
+ "thead": 198,
261
+ "time": 199,
262
+ "title": 200,
263
+ "tr": 201,
264
+ "track": 202,
265
+ "tref": 203,
266
+ "tspan": 204,
267
+ "tt": 205,
268
+ "u": 206,
269
+ "ul": 207,
270
+ "use": 208,
271
+ "var": 209,
272
+ "video": 210,
273
+ "view": 211,
274
+ "vkern": 212,
275
+ "wbr": 213,
276
+ "xmp": 214
277
+ },
278
+ "tokenizer_class": "MarkupLMTokenizer",
279
+ "trim_offsets": false,
280
+ "unk_token": {
281
+ "__type": "AddedToken",
282
+ "content": "<unk>",
283
+ "lstrip": false,
284
+ "normalized": true,
285
+ "rstrip": false,
286
+ "single_word": false
287
+ }
288
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff