KoichiYasuoka committed on
Commit
29ea6b8
·
1 Parent(s): 86d082e

initial release

Browse files
README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - "ain"
4
+ tags:
5
+ - "ainu"
6
+ - "token-classification"
7
+ - "pos"
8
+ - "dependency-parsing"
9
+ license: "cc-by-sa-4.0"
10
+ pipeline_tag: "token-classification"
11
+ widget:
12
+ - text: "itak=as awa pon rupne aynu ene itaki"
13
+ - text: "イタカㇱ アワ ポン ルㇷ゚ネ アイヌ エネ イタキ"
14
+ ---
15
+
16
+ # roberta-base-ainu-upos
17
+
18
+ ## Model Description
19
+
20
+ This is a RoBERTa model pre-trained on Ainu texts (both katakana (カタカナ) and romanized) for POS-tagging and dependency-parsing, derived from [roberta-base-ainu](https://huggingface.co/KoichiYasuoka/roberta-base-ainu). Every word is tagged with its [UPOS](https://universaldependencies.org/u/pos/) (Universal Part-Of-Speech) label.
21
+
22
+ ## How to Use
23
+
24
+ ```py
25
+ from transformers import AutoTokenizer,AutoModelForTokenClassification
26
+ tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-base-ainu-upos")
27
+ model=AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/roberta-base-ainu-upos")
28
+ ```
29
+
30
+ or
31
+
32
+ ```py
33
+ import esupar
34
+ nlp=esupar.load("KoichiYasuoka/roberta-base-ainu-upos")
35
+ ```
36
+
37
+ ## See Also
38
+
39
+ [esupar](https://github.com/KoichiYasuoka/esupar): Tokenizer, POS-tagger, and dependency-parser with BERT/RoBERTa/DeBERTa models
40
+
config.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "ADP",
14
+ "1": "ADV",
15
+ "2": "AUX",
16
+ "3": "AUX+NOUN",
17
+ "4": "B-ADP",
18
+ "5": "B-ADP+VERB+NOUN",
19
+ "6": "B-ADV",
20
+ "7": "B-AUX",
21
+ "8": "B-AUX+PART",
22
+ "9": "B-CCONJ",
23
+ "10": "B-DET",
24
+ "11": "B-DET+NOUN",
25
+ "12": "B-INFR.EV",
26
+ "13": "B-INTJ",
27
+ "14": "B-NOUN",
28
+ "15": "B-NOUN+ADP",
29
+ "16": "B-NOUN+ADP+NOUN",
30
+ "17": "B-NOUN+ADP+VERB",
31
+ "18": "B-NOUN+ADV",
32
+ "19": "B-NOUN+NOUN",
33
+ "20": "B-NOUN+VERB",
34
+ "21": "B-NUM",
35
+ "22": "B-NUM+NOUN",
36
+ "23": "B-PART",
37
+ "24": "B-PART+AUX",
38
+ "25": "B-PART+NOUN",
39
+ "26": "B-PART+VERB",
40
+ "27": "B-PRON",
41
+ "28": "B-PROPN",
42
+ "29": "B-PUNCT",
43
+ "30": "B-SCONJ",
44
+ "31": "B-SCONJ+ADV",
45
+ "32": "B-VERB",
46
+ "33": "B-VERB+NOUN",
47
+ "34": "B-VERB+PART",
48
+ "35": "B-VERB+SCONJ",
49
+ "36": "B-VERT",
50
+ "37": "B-X",
51
+ "38": "CCONJ",
52
+ "39": "DET",
53
+ "40": "DET+NOUN",
54
+ "41": "DET+SCONJ+VERB",
55
+ "42": "I-ADP",
56
+ "43": "I-ADP+VERB+NOUN",
57
+ "44": "I-ADV",
58
+ "45": "I-AUX",
59
+ "46": "I-AUX+PART",
60
+ "47": "I-CCONJ",
61
+ "48": "I-DET",
62
+ "49": "I-DET+NOUN",
63
+ "50": "I-INFR.EV",
64
+ "51": "I-INTJ",
65
+ "52": "I-NOUN",
66
+ "53": "I-NOUN+ADP",
67
+ "54": "I-NOUN+ADP+NOUN",
68
+ "55": "I-NOUN+ADP+VERB",
69
+ "56": "I-NOUN+ADV",
70
+ "57": "I-NOUN+NOUN",
71
+ "58": "I-NOUN+VERB",
72
+ "59": "I-NUM",
73
+ "60": "I-NUM+NOUN",
74
+ "61": "I-PART",
75
+ "62": "I-PART+AUX",
76
+ "63": "I-PART+NOUN",
77
+ "64": "I-PART+VERB",
78
+ "65": "I-PRON",
79
+ "66": "I-PROPN",
80
+ "67": "I-PUNCT",
81
+ "68": "I-SCONJ",
82
+ "69": "I-SCONJ+ADV",
83
+ "70": "I-VERB",
84
+ "71": "I-VERB+NOUN",
85
+ "72": "I-VERB+PART",
86
+ "73": "I-VERB+SCONJ",
87
+ "74": "I-VERT",
88
+ "75": "I-X",
89
+ "76": "INTJ",
90
+ "77": "NOUN",
91
+ "78": "NOUN+ADP",
92
+ "79": "NOUN+NOUN",
93
+ "80": "NOUN+VERB",
94
+ "81": "NUM",
95
+ "82": "PART",
96
+ "83": "PART+VERB",
97
+ "84": "PROPN",
98
+ "85": "PUNCT",
99
+ "86": "SCONJ",
100
+ "87": "SYM",
101
+ "88": "VERB",
102
+ "89": "VERB+AUX",
103
+ "90": "VERB+NOUN",
104
+ "91": "VERB+PART",
105
+ "92": "VERB+VERB",
106
+ "93": "VERT",
107
+ "94": "X"
108
+ },
109
+ "initializer_range": 0.02,
110
+ "intermediate_size": 3072,
111
+ "label2id": {
112
+ "ADP": 0,
113
+ "ADV": 1,
114
+ "AUX": 2,
115
+ "AUX+NOUN": 3,
116
+ "B-ADP": 4,
117
+ "B-ADP+VERB+NOUN": 5,
118
+ "B-ADV": 6,
119
+ "B-AUX": 7,
120
+ "B-AUX+PART": 8,
121
+ "B-CCONJ": 9,
122
+ "B-DET": 10,
123
+ "B-DET+NOUN": 11,
124
+ "B-INFR.EV": 12,
125
+ "B-INTJ": 13,
126
+ "B-NOUN": 14,
127
+ "B-NOUN+ADP": 15,
128
+ "B-NOUN+ADP+NOUN": 16,
129
+ "B-NOUN+ADP+VERB": 17,
130
+ "B-NOUN+ADV": 18,
131
+ "B-NOUN+NOUN": 19,
132
+ "B-NOUN+VERB": 20,
133
+ "B-NUM": 21,
134
+ "B-NUM+NOUN": 22,
135
+ "B-PART": 23,
136
+ "B-PART+AUX": 24,
137
+ "B-PART+NOUN": 25,
138
+ "B-PART+VERB": 26,
139
+ "B-PRON": 27,
140
+ "B-PROPN": 28,
141
+ "B-PUNCT": 29,
142
+ "B-SCONJ": 30,
143
+ "B-SCONJ+ADV": 31,
144
+ "B-VERB": 32,
145
+ "B-VERB+NOUN": 33,
146
+ "B-VERB+PART": 34,
147
+ "B-VERB+SCONJ": 35,
148
+ "B-VERT": 36,
149
+ "B-X": 37,
150
+ "CCONJ": 38,
151
+ "DET": 39,
152
+ "DET+NOUN": 40,
153
+ "DET+SCONJ+VERB": 41,
154
+ "I-ADP": 42,
155
+ "I-ADP+VERB+NOUN": 43,
156
+ "I-ADV": 44,
157
+ "I-AUX": 45,
158
+ "I-AUX+PART": 46,
159
+ "I-CCONJ": 47,
160
+ "I-DET": 48,
161
+ "I-DET+NOUN": 49,
162
+ "I-INFR.EV": 50,
163
+ "I-INTJ": 51,
164
+ "I-NOUN": 52,
165
+ "I-NOUN+ADP": 53,
166
+ "I-NOUN+ADP+NOUN": 54,
167
+ "I-NOUN+ADP+VERB": 55,
168
+ "I-NOUN+ADV": 56,
169
+ "I-NOUN+NOUN": 57,
170
+ "I-NOUN+VERB": 58,
171
+ "I-NUM": 59,
172
+ "I-NUM+NOUN": 60,
173
+ "I-PART": 61,
174
+ "I-PART+AUX": 62,
175
+ "I-PART+NOUN": 63,
176
+ "I-PART+VERB": 64,
177
+ "I-PRON": 65,
178
+ "I-PROPN": 66,
179
+ "I-PUNCT": 67,
180
+ "I-SCONJ": 68,
181
+ "I-SCONJ+ADV": 69,
182
+ "I-VERB": 70,
183
+ "I-VERB+NOUN": 71,
184
+ "I-VERB+PART": 72,
185
+ "I-VERB+SCONJ": 73,
186
+ "I-VERT": 74,
187
+ "I-X": 75,
188
+ "INTJ": 76,
189
+ "NOUN": 77,
190
+ "NOUN+ADP": 78,
191
+ "NOUN+NOUN": 79,
192
+ "NOUN+VERB": 80,
193
+ "NUM": 81,
194
+ "PART": 82,
195
+ "PART+VERB": 83,
196
+ "PROPN": 84,
197
+ "PUNCT": 85,
198
+ "SCONJ": 86,
199
+ "SYM": 87,
200
+ "VERB": 88,
201
+ "VERB+AUX": 89,
202
+ "VERB+NOUN": 90,
203
+ "VERB+PART": 91,
204
+ "VERB+VERB": 92,
205
+ "VERT": 93,
206
+ "X": 94
207
+ },
208
+ "layer_norm_eps": 1e-12,
209
+ "max_position_embeddings": 512,
210
+ "model_type": "roberta",
211
+ "num_attention_heads": 12,
212
+ "num_hidden_layers": 12,
213
+ "pad_token_id": 1,
214
+ "position_embedding_type": "absolute",
215
+ "task_specific_params": {
216
+ "upos_multiword": {
217
+ "ADP+VERB+NOUN": {
218
+ "tambe": [
219
+ "ta",
220
+ "m",
221
+ "be"
222
+ ]
223
+ },
224
+ "AUX+NOUN": {
225
+ "nep": [
226
+ "ne",
227
+ "p"
228
+ ]
229
+ },
230
+ "AUX+PART": {
231
+ "nangonna": [
232
+ "nangon",
233
+ "na"
234
+ ],
235
+ "nankonna": [
236
+ "nankon",
237
+ "na"
238
+ ]
239
+ },
240
+ "DET+NOUN": {
241
+ "Tamba": [
242
+ "Tam",
243
+ "ba"
244
+ ],
245
+ "Tampa": [
246
+ "Tam",
247
+ "pa"
248
+ ],
249
+ "tanpa": [
250
+ "tan",
251
+ "pa"
252
+ ],
253
+ "tanto": [
254
+ "tan",
255
+ "to"
256
+ ]
257
+ },
258
+ "DET+SCONJ+VERB": {
259
+ "Newaan": [
260
+ "Ne",
261
+ "wa",
262
+ "an"
263
+ ],
264
+ "newaan": [
265
+ "ne",
266
+ "wa",
267
+ "an"
268
+ ]
269
+ },
270
+ "NOUN+ADP": {
271
+ "Kunneiwano": [
272
+ "Kunnei",
273
+ "wano"
274
+ ],
275
+ "Orota": [
276
+ "Oro",
277
+ "ta"
278
+ ],
279
+ "Orowano": [
280
+ "Oro",
281
+ "wano"
282
+ ],
283
+ "Oshmaketa": [
284
+ "Oshmake",
285
+ "ta"
286
+ ],
287
+ "Pet-samaketa": [
288
+ "Pet-samake",
289
+ "ta"
290
+ ],
291
+ "Soita": [
292
+ "Soi",
293
+ "ta"
294
+ ],
295
+ "cheppone": [
296
+ "cheppo",
297
+ "ne"
298
+ ],
299
+ "keseta": [
300
+ "kese",
301
+ "ta"
302
+ ],
303
+ "kesta": [
304
+ "kes",
305
+ "ta"
306
+ ],
307
+ "kunneywano": [
308
+ "kunney",
309
+ "wano"
310
+ ],
311
+ "neyta": [
312
+ "ney",
313
+ "ta"
314
+ ],
315
+ "orota": [
316
+ "oro",
317
+ "ta"
318
+ ],
319
+ "orowa": [
320
+ "oro",
321
+ "wa"
322
+ ],
323
+ "orowano": [
324
+ "oro",
325
+ "wano"
326
+ ],
327
+ "oshmaketa": [
328
+ "oshmake",
329
+ "ta"
330
+ ],
331
+ "otta": [
332
+ "ot",
333
+ "ta"
334
+ ],
335
+ "petsamaketa": [
336
+ "petsamake",
337
+ "ta"
338
+ ],
339
+ "samaketa": [
340
+ "samake",
341
+ "ta"
342
+ ],
343
+ "soyta": [
344
+ "soy",
345
+ "ta"
346
+ ],
347
+ "tomta": [
348
+ "tom",
349
+ "ta"
350
+ ]
351
+ },
352
+ "NOUN+ADP+NOUN": {
353
+ "rorunpurai": [
354
+ "ror",
355
+ "un",
356
+ "purai"
357
+ ],
358
+ "rorunpuray": [
359
+ "ror",
360
+ "un",
361
+ "puray"
362
+ ]
363
+ },
364
+ "NOUN+ADP+VERB": {
365
+ "soytaarpa": [
366
+ "soy",
367
+ "ta",
368
+ "arpa"
369
+ ]
370
+ },
371
+ "NOUN+ADV": {
372
+ "Tambeta ne": [
373
+ "Tambe",
374
+ "ta ne"
375
+ ]
376
+ },
377
+ "NOUN+NOUN": {
378
+ "Petetoko": [
379
+ "Pet",
380
+ "etoko"
381
+ ],
382
+ "Shirokanipe": [
383
+ "Shirokani",
384
+ "pe"
385
+ ],
386
+ "hekattar": [
387
+ "hekat",
388
+ "tar"
389
+ ],
390
+ "inaanpe": [
391
+ "inaan",
392
+ "pe"
393
+ ],
394
+ "inanpe": [
395
+ "inan",
396
+ "pe"
397
+ ],
398
+ "iporohoka": [
399
+ "iporoho",
400
+ "ka"
401
+ ],
402
+ "kamuinish": [
403
+ "kamui",
404
+ "nish"
405
+ ],
406
+ "kamuynis": [
407
+ "kamuy",
408
+ "nis"
409
+ ],
410
+ "konkanipe": [
411
+ "konkani",
412
+ "pe"
413
+ ],
414
+ "petetok": [
415
+ "pet",
416
+ "etok"
417
+ ],
418
+ "petetoko": [
419
+ "pet",
420
+ "etoko"
421
+ ],
422
+ "sirokanipe": [
423
+ "sirokani",
424
+ "pe"
425
+ ]
426
+ },
427
+ "NOUN+VERB": {
428
+ "Omakun": [
429
+ "Omak",
430
+ "un"
431
+ ],
432
+ "Orepun": [
433
+ "Orep",
434
+ "un"
435
+ ],
436
+ "Shiriki": [
437
+ "Shiri",
438
+ "ki"
439
+ ],
440
+ "kotankor": [
441
+ "kotan",
442
+ "kor"
443
+ ],
444
+ "makun": [
445
+ "mak",
446
+ "un"
447
+ ],
448
+ "repun": [
449
+ "rep",
450
+ "un"
451
+ ],
452
+ "rikunruke": [
453
+ "rik",
454
+ "unruke"
455
+ ],
456
+ "siriki": [
457
+ "siri",
458
+ "ki"
459
+ ],
460
+ "ukakushte": [
461
+ "uka",
462
+ "kushte"
463
+ ],
464
+ "ukakuste": [
465
+ "uka",
466
+ "kuste"
467
+ ],
468
+ "uraikik": [
469
+ "urai",
470
+ "kik"
471
+ ]
472
+ },
473
+ "NUM+NOUN": {
474
+ "Wanto": [
475
+ "Wan",
476
+ "to"
477
+ ],
478
+ "hotnepa": [
479
+ "hotne",
480
+ "pa"
481
+ ],
482
+ "wanpe": [
483
+ "wan",
484
+ "pe"
485
+ ],
486
+ "wanto": [
487
+ "wan",
488
+ "to"
489
+ ]
490
+ },
491
+ "PART+AUX": {
492
+ "chine": [
493
+ "chi",
494
+ "ne"
495
+ ]
496
+ },
497
+ "PART+NOUN": {
498
+ "=anpe": [
499
+ "=an",
500
+ "pe"
501
+ ],
502
+ "eisam": [
503
+ "ei",
504
+ "sam"
505
+ ]
506
+ },
507
+ "PART+VERB": {
508
+ "ainu-wap": [
509
+ "a",
510
+ "inu-wap"
511
+ ],
512
+ "akus": [
513
+ "a",
514
+ "kus"
515
+ ],
516
+ "chiki": [
517
+ "chi",
518
+ "ki"
519
+ ],
520
+ "chikik": [
521
+ "chi",
522
+ "kik"
523
+ ],
524
+ "eram an": [
525
+ "e",
526
+ "ram an"
527
+ ],
528
+ "eramasu": [
529
+ "e",
530
+ "ramasu"
531
+ ],
532
+ "karapa": [
533
+ "k",
534
+ "arapa"
535
+ ]
536
+ },
537
+ "SCONJ+ADV": {
538
+ "koiramno": [
539
+ "ko",
540
+ "iramno"
541
+ ]
542
+ },
543
+ "VERB+AUX": {
544
+ "sattek": [
545
+ "sat",
546
+ "tek"
547
+ ]
548
+ },
549
+ "VERB+NOUN": {
550
+ "Hesepa": [
551
+ "Hese",
552
+ "pa"
553
+ ],
554
+ "ambe": [
555
+ "am",
556
+ "be"
557
+ ],
558
+ "anpe": [
559
+ "an",
560
+ "pe"
561
+ ],
562
+ "anto": [
563
+ "an",
564
+ "to"
565
+ ],
566
+ "h\u00e9sep\u00e1ha": [
567
+ "h\u00e9se",
568
+ "p\u00e1ha"
569
+ ],
570
+ "kari": [
571
+ "kar",
572
+ "i"
573
+ ],
574
+ "wenpuri": [
575
+ "wen",
576
+ "puri"
577
+ ]
578
+ },
579
+ "VERB+PART": {
580
+ "kari": [
581
+ "kar",
582
+ "i"
583
+ ],
584
+ "rokash": [
585
+ "rok",
586
+ "ash"
587
+ ],
588
+ "sapash": [
589
+ "sap",
590
+ "ash"
591
+ ],
592
+ "shinotash": [
593
+ "shinot",
594
+ "ash"
595
+ ]
596
+ },
597
+ "VERB+SCONJ": {
598
+ "anak un": [
599
+ "an",
600
+ "ak un"
601
+ ],
602
+ "anakanakne": [
603
+ "an",
604
+ "akanakne"
605
+ ],
606
+ "sakno": [
607
+ "sak",
608
+ "no"
609
+ ]
610
+ },
611
+ "VERB+VERB": {
612
+ "ranran": [
613
+ "ran",
614
+ "ran"
615
+ ]
616
+ }
617
+ }
618
+ },
619
+ "tokenizer_class": "RemBertTokenizerFast",
620
+ "torch_dtype": "float32",
621
+ "transformers_version": "4.22.1",
622
+ "type_vocab_size": 2,
623
+ "use_cache": true,
624
+ "vocab_size": 6143
625
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ccc9e2b9cb90186a3e5a43d569cef08f6a24666468f2123f3057708262901cb
3
+ size 361037489
sentencepiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
3
+ size 1
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": {
6
+ "content": "[MASK]",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "[PAD]",
13
+ "sep_token": "[SEP]",
14
+ "unk_token": "[UNK]"
15
+ }
supar.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11c0c17bdf57d2fa5f72b5a07f2b4951c2cd3c041a76410fe8ab173777b36e49
3
+ size 408389861
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "eos_token": "[SEP]",
6
+ "keep_accents": false,
7
+ "mask_token": {
8
+ "__type": "AddedToken",
9
+ "content": "[MASK]",
10
+ "lstrip": true,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ },
15
+ "model_max_length": 512,
16
+ "pad_token": "[PAD]",
17
+ "remove_space": true,
18
+ "sep_token": "[SEP]",
19
+ "split_by_punct": true,
20
+ "tokenizer_class": "RemBertTokenizerFast",
21
+ "unk_token": "[UNK]"
22
+ }