nicolauduran45 committed on
Commit
c69a9b0
·
verified ·
1 Parent(s): 9cba09a

Upload tokenizer

Browse files
Files changed (2) hide show
  1. README.md +93 -93
  2. tokenizer_config.json +2 -1
README.md CHANGED
@@ -6,99 +6,99 @@ tags:
6
  - science
7
  license: apache-2.0
8
  language:
9
- - af
10
- - am
11
- - ar
12
- - as
13
- - az
14
- - be
15
- - bg
16
- - bn
17
- - br
18
- - bs
19
- - ca
20
- - cs
21
- - cy
22
- - da
23
- - de
24
- - el
25
- - en
26
- - eo
27
- - es
28
- - et
29
- - eu
30
- - fa
31
- - fi
32
- - fr
33
- - fy
34
- - ga
35
- - gd
36
- - gl
37
- - gu
38
- - ha
39
- - he
40
- - hi
41
- - hr
42
- - hu
43
- - hy
44
- - id
45
- - is
46
- - it
47
- - ja
48
- - jv
49
- - ka
50
- - kk
51
- - km
52
- - kn
53
- - ko
54
- - ku
55
- - ky
56
- - la
57
- - lo
58
- - lt
59
- - lv
60
- - mg
61
- - mk
62
- - ml
63
- - mn
64
- - mr
65
- - ms
66
- - my
67
- - ne
68
- - nl
69
- - 'no'
70
- - om
71
- - or
72
- - pa
73
- - pl
74
- - ps
75
- - pt
76
- - ro
77
- - ru
78
- - sa
79
- - sd
80
- - si
81
- - sk
82
- - sl
83
- - so
84
- - sq
85
- - sr
86
- - su
87
- - sv
88
- - sw
89
- - ta
90
- - te
91
- - th
92
- - tl
93
- - tr
94
- - ug
95
- - uk
96
- - ur
97
- - uz
98
- - vi
99
- - xh
100
- - yi
101
- - zh
102
  base_model:
103
  - distilbert/distilbert-base-multilingual-cased
104
  ---
 
6
  - science
7
  license: apache-2.0
8
  language:
9
+ - af
10
+ - am
11
+ - ar
12
+ - as
13
+ - az
14
+ - be
15
+ - bg
16
+ - bn
17
+ - br
18
+ - bs
19
+ - ca
20
+ - cs
21
+ - cy
22
+ - da
23
+ - de
24
+ - el
25
+ - en
26
+ - eo
27
+ - es
28
+ - et
29
+ - eu
30
+ - fa
31
+ - fi
32
+ - fr
33
+ - fy
34
+ - ga
35
+ - gd
36
+ - gl
37
+ - gu
38
+ - ha
39
+ - he
40
+ - hi
41
+ - hr
42
+ - hu
43
+ - hy
44
+ - id
45
+ - is
46
+ - it
47
+ - ja
48
+ - jv
49
+ - ka
50
+ - kk
51
+ - km
52
+ - kn
53
+ - ko
54
+ - ku
55
+ - ky
56
+ - la
57
+ - lo
58
+ - lt
59
+ - lv
60
+ - mg
61
+ - mk
62
+ - ml
63
+ - mn
64
+ - mr
65
+ - ms
66
+ - my
67
+ - ne
68
+ - nl
69
+ - 'no'
70
+ - om
71
+ - or
72
+ - pa
73
+ - pl
74
+ - ps
75
+ - pt
76
+ - ro
77
+ - ru
78
+ - sa
79
+ - sd
80
+ - si
81
+ - sk
82
+ - sl
83
+ - so
84
+ - sq
85
+ - sr
86
+ - su
87
+ - sv
88
+ - sw
89
+ - ta
90
+ - te
91
+ - th
92
+ - tl
93
+ - tr
94
+ - ug
95
+ - uk
96
+ - ur
97
+ - uz
98
+ - vi
99
+ - xh
100
+ - yi
101
+ - zh
102
  base_model:
103
  - distilbert/distilbert-base-multilingual-cased
104
  ---
tokenizer_config.json CHANGED
@@ -41,9 +41,10 @@
41
  "special": true
42
  }
43
  },
44
- "clean_up_tokenization_spaces": true,
45
  "cls_token": "[CLS]",
46
  "do_lower_case": false,
 
47
  "mask_token": "[MASK]",
48
  "model_max_length": 512,
49
  "pad_token": "[PAD]",
 
41
  "special": true
42
  }
43
  },
44
+ "clean_up_tokenization_spaces": false,
45
  "cls_token": "[CLS]",
46
  "do_lower_case": false,
47
+ "extra_special_tokens": {},
48
  "mask_token": "[MASK]",
49
  "model_max_length": 512,
50
  "pad_token": "[PAD]",