mgelard committed · Commit bff7684 · verified · 1 Parent(s): e839f31

Upload tokenizer

Files changed (3)
  1. special_tokens_map.json +1 -5
  2. tokenizer.py +1 -7
  3. tokenizer_config.json +1 -29
special_tokens_map.json CHANGED
@@ -1,5 +1 @@
-{
-  "cls_token": "<cls>",
-  "mask_token": "<mask>",
-  "pad_token": "<pad>"
-}
+{}
tokenizer.py CHANGED
@@ -30,7 +30,6 @@ class BinnedOmicTokenizer(PreTrainedTokenizer):
 
         ids_to_tokens = {i: tok for tok, i in vocab.items()}
 
-        # Save vocab attributes before superclass init
         self.vocab = vocab
         self.ids_to_tokens = ids_to_tokens
 
@@ -49,12 +48,7 @@ class BinnedOmicTokenizer(PreTrainedTokenizer):
         self.mask_token = "<mask>"
         self.cls_token = "<cls>"
 
-        super().__init__(
-            pad_token=self.pad_token,
-            mask_token=self.mask_token,
-            cls_token=self.cls_token,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 
     def _convert_token_to_id(self, token: str) -> int:
         return self.vocab.get(token, self.vocab[self.unk_token])
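For orientation, here is a minimal sketch of how the constructor reads after this commit, reconstructed from the hunks above. The constructor signature, the pad_token and unk_token assignments, and the vocab_size / get_vocab helpers are not visible in the diff and are assumptions, not part of this commit:

```python
# Minimal sketch reconstructed from the diff above; pieces marked "assumed"
# are not shown in this commit.
from typing import Dict

from transformers import PreTrainedTokenizer


class BinnedOmicTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab: Dict[str, int], **kwargs):  # assumed signature
        ids_to_tokens = {i: tok for tok, i in vocab.items()}

        self.vocab = vocab
        self.ids_to_tokens = ids_to_tokens

        # Special tokens are set as plain attributes; after this commit they
        # are no longer forwarded to the superclass constructor.
        self.pad_token = "<pad>"    # assumed, mirroring the old special_tokens_map.json
        self.unk_token = "<unk>"    # assumed; _convert_token_to_id references it
        self.mask_token = "<mask>"
        self.cls_token = "<cls>"

        super().__init__(**kwargs)

    # Standard PreTrainedTokenizer hooks, assumed to exist in the full file.
    @property
    def vocab_size(self) -> int:
        return len(self.vocab)

    def get_vocab(self) -> Dict[str, int]:
        return dict(self.vocab)

    def _convert_token_to_id(self, token: str) -> int:
        return self.vocab.get(token, self.vocab[self.unk_token])
```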
tokenizer_config.json CHANGED
@@ -1,30 +1,5 @@
 {
-  "added_tokens_decoder": {
-    "64": {
-      "content": "<pad>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "65": {
-      "content": "<mask>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "66": {
-      "content": "<cls>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
+  "added_tokens_decoder": {},
   "auto_map": {
     "AutoTokenizer": [
       "tokenizer.BinnedOmicTokenizer",
@@ -32,9 +7,6 @@
     ]
   },
   "clean_up_tokenization_spaces": true,
-  "cls_token": "<cls>",
-  "mask_token": "<mask>",
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<pad>",
   "tokenizer_class": "BinnedOmicTokenizer"
 }
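Since auto_map still points AutoTokenizer at the custom tokenizer.BinnedOmicTokenizer class, loading continues to require trust_remote_code. A hypothetical usage sketch; the repository id below is a placeholder, not taken from this commit:

```python
from transformers import AutoTokenizer

# Placeholder repo id (assumption); substitute the actual Hub repository.
tok = AutoTokenizer.from_pretrained(
    "user/repo-with-binned-omic-tokenizer",
    trust_remote_code=True,  # needed so the bundled tokenizer.py named in auto_map is used
)
print(type(tok).__name__)  # expected: BinnedOmicTokenizer
```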