import json
import os

from transformers import PreTrainedTokenizer


class NomTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer backed by a JSON vocabulary file."""

    vocab_files_names = {'vocab_file': 'vocab.json'}

    def __init__(
        self,
        vocab_file,
        unk_token='<UNK>',
        unk_token_id=0,
        id_start=1,
        **kwargs
    ):
        self.vocab_file = vocab_file
        self.id_start = id_start
        # The unknown token doubles as the padding token; both share the same id.
        self.unk_token = unk_token
        self.unk_token_id = unk_token_id
        self.pad_token = unk_token
        self.pad_token_id = unk_token_id

        # Load the vocabulary (a JSON object keyed by character) and build the
        # char <-> id lookup tables. Ids start at `id_start` so that the
        # <UNK>/pad id stays reserved.
        with open(vocab_file, 'r', encoding='utf-8') as f:
            self.vocab_dict = json.load(f)
        self.char2id = {}
        self.id2char = {}
        for i, char in enumerate(self.vocab_dict.keys(), start=id_start):
            self.char2id[char] = i
            self.id2char[i] = char
        # Also keep each id's original vocabulary value for later lookup.
        self.id_to_options = {idx: v for idx, v in enumerate(self.vocab_dict.values(), start=id_start)}

        super().__init__(**kwargs)

    def _tokenize(self, text):
        # Character-level tokenization: every character is its own token.
        return list(text)

    def _convert_token_to_id(self, token):
        return self.char2id.get(token, self.unk_token_id)

    def _convert_id_to_token(self, index):
        if index == self.unk_token_id:
            return self.unk_token
        return self.id2char.get(index, self.unk_token)

    @property
    def vocab_size(self):
        # +1 accounts for the reserved <UNK>/pad id.
        return len(self.char2id) + 1

    def get_vocab(self):
        vocab = {**self.char2id, **self.added_tokens_encoder}
        return vocab

    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Write the original vocabulary JSON back out, returning the file path
        # as a tuple per the `save_vocabulary` contract.
        if filename_prefix:
            vocab_file = os.path.join(save_directory, f'{filename_prefix}-vocab.json')
        else:
            vocab_file = os.path.join(save_directory, 'vocab.json')
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.vocab_dict, f, ensure_ascii=False)
        return (vocab_file,)