File size: 1,999 Bytes
fdfbe63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
import os
from transformers import PreTrainedTokenizer


class NomTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer backed by a JSON vocabulary file.

    The vocab file maps each character to an associated value ("options");
    ids are assigned sequentially starting at ``id_start``, and id 0 is
    reserved for the unknown token, which also doubles as the pad token.
    """

    vocab_files_names = {'vocab_file': 'vocab.json'}

    def __init__(
        self,
        vocab_file,
        unk_token='<UNK>',
        unk_token_id=0,
        id_start=1,
        **kwargs
    ):
        """Load the vocab JSON and build char<->id lookup tables.

        Args:
            vocab_file: Path to a JSON file mapping characters to values.
            unk_token: Token used for unknown characters (also the pad token).
            unk_token_id: Id of the unknown/pad token.
            id_start: First id assigned to vocabulary characters.
            **kwargs: Forwarded to ``PreTrainedTokenizer``.
        """
        self.vocab_file = vocab_file
        self.id_start = id_start
        self.unk_token = unk_token
        self.unk_token_id = unk_token_id
        # The unknown token doubles as the padding token.
        self.pad_token = unk_token
        self.pad_token_id = unk_token_id

        with open(vocab_file, 'r', encoding='utf-8') as f:
            self.vocab_dict = json.load(f)

        # Ids are assigned in vocab-file order, starting at `id_start` so
        # that ids below it (the unk/pad id) stay reserved.
        self.char2id = {}
        self.id2char = {}
        for i, char in enumerate(self.vocab_dict.keys(), start=id_start):
            self.char2id[char] = i
            self.id2char[i] = char
        self.id_to_options = {
            idx: v
            for idx, v in enumerate(self.vocab_dict.values(), start=id_start)
        }

        # NOTE: super().__init__ must run AFTER the lookup tables exist —
        # the base class may call get_vocab()/vocab_size during init.
        super().__init__(**kwargs)

    def _tokenize(self, text):
        """Split text into individual characters."""
        return list(text)

    def _convert_token_to_id(self, token):
        """Map a character to its id, falling back to the unk id."""
        return self.char2id.get(token, self.unk_token_id)

    def _convert_id_to_token(self, index):
        """Map an id back to its character, falling back to the unk token."""
        if index == self.unk_token_id:
            return self.unk_token
        return self.id2char.get(index, self.unk_token)

    @property
    def vocab_size(self):
        """Number of ids in use: every vocab character plus the unk/pad token."""
        return len(self.char2id) + 1

    def get_vocab(self):
        """Return the full token->id mapping, including unk and added tokens."""
        # Include the unk/pad token so the mapping is consistent with
        # vocab_size, which counts it.
        vocab = {
            self.unk_token: self.unk_token_id,
            **self.char2id,
            **self.added_tokens_encoder,
        }
        return vocab

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocab JSON into ``save_directory``.

        Returns:
            A 1-tuple with the path of the written vocab file (HF convention).
        """
        # Be robust to a not-yet-created output directory.
        os.makedirs(save_directory, exist_ok=True)
        if filename_prefix:
            vocab_file = os.path.join(save_directory, f'{filename_prefix}-vocab.json')
        else:
            vocab_file = os.path.join(save_directory, 'vocab.json')

        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.vocab_dict, f, ensure_ascii=False)

        return (vocab_file,)