# NomBert-hn2qn-v0.1 / tokenization_nombert.py
# Author: CjangCjengh (commit fdfbe63, "upload files")
import json
import os
from transformers import PreTrainedTokenizer
class NomTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for Nom-script text.

    Every character of the input is one token. Ids are assigned from the
    keys of ``vocab.json`` starting at ``id_start``; ``unk_token_id``
    (default 0) is reserved for the unknown token, which also serves as
    the padding token.
    """

    vocab_files_names = {'vocab_file': 'vocab.json'}

    def __init__(
        self,
        vocab_file,
        unk_token='<UNK>',
        unk_token_id=0,
        id_start=1,
        **kwargs
    ):
        """Load the vocabulary and build the char<->id lookup tables.

        Args:
            vocab_file: Path to a JSON file mapping each character to its
                options (the values are kept in ``id_to_options``).
            unk_token: Token used for unknown characters (also pad token).
            unk_token_id: Id reserved for the unknown/pad token.
            id_start: First id assigned to vocabulary characters.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        self.vocab_file = vocab_file
        self.id_start = id_start
        self.unk_token = unk_token
        self.unk_token_id = unk_token_id
        # The unk token doubles as the padding token.
        self.pad_token = unk_token
        self.pad_token_id = unk_token_id
        with open(vocab_file, 'r', encoding='utf-8') as f:
            self.vocab_dict = json.load(f)
        # Build lookups BEFORE super().__init__: the base class may call
        # tokenization methods that depend on them.
        self.char2id = {}
        self.id2char = {}
        for i, char in enumerate(self.vocab_dict.keys(), start=id_start):
            self.char2id[char] = i
            self.id2char[i] = char
        # Parallel table: id -> the options recorded for that character.
        self.id_to_options = {idx: v for idx, v in enumerate(self.vocab_dict.values(), start=id_start)}
        super().__init__(**kwargs)

    def _tokenize(self, text):
        """Split *text* into a list of single characters."""
        return list(text)

    def _convert_token_to_id(self, token):
        """Map a character to its id, falling back to the unk id."""
        return self.char2id.get(token, self.unk_token_id)

    def _convert_id_to_token(self, index):
        """Map an id back to its character, falling back to the unk token."""
        if index == self.unk_token_id:
            return self.unk_token
        return self.id2char.get(index, self.unk_token)

    @property
    def vocab_size(self):
        """Number of ids in use, including the reserved unk/pad id."""
        return len(self.char2id) + 1

    def get_vocab(self):
        """Return the complete token -> id mapping.

        Fix: the unk/pad token was previously omitted, making
        ``len(get_vocab())`` disagree with ``vocab_size``.
        """
        vocab = {self.unk_token: self.unk_token_id, **self.char2id, **self.added_tokens_encoder}
        return vocab

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary JSON into *save_directory*.

        Returns:
            A one-tuple containing the path of the written vocab file.
        """
        # Fix: create the target directory if it does not exist yet,
        # matching the behavior of stock HF tokenizers.
        os.makedirs(save_directory, exist_ok=True)
        if filename_prefix:
            vocab_file = os.path.join(save_directory, f'{filename_prefix}-vocab.json')
        else:
            vocab_file = os.path.join(save_directory, 'vocab.json')
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.vocab_dict, f, ensure_ascii=False)
        return (vocab_file,)