import torch


class CharacterLevelTokenizer:
    """Map each distinct symbol of a corpus to an integer id and back.

    The vocabulary is the sorted set of distinct items found in ``data``;
    a symbol's id is its position in that sorted order.

    Attributes:
        data: The corpus passed to the constructor, stored unchanged.
        vocab: Sorted list of the distinct symbols in ``data``.
        VOCAB_SIZE: Number of entries in ``vocab``.
        i_s: Dict mapping integer id -> symbol.
        s_i: Dict mapping symbol -> integer id.
    """

    def __init__(self, data):
        """Build the vocabulary and both lookup tables from ``data``.

        Args:
            data: Iterable of hashable, mutually orderable symbols
                (presumably a text string, giving a per-character
                vocabulary -- confirm against callers).
        """
        self.data = data
        self.vocab = sorted(set(self.data))
        self.VOCAB_SIZE = len(self.vocab)
        self.i_s = dict(enumerate(self.vocab))
        self.s_i = {symbol: index for index, symbol in enumerate(self.vocab)}

    def encode(self, s) -> torch.Tensor:
        """Convert a sequence of symbols into a 1-D ``torch.long`` tensor of ids.

        Args:
            s: Iterable of symbols, each of which must be in the vocabulary.

        Returns:
            Tensor of dtype ``torch.long`` with one id per symbol of ``s``.

        Raises:
            KeyError: If a symbol of ``s`` is not in the vocabulary.
        """
        return torch.tensor([self.s_i[c] for c in s], dtype=torch.long)

    def decode(self, s) -> str:
        """Convert a sequence of ids back into a string.

        Args:
            s: Iterable of ids. Elements may be 0-dim tensors (as produced
                by iterating a 1-D tensor) or plain Python ints (for
                example the result of ``tensor.tolist()``).

        Returns:
            The symbols for the given ids, concatenated in order.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        # Only unwrap elements that expose ``.item()`` so that plain ints
        # are accepted as well as tensor elements.
        return ''.join(
            self.i_s[i.item() if hasattr(i, 'item') else i] for i in s
        )