ShakespeareGPT / src /data /tokenizer.py
nikhiljais's picture
Upload 19 files
b57fe5a verified
raw
history blame contribute delete
411 Bytes
class CharacterTokenizer:
    """Character-level tokenizer built from the unique characters of a corpus.

    Each distinct character in ``text`` is assigned an integer id, with ids
    given in sorted-character order, so encode/decode round-trips exactly
    for any string drawn from the training alphabet.
    """

    def __init__(self, text: str) -> None:
        """Build the vocabulary from every distinct character in ``text``."""
        # sorted() accepts any iterable directly; the list() wrapper was redundant.
        chars = sorted(set(text))
        self.vocab_size = len(chars)
        # stoi (string-to-int) and itos (int-to-string) are inverse mappings.
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}

    def encode(self, s: str) -> list[int]:
        """Convert a string to a list of token ids.

        Raises:
            KeyError: if ``s`` contains a character absent from the
                training text (behavior unchanged from the original).
        """
        return [self.stoi[c] for c in s]

    def decode(self, l: list[int]) -> str:
        """Convert a list of token ids back into a string."""
        # Generator feeds join directly; no intermediate list is built.
        return "".join(self.itos[i] for i in l)