# Author: Kavin Arasu
# First model version (commit 68694b3)
import torch
import pickle
# Character which represents the start and end of a word.
TOKEN = '.'

# Read the names into a list. Use a context manager so the file handle is
# closed deterministically (the original open(...).read() never closed it,
# and the pickle write below already uses `with` — this makes it consistent).
with open('data/names.txt', 'r') as f:
    words = f.read().splitlines()

# Build a sorted vocabulary from every character seen, plus the boundary token.
# (sorted() accepts any iterable, so the intermediate list() was redundant.)
vocab = sorted(set(''.join(words)) | {TOKEN})

# Bigram count table: N[i, j] = number of times character i is followed by j.
n = len(vocab)
N = torch.zeros((n, n), dtype=torch.int32)

# Mappings between characters and their integer indices (and back).
char_to_int = {char: i for i, char in enumerate(vocab)}
int_to_char = {value: key for key, value in char_to_int.items()}

# Populate the bigram table with counts. Each word is padded with the
# boundary token so start-of-word and end-of-word transitions are counted.
for word in words:
    chars = [TOKEN] + list(word) + [TOKEN]
    for ch1, ch2 in zip(chars, chars[1:]):
        ix1 = char_to_int[ch1]
        ix2 = char_to_int[ch2]
        N[ix1, ix2] += 1

# Normalise each row of counts into a probability distribution over the
# next character. NOTE(review): a row that is all zeros (a character that
# never appears as a bigram prefix) would divide by zero and yield NaNs;
# with the boundary token every vocab character should occur, but consider
# add-one smoothing (N + 1) if that assumption ever breaks.
P = N.float()
P /= P.sum(1, keepdim=True)

# Persist the model: probability matrix plus both index mappings.
with open('model/bigrams.pkl', 'wb') as file:
    pickle.dump([P, char_to_int, int_to_char], file)