import csv

import torch
from torchtext.vocab import build_vocab_from_iterator


class TextPreProcessor:
    def __init__(self, input_file):
        self.input_file = input_file
        self.context_size = 1

    def build_training_data(self):
        data = []
        for row in self._generate_rows():
            for i in range(self.context_size, len(row) - self.context_size):
                before = row[i - 1].lower()
                target = row[i].lower()
                after = row[i + 1].lower()
                context_one = [before, after]
                context_two = [after, before]
                data.append((context_one, target))
                data.append((context_two, target))
        return data

    def build_vocab(self):
        rows_of_artists = self._generate_rows()
        our_vocab = build_vocab_from_iterator(
            rows_of_artists, specials=["<unk>"], min_freq=1
        )
        return our_vocab

    def _generate_rows(self):
        with open(self.input_file, encoding="utf-8") as f:
            reader = csv.reader(f)
            for row in reader:
                yield row

class CBOW(torch.nn.Module):
    def __init__(self, vocab):
        super(CBOW, self).__init__()
        self.num_epochs = 3
        self.context_size = 1  # 1 word to the left, 1 to the right
        self.embedding_dim = 100  # embedding vector size
        self.learning_rate = 0.001
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vocab = vocab
        self.word_to_ix = self.vocab.get_stoi()
        self.ix_to_word = self.vocab.get_itos()
        self.vocab_list = list(self.vocab.get_stoi().keys())
        self.vocab_size = len(self.vocab)
        self.model = None

        # out: 1 x embedding_dim
        # initialize an Embedding matrix based on our vocabulary
        self.embeddings = torch.nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear1 = torch.nn.Linear(self.embedding_dim, 128)
        self.activation_function1 = torch.nn.ReLU()

        # out: 1 x vocab_size
        self.linear2 = torch.nn.Linear(128, self.vocab_size)
        self.activation_function2 = torch.nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        # sum the context-word embeddings into a single 1 x embedding_dim vector
        embeds = sum(self.embeddings(inputs)).view(1, -1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_embedding(self, word):
        word = torch.tensor([self.word_to_ix[word]])
        # Embeddings lookup of a single word,
        # once the Embeddings layer has been optimized
        return self.embeddings(word).view(1, -1)

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

if __name__ == "__main__":
    artist_names = "data/artist-names-per-row.csv"
    model_path = "data/cbow-model-weights"

    text = TextPreProcessor(artist_names)
    training_data = text.build_training_data()
    vocab = text.build_vocab()

    cbow = CBOW(vocab)
    loss_function = torch.nn.NLLLoss()
    optimizer = torch.optim.SGD(cbow.parameters(), lr=0.001)

    # 50 epochs to start with, no correct answer here
    for epoch in range(50):
        # track how far off our predictions are across the epoch
        total_loss = 0
        # for each (context, target) pair in the training data:
        for context, target in training_data:
            context_vector = make_context_vector(context, cbow.word_to_ix)
            # forward pass: log probabilities over the whole vocabulary
            log_probs = cbow(context_vector)
            # compare the predicted probabilities against the actual target word
            total_loss += loss_function(
                log_probs, torch.tensor([cbow.word_to_ix[target]])
            )

        # optimize at the end of each epoch
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Log out some metrics to see if loss decreases
        print("end of epoch {} | loss {:2.3f}".format(epoch, total_loss))

    torch.save(cbow.state_dict(), model_path)
    print("saved model!")