import csv

import torch
from torchtext.vocab import build_vocab_from_iterator


class TextPreProcessor:
    def __init__(self, input_file):
        self.input_file = input_file
        self.context_size = 1

    def build_training_data(self):
        data = []
        for row in self._generate_rows():
            for i in range(self.context_size, len(row) - self.context_size):
                before = row[i - 1].lower()
                target = row[i].lower()
                after = row[i + 1].lower()
                # add both orderings of the context words for each target
                context_one = [before, after]
                context_two = [after, before]
                data.append((context_one, target))
                data.append((context_two, target))
        return data

    def build_vocab(self):
        rows_of_artists = self._generate_rows()
        our_vocab = build_vocab_from_iterator(
            rows_of_artists, specials=[""], min_freq=1
        )
        return our_vocab

    def _generate_rows(self):
        with open(self.input_file, encoding="utf-8") as f:
            reader = csv.reader(f)
            for row in reader:
                yield row


class CBOW(torch.nn.Module):
    def __init__(self, vocab):
        super(CBOW, self).__init__()
        self.num_epochs = 3
        self.context_size = 1  # 1 word to the left, 1 to the right
        self.embedding_dim = 100  # embedding vector size
        self.learning_rate = 0.001
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vocab = vocab
        self.word_to_ix = self.vocab.get_stoi()
        self.ix_to_word = self.vocab.get_itos()
        self.vocab_list = list(self.vocab.get_stoi().keys())
        self.vocab_size = len(self.vocab)
        self.model = None

        # out: 1 x embedding_dim
        # initialize an Embedding matrix based on our inputs
        self.embeddings = torch.nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear1 = torch.nn.Linear(self.embedding_dim, 128)
        self.activation_function1 = torch.nn.ReLU()

        # out: 1 x vocab_size
        self.linear2 = torch.nn.Linear(128, self.vocab_size)
        self.activation_function2 = torch.nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        # sum the context word embeddings into a single 1 x embedding_dim vector
        embeds = sum(self.embeddings(inputs)).view(1, -1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_embedding(self, word):
        word = torch.tensor([self.word_to_ix[word]])
        # Embeddings lookup of a single word,
        # once the Embeddings layer has been optimized
        return self.embeddings(word).view(1, -1)


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


if __name__ == "__main__":
    artist_names = "data/artist-names-per-row.csv"
    model_path = "data/cbow-model-weights"

    text = TextPreProcessor(artist_names)
    training_data = text.build_training_data()
    vocab = text.build_vocab()

    cbow = CBOW(vocab)
    loss_function = torch.nn.NLLLoss()
    optimizer = torch.optim.SGD(cbow.parameters(), lr=0.001)

    # 50 epochs to start with; there is no single correct number here
    for epoch in range(50):
        # we track how far off our predictions are across the epoch
        total_loss = 0

        # for each (context, target) pair in the training data:
        for context, target in training_data:
            context_vector = make_context_vector(context, cbow.word_to_ix)

            # we run the forward pass to get log-probabilities over the vocabulary
            log_probs = cbow(context_vector)

            # we compare the predicted probabilities against the index of the
            # actual target word and accumulate the loss
            total_loss += loss_function(
                log_probs, torch.tensor([cbow.word_to_ix[target]])
            )

        # optimize at the end of each epoch
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Log out some metrics to see if loss decreases
        print("end of epoch {} | loss {:2.3f}".format(epoch, total_loss))

    torch.save(cbow.state_dict(), model_path)
    print("saved model!")
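
# A minimal sketch (not part of the original training run) of how the saved
# weights could be reloaded later for embedding lookups. The helper name
# load_trained_cbow is an assumption for illustration; it rebuilds the
# vocabulary from the same CSV so the word indices line up with the checkpoint.
def load_trained_cbow(weights_path="data/cbow-model-weights",
                      artist_csv="data/artist-names-per-row.csv"):
    vocab = TextPreProcessor(artist_csv).build_vocab()
    model = CBOW(vocab)
    model.load_state_dict(torch.load(weights_path))
    model.eval()  # lookups only; no further training
    # e.g. model.get_word_embedding(some_word_in_vocab) then returns
    # a 1 x embedding_dim tensor for that word
    return model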