import csv

import torch
from torchtext.vocab import build_vocab_from_iterator


class TextPreProcessor:
    def __init__(self, input_file):
        self.input_file = input_file
        self.context_size = 1

    def build_training_data(self):
        """Build (context, target) pairs with a sliding window over each row.

        For a row like ["daft", "punk", "justice"], the pairs are
        (["daft", "justice"], "punk") and (["justice", "daft"], "punk").
        """
        data = []
        for row in self._generate_rows():
            # Tokens are already lowercased by _generate_rows().
            for i in range(self.context_size, len(row) - self.context_size):
                before = row[i - self.context_size]
                target = row[i]
                after = row[i + self.context_size]

                # The model sums the context embeddings, so the order of the
                # context words is irrelevant; appending both orderings just
                # duplicates each training example.
                context_one = [before, after]
                context_two = [after, before]
                data.append((context_one, target))
                data.append((context_two, target))

        return data

    def build_vocab(self):
        rows_of_artists = self._generate_rows()
        our_vocab = build_vocab_from_iterator(
            rows_of_artists, specials=["<unk>"], min_freq=1
        )
        # Without a default index, looking up an out-of-vocabulary token
        # raises an error; point unknown tokens at <unk> instead.
        our_vocab.set_default_index(our_vocab["<unk>"])
        return our_vocab

    def _generate_rows(self):
        # Lowercase every token here so the vocabulary and the training
        # pairs agree on case.
        with open(self.input_file, encoding="utf-8") as f:
            for row in csv.reader(f):
                yield [token.lower() for token in row]

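
# NOTE: an assumption, not something stated in the source -- each CSV row is
# expected to be a comma-separated sequence of tokens, e.g.
#
#     daft,punk,justice,air
#
# so the sliding window above reads neighbouring tokens within one row.
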
class CBOW(torch.nn.Module):
    def __init__(self, vocab):
        super().__init__()
        self.num_epochs = 50  # full passes over the training data
        self.context_size = 1
        self.embedding_dim = 100
        self.learning_rate = 0.001
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.vocab = vocab
        self.word_to_ix = self.vocab.get_stoi()
        self.ix_to_word = self.vocab.get_itos()
        self.vocab_list = list(self.word_to_ix.keys())
        self.vocab_size = len(self.vocab)

        # Two-layer network over the summed context embeddings: a 128-unit
        # hidden layer, then an output layer that scores every vocabulary
        # word as a candidate centre word.
        self.embeddings = torch.nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear1 = torch.nn.Linear(self.embedding_dim, 128)
        self.activation_function1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(128, self.vocab_size)
        self.activation_function2 = torch.nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        # inputs: LongTensor of context word indices, shape (2 * context_size,).
        # Sum the context embeddings into one (1, embedding_dim) vector, then
        # return log-probabilities over the whole vocabulary.
        embeds = self.embeddings(inputs).sum(dim=0).view(1, -1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_embedding(self, word):
        word = torch.tensor([self.word_to_ix[word]])
        return self.embeddings(word).view(1, -1)
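
    def most_similar(self, word, k=5):
        # Illustrative addition, not part of the original model: the k nearest
        # neighbours of `word` in embedding space by cosine similarity.
        query = self.get_word_embedding(word)
        sims = torch.nn.functional.cosine_similarity(query, self.embeddings.weight)
        best = sims.topk(k + 1).indices.tolist()  # k + 1: the word matches itself
        return [self.ix_to_word[ix] for ix in best if self.ix_to_word[ix] != word][:k]
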
def make_context_vector(context, word_to_ix):
    # Turn a list of context words into a LongTensor of vocabulary indices.
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


if __name__ == "__main__":
    artist_names = "data/artist-names-per-row.csv"
    model_path = "data/cbow-model-weights"
    text = TextPreProcessor(artist_names)
    training_data = text.build_training_data()
    vocab = text.build_vocab()
    cbow = CBOW(vocab)
    cbow.to(cbow.device)

    loss_function = torch.nn.NLLLoss()
    optimizer = torch.optim.SGD(cbow.parameters(), lr=cbow.learning_rate)

    for epoch in range(cbow.num_epochs):
        total_loss = 0

        for context, target in training_data:
            context_vector = make_context_vector(context, cbow.word_to_ix).to(cbow.device)
            log_probs = cbow(context_vector)
            total_loss += loss_function(
                log_probs, torch.tensor([cbow.word_to_ix[target]], device=cbow.device)
            )

        # One full-batch update per epoch: the losses for every example are
        # summed and backpropagated together.
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        print("end of epoch {} | loss {:2.3f}".format(epoch, total_loss))

    torch.save(cbow.state_dict(), model_path)
    print("saved model!")