add python code

Files changed (3) hide show

parse_tracklists.py +68 -0
runner.py +21 -0
trainer.py +125 -0

parse_tracklists.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import csv
+import re
def load_lines(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one stripped string per line, in file order. Blank
        lines are kept as empty strings.
    """
    # Decode explicitly as UTF-8 rather than the platform default so that
    # non-ASCII artist names are read identically on every machine.
    with open(filename, encoding="utf-8") as f:
        return [line.strip() for line in f]
def remove_titles_and_bad_tracks(lines):
    """Return only the lines that look like identified tracks.

    A line is kept when it starts with a digit and does not contain the
    "???" placeholder. Everything else is dropped.
    """
    starts_with_digit = re.compile(r"^\d.*")
    return [
        entry
        for entry in lines
        if starts_with_digit.match(entry) and "???" not in entry
    ]
def group_by_set(lines):
    """Group track lines into one list per set.

    A line ending in ":" is treated as a set title and closes the set
    collected so far, provided that set is non-empty. A line that starts
    with a digit, contains a ":" and has no "???" placeholder is collected
    as a track. Blank lines and anything else are ignored.

    Args:
        lines: Iterable of text lines (titles, tracks, blanks).

    Returns:
        A list of non-empty lists of track lines, in input order.
    """
    is_set_title = re.compile(r".*:$")
    is_track = re.compile(r"^\d.*:")
    grouped_lines = []
    current_set = []
    for line in lines:
        if not line.strip():
            continue
        if is_set_title.match(line) and current_set:
            grouped_lines.append(current_set)
            current_set = []
        elif is_track.match(line) and "???" not in line:
            current_set.append(line)
    # The last set has no following title line to close it, so flush it
    # here; otherwise its tracks would be silently dropped.
    if current_set:
        grouped_lines.append(current_set)
    return grouped_lines
def get_grouped_artists(grouped_lines):
    """Extract the artist name from every track line, keeping the grouping.

    Track lines are expected to look like "<number>: <artist> - <title>".
    Lines that do not match are skipped. Artist names are stripped and
    lower-cased.

    Returns:
        A list with one list of artist names per input group. A group
        with no matching lines yields an empty list.
    """
    track_pattern = re.compile(r"\d+\: (.+?) - .+?")
    grouped_artists = []
    for tracks in grouped_lines:
        matches = (track_pattern.match(track) for track in tracks)
        grouped_artists.append(
            [found.group(1).strip().lower() for found in matches if found]
        )
    return grouped_artists
def write_to_csv(filename, rows=None):
    """Write one CSV row per list of artist names.

    Args:
        filename: Path of the CSV file to create or overwrite.
        rows: Iterable of rows, each an iterable of artist names. When
            omitted, the module-level ``artist_names`` is used so that
            existing one-argument calls keep working.
    """
    if rows is None:
        # Backward-compatible fallback for callers that still rely on the
        # module-level variable instead of passing the data in.
        rows = artist_names
    # Write to the path that was passed in (previously a global was used
    # and the parameter ignored), encoded as UTF-8 so the output does not
    # depend on the platform default.
    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        csv.writer(csvfile).writerows(rows)
if __name__ == "__main__":
    # Raw tracklist dump in, one CSV row of artist names per set out.
    filename, output_filename = (
        "data/radio-original.txt",
        "data/artist-names-per-row.csv",
    )
    grouped_lines = group_by_set(load_lines(filename))
    # `artist_names` must stay a module-level name: write_to_csv looks it
    # up as a global when it is called with only the output path.
    artist_names = get_grouped_artists(grouped_lines)
    write_to_csv(output_filename)

runner.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import torch
+from trainer import CBOW, TextPreProcessor, make_context_vector
if __name__ == "__main__":
    artist_names = "data/artist-names-per-row.csv"
    model_path = "data/cbow-model-weights"

    # Rebuild the vocabulary from the CSV; CBOW sizes its layers from it,
    # and the saved weights are then loaded into those layers.
    vocab = TextPreProcessor(artist_names).build_vocab()
    model = CBOW(vocab)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    print("Loaded model")

    # Predict the artist for a two-artist context.
    context = ["ana roxanne", "bjork"]
    log_probs = model(make_context_vector(context, model.word_to_ix))
    best_index = torch.argmax(log_probs[0]).item()
    prediction = model.ix_to_word[best_index]
    print(f"Context: {context}\n")
    print(f"Prediction: {prediction}")

trainer.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import csv
+import torch
+from torchtext.vocab import build_vocab_from_iterator
class TextPreProcessor:
    """Turn a CSV of artist names (one row per set) into CBOW inputs.

    Args:
        input_file: Path of a UTF-8 CSV file; each row is a sequence of
            artist names.
        context_size: Number of neighbours taken on each side of a target.
            Defaults to 1.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """

    def __init__(self, input_file, context_size=1):
        if context_size < 1:
            raise ValueError("context_size must be at least 1")
        self.input_file = input_file
        self.context_size = context_size

    def build_training_data(self):
        """Return ``(context, target)`` pairs for every interior position.

        For each target with ``context_size`` neighbours on both sides, two
        pairs are produced: one with the context ordered before-then-after
        and one ordered after-then-before. Rows too short to hold a full
        window contribute nothing.
        """
        size = self.context_size
        data = []
        for row in self._generate_rows():
            for i in range(size, len(row) - size):
                before = row[i - size:i]
                after = row[i + 1:i + 1 + size]
                target = row[i]
                data.append((before + after, target))
                data.append((after + before, target))
        return data

    def build_vocab(self):
        """Build the vocabulary over all artist names, plus ``<unk>``."""
        rows_of_artists = self._generate_rows()
        our_vocab = build_vocab_from_iterator(
            rows_of_artists, specials=["<unk>"], min_freq=1
        )
        return our_vocab

    def _generate_rows(self):
        """Yield each CSV row as a list of lower-cased artist names."""
        with open(self.input_file, encoding="utf-8") as f:
            for row in csv.reader(f):
                # Normalise case here, in one place, so the vocabulary and
                # the training pairs always use the same spelling.
                yield [name.lower() for name in row]
class CBOW(torch.nn.Module):
    """Continuous bag-of-words model over a vocabulary.

    The context embeddings are summed, passed through a 128-unit ReLU
    layer, and projected to log-probabilities over the vocabulary.

    Args:
        vocab: Vocabulary object providing ``get_stoi()``, ``get_itos()``
            and ``len()``.
    """

    def __init__(self, vocab):
        super().__init__()
        # Hyper-parameters stored on the model; this class itself only
        # reads `embedding_dim`.
        self.num_epochs = 3
        self.context_size = 1  # 1 word to the left, 1 to the right
        self.embedding_dim = 100  # embedding vector size
        self.learning_rate = 0.001
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vocab = vocab
        self.word_to_ix = self.vocab.get_stoi()
        self.ix_to_word = self.vocab.get_itos()
        self.vocab_list = list(self.word_to_ix.keys())
        self.vocab_size = len(self.vocab)
        self.model = None
        # One embedding_dim-sized vector per vocabulary entry.
        self.embeddings = torch.nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear1 = torch.nn.Linear(self.embedding_dim, 128)
        self.activation_function1 = torch.nn.ReLU()
        # out: 1 x vocab_size
        self.linear2 = torch.nn.Linear(128, self.vocab_size)
        self.activation_function2 = torch.nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities (1 x vocab_size) for one context.

        Args:
            inputs: 1-D long tensor of context word indices.
        """
        # Sum the context embeddings in a single tensor op, then reshape
        # to a 1 x embedding_dim row for the linear layers.
        embeds = self.embeddings(inputs).sum(dim=0).view(1, -1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_embedding(self, word):
        """Return the 1 x embedding_dim embedding of ``word``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary mapping.
        """
        index = torch.tensor([self.word_to_ix[word]])
        return self.embeddings(index).view(1, -1)

    # Original (misspelled) method name, kept so existing callers still work.
    get_word_emdedding = get_word_embedding
def make_context_vector(context, word_to_ix):
    """Convert context words into a 1-D long tensor of vocabulary indices.

    Args:
        context: Iterable of words.
        word_to_ix: Mapping from word to integer index.

    Returns:
        A ``torch.long`` tensor with one index per context word.

    Raises:
        KeyError: If a word is missing and the mapping has no ``"<unk>"``
            entry to fall back on.
    """
    unk_index = word_to_ix.get("<unk>")
    idxs = []
    for w in context:
        if w in word_to_ix:
            idxs.append(word_to_ix[w])
        elif unk_index is not None:
            # Out-of-vocabulary word: use the unknown-token index instead
            # of failing the whole lookup.
            idxs.append(unk_index)
        else:
            raise KeyError(w)
    return torch.tensor(idxs, dtype=torch.long)
if __name__ == "__main__":
    artist_names = "data/artist-names-per-row.csv"
    model_path = "data/cbow-model-weights"

    preprocessor = TextPreProcessor(artist_names)
    training_data = preprocessor.build_training_data()
    cbow = CBOW(preprocessor.build_vocab())

    loss_function = torch.nn.NLLLoss()
    optimizer = torch.optim.SGD(cbow.parameters(), lr=0.001)

    # 50 epochs to start with; there is no single correct value here.
    for epoch in range(50):
        # Loss is accumulated over every training pair, so each epoch
        # performs exactly one optimizer step on the summed loss.
        total_loss = 0
        for context, target in training_data:
            log_probs = cbow(make_context_vector(context, cbow.word_to_ix))
            target_index = torch.tensor([cbow.word_to_ix[target]])
            # Negative log-likelihood of the true target word.
            total_loss += loss_function(log_probs, target_index)

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Log the epoch loss to see whether it decreases.
        print("end of epoch {} | loss {:2.3f}".format(epoch, total_loss))

    torch.save(cbow.state_dict(), model_path)
    print("saved model!")