<a href="https://colab.research.google.com/drive/1lqS67-mbCspIKzx6y9wn7CuP96utWzP2?usp=sharing" target="_blank"><img align="left" alt="Colab" title="Open in Colab" src="https://colab.research.google.com/assets/colab-badge.svg"></a>


# Continuous Bag of Words (CBOW) Text Classifier

The code below implements a continuous bag of words text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- The lookup allows for extracting embeddings for each tokenized input
- The embedding vectors are added together
- The resulting vector is multiplied by a weight matrix and a bias vector is added to the product; this yields the scores
- A softmax is applied to the scores to generate probabilities, which are used for the final classification

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

![img txt](https://github.com/dair-ai/ML-Notebooks/blob/main/img/cbow.png?raw=true)

In [None]:
import torch
import random
import torch.nn as nn

In [None]:
%%capture

# download the dataset splits into the working directory
# (-nc skips a file that already exists, so re-running this cell does not
# create dev.txt.1, test.txt.1, ... next to the originals)
!wget -nc https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget -nc https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget -nc https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt

# create the data folders
# (-p creates data/ and data/classes/ in one go and succeeds if they exist)
!mkdir -p data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes

## Read and Process Data

In [None]:
def read_data(filename):
    """Read a ``tag ||| text`` file into a list of lower-cased records.

    Every non-blank line is lower-cased, stripped of surrounding whitespace
    and split on the literal separator ``" ||| "``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line. Each entry is the list of
        fields produced by the split (``[tag, text]`` for a line that
        contains the separator exactly once).
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # A blank line would become [''] and break every consumer
                # that indexes field 1, so it is skipped here.
                continue
            data.append(line.split(' ||| '))
    return data

# load the raw records of the training and test splits; each record is the
# list of fields returned by read_data (field 0 is later used as the tag,
# field 1 as the space-separated sentence)
train_data = read_data('data/classes/train.txt')
test_data = read_data('data/classes/test.txt')

# lookup tables shared by the helpers below:
#   tag_to_index  maps a tag string to its integer id (starts empty)
#   word_to_index maps a word to its integer id; "<unk>" is registered
#                 up front and therefore owns id 0
tag_to_index = {}
word_to_index = {"<unk>": 0}

def create_dict(data, check_unk=False):
    """Populate the module-level ``word_to_index`` and ``tag_to_index`` tables.

    Args:
        data: Iterable of records where ``record[0]`` is the tag and
            ``record[1]`` is the sentence as a space-separated string.
        check_unk: When false (the default), every word not seen before is
            given a fresh id equal to the current ``len(word_to_index)``.
            When true, a word not seen before is instead mapped to the id
            already held by ``"<unk>"``.

    Returns:
        None. Both dictionaries are updated in place.
    """
    for line in data:
        for word in line[1].split(" "):
            if word in word_to_index:
                continue
            if check_unk:
                # NOTE: the word is still inserted as a key, so
                # len(word_to_index) grows even though no new id is handed out.
                word_to_index[word] = word_to_index["<unk>"]
            else:
                word_to_index[word] = len(word_to_index)

        # unseen tags always receive a fresh id, whichever mode is active
        if line[0] not in tag_to_index:
            tag_to_index[line[0]] = len(tag_to_index)

# first pass: words and tags from the training split receive fresh ids;
# second pass: words that only occur in the test split are registered with
# the id of "<unk>" (tags that are new in the test split still get fresh ids)
create_dict(train_data)
create_dict(test_data, check_unk=True)

def create_tensor(data):
    """Yield one ``(word_ids, tag_id)`` pair per record in ``data``.

    Despite the name, the yielded values are plain Python objects (a list of
    ints and an int); conversion to ``torch`` tensors is left to the caller.

    Args:
        data: Iterable of records where ``record[0]`` is the tag and
            ``record[1]`` is the sentence as a space-separated string.

    Yields:
        ``(word_ids, tag_id)`` where ``word_ids`` holds the
        ``word_to_index`` id of every word in the sentence, in order.

    Raises:
        KeyError: If a record's tag is missing from ``tag_to_index``.
    """
    # Words that were never registered fall back to the "<unk>" id instead
    # of aborting the whole encoding pass with a KeyError.
    unk_index = word_to_index["<unk>"]
    for line in data:
        word_ids = [word_to_index.get(word, unk_index) for word in line[1].split(" ")]
        yield (word_ids, tag_to_index[line[0]])

# replace the raw records with their encoded form: a list of
# (word id list, tag id) pairs, materialized from the create_tensor generator
train_data = list(create_tensor(train_data))
test_data = list(create_tensor(test_data))

# sizes of the two lookup tables (number of keys in each dictionary);
# they are passed to the model constructor below
number_of_words = len(word_to_index)
number_of_tags = len(tag_to_index)

## Model

In [None]:
# select the compute device: "cuda" when torch reports an available GPU,
# otherwise fall back to "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"

class CBoW(torch.nn.Module):
    """Continuous bag-of-words classifier: summed embeddings + linear layer.

    Args:
        nwords: Number of rows in the embedding table.
        ntags: Number of output scores produced per input.
        emb_size: Width of each embedding vector.
    """

    def __init__(self, nwords, ntags, emb_size):
        super().__init__()

        # embedding lookup followed by an affine projection (weight + bias)
        self.embedding = nn.Embedding(nwords, emb_size)
        self.linear = nn.Linear(emb_size, ntags)

        # re-initialize both weight matrices with Xavier-uniform values;
        # the linear layer's bias keeps its default initialization
        for layer in (self.embedding, self.linear):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return a ``1 x ntags`` score tensor for one sequence of word ids.

        ``x`` is indexed into the embedding table, the result is summed over
        dimension 0, flattened to a single row and passed through the linear
        layer. No softmax is applied here.
        """
        pooled = self.embedding(x).sum(dim=0)  # seq x emb_size -> emb_size
        return self.linear(pooled.view(1, -1))  # 1 x emb_size -> 1 x ntags

# embedding width used for the model instance below
EMB_SIZE = 64

# model, loss and optimizer (Adam with its default settings)
model = CBoW(number_of_words, number_of_tags, EMB_SIZE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# tensor type used to cast inputs; with CUDA available the model is moved to
# the selected device and the GPU variant of the type is used.
# NOTE: this name shadows the builtin `type`; it is kept so existing
# references to it keep working.
if torch.cuda.is_available():
    model.to(device)
    type = torch.cuda.LongTensor
else:
    type = torch.LongTensor

In [None]:
# Train the CBoW model for 10 epochs, one sentence per optimizer step, and
# report the mean training loss plus train/test accuracy after every epoch.

for epoch in range(10):
    # ---- training pass ----
    model.train()
    random.shuffle(train_data)  # in-place shuffle: new sentence order each epoch
    total_loss = 0.0
    train_correct = 0
    for sentence, tag in train_data:
        # build the inputs directly with the right dtype on the model's device
        words = torch.tensor(sentence, dtype=torch.long, device=device)
        target = torch.tensor([tag], dtype=torch.long, device=device)
        output = model(words)
        # the prediction is taken from the scores computed before this
        # step's parameter update
        predicted = torch.argmax(output.detach()).item()

        loss = criterion(output, target)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # compare plain ints (tag is left untouched as the dataset's int id)
        if predicted == tag:
            train_correct += 1

    # ---- evaluation pass ----
    model.eval()
    test_correct = 0
    # no gradients are needed for evaluation, so skip building the graph
    with torch.no_grad():
        for sentence, tag in test_data:
            words = torch.tensor(sentence, dtype=torch.long, device=device)
            output = model(words)
            predicted = torch.argmax(output).item()
            if predicted == tag:
                test_correct += 1

    # print model performance results
    log = f'epoch: {epoch+1} | ' \
        f'train loss/sent: {total_loss/len(train_data):.4f} | ' \
        f'train accuracy: {train_correct/len(train_data):.4f} | ' \
        f'test accuracy: {test_correct/len(test_data):.4f}'
    print(log)

epoch: 1 | train loss/sent: 1.4089 | train accuracy: 0.3826 | test accuracy: 0.4149
epoch: 2 | train loss/sent: 0.9089 | train accuracy: 0.6358 | test accuracy: 0.4104
epoch: 3 | train loss/sent: 0.5298 | train accuracy: 0.8076 | test accuracy: 0.3837
epoch: 4 | train loss/sent: 0.3289 | train accuracy: 0.8864 | test accuracy: 0.3670
epoch: 5 | train loss/sent: 0.2179 | train accuracy: 0.9254 | test accuracy: 0.3851
epoch: 6 | train loss/sent: 0.1529 | train accuracy: 0.9467 | test accuracy: 0.3774
epoch: 7 | train loss/sent: 0.1131 | train accuracy: 0.9594 | test accuracy: 0.3774
epoch: 8 | train loss/sent: 0.0835 | train accuracy: 0.9719 | test accuracy: 0.3643
epoch: 9 | train loss/sent: 0.0594 | train accuracy: 0.9795 | test accuracy: 0.3566
epoch: 10 | train loss/sent: 0.0477 | train accuracy: 0.9837 | test accuracy: 0.3706
