{ "cells": [ { "cell_type": "raw", "metadata": {}, "source": [ "---\n", "title: 10 Deep Continuous Bag of Words (Deep CBOW) Text Classifier\n", "description: Build a deep continuous bag of words text classifier\n", "---" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"Colab\"" ] }, { "cell_type": "markdown", "metadata": { "id": "B8m-hOTiIQdz" }, "source": [ "# Deep Continuous Bag of Words (Deep CBOW) Text Classifier\n", "\n", "The code below implements a continuous bag of words text classifier.\n", "- We tokenize the text, create a vocabulary and encode each piece of text in the dataset\n", "- We create embeddings for inputs and sum them together\n", "- The resulting vector is fed to hidden neural network, which generates a new vector that is multiplied to a weights matrix\n", "- We then add the bias and obtain scores\n", "- The scores are applied a softmax to generate probabilities which are used for the final classification\n", "\n", "The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). \n", "\n", "![img txt](https://github.com/dair-ai/ML-Notebooks/blob/main/img/deep_cbow.png?raw=true)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nfqATQzlIJ-k" }, "outputs": [], "source": [ "import torch\n", "import random\n", "import torch.nn as nn" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-4qY1e2XNiri" }, "outputs": [], "source": [ "%%capture\n", "\n", "# download the files\n", "!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt\n", "!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt\n", "!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt\n", "\n", "# create the data folders\n", "!mkdir data data/classes\n", "!cp dev.txt data/classes\n", "!cp test.txt data/classes\n", "!cp train.txt data/classes" ] }, { "cell_type": "markdown", "metadata": { "id": "6Vh6stZfNt7F" }, "source": [ "## Read and Process the Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZjrwnvlyNsG2" }, "outputs": [], "source": [ "# function to read in data, process each line and split columns by \" ||| \"\n", "def read_data(filename):\n", " data = []\n", " with open(filename, 'r') as f:\n", " for line in f:\n", " line = line.lower().strip()\n", " line = line.split(' ||| ')\n", " data.append(line)\n", " return data\n", "\n", "train_data = read_data('data/classes/train.txt')\n", "test_data = read_data('data/classes/test.txt')\n", "\n", "# creating the word and tag indices\n", "word_to_index = {}\n", "word_to_index[\"\"] = len(word_to_index) # add to dictionary\n", "tag_to_index = {}\n", "\n", "# create word to index dictionary and tag to index dictionary from data\n", "def create_dict(data, check_unk=False):\n", " for line in data:\n", " for word in line[1].split(\" \"):\n", " if check_unk == False:\n", " if word not in word_to_index:\n", " word_to_index[word] = len(word_to_index)\n", " else:\n", " if word not in word_to_index:\n", " word_to_index[word] = word_to_index[\"\"]\n", "\n", " if line[0] not in tag_to_index:\n", " tag_to_index[line[0]] = len(tag_to_index)\n", "\n", "create_dict(train_data)\n", "create_dict(test_data, check_unk=True)\n", "\n", "# create word and tag tensors from data\n", "def create_tensor(data):\n", " for line in data:\n", " yield([word_to_index[word] for word in line[1].split(\" \")], tag_to_index[line[0]])\n", "\n", "train_data = list(create_tensor(train_data))\n", "test_data = list(create_tensor(test_data))\n", "\n", "number_of_words = len(word_to_index)\n", "number_of_tags = len(tag_to_index)" ] }, { "cell_type": "markdown", "metadata": { "id": "sSoomtjuN4HD" }, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "j_-GavImNz6n" }, "outputs": [], "source": [ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "\n", "# create a simple neural network with embedding layer, bias, and xavier initialization\n", "class DeepCBoW(nn.Module):\n", " def __init__(self, nwords, ntags, hidden_size, num_layers, emb_size):\n", " super(DeepCBoW, self).__init__()\n", "\n", " self.num_layers = num_layers\n", "\n", " # layers\n", " self.embedding = nn.Embedding(nwords, emb_size)\n", " self.linears = nn.ModuleList([nn.Linear(emb_size if i ==0 else hidden_size, hidden_size) \\\n", " for i in range(num_layers)])\n", "\n", " # use xavier initialization for weights\n", " nn.init.xavier_uniform_(self.embedding.weight)\n", " for i in range(self.num_layers):\n", " nn.init.xavier_uniform_(self.linears[i].weight)\n", "\n", " # output layer\n", " self.output_layer = nn.Linear(hidden_size, ntags)\n", "\n", " def forward(self, x):\n", " emb = self.embedding(x) # seq x emb_size\n", " emb_sum = torch.sum(emb, dim=0) # emb_size\n", " h = emb_sum.view(1, -1) # reshape to (1, emb_size)\n", " for i in range(self.num_layers):\n", " h = torch.tanh(self.linears[i](h))\n", " out = self.output_layer(h) # 1 x ntags\n", " return out\n", "\n", "HIDDEN_SIZE = 64\n", "NUM_LAYERS = 2 # hidden layers\n", "EMB_SIZE = 64\n", "model = DeepCBoW(number_of_words, number_of_tags, HIDDEN_SIZE, NUM_LAYERS, EMB_SIZE).to(device)\n", "criterion = nn.CrossEntropyLoss()\n", "optimizer = torch.optim.Adam(model.parameters())\n", "type = torch.LongTensor\n", "\n", "if torch.cuda.is_available():\n", " model.to(device)\n", " type = torch.cuda.LongTensor" ] }, { "cell_type": "markdown", "metadata": { "id": "tMqill6ZOLPu" }, "source": [ "## Model Training" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "BkY11eyXOIOY" }, "outputs": [], "source": [ "# perform training of the Bow model\n", "\n", "for epoch in range(10):\n", " # perform training\n", " model.train()\n", " random.shuffle(train_data)\n", " total_loss = 0.0\n", " train_correct = 0\n", " for sentence, tag in train_data:\n", " sentence = torch.tensor(sentence).type(type)\n", " tag = torch.tensor([tag]).type(type)\n", " output = model(sentence)\n", " predicted = torch.argmax(output.data.detach()).item()\n", " \n", " loss = criterion(output, tag)\n", " total_loss += loss.item()\n", "\n", " optimizer.zero_grad()\n", " loss.backward()\n", " optimizer.step()\n", "\n", " if predicted == tag: train_correct+=1\n", "\n", " # perform testing of the model\n", " model.eval()\n", " test_correct = 0\n", " for sentence, tag in test_data:\n", " sentence = torch.tensor(sentence).type(type)\n", " output = model(sentence)\n", " predicted = torch.argmax(output.data.detach()).item()\n", " if predicted == tag: test_correct += 1\n", " \n", " # print model performance results\n", " log = f'epoch: {epoch+1} | ' \\\n", " f'train loss/sent: {total_loss/len(train_data):.4f} | ' \\\n", " f'train accuracy: {train_correct/len(train_data):.4f} | ' \\\n", " f'test accuracy: {test_correct/len(test_data):.4f}'\n", " \n", " print(log)" ] } ], "metadata": { "accelerator": "GPU", "colab": { "name": "deep-cbow.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 1 }