{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Sentiment Analysis - Restaurant Reviews.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "code", "metadata": { "id": "kh4udnC9fZyU", "colab_type": "code", "outputId": "677fbeb5-d5b2-49f7-99bf-92bd1f2fa44e", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "# Mount Google Drive so the dataset stored under 'My Drive' can be read from this Colab runtime\n", "from google.colab import drive\n", "drive.mount('/content/drive/')" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount(\"/content/drive/\", force_remount=True).\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "wqtOguIVfysM", "colab_type": "code", "colab": {} }, "source": [ "# Numerical and tabular libraries used throughout the notebook\n", "import numpy as np\n", "import pandas as pd" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "FsZFCtjijekC", "colab_type": "code", "colab": {} }, "source": [ "# Loading the tab-separated dataset.\n", "# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are treated as ordinary text.\n", "dataset_path = '/content/drive/My Drive/Colab Notebooks/Datasets/Restaurant_Reviews.tsv'\n", "df = pd.read_csv(dataset_path, sep='\\t', quoting=3)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "zkdfWSlej05y", "colab_type": "code", "outputId": "26f108a7-5617-4abe-efae-0d64d31e8041", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "df.shape" ], "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(1000, 2)" ] }, "metadata": { "tags": [] }, "execution_count": 4 } ] }, { "cell_type": "code", "metadata": { "id": "SyYImhASubeb", "colab_type": "code", "outputId": "2c8efdb6-17a5-48da-8ac2-7c9d2c289b09", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "df.columns" ], 
"execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['Review', 'Liked'], dtype='object')" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "code", "metadata": { "id": "b5lzlG5DMNX9", "colab_type": "code", "outputId": "ab125608-7f10-479c-8dab-bb298fa7bbaf", "colab": { "base_uri": "https://localhost:8080/", "height": 197 } }, "source": [ "df.head()" ], "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ReviewLiked
0Wow... Loved this place.1
1Crust is not good.0
2Not tasty and the texture was just nasty.0
3Stopped by during the late May bank holiday of...1
4The selection on the menu was great and so wer...1
\n", "
" ], "text/plain": [ " Review Liked\n", "0 Wow... Loved this place. 1\n", "1 Crust is not good. 0\n", "2 Not tasty and the texture was just nasty. 0\n", "3 Stopped by during the late May bank holiday of... 1\n", "4 The selection on the menu was great and so wer... 1" ] }, "metadata": { "tags": [] }, "execution_count": 6 } ] }, { "cell_type": "markdown", "metadata": { "id": "38_tPfGAr0AL", "colab_type": "text" }, "source": [ "# **Data Preprocessing**" ] }, { "cell_type": "code", "metadata": { "id": "gZpsSpUAkCyH", "colab_type": "code", "outputId": "81a672d9-a796-4789-e2e8-36d360f9e558", "colab": { "base_uri": "https://localhost:8080/", "height": 52 } }, "source": [ "# Importing essential libraries for performing Natural Language Processing on 'Restaurant_Reviews.tsv' dataset\n", "import nltk\n", "import re\n", "nltk.download('stopwords')\n", "from nltk.corpus import stopwords\n", "from nltk.stem.porter import PorterStemmer" ], "execution_count": 7, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "tUnp7Dr7mFwn", "colab_type": "code", "colab": {} }, "source": [ "# Cleaning the reviews\n", "# Built once up front: the stop word set used to be rebuilt for every single word and a new\n", "# stemmer created for every review, which made this cell needlessly slow.\n", "stop_words = set(stopwords.words('english'))\n", "ps = PorterStemmer()\n", "\n", "# NOTE(review): stop word removal also drops negations such as 'not' -- 'Crust is not good.'\n", "# ends up as 'crust good' -- so negated reviews lose their polarity. Consider keeping negations.\n", "corpus = []\n", "# Iterating over the column itself avoids the hard-coded row count of 1000\n", "for raw_review in df['Review']:\n", "\n", "    # Replacing every non-letter character with a space\n", "    review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=raw_review)\n", "\n", "    # Converting the entire review into lower case\n", "    review = review.lower()\n", "\n", "    # Tokenizing the review by words\n", "    review_words = review.split()\n", "\n", "    # Removing the stop words\n", "    review_words = [word for word in review_words if word not in stop_words]\n", "\n", "    # Stemming the words\n", "    review = [ps.stem(word) for word in review_words]\n", "\n", "    # Joining the stemmed words\n", "    review = ' '.join(review)\n", "\n", "    # Adding the cleaned review to the corpus\n", "    corpus.append(review)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "6ewB2oNJ0rr9", "colab_type": "code", "outputId": "9f2c2e4b-adf7-4157-d573-f3383a16cee0", "colab": { "base_uri": "https://localhost:8080/", "height": 194 } }, "source": [ "corpus[0:10]" ], "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['wow love place',\n", " 'crust good',\n", " 'tasti textur nasti',\n", " 'stop late may bank holiday rick steve recommend love',\n", " 'select menu great price',\n", " 'get angri want damn pho',\n", " 'honeslti tast fresh',\n", " 'potato like rubber could tell made ahead time kept warmer',\n", " 'fri great',\n", " 'great touch']" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "spNHLhGs20LV", "colab_type": "code", "colab": {} }, "source": [ "# Creating the Bag of Words model\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "cv = CountVectorizer(max_features=1500)\n", "X = cv.fit_transform(corpus).toarray()\n", "y = df.iloc[:, 1].values" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "jYNkfBqJ42hs", "colab_type": "text" }, "source": [ "# **Model Building**" ] }, { "cell_type": "code", "metadata": { "id": "sL6FOXMx45w0", "colab_type": "code", "colab": {} }, "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "KYTe6hjJDV8K", "colab_type": "code", "outputId": "56f78ef1-3f7f-40ce-cf1c-15a2b91b61c3", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "# Fitting Naive Bayes to the Training set\n", "from sklearn.naive_bayes import MultinomialNB\n", "classifier = MultinomialNB()\n", "classifier.fit(X_train, y_train)" ], 
"execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" ] }, "metadata": { "tags": [] }, "execution_count": 12 } ] }, { "cell_type": "code", "metadata": { "id": "CjXrDsEyDbD7", "colab_type": "code", "colab": {} }, "source": [ "# Predicting the Test set results\n", "y_pred = classifier.predict(X_test)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "CcRU4PabPDY-", "colab_type": "code", "outputId": "4985115a-e9be-4447-9a22-026c59045ec9", "colab": { "base_uri": "https://localhost:8080/", "height": 87 } }, "source": [ "# Accuracy, Precision and Recall\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import precision_score\n", "from sklearn.metrics import recall_score\n", "score1 = accuracy_score(y_test,y_pred)\n", "score2 = precision_score(y_test,y_pred)\n", "score3= recall_score(y_test,y_pred)\n", "print(\"---- Scores ----\")\n", "print(\"Accuracy score is: {}%\".format(round(score1*100,2)))\n", "print(\"Precision score is: {}\".format(round(score2,2)))\n", "print(\"Recall score is: {}\".format(round(score3,2)))" ], "execution_count": 14, "outputs": [ { "output_type": "stream", "text": [ "---- Scores ----\n", "Accuracy score is: 76.5%\n", "Precision score is: 0.76\n", "Recall score is: 0.79\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "-77oRRHjDgwr", "colab_type": "code", "colab": {} }, "source": [ "# Making the Confusion Matrix\n", "from sklearn.metrics import confusion_matrix\n", "cm = confusion_matrix(y_test, y_pred)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "9lRKOJ-zjv3F", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 52 }, "outputId": "b5c14f34-e062-4cf6-b899-31a5d583d62c" }, "source": [ "cm" ], "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ 
"array([[72, 25],\n", " [22, 81]])" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "code", "metadata": { "id": "hYd9LdXmDkKb", "colab_type": "code", "outputId": "30c403fb-f204-42ff-a19c-eb2ecbdf8cd5", "colab": { "base_uri": "https://localhost:8080/", "height": 461 } }, "source": [ "# Plotting the confusion matrix\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "\n", "plt.figure(figsize = (10,6))\n", "sns.heatmap(cm, annot=True, cmap=\"YlGnBu\", xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])\n", "plt.xlabel('Predicted values')\n", "plt.ylabel('Actual values')" ], "execution_count": 17, "outputs": [ { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n", " import pandas.util.testing as tm\n" ], "name": "stderr" }, { "output_type": "execute_result", "data": { "text/plain": [ "Text(69.0, 0.5, 'Actual values')" ] }, "metadata": { "tags": [] }, "execution_count": 17 }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "id": "LJbZKcc9jWcV", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 230 }, "outputId": "654b7fc8-9c8e-452b-c14c-dd57c87d82ec" }, "source": [ "# Hyperparameter tuning the Naive Bayes Classifier\n", "best_accuracy = 0.0\n", "alpha_val = 0.0\n", "for i in np.arange(0.1,1.1,0.1):\n", " temp_classifier = MultinomialNB(alpha=i)\n", " temp_classifier.fit(X_train, y_train)\n", " temp_y_pred = temp_classifier.predict(X_test)\n", " score = accuracy_score(y_test, temp_y_pred)\n", " print(\"Accuracy score for alpha={} is: {}%\".format(round(i,1), round(score*100,2)))\n", " if score>best_accuracy:\n", " best_accuracy = score\n", " alpha_val = i\n", "print('--------------------------------------------')\n", "print('The best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 2), round(alpha_val,1)))" ], "execution_count": 18, "outputs": [ { "output_type": "stream", "text": [ "Accuracy score for alpha=0.1 is: 78.0%\n", "Accuracy score for alpha=0.2 is: 78.5%\n", "Accuracy score for alpha=0.3 is: 78.0%\n", "Accuracy score for alpha=0.4 is: 78.0%\n", "Accuracy score for alpha=0.5 is: 77.5%\n", "Accuracy score for alpha=0.6 is: 77.5%\n", "Accuracy score for alpha=0.7 is: 77.5%\n", "Accuracy score for alpha=0.8 is: 77.0%\n", "Accuracy score for alpha=0.9 is: 76.5%\n", "Accuracy score for alpha=1.0 is: 76.5%\n", "--------------------------------------------\n", "The best accuracy is 78.5% with alpha value as 0.2\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "9BNR7SfKkDsL", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "0ebe229f-009d-46fa-852c-90b758d548b6" }, "source": [ "classifier = MultinomialNB(alpha=0.2)\n", "classifier.fit(X_train, y_train)" ], "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ 
"MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] }, { "cell_type": "markdown", "metadata": { "id": "iYQVSu17MWgV", "colab_type": "text" }, "source": [ "# **Predictions**" ] }, { "cell_type": "code", "metadata": { "id": "mYbh9DFvwmW1", "colab_type": "code", "colab": {} }, "source": [ "def predict_sentiment(sample_review):\n", " sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ', string = sample_review)\n", " sample_review = sample_review.lower()\n", " sample_review_words = sample_review.split()\n", " sample_review_words = [word for word in sample_review_words if not word in set(stopwords.words('english'))]\n", " ps = PorterStemmer()\n", " final_review = [ps.stem(word) for word in sample_review_words]\n", " final_review = ' '.join(final_review)\n", "\n", " temp = cv.transform([final_review]).toarray()\n", " return classifier.predict(temp)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Os0d_BZELC95", "colab_type": "code", "outputId": "3478b8c9-55a9-454f-aaae-b42ccc28d609", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "# Predicting values\n", "sample_review = 'The food is really good here.'\n", "\n", "if predict_sentiment(sample_review):\n", " print('This is a POSITIVE review.')\n", "else:\n", " print('This is a NEGATIVE review!')" ], "execution_count": 21, "outputs": [ { "output_type": "stream", "text": [ "This is a POSITIVE review.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "A88ILf9PNAKY", "colab_type": "code", "outputId": "d1fe224e-373f-4e98-9c05-da96980d4f49", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "# Predicting values\n", "sample_review = 'Food was pretty bad and the service was very slow.'\n", "\n", "if predict_sentiment(sample_review):\n", " print('This is a POSITIVE review.')\n", "else:\n", " print('This is a NEGATIVE review!')" ], 
"execution_count": 22, "outputs": [ { "output_type": "stream", "text": [ "This is a NEGATIVE review!\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "UXgRRzafOX3d", "colab_type": "code", "outputId": "f913faa2-38b5-48c6-f6fa-456ab807a01c", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "# Predicting values\n", "sample_review = 'The food was absolutely wonderful, from preparation to presentation, very pleasing.'\n", "\n", "if predict_sentiment(sample_review):\n", " print('This is a POSITIVE review.')\n", "else:\n", " print('This is a NEGATIVE review!')" ], "execution_count": 23, "outputs": [ { "output_type": "stream", "text": [ "This is a POSITIVE review.\n" ], "name": "stdout" } ] } ] }