diff --git "a/notebook.ipynb" "b/notebook.ipynb" --- "a/notebook.ipynb" +++ "b/notebook.ipynb" @@ -7,58 +7,278 @@ "# Sentiment Analysis" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Imports" - ] - }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "from __future__ import annotations\n", "\n", + "from typing import TYPE_CHECKING\n", + "\n", + "if TYPE_CHECKING:\n", + " from sklearn.base import BaseEstimator\n", + "\n", "import re\n", + "import warnings\n", "from functools import cache\n", "\n", "import matplotlib.pyplot as plt\n", + "import nltk\n", + "import numpy as np\n", "import pandas as pd\n", - "import seaborn as sns" + "import seaborn as sns\n", + "from joblib import Memory\n", + "from nltk.corpus import stopwords\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.model_selection import RandomizedSearchCV, train_test_split\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.svm import SVC\n", + "\n", + "from app.constants import CACHE_DIR, SENTIMENT140_PATH\n", + "from app.model import TextCleaner, TextLemmatizer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "SEED = 42\n", + "MAX_FEATURES = 20000" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /home/tymec/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to /home/tymec/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nltk.download(\"wordnet\")\n", + "nltk.download(\"stopwords\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Load the data" + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the data" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | target | \n", + "id | \n", + "date | \n", + "flag | \n", + "user | \n", + "text | \n", + "sentiment | \n", + "
---|---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "1467810369 | \n", + "Mon Apr 06 22:19:45 PDT 2009 | \n", + "NO_QUERY | \n", + "_TheSpecialOne_ | \n", + "@switchfoot http://twitpic.com/2y1zl - Awww, t... | \n", + "negative | \n", + "
1 | \n", + "0 | \n", + "1467810672 | \n", + "Mon Apr 06 22:19:49 PDT 2009 | \n", + "NO_QUERY | \n", + "scotthamilton | \n", + "is upset that he can't update his Facebook by ... | \n", + "negative | \n", + "
2 | \n", + "0 | \n", + "1467810917 | \n", + "Mon Apr 06 22:19:53 PDT 2009 | \n", + "NO_QUERY | \n", + "mattycus | \n", + "@Kenichan I dived many times for the ball. Man... | \n", + "negative | \n", + "
3 | \n", + "0 | \n", + "1467811184 | \n", + "Mon Apr 06 22:19:57 PDT 2009 | \n", + "NO_QUERY | \n", + "ElleCTF | \n", + "my whole body feels itchy and like its on fire | \n", + "negative | \n", + "
4 | \n", + "0 | \n", + "1467811193 | \n", + "Mon Apr 06 22:19:57 PDT 2009 | \n", + "NO_QUERY | \n", + "Karoli | \n", + "@nationwideclass no, it's not behaving at all.... | \n", + "negative | \n", + "
\n", + " | word | \n", + "count | \n", + "
---|---|---|
0 | \n", + "i | \n", + "750749 | \n", + "
1 | \n", + "to | \n", + "564469 | \n", + "
2 | \n", + "the | \n", + "520036 | \n", + "
3 | \n", + "a | \n", + "377506 | \n", + "
4 | \n", + "my | \n", + "314024 | \n", + "
Pipeline(memory=Memory(location=.cache),\n", + " steps=[('clean', TextCleaner()), ('lemma', TextLemmatizer()),\n", + " ('vectorize',\n", + " CountVectorizer(max_features=20000, ngram_range=(1, 2),\n", + " stop_words=['i', 'me', 'my', 'myself', 'we',\n", + " 'our', 'ours', 'ourselves', 'you',\n", + " "you're", "you've", "you'll",\n", + " "you'd", 'your', 'yours',\n", + " 'yourself', 'yourselves', 'he',\n", + " 'him', 'his', 'himself', 'she',\n", + " "she's", 'her', 'hers', 'herself',\n", + " 'it', "it's", 'its', 'itself', ...])),\n", + " ('tfidf', TfidfTransformer())],\n", + " verbose=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(memory=Memory(location=.cache),\n", + " steps=[('clean', TextCleaner()), ('lemma', TextLemmatizer()),\n", + " ('vectorize',\n", + " CountVectorizer(max_features=20000, ngram_range=(1, 2),\n", + " stop_words=['i', 'me', 'my', 'myself', 'we',\n", + " 'our', 'ours', 'ourselves', 'you',\n", + " "you're", "you've", "you'll",\n", + " "you'd", 'your', 'yours',\n", + " 'yourself', 'yourselves', 'he',\n", + " 'him', 'his', 'himself', 'she',\n", + " "she's", 'her', 'hers', 'herself',\n", + " 'it', "it's", 'its', 'itself', ...])),\n", + " ('tfidf', TfidfTransformer())],\n", + " verbose=True)
TextCleaner()
TextLemmatizer()
CountVectorizer(max_features=20000, ngram_range=(1, 2),\n", + " stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',\n", + " 'ourselves', 'you', "you're", "you've", "you'll",\n", + " "you'd", 'your', 'yours', 'yourself', 'yourselves',\n", + " 'he', 'him', 'his', 'himself', 'she', "she's",\n", + " 'her', 'hers', 'herself', 'it', "it's", 'its',\n", + " 'itself', ...])
TfidfTransformer()