{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import regex as re\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix, classification_report\n", "\n", "from sklearn.ensemble import AdaBoostClassifier\n", "from xgboost import XGBClassifier\n", "\n", "from sklearn.linear_model import LogisticRegression, SGDClassifier\n", "from sklearn.svm import SVC, LinearSVC, NuSVC\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.naive_bayes import MultinomialNB, BernoulliNB\n", "\n", "import pickle\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | VideoID | \n", "Effectiveness | \n", "
---|---|---|
0 | \n", "pvuN_WvF1to | \n", "1.0 | \n", "
1 | \n", "eRLJscAlk1M | \n", "5.0 | \n", "
2 | \n", "VbiRNT_gWUQ | \n", "2.0 | \n", "
3 | \n", "5dVcn8NjbwY | \n", "NaN | \n", "
4 | \n", "5scez5dqtAc | \n", "4.0 | \n", "
... | \n", "... | \n", "... | \n", "
186 | \n", "TZ0j6kr4ZJ0 | \n", "3.0 | \n", "
187 | \n", "8DiWzvE52ZY | \n", "1.0 | \n", "
188 | \n", "OwqIy8Ikv-c | \n", "2.0 | \n", "
189 | \n", "lPgZfhnCAdI | \n", "1.0 | \n", "
190 | \n", "dSu5sXmsur4 | \n", "3.0 | \n", "
191 rows × 2 columns
\n", "\n", " | VideoID | \n", "Effectiveness | \n", "
---|---|---|
0 | \n", "pvuN_WvF1to | \n", "1.0 | \n", "
1 | \n", "eRLJscAlk1M | \n", "5.0 | \n", "
2 | \n", "VbiRNT_gWUQ | \n", "2.0 | \n", "
3 | \n", "5scez5dqtAc | \n", "4.0 | \n", "
4 | \n", "JDcro7dPqpA | \n", "2.0 | \n", "
... | \n", "... | \n", "... | \n", "
164 | \n", "TZ0j6kr4ZJ0 | \n", "3.0 | \n", "
165 | \n", "8DiWzvE52ZY | \n", "1.0 | \n", "
166 | \n", "OwqIy8Ikv-c | \n", "2.0 | \n", "
167 | \n", "lPgZfhnCAdI | \n", "1.0 | \n", "
168 | \n", "dSu5sXmsur4 | \n", "3.0 | \n", "
169 rows × 2 columns
\n", "\n", " | VideoID | \n", "Effectiveness | \n", "
---|---|---|
0 | \n", "pvuN_WvF1to | \n", "neg | \n", "
1 | \n", "eRLJscAlk1M | \n", "pos | \n", "
2 | \n", "VbiRNT_gWUQ | \n", "neg | \n", "
3 | \n", "5scez5dqtAc | \n", "pos | \n", "
4 | \n", "JDcro7dPqpA | \n", "neg | \n", "
... | \n", "... | \n", "... | \n", "
132 | \n", "JYZpxRy5Mfg | \n", "pos | \n", "
133 | \n", "xXMlFFY9uEI | \n", "pos | \n", "
134 | \n", "8DiWzvE52ZY | \n", "neg | \n", "
135 | \n", "OwqIy8Ikv-c | \n", "neg | \n", "
136 | \n", "lPgZfhnCAdI | \n", "neg | \n", "
137 rows × 2 columns
\n", "\n", " | VideoID | \n", "Effectiveness | \n", "cleaned | \n", "cleaned_string | \n", "num_comments | \n", "average_word_length | \n", "average_sentence_length | \n", "average_punctuation_count | \n", "average_emoji_count | \n", "average_sentiment | \n", "sentiment_ratio_negative | \n", "sentiment_ratio_neutral | \n", "sentiment_ratio_positive | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "pvuN_WvF1to | \n", "neg | \n", "[clean, version, video, child, love, northeast... | \n", "clean version video child love northeast india... | \n", "125533 | \n", "11.370739 | \n", "1.292959 | \n", "2.123577 | \n", "0.588371 | \n", "0.095633 | \n", "0.137295 | \n", "0.529606 | \n", "0.333100 | \n", "
1 | \n", "eRLJscAlk1M | \n", "pos | \n", "[step, take, help, fight, climate, change, wel... | \n", "step take help fight climate change well equal... | \n", "161953 | \n", "17.195229 | \n", "1.594994 | \n", "2.718289 | \n", "0.489704 | \n", "0.037611 | \n", "0.202355 | \n", "0.500905 | \n", "0.296740 | \n", "
2 | \n", "VbiRNT_gWUQ | \n", "neg | \n", "[country, disappear, video, year, old, world, ... | \n", "country disappear video year old world map did... | \n", "27616 | \n", "18.386660 | \n", "1.726789 | \n", "3.540701 | \n", "0.117903 | \n", "0.052846 | \n", "0.196010 | \n", "0.445177 | \n", "0.358814 | \n", "
3 | \n", "5scez5dqtAc | \n", "pos | \n", "[im, watch, trump, biden, ha, already, start, ... | \n", "im watch trump biden ha already start process ... | \n", "13773 | \n", "32.300443 | \n", "2.364554 | \n", "5.870616 | \n", "0.060626 | \n", "0.020608 | \n", "0.301387 | \n", "0.315545 | \n", "0.383068 | \n", "
4 | \n", "JDcro7dPqpA | \n", "neg | \n", "[fun, fact, cow, belch, fart, adult, version, ... | \n", "fun fact cow belch fart adult version bill nye... | \n", "18821 | \n", "34.869454 | \n", "2.559588 | \n", "6.624250 | \n", "0.106902 | \n", "0.032238 | \n", "0.296796 | \n", "0.313480 | \n", "0.389724 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
132 | \n", "JYZpxRy5Mfg | \n", "pos | \n", "[usually, consumer_NEG, say_NEG, though_NEG, s... | \n", "usually consumer_NEG say_NEG though_NEG suppor... | \n", "415 | \n", "19.036145 | \n", "1.759036 | \n", "3.508434 | \n", "0.207229 | \n", "0.090164 | \n", "0.149398 | \n", "0.513253 | \n", "0.337349 | \n", "
133 | \n", "xXMlFFY9uEI | \n", "pos | \n", "[joe, biden, ha, plan, fix, thing, forefront, ... | \n", "joe biden ha plan fix thing forefront news sev... | \n", "431 | \n", "37.774942 | \n", "2.700696 | \n", "9.039443 | \n", "0.153132 | \n", "0.034621 | \n", "0.225058 | \n", "0.396752 | \n", "0.378190 | \n", "
134 | \n", "8DiWzvE52ZY | \n", "neg | \n", "[marios, leave, hand, doe, intro, impressive, ... | \n", "marios leave hand doe intro impressive today p... | \n", "5262 | \n", "18.298556 | \n", "1.779742 | \n", "3.726720 | \n", "0.136640 | \n", "0.143438 | \n", "0.129609 | \n", "0.403079 | \n", "0.467313 | \n", "
135 | \n", "OwqIy8Ikv-c | \n", "neg | \n", "[lie, interseting, isnt, group_NEG, consist_NE... | \n", "lie interseting isnt group_NEG consist_NEG com... | \n", "14421 | \n", "57.651203 | \n", "3.803966 | \n", "13.288954 | \n", "0.029402 | \n", "0.049250 | \n", "0.249983 | \n", "0.278275 | \n", "0.471743 | \n", "
136 | \n", "lPgZfhnCAdI | \n", "neg | \n", "[miss, man, wa, hero, didnt, cherish_NEG, enou... | \n", "miss man wa hero didnt cherish_NEG enough_NEG ... | \n", "3777 | \n", "40.999735 | \n", "3.014562 | \n", "8.415674 | \n", "0.034948 | \n", "0.017825 | \n", "0.294943 | \n", "0.291236 | \n", "0.413820 | \n", "
137 rows × 13 columns
\n", "\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "... | \n", "569679 | \n", "569680 | \n", "average_word_length | \n", "average_sentence_length | \n", "average_punctuation_count | \n", "average_emoji_count | \n", "average_sentiment | \n", "sentiment_ratio_negative | \n", "sentiment_ratio_neutral | \n", "sentiment_ratio_positive | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.000259 | \n", "0.000000 | \n", "0.000315 | \n", "0.000000 | \n", "0.000435 | \n", "0.000000 | \n", "0.000372 | \n", "0.000237 | \n", "0.000085 | \n", "0.000149 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "11.370739 | \n", "1.292959 | \n", "2.123577 | \n", "0.588371 | \n", "0.095633 | \n", "0.137295 | \n", "0.529606 | \n", "0.333100 | \n", "
1 | \n", "0.000473 | \n", "0.000053 | \n", "0.000222 | \n", "0.000000 | \n", "0.000000 | \n", "0.000084 | \n", "0.000000 | \n", "0.000067 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "17.195229 | \n", "1.594994 | \n", "2.718289 | \n", "0.489704 | \n", "0.037611 | \n", "0.202355 | \n", "0.500905 | \n", "0.296740 | \n", "
2 | \n", "0.000201 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000397 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "18.386660 | \n", "1.726789 | \n", "3.540701 | \n", "0.117903 | \n", "0.052846 | \n", "0.196010 | \n", "0.445177 | \n", "0.358814 | \n", "
3 | \n", "0.000539 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "32.300443 | \n", "2.364554 | \n", "5.870616 | \n", "0.060626 | \n", "0.020608 | \n", "0.301387 | \n", "0.315545 | \n", "0.383068 | \n", "
4 | \n", "0.000181 | \n", "0.000263 | \n", "0.000000 | \n", "0.000258 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "34.869454 | \n", "2.559588 | \n", "6.624250 | \n", "0.106902 | \n", "0.032238 | \n", "0.296796 | \n", "0.313480 | \n", "0.389724 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
132 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "19.036145 | \n", "1.759036 | \n", "3.508434 | \n", "0.207229 | \n", "0.090164 | \n", "0.149398 | \n", "0.513253 | \n", "0.337349 | \n", "
133 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "37.774942 | \n", "2.700696 | \n", "9.039443 | \n", "0.153132 | \n", "0.034621 | \n", "0.225058 | \n", "0.396752 | \n", "0.378190 | \n", "
134 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "18.298556 | \n", "1.779742 | \n", "3.726720 | \n", "0.136640 | \n", "0.143438 | \n", "0.129609 | \n", "0.403079 | \n", "0.467313 | \n", "
135 | \n", "0.000000 | \n", "0.000225 | \n", "0.000379 | \n", "0.000883 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "57.651203 | \n", "3.803966 | \n", "13.288954 | \n", "0.029402 | \n", "0.049250 | \n", "0.249983 | \n", "0.278275 | \n", "0.471743 | \n", "
136 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "40.999735 | \n", "3.014562 | \n", "8.415674 | \n", "0.034948 | \n", "0.017825 | \n", "0.294943 | \n", "0.291236 | \n", "0.413820 | \n", "
137 rows × 569689 columns
\n", "\n", " | Multinomial Naive Bayes | \n", "Bernoulli Naive Bayes | \n", "SGD | \n", "Logistic Regression | \n", "Support Vector Classifier | \n", "Linear Support Vector Classifier | \n", "Nu-Support Vector Classifier | \n", "Random Forest | \n", "XGBoost | \n", "AdaBoost Classifier | \n", "CNN | \n", "Best Score | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
Accuracy | \n", "52.38 | \n", "69.05 | \n", "73.81 | \n", "78.57 | \n", "52.38 | \n", "66.67 | \n", "61.90 | \n", "71.43 | \n", "73.81 | \n", "61.90 | \n", "82.35 | \n", "CNN | \n", "
Precision | \n", "26.19 | \n", "69.05 | \n", "73.81 | \n", "78.60 | \n", "51.67 | \n", "67.07 | \n", "61.82 | \n", "73.21 | \n", "74.26 | \n", "61.82 | \n", "88.88 | \n", "CNN | \n", "
Recall | \n", "50.00 | \n", "69.09 | \n", "73.86 | \n", "78.41 | \n", "51.36 | \n", "66.14 | \n", "61.82 | \n", "70.68 | \n", "74.09 | \n", "61.82 | \n", "61.53 | \n", "Logistic Regression | \n", "
F1-score | \n", "34.38 | \n", "69.03 | \n", "73.79 | \n", "78.46 | \n", "49.52 | \n", "65.97 | \n", "61.82 | \n", "70.35 | \n", "73.79 | \n", "61.82 | \n", "72.72 | \n", "Logistic Regression | \n", "