{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import praw\n",
    "\n",
    "# Credentials are read from the environment so that no secret is stored in the\n",
    "# notebook. REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET must be set before running;\n",
    "# any credential that was ever saved in this cell should be rotated on Reddit.\n",
    "reddit_settings = {\n",
    "    'client_id': os.environ['REDDIT_CLIENT_ID'],\n",
    "    'client_secret': os.environ['REDDIT_CLIENT_SECRET'],\n",
    "    'user_agent': os.environ.get('REDDIT_USER_AGENT', '706_post'),\n",
    "}\n",
    "\n",
    "# Optional login, added only when both values are provided. The keyword is\n",
    "# `username`; the earlier `usernme` spelling never supplied a username to PRAW.\n",
    "reddit_username = os.environ.get('REDDIT_USERNAME')\n",
    "reddit_password = os.environ.get('REDDIT_PASSWORD')\n",
    "if reddit_username and reddit_password:\n",
    "    reddit_settings['username'] = reddit_username\n",
    "    reddit_settings['password'] = reddit_password\n",
    "\n",
    "reddit = praw.Reddit(**reddit_settings)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## No bad words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrap_by_keyword(keyword, limit=1000, time_filter='year'):\n",
    "    \"\"\"Collect the top posts of one subreddit as a class-0 DataFrame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    keyword : str\n",
    "        Subreddit name handed to ``reddit.subreddit``.\n",
    "    limit : int, default 1000\n",
    "        Forwarded as ``limit`` to ``subreddit.top``.\n",
    "    time_filter : str, default 'year'\n",
    "        Time window forwarded to ``subreddit.top``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns ``title``, ``Post text``, ``full text`` (title + '. ' + body)\n",
    "        and ``class`` (always 0), re-indexed from 0.\n",
    "    \"\"\"\n",
    "    # time_filter goes by keyword: the positional form raises a\n",
    "    # DeprecationWarning and is announced to stop working in PRAW 8.\n",
    "    submissions = reddit.subreddit(keyword).top(time_filter=time_filter, limit=limit)\n",
    "    rows = [{'title': post.title, 'Post text': post.selftext} for post in submissions]\n",
    "    # Explicit columns keep the frame well-formed when the listing is empty.\n",
    "    df = pd.DataFrame(rows, columns=['title', 'Post text'])\n",
    "    df['full text'] = df['title'] + '. ' + df['Post text']\n",
    "    df['class'] = 0\n",
    "    # keep=False removes every copy of a repeated text, not only the extras.\n",
    "    df = df.drop_duplicates(subset='full text', keep=False)\n",
    "    return df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Subreddits scraped for the 'No bad words' (class 0) part of the dataset.\n",
    "CLEAN_SUBREDDITS = [\n",
    "    'international', 'haircare', 'jokes', 'houseplants',\n",
    "    'history', 'stock', 'music', 'family',\n",
    "    'photography', 'animal', 'makeup', 'baking',\n",
    "    'yoga', 'gym', 'car', 'travel',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One scrape per subreddit, keyed by name so each frame can still be inspected.\n",
    "clean_frames = {name: scrap_by_keyword(name) for name in CLEAN_SUBREDDITS}\n",
    "\n",
    "# Rows kept per subreddit after the per-subreddit de-duplication.\n",
    "pd.Series({name: len(frame) for name, frame in clean_frames.items()}, name='rows')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the per-subreddit frames in list order under a single 0..n-1 index.\n",
    "df = pd.concat([clean_frames[name] for name in CLEAN_SUBREDDITS], ignore_index=True)\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(14717, 4)"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Texts that appear more than once across subreddits are removed entirely:\n",
    "# keep=False drops every copy, not only the extras.\n",
    "df = df.drop_duplicates(subset=\"full text\", keep=False).reset_index(drop=True)\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "swear_list = ['fuck', 'dick', 'cock', 'bullshit', 'bastard', 'asshole', 'damn', 'bitch', 'pussy']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(14210, 4)"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove every row whose text contains any word from swear_list.\n",
    "# The match is a case-insensitive substring test, so longer words that merely\n",
    "# embed one of the entries are removed as well.\n",
    "swear_pattern = '|'.join(swear_list)\n",
    "has_swear = df['full text'].str.contains(swear_pattern, case=False)\n",
    "df = df.loc[~has_swear]\n",
    "\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>full text</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The Global Imams Council (of Muslim faith lead...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Indian military Gun Down 11 Civilians In Nagal...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mariupols' real life hero's'. Save all these w...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The mother of a russian conscript kid at an an...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Ukraine, Russia exchange bodies of fallen sold...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           full text  class\n",
       "0  The Global Imams Council (of Muslim faith lead...      0\n",
       "1  Indian military Gun Down 11 Civilians In Nagal...      0\n",
       "2  Mariupols' real life hero's'. Save all these w...      0\n",
       "3  The mother of a russian conscript kid at an an...      0\n",
       "4  Ukraine, Russia exchange bodies of fallen sold...      0"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the model inputs: the combined text and its label.\n",
    "final_columns = ['full text', 'class']\n",
    "df_final = df[final_columns]\n",
    "df_final.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bad words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_top_posts(subreddit_name, limit=1000, time_filter='year'):\n",
    "    \"\"\"Return the top submissions of one subreddit as a title / Post text frame.\n",
    "\n",
    "    ``limit`` and ``time_filter`` are forwarded to ``subreddit.top``;\n",
    "    ``time_filter`` is passed by keyword because the positional form is\n",
    "    deprecated and announced to stop working in PRAW 8.\n",
    "    \"\"\"\n",
    "    listing = reddit.subreddit(subreddit_name).top(time_filter=time_filter, limit=limit)\n",
    "    rows = [{'title': post.title, 'Post text': post.selftext} for post in listing]\n",
    "    # Explicit columns keep the frame well-formed when the listing is empty.\n",
    "    return pd.DataFrame(rows, columns=['title', 'Post text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One scrape per swear word, using the word itself as the subreddit name.\n",
    "# swear_list comes from the filtering cell in the 'No bad words' section.\n",
    "swear_posts = {word: fetch_top_posts(word) for word in swear_list}\n",
    "\n",
    "# Posts returned per subreddit.\n",
    "pd.Series({word: len(posts) for word, posts in swear_posts.items()}, name='posts')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Concatenate in the order the frames were originally combined, so the row\n",
    "# order of the final dataset is unchanged.\n",
    "concat_order = ['bullshit', 'cock', 'dick', 'fuck', 'bastard', 'asshole', 'damn', 'bitch', 'pussy']\n",
    "assert set(concat_order) == set(swear_list), 'concat_order and swear_list disagree'\n",
    "\n",
    "# ignore_index gives swear_df unique row labels instead of repeating each frame's own 0..n-1.\n",
    "swear_df = pd.concat([swear_posts[word] for word in concat_order], ignore_index=True)\n",
    "swear_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the class-1 frame without modifying swear_df: the combined text\n",
    "# (title + '. ' + body) and the label 1, in the same two columns as df_final.\n",
    "swear_text = swear_df['title'] + '. ' + swear_df['Post text']\n",
    "swear_final = (\n",
    "    swear_df\n",
    "    .assign(**{'full text': swear_text, 'class': 1})\n",
    "    .loc[:, ['full text', 'class']]\n",
    ")\n",
    "\n",
    "# Preview only; the whole frame is not written into the saved notebook output.\n",
    "swear_final.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Final dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>full text</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The Global Imams Council (of Muslim faith lead...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Indian military Gun Down 11 Civilians In Nagal...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mariupols' real life hero's'. Save all these w...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The mother of a russian conscript kid at an an...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Ukraine, Russia exchange bodies of fallen sold...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           full text  class\n",
       "0  The Global Imams Council (of Muslim faith lead...      0\n",
       "1  Indian military Gun Down 11 Civilians In Nagal...      0\n",
       "2  Mariupols' real life hero's'. Save all these w...      0\n",
       "3  The mother of a russian conscript kid at an an...      0\n",
       "4  Ukraine, Russia exchange bodies of fallen sold...      0"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Class-0 rows first, then class-1 rows, renumbered 0..n-1 in one step.\n",
    "labelled_parts = [df_final, swear_final]\n",
    "df_all = pd.concat(labelled_parts, ignore_index=True)\n",
    "df_all.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the combined dataset; with pandas defaults the row index is written\n",
    "# as the first column of the file.\n",
    "output_path = \"reddit_dataset.csv\"\n",
    "df_all.to_csv(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.4 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}