Yuanjing Zhu committed on
Commit
77531b8
Β·
1 Parent(s): b37a68d

add dataset

Browse files
Files changed (2) hide show
  1. reddit_dataset +0 -0
  2. reddit_scraping.ipynb +1294 -0
reddit_dataset ADDED
The diff for this file is too large to render. See raw diff
 
reddit_scraping.ipynb ADDED
@@ -0,0 +1,1294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 37,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
import os

import praw
import pandas as pd

# Credentials are read from environment variables instead of being
# hard-coded: committing a client secret and an account password to a
# repository leaks them permanently (they live on in git history and
# must be rotated). Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
# REDDIT_USERNAME and REDDIT_PASSWORD before running this notebook.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    username=os.environ["REDDIT_USERNAME"],  # was misspelled "usernme", so it was never applied
    password=os.environ["REDDIT_PASSWORD"],
    user_agent="706_post",
)
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "metadata": {},
22
+ "source": [
23
+ "## No bad words"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 81,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
def scrap_by_keyword(keyword, limit=1000):
    """Scrape the top posts of the past year from the subreddit *keyword*.

    Returns a DataFrame with columns 'title', 'Post text',
    'full text' (title + '. ' + body) and 'class' (constant 0),
    with every duplicated 'full text' row dropped and the index reset.
    """
    subreddit = reddit.subreddit(keyword)
    # Pass time_filter as a keyword: the positional form emitted the
    # DeprecationWarning seen in the outputs and is removed in PRAW 8.
    # The old unused `top_subreddit` listing was dead code and is gone.
    posts = subreddit.top(time_filter='year', limit=limit)
    posts_dict = {"title": [], "Post text": []}
    for submission in posts:
        posts_dict["title"].append(submission.title)
        posts_dict["Post text"].append(submission.selftext)
    df = pd.DataFrame(posts_dict)
    df['full text'] = df['title'] + '. ' + df['Post text']
    df['class'] = 0
    # keep=False removes *all* rows sharing a duplicated 'full text',
    # not just the later copies.
    df.drop_duplicates(subset="full text", keep=False, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 82,
51
+ "metadata": {},
52
+ "outputs": [
53
+ {
54
+ "name": "stderr",
55
+ "output_type": "stream",
56
+ "text": [
57
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
58
+ "Call this function with 'time_filter' as a keyword argument.\n",
59
+ " post = subreddit.top('year', limit=limit)\n"
60
+ ]
61
+ },
62
+ {
63
+ "data": {
64
+ "text/plain": [
65
+ "(985, 4)"
66
+ ]
67
+ },
68
+ "execution_count": 82,
69
+ "metadata": {},
70
+ "output_type": "execute_result"
71
+ }
72
+ ],
73
+ "source": [
74
+ "international = scrap_by_keyword('international')\n",
75
+ "international.shape"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 87,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "name": "stderr",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
88
+ "Call this function with 'time_filter' as a keyword argument.\n",
89
+ " post = subreddit.top('year', limit=limit)\n"
90
+ ]
91
+ },
92
+ {
93
+ "data": {
94
+ "text/plain": [
95
+ "(997, 4)"
96
+ ]
97
+ },
98
+ "execution_count": 87,
99
+ "metadata": {},
100
+ "output_type": "execute_result"
101
+ }
102
+ ],
103
+ "source": [
104
+ "haircare = scrap_by_keyword('haircare')\n",
105
+ "haircare.shape"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 83,
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "name": "stderr",
115
+ "output_type": "stream",
116
+ "text": [
117
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
118
+ "Call this function with 'time_filter' as a keyword argument.\n",
119
+ " post = subreddit.top('year', limit=limit)\n"
120
+ ]
121
+ },
122
+ {
123
+ "data": {
124
+ "text/plain": [
125
+ "(990, 4)"
126
+ ]
127
+ },
128
+ "execution_count": 83,
129
+ "metadata": {},
130
+ "output_type": "execute_result"
131
+ }
132
+ ],
133
+ "source": [
134
+ "jokes = scrap_by_keyword('jokes')\n",
135
+ "jokes.shape"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 88,
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "name": "stderr",
145
+ "output_type": "stream",
146
+ "text": [
147
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
148
+ "Call this function with 'time_filter' as a keyword argument.\n",
149
+ " post = subreddit.top('year', limit=limit)\n"
150
+ ]
151
+ },
152
+ {
153
+ "data": {
154
+ "text/plain": [
155
+ "(995, 4)"
156
+ ]
157
+ },
158
+ "execution_count": 88,
159
+ "metadata": {},
160
+ "output_type": "execute_result"
161
+ }
162
+ ],
163
+ "source": [
164
+ "houseplants = scrap_by_keyword('houseplants')\n",
165
+ "houseplants.shape"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": 84,
171
+ "metadata": {},
172
+ "outputs": [
173
+ {
174
+ "name": "stderr",
175
+ "output_type": "stream",
176
+ "text": [
177
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
178
+ "Call this function with 'time_filter' as a keyword argument.\n",
179
+ " post = subreddit.top('year', limit=limit)\n"
180
+ ]
181
+ },
182
+ {
183
+ "data": {
184
+ "text/plain": [
185
+ "(942, 4)"
186
+ ]
187
+ },
188
+ "execution_count": 84,
189
+ "metadata": {},
190
+ "output_type": "execute_result"
191
+ }
192
+ ],
193
+ "source": [
194
+ "history = scrap_by_keyword('history')\n",
195
+ "history.shape"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 89,
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "name": "stderr",
205
+ "output_type": "stream",
206
+ "text": [
207
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
208
+ "Call this function with 'time_filter' as a keyword argument.\n",
209
+ " post = subreddit.top('year', limit=limit)\n"
210
+ ]
211
+ },
212
+ {
213
+ "data": {
214
+ "text/plain": [
215
+ "(56, 4)"
216
+ ]
217
+ },
218
+ "execution_count": 89,
219
+ "metadata": {},
220
+ "output_type": "execute_result"
221
+ }
222
+ ],
223
+ "source": [
224
+ "stock = scrap_by_keyword('stock')\n",
225
+ "stock.shape"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 90,
231
+ "metadata": {},
232
+ "outputs": [
233
+ {
234
+ "name": "stderr",
235
+ "output_type": "stream",
236
+ "text": [
237
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
238
+ "Call this function with 'time_filter' as a keyword argument.\n",
239
+ " post = subreddit.top('year', limit=limit)\n"
240
+ ]
241
+ },
242
+ {
243
+ "data": {
244
+ "text/plain": [
245
+ "(996, 4)"
246
+ ]
247
+ },
248
+ "execution_count": 90,
249
+ "metadata": {},
250
+ "output_type": "execute_result"
251
+ }
252
+ ],
253
+ "source": [
254
+ "music = scrap_by_keyword('music')\n",
255
+ "music.shape"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": 91,
261
+ "metadata": {},
262
+ "outputs": [
263
+ {
264
+ "name": "stderr",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
268
+ "Call this function with 'time_filter' as a keyword argument.\n",
269
+ " post = subreddit.top('year', limit=limit)\n"
270
+ ]
271
+ },
272
+ {
273
+ "data": {
274
+ "text/plain": [
275
+ "(998, 4)"
276
+ ]
277
+ },
278
+ "execution_count": 91,
279
+ "metadata": {},
280
+ "output_type": "execute_result"
281
+ }
282
+ ],
283
+ "source": [
284
+ "family = scrap_by_keyword('family')\n",
285
+ "family.shape"
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": 92,
291
+ "metadata": {},
292
+ "outputs": [
293
+ {
294
+ "name": "stderr",
295
+ "output_type": "stream",
296
+ "text": [
297
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
298
+ "Call this function with 'time_filter' as a keyword argument.\n",
299
+ " post = subreddit.top('year', limit=limit)\n"
300
+ ]
301
+ },
302
+ {
303
+ "data": {
304
+ "text/plain": [
305
+ "(896, 4)"
306
+ ]
307
+ },
308
+ "execution_count": 92,
309
+ "metadata": {},
310
+ "output_type": "execute_result"
311
+ }
312
+ ],
313
+ "source": [
314
+ "photography = scrap_by_keyword('photography')\n",
315
+ "photography.shape"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": 93,
321
+ "metadata": {},
322
+ "outputs": [
323
+ {
324
+ "name": "stderr",
325
+ "output_type": "stream",
326
+ "text": [
327
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
328
+ "Call this function with 'time_filter' as a keyword argument.\n",
329
+ " post = subreddit.top('year', limit=limit)\n"
330
+ ]
331
+ },
332
+ {
333
+ "data": {
334
+ "text/plain": [
335
+ "(982, 4)"
336
+ ]
337
+ },
338
+ "execution_count": 93,
339
+ "metadata": {},
340
+ "output_type": "execute_result"
341
+ }
342
+ ],
343
+ "source": [
344
+ "animal = scrap_by_keyword('animal')\n",
345
+ "animal.shape"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "execution_count": 95,
351
+ "metadata": {},
352
+ "outputs": [
353
+ {
354
+ "name": "stderr",
355
+ "output_type": "stream",
356
+ "text": [
357
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
358
+ "Call this function with 'time_filter' as a keyword argument.\n",
359
+ " post = subreddit.top('year', limit=limit)\n"
360
+ ]
361
+ },
362
+ {
363
+ "data": {
364
+ "text/plain": [
365
+ "(1000, 4)"
366
+ ]
367
+ },
368
+ "execution_count": 95,
369
+ "metadata": {},
370
+ "output_type": "execute_result"
371
+ }
372
+ ],
373
+ "source": [
374
+ "makeup = scrap_by_keyword('makeup')\n",
375
+ "makeup.shape"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 96,
381
+ "metadata": {},
382
+ "outputs": [
383
+ {
384
+ "name": "stderr",
385
+ "output_type": "stream",
386
+ "text": [
387
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
388
+ "Call this function with 'time_filter' as a keyword argument.\n",
389
+ " post = subreddit.top('year', limit=limit)\n"
390
+ ]
391
+ },
392
+ {
393
+ "data": {
394
+ "text/plain": [
395
+ "(999, 4)"
396
+ ]
397
+ },
398
+ "execution_count": 96,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "baking = scrap_by_keyword('baking')\n",
405
+ "baking.shape"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 97,
411
+ "metadata": {},
412
+ "outputs": [
413
+ {
414
+ "name": "stderr",
415
+ "output_type": "stream",
416
+ "text": [
417
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
418
+ "Call this function with 'time_filter' as a keyword argument.\n",
419
+ " post = subreddit.top('year', limit=limit)\n"
420
+ ]
421
+ },
422
+ {
423
+ "data": {
424
+ "text/plain": [
425
+ "(989, 4)"
426
+ ]
427
+ },
428
+ "execution_count": 97,
429
+ "metadata": {},
430
+ "output_type": "execute_result"
431
+ }
432
+ ],
433
+ "source": [
434
+ "yoga = scrap_by_keyword('yoga')\n",
435
+ "yoga.shape"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "execution_count": 98,
441
+ "metadata": {},
442
+ "outputs": [
443
+ {
444
+ "name": "stderr",
445
+ "output_type": "stream",
446
+ "text": [
447
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
448
+ "Call this function with 'time_filter' as a keyword argument.\n",
449
+ " post = subreddit.top('year', limit=limit)\n"
450
+ ]
451
+ },
452
+ {
453
+ "data": {
454
+ "text/plain": [
455
+ "(996, 4)"
456
+ ]
457
+ },
458
+ "execution_count": 98,
459
+ "metadata": {},
460
+ "output_type": "execute_result"
461
+ }
462
+ ],
463
+ "source": [
464
+ "gym = scrap_by_keyword('gym')\n",
465
+ "gym.shape"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 99,
471
+ "metadata": {},
472
+ "outputs": [
473
+ {
474
+ "name": "stderr",
475
+ "output_type": "stream",
476
+ "text": [
477
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
478
+ "Call this function with 'time_filter' as a keyword argument.\n",
479
+ " post = subreddit.top('year', limit=limit)\n"
480
+ ]
481
+ },
482
+ {
483
+ "data": {
484
+ "text/plain": [
485
+ "(904, 4)"
486
+ ]
487
+ },
488
+ "execution_count": 99,
489
+ "metadata": {},
490
+ "output_type": "execute_result"
491
+ }
492
+ ],
493
+ "source": [
494
+ "car = scrap_by_keyword('car')\n",
495
+ "car.shape"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "execution_count": 100,
501
+ "metadata": {},
502
+ "outputs": [
503
+ {
504
+ "name": "stderr",
505
+ "output_type": "stream",
506
+ "text": [
507
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
508
+ "Call this function with 'time_filter' as a keyword argument.\n",
509
+ " post = subreddit.top('year', limit=limit)\n"
510
+ ]
511
+ },
512
+ {
513
+ "data": {
514
+ "text/plain": [
515
+ "(996, 4)"
516
+ ]
517
+ },
518
+ "execution_count": 100,
519
+ "metadata": {},
520
+ "output_type": "execute_result"
521
+ }
522
+ ],
523
+ "source": [
524
+ "travel = scrap_by_keyword('travel')\n",
525
+ "travel.shape"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "code",
530
+ "execution_count": null,
531
+ "metadata": {},
532
+ "outputs": [],
533
+ "source": []
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "execution_count": 103,
538
+ "metadata": {},
539
+ "outputs": [
540
+ {
541
+ "data": {
542
+ "text/plain": [
543
+ "(14721, 4)"
544
+ ]
545
+ },
546
+ "execution_count": 103,
547
+ "metadata": {},
548
+ "output_type": "execute_result"
549
+ }
550
+ ],
551
+ "source": [
552
+ "df = pd.concat([international, haircare, jokes, houseplants, history, stock, music, family, photography, animal, makeup, baking, yoga, gym, car, travel], ignore_index=True)\n",
553
+ "df.shape"
554
+ ]
555
+ },
556
+ {
557
+ "cell_type": "code",
558
+ "execution_count": 104,
559
+ "metadata": {},
560
+ "outputs": [
561
+ {
562
+ "data": {
563
+ "text/plain": [
564
+ "(14717, 4)"
565
+ ]
566
+ },
567
+ "execution_count": 104,
568
+ "metadata": {},
569
+ "output_type": "execute_result"
570
+ }
571
+ ],
572
+ "source": [
573
+ "df.drop_duplicates(subset =\"full text\", keep = False, inplace = True)\n",
574
+ "df.reset_index(drop=True, inplace=True)\n",
575
+ "df.shape"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 105,
581
+ "metadata": {},
582
+ "outputs": [],
583
+ "source": [
584
+ "swear_list = ['fuck', 'dick', 'cock', 'bullshit', 'bastard', 'asshole', 'damn', 'bitch', 'pussy']"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": 108,
590
+ "metadata": {},
591
+ "outputs": [
592
+ {
593
+ "data": {
594
+ "text/plain": [
595
+ "(14210, 4)"
596
+ ]
597
+ },
598
+ "execution_count": 108,
599
+ "metadata": {},
600
+ "output_type": "execute_result"
601
+ }
602
+ ],
603
+ "source": [
604
# Drop rows whose 'full text' contains any word in swear_list
# (case-insensitive substring match, same semantics as before).
# A single alternation pattern scans the column once instead of once
# per word; the words contain no regex metacharacters, so joining them
# with '|' is safe.
pattern = '|'.join(swear_list)
df = df.loc[~df['full text'].str.contains(pattern, case=False)]

df.shape
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": 127,
614
+ "metadata": {},
615
+ "outputs": [
616
+ {
617
+ "data": {
618
+ "text/html": [
619
+ "<div>\n",
620
+ "<style scoped>\n",
621
+ " .dataframe tbody tr th:only-of-type {\n",
622
+ " vertical-align: middle;\n",
623
+ " }\n",
624
+ "\n",
625
+ " .dataframe tbody tr th {\n",
626
+ " vertical-align: top;\n",
627
+ " }\n",
628
+ "\n",
629
+ " .dataframe thead th {\n",
630
+ " text-align: right;\n",
631
+ " }\n",
632
+ "</style>\n",
633
+ "<table border=\"1\" class=\"dataframe\">\n",
634
+ " <thead>\n",
635
+ " <tr style=\"text-align: right;\">\n",
636
+ " <th></th>\n",
637
+ " <th>full text</th>\n",
638
+ " <th>class</th>\n",
639
+ " </tr>\n",
640
+ " </thead>\n",
641
+ " <tbody>\n",
642
+ " <tr>\n",
643
+ " <th>0</th>\n",
644
+ " <td>The Global Imams Council (of Muslim faith lead...</td>\n",
645
+ " <td>0</td>\n",
646
+ " </tr>\n",
647
+ " <tr>\n",
648
+ " <th>1</th>\n",
649
+ " <td>Indian military Gun Down 11 Civilians In Nagal...</td>\n",
650
+ " <td>0</td>\n",
651
+ " </tr>\n",
652
+ " <tr>\n",
653
+ " <th>2</th>\n",
654
+ " <td>Mariupols' real life hero's'. Save all these w...</td>\n",
655
+ " <td>0</td>\n",
656
+ " </tr>\n",
657
+ " <tr>\n",
658
+ " <th>3</th>\n",
659
+ " <td>The mother of a russian conscript kid at an an...</td>\n",
660
+ " <td>0</td>\n",
661
+ " </tr>\n",
662
+ " <tr>\n",
663
+ " <th>4</th>\n",
664
+ " <td>Ukraine, Russia exchange bodies of fallen sold...</td>\n",
665
+ " <td>0</td>\n",
666
+ " </tr>\n",
667
+ " </tbody>\n",
668
+ "</table>\n",
669
+ "</div>"
670
+ ],
671
+ "text/plain": [
672
+ " full text class\n",
673
+ "0 The Global Imams Council (of Muslim faith lead... 0\n",
674
+ "1 Indian military Gun Down 11 Civilians In Nagal... 0\n",
675
+ "2 Mariupols' real life hero's'. Save all these w... 0\n",
676
+ "3 The mother of a russian conscript kid at an an... 0\n",
677
+ "4 Ukraine, Russia exchange bodies of fallen sold... 0"
678
+ ]
679
+ },
680
+ "execution_count": 127,
681
+ "metadata": {},
682
+ "output_type": "execute_result"
683
+ }
684
+ ],
685
+ "source": [
686
+ "df_final = df.loc[:, ['full text', 'class']]\n",
687
+ "df_final.head()"
688
+ ]
689
+ },
690
+ {
691
+ "cell_type": "code",
692
+ "execution_count": null,
693
+ "metadata": {},
694
+ "outputs": [],
695
+ "source": []
696
+ },
697
+ {
698
+ "cell_type": "markdown",
699
+ "metadata": {},
700
+ "source": [
701
+ "## Bad words"
702
+ ]
703
+ },
704
+ {
705
+ "cell_type": "code",
706
+ "execution_count": 110,
707
+ "metadata": {},
708
+ "outputs": [],
709
+ "source": [
710
+ "swear_list = ['fuck','dick','cock','bullshit','bastard','asshole','damn','bitch','pussy']"
711
+ ]
712
+ },
713
+ {
714
+ "cell_type": "code",
715
+ "execution_count": 111,
716
+ "metadata": {},
717
+ "outputs": [
718
+ {
719
+ "name": "stderr",
720
+ "output_type": "stream",
721
+ "text": [
722
+ "/tmp/ipykernel_7591/4029709700.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
723
+ "Call this function with 'time_filter' as a keyword argument.\n",
724
+ " post = subreddit.top('year', limit=1000)\n"
725
+ ]
726
+ }
727
+ ],
728
+ "source": [
729
def _scrape_top_year(subreddit_name, limit=1000):
    """Scrape the top posts of the past year from *subreddit_name*.

    Returns a DataFrame with 'title' and 'Post text' columns — the same
    shape every copy-pasted cell below produced by hand.
    """
    # time_filter as a keyword avoids the PRAW 8 DeprecationWarning;
    # the unused `top_subreddit` listing from the original cells is dropped.
    posts = reddit.subreddit(subreddit_name).top(time_filter='year', limit=limit)
    posts_dict = {"title": [], "Post text": []}
    for submission in posts:
        posts_dict["title"].append(submission.title)
        posts_dict["Post text"].append(submission.selftext)
    return pd.DataFrame(posts_dict)


# One explicit call per subreddit keeps the exact variable names the
# later concat/inspection cells rely on (the nine original cells were
# byte-for-byte copies of the same scrape loop).
posts_fuck = _scrape_top_year(swear_list[0])
posts_dick = _scrape_top_year(swear_list[1])
posts_cock = _scrape_top_year(swear_list[2])
posts_bullshit = _scrape_top_year(swear_list[3])
posts_bastard = _scrape_top_year('bastard')
posts_asshole = _scrape_top_year('asshole')
posts_damn = _scrape_top_year('damn')
posts_bitch = _scrape_top_year('bitch')
posts_pussy = _scrape_top_year('pussy')
954
+ ]
955
+ },
956
+ {
957
+ "cell_type": "code",
958
+ "execution_count": 120,
959
+ "metadata": {},
960
+ "outputs": [
961
+ {
962
+ "name": "stdout",
963
+ "output_type": "stream",
964
+ "text": [
965
+ "(1, 2)\n",
966
+ "(995, 2)\n",
967
+ "(45, 2)\n",
968
+ "(542, 2)\n",
969
+ "(1, 2)\n",
970
+ "(997, 2)\n",
971
+ "(92, 2)\n",
972
+ "(20, 2)\n",
973
+ "(994, 2)\n"
974
+ ]
975
+ }
976
+ ],
977
+ "source": [
978
+ "print(posts_bullshit.shape)\n",
979
+ "print(posts_cock.shape)\n",
980
+ "print(posts_dick.shape)\n",
981
+ "print(posts_fuck.shape)\n",
982
+ "print(posts_bastard.shape)\n",
983
+ "print(posts_asshole.shape)\n",
984
+ "print(posts_damn.shape)\n",
985
+ "print(posts_bitch.shape)\n",
986
+ "print(posts_pussy.shape)"
987
+ ]
988
+ },
989
+ {
990
+ "cell_type": "code",
991
+ "execution_count": 121,
992
+ "metadata": {},
993
+ "outputs": [],
994
+ "source": [
995
+ "#concatenate dataframe\n",
996
+ "swear_df = pd.concat([posts_bullshit,posts_cock,posts_dick, posts_fuck,posts_bastard, posts_asshole,posts_damn, posts_bitch, posts_pussy] )"
997
+ ]
998
+ },
999
+ {
1000
+ "cell_type": "code",
1001
+ "execution_count": 122,
1002
+ "metadata": {},
1003
+ "outputs": [
1004
+ {
1005
+ "data": {
1006
+ "text/plain": [
1007
+ "(3687, 2)"
1008
+ ]
1009
+ },
1010
+ "execution_count": 122,
1011
+ "metadata": {},
1012
+ "output_type": "execute_result"
1013
+ }
1014
+ ],
1015
+ "source": [
1016
+ "swear_df.shape"
1017
+ ]
1018
+ },
1019
+ {
1020
+ "cell_type": "code",
1021
+ "execution_count": 123,
1022
+ "metadata": {},
1023
+ "outputs": [],
1024
+ "source": [
1025
+ "# detect none in swear_df\n",
1026
+ "swear_df['full text'] = swear_df['title'] + '. '+ swear_df['Post text']"
1027
+ ]
1028
+ },
1029
+ {
1030
+ "cell_type": "code",
1031
+ "execution_count": 124,
1032
+ "metadata": {},
1033
+ "outputs": [],
1034
+ "source": [
1035
+ "swear_df['class'] = 1"
1036
+ ]
1037
+ },
1038
+ {
1039
+ "cell_type": "code",
1040
+ "execution_count": 125,
1041
+ "metadata": {},
1042
+ "outputs": [
1043
+ {
1044
+ "data": {
1045
+ "text/html": [
1046
+ "<div>\n",
1047
+ "<style scoped>\n",
1048
+ " .dataframe tbody tr th:only-of-type {\n",
1049
+ " vertical-align: middle;\n",
1050
+ " }\n",
1051
+ "\n",
1052
+ " .dataframe tbody tr th {\n",
1053
+ " vertical-align: top;\n",
1054
+ " }\n",
1055
+ "\n",
1056
+ " .dataframe thead th {\n",
1057
+ " text-align: right;\n",
1058
+ " }\n",
1059
+ "</style>\n",
1060
+ "<table border=\"1\" class=\"dataframe\">\n",
1061
+ " <thead>\n",
1062
+ " <tr style=\"text-align: right;\">\n",
1063
+ " <th></th>\n",
1064
+ " <th>full text</th>\n",
1065
+ " <th>class</th>\n",
1066
+ " </tr>\n",
1067
+ " </thead>\n",
1068
+ " <tbody>\n",
1069
+ " <tr>\n",
1070
+ " <th>0</th>\n",
1071
+ " <td>i was told to name the note half a step down o...</td>\n",
1072
+ " <td>1</td>\n",
1073
+ " </tr>\n",
1074
+ " <tr>\n",
1075
+ " <th>0</th>\n",
1076
+ " <td>all or nothing 🀷.</td>\n",
1077
+ " <td>1</td>\n",
1078
+ " </tr>\n",
1079
+ " <tr>\n",
1080
+ " <th>1</th>\n",
1081
+ " <td>I want to shove it down your throat.</td>\n",
1082
+ " <td>1</td>\n",
1083
+ " </tr>\n",
1084
+ " <tr>\n",
1085
+ " <th>2</th>\n",
1086
+ " <td>only interact if you would suck it.</td>\n",
1087
+ " <td>1</td>\n",
1088
+ " </tr>\n",
1089
+ " <tr>\n",
1090
+ " <th>3</th>\n",
1091
+ " <td>Does my selfsuck deserve a like? (18).</td>\n",
1092
+ " <td>1</td>\n",
1093
+ " </tr>\n",
1094
+ " <tr>\n",
1095
+ " <th>...</th>\n",
1096
+ " <td>...</td>\n",
1097
+ " <td>...</td>\n",
1098
+ " </tr>\n",
1099
+ " <tr>\n",
1100
+ " <th>989</th>\n",
1101
+ " <td>Oh you are ready for licking, What a good boy!.</td>\n",
1102
+ " <td>1</td>\n",
1103
+ " </tr>\n",
1104
+ " <tr>\n",
1105
+ " <th>990</th>\n",
1106
+ " <td>Your dick will be happy inside of me πŸ€€πŸ’•.</td>\n",
1107
+ " <td>1</td>\n",
1108
+ " </tr>\n",
1109
+ " <tr>\n",
1110
+ " <th>991</th>\n",
1111
+ " <td>My sweet and wet pussy loves it harder.</td>\n",
1112
+ " <td>1</td>\n",
1113
+ " </tr>\n",
1114
+ " <tr>\n",
1115
+ " <th>992</th>\n",
1116
+ " <td>[OC] NSFW TikTok of me fingering my pussy 😫🚨.</td>\n",
1117
+ " <td>1</td>\n",
1118
+ " </tr>\n",
1119
+ " <tr>\n",
1120
+ " <th>993</th>\n",
1121
+ " <td>Made a mess [OC].</td>\n",
1122
+ " <td>1</td>\n",
1123
+ " </tr>\n",
1124
+ " </tbody>\n",
1125
+ "</table>\n",
1126
+ "<p>3687 rows Γ— 2 columns</p>\n",
1127
+ "</div>"
1128
+ ],
1129
+ "text/plain": [
1130
+ " full text class\n",
1131
+ "0 i was told to name the note half a step down o... 1\n",
1132
+ "0 all or nothing 🀷. 1\n",
1133
+ "1 I want to shove it down your throat. 1\n",
1134
+ "2 only interact if you would suck it. 1\n",
1135
+ "3 Does my selfsuck deserve a like? (18). 1\n",
1136
+ ".. ... ...\n",
1137
+ "989 Oh you are ready for licking, What a good boy!. 1\n",
1138
+ "990 Your dick will be happy inside of me πŸ€€πŸ’•. 1\n",
1139
+ "991 My sweet and wet pussy loves it harder. 1\n",
1140
+ "992 [OC] NSFW TikTok of me fingering my pussy 😫🚨. 1\n",
1141
+ "993 Made a mess [OC]. 1\n",
1142
+ "\n",
1143
+ "[3687 rows x 2 columns]"
1144
+ ]
1145
+ },
1146
+ "execution_count": 125,
1147
+ "metadata": {},
1148
+ "output_type": "execute_result"
1149
+ }
1150
+ ],
1151
+ "source": [
1152
+ "swear_final = swear_df[['full text','class']]\n",
1153
+ "swear_final"
1154
+ ]
1155
+ },
1156
+ {
1157
+ "cell_type": "code",
1158
+ "execution_count": null,
1159
+ "metadata": {},
1160
+ "outputs": [],
1161
+ "source": []
1162
+ },
1163
+ {
1164
+ "cell_type": "markdown",
1165
+ "metadata": {},
1166
+ "source": [
1167
+ "## Final dataframe"
1168
+ ]
1169
+ },
1170
+ {
1171
+ "cell_type": "code",
1172
+ "execution_count": 131,
1173
+ "metadata": {},
1174
+ "outputs": [
1175
+ {
1176
+ "data": {
1177
+ "text/html": [
1178
+ "<div>\n",
1179
+ "<style scoped>\n",
1180
+ " .dataframe tbody tr th:only-of-type {\n",
1181
+ " vertical-align: middle;\n",
1182
+ " }\n",
1183
+ "\n",
1184
+ " .dataframe tbody tr th {\n",
1185
+ " vertical-align: top;\n",
1186
+ " }\n",
1187
+ "\n",
1188
+ " .dataframe thead th {\n",
1189
+ " text-align: right;\n",
1190
+ " }\n",
1191
+ "</style>\n",
1192
+ "<table border=\"1\" class=\"dataframe\">\n",
1193
+ " <thead>\n",
1194
+ " <tr style=\"text-align: right;\">\n",
1195
+ " <th></th>\n",
1196
+ " <th>full text</th>\n",
1197
+ " <th>class</th>\n",
1198
+ " </tr>\n",
1199
+ " </thead>\n",
1200
+ " <tbody>\n",
1201
+ " <tr>\n",
1202
+ " <th>0</th>\n",
1203
+ " <td>The Global Imams Council (of Muslim faith lead...</td>\n",
1204
+ " <td>0</td>\n",
1205
+ " </tr>\n",
1206
+ " <tr>\n",
1207
+ " <th>1</th>\n",
1208
+ " <td>Indian military Gun Down 11 Civilians In Nagal...</td>\n",
1209
+ " <td>0</td>\n",
1210
+ " </tr>\n",
1211
+ " <tr>\n",
1212
+ " <th>2</th>\n",
1213
+ " <td>Mariupols' real life hero's'. Save all these w...</td>\n",
1214
+ " <td>0</td>\n",
1215
+ " </tr>\n",
1216
+ " <tr>\n",
1217
+ " <th>3</th>\n",
1218
+ " <td>The mother of a russian conscript kid at an an...</td>\n",
1219
+ " <td>0</td>\n",
1220
+ " </tr>\n",
1221
+ " <tr>\n",
1222
+ " <th>4</th>\n",
1223
+ " <td>Ukraine, Russia exchange bodies of fallen sold...</td>\n",
1224
+ " <td>0</td>\n",
1225
+ " </tr>\n",
1226
+ " </tbody>\n",
1227
+ "</table>\n",
1228
+ "</div>"
1229
+ ],
1230
+ "text/plain": [
1231
+ " full text class\n",
1232
+ "0 The Global Imams Council (of Muslim faith lead... 0\n",
1233
+ "1 Indian military Gun Down 11 Civilians In Nagal... 0\n",
1234
+ "2 Mariupols' real life hero's'. Save all these w... 0\n",
1235
+ "3 The mother of a russian conscript kid at an an... 0\n",
1236
+ "4 Ukraine, Russia exchange bodies of fallen sold... 0"
1237
+ ]
1238
+ },
1239
+ "execution_count": 131,
1240
+ "metadata": {},
1241
+ "output_type": "execute_result"
1242
+ }
1243
+ ],
1244
+ "source": [
1245
+ "df_all = pd.concat([df_final, swear_final])\n",
1246
+ "df_all.reset_index(drop=True, inplace=True)\n",
1247
+ "df_all.head()"
1248
+ ]
1249
+ },
1250
+ {
1251
+ "cell_type": "code",
1252
+ "execution_count": 133,
1253
+ "metadata": {},
1254
+ "outputs": [],
1255
+ "source": [
1256
+ "df_all.to_csv(\"reddit_dataset\")"
1257
+ ]
1258
+ },
1259
+ {
1260
+ "cell_type": "code",
1261
+ "execution_count": null,
1262
+ "metadata": {},
1263
+ "outputs": [],
1264
+ "source": []
1265
+ }
1266
+ ],
1267
+ "metadata": {
1268
+ "kernelspec": {
1269
+ "display_name": "Python 3.10.4 64-bit",
1270
+ "language": "python",
1271
+ "name": "python3"
1272
+ },
1273
+ "language_info": {
1274
+ "codemirror_mode": {
1275
+ "name": "ipython",
1276
+ "version": 3
1277
+ },
1278
+ "file_extension": ".py",
1279
+ "mimetype": "text/x-python",
1280
+ "name": "python",
1281
+ "nbconvert_exporter": "python",
1282
+ "pygments_lexer": "ipython3",
1283
+ "version": "3.10.4"
1284
+ },
1285
+ "orig_nbformat": 4,
1286
+ "vscode": {
1287
+ "interpreter": {
1288
+ "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
1289
+ }
1290
+ }
1291
+ },
1292
+ "nbformat": 4,
1293
+ "nbformat_minor": 2
1294
+ }