{
"cells": [
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"import praw\n",
"import pandas as pd\n",
"\n",
"reddit= praw.Reddit(client_id=\"Q1w42RHhLq2fgwljAk_k-Q\",\t\t # your client id\n",
"\t\t\t\t\tclient_secret=\"enUJfFthiZRynGfPQtoK1nCxRer2Dw\",\t # your client secret\n",
" usernme = \"xl395\", #profile username\n",
" password = \"12xiao34quanAria!\", #profile password\n",
"\t\t\t\t\tuser_agent=\"706_post\")\t # your user agent\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## No bad words"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"def scrap_by_keyword(keyword, limit=1000):\n",
" subreddit = reddit.subreddit(keyword)\n",
" top_subreddit = subreddit.top(limit=limit)\n",
" post = subreddit.top('year', limit=limit)\n",
" posts_dict = { \"title\":[], \"Post text\":[]}\n",
" for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
" df = pd.DataFrame(posts_dict)\n",
" df['full text'] = df['title'] + '. ' + df['Post text']\n",
" df['class'] = 0\n",
" df.drop_duplicates(subset =\"full text\", keep = False, inplace = True)\n",
" df.reset_index(drop=True, inplace=True)\n",
" return df\n"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(985, 4)"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"international = scrap_by_keyword('international')\n",
"international.shape"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(997, 4)"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"haircare = scrap_by_keyword('haircare')\n",
"haircare.shape"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(990, 4)"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jokes = scrap_by_keyword('jokes')\n",
"jokes.shape"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(995, 4)"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"houseplants = scrap_by_keyword('houseplants')\n",
"houseplants.shape"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(942, 4)"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"history = scrap_by_keyword('history')\n",
"history.shape"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(56, 4)"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stock = scrap_by_keyword('stock')\n",
"stock.shape"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(996, 4)"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"music = scrap_by_keyword('music')\n",
"music.shape"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(998, 4)"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"family = scrap_by_keyword('family')\n",
"family.shape"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(896, 4)"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"photography = scrap_by_keyword('photography')\n",
"photography.shape"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(982, 4)"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"animal = scrap_by_keyword('animal')\n",
"animal.shape"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(1000, 4)"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"makeup = scrap_by_keyword('makeup')\n",
"makeup.shape"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(999, 4)"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"baking = scrap_by_keyword('baking')\n",
"baking.shape"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(989, 4)"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"yoga = scrap_by_keyword('yoga')\n",
"yoga.shape"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(996, 4)"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gym = scrap_by_keyword('gym')\n",
"gym.shape"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(904, 4)"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car = scrap_by_keyword('car')\n",
"car.shape"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=limit)\n"
]
},
{
"data": {
"text/plain": [
"(996, 4)"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"travel = scrap_by_keyword('travel')\n",
"travel.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(14721, 4)"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.concat([international, haircare, jokes, houseplants, history, stock, music, family, photography, animal, makeup, baking, yoga, gym, car, travel], ignore_index=True)\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(14717, 4)"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop_duplicates(subset =\"full text\", keep = False, inplace = True)\n",
"df.reset_index(drop=True, inplace=True)\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"swear_list = ['fuck', 'dick', 'cock', 'bullshit', 'bastard', 'asshole', 'damn', 'bitch', 'pussy']"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(14210, 4)"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop rows that contain words in swear_list\n",
"for i in swear_list:\n",
" df = df.loc[~df['full text'].str.contains(i, case=False)]\n",
"\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" full text | \n",
" class | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" The Global Imams Council (of Muslim faith lead... | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Indian military Gun Down 11 Civilians In Nagal... | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" Mariupols' real life hero's'. Save all these w... | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" The mother of a russian conscript kid at an an... | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" Ukraine, Russia exchange bodies of fallen sold... | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" full text class\n",
"0 The Global Imams Council (of Muslim faith lead... 0\n",
"1 Indian military Gun Down 11 Civilians In Nagal... 0\n",
"2 Mariupols' real life hero's'. Save all these w... 0\n",
"3 The mother of a russian conscript kid at an an... 0\n",
"4 Ukraine, Russia exchange bodies of fallen sold... 0"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_final = df.loc[:, ['full text', 'class']]\n",
"df_final.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Bad words"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"swear_list = ['fuck','dick','cock','bullshit','bastard','asshole','damn','bitch','pussy']"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/4029709700.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=1000)\n"
]
}
],
"source": [
"subreddit = reddit.subreddit(swear_list[0])\n",
"top_subreddit = subreddit.top(limit=1000)\n",
"post = subreddit.top('year', limit=1000)\n",
"posts_dict = { \"title\":[], \"Post text\":[]}\n",
"for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
"\n",
"posts_fuck = pd.DataFrame(posts_dict)"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1308722114.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=1000)\n"
]
}
],
"source": [
"subreddit = reddit.subreddit(swear_list[1])\n",
"top_subreddit = subreddit.top(limit=1000)\n",
"post = subreddit.top('year', limit=1000)\n",
"posts_dict = { \"title\":[], \"Post text\":[]}\n",
"for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
"\n",
"posts_dick = pd.DataFrame(posts_dict)"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1885500635.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=1000)\n"
]
}
],
"source": [
"subreddit = reddit.subreddit(swear_list[2])\n",
"top_subreddit = subreddit.top(limit=1000)\n",
"post = subreddit.top('year', limit=1000)\n",
"posts_dict = { \"title\":[], \"Post text\":[]}\n",
"for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
"\n",
"posts_cock = pd.DataFrame(posts_dict)"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/1651880681.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=1000)\n"
]
}
],
"source": [
"subreddit = reddit.subreddit(swear_list[3])\n",
"top_subreddit = subreddit.top(limit=1000)\n",
"post = subreddit.top('year', limit=1000)\n",
"posts_dict = { \"title\":[], \"Post text\":[]}\n",
"for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
"\n",
"posts_bullshit = pd.DataFrame(posts_dict)"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/816827923.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=1000)\n"
]
}
],
"source": [
"subreddit = reddit.subreddit('bastard')\n",
"top_subreddit = subreddit.top(limit=1000)\n",
"post = subreddit.top('year', limit=1000)\n",
"posts_dict = { \"title\":[], \"Post text\":[]}\n",
"for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
"\n",
"posts_bastard = pd.DataFrame(posts_dict)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/3789948362.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=1000)\n"
]
}
],
"source": [
"subreddit = reddit.subreddit('asshole')\n",
"top_subreddit = subreddit.top(limit=1000)\n",
"post = subreddit.top('year', limit=1000)\n",
"posts_dict = { \"title\":[], \"Post text\":[]}\n",
"for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
"\n",
"posts_asshole = pd.DataFrame(posts_dict)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/2306344161.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=1000)\n"
]
}
],
"source": [
"subreddit = reddit.subreddit('damn')\n",
"top_subreddit = subreddit.top(limit=1000)\n",
"post = subreddit.top('year', limit=1000)\n",
"posts_dict = { \"title\":[], \"Post text\":[]}\n",
"for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
"\n",
"posts_damn = pd.DataFrame(posts_dict)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/4247606323.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=1000)\n"
]
}
],
"source": [
"subreddit = reddit.subreddit('bitch')\n",
"top_subreddit = subreddit.top(limit=1000)\n",
"post = subreddit.top('year', limit=1000)\n",
"posts_dict = { \"title\":[], \"Post text\":[]}\n",
"for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
"\n",
"posts_bitch = pd.DataFrame(posts_dict)"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_7591/3249493546.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
"Call this function with 'time_filter' as a keyword argument.\n",
" post = subreddit.top('year', limit=1000)\n"
]
}
],
"source": [
"subreddit = reddit.subreddit('pussy')\n",
"top_subreddit = subreddit.top(limit=1000)\n",
"post = subreddit.top('year', limit=1000)\n",
"posts_dict = { \"title\":[], \"Post text\":[]}\n",
"for submission in post:\n",
" posts_dict[\"title\"].append(submission.title)\n",
" posts_dict[\"Post text\"].append(submission.selftext) \n",
"\n",
"posts_pussy = pd.DataFrame(posts_dict)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 2)\n",
"(995, 2)\n",
"(45, 2)\n",
"(542, 2)\n",
"(1, 2)\n",
"(997, 2)\n",
"(92, 2)\n",
"(20, 2)\n",
"(994, 2)\n"
]
}
],
"source": [
"print(posts_bullshit.shape)\n",
"print(posts_cock.shape)\n",
"print(posts_dick.shape)\n",
"print(posts_fuck.shape)\n",
"print(posts_bastard.shape)\n",
"print(posts_asshole.shape)\n",
"print(posts_damn.shape)\n",
"print(posts_bitch.shape)\n",
"print(posts_pussy.shape)"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"#concatenate dataframe\n",
"swear_df = pd.concat([posts_bullshit,posts_cock,posts_dick, posts_fuck,posts_bastard, posts_asshole,posts_damn, posts_bitch, posts_pussy] )"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3687, 2)"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"swear_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"# detect none in swear_df\n",
"swear_df['full text'] = swear_df['title'] + '. '+ swear_df['Post text']"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
"swear_df['class'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" full text | \n",
" class | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" i was told to name the note half a step down o... | \n",
" 1 | \n",
"
\n",
" \n",
" 0 | \n",
" all or nothing 🤷. | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" I want to shove it down your throat. | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" only interact if you would suck it. | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" Does my selfsuck deserve a like? (18). | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 989 | \n",
" Oh you are ready for licking, What a good boy!. | \n",
" 1 | \n",
"
\n",
" \n",
" 990 | \n",
" Your dick will be happy inside of me 🤤💕. | \n",
" 1 | \n",
"
\n",
" \n",
" 991 | \n",
" My sweet and wet pussy loves it harder. | \n",
" 1 | \n",
"
\n",
" \n",
" 992 | \n",
" [OC] NSFW TikTok of me fingering my pussy 😫🚨. | \n",
" 1 | \n",
"
\n",
" \n",
" 993 | \n",
" Made a mess [OC]. | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
3687 rows × 2 columns
\n",
"
"
],
"text/plain": [
" full text class\n",
"0 i was told to name the note half a step down o... 1\n",
"0 all or nothing 🤷. 1\n",
"1 I want to shove it down your throat. 1\n",
"2 only interact if you would suck it. 1\n",
"3 Does my selfsuck deserve a like? (18). 1\n",
".. ... ...\n",
"989 Oh you are ready for licking, What a good boy!. 1\n",
"990 Your dick will be happy inside of me 🤤💕. 1\n",
"991 My sweet and wet pussy loves it harder. 1\n",
"992 [OC] NSFW TikTok of me fingering my pussy 😫🚨. 1\n",
"993 Made a mess [OC]. 1\n",
"\n",
"[3687 rows x 2 columns]"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"swear_final = swear_df[['full text','class']]\n",
"swear_final"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Final dataframe"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" full text | \n",
" class | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" The Global Imams Council (of Muslim faith lead... | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Indian military Gun Down 11 Civilians In Nagal... | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" Mariupols' real life hero's'. Save all these w... | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" The mother of a russian conscript kid at an an... | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" Ukraine, Russia exchange bodies of fallen sold... | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" full text class\n",
"0 The Global Imams Council (of Muslim faith lead... 0\n",
"1 Indian military Gun Down 11 Civilians In Nagal... 0\n",
"2 Mariupols' real life hero's'. Save all these w... 0\n",
"3 The mother of a russian conscript kid at an an... 0\n",
"4 Ukraine, Russia exchange bodies of fallen sold... 0"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_all = pd.concat([df_final, swear_final])\n",
"df_all.reset_index(drop=True, inplace=True)\n",
"df_all.head()"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"df_all.to_csv(\"reddit_dataset.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.4 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}