{ "cells": [ { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "import praw\n", "import pandas as pd\n", "\n", "reddit= praw.Reddit(client_id=\"Q1w42RHhLq2fgwljAk_k-Q\",\t\t # your client id\n", "\t\t\t\t\tclient_secret=\"enUJfFthiZRynGfPQtoK1nCxRer2Dw\",\t # your client secret\n", " usernme = \"xl395\", #profile username\n", " password = \"12xiao34quanAria!\", #profile password\n", "\t\t\t\t\tuser_agent=\"706_post\")\t # your user agent\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## No bad words" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "def scrap_by_keyword(keyword, limit=1000):\n", " subreddit = reddit.subreddit(keyword)\n", " top_subreddit = subreddit.top(limit=limit)\n", " post = subreddit.top('year', limit=limit)\n", " posts_dict = { \"title\":[], \"Post text\":[]}\n", " for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", " df = pd.DataFrame(posts_dict)\n", " df['full text'] = df['title'] + '. ' + df['Post text']\n", " df['class'] = 0\n", " df.drop_duplicates(subset =\"full text\", keep = False, inplace = True)\n", " df.reset_index(drop=True, inplace=True)\n", " return df\n" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(985, 4)" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "international = scrap_by_keyword('international')\n", "international.shape" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(997, 4)" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "haircare = scrap_by_keyword('haircare')\n", "haircare.shape" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(990, 4)" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jokes = scrap_by_keyword('jokes')\n", "jokes.shape" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(995, 4)" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "houseplants = scrap_by_keyword('houseplants')\n", "houseplants.shape" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(942, 4)" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "history = scrap_by_keyword('history')\n", "history.shape" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(56, 4)" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stock = scrap_by_keyword('stock')\n", "stock.shape" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(996, 4)" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "music = scrap_by_keyword('music')\n", "music.shape" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(998, 4)" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "family = scrap_by_keyword('family')\n", "family.shape" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(896, 4)" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "photography = scrap_by_keyword('photography')\n", "photography.shape" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(982, 4)" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "animal = scrap_by_keyword('animal')\n", "animal.shape" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(1000, 4)" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "makeup = scrap_by_keyword('makeup')\n", "makeup.shape" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(999, 4)" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "baking = scrap_by_keyword('baking')\n", "baking.shape" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(989, 4)" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "yoga = scrap_by_keyword('yoga')\n", "yoga.shape" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(996, 4)" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gym = scrap_by_keyword('gym')\n", "gym.shape" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(904, 4)" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "car = scrap_by_keyword('car')\n", "car.shape" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=limit)\n" ] }, { "data": { "text/plain": [ "(996, 4)" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "travel = scrap_by_keyword('travel')\n", "travel.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(14721, 4)" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.concat([international, haircare, jokes, houseplants, history, stock, music, family, photography, animal, makeup, baking, yoga, gym, car, travel], ignore_index=True)\n", "df.shape" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(14717, 4)" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop_duplicates(subset =\"full text\", keep = False, inplace = True)\n", "df.reset_index(drop=True, inplace=True)\n", "df.shape" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "swear_list = ['fuck', 'dick', 'cock', 'bullshit', 'bastard', 'asshole', 'damn', 'bitch', 'pussy']" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(14210, 4)" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop rows that contain words in swear_list\n", "for i in swear_list:\n", " df = df.loc[~df['full text'].str.contains(i, case=False)]\n", "\n", "df.shape" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
full textclass
0The Global Imams Council (of Muslim faith lead...0
1Indian military Gun Down 11 Civilians In Nagal...0
2Mariupols' real life hero's'. Save all these w...0
3The mother of a russian conscript kid at an an...0
4Ukraine, Russia exchange bodies of fallen sold...0
\n", "
" ], "text/plain": [ " full text class\n", "0 The Global Imams Council (of Muslim faith lead... 0\n", "1 Indian military Gun Down 11 Civilians In Nagal... 0\n", "2 Mariupols' real life hero's'. Save all these w... 0\n", "3 The mother of a russian conscript kid at an an... 0\n", "4 Ukraine, Russia exchange bodies of fallen sold... 0" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_final = df.loc[:, ['full text', 'class']]\n", "df_final.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bad words" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [], "source": [ "swear_list = ['fuck','dick','cock','bullshit','bastard','asshole','damn','bitch','pussy']" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/4029709700.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=1000)\n" ] } ], "source": [ "subreddit = reddit.subreddit(swear_list[0])\n", "top_subreddit = subreddit.top(limit=1000)\n", "post = subreddit.top('year', limit=1000)\n", "posts_dict = { \"title\":[], \"Post text\":[]}\n", "for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", "\n", "posts_fuck = pd.DataFrame(posts_dict)" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1308722114.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=1000)\n" ] } ], "source": [ "subreddit = reddit.subreddit(swear_list[1])\n", "top_subreddit = subreddit.top(limit=1000)\n", "post = subreddit.top('year', limit=1000)\n", "posts_dict = { \"title\":[], \"Post text\":[]}\n", "for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", "\n", "posts_dick = pd.DataFrame(posts_dict)" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1885500635.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=1000)\n" ] } ], "source": [ "subreddit = reddit.subreddit(swear_list[2])\n", "top_subreddit = subreddit.top(limit=1000)\n", "post = subreddit.top('year', limit=1000)\n", "posts_dict = { \"title\":[], \"Post text\":[]}\n", "for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", "\n", "posts_cock = pd.DataFrame(posts_dict)" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/1651880681.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=1000)\n" ] } ], "source": [ "subreddit = reddit.subreddit(swear_list[3])\n", "top_subreddit = subreddit.top(limit=1000)\n", "post = subreddit.top('year', limit=1000)\n", "posts_dict = { \"title\":[], \"Post text\":[]}\n", "for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", "\n", "posts_bullshit = pd.DataFrame(posts_dict)" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/816827923.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=1000)\n" ] } ], "source": [ "subreddit = reddit.subreddit('bastard')\n", "top_subreddit = subreddit.top(limit=1000)\n", "post = subreddit.top('year', limit=1000)\n", "posts_dict = { \"title\":[], \"Post text\":[]}\n", "for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", "\n", "posts_bastard = pd.DataFrame(posts_dict)" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/3789948362.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=1000)\n" ] } ], "source": [ "subreddit = reddit.subreddit('asshole')\n", "top_subreddit = subreddit.top(limit=1000)\n", "post = subreddit.top('year', limit=1000)\n", "posts_dict = { \"title\":[], \"Post text\":[]}\n", "for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", "\n", "posts_asshole = pd.DataFrame(posts_dict)" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/2306344161.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=1000)\n" ] } ], "source": [ "subreddit = reddit.subreddit('damn')\n", "top_subreddit = subreddit.top(limit=1000)\n", "post = subreddit.top('year', limit=1000)\n", "posts_dict = { \"title\":[], \"Post text\":[]}\n", "for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", "\n", "posts_damn = pd.DataFrame(posts_dict)" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/4247606323.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=1000)\n" ] } ], "source": [ "subreddit = reddit.subreddit('bitch')\n", "top_subreddit = subreddit.top(limit=1000)\n", "post = subreddit.top('year', limit=1000)\n", "posts_dict = { \"title\":[], \"Post text\":[]}\n", "for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", "\n", "posts_bitch = pd.DataFrame(posts_dict)" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7591/3249493546.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n", "Call this function with 'time_filter' as a keyword argument.\n", " post = subreddit.top('year', limit=1000)\n" ] } ], "source": [ "subreddit = reddit.subreddit('pussy')\n", "top_subreddit = subreddit.top(limit=1000)\n", "post = subreddit.top('year', limit=1000)\n", "posts_dict = { \"title\":[], \"Post text\":[]}\n", "for submission in post:\n", " posts_dict[\"title\"].append(submission.title)\n", " posts_dict[\"Post text\"].append(submission.selftext) \n", "\n", "posts_pussy = pd.DataFrame(posts_dict)" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1, 2)\n", "(995, 2)\n", "(45, 2)\n", "(542, 2)\n", "(1, 2)\n", "(997, 2)\n", "(92, 2)\n", "(20, 2)\n", "(994, 2)\n" ] } ], "source": [ "print(posts_bullshit.shape)\n", "print(posts_cock.shape)\n", "print(posts_dick.shape)\n", "print(posts_fuck.shape)\n", "print(posts_bastard.shape)\n", "print(posts_asshole.shape)\n", "print(posts_damn.shape)\n", "print(posts_bitch.shape)\n", "print(posts_pussy.shape)" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [], "source": [ "#concatenate dataframe\n", "swear_df = pd.concat([posts_bullshit,posts_cock,posts_dick, posts_fuck,posts_bastard, posts_asshole,posts_damn, posts_bitch, posts_pussy] )" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3687, 2)" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "swear_df.shape" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [], "source": [ "# detect none in swear_df\n", "swear_df['full text'] = swear_df['title'] + '. '+ swear_df['Post text']" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [], "source": [ "swear_df['class'] = 1" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
full textclass
0i was told to name the note half a step down o...1
0all or nothing 🤷.1
1I want to shove it down your throat.1
2only interact if you would suck it.1
3Does my selfsuck deserve a like? (18).1
.........
989Oh you are ready for licking, What a good boy!.1
990Your dick will be happy inside of me 🤤💕.1
991My sweet and wet pussy loves it harder.1
992[OC] NSFW TikTok of me fingering my pussy 😫🚨.1
993Made a mess [OC].1
\n", "

3687 rows × 2 columns

\n", "
" ], "text/plain": [ " full text class\n", "0 i was told to name the note half a step down o... 1\n", "0 all or nothing 🤷. 1\n", "1 I want to shove it down your throat. 1\n", "2 only interact if you would suck it. 1\n", "3 Does my selfsuck deserve a like? (18). 1\n", ".. ... ...\n", "989 Oh you are ready for licking, What a good boy!. 1\n", "990 Your dick will be happy inside of me 🤤💕. 1\n", "991 My sweet and wet pussy loves it harder. 1\n", "992 [OC] NSFW TikTok of me fingering my pussy 😫🚨. 1\n", "993 Made a mess [OC]. 1\n", "\n", "[3687 rows x 2 columns]" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "swear_final = swear_df[['full text','class']]\n", "swear_final" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Final dataframe" ] }, { "cell_type": "code", "execution_count": 131, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
full textclass
0The Global Imams Council (of Muslim faith lead...0
1Indian military Gun Down 11 Civilians In Nagal...0
2Mariupols' real life hero's'. Save all these w...0
3The mother of a russian conscript kid at an an...0
4Ukraine, Russia exchange bodies of fallen sold...0
\n", "
" ], "text/plain": [ " full text class\n", "0 The Global Imams Council (of Muslim faith lead... 0\n", "1 Indian military Gun Down 11 Civilians In Nagal... 0\n", "2 Mariupols' real life hero's'. Save all these w... 0\n", "3 The mother of a russian conscript kid at an an... 0\n", "4 Ukraine, Russia exchange bodies of fallen sold... 0" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_all = pd.concat([df_final, swear_final])\n", "df_all.reset_index(drop=True, inplace=True)\n", "df_all.head()" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [], "source": [ "df_all.to_csv(\"reddit_dataset.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.4 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858" } } }, "nbformat": 4, "nbformat_minor": 2 }