Yuanjing Zhu committed on
Commit
77531b8
Β·
1 Parent(s): b37a68d

add dataset

Browse files
Files changed (2) hide show
  1. reddit_dataset +0 -0
  2. reddit_scraping.ipynb +1294 -0
reddit_dataset ADDED
The diff for this file is too large to render. See raw diff
 
reddit_scraping.ipynb ADDED
@@ -0,0 +1,1294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 37,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
import os

import praw
import pandas as pd

# Credentials are read from environment variables instead of being
# hard-coded: committing a client secret and an account password to a
# repository leaks them permanently (they live on in git history and
# must be rotated). Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
# REDDIT_USERNAME and REDDIT_PASSWORD before running this notebook.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    username=os.environ["REDDIT_USERNAME"],  # was misspelled "usernme", so it was never applied
    password=os.environ["REDDIT_PASSWORD"],
    user_agent="706_post",
)
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "metadata": {},
22
+ "source": [
23
+ "## No bad words"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 81,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
def scrap_by_keyword(keyword, limit=1000):
    """Scrape the top posts of the past year from the subreddit *keyword*.

    Returns a DataFrame with columns 'title', 'Post text',
    'full text' (title + '. ' + body) and 'class' (constant 0),
    with every duplicated 'full text' row dropped and the index reset.
    """
    subreddit = reddit.subreddit(keyword)
    # Pass time_filter as a keyword: the positional form emitted the
    # DeprecationWarning seen in the outputs and is removed in PRAW 8.
    # The old unused `top_subreddit` listing was dead code and is gone.
    posts = subreddit.top(time_filter='year', limit=limit)
    posts_dict = {"title": [], "Post text": []}
    for submission in posts:
        posts_dict["title"].append(submission.title)
        posts_dict["Post text"].append(submission.selftext)
    df = pd.DataFrame(posts_dict)
    df['full text'] = df['title'] + '. ' + df['Post text']
    df['class'] = 0
    # keep=False removes *all* rows sharing a duplicated 'full text',
    # not just the later copies.
    df.drop_duplicates(subset="full text", keep=False, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 82,
51
+ "metadata": {},
52
+ "outputs": [
53
+ {
54
+ "name": "stderr",
55
+ "output_type": "stream",
56
+ "text": [
57
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
58
+ "Call this function with 'time_filter' as a keyword argument.\n",
59
+ " post = subreddit.top('year', limit=limit)\n"
60
+ ]
61
+ },
62
+ {
63
+ "data": {
64
+ "text/plain": [
65
+ "(985, 4)"
66
+ ]
67
+ },
68
+ "execution_count": 82,
69
+ "metadata": {},
70
+ "output_type": "execute_result"
71
+ }
72
+ ],
73
+ "source": [
74
+ "international = scrap_by_keyword('international')\n",
75
+ "international.shape"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 87,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "name": "stderr",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
88
+ "Call this function with 'time_filter' as a keyword argument.\n",
89
+ " post = subreddit.top('year', limit=limit)\n"
90
+ ]
91
+ },
92
+ {
93
+ "data": {
94
+ "text/plain": [
95
+ "(997, 4)"
96
+ ]
97
+ },
98
+ "execution_count": 87,
99
+ "metadata": {},
100
+ "output_type": "execute_result"
101
+ }
102
+ ],
103
+ "source": [
104
+ "haircare = scrap_by_keyword('haircare')\n",
105
+ "haircare.shape"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 83,
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "name": "stderr",
115
+ "output_type": "stream",
116
+ "text": [
117
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
118
+ "Call this function with 'time_filter' as a keyword argument.\n",
119
+ " post = subreddit.top('year', limit=limit)\n"
120
+ ]
121
+ },
122
+ {
123
+ "data": {
124
+ "text/plain": [
125
+ "(990, 4)"
126
+ ]
127
+ },
128
+ "execution_count": 83,
129
+ "metadata": {},
130
+ "output_type": "execute_result"
131
+ }
132
+ ],
133
+ "source": [
134
+ "jokes = scrap_by_keyword('jokes')\n",
135
+ "jokes.shape"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 88,
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "name": "stderr",
145
+ "output_type": "stream",
146
+ "text": [
147
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
148
+ "Call this function with 'time_filter' as a keyword argument.\n",
149
+ " post = subreddit.top('year', limit=limit)\n"
150
+ ]
151
+ },
152
+ {
153
+ "data": {
154
+ "text/plain": [
155
+ "(995, 4)"
156
+ ]
157
+ },
158
+ "execution_count": 88,
159
+ "metadata": {},
160
+ "output_type": "execute_result"
161
+ }
162
+ ],
163
+ "source": [
164
+ "houseplants = scrap_by_keyword('houseplants')\n",
165
+ "houseplants.shape"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": 84,
171
+ "metadata": {},
172
+ "outputs": [
173
+ {
174
+ "name": "stderr",
175
+ "output_type": "stream",
176
+ "text": [
177
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
178
+ "Call this function with 'time_filter' as a keyword argument.\n",
179
+ " post = subreddit.top('year', limit=limit)\n"
180
+ ]
181
+ },
182
+ {
183
+ "data": {
184
+ "text/plain": [
185
+ "(942, 4)"
186
+ ]
187
+ },
188
+ "execution_count": 84,
189
+ "metadata": {},
190
+ "output_type": "execute_result"
191
+ }
192
+ ],
193
+ "source": [
194
+ "history = scrap_by_keyword('history')\n",
195
+ "history.shape"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 89,
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "name": "stderr",
205
+ "output_type": "stream",
206
+ "text": [
207
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
208
+ "Call this function with 'time_filter' as a keyword argument.\n",
209
+ " post = subreddit.top('year', limit=limit)\n"
210
+ ]
211
+ },
212
+ {
213
+ "data": {
214
+ "text/plain": [
215
+ "(56, 4)"
216
+ ]
217
+ },
218
+ "execution_count": 89,
219
+ "metadata": {},
220
+ "output_type": "execute_result"
221
+ }
222
+ ],
223
+ "source": [
224
+ "stock = scrap_by_keyword('stock')\n",
225
+ "stock.shape"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 90,
231
+ "metadata": {},
232
+ "outputs": [
233
+ {
234
+ "name": "stderr",
235
+ "output_type": "stream",
236
+ "text": [
237
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
238
+ "Call this function with 'time_filter' as a keyword argument.\n",
239
+ " post = subreddit.top('year', limit=limit)\n"
240
+ ]
241
+ },
242
+ {
243
+ "data": {
244
+ "text/plain": [
245
+ "(996, 4)"
246
+ ]
247
+ },
248
+ "execution_count": 90,
249
+ "metadata": {},
250
+ "output_type": "execute_result"
251
+ }
252
+ ],
253
+ "source": [
254
+ "music = scrap_by_keyword('music')\n",
255
+ "music.shape"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": 91,
261
+ "metadata": {},
262
+ "outputs": [
263
+ {
264
+ "name": "stderr",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
268
+ "Call this function with 'time_filter' as a keyword argument.\n",
269
+ " post = subreddit.top('year', limit=limit)\n"
270
+ ]
271
+ },
272
+ {
273
+ "data": {
274
+ "text/plain": [
275
+ "(998, 4)"
276
+ ]
277
+ },
278
+ "execution_count": 91,
279
+ "metadata": {},
280
+ "output_type": "execute_result"
281
+ }
282
+ ],
283
+ "source": [
284
+ "family = scrap_by_keyword('family')\n",
285
+ "family.shape"
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": 92,
291
+ "metadata": {},
292
+ "outputs": [
293
+ {
294
+ "name": "stderr",
295
+ "output_type": "stream",
296
+ "text": [
297
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
298
+ "Call this function with 'time_filter' as a keyword argument.\n",
299
+ " post = subreddit.top('year', limit=limit)\n"
300
+ ]
301
+ },
302
+ {
303
+ "data": {
304
+ "text/plain": [
305
+ "(896, 4)"
306
+ ]
307
+ },
308
+ "execution_count": 92,
309
+ "metadata": {},
310
+ "output_type": "execute_result"
311
+ }
312
+ ],
313
+ "source": [
314
+ "photography = scrap_by_keyword('photography')\n",
315
+ "photography.shape"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": 93,
321
+ "metadata": {},
322
+ "outputs": [
323
+ {
324
+ "name": "stderr",
325
+ "output_type": "stream",
326
+ "text": [
327
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
328
+ "Call this function with 'time_filter' as a keyword argument.\n",
329
+ " post = subreddit.top('year', limit=limit)\n"
330
+ ]
331
+ },
332
+ {
333
+ "data": {
334
+ "text/plain": [
335
+ "(982, 4)"
336
+ ]
337
+ },
338
+ "execution_count": 93,
339
+ "metadata": {},
340
+ "output_type": "execute_result"
341
+ }
342
+ ],
343
+ "source": [
344
+ "animal = scrap_by_keyword('animal')\n",
345
+ "animal.shape"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "execution_count": 95,
351
+ "metadata": {},
352
+ "outputs": [
353
+ {
354
+ "name": "stderr",
355
+ "output_type": "stream",
356
+ "text": [
357
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
358
+ "Call this function with 'time_filter' as a keyword argument.\n",
359
+ " post = subreddit.top('year', limit=limit)\n"
360
+ ]
361
+ },
362
+ {
363
+ "data": {
364
+ "text/plain": [
365
+ "(1000, 4)"
366
+ ]
367
+ },
368
+ "execution_count": 95,
369
+ "metadata": {},
370
+ "output_type": "execute_result"
371
+ }
372
+ ],
373
+ "source": [
374
+ "makeup = scrap_by_keyword('makeup')\n",
375
+ "makeup.shape"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 96,
381
+ "metadata": {},
382
+ "outputs": [
383
+ {
384
+ "name": "stderr",
385
+ "output_type": "stream",
386
+ "text": [
387
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
388
+ "Call this function with 'time_filter' as a keyword argument.\n",
389
+ " post = subreddit.top('year', limit=limit)\n"
390
+ ]
391
+ },
392
+ {
393
+ "data": {
394
+ "text/plain": [
395
+ "(999, 4)"
396
+ ]
397
+ },
398
+ "execution_count": 96,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "baking = scrap_by_keyword('baking')\n",
405
+ "baking.shape"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 97,
411
+ "metadata": {},
412
+ "outputs": [
413
+ {
414
+ "name": "stderr",
415
+ "output_type": "stream",
416
+ "text": [
417
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
418
+ "Call this function with 'time_filter' as a keyword argument.\n",
419
+ " post = subreddit.top('year', limit=limit)\n"
420
+ ]
421
+ },
422
+ {
423
+ "data": {
424
+ "text/plain": [
425
+ "(989, 4)"
426
+ ]
427
+ },
428
+ "execution_count": 97,
429
+ "metadata": {},
430
+ "output_type": "execute_result"
431
+ }
432
+ ],
433
+ "source": [
434
+ "yoga = scrap_by_keyword('yoga')\n",
435
+ "yoga.shape"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "execution_count": 98,
441
+ "metadata": {},
442
+ "outputs": [
443
+ {
444
+ "name": "stderr",
445
+ "output_type": "stream",
446
+ "text": [
447
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
448
+ "Call this function with 'time_filter' as a keyword argument.\n",
449
+ " post = subreddit.top('year', limit=limit)\n"
450
+ ]
451
+ },
452
+ {
453
+ "data": {
454
+ "text/plain": [
455
+ "(996, 4)"
456
+ ]
457
+ },
458
+ "execution_count": 98,
459
+ "metadata": {},
460
+ "output_type": "execute_result"
461
+ }
462
+ ],
463
+ "source": [
464
+ "gym = scrap_by_keyword('gym')\n",
465
+ "gym.shape"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 99,
471
+ "metadata": {},
472
+ "outputs": [
473
+ {
474
+ "name": "stderr",
475
+ "output_type": "stream",
476
+ "text": [
477
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
478
+ "Call this function with 'time_filter' as a keyword argument.\n",
479
+ " post = subreddit.top('year', limit=limit)\n"
480
+ ]
481
+ },
482
+ {
483
+ "data": {
484
+ "text/plain": [
485
+ "(904, 4)"
486
+ ]
487
+ },
488
+ "execution_count": 99,
489
+ "metadata": {},
490
+ "output_type": "execute_result"
491
+ }
492
+ ],
493
+ "source": [
494
+ "car = scrap_by_keyword('car')\n",
495
+ "car.shape"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "execution_count": 100,
501
+ "metadata": {},
502
+ "outputs": [
503
+ {
504
+ "name": "stderr",
505
+ "output_type": "stream",
506
+ "text": [
507
+ "/tmp/ipykernel_7591/1569617397.py:4: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
508
+ "Call this function with 'time_filter' as a keyword argument.\n",
509
+ " post = subreddit.top('year', limit=limit)\n"
510
+ ]
511
+ },
512
+ {
513
+ "data": {
514
+ "text/plain": [
515
+ "(996, 4)"
516
+ ]
517
+ },
518
+ "execution_count": 100,
519
+ "metadata": {},
520
+ "output_type": "execute_result"
521
+ }
522
+ ],
523
+ "source": [
524
+ "travel = scrap_by_keyword('travel')\n",
525
+ "travel.shape"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "code",
530
+ "execution_count": null,
531
+ "metadata": {},
532
+ "outputs": [],
533
+ "source": []
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "execution_count": 103,
538
+ "metadata": {},
539
+ "outputs": [
540
+ {
541
+ "data": {
542
+ "text/plain": [
543
+ "(14721, 4)"
544
+ ]
545
+ },
546
+ "execution_count": 103,
547
+ "metadata": {},
548
+ "output_type": "execute_result"
549
+ }
550
+ ],
551
+ "source": [
552
+ "df = pd.concat([international, haircare, jokes, houseplants, history, stock, music, family, photography, animal, makeup, baking, yoga, gym, car, travel], ignore_index=True)\n",
553
+ "df.shape"
554
+ ]
555
+ },
556
+ {
557
+ "cell_type": "code",
558
+ "execution_count": 104,
559
+ "metadata": {},
560
+ "outputs": [
561
+ {
562
+ "data": {
563
+ "text/plain": [
564
+ "(14717, 4)"
565
+ ]
566
+ },
567
+ "execution_count": 104,
568
+ "metadata": {},
569
+ "output_type": "execute_result"
570
+ }
571
+ ],
572
+ "source": [
573
+ "df.drop_duplicates(subset =\"full text\", keep = False, inplace = True)\n",
574
+ "df.reset_index(drop=True, inplace=True)\n",
575
+ "df.shape"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 105,
581
+ "metadata": {},
582
+ "outputs": [],
583
+ "source": [
584
+ "swear_list = ['fuck', 'dick', 'cock', 'bullshit', 'bastard', 'asshole', 'damn', 'bitch', 'pussy']"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": 108,
590
+ "metadata": {},
591
+ "outputs": [
592
+ {
593
+ "data": {
594
+ "text/plain": [
595
+ "(14210, 4)"
596
+ ]
597
+ },
598
+ "execution_count": 108,
599
+ "metadata": {},
600
+ "output_type": "execute_result"
601
+ }
602
+ ],
603
+ "source": [
604
# Drop rows whose 'full text' contains any word in swear_list
# (case-insensitive substring match, same semantics as before).
# A single alternation pattern scans the column once instead of once
# per word; the words contain no regex metacharacters, so joining them
# with '|' is safe.
pattern = '|'.join(swear_list)
df = df.loc[~df['full text'].str.contains(pattern, case=False)]

df.shape
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": 127,
614
+ "metadata": {},
615
+ "outputs": [
616
+ {
617
+ "data": {
618
+ "text/html": [
619
+ "<div>\n",
620
+ "<style scoped>\n",
621
+ " .dataframe tbody tr th:only-of-type {\n",
622
+ " vertical-align: middle;\n",
623
+ " }\n",
624
+ "\n",
625
+ " .dataframe tbody tr th {\n",
626
+ " vertical-align: top;\n",
627
+ " }\n",
628
+ "\n",
629
+ " .dataframe thead th {\n",
630
+ " text-align: right;\n",
631
+ " }\n",
632
+ "</style>\n",
633
+ "<table border=\"1\" class=\"dataframe\">\n",
634
+ " <thead>\n",
635
+ " <tr style=\"text-align: right;\">\n",
636
+ " <th></th>\n",
637
+ " <th>full text</th>\n",
638
+ " <th>class</th>\n",
639
+ " </tr>\n",
640
+ " </thead>\n",
641
+ " <tbody>\n",
642
+ " <tr>\n",
643
+ " <th>0</th>\n",
644
+ " <td>The Global Imams Council (of Muslim faith lead...</td>\n",
645
+ " <td>0</td>\n",
646
+ " </tr>\n",
647
+ " <tr>\n",
648
+ " <th>1</th>\n",
649
+ " <td>Indian military Gun Down 11 Civilians In Nagal...</td>\n",
650
+ " <td>0</td>\n",
651
+ " </tr>\n",
652
+ " <tr>\n",
653
+ " <th>2</th>\n",
654
+ " <td>Mariupols' real life hero's'. Save all these w...</td>\n",
655
+ " <td>0</td>\n",
656
+ " </tr>\n",
657
+ " <tr>\n",
658
+ " <th>3</th>\n",
659
+ " <td>The mother of a russian conscript kid at an an...</td>\n",
660
+ " <td>0</td>\n",
661
+ " </tr>\n",
662
+ " <tr>\n",
663
+ " <th>4</th>\n",
664
+ " <td>Ukraine, Russia exchange bodies of fallen sold...</td>\n",
665
+ " <td>0</td>\n",
666
+ " </tr>\n",
667
+ " </tbody>\n",
668
+ "</table>\n",
669
+ "</div>"
670
+ ],
671
+ "text/plain": [
672
+ " full text class\n",
673
+ "0 The Global Imams Council (of Muslim faith lead... 0\n",
674
+ "1 Indian military Gun Down 11 Civilians In Nagal... 0\n",
675
+ "2 Mariupols' real life hero's'. Save all these w... 0\n",
676
+ "3 The mother of a russian conscript kid at an an... 0\n",
677
+ "4 Ukraine, Russia exchange bodies of fallen sold... 0"
678
+ ]
679
+ },
680
+ "execution_count": 127,
681
+ "metadata": {},
682
+ "output_type": "execute_result"
683
+ }
684
+ ],
685
+ "source": [
686
+ "df_final = df.loc[:, ['full text', 'class']]\n",
687
+ "df_final.head()"
688
+ ]
689
+ },
690
+ {
691
+ "cell_type": "code",
692
+ "execution_count": null,
693
+ "metadata": {},
694
+ "outputs": [],
695
+ "source": []
696
+ },
697
+ {
698
+ "cell_type": "markdown",
699
+ "metadata": {},
700
+ "source": [
701
+ "## Bad words"
702
+ ]
703
+ },
704
+ {
705
+ "cell_type": "code",
706
+ "execution_count": 110,
707
+ "metadata": {},
708
+ "outputs": [],
709
+ "source": [
710
+ "swear_list = ['fuck','dick','cock','bullshit','bastard','asshole','damn','bitch','pussy']"
711
+ ]
712
+ },
713
+ {
714
+ "cell_type": "code",
715
+ "execution_count": 111,
716
+ "metadata": {},
717
+ "outputs": [
718
+ {
719
+ "name": "stderr",
720
+ "output_type": "stream",
721
+ "text": [
722
+ "/tmp/ipykernel_7591/4029709700.py:3: DeprecationWarning: Positional arguments for 'BaseListingMixin.top' will no longer be supported in PRAW 8.\n",
723
+ "Call this function with 'time_filter' as a keyword argument.\n",
724
+ " post = subreddit.top('year', limit=1000)\n"
725
+ ]
726
+ }
727
+ ],
728
+ "source": [
729
def _scrape_top_year(subreddit_name, limit=1000):
    """Scrape the top posts of the past year from *subreddit_name*.

    Returns a DataFrame with 'title' and 'Post text' columns — the same
    shape every copy-pasted cell below produced by hand.
    """
    # time_filter as a keyword avoids the PRAW 8 DeprecationWarning;
    # the unused `top_subreddit` listing from the original cells is dropped.
    posts = reddit.subreddit(subreddit_name).top(time_filter='year', limit=limit)
    posts_dict = {"title": [], "Post text": []}
    for submission in posts:
        posts_dict["title"].append(submission.title)
        posts_dict["Post text"].append(submission.selftext)
    return pd.DataFrame(posts_dict)


# One explicit call per subreddit keeps the exact variable names the
# later concat/inspection cells rely on (the nine original cells were
# byte-for-byte copies of the same scrape loop).
posts_fuck = _scrape_top_year(swear_list[0])
posts_dick = _scrape_top_year(swear_list[1])
posts_cock = _scrape_top_year(swear_list[2])
posts_bullshit = _scrape_top_year(swear_list[3])
posts_bastard = _scrape_top_year('bastard')
posts_asshole = _scrape_top_year('asshole')
posts_damn = _scrape_top_year('damn')
posts_bitch = _scrape_top_year('bitch')
posts_pussy = _scrape_top_year('pussy')
954
+ ]
955
+ },
956
+ {
957
+ "cell_type": "code",
958
+ "execution_count": 120,
959
+ "metadata": {},
960
+ "outputs": [
961
+ {
962
+ "name": "stdout",
963
+ "output_type": "stream",
964
+ "text": [
965
+ "(1, 2)\n",
966
+ "(995, 2)\n",
967
+ "(45, 2)\n",
968
+ "(542, 2)\n",
969
+ "(1, 2)\n",
970
+ "(997, 2)\n",
971
+ "(92, 2)\n",
972
+ "(20, 2)\n",
973
+ "(994, 2)\n"
974
+ ]
975
+ }
976
+ ],
977
+ "source": [
978
+ "print(posts_bullshit.shape)\n",
979
+ "print(posts_cock.shape)\n",
980
+ "print(posts_dick.shape)\n",
981
+ "print(posts_fuck.shape)\n",
982
+ "print(posts_bastard.shape)\n",
983
+ "print(posts_asshole.shape)\n",
984
+ "print(posts_damn.shape)\n",
985
+ "print(posts_bitch.shape)\n",
986
+ "print(posts_pussy.shape)"
987
+ ]
988
+ },
989
+ {
990
+ "cell_type": "code",
991
+ "execution_count": 121,
992
+ "metadata": {},
993
+ "outputs": [],
994
+ "source": [
995
+ "#concatenate dataframe\n",
996
+ "swear_df = pd.concat([posts_bullshit,posts_cock,posts_dick, posts_fuck,posts_bastard, posts_asshole,posts_damn, posts_bitch, posts_pussy] )"
997
+ ]
998
+ },
999
+ {
1000
+ "cell_type": "code",
1001
+ "execution_count": 122,
1002
+ "metadata": {},
1003
+ "outputs": [
1004
+ {
1005
+ "data": {
1006
+ "text/plain": [
1007
+ "(3687, 2)"
1008
+ ]
1009
+ },
1010
+ "execution_count": 122,
1011
+ "metadata": {},
1012
+ "output_type": "execute_result"
1013
+ }
1014
+ ],
1015
+ "source": [
1016
+ "swear_df.shape"
1017
+ ]
1018
+ },
1019
+ {
1020
+ "cell_type": "code",
1021
+ "execution_count": 123,
1022
+ "metadata": {},
1023
+ "outputs": [],
1024
+ "source": [
1025
+ "# detect none in swear_df\n",
1026
+ "swear_df['full text'] = swear_df['title'] + '. '+ swear_df['Post text']"
1027
+ ]
1028
+ },
1029
+ {
1030
+ "cell_type": "code",
1031
+ "execution_count": 124,
1032
+ "metadata": {},
1033
+ "outputs": [],
1034
+ "source": [
1035
+ "swear_df['class'] = 1"
1036
+ ]
1037
+ },
1038
+ {
1039
+ "cell_type": "code",
1040
+ "execution_count": 125,
1041
+ "metadata": {},
1042
+ "outputs": [
1043
+ {
1044
+ "data": {
1045
+ "text/html": [
1046
+ "<div>\n",
1047
+ "<style scoped>\n",
1048
+ " .dataframe tbody tr th:only-of-type {\n",
1049
+ " vertical-align: middle;\n",
1050
+ " }\n",
1051
+ "\n",
1052
+ " .dataframe tbody tr th {\n",
1053
+ " vertical-align: top;\n",
1054
+ " }\n",
1055
+ "\n",
1056
+ " .dataframe thead th {\n",
1057
+ " text-align: right;\n",
1058
+ " }\n",
1059
+ "</style>\n",
1060
+ "<table border=\"1\" class=\"dataframe\">\n",
1061
+ " <thead>\n",
1062
+ " <tr style=\"text-align: right;\">\n",
1063
+ " <th></th>\n",
1064
+ " <th>full text</th>\n",
1065
+ " <th>class</th>\n",
1066
+ " </tr>\n",
1067
+ " </thead>\n",
1068
+ " <tbody>\n",
1069
+ " <tr>\n",
1070
+ " <th>0</th>\n",
1071
+ " <td>i was told to name the note half a step down o...</td>\n",
1072
+ " <td>1</td>\n",
1073
+ " </tr>\n",
1074
+ " <tr>\n",
1075
+ " <th>0</th>\n",
1076
+ " <td>all or nothing 🀷.</td>\n",
1077
+ " <td>1</td>\n",
1078
+ " </tr>\n",
1079
+ " <tr>\n",
1080
+ " <th>1</th>\n",
1081
+ " <td>I want to shove it down your throat.</td>\n",
1082
+ " <td>1</td>\n",
1083
+ " </tr>\n",
1084
+ " <tr>\n",
1085
+ " <th>2</th>\n",
1086
+ " <td>only interact if you would suck it.</td>\n",
1087
+ " <td>1</td>\n",
1088
+ " </tr>\n",
1089
+ " <tr>\n",
1090
+ " <th>3</th>\n",
1091
+ " <td>Does my selfsuck deserve a like? (18).</td>\n",
1092
+ " <td>1</td>\n",
1093
+ " </tr>\n",
1094
+ " <tr>\n",
1095
+ " <th>...</th>\n",
1096
+ " <td>...</td>\n",
1097
+ " <td>...</td>\n",
1098
+ " </tr>\n",
1099
+ " <tr>\n",
1100
+ " <th>989</th>\n",
1101
+ " <td>Oh you are ready for licking, What a good boy!.</td>\n",
1102
+ " <td>1</td>\n",
1103
+ " </tr>\n",
1104
+ " <tr>\n",
1105
+ " <th>990</th>\n",
1106
+ " <td>Your dick will be happy inside of me πŸ€€πŸ’•.</td>\n",
1107
+ " <td>1</td>\n",
1108
+ " </tr>\n",
1109
+ " <tr>\n",
1110
+ " <th>991</th>\n",
1111
+ " <td>My sweet and wet pussy loves it harder.</td>\n",
1112
+ " <td>1</td>\n",
1113
+ " </tr>\n",
1114
+ " <tr>\n",
1115
+ " <th>992</th>\n",
1116
+ " <td>[OC] NSFW TikTok of me fingering my pussy 😫🚨.</td>\n",
1117
+ " <td>1</td>\n",
1118
+ " </tr>\n",
1119
+ " <tr>\n",
1120
+ " <th>993</th>\n",
1121
+ " <td>Made a mess [OC].</td>\n",
1122
+ " <td>1</td>\n",
1123
+ " </tr>\n",
1124
+ " </tbody>\n",
1125
+ "</table>\n",
1126
+ "<p>3687 rows Γ— 2 columns</p>\n",
1127
+ "</div>"
1128
+ ],
1129
+ "text/plain": [
1130
+ " full text class\n",
1131
+ "0 i was told to name the note half a step down o... 1\n",
1132
+ "0 all or nothing 🀷. 1\n",
1133
+ "1 I want to shove it down your throat. 1\n",
1134
+ "2 only interact if you would suck it. 1\n",
1135
+ "3 Does my selfsuck deserve a like? (18). 1\n",
1136
+ ".. ... ...\n",
1137
+ "989 Oh you are ready for licking, What a good boy!. 1\n",
1138
+ "990 Your dick will be happy inside of me πŸ€€πŸ’•. 1\n",
1139
+ "991 My sweet and wet pussy loves it harder. 1\n",
1140
+ "992 [OC] NSFW TikTok of me fingering my pussy 😫🚨. 1\n",
1141
+ "993 Made a mess [OC]. 1\n",
1142
+ "\n",
1143
+ "[3687 rows x 2 columns]"
1144
+ ]
1145
+ },
1146
+ "execution_count": 125,
1147
+ "metadata": {},
1148
+ "output_type": "execute_result"
1149
+ }
1150
+ ],
1151
+ "source": [
1152
+ "swear_final = swear_df[['full text','class']]\n",
1153
+ "swear_final"
1154
+ ]
1155
+ },
1156
+ {
1157
+ "cell_type": "code",
1158
+ "execution_count": null,
1159
+ "metadata": {},
1160
+ "outputs": [],
1161
+ "source": []
1162
+ },
1163
+ {
1164
+ "cell_type": "markdown",
1165
+ "metadata": {},
1166
+ "source": [
1167
+ "## Final dataframe"
1168
+ ]
1169
+ },
1170
+ {
1171
+ "cell_type": "code",
1172
+ "execution_count": 131,
1173
+ "metadata": {},
1174
+ "outputs": [
1175
+ {
1176
+ "data": {
1177
+ "text/html": [
1178
+ "<div>\n",
1179
+ "<style scoped>\n",
1180
+ " .dataframe tbody tr th:only-of-type {\n",
1181
+ " vertical-align: middle;\n",
1182
+ " }\n",
1183
+ "\n",
1184
+ " .dataframe tbody tr th {\n",
1185
+ " vertical-align: top;\n",
1186
+ " }\n",
1187
+ "\n",
1188
+ " .dataframe thead th {\n",
1189
+ " text-align: right;\n",
1190
+ " }\n",
1191
+ "</style>\n",
1192
+ "<table border=\"1\" class=\"dataframe\">\n",
1193
+ " <thead>\n",
1194
+ " <tr style=\"text-align: right;\">\n",
1195
+ " <th></th>\n",
1196
+ " <th>full text</th>\n",
1197
+ " <th>class</th>\n",
1198
+ " </tr>\n",
1199
+ " </thead>\n",
1200
+ " <tbody>\n",
1201
+ " <tr>\n",
1202
+ " <th>0</th>\n",
1203
+ " <td>The Global Imams Council (of Muslim faith lead...</td>\n",
1204
+ " <td>0</td>\n",
1205
+ " </tr>\n",
1206
+ " <tr>\n",
1207
+ " <th>1</th>\n",
1208
+ " <td>Indian military Gun Down 11 Civilians In Nagal...</td>\n",
1209
+ " <td>0</td>\n",
1210
+ " </tr>\n",
1211
+ " <tr>\n",
1212
+ " <th>2</th>\n",
1213
+ " <td>Mariupols' real life hero's'. Save all these w...</td>\n",
1214
+ " <td>0</td>\n",
1215
+ " </tr>\n",
1216
+ " <tr>\n",
1217
+ " <th>3</th>\n",
1218
+ " <td>The mother of a russian conscript kid at an an...</td>\n",
1219
+ " <td>0</td>\n",
1220
+ " </tr>\n",
1221
+ " <tr>\n",
1222
+ " <th>4</th>\n",
1223
+ " <td>Ukraine, Russia exchange bodies of fallen sold...</td>\n",
1224
+ " <td>0</td>\n",
1225
+ " </tr>\n",
1226
+ " </tbody>\n",
1227
+ "</table>\n",
1228
+ "</div>"
1229
+ ],
1230
+ "text/plain": [
1231
+ " full text class\n",
1232
+ "0 The Global Imams Council (of Muslim faith lead... 0\n",
1233
+ "1 Indian military Gun Down 11 Civilians In Nagal... 0\n",
1234
+ "2 Mariupols' real life hero's'. Save all these w... 0\n",
1235
+ "3 The mother of a russian conscript kid at an an... 0\n",
1236
+ "4 Ukraine, Russia exchange bodies of fallen sold... 0"
1237
+ ]
1238
+ },
1239
+ "execution_count": 131,
1240
+ "metadata": {},
1241
+ "output_type": "execute_result"
1242
+ }
1243
+ ],
1244
+ "source": [
1245
+ "df_all = pd.concat([df_final, swear_final])\n",
1246
+ "df_all.reset_index(drop=True, inplace=True)\n",
1247
+ "df_all.head()"
1248
+ ]
1249
+ },
1250
+ {
1251
+ "cell_type": "code",
1252
+ "execution_count": 133,
1253
+ "metadata": {},
1254
+ "outputs": [],
1255
+ "source": [
1256
+ "df_all.to_csv(\"reddit_dataset\")"
1257
+ ]
1258
+ },
1259
+ {
1260
+ "cell_type": "code",
1261
+ "execution_count": null,
1262
+ "metadata": {},
1263
+ "outputs": [],
1264
+ "source": []
1265
+ }
1266
+ ],
1267
+ "metadata": {
1268
+ "kernelspec": {
1269
+ "display_name": "Python 3.10.4 64-bit",
1270
+ "language": "python",
1271
+ "name": "python3"
1272
+ },
1273
+ "language_info": {
1274
+ "codemirror_mode": {
1275
+ "name": "ipython",
1276
+ "version": 3
1277
+ },
1278
+ "file_extension": ".py",
1279
+ "mimetype": "text/x-python",
1280
+ "name": "python",
1281
+ "nbconvert_exporter": "python",
1282
+ "pygments_lexer": "ipython3",
1283
+ "version": "3.10.4"
1284
+ },
1285
+ "orig_nbformat": 4,
1286
+ "vscode": {
1287
+ "interpreter": {
1288
+ "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
1289
+ }
1290
+ }
1291
+ },
1292
+ "nbformat": 4,
1293
+ "nbformat_minor": 2
1294
+ }