XquanL committed on
Commit
558b3da
·
unverified ·
1 Parent(s): ae42467

new_redditdata

Browse files
Files changed (2) hide show
  1. Reddit_new.ipynb +369 -0
  2. reddit_annotated.csv +0 -0
Reddit_new.ipynb ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd "
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "# read json file, change row to column\n",
19
+ "df = pd.read_json('/Users/liuxiaoquan/Documents/706/Final_project/Reddit_new.json', orient='index')"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 3,
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "data": {
29
+ "text/plain": [
30
+ "24506"
31
+ ]
32
+ },
33
+ "execution_count": 3,
34
+ "metadata": {},
35
+ "output_type": "execute_result"
36
+ }
37
+ ],
38
+ "source": [
39
+ "len(df)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 4,
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "data": {
49
+ "text/html": [
50
+ "<div>\n",
51
+ "<style scoped>\n",
52
+ " .dataframe tbody tr th:only-of-type {\n",
53
+ " vertical-align: middle;\n",
54
+ " }\n",
55
+ "\n",
56
+ " .dataframe tbody tr th {\n",
57
+ " vertical-align: top;\n",
58
+ " }\n",
59
+ "\n",
60
+ " .dataframe thead th {\n",
61
+ " text-align: right;\n",
62
+ " }\n",
63
+ "</style>\n",
64
+ "<table border=\"1\" class=\"dataframe\">\n",
65
+ " <thead>\n",
66
+ " <tr style=\"text-align: right;\">\n",
67
+ " <th></th>\n",
68
+ " <th>title</th>\n",
69
+ " <th>body</th>\n",
70
+ " <th>comment</th>\n",
71
+ " <th>L1</th>\n",
72
+ " <th>L2</th>\n",
73
+ " <th>L3</th>\n",
74
+ " <th>L4</th>\n",
75
+ " <th>L5</th>\n",
76
+ " <th>L6</th>\n",
77
+ " </tr>\n",
78
+ " </thead>\n",
79
+ " <tbody>\n",
80
+ " <tr>\n",
81
+ " <th>2k7915</th>\n",
82
+ " <td>Why is the NW map so small?</td>\n",
83
+ " <td>When did this change? It's not been fun spawni...</td>\n",
84
+ " <td>Maybe they need to keep the map large to start...</td>\n",
85
+ " <td>0</td>\n",
86
+ " <td>0</td>\n",
87
+ " <td>0</td>\n",
88
+ " <td>0</td>\n",
89
+ " <td>0</td>\n",
90
+ " <td>0</td>\n",
91
+ " </tr>\n",
92
+ " <tr>\n",
93
+ " <th>1k3845</th>\n",
94
+ " <td>Any updates in regards to the Flame War?</td>\n",
95
+ " <td>Just out of curiosity. I'm only wondering what...</td>\n",
96
+ " <td>Shut the fuck up freeloading asshat</td>\n",
97
+ " <td>3</td>\n",
98
+ " <td>1</td>\n",
99
+ " <td>0</td>\n",
100
+ " <td>0</td>\n",
101
+ " <td>0</td>\n",
102
+ " <td>1</td>\n",
103
+ " </tr>\n",
104
+ " <tr>\n",
105
+ " <th>1k8446</th>\n",
106
+ " <td>Hey</td>\n",
107
+ " <td>Im not phased by anything, love you all and I'...</td>\n",
108
+ " <td>MORE WIGGER SHIT TO DECODE</td>\n",
109
+ " <td>3</td>\n",
110
+ " <td>1</td>\n",
111
+ " <td>0</td>\n",
112
+ " <td>0</td>\n",
113
+ " <td>0</td>\n",
114
+ " <td>1</td>\n",
115
+ " </tr>\n",
116
+ " <tr>\n",
117
+ " <th>14k940</th>\n",
118
+ " <td>Any tips for final exams?</td>\n",
119
+ " <td>I am a first year student in Bachelor of Scien...</td>\n",
120
+ " <td>For Calc2, do past exams \\* 6, remember to exp...</td>\n",
121
+ " <td>0</td>\n",
122
+ " <td>0</td>\n",
123
+ " <td>0</td>\n",
124
+ " <td>0</td>\n",
125
+ " <td>0</td>\n",
126
+ " <td>0</td>\n",
127
+ " </tr>\n",
128
+ " <tr>\n",
129
+ " <th>12k646</th>\n",
130
+ " <td>My orthodontist just said I can't have nuts be...</td>\n",
131
+ " <td>What do I do I want to keep my nuts</td>\n",
132
+ " <td>just eat em and be careful it's fine</td>\n",
133
+ " <td>0</td>\n",
134
+ " <td>0</td>\n",
135
+ " <td>0</td>\n",
136
+ " <td>0</td>\n",
137
+ " <td>0</td>\n",
138
+ " <td>0</td>\n",
139
+ " </tr>\n",
140
+ " </tbody>\n",
141
+ "</table>\n",
142
+ "</div>"
143
+ ],
144
+ "text/plain": [
145
+ " title \\\n",
146
+ "2k7915 Why is the NW map so small? \n",
147
+ "1k3845 Any updates in regards to the Flame War? \n",
148
+ "1k8446 Hey \n",
149
+ "14k940 Any tips for final exams? \n",
150
+ "12k646 My orthodontist just said I can't have nuts be... \n",
151
+ "\n",
152
+ " body \\\n",
153
+ "2k7915 When did this change? It's not been fun spawni... \n",
154
+ "1k3845 Just out of curiosity. I'm only wondering what... \n",
155
+ "1k8446 Im not phased by anything, love you all and I'... \n",
156
+ "14k940 I am a first year student in Bachelor of Scien... \n",
157
+ "12k646 What do I do I want to keep my nuts \n",
158
+ "\n",
159
+ " comment L1 L2 L3 L4 L5 \\\n",
160
+ "2k7915 Maybe they need to keep the map large to start... 0 0 0 0 0 \n",
161
+ "1k3845 Shut the fuck up freeloading asshat 3 1 0 0 0 \n",
162
+ "1k8446 MORE WIGGER SHIT TO DECODE 3 1 0 0 0 \n",
163
+ "14k940 For Calc2, do past exams \\* 6, remember to exp... 0 0 0 0 0 \n",
164
+ "12k646 just eat em and be careful it's fine 0 0 0 0 0 \n",
165
+ "\n",
166
+ " L6 \n",
167
+ "2k7915 0 \n",
168
+ "1k3845 1 \n",
169
+ "1k8446 1 \n",
170
+ "14k940 0 \n",
171
+ "12k646 0 "
172
+ ]
173
+ },
174
+ "execution_count": 4,
175
+ "metadata": {},
176
+ "output_type": "execute_result"
177
+ }
178
+ ],
179
+ "source": [
180
+ "df.head()"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": 5,
186
+ "metadata": {},
187
+ "outputs": [],
188
+ "source": [
189
+ "#select body and L2, change L2 to Class\n",
190
+ "df_select = df[['body', 'L2']].copy()\n",
191
+ "df_select.rename(columns={'L2':'Class'}, inplace=True) \n"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": 6,
197
+ "metadata": {},
198
+ "outputs": [
199
+ {
200
+ "data": {
201
+ "text/html": [
202
+ "<div>\n",
203
+ "<style scoped>\n",
204
+ " .dataframe tbody tr th:only-of-type {\n",
205
+ " vertical-align: middle;\n",
206
+ " }\n",
207
+ "\n",
208
+ " .dataframe tbody tr th {\n",
209
+ " vertical-align: top;\n",
210
+ " }\n",
211
+ "\n",
212
+ " .dataframe thead th {\n",
213
+ " text-align: right;\n",
214
+ " }\n",
215
+ "</style>\n",
216
+ "<table border=\"1\" class=\"dataframe\">\n",
217
+ " <thead>\n",
218
+ " <tr style=\"text-align: right;\">\n",
219
+ " <th></th>\n",
220
+ " <th>body</th>\n",
221
+ " <th>Class</th>\n",
222
+ " </tr>\n",
223
+ " </thead>\n",
224
+ " <tbody>\n",
225
+ " <tr>\n",
226
+ " <th>2k7915</th>\n",
227
+ " <td>When did this change? It's not been fun spawni...</td>\n",
228
+ " <td>0</td>\n",
229
+ " </tr>\n",
230
+ " <tr>\n",
231
+ " <th>1k3845</th>\n",
232
+ " <td>Just out of curiosity. I'm only wondering what...</td>\n",
233
+ " <td>1</td>\n",
234
+ " </tr>\n",
235
+ " <tr>\n",
236
+ " <th>1k8446</th>\n",
237
+ " <td>Im not phased by anything, love you all and I'...</td>\n",
238
+ " <td>1</td>\n",
239
+ " </tr>\n",
240
+ " <tr>\n",
241
+ " <th>14k940</th>\n",
242
+ " <td>I am a first year student in Bachelor of Scien...</td>\n",
243
+ " <td>0</td>\n",
244
+ " </tr>\n",
245
+ " <tr>\n",
246
+ " <th>12k646</th>\n",
247
+ " <td>What do I do I want to keep my nuts</td>\n",
248
+ " <td>0</td>\n",
249
+ " </tr>\n",
250
+ " </tbody>\n",
251
+ "</table>\n",
252
+ "</div>"
253
+ ],
254
+ "text/plain": [
255
+ " body Class\n",
256
+ "2k7915 When did this change? It's not been fun spawni... 0\n",
257
+ "1k3845 Just out of curiosity. I'm only wondering what... 1\n",
258
+ "1k8446 Im not phased by anything, love you all and I'... 1\n",
259
+ "14k940 I am a first year student in Bachelor of Scien... 0\n",
260
+ "12k646 What do I do I want to keep my nuts 0"
261
+ ]
262
+ },
263
+ "execution_count": 6,
264
+ "metadata": {},
265
+ "output_type": "execute_result"
266
+ }
267
+ ],
268
+ "source": [
269
+ "df_select.head()"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 7,
275
+ "metadata": {},
276
+ "outputs": [
277
+ {
278
+ "data": {
279
+ "text/plain": [
280
+ "1 12577\n",
281
+ "0 11929\n",
282
+ "Name: Class, dtype: int64"
283
+ ]
284
+ },
285
+ "execution_count": 7,
286
+ "metadata": {},
287
+ "output_type": "execute_result"
288
+ }
289
+ ],
290
+ "source": [
291
+ "#check the number of each class\n",
292
+ "df_select['Class'].value_counts()\n"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": 8,
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "# save to csv\n",
302
+ "df_select.to_csv('reddit_annotated.csv', index=False)"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": 16,
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "ht = pd.read_table('/Users/liuxiaoquan/Documents/706/Final_project/RAL-E/retrain_reddit_abuse_test.txt', header=None)"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": 20,
317
+ "metadata": {},
318
+ "outputs": [
319
+ {
320
+ "data": {
321
+ "text/plain": [
322
+ "14932"
323
+ ]
324
+ },
325
+ "execution_count": 20,
326
+ "metadata": {},
327
+ "output_type": "execute_result"
328
+ }
329
+ ],
330
+ "source": [
331
+ "len(ht)"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": null,
337
+ "metadata": {},
338
+ "outputs": [],
339
+ "source": []
340
+ }
341
+ ],
342
+ "metadata": {
343
+ "kernelspec": {
344
+ "display_name": "Python 3.10.6 ('base')",
345
+ "language": "python",
346
+ "name": "python3"
347
+ },
348
+ "language_info": {
349
+ "codemirror_mode": {
350
+ "name": "ipython",
351
+ "version": 3
352
+ },
353
+ "file_extension": ".py",
354
+ "mimetype": "text/x-python",
355
+ "name": "python",
356
+ "nbconvert_exporter": "python",
357
+ "pygments_lexer": "ipython3",
358
+ "version": "3.10.6"
359
+ },
360
+ "orig_nbformat": 4,
361
+ "vscode": {
362
+ "interpreter": {
363
+ "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf"
364
+ }
365
+ }
366
+ },
367
+ "nbformat": 4,
368
+ "nbformat_minor": 2
369
+ }
reddit_annotated.csv ADDED
The diff for this file is too large to render. See raw diff