{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# read json file, change row to column\n",
"df = pd.read_json('/Users/liuxiaoquan/Documents/706/Final_project/Reddit_new.json', orient='index')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"24506"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" body | \n",
" comment | \n",
" L1 | \n",
" L2 | \n",
" L3 | \n",
" L4 | \n",
" L5 | \n",
" L6 | \n",
"
\n",
" \n",
" \n",
" \n",
" 2k7915 | \n",
" Why is the NW map so small? | \n",
" When did this change? It's not been fun spawni... | \n",
" Maybe they need to keep the map large to start... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1k3845 | \n",
" Any updates in regards to the Flame War? | \n",
" Just out of curiosity. I'm only wondering what... | \n",
" Shut the fuck up freeloading asshat | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1k8446 | \n",
" Hey | \n",
" Im not phased by anything, love you all and I'... | \n",
" MORE WIGGER SHIT TO DECODE | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 14k940 | \n",
" Any tips for final exams? | \n",
" I am a first year student in Bachelor of Scien... | \n",
" For Calc2, do past exams \\* 6, remember to exp... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 12k646 | \n",
" My orthodontist just said I can't have nuts be... | \n",
" What do I do I want to keep my nuts | \n",
" just eat em and be careful it's fine | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title \\\n",
"2k7915 Why is the NW map so small? \n",
"1k3845 Any updates in regards to the Flame War? \n",
"1k8446 Hey \n",
"14k940 Any tips for final exams? \n",
"12k646 My orthodontist just said I can't have nuts be... \n",
"\n",
" body \\\n",
"2k7915 When did this change? It's not been fun spawni... \n",
"1k3845 Just out of curiosity. I'm only wondering what... \n",
"1k8446 Im not phased by anything, love you all and I'... \n",
"14k940 I am a first year student in Bachelor of Scien... \n",
"12k646 What do I do I want to keep my nuts \n",
"\n",
" comment L1 L2 L3 L4 L5 \\\n",
"2k7915 Maybe they need to keep the map large to start... 0 0 0 0 0 \n",
"1k3845 Shut the fuck up freeloading asshat 3 1 0 0 0 \n",
"1k8446 MORE WIGGER SHIT TO DECODE 3 1 0 0 0 \n",
"14k940 For Calc2, do past exams \\* 6, remember to exp... 0 0 0 0 0 \n",
"12k646 just eat em and be careful it's fine 0 0 0 0 0 \n",
"\n",
" L6 \n",
"2k7915 0 \n",
"1k3845 1 \n",
"1k8446 1 \n",
"14k940 0 \n",
"12k646 0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#select body and L2, change L2 to Class\n",
"df_select = df[['body', 'L2']].copy()\n",
"df_select.rename(columns={'L2':'Class'}, inplace=True) \n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" body | \n",
" Class | \n",
"
\n",
" \n",
" \n",
" \n",
" 2k7915 | \n",
" When did this change? It's not been fun spawni... | \n",
" 0 | \n",
"
\n",
" \n",
" 1k3845 | \n",
" Just out of curiosity. I'm only wondering what... | \n",
" 1 | \n",
"
\n",
" \n",
" 1k8446 | \n",
" Im not phased by anything, love you all and I'... | \n",
" 1 | \n",
"
\n",
" \n",
" 14k940 | \n",
" I am a first year student in Bachelor of Scien... | \n",
" 0 | \n",
"
\n",
" \n",
" 12k646 | \n",
" What do I do I want to keep my nuts | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" body Class\n",
"2k7915 When did this change? It's not been fun spawni... 0\n",
"1k3845 Just out of curiosity. I'm only wondering what... 1\n",
"1k8446 Im not phased by anything, love you all and I'... 1\n",
"14k940 I am a first year student in Bachelor of Scien... 0\n",
"12k646 What do I do I want to keep my nuts 0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_select.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 12577\n",
"0 11929\n",
"Name: Class, dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#check the number of each class\n",
"df_select['Class'].value_counts()\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# save to csv\n",
"df_select.to_csv('reddit_annotated.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"ht = pd.read_table('/Users/liuxiaoquan/Documents/706/Final_project/RAL-E/retrain_reddit_abuse_test.txt', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"14932"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(ht)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.6 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}