{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "from pathlib import Path\n", "\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Directory holding the project data. Set REDDIT_DATA_DIR to point elsewhere;\n", "# the default is the original location, so existing runs are unaffected.\n", "DATA_DIR = Path(os.environ.get('REDDIT_DATA_DIR', '/Users/liuxiaoquan/Documents/706/Final_project'))\n", "\n", "# orient='index' turns each top-level JSON key into a row label.\n", "df = pd.read_json(DATA_DIR / 'Reddit_new.json', orient='index')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "24506" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of rows loaded.\n", "len(df)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlebodycommentL1L2L3L4L5L6
2k7915Why is the NW map so small?When did this change? It's not been fun spawni...Maybe they need to keep the map large to start...000000
1k3845Any updates in regards to the Flame War?Just out of curiosity. I'm only wondering what...Shut the fuck up freeloading asshat310001
1k8446HeyIm not phased by anything, love you all and I'...MORE WIGGER SHIT TO DECODE310001
14k940Any tips for final exams?I am a first year student in Bachelor of Scien...For Calc2, do past exams \\* 6, remember to exp...000000
12k646My orthodontist just said I can't have nuts be...What do I do I want to keep my nutsjust eat em and be careful it's fine000000
\n", "
" ], "text/plain": [ " title \\\n", "2k7915 Why is the NW map so small? \n", "1k3845 Any updates in regards to the Flame War? \n", "1k8446 Hey \n", "14k940 Any tips for final exams? \n", "12k646 My orthodontist just said I can't have nuts be... \n", "\n", " body \\\n", "2k7915 When did this change? It's not been fun spawni... \n", "1k3845 Just out of curiosity. I'm only wondering what... \n", "1k8446 Im not phased by anything, love you all and I'... \n", "14k940 I am a first year student in Bachelor of Scien... \n", "12k646 What do I do I want to keep my nuts \n", "\n", " comment L1 L2 L3 L4 L5 \\\n", "2k7915 Maybe they need to keep the map large to start... 0 0 0 0 0 \n", "1k3845 Shut the fuck up freeloading asshat 3 1 0 0 0 \n", "1k8446 MORE WIGGER SHIT TO DECODE 3 1 0 0 0 \n", "14k940 For Calc2, do past exams \\* 6, remember to exp... 0 0 0 0 0 \n", "12k646 just eat em and be careful it's fine 0 0 0 0 0 \n", "\n", " L6 \n", "2k7915 0 \n", "1k3845 1 \n", "1k8446 1 \n", "14k940 0 \n", "12k646 0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Keep the post body and the L2 column, exposing L2 under the name 'Class'.\n", "# rename() returns a new frame, so df is untouched and this cell is safe to re-run.\n", "# NOTE(review): in the df.head() preview, the rows with L2 == 1 have the abusive text in\n", "# 'comment' rather than 'body' -- confirm that 'body' is the intended text column.\n", "df_select = df[['body', 'L2']].rename(columns={'L2': 'Class'})" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bodyClass
2k7915When did this change? It's not been fun spawni...0
1k3845Just out of curiosity. I'm only wondering what...1
1k8446Im not phased by anything, love you all and I'...1
14k940I am a first year student in Bachelor of Scien...0
12k646What do I do I want to keep my nuts0
\n", "
" ], "text/plain": [ " body Class\n", "2k7915 When did this change? It's not been fun spawni... 0\n", "1k3845 Just out of curiosity. I'm only wondering what... 1\n", "1k8446 Im not phased by anything, love you all and I'... 1\n", "14k940 I am a first year student in Bachelor of Scien... 0\n", "12k646 What do I do I want to keep my nuts 0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_select.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 12577\n", "0 11929\n", "Name: Class, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Class balance: how many rows carry each value of 'Class'.\n", "df_select['Class'].value_counts()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Write the two-column frame to the working directory; the row index is not exported.\n", "df_select.to_csv('reddit_annotated.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# header=None: the first line of the text file is read as data, not as column names.\n", "ral_e_test_path = '/Users/liuxiaoquan/Documents/706/Final_project/RAL-E/retrain_reddit_abuse_test.txt'\n", "ht = pd.read_table(ral_e_test_path, header=None)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "14932" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of rows read from retrain_reddit_abuse_test.txt.\n", "len(ht)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.6 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf" } } }, "nbformat": 4, "nbformat_minor": 2 }