Susanna Anil committed on
Commit 8e9a7ea · 1 Parent(s): ad0c90c

start scrape and classify

Files changed (1)
  1. scrape_load.ipynb +94 -0
scrape_load.ipynb ADDED
@@ -0,0 +1,94 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import praw\n",
+     "import pandas as pd\n",
+     "\n",
+     "reddit = praw.Reddit(client_id=\"Q1w42RHhLq2fgwljAk_k-Q\",            # your client id\n",
+     "                     client_secret=\"enUJfFthiZRynGfPQtoK1nCxRer2Dw\", # your client secret\n",
+     "                     username=\"xl395\",                               # profile username\n",
+     "                     password=\"12xiao34quanAria!\",                   # profile password\n",
+     "                     user_agent=\"706_post\")                          # your user agent"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 35,
+    "metadata": {},
+    "outputs": [
+     {
+      "ename": "SyntaxError",
+      "evalue": "expected ':' (3580341109.py, line 8)",
+      "output_type": "error",
+      "traceback": [
+       "\u001b[0;36m Cell \u001b[0;32mIn[35], line 8\u001b[0;36m\u001b[0m\n\u001b[0;31m def extract_comments(input_url)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m expected ':'\n"
+      ]
+     }
+    ],
+    "source": [
+     "from praw.models import MoreComments\n",
+     "from transformers import pipeline\n",
+     "from transformers import DistilBertTokenizerFast\n",
+     "classifier = pipeline(\"sentiment-analysis\", model=\"michellejieli/NSFW_text_classifier\")\n",
+     "\n",
+     "input_url = \"https://www.reddit.com/r/europe/comments/r0hthg/sweden_is_taking_the_lead_to_persuade_the_rest_of/\"\n",
+     "\n",
+     "def extract_comments(input_url):\n",
+     "    submission = reddit.submission(url=input_url)\n",
+     "    posts_dict = {\"Post text\": []}\n",
+     "    # posts_dict = {\"Post text\": [], \"class\": []}\n",
+     "    for top_level_comment in submission.comments:\n",
+     "        if isinstance(top_level_comment, MoreComments):\n",
+     "            continue\n",
+     "\n",
+     "        posts_dict[\"Post text\"].append(top_level_comment.body)\n",
+     "        # posts_dict[\"class\"].append(classifier(top_level_comment.body))\n",
+     "    df = pd.DataFrame(posts_dict)\n",
+     "    return df"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df_process = extract_comments(input_url)\n",
+     "\n",
+     "# find how many are NSFW - output message"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.4"
+   },
+   "orig_nbformat": 4,
+   "vscode": {
+    "interpreter": {
+     "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
+    }
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
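
The last cell ends with a TODO ("find how many are NSFW - output message"). The sketch below is one possible way to complete that step; it is not part of the committed notebook. It assumes the df_process DataFrame and classifier pipeline defined above, and it assumes the model reports the label string "NSFW" for flagged text (per the michellejieli/NSFW_text_classifier model card), which should be verified against the pipeline's actual output. The helper name count_nsfw and the added "class" column are hypothetical, mirroring the commented-out "class" key in extract_comments.

# Hypothetical follow-up for the TODO in the last cell; not part of this commit.
def count_nsfw(df, classifier):
    """Classify each scraped comment and report how many were flagged NSFW."""
    texts = df["Post text"].tolist()
    # The text-classification pipeline accepts a list of strings and returns one
    # dict per input, e.g. {"label": "NSFW", "score": 0.97}; truncation=True guards
    # against comments longer than the model's maximum sequence length.
    results = classifier(texts, truncation=True)
    labels = [r["label"] for r in results]
    df = df.assign(**{"class": labels})                   # mirrors the commented-out "class" column
    n_nsfw = sum(label == "NSFW" for label in labels)     # assumes "NSFW" is the model's label string
    print(f"{n_nsfw} of {len(df)} comments were classified as NSFW.")
    return df

# Usage with the objects defined in the notebook:
# df_labeled = count_nsfw(df_process, classifier)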