Susanna Anil committed on
Commit 8e9a7ea · 1 Parent(s): ad0c90c

start scrape and classify

Files changed (1)
  1. scrape_load.ipynb +94 -0
scrape_load.ipynb ADDED
@@ -0,0 +1,94 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import praw\n",
+     "import pandas as pd\n",
+     "\n",
+     "reddit = praw.Reddit(client_id=\"Q1w42RHhLq2fgwljAk_k-Q\",            # your client id\n",
+     "                     client_secret=\"enUJfFthiZRynGfPQtoK1nCxRer2Dw\", # your client secret\n",
+     "                     username=\"xl395\",                               # profile username\n",
+     "                     password=\"12xiao34quanAria!\",                   # profile password\n",
+     "                     user_agent=\"706_post\")                          # your user agent"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 35,
+    "metadata": {},
+    "outputs": [
+     {
+      "ename": "SyntaxError",
+      "evalue": "expected ':' (3580341109.py, line 8)",
+      "output_type": "error",
+      "traceback": [
+       "\u001b[0;36m Cell \u001b[0;32mIn[35], line 8\u001b[0;36m\u001b[0m\n\u001b[0;31m def extract_comments(input_url)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m expected ':'\n"
+      ]
+     }
+    ],
+    "source": [
+     "from praw.models import MoreComments\n",
+     "from transformers import pipeline\n",
+     "from transformers import DistilBertTokenizerFast\n",
+     "classifier = pipeline(\"sentiment-analysis\", model=\"michellejieli/NSFW_text_classifier\")\n",
+     "\n",
+     "input_url = \"https://www.reddit.com/r/europe/comments/r0hthg/sweden_is_taking_the_lead_to_persuade_the_rest_of/\"\n",
+     "\n",
+     "def extract_comments(input_url):\n",
+     "    submission = reddit.submission(url=input_url)\n",
+     "    posts_dict = {\"Post text\": []}\n",
+     "    # posts_dict = {\"Post text\": [], \"class\": []}\n",
+     "    for top_level_comment in submission.comments:\n",
+     "        if isinstance(top_level_comment, MoreComments):\n",
+     "            continue\n",
+     "\n",
+     "        posts_dict[\"Post text\"].append(top_level_comment.body)\n",
+     "        # posts_dict[\"class\"].append(classifier(top_level_comment.body))\n",
+     "    df = pd.DataFrame(posts_dict)\n",
+     "    return df"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df_process = extract_comments(input_url)\n",
+     "\n",
+     "# find how many are NSFW - output message"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.4"
+   },
+   "orig_nbformat": 4,
+   "vscode": {
+    "interpreter": {
+     "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
+    }
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
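
The last cell ends with a TODO ("find how many are NSFW - output message"). The sketch below is one possible way to complete that step; it is not part of the committed notebook. It assumes the df_process DataFrame and classifier pipeline defined above, and it assumes the model reports the label string "NSFW" for flagged text (per the michellejieli/NSFW_text_classifier model card), which should be verified against the pipeline's actual output. The helper name count_nsfw and the added "class" column are hypothetical, mirroring the commented-out "class" key in extract_comments.

# Hypothetical follow-up for the TODO in the last cell; not part of this commit.
def count_nsfw(df, classifier):
    """Classify each scraped comment and report how many were flagged NSFW."""
    texts = df["Post text"].tolist()
    # The text-classification pipeline accepts a list of strings and returns one
    # dict per input, e.g. {"label": "NSFW", "score": 0.97}; truncation=True guards
    # against comments longer than the model's maximum sequence length.
    results = classifier(texts, truncation=True)
    labels = [r["label"] for r in results]
    df = df.assign(**{"class": labels})                   # mirrors the commented-out "class" column
    n_nsfw = sum(label == "NSFW" for label in labels)     # assumes "NSFW" is the model's label string
    print(f"{n_nsfw} of {len(df)} comments were classified as NSFW.")
    return df

# Usage with the objects defined in the notebook:
# df_labeled = count_nsfw(df_process, classifier)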