Spaces:
Runtime error
Runtime error
Susanna Anil
committed on
Commit
·
8e9a7ea
1
Parent(s):
ad0c90c
start scrape and classify
Browse files- scrape_load.ipynb +94 -0
scrape_load.ipynb
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 7,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import praw\n",
|
10 |
+
"import pandas as pd\n",
|
11 |
+
"\n",
|
12 |
+
"reddit= praw.Reddit(client_id=\"Q1w42RHhLq2fgwljAk_k-Q\",\t\t # your client id\n",
|
13 |
+
"\t\t\t\t\tclient_secret=\"enUJfFthiZRynGfPQtoK1nCxRer2Dw\",\t # your client secret\n",
|
14 |
+
"                    username = \"xl395\",     #profile username\n",
|
15 |
+
" password = \"12xiao34quanAria!\", #profile password\n",
|
16 |
+
"\t\t\t\t\tuser_agent=\"706_post\")\t # your user agent"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"cell_type": "code",
|
21 |
+
"execution_count": 35,
|
22 |
+
"metadata": {},
|
23 |
+
"outputs": [
|
24 |
+
{
|
25 |
+
"ename": "SyntaxError",
|
26 |
+
"evalue": "expected ':' (3580341109.py, line 8)",
|
27 |
+
"output_type": "error",
|
28 |
+
"traceback": [
|
29 |
+
"\u001b[0;36m Cell \u001b[0;32mIn[35], line 8\u001b[0;36m\u001b[0m\n\u001b[0;31m def extract_comments(input_url)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m expected ':'\n"
|
30 |
+
]
|
31 |
+
}
|
32 |
+
],
|
33 |
+
"source": [
|
34 |
+
"from praw.models import MoreComments\n",
|
35 |
+
"from transformers import pipeline\n",
|
36 |
+
"from transformers import DistilBertTokenizerFast\n",
|
37 |
+
"classifier = pipeline(\"sentiment-analysis\", model=\"michellejieli/NSFW_text_classifier\")\n",
|
38 |
+
"\n",
|
39 |
+
"input_url = \"https://www.reddit.com/r/europe/comments/r0hthg/sweden_is_taking_the_lead_to_persuade_the_rest_of/\"\n",
|
40 |
+
"\n",
|
41 |
+
"def extract_comments(input_url):\n",
|
42 |
+
" submission = reddit.submission(url=input_url)\n",
|
43 |
+
" posts_dict = {\"Post text\":[],}\n",
|
44 |
+
" # posts_dict = {\"Post text\":[], \"class\": []}\n",
|
45 |
+
" for top_level_comment in submission.comments:\n",
|
46 |
+
" if isinstance(top_level_comment, MoreComments):\n",
|
47 |
+
" continue\n",
|
48 |
+
" \n",
|
49 |
+
" posts_dict[\"Post text\"].append(top_level_comment.body)\n",
|
50 |
+
"        #posts_dict[\"class\"].append(classifier(top_level_comment.body))\n",
|
51 |
+
" df = pd.DataFrame(posts_dict)\n",
|
52 |
+
" return df "
|
53 |
+
]
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"cell_type": "code",
|
57 |
+
"execution_count": null,
|
58 |
+
"metadata": {},
|
59 |
+
"outputs": [],
|
60 |
+
"source": [
|
61 |
+
"df_process = extract_comments(input_url)\n",
|
62 |
+
"\n",
|
63 |
+
"# find how many are NSFW - output message"
|
64 |
+
]
|
65 |
+
}
|
66 |
+
],
|
67 |
+
"metadata": {
|
68 |
+
"kernelspec": {
|
69 |
+
"display_name": "Python 3",
|
70 |
+
"language": "python",
|
71 |
+
"name": "python3"
|
72 |
+
},
|
73 |
+
"language_info": {
|
74 |
+
"codemirror_mode": {
|
75 |
+
"name": "ipython",
|
76 |
+
"version": 3
|
77 |
+
},
|
78 |
+
"file_extension": ".py",
|
79 |
+
"mimetype": "text/x-python",
|
80 |
+
"name": "python",
|
81 |
+
"nbconvert_exporter": "python",
|
82 |
+
"pygments_lexer": "ipython3",
|
83 |
+
"version": "3.10.4"
|
84 |
+
},
|
85 |
+
"orig_nbformat": 4,
|
86 |
+
"vscode": {
|
87 |
+
"interpreter": {
|
88 |
+
"hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
|
89 |
+
}
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"nbformat": 4,
|
93 |
+
"nbformat_minor": 2
|
94 |
+
}
|