Vera-ZWY committed on
Commit 513d672
1 Parent(s): f957c0d

Create praw_newgest_df2024.py

Files changed (1)
  1. praw_newgest_df2024.py +246 -0
praw_newgest_df2024.py ADDED
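
"""Scrape Reddit posts and comments with PRAW and upload them as CSVs to a
Hugging Face dataset repository."""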
# import gradio as gr
import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder
import time
import os
from datetime import datetime
# from tqdm import tqdm

HfFolder.save_token(os.getenv("HF_TOKEN"))

try:
    # Initialize the Reddit client from environment credentials
    reddit = praw.Reddit(
        client_id=os.getenv("PRAW_CLIENT_ID"),
        client_secret=os.getenv("PRAW_CLIENT_SECRET"),
        user_agent=os.getenv("RPAW_AGENT"),  # note: the secret is configured under the name "RPAW_AGENT"
        check_for_async=False
    )
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
    reddit = None
except Exception as e:
    print(f"An error occurred: {str(e)}")
    reddit = None


def scrape_reddit(subreddit_name=None, keywords=None, limit=1000):
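    """Scrape posts from a subreddit, or from r/all, optionally filtered by a keyword search.

    Args:
        subreddit_name (str, optional): Subreddit to scrape; searches r/all if omitted.
        keywords (str, optional): Search query; without it, the subreddit's hot posts are used.
        limit (int): Maximum number of posts to retrieve.

    Returns:
        pd.DataFrame: One row per post, with a combined title+body 'content' column.
    """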
    posts_data = []

    if subreddit_name:
        subreddit = reddit.subreddit(subreddit_name)
        if keywords:
            posts = subreddit.search(keywords, limit=limit)
        else:
            posts = subreddit.hot(limit=limit)
    else:
        posts = reddit.subreddit("all").search(keywords, limit=limit)

    for post in posts:
        try:
            post_data = {
                "title": post.title,
                "score": post.score,
                "id": post.id,
                "url": post.url,
                "num_comments": post.num_comments,
                "created": datetime.fromtimestamp(post.created),
                "body": post.selftext,
                "subreddit": post.subreddit.display_name
            }
            posts_data.append(post_data)

            # Add a small delay to avoid hitting rate limits
            time.sleep(0.1)

        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    df = pd.DataFrame(posts_data)
    if not df.empty:
        df['content'] = df['title'] + '\n' + df['body']
    return df


def get_comments(reddit, post_id, limit=100):
    """
    Get top comments from a specific post.

    Args:
        reddit: Reddit instance
        post_id (str): ID of the post to get comments from
        limit (int): Maximum number of comments to retrieve (default 100)

    Returns:
        pd.DataFrame: DataFrame containing top comments data
    """
    try:
        submission = reddit.submission(id=post_id)
        comments_data = []

        # Skip "MoreComments" placeholders instead of expanding them, to save time
        submission.comments.replace_more(limit=0)

        # Flatten the comment forest into a single list
        all_comments = submission.comments.list()

        # Sort comments by score and take the top ones
        sorted_comments = sorted(all_comments, key=lambda x: x.score, reverse=True)[:limit]

        for comment in sorted_comments:
            try:
                comment_data = {
                    'comment_id': comment.id,
                    'post_id': post_id,
                    'post_title': submission.title,
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': datetime.fromtimestamp(comment.created_utc)
                }
                comments_data.append(comment_data)

            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue

        # Create the DataFrame, sorted by score (highest first)
        df = pd.DataFrame(comments_data)
        if not df.empty:
            df = df.sort_values('score', ascending=False)

        return df

    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame()


def get_comments_and_upload(df, dataset_repo_id):
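    """For each post in df, fetch its top comments and upload them as a CSV
    into the 'comments/' folder of the given Hugging Face dataset repo."""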
    # Initialize the Hugging Face API
    api = HfApi()

    existing_files = api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")

    # Iterate over each submission in the DataFrame
    for index, row in df.iterrows():
        csv_file_path = f"comments_{row['id']}.csv"
        repo_csv_path = f"comments/{csv_file_path}"

        # Skip submissions whose comments file already exists in the dataset
        if repo_csv_path in existing_files:
            print(f"{csv_file_path} already exists in the dataset. Skipping upload.")
            continue

        # Fetch comments for the current submission
        comments_df = get_comments(reddit, row['id'])

        if comments_df.empty:
            print(f"No comments found for {row['id']}")
            continue

        # Save the comments DataFrame as a CSV file
        comments_df.to_csv(csv_file_path, index=False)

        # Upload the CSV file into a 'comments' folder in the dataset repo
        api.upload_file(
            path_or_fileobj=csv_file_path,
            path_in_repo=repo_csv_path,
            repo_id=dataset_repo_id,
            repo_type="dataset"
        )

        print(f"Uploaded {csv_file_path} to Hugging Face.")

        # Delete the local CSV file to save space
        os.remove(csv_file_path)

    print("All comments CSV files uploaded successfully!")


def main():
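    """Scrape election posts, keep those from 2024 onward, and upload the
    posts and their comments to a Hugging Face dataset repository."""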
    try:
        # Search all of Reddit for election-related posts
        df = scrape_reddit(keywords="election")

        if df is not None and not df.empty:
            print(f"Successfully scraped {len(df)} posts")

            # 'created' is already a datetime column; sort chronologically
            # and keep only posts from 2024 onward
            df = df.sort_values(by='created', ascending=True)
            df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)

            dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"

            # Create the dataset repo if it doesn't exist yet
            api = HfApi()
            try:
                api.dataset_info(dataset_repo_id)
                print(f"Dataset {dataset_repo_id} already exists.")
            except Exception:
                print(f"Dataset {dataset_repo_id} will be created.")
                api.create_repo(repo_id=dataset_repo_id, repo_type="dataset")

            # Upload the filtered posts as a single CSV
            csv_file_path = "df_24.csv"
            df_24.to_csv(csv_file_path, index=False)

            api.upload_file(
                path_or_fileobj=csv_file_path,
                path_in_repo="df_24_newest.csv",
                repo_id=dataset_repo_id,
                repo_type="dataset"
            )

            # Then fetch and upload the comments for each of those posts
            get_comments_and_upload(df_24, dataset_repo_id)

        else:
            print("No data was retrieved")

    except Exception as e:
        print(f"Error in main: {str(e)}")


if __name__ == '__main__':
    main()