Create praw_newgest_df2024.py
praw_newgest_df2024.py (ADDED, +246 -0)
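"""Scrape election-related Reddit submissions with PRAW and upload them,
together with the top comments of each post, as CSV files to a Hugging Face
dataset repository."""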
# import gradio as gr
import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder
import time
import os
from datetime import datetime
# from tqdm import tqdm

HfFolder.save_token(os.getenv("HF_TOKEN"))

try:
    # def initialize_reddit():
    reddit = praw.Reddit(
        client_id=os.getenv("PRAW_CLIENT_ID"),
        client_secret=os.getenv("PRAW_CLIENT_SECRET"),
        user_agent=os.getenv("RPAW_AGENT"),
        check_for_async=False
    )
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
    # return None
except Exception as e:
    print(f"An error occurred: {str(e)}")
    # return None

def scrape_reddit(subreddit_name=None, keywords=None, limit=1000):
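    """
    Scrape Reddit posts with PRAW.

    Args:
        subreddit_name (str, optional): Subreddit to scrape; if omitted, r/all is searched.
        keywords (str, optional): Search query; if omitted, hot posts from the subreddit are used.
        limit (int): Maximum number of posts to request (default 1000).

    Returns:
        pd.DataFrame: One row per post with title, score, id, url, num_comments,
        created, body and subreddit, plus a combined 'content' column (title + body).
    """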
    posts_data = []

    if subreddit_name:
        subreddit = reddit.subreddit(subreddit_name)
        if keywords:
            posts = subreddit.search(keywords, limit=limit)
        else:
            posts = subreddit.hot(limit=limit)
    else:
        posts = reddit.subreddit("all").search(keywords, limit=limit)
    # print(posts)
    for post in posts:
        # print(post.title)
        try:
            post_data = {
                "title": post.title,
                "score": post.score,
                "id": post.id,
                "url": post.url,
                "num_comments": post.num_comments,
                "created": datetime.fromtimestamp(post.created),
                "body": post.selftext,
                "subreddit": post.subreddit.display_name
            }
            posts_data.append(post_data)

            # Add a small delay to avoid hitting rate limits
            time.sleep(0.1)

        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    df = pd.DataFrame(posts_data)
    # Guard against an empty result, which would otherwise raise a KeyError
    if not df.empty:
        df['content'] = df['title'] + '\n' + df['body']
    return df


def get_comments(reddit, post_id, limit=100):
    """
    Get top comments from a specific post.

    Args:
        reddit: Reddit instance
        post_id (str): ID of the post to get comments from
        limit (int): Maximum number of comments to retrieve (default 100)

    Returns:
        pd.DataFrame: DataFrame containing top comments data
    """
    try:
        submission = reddit.submission(id=post_id)
        comments_data = []

        # Drop MoreComments placeholders instead of expanding them, to save time
        submission.comments.replace_more(limit=0)

        # Flatten all loaded comments (top-level comments and loaded replies)
        all_comments = submission.comments.list()

        # Sort comments by score and take the top ones
        sorted_comments = sorted(all_comments, key=lambda x: x.score, reverse=True)[:limit]

        for comment in sorted_comments:
            try:
                comment_data = {
                    'comment_id': comment.id,
                    'post_id': post_id,
                    'post_title': submission.title,
                    # 'author': str(comment.author) if comment.author else '[deleted]',
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': datetime.fromtimestamp(comment.created_utc)
                    # 'parent_id': comment.parent_id,
                    # 'is_submitter': comment.is_submitter
                }
                comments_data.append(comment_data)

            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue
        print(comments_data)

        # Create DataFrame
        df = pd.DataFrame(comments_data)

        # Sort by score (highest first)
        if not df.empty:
            print("sort comments by score")
            df = df.sort_values('score', ascending=False)

        return df

    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame()


def get_comments_and_upload(df, dataset_repo_id):
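    """
    For each submission in df, fetch its top comments via get_comments() using the
    module-level reddit instance, save them as comments_<post_id>.csv, and upload
    each file into the 'comments/' folder of the given Hugging Face dataset repo.
    """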
    # Initialize the Hugging Face API
    api = HfApi()

    existing_files = api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")

    # Iterate over each submission in the DataFrame
    for index, row in df.iterrows():
        # Define a unique CSV filename for each submission based on its ID
        csv_file_path = f"comments_{row['id']}.csv"
        repo_csv_path = f"comments/{csv_file_path}"

        # Check if this file already exists in the Hugging Face dataset
        # if repo_csv_path in existing_files:
        #     print(f"{csv_file_path} already exists in the dataset. Skipping upload.")
        #     continue

        # Fetch comments for the current submission
        comments_df = get_comments(reddit, row['id'])

        # # Prepare data for the current submission's comments
        # comments_data = [{
        #     'comment_id': comment.id,
        #     'comment_content': comment.body,
        #     'comment_created': comment.created,
        #     'submission_id': row['id']
        # } for comment in comments]

        # Create a DataFrame for the current submission's comments
        # comments_df = pd.DataFrame(comments_data, columns=['comment_id', 'comment_content', 'comment_created', 'submission_id'])

        if len(comments_df) == 0:
            print(f"No comments found for {row['id']}")
            # continue

        # Save the comments DataFrame as a CSV file
        comments_df.to_csv(csv_file_path, index=False)

        # Upload the CSV file to the Hugging Face dataset repository
        api.upload_file(
            path_or_fileobj=csv_file_path,
            path_in_repo=repo_csv_path,  # Save in a 'comments' folder in the dataset repo
            repo_id=dataset_repo_id,
            repo_type="dataset"
        )

        print(f"Uploaded {csv_file_path} to Hugging Face.")

        # Optionally, delete the local CSV file to save space
        os.remove(csv_file_path)

    print("All comments CSV files uploaded successfully!")


def main():
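    """Scrape election posts, keep those from 2024 onward, and upload submissions and comments to the Hub."""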
    # Example usage

    try:
        # Search for election-related posts across all of Reddit
        df = scrape_reddit(keywords="election")

        if df is not None and not df.empty:
            print(f"Successfully scraped {len(df)} posts")
            # Save to CSV
            # df.to_csv("reddit_2016_election_posts.csv", index=False)
            # 'created' already holds datetime objects, so no unit conversion is needed
            df['created'] = pd.to_datetime(df['created'])
            df = df.sort_values(by='created', ascending=True)
            df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)
            # df_16 = df_16[df_16['created'] > '2015-12-31'].reset_index(drop=True)

            dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"
            # Create the dataset repo if it doesn't exist
            api = HfApi()
            try:
                api.dataset_info(dataset_repo_id)
                # dataset_exists = True
                print(f"Dataset {dataset_repo_id} already exists.")

            except Exception:
                # dataset_exists = False
                print(f"Dataset {dataset_repo_id} will be created.")
                # The dataset doesn't exist yet: create it before uploading the CSV file
                api.create_repo(repo_id=dataset_repo_id, repo_type="dataset")

            df_24.to_csv("df_24.csv", index=False)
            csv_file_path = "df_24.csv"

            api.upload_file(
                path_or_fileobj=csv_file_path,
                path_in_repo="df_24_newest.csv",
                repo_id=dataset_repo_id,
                repo_type="dataset"
            )

            get_comments_and_upload(df_24, dataset_repo_id)

        else:
            print("No data was retrieved")

    except Exception as e:
        print(f"Error in main: {str(e)}")


if __name__ == '__main__':
    main()
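# A minimal sketch (not part of the pipeline above) of how the uploaded submissions
# CSV could be read back from the Hub for a quick sanity check. It assumes the same
# repo id and filename used in main() and that huggingface_hub and pandas are installed:
#
#   from huggingface_hub import hf_hub_download
#   path = hf_hub_download(repo_id="Vera-ZWY/reddite2024elections_submissions",
#                          filename="df_24_newest.csv", repo_type="dataset")
#   print(pd.read_csv(path).head())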