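"""Scrape Reddit posts and their top comments with PRAW and upload them as CSV
files to a Hugging Face dataset repository.

Credentials are read from environment variables: HF_TOKEN for the Hugging Face
Hub, and PRAW_CLIENT_ID, PRAW_CLIENT_SECRET and RPAW_AGENT for the Reddit API.
"""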
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder

# Store the Hugging Face token (read from the HF_TOKEN environment variable)
# so that subsequent HfApi calls are authenticated.
HfFolder.save_token(os.getenv("HF_TOKEN"))

# Initialise the PRAW Reddit client from credentials stored in environment
# variables; check_for_async=False suppresses PRAW's async-environment warning.
try:
    reddit = praw.Reddit(
        client_id=os.getenv("PRAW_CLIENT_ID"),
        client_secret=os.getenv("PRAW_CLIENT_SECRET"),
        user_agent=os.getenv("RPAW_AGENT"),
        check_for_async=False,
    )
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
except Exception as e:
    print(f"An error occurred: {str(e)}")


def scrape_reddit(subreddit_name=None, keywords=None, limit=1000):
    """Scrape posts and return them as a DataFrame.

    If subreddit_name is given, search it for keywords (or take its hot posts
    when no keywords are given). Otherwise, search r/all for the keywords.
    """
    posts_data = []

    if subreddit_name:
        subreddit = reddit.subreddit(subreddit_name)
        if keywords:
            posts = subreddit.search(keywords, limit=limit)
        else:
            posts = subreddit.hot(limit=limit)
    else:
        if not keywords:
            raise ValueError("Provide a subreddit_name, keywords, or both.")
        posts = reddit.subreddit("all").search(keywords, limit=limit)

    for post in posts:
        try:
            post_data = {
                "title": post.title,
                "score": post.score,
                "id": post.id,
                "url": post.url,
                "num_comments": post.num_comments,
                "created": datetime.fromtimestamp(post.created),
                "body": post.selftext,
                "subreddit": post.subreddit.display_name
            }
            posts_data.append(post_data)

            # Small pause between posts to stay well within Reddit's rate limits.
            time.sleep(0.1)

        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    df = pd.DataFrame(posts_data)
    if not df.empty:
        df['content'] = df['title'] + '\n' + df['body']
    return df


def get_comments(reddit, post_id, limit=100):
    """
    Get top comments from a specific post.

    Args:
        reddit: Reddit instance
        post_id (str): ID of the post to get comments from
        limit (int): Maximum number of comments to retrieve (default 100)

    Returns:
        pd.DataFrame: DataFrame containing top comments data
    """
    try:
        submission = reddit.submission(id=post_id)
        comments_data = []

        # Drop "load more comments" placeholders so only fully loaded comments remain.
        submission.comments.replace_more(limit=0)

        # Flatten the comment forest into a single list.
        all_comments = submission.comments.list()

        # Keep only the highest-scoring comments, up to `limit`.
        sorted_comments = sorted(all_comments, key=lambda x: x.score, reverse=True)[:limit]

        for comment in sorted_comments:
            try:
                comment_data = {
                    'comment_id': comment.id,
                    'post_id': post_id,
                    'post_title': submission.title,
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': datetime.fromtimestamp(comment.created_utc)
                }
                comments_data.append(comment_data)

            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue

        print(f"Collected {len(comments_data)} comments for post {post_id}")

        df = pd.DataFrame(comments_data)

        if not df.empty:
            # Sort comments by score, highest first.
            df = df.sort_values('score', ascending=False)

        return df

    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame()


def get_comments_and_upload(df, dataset_repo_id):
    """For each post in df, fetch its top comments and upload them as a CSV to the dataset repo."""
    api = HfApi()

    # Files already in the repo, used to skip posts whose comments were uploaded before.
    existing_files = api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")

    for index, row in df.iterrows():
        csv_file_path = f"comments_{row['id']}.csv"
        repo_csv_path = f"comments/{csv_file_path}"

        if repo_csv_path in existing_files:
            print(f"Comments for post {row['id']} already uploaded, skipping.")
            continue

        comments_df = get_comments(reddit, row['id'])

        if len(comments_df) == 0:
            print(f"No comments found for {row['id']}")
            continue

        comments_df.to_csv(csv_file_path, index=False)

        api.upload_file(
            path_or_fileobj=csv_file_path,
            path_in_repo=repo_csv_path,
            repo_id=dataset_repo_id,
            repo_type="dataset"
        )

        print(f"Uploaded {csv_file_path} to Hugging Face.")

        # Remove the local copy once it has been uploaded.
        os.remove(csv_file_path)

    print("All comments CSV files uploaded successfully!")


def main():
    try:
        df = scrape_reddit(keywords="election")

        if df is not None and not df.empty:
            print(f"Successfully scraped {len(df)} posts")

            # 'created' already holds datetime objects; this just normalises the dtype.
            df['created'] = pd.to_datetime(df['created'])
            df = df.sort_values(by='created', ascending=True)
            # Keep only posts created in 2024 or later.
            df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)

            dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"

            api = HfApi()
            try:
                api.dataset_info(dataset_repo_id)
                print(f"Dataset {dataset_repo_id} already exists.")
            except Exception:
                print(f"Dataset {dataset_repo_id} will be created.")
                api.create_repo(repo_id=dataset_repo_id, repo_type="dataset", exist_ok=True)

            today_date = datetime.now().strftime('%Y%m%d')
            filename = f"df_24_{today_date}.csv"

            df_24.to_csv(filename, index=False)

            api.upload_file(
                path_or_fileobj=filename,
                path_in_repo=f"submissions/{filename}",
                repo_id=dataset_repo_id,
                repo_type="dataset"
            )

            get_comments_and_upload(df_24, dataset_repo_id)

        else:
            print("No data was retrieved")

    except Exception as e:
        print(f"Error in main: {str(e)}")


if __name__ == '__main__':
    main()