import os
import time
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder
# Persist the Hugging Face token (from the HF_TOKEN environment variable) so
# that later HfApi calls can authenticate.  os.getenv returns None when the
# variable is unset, and saving None is not a valid token, so only save when
# a value is actually present.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    HfFolder.save_token(_hf_token)
else:
    print("HF_TOKEN is not set; skipping Hugging Face token save.")

# Build the module-level Reddit client used by the functions below.
# Credentials come from the environment.
# NOTE(review): the user agent is read from "RPAW_AGENT" (sic) -- this looks
# like a typo for "PRAW_AGENT"; confirm against the deployment environment
# before renaming, since changing the name changes which variable is read.
# If construction fails the error is only printed, so ``reddit`` stays
# undefined and any later use raises NameError.
try:
    reddit = praw.Reddit(
        client_id=os.getenv("PRAW_CLIENT_ID"),
        client_secret=os.getenv("PRAW_CLIENT_SECRET"),
        user_agent=os.getenv("RPAW_AGENT"),
        check_for_async=False,
    )
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
except Exception as e:
    print(f"An error occurred: {str(e)}")
def scrape_reddit(subreddit_name=None, keywords=None, limit=1000):
    """Collect Reddit submissions into a DataFrame.

    Uses the module-level ``reddit`` client.  Posts come from a keyword
    search when ``keywords`` is given, otherwise from the "hot" listing.

    Args:
        subreddit_name (str | None): Subreddit to read from.  When omitted,
            the special "all" subreddit is used.
        keywords (str | None): Search query.  When omitted, the hot listing
            of the chosen subreddit is used instead of a search.
        limit (int): Maximum number of posts to request.

    Returns:
        pd.DataFrame: One row per post with the columns ``title``, ``score``,
        ``id``, ``url``, ``num_comments``, ``created``, ``body``,
        ``subreddit`` and ``content`` (title, newline, body).  When no post
        was collected the frame is empty but still has these columns.
    """
    columns = [
        "title",
        "score",
        "id",
        "url",
        "num_comments",
        "created",
        "body",
        "subreddit",
    ]

    # Fall back to "all" when no subreddit is named, and to the hot listing
    # when there is nothing to search for (a search without a query is not
    # meaningful).
    subreddit = reddit.subreddit(subreddit_name or "all")
    if keywords:
        posts = subreddit.search(keywords, limit=limit)
    else:
        posts = subreddit.hot(limit=limit)

    posts_data = []
    for post in posts:
        try:
            posts_data.append(
                {
                    "title": post.title,
                    "score": post.score,
                    "id": post.id,
                    "url": post.url,
                    "num_comments": post.num_comments,
                    # Naive datetime in the machine's local time zone.
                    "created": datetime.fromtimestamp(post.created),
                    "body": post.selftext,
                    "subreddit": post.subreddit.display_name,
                }
            )
            # Short pause between posts, kept from the original
            # implementation (presumably to go easy on the API).
            time.sleep(0.1)
        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    # Passing the column list keeps the schema stable even when nothing was
    # scraped; without it an empty result has no "title"/"body" columns and
    # building "content" below would raise KeyError.
    df = pd.DataFrame(posts_data, columns=columns)
    df["content"] = df["title"] + "\n" + df["body"]
    return df
def get_comments(reddit, post_id, limit=100):
    """Get the highest-scored comments of a specific post.

    Args:
        reddit: Reddit client instance; must provide ``submission(id=...)``.
        post_id (str): ID of the post to get comments from.
        limit (int): Maximum number of comments to retrieve (default 100).

    Returns:
        pd.DataFrame: One row per comment, ordered by descending score, with
        the columns ``comment_id``, ``post_id``, ``post_title``, ``body``,
        ``score`` and ``created_utc`` (naive datetime expressed in UTC).
        An empty DataFrame is returned when the post has no comments or when
        fetching fails; failures are printed, not raised.
    """
    try:
        submission = reddit.submission(id=post_id)

        # limit=0 asks PRAW not to fetch any additional "more comments"
        # pages, so only the comments already loaded are considered.
        submission.comments.replace_more(limit=0)

        # sorted() is stable, so comments with equal scores keep the order
        # in which the comment tree listed them.
        top_comments = sorted(
            submission.comments.list(),
            key=lambda c: c.score,
            reverse=True,
        )[:limit]

        comments_data = []
        for comment in top_comments:
            try:
                comments_data.append(
                    {
                        "comment_id": comment.id,
                        "post_id": post_id,
                        "post_title": submission.title,
                        "body": comment.body,
                        "score": comment.score,
                        # Convert in UTC (the column is named created_utc),
                        # then drop tzinfo so the stored value stays a naive
                        # datetime as before.
                        "created_utc": datetime.fromtimestamp(
                            comment.created_utc, tz=timezone.utc
                        ).replace(tzinfo=None),
                    }
                )
            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue

        # Rows are already in descending score order; no second sort needed.
        return pd.DataFrame(comments_data)
    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame()
def get_comments_and_upload(df, dataset_repo_id, skip_existing=False):
    """Fetch the comments of every post in ``df`` and upload them as CSV files.

    For each post a file ``comments_<post id>.csv`` is written locally,
    uploaded to ``comments/`` in the dataset repository and then deleted.
    Comments are fetched with ``get_comments`` using the module-level
    ``reddit`` client.

    Args:
        df (pd.DataFrame): Posts to process; the ``id`` column holds the
            post IDs.
        dataset_repo_id (str): Hugging Face dataset repository to upload to.
        skip_existing (bool): When True, posts whose comments file is already
            present in the repository are skipped.  Defaults to False, which
            re-uploads (and overwrites) every file.

    Raises:
        Exception: Errors from the Hugging Face API or from writing the CSV
            are not caught here and propagate to the caller.
    """
    api = HfApi()

    # The repository listing is only needed for the optional skip check, so
    # it is fetched once, up front, and only when requested.
    existing_files = set()
    if skip_existing:
        existing_files = set(
            api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")
        )

    post_ids = [] if df.empty else df["id"].tolist()
    for post_id in post_ids:
        csv_file_path = f"comments_{post_id}.csv"
        repo_csv_path = f"comments/{csv_file_path}"

        if repo_csv_path in existing_files:
            print(f"Skipping {post_id}: {repo_csv_path} already exists.")
            continue

        comments_df = get_comments(reddit, post_id)
        if comments_df.empty:
            # get_comments also returns an empty frame on errors; uploading
            # it would replace any previously uploaded data with an empty
            # file, so skip the upload.
            print(f"No comments found for {post_id}")
            continue

        comments_df.to_csv(csv_file_path, index=False)
        try:
            api.upload_file(
                path_or_fileobj=csv_file_path,
                path_in_repo=repo_csv_path,
                repo_id=dataset_repo_id,
                repo_type="dataset",
            )
        finally:
            # Remove the temporary file even when the upload fails.
            os.remove(csv_file_path)
        print(f"Uploaded {csv_file_path} to Hugging Face.")

    print("All comments CSV files uploaded successfully!")
def main():
    """Scrape "election" posts and publish them, with comments, to the Hub.

    Steps:
      1. Search Reddit for the keyword "election" via ``scrape_reddit``.
      2. Keep posts created after 2024-01-01, ordered by creation time.
      3. Make sure the target dataset repository exists.
      4. Upload the posts as ``submissions/df_24_<YYYYMMDD>.csv``.
      5. Upload one comments CSV per post via ``get_comments_and_upload``.

    Any exception is caught and printed, so this function does not raise.
    """
    try:
        df = scrape_reddit(keywords="election")
        if df is None or df.empty:
            print("No data was retrieved")
            return

        print(f"Successfully scraped {len(df)} posts")

        # "created" already holds datetime values, so no epoch unit is
        # involved; this only makes sure the column has a datetime dtype.
        df['created'] = pd.to_datetime(df['created'])
        df = df.sort_values(by='created', ascending=True)
        df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)

        dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"
        api = HfApi()
        try:
            api.dataset_info(dataset_repo_id)
            print(f"Dataset {dataset_repo_id} already exists.")
        except Exception:
            print(f"Dataset {dataset_repo_id} will be created.")
            # Actually create the repository; without this the upload below
            # fails for a missing dataset.  exist_ok=True keeps this harmless
            # when the lookup failed for some other reason and the dataset
            # is already there.
            api.create_repo(
                repo_id=dataset_repo_id,
                repo_type="dataset",
                exist_ok=True,
            )

        today_date = datetime.now().strftime('%Y%m%d')
        filename = f"df_24_{today_date}.csv"
        df_24.to_csv(filename, index=False)
        api.upload_file(
            path_or_fileobj=filename,
            path_in_repo=f"submissions/{filename}",
            repo_id=dataset_repo_id,
            repo_type="dataset",
        )
        get_comments_and_upload(df_24, dataset_repo_id)
    except Exception as e:
        print(f"Error in main: {str(e)}")
# Run the scrape-and-upload pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()