# import gradio as gr
import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder
import time
import os
from datetime import datetime
# from tqdm import tqdm

HfFolder.save_token(os.getenv("HF_TOKEN"))

# Initialize a single PRAW client from environment variables.
try:
    # def initialize_reddit():
    reddit = praw.Reddit(
        client_id=os.getenv("PRAW_CLIENT_ID"),
        client_secret=os.getenv("PRAW_CLIENT_SECRET"),
        user_agent=os.getenv("RPAW_AGENT"),
        check_for_async=False
    )
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
    # return None
except Exception as e:
    print(f"An error occurred: {str(e)}")
    # return None


def scrape_reddit(subreddit_name=None, keywords=None, limit=1000):
    """
    Scrape submissions from a subreddit (or search all of Reddit) into a DataFrame.

    Args:
        subreddit_name (str): Subreddit to scrape; if None, search r/all with the given keywords
        keywords (str): Search query; if None, fetch the subreddit's hot posts instead
        limit (int): Maximum number of submissions to retrieve (default 1000)

    Returns:
        pd.DataFrame: DataFrame of submission data with a combined 'content' column
    """
    posts_data = []

    if subreddit_name:
        subreddit = reddit.subreddit(subreddit_name)
        if keywords:
            posts = subreddit.search(keywords, limit=limit)
        else:
            posts = subreddit.hot(limit=limit)
    else:
        posts = reddit.subreddit("all").search(keywords, limit=limit)

    for post in posts:
        try:
            post_data = {
                "title": post.title,
                "score": post.score,
                "id": post.id,
                "url": post.url,
                "num_comments": post.num_comments,
                "created": datetime.fromtimestamp(post.created),
                "body": post.selftext,
                "subreddit": post.subreddit.display_name
            }
            posts_data.append(post_data)

            # Add a small delay to avoid hitting rate limits
            time.sleep(0.1)

        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    df = pd.DataFrame(posts_data)
    if not df.empty:
        df['content'] = df['title'] + '\n' + df['body']

    return df


def get_comments(reddit, post_id, limit=100):
    """
    Get top comments from a specific post.

    Args:
        reddit: Reddit instance
        post_id (str): ID of the post to get comments from
        limit (int): Maximum number of comments to retrieve (default 100)

    Returns:
        pd.DataFrame: DataFrame containing top comments data
    """
    try:
        submission = reddit.submission(id=post_id)
        comments_data = []

        # Replace MoreComments objects with actual comments, limited to save time
        submission.comments.replace_more(limit=0)  # Ignore "More Comments" expansions

        # Get all top-level comments
        all_comments = submission.comments.list()

        # Sort comments by score and take the top ones
        sorted_comments = sorted(all_comments, key=lambda x: x.score, reverse=True)[:limit]

        for comment in sorted_comments:
            try:
                comment_data = {
                    'comment_id': comment.id,
                    'post_id': post_id,
                    'post_title': submission.title,
                    # 'author': str(comment.author) if comment.author else '[deleted]',
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': datetime.fromtimestamp(comment.created_utc)
                    # 'parent_id': comment.parent_id,
                    # 'is_submitter': comment.is_submitter
                }
                comments_data.append(comment_data)
            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue

        # Create DataFrame and sort by score (highest first)
        df = pd.DataFrame(comments_data)
        if not df.empty:
            df = df.sort_values('score', ascending=False)

        return df

    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame()


def get_comments_and_upload(df, dataset_repo_id):
    # Initialize the Hugging Face API
    api = HfApi()
    existing_files = api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")

    # Iterate over each submission in the DataFrame
    for index, row in df.iterrows():
        # Define a unique CSV filename for each submission based on its ID
        csv_file_path = f"comments_{row['id']}.csv"
        repo_csv_path = f"comments/{csv_file_path}"
        # Check if this file already exists in the Hugging Face dataset
        # if repo_csv_path in existing_files:
        #     print(f"{csv_file_path} already exists in the dataset. Skipping upload.")
        #     continue

        # Fetch comments for the current submission
        comments_df = get_comments(reddit, row['id'])

        if len(comments_df) == 0:
            print(f"No comments found for {row['id']}")
            continue

        # Save the comments DataFrame as a CSV file
        comments_df.to_csv(csv_file_path, index=False)

        # Upload the CSV file to the Hugging Face dataset repository
        api.upload_file(
            path_or_fileobj=csv_file_path,
            path_in_repo=repo_csv_path,  # Save in a 'comments' folder in the dataset repo
            repo_id=dataset_repo_id,
            repo_type="dataset"
        )
        print(f"Uploaded {csv_file_path} to Hugging Face.")

        # Optionally, delete the local CSV file to save space
        os.remove(csv_file_path)

    print("All comments CSV files uploaded successfully!")


def main():
    # Example usage
    try:
        # Search all of Reddit for election posts
        df = scrape_reddit(keywords="election")

        if df is not None and not df.empty:
            print(f"Successfully scraped {len(df)} posts")

            # Save to CSV
            # df.to_csv("reddit_2016_election_posts.csv", index=False)

            # 'created' already holds datetime objects, so no unit conversion is needed
            df['created'] = pd.to_datetime(df['created'])
            df = df.sort_values(by='created', ascending=True)
            df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)
            # df_16 = df_16[df_16['created'] > '2015-12-31'].reset_index(drop=True)

            dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"

            # Create the dataset repo if it doesn't exist yet
            api = HfApi()
            try:
                api.dataset_info(dataset_repo_id)
                print(f"Dataset {dataset_repo_id} already exists.")
            except Exception:
                print(f"Dataset {dataset_repo_id} will be created.")
                api.create_repo(repo_id=dataset_repo_id, repo_type="dataset")

            today_date = datetime.now().strftime('%Y%m%d')
            filename = f"df_24_{today_date}.csv"
            df_24.to_csv(filename, index=False)

            api.upload_file(
                path_or_fileobj=filename,
                path_in_repo=f"submissions/{filename}",
                repo_id=dataset_repo_id,
                repo_type="dataset"
            )

            get_comments_and_upload(df_24, dataset_repo_id)
        else:
            print("No data was retrieved")

    except Exception as e:
        print(f"Error in main: {str(e)}")


if __name__ == '__main__':
    main()