import time
from datetime import datetime

import pandas as pd
import requests

# Reddit's public JSON endpoints reject requests' default User-Agent, so a
# browser-like string is shared across all calls.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
}


def extract_comment_data(comment, post_info):
    """Flatten one comment plus its parent post's metadata into a single row."""
    return {
        'subreddit': post_info['subreddit'],
        'post_title': post_info['title'],
        'post_score': post_info['score'],
        'post_created_utc': post_info['created_utc'],
        'comment_id': comment['data'].get('id'),
        'comment_author': comment['data'].get('author'),
        'comment_body': comment['data'].get('body'),
        'comment_score': comment['data'].get('score', 0),
        'comment_created_utc': datetime.fromtimestamp(comment['data'].get('created_utc', 0)),
        'post_url': post_info['url'],
        # post_info['permalink'] already carries the https://www.reddit.com
        # prefix (see fetch_top_posts), so it is not prepended again here.
        'comment_url': f"{post_info['permalink']}{comment['data'].get('id')}",
    }


def fetch_top_comments(post_df, num_comments=2):
    """Fetch the `num_comments` most upvoted comments for each post in post_df."""
    all_comments = []
    total_posts = len(post_df)

    print(f"\nFetching top {num_comments} most upvoted comments for {total_posts} posts...")

    for idx, post in post_df.iterrows():
        print(f"\nProcessing post {idx + 1}/{total_posts}")
        print(f"Title: {post['title'][:100]}...")
        print(f"Post Score: {post['score']}, Number of Comments: {post['num_comments']}")

        try:
            # The permalink already includes the domain, so appending .json
            # yields the public JSON view of the comment thread.
            url = post['permalink'] + '.json'
            response = requests.get(url, headers=HEADERS)
            response.raise_for_status()
            data = response.json()

            # data[0] is the post listing; data[1] holds the comment tree.
            if len(data) > 1:
                comments_data = data[1]['data']['children']

                # Filter out non-comment entries (kind 't1' is a comment)
                valid_comments = [
                    comment for comment in comments_data
                    if comment['kind'] == 't1' and comment['data'].get('score') is not None
                ]

                # Sort comments by score (upvotes) in descending order
                sorted_comments = sorted(
                    valid_comments,
                    key=lambda x: x['data'].get('score', 0),
                    reverse=True
                )

                # Take only the top N comments
                top_comments = sorted_comments[:num_comments]

                # Print comment scores for verification
                print("\nTop comment scores for this post:")
                for i, comment in enumerate(top_comments, 1):
                    score = comment['data'].get('score', 0)
                    print(f"Comment {i}: {score} upvotes")

                # Add to main list
                for comment in top_comments:
                    all_comments.append(extract_comment_data(comment, post))

            # Conservative pause between thread requests to avoid rate limiting
            time.sleep(20)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching comments for post {idx + 1}: {e}")
            continue

    # Create DataFrame and verify per-post ordering
    comments_df = pd.DataFrame(all_comments)

    if not comments_df.empty:
        print("\nVerification of comment sorting:")
        for post_title in comments_df['post_title'].unique():
            post_comments = comments_df[comments_df['post_title'] == post_title]
            print(f"\nPost: {post_title[:100]}...")
            print("Comment scores:", post_comments['comment_score'].tolist())

    return comments_df
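
# A minimal retry helper (a sketch, not wired into the functions below):
# Reddit's public .json endpoints answer HTTP 429 when polled too quickly,
# and the fixed sleeps in this script may not always suffice. The max_retries
# and base_delay values are illustrative assumptions, not documented limits.
def get_json_with_backoff(url, headers, max_retries=3, base_delay=5):
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers)
        if response.status_code == 429:
            # Honor Retry-After when Reddit sends it; otherwise back off exponentially.
            wait = int(response.headers.get('Retry-After', base_delay * 2 ** attempt))
            print(f"Rate limited; retrying in {wait}s...")
            time.sleep(wait)
            continue
        response.raise_for_status()
        return response.json()
    raise requests.exceptions.RequestException(
        f"Still rate limited after {max_retries} attempts: {url}"
    )
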
def fetch_subreddits(limit=10, min_subscribers=1000):
    """Page through /subreddits/popular until `limit` qualifying subreddits are collected."""
    subreddits_data = []
    after = None

    while len(subreddits_data) < limit:
        try:
            url = 'https://www.reddit.com/subreddits/popular.json?limit=100'
            if after:
                url += f'&after={after}'

            print(f"Fetching subreddits... Current count: {len(subreddits_data)}")
            response = requests.get(url, headers=HEADERS)
            response.raise_for_status()
            data = response.json()

            for subreddit in data['data']['children']:
                subreddit_data = subreddit['data']
                if subreddit_data.get('subscribers', 0) >= min_subscribers:
                    sub_info = {
                        'display_name': subreddit_data.get('display_name'),
                        'display_name_prefixed': subreddit_data.get('display_name_prefixed'),
                        'title': subreddit_data.get('title'),
                        'subscribers': subreddit_data.get('subscribers', 0),
                        'active_users': subreddit_data.get('active_user_count', 0),
                        'created_utc': datetime.fromtimestamp(subreddit_data.get('created_utc', 0)),
                        'description': subreddit_data.get('description'),
                        'subreddit_type': subreddit_data.get('subreddit_type'),
                        'over18': subreddit_data.get('over18', False),
                        'url': f"https://www.reddit.com/r/{subreddit_data.get('display_name')}/"
                    }
                    subreddits_data.append(sub_info)

            # 'after' is Reddit's pagination cursor; a missing value means the
            # listing is exhausted.
            after = data['data'].get('after')
            if not after:
                print("Reached end of listings")
                break

            time.sleep(2)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            break

    # A 100-item page can push the count past `limit`, so trim before returning.
    return pd.DataFrame(subreddits_data[:limit])


def fetch_top_posts(subreddit, limit=5):
    """Fetch the all-time top `limit` posts for one subreddit."""
    posts_data = []
    url = f'https://www.reddit.com/r/{subreddit}/top.json?t=all&limit={limit}'

    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        data = response.json()

        for post in data['data']['children']:
            post_data = post['data']
            posts_data.append({
                'subreddit': subreddit,
                'title': post_data.get('title'),
                'score': post_data.get('score'),
                'num_comments': post_data.get('num_comments'),
                'created_utc': datetime.fromtimestamp(post_data.get('created_utc', 0)),
                'url': post_data.get('url'),
                # Store the full URL so downstream code can append .json directly.
                'permalink': 'https://www.reddit.com' + post_data.get('permalink', '')
            })

        time.sleep(2)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching posts from r/{subreddit}: {e}")

    return pd.DataFrame(posts_data)


def main():
    # Step 1: Fetch subreddits
    print("Fetching subreddits...")
    subreddits_df = fetch_subreddits(limit=10, min_subscribers=1000)
    print(f"Fetched {len(subreddits_df)} subreddits.")
    subreddits_df.to_csv("subreddits.csv", index=False)

    # Step 2: Fetch top posts for each subreddit
    all_posts_data = []
    for subreddit in subreddits_df['display_name']:
        print(f"\nFetching top posts for subreddit: {subreddit}...")
        posts_df = fetch_top_posts(subreddit, limit=5)
        all_posts_data.append(posts_df)

    # Combine all posts into a single DataFrame
    posts_df = pd.concat(all_posts_data, ignore_index=True)
    print(f"Fetched {len(posts_df)} top posts.")
    posts_df.to_csv("posts.csv", index=False)

    # Round-trip through the CSV so the comment step can also be re-run later
    # from the saved file.
    posts_df = pd.read_csv("posts.csv")

    # Step 3: Fetch top comments for each post
    if not posts_df.empty:
        comments_df = fetch_top_comments(posts_df, num_comments=2)
        print(f"Fetched {len(comments_df)} top comments.")
        comments_df.to_csv("comments.csv", index=False)


if __name__ == "__main__":
    main()
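
# Example invocation (the script filename is an assumption; any name works):
#   $ python reddit_scraper.py
# Expected artifacts in the working directory: subreddits.csv, posts.csv,
# comments.csv. A full run is slow by design: the 20-second pause per comment
# thread is intended to stay under Reddit's unauthenticated rate limits.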