from pathlib import Path
import time

import pandas as pd
import praw
from datasets import load_dataset
from tqdm import tqdm


def initialize_reddit():
    """Create a read-only PRAW client. Replace the placeholder credentials with your own."""
    return praw.Reddit(
        client_id="PRAW_CLIENT_ID",
        client_secret="PRAW_CLIENT_SECRET",
        user_agent="PRAW_AGENT",
    )


def get_author_info(reddit, submission_id):
    """Fetch author metadata for a single submission; return None fields on failure."""
    try:
        submission = reddit.submission(id=submission_id)
        author = submission.author

        # Deleted or removed accounts come back as None
        if author is None:
            return {
                'author_name': '[deleted]',
                'karma': None,
                'account_age_days': None,
                'is_mod': None,
            }

        return {
            'author_name': author.name,
            'karma': author.link_karma + author.comment_karma,
            'account_age_days': (time.time() - author.created_utc) / 86400,
            'is_mod': author.is_mod if hasattr(author, 'is_mod') else None,
        }
    except Exception as e:
        print(f"Error fetching author info for submission {submission_id}: {e}")
        return {
            'author_name': None,
            'karma': None,
            'account_age_days': None,
            'is_mod': None,
        }


def praw_authors_to_path(ds_repo_id, file_path):
    """Load a submissions file from a Hugging Face dataset repo, enrich each row
    with author metadata from the Reddit API, and write the result to a CSV."""
    # Initialize Reddit API
    reddit = initialize_reddit()

    # Load dataset from Hugging Face
    dataset = load_dataset(ds_repo_id, data_files={'train': file_path}, split='train')
    df = pd.DataFrame(dataset)

    # Fetch author info for each submission
    author_data = []
    for submission_id in tqdm(df['id']):
        author_info = get_author_info(reddit, submission_id)
        author_data.append(author_info)
        time.sleep(1)  # Rate limiting

    # Create DataFrame with author info
    author_df = pd.DataFrame(author_data)

    # Merge with original data (rows stay aligned because author_data preserves order)
    result_df = pd.concat([df, author_df], axis=1)

    # Save result; use only the file's stem so nested paths don't produce invalid filenames
    output_file = f"submissions_with_authors_{Path(file_path).stem}.csv"
    result_df.to_csv(output_file, index=False)
    print(f"Saved to {output_file}")
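

# Example usage — a minimal sketch. The dataset repo id and file name below are
# hypothetical placeholders, not values taken from this script; point them at a
# Hugging Face dataset repo whose file has an 'id' column of Reddit submission IDs.
if __name__ == "__main__":
    praw_authors_to_path(
        ds_repo_id="your-username/reddit-submissions",  # hypothetical HF dataset repo
        file_path="submissions.jsonl",                  # hypothetical data file in that repo
    )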