|
from datasets import load_dataset |
|
import pandas as pd |
|
import praw |
|
import time |
|
from tqdm import tqdm |
|
|
|
def initialize_reddit(
    client_id="RPAW_CLIENT_ID",
    client_secret="RPAW_CLIENT_SECRET",
    user_agent="PRAW_AGENT",
):
    """Create and return a ``praw.Reddit`` client.

    Args:
        client_id: Reddit API application id passed to ``praw.Reddit``.
        client_secret: Reddit API application secret passed to ``praw.Reddit``.
        user_agent: User-agent string passed to ``praw.Reddit``.

    Returns:
        The ``praw.Reddit`` instance built from the three values.

    NOTE(review): the defaults are the literal strings this function has
    always used; they look like placeholders rather than real credentials
    ("RPAW_" is presumably a typo for "PRAW_") -- confirm, and pass real
    values explicitly instead of committing secrets to source control.
    """
    return praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
    )
|
|
|
def get_author_info(reddit, submission_id):
    """Return basic information about the author of a submission.

    This is a best-effort lookup: any failure is printed and reported as
    ``None`` values instead of being raised, so a long batch run is not
    aborted by one bad row.

    Args:
        reddit: Client object exposing ``submission(id=...)``; the returned
            submission must expose an ``author`` attribute.
        submission_id: Identifier of the submission to look up.

    Returns:
        A dict with the keys ``author_name``, ``karma``,
        ``account_age_days`` and ``is_mod`` (always in that order):

        * submission/author lookup failed -> every value is ``None``;
        * the submission has no author    -> ``author_name`` is
          ``'[deleted]'`` and the remaining values are ``None``;
        * the author's profile attributes could not be read ->
          ``author_name`` holds the name (if it was readable) and the
          remaining values are ``None``;
        * otherwise all fields are populated; ``karma`` is link karma plus
          comment karma, and ``account_age_days`` is the time since
          ``created_utc`` expressed in days.
    """
    info = {
        'author_name': None,
        'karma': None,
        'account_age_days': None,
        'is_mod': None,
    }

    # Step 1: resolve the submission and its author.
    try:
        author = reddit.submission(id=submission_id).author
    except Exception as e:  # deliberate best-effort boundary
        print(f"Error fetching author info for submission {submission_id}: {e}")
        return info

    if author is None:
        info['author_name'] = '[deleted]'
        return info

    # Step 2: read the profile. This is guarded separately so that a profile
    # that cannot be read does not also throw away the author's name.
    try:
        info['author_name'] = author.name
        stats = {
            'karma': author.link_karma + author.comment_karma,
            # 86400 seconds per day.
            'account_age_days': (time.time() - author.created_utc) / 86400,
            'is_mod': getattr(author, 'is_mod', None),
        }
    except Exception as e:  # deliberate best-effort boundary
        print(f"Error fetching author info for submission {submission_id}: {e}")
    else:
        # Only publish the stats when all of them were read successfully.
        info.update(stats)
    return info
|
|
|
def praw_auhtors_to_path(ds_repo_id, file_path, *, output_file=None, delay=1):
    """Add author details to a dataset of submissions and save it as CSV.

    Loads ``file_path`` from the dataset repository ``ds_repo_id`` as the
    ``train`` split, looks up the author of every value in its ``id`` column
    with ``get_author_info``, appends the resulting columns to the original
    rows and writes the combined table to a CSV file without the index.

    Args:
        ds_repo_id: Dataset repository identifier passed to ``load_dataset``.
        file_path: Data file inside the repository to load.
        output_file: Destination CSV path. When ``None`` (the default) the
            file is named ``submissions_with_authors_<YYYYMMDD>.csv`` using
            the current local date.
        delay: Seconds to sleep after each lookup (default 1, as before).
            A falsy or non-positive value disables the pause.

    Returns:
        The path the CSV was written to.

    Raises:
        KeyError: If the loaded data has no ``id`` column.
    """
    reddit = initialize_reddit()

    dataset = load_dataset(
        ds_repo_id,
        data_files={'train': file_path},
        split='train',
    )
    df = pd.DataFrame(dataset)

    # Kept as an explicit loop: each iteration has side effects (a network
    # lookup, the progress bar, and the throttling sleep).
    author_data = []
    for submission_id in tqdm(df['id']):
        author_data.append(get_author_info(reddit, submission_id))
        if delay and delay > 0:
            time.sleep(delay)

    author_df = pd.DataFrame(author_data)

    # Both frames carry a default 0..n-1 index in the same row order, so a
    # column-wise concat lines each submission up with its author record.
    result_df = pd.concat([df, author_df], axis=1)

    if output_file is None:
        output_file = f"submissions_with_authors_{time.strftime('%Y%m%d')}.csv"
    result_df.to_csv(output_file, index=False)
    print(f"Saved to {output_file}")
    return output_file
|
|
|
|