File size: 2,091 Bytes
14cf6e5 ac614e5 14cf6e5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import time
from pathlib import Path

import pandas as pd
import praw
from datasets import load_dataset
from tqdm import tqdm
def initialize_reddit():
    """Create and return the ``praw.Reddit`` client.

    The client is configured with a client id, a client secret and a user
    agent string.

    NOTE(review): the three values below look like placeholders rather than
    real credentials -- confirm how they are meant to be supplied.
    """
    credentials = {
        "client_id": "RPAW_CLIENT_ID",
        "client_secret": "RPAW_CLIENT_SECRET",
        "user_agent": "PRAW_AGENT",
    }
    return praw.Reddit(**credentials)
def _blank_author_record(author_name=None):
    """Return an author record holding only a name; every statistic is ``None``."""
    return {
        'author_name': author_name,
        'karma': None,
        'account_age_days': None,
        'is_mod': None,
    }


def get_author_info(reddit, submission_id):
    """Look up the author of a submission and summarise their account.

    Args:
        reddit: Client object exposing ``submission(id=...)``; the returned
            submission must expose an ``author`` attribute.
        submission_id: Identifier forwarded to ``reddit.submission``.

    Returns:
        A dict with the keys ``author_name``, ``karma`` (link karma plus
        comment karma), ``account_age_days`` (seconds since ``created_utc``
        divided by 86400) and ``is_mod``.

        * When the submission has no author, ``author_name`` is
          ``'[deleted]'`` and the other values are ``None``.
        * When the lookup fails, the error is printed and the values that
          could not be determined are ``None``. The author name is kept if
          it was read before the failure.

    This function never raises: failures are reported on stdout so a long
    batch run is not aborted by a single bad submission.
    """
    record = _blank_author_record()
    try:
        author = reddit.submission(id=submission_id).author
        if author is None:
            return _blank_author_record('[deleted]')

        # Store the name before touching the profile statistics, so a failure
        # on those still leaves the row attributable to an account.
        # NOTE(review): with PRAW, suspended accounts reportedly lack the
        # karma/created_utc attributes -- confirm against the PRAW docs.
        record['author_name'] = author.name

        # Compute all statistics before committing any of them, so a record
        # never ends up with only some of the statistics filled in.
        stats = {
            'karma': author.link_karma + author.comment_karma,
            'account_age_days': (time.time() - author.created_utc) / 86400,
            'is_mod': getattr(author, 'is_mod', None),
        }
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary of the scrape.
        print(f"Error fetching author info for submission {submission_id}: {e}")
        return record

    record.update(stats)
    return record
def praw_auhtors_to_path(ds_repo_id, file_path, *, request_delay=1):
    """Add author information to a Hugging Face submissions file and save it as CSV.

    Loads ``file_path`` from the dataset repository ``ds_repo_id`` as the
    ``train`` split, looks up the author of every value in the ``id`` column
    via ``get_author_info``, appends the resulting columns to the original
    data and writes everything to
    ``submissions_with_authors_{file_path}.csv`` (relative to the current
    working directory).

    Args:
        ds_repo_id: Dataset repository id passed to ``load_dataset``.
        file_path: Data file inside the repository; also embedded verbatim
            in the output file name.
        request_delay: Seconds to sleep after each author lookup. Defaults
            to 1, the pause previously hard-coded here.

    Returns:
        None. The result is written to disk and the output path is printed.
    """
    # Initialize Reddit API
    reddit = initialize_reddit()

    # Load dataset from Hugging Face
    dataset = load_dataset(
        ds_repo_id,
        data_files={'train': file_path},
        split='train',
    )
    df = pd.DataFrame(dataset)

    # Prepare the output location *before* the slow scrape. ``file_path`` may
    # contain directories (e.g. "data/train.csv"), which makes the output path
    # point into a directory that does not exist yet; creating it up front
    # prevents losing the whole run to a failed write at the very end.
    output_file = f"submissions_with_authors_{file_path}.csv"
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    # Fetch author info for each submission
    author_data = []
    for submission_id in tqdm(df['id']):
        author_data.append(get_author_info(reddit, submission_id))
        time.sleep(request_delay)  # Rate limiting

    # Create DataFrame with author info and place it beside the original
    # columns; the two frames are aligned on their index.
    author_df = pd.DataFrame(author_data)
    result_df = pd.concat([df, author_df], axis=1)

    # Save result
    result_df.to_csv(output_file, index=False)
    print(f"Saved to {output_file}")
|