Refresh_Praw_pinecone_dataset / praw_auhtor_info.py
Vera-ZWY's picture
Create praw_auhtor_info.py
14cf6e5 verified
raw
history blame
2.11 kB
from datasets import load_dataset
import pandas as pd
import praw
import time
from tqdm import tqdm
def initialize_reddit():
    """Construct a ``praw.Reddit`` client from the settings listed below.

    Returns:
        The object returned by ``praw.Reddit`` for these settings.
    """
    # NOTE(review): these values look like placeholder names rather than real
    # credentials -- confirm how they are substituted before running.
    settings = {
        "client_id": "RPAW_CLIENT_ID",
        "client_secret": "RPAW_CLIENT_SECRET",
        "user_agent": "PRAW_AGENT",
    }
    return praw.Reddit(**settings)
def get_author_info(reddit, submission_id: str) -> dict:
    """Look up the author of a submission and summarise the account.

    Args:
        reddit: Client object exposing ``submission(id=...)``.
        submission_id: ID of the submission whose author is inspected.

    Returns:
        A dict with the keys ``author_name``, ``karma``, ``account_age_days``
        and ``is_mod``:

        * submission has no author: ``author_name`` is ``'[deleted]'`` and
          the other values are None;
        * author available: the name, link karma plus comment karma, the
          account age in days, and the ``is_mod`` flag. Any of the last
          three that the author object does not expose is None;
        * any exception during the lookup: a message is printed and every
          value is None.
    """
    # Single template for the result shape; every branch starts from it.
    info = {
        'author_name': None,
        'karma': None,
        'account_age_days': None,
        'is_mod': None,
    }
    try:
        author = reddit.submission(id=submission_id).author
        if author is None:
            info['author_name'] = '[deleted]'
            return info

        info['author_name'] = author.name

        # Some accounts (e.g. suspended ones, per the PRAW docs) do not expose
        # karma or creation time. Treat those fields as unknown instead of
        # failing the whole lookup and losing the name we already have.
        link_karma = getattr(author, 'link_karma', None)
        comment_karma = getattr(author, 'comment_karma', None)
        created_utc = getattr(author, 'created_utc', None)

        if link_karma is not None and comment_karma is not None:
            info['karma'] = link_karma + comment_karma
        if created_utc is not None:
            # 86400 seconds per day.
            info['account_age_days'] = (time.time() - created_utc) / 86400
        info['is_mod'] = getattr(author, 'is_mod', None)
        return info
    except Exception as e:
        # Deliberate best-effort boundary: one failed lookup must not abort a
        # long batch run, so report it and return an all-None record.
        print(f"Error fetching author info for submission {submission_id}: {e}")
        return dict.fromkeys(info)
def praw_auhtors_to_path(ds_repo_id, file_path):
    """Add author details to a dataset of submissions and save it as CSV.

    The ``train`` split of ``file_path`` is loaded from the dataset repo
    ``ds_repo_id``. For each value in its ``id`` column the author info is
    fetched, with a one-second pause after every lookup. The author columns
    are placed next to the original columns and the result is written to
    ``submissions_with_authors_<YYYYMMDD>.csv`` in the working directory.

    Args:
        ds_repo_id: Repository ID passed to ``load_dataset``.
        file_path: Data file within the repository, loaded as the ``train``
            split.

    Returns:
        None. The CSV is written and its name is printed.
    """
    client = initialize_reddit()

    submissions = pd.DataFrame(
        load_dataset(ds_repo_id, data_files={'train': file_path}, split='train')
    )

    def _lookup_then_pause(sid):
        # One lookup followed by a fixed pause, to stay under the rate limit.
        record = get_author_info(client, sid)
        time.sleep(1)
        return record

    authors = pd.DataFrame(
        [_lookup_then_pause(sid) for sid in tqdm(submissions['id'])]
    )

    # Column-wise concat: author fields sit beside the original columns.
    combined = pd.concat([submissions, authors], axis=1)

    # The date stamp is taken at save time, after all lookups have finished.
    stamp = time.strftime('%Y%m%d')
    output_file = f"submissions_with_authors_{stamp}.csv"
    combined.to_csv(output_file, index=False)
    print(f"Saved to {output_file}")