Vera-ZWY committed on
Commit
14cf6e5
1 Parent(s): 8b96174

Create praw_auhtor_info.py

Browse files
Files changed (1) hide show
  1. praw_auhtor_info.py +68 -0
praw_auhtor_info.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import pandas as pd
3
+ import praw
4
+ import time
5
+ from tqdm import tqdm
6
+
7
def initialize_reddit():
    """Build the PRAW client used for the Reddit lookups in this file.

    Each setting is looked up in the environment under the same name as the
    placeholder that was previously hard-coded (``RPAW_CLIENT_ID``,
    ``RPAW_CLIENT_SECRET``, ``PRAW_AGENT``). When a variable is not set, the
    placeholder string itself is passed through, so behaviour is unchanged
    for environments that define none of them.

    Returns:
        praw.Reddit: client constructed from ``client_id``,
        ``client_secret`` and ``user_agent``.
    """
    import os  # function-scope import keeps this block self-contained

    def _setting(name):
        # Falling back to the name itself reproduces the old literal value.
        return os.environ.get(name, name)

    return praw.Reddit(
        client_id=_setting("RPAW_CLIENT_ID"),
        client_secret=_setting("RPAW_CLIENT_SECRET"),
        user_agent=_setting("PRAW_AGENT"),
    )
13
+
14
def get_author_info(reddit, submission_id):
    """Return basic profile data for the author of one submission.

    Args:
        reddit: Object exposing ``submission(id=...)`` whose result has an
            ``author`` attribute (in this file, the client returned by
            ``initialize_reddit``).
        submission_id: Identifier passed to ``reddit.submission``.

    Returns:
        dict: Keys ``author_name``, ``karma``, ``account_age_days`` and
        ``is_mod``, always in that order.

        * Author is ``None``: ``author_name`` is ``'[deleted]'`` and every
          other value is ``None``.
        * Author present: ``karma`` is ``link_karma + comment_karma`` and
          ``account_age_days`` is the time since ``created_utc`` in days.
          Any of these attributes (or ``is_mod``) that the author object
          does not provide yields ``None`` for the corresponding value,
          while the author name is still reported.
        * Any exception during the lookup: a message is printed and all
          four values are ``None``.
    """
    def _blank(author_name=None):
        # Fresh dict every time so callers never share a mutable result.
        return {
            'author_name': author_name,
            'karma': None,
            'account_age_days': None,
            'is_mod': None,
        }

    try:
        author = reddit.submission(id=submission_id).author
        if author is None:
            return _blank('[deleted]')

        info = _blank(author.name)

        # Profiles may lack these attributes (NOTE(review): PRAW documents
        # this for suspended accounts). Reading them with a default keeps
        # the author name instead of failing the whole lookup.
        link_karma = getattr(author, 'link_karma', None)
        comment_karma = getattr(author, 'comment_karma', None)
        created_utc = getattr(author, 'created_utc', None)

        if link_karma is not None and comment_karma is not None:
            info['karma'] = link_karma + comment_karma
        if created_utc is not None:
            # 86400 seconds per day.
            info['account_age_days'] = (time.time() - created_utc) / 86400
        info['is_mod'] = getattr(author, 'is_mod', None)
        return info
    except Exception as e:
        # Best-effort by design: one failed lookup must not abort a batch.
        print(f"Error fetching author info for submission {submission_id}: {e}")
        return _blank()
40
+
41
def praw_auhtors_to_path(ds_repo_id, file_path):
    """Append Reddit author data to a dataset file and save the result as CSV.

    Loads ``file_path`` from the dataset repository ``ds_repo_id`` as its
    ``train`` split, calls ``get_author_info`` once per value in the ``id``
    column, and places the resulting author columns to the right of the
    original ones. The combined table is written, without the index, to
    ``submissions_with_authors_<YYYYMMDD>.csv`` (a relative path), and the
    file name is printed.

    Args:
        ds_repo_id: Repository identifier passed to ``load_dataset``.
        file_path: Data file within the repository to load.

    Returns:
        None.
    """
    reddit = initialize_reddit()

    # Materialise the requested split as a DataFrame.
    df = pd.DataFrame(
        load_dataset(
            ds_repo_id,
            data_files={'train': file_path},
            split='train',
        )
    )

    # One lookup per submission id, pausing one second after each.
    rows = []
    for sid in tqdm(df['id']):
        rows.append(get_author_info(reddit, sid))
        time.sleep(1)  # Rate limiting

    # Column-wise concat: author columns are matched to rows by index.
    combined = pd.concat([df, pd.DataFrame(rows)], axis=1)

    output_file = f"submissions_with_authors_{time.strftime('%Y%m%d')}.csv"
    combined.to_csv(output_file, index=False)
    print(f"Saved to {output_file}")
68
+