# Source: Refresh_Praw_pinecone_dataset / praw_newgest_df2024.py
# Author: Vera-ZWY
# Commit: 449695a (verified) - "Update praw_newgest_df2024.py"
# File size: 7.72 kB
# import gradio as gr
import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder
import time
import os
from datetime import datetime
# from tqdm import tqdm
# Persist the Hugging Face token read from the HF_TOKEN environment variable
# so that the HfApi calls made further down can authenticate.
HfFolder.save_token(os.getenv("HF_TOKEN"))

# Module-level Reddit client shared by the scraping functions below.
# Bind the name before the attempt: if construction fails the error is only
# printed, and without this binding every later use of `reddit` would die with
# an unrelated NameError instead of a recognisable "client is None" failure.
reddit = None
try:
    reddit = praw.Reddit(
        client_id=os.getenv("PRAW_CLIENT_ID"),
        client_secret=os.getenv("PRAW_CLIENT_SECRET"),
        # NOTE(review): "RPAW_AGENT" looks like a transposition of
        # "PRAW_AGENT"; left untouched because the deployment environment may
        # define the variable under exactly this spelling - confirm.
        user_agent=os.getenv("RPAW_AGENT"),
        check_for_async=False,
    )
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
except Exception as e:
    print(f"An error occurred: {str(e)}")
def scrape_reddit(subreddit_name=None, keywords=None, limit=1000):
    """Collect Reddit submissions into a DataFrame.

    Uses the module-level ``reddit`` client.

    Args:
        subreddit_name: Subreddit to read from. When omitted, r/all is used.
        keywords: Search query. When omitted, the subreddit's "hot" listing
            is read instead of running a search.
        limit: Maximum number of submissions requested from Reddit.

    Returns:
        pd.DataFrame with the columns ``title``, ``score``, ``id``, ``url``,
        ``num_comments``, ``created``, ``body``, ``subreddit`` and ``content``
        (title and body joined by a newline). When nothing is retrieved the
        frame is empty but still carries these columns.
    """
    columns = [
        "title", "score", "id", "url",
        "num_comments", "created", "body", "subreddit",
    ]

    # Same rule for every subreddit, r/all included: search when there is a
    # query, otherwise fall back to the hot listing (previously r/all with no
    # keywords issued a search for None).
    subreddit = reddit.subreddit(subreddit_name if subreddit_name else "all")
    if keywords:
        posts = subreddit.search(keywords, limit=limit)
    else:
        posts = subreddit.hot(limit=limit)

    posts_data = []
    for post in posts:
        try:
            posts_data.append({
                "title": post.title,
                "score": post.score,
                "id": post.id,
                "url": post.url,
                "num_comments": post.num_comments,
                # NOTE(review): fromtimestamp() without tz yields a naive
                # datetime in the host's local zone - confirm that is intended.
                "created": datetime.fromtimestamp(post.created),
                "body": post.selftext,
                "subreddit": post.subreddit.display_name,
            })
            # Add a small delay to avoid hitting rate limits
            time.sleep(0.1)
        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    # Passing the column list keeps the schema stable for an empty result, so
    # the 'content' concatenation below no longer raises KeyError and callers
    # can rely on `df.empty` to detect "nothing found".
    df = pd.DataFrame(posts_data, columns=columns)
    df['content'] = df['title'] + '\n' + df['body']
    return df
def get_comments(reddit, post_id, limit=100):
    """
    Get the highest-scoring comments of a specific post.

    "More comments" placeholders are dropped rather than expanded
    (``replace_more(limit=0)``), and the remaining comment tree is flattened,
    so replies at any depth are candidates, not only top-level comments.

    Args:
        reddit: Reddit instance
        post_id (str): ID of the post to get comments from
        limit (int): Maximum number of comments to retrieve (default 100)
    Returns:
        pd.DataFrame: one row per comment, ordered by score (highest first),
        with the columns comment_id, post_id, post_title, body, score and
        created_utc (naive datetime expressed in UTC). On any error, or when
        the post has no comments, an empty frame with the same columns is
        returned.
    """
    # Function-scope import: the file only imports the `datetime` class.
    from datetime import timezone

    columns = ['comment_id', 'post_id', 'post_title', 'body', 'score', 'created_utc']
    try:
        submission = reddit.submission(id=post_id)
        # Drop "More Comments" stubs instead of expanding them, to save time
        submission.comments.replace_more(limit=0)
        # Flatten the whole comment tree (all nesting levels)
        all_comments = submission.comments.list()
        # Highest score first, then keep only the requested number
        top_comments = sorted(all_comments, key=lambda c: c.score, reverse=True)[:limit]

        comments_data = []
        for comment in top_comments:
            try:
                # created_utc is a UTC epoch; convert in UTC (then drop tzinfo
                # to keep the column naive) so the value does not shift with
                # the host's local timezone.
                created = datetime.fromtimestamp(
                    comment.created_utc, tz=timezone.utc
                ).replace(tzinfo=None)
                comments_data.append({
                    'comment_id': comment.id,
                    'post_id': post_id,
                    'post_title': submission.title,
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': created,
                })
            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue

        # Rows are already in score order; the explicit column list keeps the
        # schema (and therefore the CSV header) stable when there are no rows.
        return pd.DataFrame(comments_data, columns=columns)
    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame(columns=columns)
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame(columns=columns)
def get_comments_and_upload(df, dataset_repo_id, skip_existing=False):
    """Fetch the comments of every submission in ``df`` and upload them as CSVs.

    For each value in ``df['id']`` the comments are fetched with
    ``get_comments`` (using the module-level ``reddit`` client), written to a
    local ``comments_<id>.csv`` and uploaded to ``comments/comments_<id>.csv``
    in the dataset repository. The local file is removed afterwards, including
    when the upload fails.

    Args:
        df: DataFrame with an ``id`` column holding submission IDs.
        dataset_repo_id: Hugging Face dataset repository to upload into.
        skip_existing: When True, submissions whose CSV is already present in
            the repository are skipped. The default (False) re-uploads every
            file and does not query the repository listing at all.
    """
    # Initialize the Hugging Face API
    api = HfApi()

    # Only pay for the repository listing when it is actually consulted.
    existing_files = set()
    if skip_existing:
        existing_files = set(
            api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")
        )

    for post_id in df['id']:
        # Unique CSV filename per submission, and its location in the repo
        csv_file_path = f"comments_{post_id}.csv"
        repo_csv_path = f"comments/{csv_file_path}"

        if repo_csv_path in existing_files:
            print(f"{csv_file_path} already exists in the dataset. Skipping upload.")
            continue

        # Fetch comments for the current submission
        comments_df = get_comments(reddit, post_id)
        if len(comments_df) == 0:
            # An empty CSV is still uploaded so the post is recorded as seen.
            print(f"No comments found for {post_id}")

        try:
            comments_df.to_csv(csv_file_path, index=False)
            api.upload_file(
                path_or_fileobj=csv_file_path,
                path_in_repo=repo_csv_path,  # Save in a 'comments' folder in the dataset repo
                repo_id=dataset_repo_id,
                repo_type="dataset"
            )
            print(f"Uploaded {csv_file_path} to Hugging Face.")
        finally:
            # Always clean up the local file, even if writing or uploading
            # raised, so failed runs do not leave stray CSVs behind.
            if os.path.exists(csv_file_path):
                os.remove(csv_file_path)

    print("All comments CSV files uploaded successfully!")
def main():
    """Scrape "election" posts, keep those from 2024 on, and publish them.

    Steps: search r/all for "election", sort by creation time, keep rows
    created after 2024-01-01, make sure the target dataset repository exists,
    upload the submissions as ``submissions/df_24_<YYYYMMDD>.csv``, then
    upload one comments CSV per submission. Any exception is reported on
    stdout rather than propagated.
    """
    try:
        # Search for election posts across all of Reddit
        df = scrape_reddit(keywords="election")
        if df is None or df.empty:
            print("No data was retrieved")
            return

        print(f"Successfully scraped {len(df)} posts")

        # 'created' already holds datetimes (not epoch seconds), so no unit
        # is passed; this only normalises the column dtype.
        df['created'] = pd.to_datetime(df['created'])
        df = df.sort_values(by='created', ascending=True)
        df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)

        dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"

        # Create the dataset repository if it does not exist yet
        api = HfApi()
        try:
            api.dataset_info(dataset_repo_id)
            print(f"Dataset {dataset_repo_id} already exists.")
        except Exception:
            print(f"Dataset {dataset_repo_id} will be created.")
            # Actually create it, as the message promises; exist_ok makes this
            # harmless if the probe above failed for another reason.
            api.create_repo(
                repo_id=dataset_repo_id, repo_type="dataset", exist_ok=True
            )

        today_date = datetime.now().strftime('%Y%m%d')
        filename = f"df_24_{today_date}.csv"
        df_24.to_csv(filename, index=False)

        api.upload_file(
            path_or_fileobj=filename,
            path_in_repo=f"submissions/{filename}",
            repo_id=dataset_repo_id,
            repo_type="dataset"
        )
        get_comments_and_upload(df_24, dataset_repo_id)

    except Exception as e:
        # Top-level boundary: report and exit quietly.
        print(f"Error in main: {str(e)}")


if __name__ == '__main__':
    main()