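"""Scrape Reddit posts and their top comments with PRAW and upload them as CSV
files to a Hugging Face dataset repository.

Credentials are read from environment variables: HF_TOKEN for the Hugging Face
Hub, and PRAW_CLIENT_ID, PRAW_CLIENT_SECRET and RPAW_AGENT for the Reddit API.
"""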
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder

# Store the Hugging Face token (read from the HF_TOKEN environment variable)
# so that subsequent HfApi calls are authenticated.
HfFolder.save_token(os.getenv("HF_TOKEN"))

# Initialise the PRAW Reddit client from credentials stored in environment
# variables; check_for_async=False suppresses PRAW's async-environment warning.
try:
    reddit = praw.Reddit(
        client_id=os.getenv("PRAW_CLIENT_ID"),
        client_secret=os.getenv("PRAW_CLIENT_SECRET"),
        user_agent=os.getenv("RPAW_AGENT"),
        check_for_async=False,
    )
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
except Exception as e:
    print(f"An error occurred: {str(e)}")


def scrape_reddit(subreddit_name=None, keywords=None, limit=1000):
    """Scrape posts and return them as a DataFrame.

    If subreddit_name is given, search it for keywords (or take its hot posts
    when no keywords are given). Otherwise, search r/all for the keywords.
    """
    posts_data = []

    if subreddit_name:
        subreddit = reddit.subreddit(subreddit_name)
        if keywords:
            posts = subreddit.search(keywords, limit=limit)
        else:
            posts = subreddit.hot(limit=limit)
    else:
        if not keywords:
            raise ValueError("Provide a subreddit_name, keywords, or both.")
        posts = reddit.subreddit("all").search(keywords, limit=limit)

    for post in posts:
        try:
            post_data = {
                "title": post.title,
                "score": post.score,
                "id": post.id,
                "url": post.url,
                "num_comments": post.num_comments,
                "created": datetime.fromtimestamp(post.created),
                "body": post.selftext,
                "subreddit": post.subreddit.display_name
            }
            posts_data.append(post_data)

            # Small pause between posts to stay well within Reddit's rate limits.
            time.sleep(0.1)

        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    df = pd.DataFrame(posts_data)
    if not df.empty:
        df['content'] = df['title'] + '\n' + df['body']
    return df


def get_comments(reddit, post_id, limit=100):
    """
    Get top comments from a specific post.

    Args:
        reddit: Reddit instance
        post_id (str): ID of the post to get comments from
        limit (int): Maximum number of comments to retrieve (default 100)

    Returns:
        pd.DataFrame: DataFrame containing top comments data
    """
    try:
        submission = reddit.submission(id=post_id)
        comments_data = []

        # Drop "load more comments" placeholders so only fully loaded comments remain.
        submission.comments.replace_more(limit=0)

        # Flatten the comment forest into a single list.
        all_comments = submission.comments.list()

        # Keep only the highest-scoring comments, up to `limit`.
        sorted_comments = sorted(all_comments, key=lambda x: x.score, reverse=True)[:limit]

        for comment in sorted_comments:
            try:
                comment_data = {
                    'comment_id': comment.id,
                    'post_id': post_id,
                    'post_title': submission.title,
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': datetime.fromtimestamp(comment.created_utc)
                }
                comments_data.append(comment_data)

            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue

        print(f"Collected {len(comments_data)} comments for post {post_id}")

        df = pd.DataFrame(comments_data)

        if not df.empty:
            # Sort comments by score, highest first.
            df = df.sort_values('score', ascending=False)

        return df

    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame()


def get_comments_and_upload(df, dataset_repo_id):
    """For each post in df, fetch its top comments and upload them as a CSV to the dataset repo."""
    api = HfApi()

    # Files already in the repo, used to skip posts whose comments were uploaded before.
    existing_files = api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")

    for index, row in df.iterrows():
        csv_file_path = f"comments_{row['id']}.csv"
        repo_csv_path = f"comments/{csv_file_path}"

        if repo_csv_path in existing_files:
            print(f"Comments for post {row['id']} already uploaded, skipping.")
            continue

        comments_df = get_comments(reddit, row['id'])

        if len(comments_df) == 0:
            print(f"No comments found for {row['id']}")
            continue

        comments_df.to_csv(csv_file_path, index=False)

        api.upload_file(
            path_or_fileobj=csv_file_path,
            path_in_repo=repo_csv_path,
            repo_id=dataset_repo_id,
            repo_type="dataset"
        )

        print(f"Uploaded {csv_file_path} to Hugging Face.")

        # Remove the local copy once it has been uploaded.
        os.remove(csv_file_path)

    print("All comments CSV files uploaded successfully!")


def main():
    try:
        df = scrape_reddit(keywords="election")

        if df is not None and not df.empty:
            print(f"Successfully scraped {len(df)} posts")

            # 'created' already holds datetime objects; this just normalises the dtype.
            df['created'] = pd.to_datetime(df['created'])
            df = df.sort_values(by='created', ascending=True)
            # Keep only posts created in 2024 or later.
            df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)

            dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"

            api = HfApi()
            try:
                api.dataset_info(dataset_repo_id)
                print(f"Dataset {dataset_repo_id} already exists.")
            except Exception:
                print(f"Dataset {dataset_repo_id} will be created.")
                api.create_repo(repo_id=dataset_repo_id, repo_type="dataset", exist_ok=True)

            today_date = datetime.now().strftime('%Y%m%d')
            filename = f"df_24_{today_date}.csv"

            df_24.to_csv(filename, index=False)

            api.upload_file(
                path_or_fileobj=filename,
                path_in_repo=f"submissions/{filename}",
                repo_id=dataset_repo_id,
                repo_type="dataset"
            )

            get_comments_and_upload(df_24, dataset_repo_id)

        else:
            print("No data was retrieved")

    except Exception as e:
        print(f"Error in main: {str(e)}")


if __name__ == '__main__':
    main()