Spaces:

mangoesai
/

Refresh_Praw_pinecone_dataset

Sleeping

File size: 7,715 Bytes

# import gradio as gr
import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder
import time
import os
from datetime import datetime
# from tqdm import tqdm

HfFolder.save_token(os.getenv("HF_TOKEN"))

try:
# def initialize_reddit():


    reddit = praw.Reddit(client_id= os.getenv("PRAW_CLIENT_ID"),
                         client_secret= os.getenv("PRAW_CLIENT_SECRET"),
                         user_agent= os.getenv("RPAW_AGENT"),
                         check_for_async=False
                          )
    
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
    # return None
except Exception as e:
    print(f"An error occurred: {str(e)}")
    # return None

def scrape_reddit(subreddit_name = None, keywords = None, limit = 1000):

    posts_data = []

    if subreddit_name:
        subreddit = reddit.subreddit(subreddit_name)
        if keywords:
            posts = subreddit.search(keywords, limit=limit)
        else:
            posts = subreddit.hot(limit=limit)
    else:
        posts = reddit.subreddit("all").search(keywords, limit=limit)
    # print(posts)
    for post in posts:
        # print(post.title)
        try:
            post_data = {
                "title": post.title,
                "score": post.score,
                "id": post.id,
                "url": post.url,
                "num_comments": post.num_comments,
                "created": datetime.fromtimestamp(post.created),
                "body": post.selftext,
                "subreddit": post.subreddit.display_name
            }
            posts_data.append(post_data)

            # Add a small delay to avoid hitting rate limits
            time.sleep(0.1)

        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    df = pd.DataFrame(posts_data)
    df['content'] = df['title'] + '\n' + df['body']
    return df



def get_comments(reddit, post_id, limit=100):
    """
    Get top comments from a specific post.

    Args:
        reddit: Reddit instance
        post_id (str): ID of the post to get comments from
        limit (int): Maximum number of comments to retrieve (default 100)

    Returns:
        pd.DataFrame: DataFrame containing top comments data
    """
    try:
        submission = reddit.submission(id=post_id)
        comments_data = []

        # Replace MoreComments objects with actual comments, limited to save time
        submission.comments.replace_more(limit=0)  # Ignore "More Comments" expansions

        # Get all top-level comments
        all_comments = submission.comments.list()


        # Sort comments by score and take top ones
        sorted_comments = sorted(all_comments, key=lambda x: x.score, reverse=True)[:limit]

        for comment in sorted_comments:

            try:
                comment_data = {
                    'comment_id': comment.id,
                    'post_id': post_id,
                    'post_title': submission.title,
                    # 'author': str(comment.author) if comment.author else '[deleted]',
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': datetime.fromtimestamp(comment.created_utc)
                    # 'parent_id': comment.parent_id,
                    # 'is_submitter': comment.is_submitter
                }
                comments_data.append(comment_data)

            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue
        print(comments_data)

        # Create DataFrame
        df = pd.DataFrame(comments_data)

        # Sort by score (highest first)
        if not df.empty:
            print("sort comments by score")
            df = df.sort_values('score', ascending=False)

        return df

    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame()



def get_comments_and_upload(df, dataset_repo_id):
    # Initialize the Hugging Face API
    api = HfApi()

    existing_files = api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")

    # Iterate over each submission in the DataFrame
    for index, row in df.iterrows():
        csv_file_path = f"comments_{row['id']}.csv"
        repo_csv_path = f"comments/{csv_file_path}"

        # Check if this file already exists in the Hugging Face dataset
        # if repo_csv_path in existing_files:
        #     print(f"{csv_file_path} already exists in the dataset. Skipping upload.")
        #     continue
        # Fetch comments for the current submission
        comments_df = get_comments(reddit, row['id'])

        if len(comments_df) == 0:
            print(f"No comments found for {row['id']}")
            # continue
        # Define a unique CSV filename for each submission based on its ID
        csv_file_path = f"comments_{row['id']}.csv"

        # Save the comments DataFrame as a CSV file
        comments_df.to_csv(csv_file_path, index=False)

        # Upload the CSV file to the Hugging Face dataset repository
        api.upload_file(
            path_or_fileobj=csv_file_path,
            path_in_repo=f"comments/{csv_file_path}",  # Save in a 'comments' folder in the dataset repo
            repo_id=dataset_repo_id,
            repo_type="dataset"
        )

        print(f"Uploaded {csv_file_path} to Hugging Face.")

        # Optionally, delete the local CSV file to save space
        os.remove(csv_file_path)

    print("All comments CSV files uploaded successfully!")


    
def main():
    # Example usage

    try:
        # Search for 2016 election posts
        df = scrape_reddit(keywords="election")

        if df is not None and not df.empty:
            print(f"Successfully scraped {len(df)} posts")
            # Save to CSV
            # df.to_csv("reddit_2016_election_posts.csv", index=False)
            df['created'] = pd.to_datetime(df['created'], unit='s')
            df = df.sort_values(by='created', ascending=True)
            df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)
            # df_16 = df_16[df_16['created'] > '2015-12-31'].reset_index(drop=True)
            

            dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"
            # reate database if it's not exsit
            api = HfApi()
            try:
                api.dataset_info(dataset_repo_id)
                # dataset_exists = True
                print(f"Dataset {dataset_repo_id} already exists.")

            except Exception:
            # dataset_exists = False
                print(f"Dataset {dataset_repo_id} will be created.")
                # If the dataset doesn't exist, create it and then upload the CSV file
                # api.create_repo(repo_id=dataset_repo_id, repo_type="dataset")
                
            today_date = datetime.now().strftime('%Y%m%d')
            filename = f"df_24_{today_date}.csv"

            df_24.to_csv(filename, index=False)
            # csv_file_path = filename

            api.upload_file(
                path_or_fileobj= filename,
                path_in_repo=f"submissions/{filename}",
                repo_id=dataset_repo_id,
                repo_type="dataset"
            )

            get_comments_and_upload(df_24, dataset_repo_id)

        else:
            print("No data was retrieved")


    except Exception as e:
        print(f"Error in main: {str(e)}")
        

if __name__ == '__main__':
    main()