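"""Merge the newest dated Reddit submissions CSV with the rolling merged file
in a Hugging Face dataset repo, then upload the result back to the repo."""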

from datasets import load_dataset
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi, HfFolder
import logging
import os

# Authenticate with the Hugging Face Hub (expects HF_TOKEN in the environment)
HfFolder.save_token(os.getenv("HF_TOKEN"))

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)



def load_huggingface_data(dataset_name, file1_name, file2_name):
    """Load two CSV files from a Hugging Face dataset repo as DataFrames"""
    logger.info("Loading datasets from Hugging Face...")

    # Load the first CSV file
    dataset1 = load_dataset(dataset_name,
                            data_files={'train': file1_name},
                            split='train')

    # Load the second CSV file
    dataset2 = load_dataset(dataset_name,
                            data_files={'train': file2_name},
                            split='train')

    # Convert to pandas DataFrames
    df1 = pd.DataFrame(dataset1)
    df2 = pd.DataFrame(dataset2)

    logger.info(f"Loaded {len(df1)} rows from {file1_name}")
    logger.info(f"Loaded {len(df2)} rows from {file2_name}")

    return df1, df2
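
# A hypothetical invocation, assuming both CSVs live in the same dataset repo:
#   df_new, df_old = load_huggingface_data(
#       "Vera-ZWY/reddite2024elections_submissions",
#       "merged_reddit_data_20241105.csv",   # hypothetical dated snapshot
#       "submission/merged_reddit_data.csv")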


def merge_newest(df1, df2):
    """Outer-merge the two datasets on 'id', preferring non-null values from df1"""
    logger.info("Processing datasets...")

    # Perform a full outer join on 'id'
    merged_df = pd.merge(df1, df2,
                         on='id',
                         how='outer',
                         suffixes=('', '_y'))

    # For each column that got a suffix, combine it with the original column
    for col in merged_df.columns:
        if col.endswith('_y'):
            original_col = col[:-2]  # Remove the '_y' suffix
            # Combine columns, taking the non-null value
            merged_df[original_col] = merged_df[original_col].combine_first(merged_df[col])
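            # combine_first keeps df1's value and falls back to df2's, e.g.
            # pd.Series([1, None]).combine_first(pd.Series([9, 2])) -> [1.0, 2.0]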
            # Drop the suffix column
            merged_df = merged_df.drop(columns=[col])

    # Final column order
    desired_columns = ['title', 'score', 'id', 'url', 'num_comments',
                       'created', 'body', 'content', 'subreddit']

    # Reorder columns, only keeping those that exist
    final_columns = [col for col in desired_columns if col in merged_df.columns]
    merged_df = merged_df[final_columns]

    return merged_df
    


def save_to_huggingface(df, repo_id):
    """Save the merged dataset to Hugging Face"""
    logger.info("Saving to Hugging Face...")

    # The repo keeps a single rolling merged file, so the name is fixed
    filename = "merged_reddit_data.csv"

    # Save locally first
    df.to_csv(filename, index=False)

    # Upload to Hugging Face
    api = HfApi()
    api.upload_file(
        path_or_fileobj=filename,
        path_in_repo=f"submission/{filename}",
        repo_id=repo_id,
        repo_type="dataset"
    )

    return filename
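
# Note: upload_file needs a token with write access to repo_id; this script
# relies on the HF_TOKEN environment variable saved via HfFolder at startup.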

def get_newest_file(repo_id):
    """
    Get the newest dated merged file from the Hugging Face repository.

    Args:
        repo_id (str): The repository ID on Hugging Face.

    Returns:
        str: The path of the newest dated merged file.
    """
    api = HfApi()

    # List all files in the repository
    files = api.list_repo_files(repo_id, repo_type="dataset")

    # Filter for dated merged files; basename() tolerates an optional
    # folder prefix such as "submission/"
    merged_files = [f for f in files
                    if os.path.basename(f).startswith('merged_reddit_data_')]
    
    if not merged_files:
        raise ValueError("No merged files found in repository")
    
    # Extract dates from filenames and pair with filenames
    file_dates = []
    for filename in merged_files:
        try:
            # Extract date string (assuming format: merged_reddit_data_YYYYMMDD.csv)
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            continue
    
    if not file_dates:
        raise ValueError("No valid dated files found")
    
    # Sort by date and get the newest file
    newest_file = sorted(file_dates, key=lambda x: x[0], reverse=True)[0][1]
    
    return newest_file
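
# Expected input for get_newest_file (hypothetical example):
#   "submission/merged_reddit_data_20241105.csv" -> parsed as 2024-11-05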



def main():
    repo_id = "Vera-ZWY/reddite2024elections_submissions"

    # Merge the newest dated snapshot into the rolling merged file
    file_new = get_newest_file(repo_id)
    file_old = "submission/merged_reddit_data.csv"

    df1, df2 = load_huggingface_data(repo_id, file_new, file_old)
    print(f"Newest dataset shape: {df1.shape}")
    print(f"Old dataset columns: {df2.columns.tolist()}")

    # Process and merge data
    merged_df = merge_newest(df1, df2)

    output_file = save_to_huggingface(merged_df, repo_id)

    logger.info(f"Processing complete. File saved as {output_file}")
    return f"Processing complete. File saved as {output_file}. Merged columns: {merged_df.columns.tolist()}"

if __name__ == "__main__":
    main()