from datasets import load_dataset
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi, HfFolder
import logging
import os

# Authenticate with Hugging Face and set up logging
HfFolder.save_token(os.getenv("HF_TOKEN"))
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_huggingface_data(dataset_name, file1_name, file2_name):
    """Load two CSV files from a Hugging Face dataset repository."""
    logger.info("Loading datasets from Hugging Face...")

    # Load the first CSV file
    dataset1 = load_dataset(dataset_name, data_files={'train': file1_name}, split='train')

    # Load the second CSV file
    dataset2 = load_dataset(dataset_name, data_files={'train': file2_name}, split='train')

    # Convert to pandas DataFrames
    df1 = pd.DataFrame(dataset1)
    df2 = pd.DataFrame(dataset2)

    logger.info(f"Loaded {len(df1)} rows from {file1_name}")
    logger.info(f"Loaded {len(df2)} rows from {file2_name}")

    return df1, df2


def merge_newest(df1, df2):
    """Process and merge the datasets."""
    logger.info("Processing datasets...")

    # Perform a full outer join on 'id'
    merged_df = pd.merge(df1, df2, on='id', how='outer', suffixes=('', '_y'))

    # For each column that got a suffix, combine it with the original column
    for col in merged_df.columns:
        if col.endswith('_y'):
            original_col = col[:-2]  # Remove the '_y' suffix
            # Combine columns, taking the non-null value
            merged_df[original_col] = merged_df[original_col].combine_first(merged_df[col])
            # Drop the suffix column
            merged_df = merged_df.drop(columns=[col])

    # Final column order
    desired_columns = ['title', 'score', 'id', 'url', 'num_comments',
                       'created', 'body', 'content', 'subreddit']

    # Reorder columns, only keeping those that exist
    final_columns = [col for col in desired_columns if col in merged_df.columns]
    merged_df = merged_df[final_columns]

    return merged_df


def save_to_huggingface(df, repo_id):
    """Save the merged dataset to Hugging Face."""
    logger.info("Saving to Hugging Face...")

    # Generate filename (date suffix currently disabled)
    # today_date = datetime.now().strftime('%Y%m%d')
    filename = "merged_reddit_data.csv"

    # Save locally first
    df.to_csv(filename, index=False)

    # Upload to Hugging Face
    api = HfApi()
    api.upload_file(
        path_or_fileobj=filename,
        path_in_repo=f"submission/{filename}",
        repo_id=repo_id,
        repo_type="dataset"
    )

    return filename


def get_newest_file(repo_id):
    """
    Get the newest dated merge file from the Hugging Face repository.

    Args:
        repo_id (str): The repository ID on Hugging Face.

    Returns:
        str: The filename of the newest merged file.
    """
    api = HfApi()

    # List all files in the repository
    files = api.list_repo_files(repo_id, repo_type="dataset")

    # Filter for dated merge files (format: merged_reddit_data_YYYYMMDD.csv)
    merged_files = [f for f in files if f.startswith('merged_reddit_data_')]

    if not merged_files:
        raise ValueError("No merged files found in repository")

    # Extract dates from filenames and pair them with the filenames
    file_dates = []
    for filename in merged_files:
        try:
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            continue

    if not file_dates:
        raise ValueError("No valid dated files found")

    # Sort by date and return the newest file
    newest_file = sorted(file_dates, key=lambda x: x[0], reverse=True)[0][1]

    return newest_file


def main():
    # Hugging Face dataset repo that holds the Reddit submission CSVs
    repo_id = "Vera-ZWY/reddite2024elections_submissions"

    file_new = get_newest_file(repo_id)
    file_old = "submission/merged_reddit_data.csv"

    df1, df2 = load_huggingface_data(repo_id, file_new, file_old)

    print(f"Newest dataset shape: {df1.shape}")
    print(f"Old dataset columns: {df2.columns.tolist()}")

    # Process and merge data
    merged_df = merge_newest(df1, df2)
    output_file = save_to_huggingface(merged_df, repo_id)

    logger.info(f"Processing complete. File saved as {output_file}")
    return (f"Processing complete. File saved as {output_file}. "
            f"Merged dataset columns: {merged_df.columns.tolist()}")


if __name__ == "__main__":
    main()
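# Usage sketch (assumptions: this file is saved as merge_reddit_data.py and a
# Hugging Face token with write access to the dataset repo is exported as
# HF_TOKEN before running; the filename is illustrative, not fixed by this script):
#
#   export HF_TOKEN=hf_xxx
#   python merge_reddit_data.py
#
# The run downloads the newest dated merged_reddit_data_YYYYMMDD.csv plus the
# running submission/merged_reddit_data.csv, outer-joins them on 'id', and
# uploads the result back to submission/merged_reddit_data.csv in the same repo.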