from datasets import load_dataset
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi, HfFolder
import time
import logging
from tqdm.auto import tqdm
import os
# Authenticate with the Hugging Face Hub using the token from the environment
HfFolder.save_token(os.getenv("HF_TOKEN"))

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_huggingface_data(dataset_name, file1_name, file2_name):
    """Load datasets from Hugging Face"""
    logger.info("Loading datasets from Hugging Face...")

    # Load the first CSV file
    dataset1 = load_dataset(dataset_name,
                            data_files={'train': file1_name},
                            split='train')

    # Load the second CSV file
    dataset2 = load_dataset(dataset_name,
                            data_files={'train': file2_name},
                            split='train')

    # Convert to pandas DataFrames
    df1 = pd.DataFrame(dataset1)
    df2 = pd.DataFrame(dataset2)

    logger.info(f"Loaded {len(df1)} rows from {file1_name}")
    logger.info(f"Loaded {len(df2)} rows from {file2_name}")

    return df1, df2

def merge_newest(df1, df2):
    """Process and merge the datasets"""
    logger.info("Processing datasets...")

    # Perform a full outer join on 'id'
    merged_df = pd.merge(df1, df2,
                         on='id',
                         how='outer',
                         suffixes=('', '_y'))

    # For each column that got a suffix, combine it with the original column
    for col in merged_df.columns:
        if col.endswith('_y'):
            original_col = col[:-2]  # Remove the '_y' suffix
            # Combine columns, taking the non-null value
            merged_df[original_col] = merged_df[original_col].combine_first(merged_df[col])
            # Drop the suffix column
            merged_df = merged_df.drop(columns=[col])

    # Final column order
    desired_columns = ['title', 'score', 'id', 'url', 'num_comments',
                       'created', 'body', 'content', 'subreddit']

    # Reorder columns, only keeping those that exist
    final_columns = [col for col in desired_columns if col in merged_df.columns]
    merged_df = merged_df[final_columns]

    return merged_df
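
# Illustrative sketch (not part of the pipeline) of how the outer join plus
# combine_first de-duplicates rows that appear in both files; the ids and values
# below are made up:
#
#   left  = pd.DataFrame({'id': ['abc1'], 'score': [10],   'body': [None]})
#   right = pd.DataFrame({'id': ['abc1'], 'score': [None], 'body': ['text']})
#   merge_newest(left, right)
#   # -> one row with score=10 (kept from left) and body='text' (filled from right)
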
def save_to_huggingface(df, repo_id):
    """Save the merged dataset to Hugging Face"""
    logger.info("Saving to Hugging Face...")

    # Fixed output filename (date-stamped naming kept here for reference)
    # today_date = datetime.now().strftime('%Y%m%d')
    filename = "merged_reddit_data.csv"

    # Save locally first
    df.to_csv(filename, index=False)

    # Upload to Hugging Face
    api = HfApi()
    api.upload_file(
        path_or_fileobj=filename,
        path_in_repo=f"submission/{filename}",
        repo_id=repo_id,
        repo_type="dataset"
    )

    return filename

def get_newest_file(repo_id):
    """
    Get the newest file from the Hugging Face repository.

    Args:
        repo_id (str): The repository ID on Hugging Face.

    Returns:
        str: The filename of the newest merged file.
    """
    api = HfApi()

    # List all files in the repository
    files = api.list_repo_files(repo_id, repo_type="dataset")

    # Filter for merged files
    merged_files = [f for f in files if f.startswith('merged_reddit_data_')]
    if not merged_files:
        raise ValueError("No merged files found in repository")

    # Extract dates from filenames and pair with filenames
    file_dates = []
    for filename in merged_files:
        try:
            # Extract date string (assuming format: merged_reddit_data_YYYYMMDD.csv)
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            continue

    if not file_dates:
        raise ValueError("No valid dated files found")

    # Sort by date and get the newest file
    newest_file = sorted(file_dates, key=lambda x: x[0], reverse=True)[0][1]
    return newest_file

def main():
    # Dataset repository that holds the Reddit submission data
    repo_id = "Vera-ZWY/reddite2024elections_submissions"

    file_new = get_newest_file(repo_id)
    file_old = "submission/merged_reddit_data.csv"

    df1, df2 = load_huggingface_data(repo_id, file_new, file_old)
    print(f"Newest dataset shape: {df1.shape}")
    print(f"Newest dataset columns: {df1.columns.tolist()}")

    # Process and merge data
    merged_df = merge_newest(df1, df2)
    output_file = save_to_huggingface(merged_df, repo_id)

    logger.info(f"Processing complete. File saved as {output_file}")
    return f"Processing complete. File saved as {output_file}. Merged dataset columns: {merged_df.columns.tolist()}"


if __name__ == "__main__":
    main()
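
# Example invocation (a sketch; the script filename is illustrative and HF_TOKEN
# must point to a token with write access to the target dataset repo):
#   HF_TOKEN=hf_xxx python merge_newest_submissions.py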