# Provenance (Hugging Face file view): merge.py, uploaded by Vera-ZWY
# Commit 25e3532 (verified): "Rename merge_and_save.py to merge.py"
# File size: 4.77 kB
from datasets import load_dataset
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi, HfFolder
import time
import logging
from tqdm.auto import tqdm
import os
# Set up logging first so the token check below can report a problem.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Persist the Hugging Face token taken from the HF_TOKEN environment variable.
# os.getenv returns None when the variable is unset; in that case skip the
# save instead of passing None to HfFolder.save_token.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    HfFolder.save_token(_hf_token)
else:
    logger.warning("HF_TOKEN is not set; skipping Hugging Face token save")
def load_huggingface_data(dataset_name, file1_name, file2_name):
    """Fetch two data files from one Hugging Face dataset as DataFrames.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        file1_name: Path of the first data file within the dataset repo.
        file2_name: Path of the second data file within the dataset repo.

    Returns:
        A ``(df1, df2)`` tuple of pandas DataFrames, in argument order.
    """
    logger.info("Loading datasets from Hugging Face...")

    requested = (file1_name, file2_name)

    # Each file is requested as the sole "train" split of its own load call.
    splits = [
        load_dataset(dataset_name, data_files={'train': data_file}, split='train')
        for data_file in requested
    ]

    # Materialise both splits as pandas DataFrames.
    first_df, second_df = (pd.DataFrame(split) for split in splits)

    logger.info(f"Loaded {len(first_df)} rows from {file1_name}")
    logger.info(f"Loaded {len(second_df)} rows from {file2_name}")
    return first_df, second_df
def merge_newest(df1, df2):
    """Outer-join two tables on ``id``, preferring values from ``df1``.

    For every non-key column present in both inputs, the result keeps the
    ``df1`` value and falls back to the ``df2`` value only where the ``df1``
    value is null. The result is then restricted to a fixed column order,
    keeping only those of the listed columns that actually exist.

    Args:
        df1: DataFrame whose values take priority; must contain ``id``.
        df2: DataFrame used to fill gaps and add rows; must contain ``id``.

    Returns:
        The merged DataFrame with columns in the preferred order.
    """
    logger.info("Processing datasets...")

    # Only columns present on BOTH sides (other than the join key) receive a
    # suffix from pandas.  Computing them up front avoids mistaking a genuine
    # column whose name happens to end in '_y' for a merge duplicate.
    shared_columns = [
        col for col in df1.columns
        if col != 'id' and col in df2.columns
    ]

    # Perform full outer join on id
    merged_df = pd.merge(df1, df2,
                         on='id',
                         how='outer',
                         suffixes=('', '_y'))

    # Coalesce each duplicated pair, taking the first non-null value.
    for base_col in shared_columns:
        merged_df[base_col] = merged_df[base_col].combine_first(
            merged_df[f"{base_col}_y"]
        )

    # Drop every suffixed duplicate in one pass.
    merged_df = merged_df.drop(columns=[f"{col}_y" for col in shared_columns])

    # Final column order
    desired_columns = ['title', 'score', 'id', 'url', 'num_comments',
                       'created', 'body', 'content', 'subreddit']

    # Reorder columns, only keeping those that exist
    final_columns = [col for col in desired_columns if col in merged_df.columns]
    return merged_df[final_columns]
def save_to_huggingface(df, repo_id):
    """Write ``df`` to a local CSV file and upload it to a dataset repo.

    Args:
        df: DataFrame to persist.
        repo_id: ID of the target Hugging Face dataset repository.

    Returns:
        The local filename the CSV was written to.
    """
    logger.info("Saving to Hugging Face...")

    # The output name is fixed (no date stamp).
    filename = "merged_reddit_data.csv"

    # Write the CSV locally, without the index column, before uploading.
    df.to_csv(filename, index=False)

    upload_args = {
        "path_or_fileobj": filename,
        "path_in_repo": f"submission/{filename}",
        "repo_id": repo_id,
        "repo_type": "dataset",
    }
    HfApi().upload_file(**upload_args)

    return filename
def get_newes_file(repo_id):
    """Return the most recently dated merged file in a dataset repository.

    Candidate files are the repository paths that start with
    ``merged_reddit_data_``; the date is read from the text between the last
    underscore and the first following dot, parsed as ``YYYYMMDD``.

    NOTE(review): the prefix test runs against the full path returned by the
    listing, so a file inside a subdirectory would not match - confirm where
    the dated files live.

    Args:
        repo_id (str): The repository ID on HuggingFace.

    Returns:
        str: The path of the candidate with the latest parsed date.

    Raises:
        ValueError: If no path has the expected prefix, or none of the
            candidates carries a parseable date.
    """
    repo_files = HfApi().list_repo_files(repo_id, repo_type="dataset")

    candidates = [
        path for path in repo_files
        if path.startswith('merged_reddit_data_')
    ]
    if not candidates:
        raise ValueError("No merged files found in repository")

    # Pair every candidate with its parsed date; skip names that do not parse.
    dated_candidates = []
    for path in candidates:
        try:
            stamp = path.split('_')[-1].split('.')[0]
            parsed = datetime.strptime(stamp, '%Y%m%d')
        except (IndexError, ValueError):
            continue
        dated_candidates.append((parsed, path))

    if not dated_candidates:
        raise ValueError("No valid dated files found")

    # Latest date wins; on a tie the earliest-listed candidate is returned.
    _, newest_file = max(dated_candidates, key=lambda pair: pair[0])
    return newest_file
def main():
    """Merge the newest dated file into the cumulative merged file and upload.

    Looks up the newest dated file in the dataset repository, loads it
    together with ``submission/merged_reddit_data.csv``, merges the two with
    the newest file taking priority, and uploads the result.

    Returns:
        A status string naming the saved file and listing the merged columns.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"

    file_new = get_newes_file(repo_id)
    file_old = "submission/merged_reddit_data.csv"

    # df1 holds the newest file, df2 the previously merged ("old") file.
    df1, df2 = load_huggingface_data(repo_id, file_new, file_old)
    print(f"Newest dataset shape: {df1.shape}")
    print(f"Old dataset columns: {df2.columns.tolist()}")

    # Process and merge data.  merge_newest is the merge routine defined in
    # this module; df1 is passed first so its values win on conflicts.
    merged_df = merge_newest(df1, df2)

    output_file = save_to_huggingface(merged_df, repo_id)
    logger.info(f"Processing complete. File saved as {output_file}")

    # NOTE(review): the label below says "Old dataset columns" but the value
    # is the merged frame's columns; text kept unchanged for existing callers.
    return f"Processing complete. File saved as {output_file}. Old dataset columns: {merged_df.columns.tolist()}"


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()