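"""Merge Reddit submission datasets stored on the Hugging Face Hub.

The script pulls the newest date-stamped merge and the baseline submission file
from a dataset repository, coalesces overlapping rows on the submission `id`,
and uploads the merged CSV back into the repository's `submission/` folder.
"""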
from datasets import load_dataset
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi, HfFolder
import logging
import os

# Authenticate with the Hugging Face Hub; the token is read from the HF_TOKEN environment variable
HfFolder.save_token(os.getenv("HF_TOKEN"))

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_huggingface_data(dataset_name, file1_name, file2_name):
    """Load datasets from Hugging Face"""
    logger.info("Loading datasets from Hugging Face...")

    dataset1 = load_dataset(dataset_name,
                            data_files={'train': file1_name},
                            split='train')
    dataset2 = load_dataset(dataset_name,
                            data_files={'train': file2_name},
                            split='train')

    df1 = pd.DataFrame(dataset1)
    df2 = pd.DataFrame(dataset2)

    logger.info(f"Loaded {len(df1)} rows from {file1_name}")
    logger.info(f"Loaded {len(df2)} rows from {file2_name}")

    return df1, df2


def merge_newest(df1, df2):
    """Process and merge the datasets"""
    logger.info("Processing datasets...")

    # Outer-merge on the submission id; columns present in both frames get a '_y' suffix
    merged_df = pd.merge(df1, df2,
                         on='id',
                         how='outer',
                         suffixes=('', '_y'))

    # Coalesce duplicated columns: prefer the value from df1, fall back to df2, then drop the '_y' copy
    for col in merged_df.columns:
        if col.endswith('_y'):
            original_col = col[:-2]
            merged_df[original_col] = merged_df[original_col].combine_first(merged_df[col])
            merged_df = merged_df.drop(columns=[col])

    # Keep only the columns of interest that actually exist in the merged frame
    desired_columns = ['title', 'score', 'id', 'url', 'num_comments',
                       'created', 'body', 'content', 'subreddit']
    final_columns = [col for col in desired_columns if col in merged_df.columns]
    merged_df = merged_df[final_columns]

    return merged_df


def save_to_huggingface(df, repo_id):
    """Save the merged dataset to Hugging Face"""
    logger.info("Saving to Hugging Face...")

    # Date-stamp the output so get_newest_file() can identify the most recent merge later
    filename = f"merged_reddit_data_{datetime.now().strftime('%Y%m%d')}.csv"
    df.to_csv(filename, index=False)

    # Upload the CSV into the repository's submission/ folder
    api = HfApi()
    api.upload_file(
        path_or_fileobj=filename,
        path_in_repo=f"submission/{filename}",
        repo_id=repo_id,
        repo_type="dataset"
    )

    return filename


def get_newest_file(repo_id):
    """
    Get the newest merged file from the Hugging Face repository.

    Args:
        repo_id (str): The repository ID on Hugging Face.

    Returns:
        str: The path of the newest merged file in the repository.
    """
    api = HfApi()
    files = api.list_repo_files(repo_id, repo_type="dataset")

    # Match date-stamped merge outputs regardless of the folder they live in
    merged_files = [f for f in files if os.path.basename(f).startswith('merged_reddit_data_')]
    if not merged_files:
        raise ValueError("No merged files found in repository")

    # Parse the YYYYMMDD stamp out of each filename; skip anything that does not parse
    file_dates = []
    for filename in merged_files:
        try:
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            continue

    if not file_dates:
        raise ValueError("No valid dated files found")

    # Most recent date wins
    newest_file = sorted(file_dates, key=lambda x: x[0], reverse=True)[0][1]

    return newest_file


def main():
    repo_id = "Vera-ZWY/reddite2024elections_submissions"

    # The newest date-stamped merge and the baseline submission file to merge it into
    file_new = get_newest_file(repo_id)
    file_old = "submission/merged_reddit_data.csv"

    df1, df2 = load_huggingface_data(repo_id, file_new, file_old)
    print(f"Newest dataset shape: {df1.shape}")
    print(f"Old dataset columns: {df2.columns.tolist()}")

    merged_df = merge_newest(df1, df2)

    output_file = save_to_huggingface(merged_df, repo_id)

    logger.info(f"Processing complete. File saved as {output_file}")
    return (f"Processing complete. File saved as {output_file}. "
            f"Merged dataset columns: {merged_df.columns.tolist()}")


if __name__ == "__main__":
    main()