"""Gradio app for crawling, merging, and previewing Reddit election data.

Data lives in the HuggingFace dataset repo named below; crawling and merging
are delegated to sibling scripts (praw.py / merge.py) run as subprocesses.
"""
import subprocess
import sys
from datetime import datetime

import gradio as gr
import pandas as pd
from datasets import load_dataset
from huggingface_hub import HfApi

# Single source of truth for the dataset repo (was repeated in four functions).
REPO_ID = "Vera-ZWY/reddite2024elections_submissions"
# Path of the merged CSV inside the repo.
# NOTE(review): this says "submission/" while crawled files use "submissions/"
# — confirm against the actual repo layout before changing either one.
MERGED_FILE = "submission/merged_reddit_data.csv"


def get_newest_file(repo_id, prefix):
    """Return the repo file starting with *prefix* whose name ends in the
    newest YYYYMMDD date (last '_'-separated token, before the extension).

    Returns None when no file matches the prefix or no date parses.
    """
    api = HfApi()
    files = api.list_repo_files(repo_id, repo_type="dataset")

    dated = []
    for filename in files:
        if not filename.startswith(prefix):
            continue
        try:
            # e.g. "submissions/df_20241105.csv" -> "20241105"
            date_str = filename.split('_')[-1].split('.')[0]
            dated.append((datetime.strptime(date_str, '%Y%m%d'), filename))
        except (IndexError, ValueError):
            continue  # skip files without a parseable date suffix

    if not dated:
        return None
    # max() by date replaces the original sort-descending-then-take-first.
    return max(dated, key=lambda pair: pair[0])[1]


def load_data(repo_id, file_path):
    """Load *file_path* from the HF dataset repo and return its first 3 rows
    as a DataFrame; on any failure return a one-column error DataFrame so the
    UI table shows the problem instead of the app crashing."""
    try:
        dataset = load_dataset(repo_id, data_files={'train': file_path}, split='train')
        return pd.DataFrame(dataset).head(3)
    except Exception as e:
        return pd.DataFrame({'Error': [str(e)]})


def praw_new_data():
    """Run praw.py to crawl fresh data.

    Returns (status message, crawled-data df, current merged-data df) to feed
    the three UI outputs. A crawl failure still attempts to show the latest
    data that does exist.
    """
    try:
        subprocess.run([sys.executable, "praw.py"], check=True)
        success_message = "✅ Successfully crawled new data!"
    except Exception as e:
        success_message = f"❌ Error executing praw.py: {str(e)}"

    # NOTE(review): searches "submissions/df_" while load_crawled_data() uses
    # "submissions/df_24" — confirm which prefix is intended.
    newest_file = get_newest_file(REPO_ID, "submissions/df_")
    if newest_file is None:
        return "No crawled data files found", pd.DataFrame(), load_merged_data()[1]
    return success_message, load_data(REPO_ID, newest_file), load_merged_data()[1]


def merge_data():
    """Run merge.py to merge datasets.

    Returns (status message, crawled-data df, merged-data df) for the UI.
    """
    try:
        subprocess.run([sys.executable, "merge.py"], check=True)
        success_message = "✅ Successfully merged data!"
    except Exception as e:
        success_message = f"❌ Error executing merge.py: {str(e)}"

    return success_message, load_crawled_data()[1], load_merged_data()[1]


def load_crawled_data():
    """Return (caption, DataFrame) for the newest crawled file, or a
    placeholder pair when none is found."""
    newest_file = get_newest_file(REPO_ID, "submissions/df_24")
    if newest_file:
        return f"Latest crawled data ({newest_file}):", load_data(REPO_ID, newest_file)
    return "No crawled data available", pd.DataFrame()


def load_merged_data():
    """Return (caption, DataFrame) for the merged CSV.

    The original guarded on the hard-coded path being truthy — always true —
    so its "No merged data available" branch was unreachable dead code and is
    removed here; load_data() already reports a missing file as an error df.
    """
    return f"Latest merged data ({MERGED_FILE}):", load_data(REPO_ID, MERGED_FILE)


# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")

    # Shared status line updated by both buttons.
    status_text = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data", variant="primary")
        with gr.Column():
            merge_button = gr.Button("Merge Data", variant="primary")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Crawled Data (Top 3 Rows)")
            crawled_table = gr.Dataframe(
                headers=["title", "score", "id", "url", "comms_num",
                         "created", "body", "subreddit"],
                value=load_crawled_data()[1],
                wrap=True
            )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Merged Data (Top 3 Rows)")
            merged_table = gr.Dataframe(
                headers=["title", "score", "id", "url", "num_comments",
                         "created", "body", "content", "subreddit"],
                value=load_merged_data()[1],
                wrap=True
            )

    # Both handlers refresh the status line and both tables.
    praw_button.click(
        fn=praw_new_data,
        outputs=[status_text, crawled_table, merged_table]
    )
    merge_button.click(
        fn=merge_data,
        outputs=[status_text, crawled_table, merged_table]
    )

    gr.Markdown("""
    ## The full dataset storage
    at https://huggingface.co/datasets/Vera-ZWY/reddite2024elections_submissions/
    ### Instructions:
    1. Click 'Crawl New Data' to fetch new Reddit data
    2. Click 'Merge Data' to merge the latest datasets
    3. Tables will automatically update to show the latest data
    """)

# Launch the interface
if __name__ == "__main__":
    iface.launch()