import gradio as gr
from datasets import load_dataset
import pandas as pd
import sys
import subprocess
from datetime import datetime
from huggingface_hub import HfApi


def get_newest_file(repo_id, prefix):
    """Get the newest file with the given prefix from a HuggingFace dataset repo."""
    api = HfApi()
    files = api.list_repo_files(repo_id, repo_type="dataset")

    # Filter files by prefix
    relevant_files = [f for f in files if f.startswith(prefix)]
    if not relevant_files:
        return None

    # Parse the date suffix from each filename (format: <prefix>_YYYYMMDD.<ext>);
    # files without a parseable date are skipped
    file_dates = []
    for filename in relevant_files:
        try:
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            continue

    if not file_dates:
        return None

    # Sort by parsed date, newest first
    newest_file = sorted(file_dates, key=lambda x: x[0], reverse=True)[0][1]
    return newest_file


def load_and_show_data(repo_id, file_path):
    """Load a data file from HuggingFace and return its top 3 rows as a string."""
    try:
        dataset = load_dataset(repo_id, data_files={'train': file_path}, split='train')
        df = pd.DataFrame(dataset)
        return df.head(3).to_string()
    except Exception as e:
        return f"Error loading data: {str(e)}"


def praw_new_data():
    """Execute praw.py and show the latest crawled data."""
    try:
        # Execute praw.py
        subprocess.run([sys.executable, "praw.py"], check=True)

        # Get the newest crawled file
        repo_id = "Vera-ZWY/reddite2024elections_submissions"
        newest_file = get_newest_file(repo_id, "submissions/df_24")

        if newest_file:
            return f"Successfully crawled new data!\n\nTop 3 rows of new data:\n{load_and_show_data(repo_id, newest_file)}"
        else:
            return "Error: No crawled data files found"
    except subprocess.CalledProcessError as e:
        return f"Error executing praw.py: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"


def merge_data():
    """Execute merge.py and show the latest merged data."""
    try:
        # Execute merge.py
        subprocess.run([sys.executable, "merge.py"], check=True)

        # Get the newest merged file
        repo_id = "Vera-ZWY/reddite2024elections_submissions"
        newest_merged = get_newest_file(repo_id, "merged_reddit_data")

        if newest_merged:
            return f"Successfully merged data!\n\nTop 3 rows of merged data:\n{load_and_show_data(repo_id, newest_merged)}"
        else:
            return "Error: No merged data files found"
    except subprocess.CalledProcessError as e:
        return f"Error executing merge.py: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"


def load_latest_data():
    """Load and display the latest data from both sources."""
    repo_id = "Vera-ZWY/reddite2024elections_submissions"

    # Get the latest crawled data
    newest_crawled = get_newest_file(repo_id, "submissions/df_24")
    crawled_data = "No crawled data available"
    if newest_crawled:
        crawled_data = f"Latest crawled data ({newest_crawled}):\n{load_and_show_data(repo_id, newest_crawled)}"

    # Get the latest merged data
    newest_merged = get_newest_file(repo_id, "merged_reddit_data")
    merged_data = "No merged data available"
    if newest_merged:
        merged_data = f"Latest merged data ({newest_merged}):\n{load_and_show_data(repo_id, newest_merged)}"

    return f"{crawled_data}\n\n{merged_data}"


# Create the Gradio interface
with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")

    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data")
            merge_button = gr.Button("Merge Data")

    output_text = gr.Textbox(label="Results", value=load_latest_data(), lines=20)

    # Button click handlers
    praw_button.click(fn=praw_new_data, outputs=output_text)
    merge_button.click(fn=merge_data, outputs=output_text)

    gr.Markdown("""
    ### Instructions:
    1. Click 'Crawl New Data' to fetch new Reddit data
    2. Click 'Merge Data' to merge the latest datasets
    3. The results will show the top 3 rows of both the latest crawled and merged data
    """)

# Launch the interface
if __name__ == "__main__":
    iface.launch()
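
# Hypothetical usage sketch: the helpers above can also be exercised without
# launching the UI, assuming the repo keeps date-suffixed files matching the
# naming convention used in this script (e.g. submissions/df_24_20241105.csv):
#
#     repo_id = "Vera-ZWY/reddite2024elections_submissions"
#     latest = get_newest_file(repo_id, "submissions/df_24")
#     if latest:
#         print(load_and_show_data(repo_id, latest))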