import gradio as gr
from datasets import load_dataset
import pandas as pd
import sys
import os
from datetime import datetime
import subprocess
from huggingface_hub import HfApi

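# Gradio app: crawl new Reddit submissions (praw.py), merge datasets (merge.py),
# and preview the newest files stored in a Hugging Face dataset repo.
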
def get_newest_file(repo_id, prefix):
    """Get the newest file with given prefix from HuggingFace repo"""
    api = HfApi()
    files = api.list_repo_files(repo_id, repo_type="dataset")

    relevant_files = [f for f in files if f.startswith(prefix)]

    if not relevant_files:
        return None

    # Parse the trailing date stamp (YYYYMMDD, just before the file extension)
    # out of each matching filename
    file_dates = []
    for filename in relevant_files:
        try:
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            # Skip filenames without a parseable date stamp
            continue

    if not file_dates:
        return None

    # Sort newest first and return the corresponding filename
    newest_file = sorted(file_dates, key=lambda x: x[0], reverse=True)[0][1]
    return newest_file

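# Hypothetical usage of get_newest_file (filename invented for illustration,
# assuming the crawler writes files named <prefix>_YYYYMMDD.csv):
#   get_newest_file("Vera-ZWY/reddite2024elections_submissions", "submissions/df_24")
#   -> "submissions/df_24_20241105.csv" if that date stamp is the most recent
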
def load_and_show_data(repo_id, file_path):
    """Load data from HuggingFace and return top 3 rows"""
    try:
        dataset = load_dataset(repo_id, data_files={'train': file_path}, split='train')
        df = pd.DataFrame(dataset)
        return df.head(3).to_string()
    except Exception as e:
        return f"Error loading data: {str(e)}"

def praw_new_data():
    """Execute praw.py and show the latest data"""
    try:
        # Run the crawler; check=True raises CalledProcessError on a non-zero exit
        subprocess.run([sys.executable, "praw.py"], check=True)

        repo_id = "Vera-ZWY/reddite2024elections_submissions"
        newest_file = get_newest_file(repo_id, "submissions/df_24")

        if newest_file:
            return f"Successfully crawled new data!\n\nTop 3 rows of new data:\n{load_and_show_data(repo_id, newest_file)}"
        else:
            return "Error: No crawled data files found"
    except subprocess.CalledProcessError as e:
        return f"Error executing praw.py: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"

def merge_data():
    """Execute merge.py and show the latest merged data"""
    try:
        # Run the merge script; check=True raises CalledProcessError on a non-zero exit
        subprocess.run([sys.executable, "merge.py"], check=True)

        repo_id = "Vera-ZWY/reddite2024elections_submissions"
        # Look up the newest merged file, mirroring load_latest_data() below
        newest_merged = get_newest_file(repo_id, "merged_reddit_data")

        if newest_merged:
            return f"Successfully merged data!\n\nTop 3 rows of merged data:\n{load_and_show_data(repo_id, newest_merged)}"
        else:
            return "Error: No merged data files found"
    except subprocess.CalledProcessError as e:
        return f"Error executing merge.py: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"

def load_latest_data():
    """Load and display the latest data from both sources"""
    repo_id = "Vera-ZWY/reddite2024elections_submissions"

    # Latest crawled data
    newest_crawled = get_newest_file(repo_id, "submissions/df_24")
    crawled_data = "No crawled data available"
    if newest_crawled:
        crawled_data = f"Latest crawled data ({newest_crawled}):\n{load_and_show_data(repo_id, newest_crawled)}"

    # Latest merged data
    newest_merged = get_newest_file(repo_id, "merged_reddit_data")
    merged_data = "No merged data available"
    if newest_merged:
        merged_data = f"Latest merged data ({newest_merged}):\n{load_and_show_data(repo_id, newest_merged)}"

    return f"{crawled_data}\n\n{merged_data}"

with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")

    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data")
            merge_button = gr.Button("Merge Data")

    # Pre-populate the results box with the latest data at launch
    output_text = gr.Textbox(label="Results", value=load_latest_data(), lines=20)

    praw_button.click(fn=praw_new_data, outputs=output_text)
    merge_button.click(fn=merge_data, outputs=output_text)

    gr.Markdown("""
    ### Instructions:
    1. Click 'Crawl New Data' to fetch new Reddit data
    2. Click 'Merge Data' to merge the latest datasets
    3. The results will show the top 3 rows of both the latest crawled and merged data
    """)


if __name__ == "__main__":
    iface.launch()
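
# To run locally (assuming this script is saved as app.py):
#   python app.py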