File size: 4,441 Bytes
a725a50 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import gradio as gr
from datasets import load_dataset
import pandas as pd
import sys
import os
from datetime import datetime
import subprocess
from huggingface_hub import HfApi
def get_newest_file(repo_id, prefix):
    """Return the most recently dated file under ``prefix`` in a HF dataset repo.

    The date is taken from the file *name*, not from repository metadata:
    the text after the last underscore and before the first following dot
    must parse as ``YYYYMMDD``. Files whose names do not carry such a date
    are ignored.

    Args:
        repo_id: Dataset repository identifier passed to ``list_repo_files``.
        prefix: Only repo paths starting with this string are considered.

    Returns:
        The matching repo path with the latest date, or ``None`` when no
        path matches the prefix or none of the matches has a parsable date.
    """
    repo_files = HfApi().list_repo_files(repo_id, repo_type="dataset")

    def _name_date(path):
        # e.g. "dir/df_24_20241105.csv" -> "20241105"
        token = path.rsplit('_', 1)[-1].split('.', 1)[0]
        try:
            return datetime.strptime(token, '%Y%m%d')
        except ValueError:
            return None

    dated = []
    for path in repo_files:
        if not path.startswith(prefix):
            continue
        stamp = _name_date(path)
        if stamp is not None:
            dated.append((stamp, path))

    if not dated:
        return None
    # On equal dates the first listed path wins, as with a stable sort.
    return max(dated, key=lambda pair: pair[0])[1]
def load_and_show_data(repo_id, file_path):
    """Load one file of a HF dataset repo and render its first three rows.

    Args:
        repo_id: Dataset repository identifier passed to ``load_dataset``.
        file_path: Path of the data file inside the repo; loaded as the
            ``train`` split.

    Returns:
        The first three rows formatted by ``DataFrame.to_string()``, or a
        string starting with ``"Error loading data: "`` if anything fails.
        This function never raises.
    """
    try:
        frame = pd.DataFrame(
            load_dataset(repo_id, data_files={'train': file_path}, split='train')
        )
        preview = frame.head(3).to_string()
    except Exception as err:
        # Deliberately broad: the result is shown in the UI, so any failure
        # is reported as text instead of propagating.
        return f"Error loading data: {str(err)}"
    return preview
def praw_new_data():
    """Run ``praw.py`` and return a preview of the newest crawled file.

    Returns:
        A success message followed by the top three rows of the newest
        ``submissions/df_24*`` file, or an error message string if the
        script exits non-zero, no dated file is found, or any other
        exception occurs. This function never raises.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    try:
        # Run the crawler with the same interpreter as this app;
        # check=True turns a non-zero exit into CalledProcessError.
        subprocess.run([sys.executable, "praw.py"], check=True)

        latest = get_newest_file(repo_id, "submissions/df_24")
        if not latest:
            return "Error: No crawled data files found"

        preview = load_and_show_data(repo_id, latest)
        return f"Successfully crawled new data!\n\nTop 3 rows of new data:\n{preview}"
    except subprocess.CalledProcessError as err:
        return f"Error executing praw.py: {str(err)}"
    except Exception as err:
        return f"Error: {str(err)}"
def merge_data():
    """Run ``merge.py`` and return a preview of the merged dataset.

    Returns:
        A success message followed by the top three rows of the merged
        file, or an error message string if the script exits non-zero or
        any other exception occurs. This function never raises.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    # The merged dataset is stored at a fixed path with no date suffix, so
    # it is addressed directly. The previous code assigned this path to an
    # unused variable and then read an undefined name, which made every
    # successful merge report a NameError through the generic handler.
    merged_file = "submission/merged_reddit_data.csv"
    try:
        # Run the merge script with the same interpreter as this app;
        # check=True turns a non-zero exit into CalledProcessError.
        subprocess.run([sys.executable, "merge.py"], check=True)

        preview = load_and_show_data(repo_id, merged_file)
        return f"Successfully merged data!\n\nTop 3 rows of merged data:\n{preview}"
    except subprocess.CalledProcessError as e:
        return f"Error executing merge.py: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"
def load_latest_data():
    """Build a text summary of the newest crawled and merged data.

    Returns:
        Two sections separated by a blank line: the crawled section first,
        then the merged section. Each section is either a header naming
        the file plus its top three rows, or a "No ... data available"
        placeholder when no dated file is found for that source.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    sections = []

    # Crawled submissions
    crawled = get_newest_file(repo_id, "submissions/df_24")
    if crawled:
        sections.append(
            f"Latest crawled data ({crawled}):\n{load_and_show_data(repo_id, crawled)}"
        )
    else:
        sections.append("No crawled data available")

    # Merged dataset
    # NOTE(review): this prefix has no directory component and
    # get_newest_file only accepts names ending in _YYYYMMDD, while
    # merge_data refers to "submission/merged_reddit_data.csv" -- this
    # lookup may never match; confirm against the repo layout.
    merged = get_newest_file(repo_id, "merged_reddit_data")
    if merged:
        sections.append(
            f"Latest merged data ({merged}):\n{load_and_show_data(repo_id, merged)}"
        )
    else:
        sections.append("No merged data available")

    return "\n\n".join(sections)
# Create Gradio interface
# Components are created in this order: page title, a row/column holding the
# two action buttons, a shared results textbox, and a short usage guide.
# NOTE(review): the indentation of this section was lost in the source as
# received; the nesting below (buttons inside Row/Column, everything else
# directly under Blocks) is a reconstruction -- confirm against the original.
with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")
    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data")
            merge_button = gr.Button("Merge Data")
    # load_latest_data() is called right here, while the layout is being
    # built, so the initial text is computed once at that point.
    output_text = gr.Textbox(label="Results", value=load_latest_data(), lines=20)
    # Button click handlers
    # Both handlers take no inputs; the string each returns replaces the
    # contents of output_text.
    praw_button.click(fn=praw_new_data, outputs=output_text)
    merge_button.click(fn=merge_data, outputs=output_text)
    # NOTE(review): item 3 below says both datasets are shown, but each button
    # handler returns a preview of only its own dataset.
    gr.Markdown("""
### Instructions:
1. Click 'Crawl New Data' to fetch new Reddit data
2. Click 'Merge Data' to merge the latest datasets
3. The results will show the top 3 rows of both the latest crawled and merged data
""")
# Launch the interface
if __name__ == "__main__":
    # Start the Gradio app only when this file is run as a script, not when
    # it is imported. The stray trailing "|" after the call was removed
    # because it made the statement a syntax error.
    iface.launch()