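"""Gradio app for crawling and merging Reddit 2024 election submissions.

praw.py crawls new submissions and merge.py merges datasets; both store
their output in a HuggingFace dataset repo, which this app previews.
"""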
import gradio as gr
from datasets import load_dataset
import pandas as pd
import sys
from datetime import datetime
import subprocess
from huggingface_hub import HfApi
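
# HuggingFace dataset repo where the crawled and merged files are stored
REPO_ID = "Vera-ZWY/reddite2024elections_submissions"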

def get_newest_file(repo_id, prefix):
    """Get the newest file with given prefix from HuggingFace repo"""
    api = HfApi()
    files = api.list_repo_files(repo_id, repo_type="dataset")
    # Filter files by prefix
    relevant_files = [f for f in files if f.startswith(prefix)]
    
    if not relevant_files:
        return None
    
    # Extract the date stamp from each filename (expected suffix: _YYYYMMDD)
    # so the files can be sorted newest-first below
    file_dates = []
    for filename in relevant_files:
        try:
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            continue
    
    if not file_dates:
        return None
    
    newest_file = sorted(file_dates, key=lambda x: x[0], reverse=True)[0][1]
    return newest_file
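
# Example (hypothetical filenames): given "submissions/df_24_20241105.csv" and
# "submissions/df_24_20241101.csv", get_newest_file(REPO_ID, "submissions/df_24")
# would return "submissions/df_24_20241105.csv".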

def load_and_show_data(repo_id, file_path):
    """Load data from HuggingFace and return top 3 rows"""
    try:
        dataset = load_dataset(repo_id, data_files={'train': file_path}, split='train')
        df = pd.DataFrame(dataset)
        return df.head(3).to_string()
    except Exception as e:
        return f"Error loading data: {str(e)}"

def praw_new_data():
    """Execute praw.py and show the latest data"""
    try:
        # Run the crawler script; check=True raises CalledProcessError on failure
        subprocess.run([sys.executable, "praw.py"], check=True)
        
        # Get the newest crawled file from the dataset repo
        newest_file = get_newest_file(REPO_ID, "submissions/df_24")
        
        if newest_file:
            return f"Successfully crawled new data!\n\nTop 3 rows of new data:\n{load_and_show_data(REPO_ID, newest_file)}"
        else:
            return "Error: No crawled data files found"
    except subprocess.CalledProcessError as e:
        return f"Error executing praw.py: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"

def merge_data():
    """Execute merge.py and show the latest merged data"""
    try:
        # Run the merge script; check=True raises CalledProcessError on failure
        subprocess.run([sys.executable, "merge.py"], check=True)
        
        # Get the newest merged file (same prefix that load_latest_data uses)
        newest_merged = get_newest_file(REPO_ID, "merged_reddit_data")
        
        if newest_merged:
            return f"Successfully merged data!\n\nTop 3 rows of merged data:\n{load_and_show_data(REPO_ID, newest_merged)}"
        else:
            return "Error: No merged data files found"
    except subprocess.CalledProcessError as e:
        return f"Error executing merge.py: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"

def load_latest_data():
    """Load and display the latest data from both sources"""
    # Get latest crawled data
    newest_crawled = get_newest_file(REPO_ID, "submissions/df_24")
    crawled_data = "No crawled data available"
    if newest_crawled:
        crawled_data = f"Latest crawled data ({newest_crawled}):\n{load_and_show_data(REPO_ID, newest_crawled)}"
    
    # Get latest merged data
    newest_merged = get_newest_file(REPO_ID, "merged_reddit_data")
    merged_data = "No merged data available"
    if newest_merged:
        merged_data = f"Latest merged data ({newest_merged}):\n{load_and_show_data(REPO_ID, newest_merged)}"
    
    return f"{crawled_data}\n\n{merged_data}"

# Create Gradio interface
with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")
    
    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data")
            merge_button = gr.Button("Merge Data")
    
    # Preload the textbox with the latest data so results appear at launch
    output_text = gr.Textbox(label="Results", value=load_latest_data(), lines=20)
    
    # Button click handlers
    praw_button.click(fn=praw_new_data, outputs=output_text)
    merge_button.click(fn=merge_data, outputs=output_text)
    
    gr.Markdown("""
    ### Instructions:
    1. Click 'Crawl New Data' to fetch new Reddit submissions via praw.py
    2. Click 'Merge Data' to merge the latest datasets via merge.py
    3. Each action shows the top 3 rows of the resulting dataset; on startup,
       the panel previews both the latest crawled and merged data
    """)

# Launch the interface
if __name__ == "__main__":
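    # Hint: outside HuggingFace Spaces, iface.launch(share=True) would expose
    # a temporary public link (a standard Gradio option, not specific to this app)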
    iface.launch()