Vera-ZWY committed on
Commit
a725a50
1 Parent(s): 0ea196b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -0
app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from datasets import load_dataset
3
+ import pandas as pd
4
+ import sys
5
+ import os
6
+ from datetime import datetime
7
+ import subprocess
8
+ from huggingface_hub import HfApi
9
+
10
def get_newest_file(repo_id, prefix):
    """Return the most recently dated file under *prefix* in a Hub dataset repo.

    File names are expected to carry a trailing ``_YYYYMMDD`` stamp before the
    extension; names that do not parse are skipped. Returns None when no file
    matches the prefix or none carries a parseable date.
    """
    api = HfApi()
    all_files = api.list_repo_files(repo_id, repo_type="dataset")

    candidates = [name for name in all_files if name.startswith(prefix)]
    if not candidates:
        return None

    # Pair each parseable file with the date embedded in its name.
    dated = []
    for name in candidates:
        try:
            stamp = name.split('_')[-1].split('.')[0]
            dated.append((datetime.strptime(stamp, '%Y%m%d'), name))
        except (IndexError, ValueError):
            # No date stamp in this file name — ignore it.
            continue

    if not dated:
        return None

    # Newest date wins; max() keeps the first occurrence on ties, matching a
    # stable reverse sort of the same pairs.
    return max(dated, key=lambda pair: pair[0])[1]
36
def load_and_show_data(repo_id, file_path):
    """Load one data file from a Hub dataset repo and render its top 3 rows.

    Returns the preview as a plain string; on any failure, returns an error
    message instead of raising.
    """
    try:
        ds = load_dataset(repo_id, data_files={'train': file_path}, split='train')
        frame = pd.DataFrame(ds)
        return frame.head(3).to_string()
    except Exception as e:
        return f"Error loading data: {str(e)}"
45
def praw_new_data():
    """Run praw.py to crawl fresh Reddit data, then preview the newest file.

    Returns a status string: a preview of the top rows on success, or an
    error message on failure.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    try:
        # Crawl with the same interpreter that runs this app.
        subprocess.run([sys.executable, "praw.py"], check=True)

        newest_file = get_newest_file(repo_id, "submissions/df_24")
        if not newest_file:
            return "Error: No crawled data files found"
        return f"Successfully crawled new data!\n\nTop 3 rows of new data:\n{load_and_show_data(repo_id, newest_file)}"
    except subprocess.CalledProcessError as e:
        return f"Error executing praw.py: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"
64
def merge_data():
    """Execute merge.py and show the latest merged data.

    Returns a status string: a preview of the merged file's top rows on
    success, or an error message on failure.
    """
    try:
        # Run the merge step with the same interpreter that runs this app.
        subprocess.run([sys.executable, "merge.py"], check=True)

        repo_id = "Vera-ZWY/reddite2024elections_submissions"
        # BUG FIX: this value was previously bound to `old_merged` while the
        # check below read the undefined name `newest_merged`, so every call
        # raised NameError (swallowed by the broad except below).
        # NOTE(review): path says "submission/" while the crawler uses
        # "submissions/" — confirm which directory merge.py writes to.
        newest_merged = "submission/merged_reddit_data.csv"

        if newest_merged:
            return f"Successfully merged data!\n\nTop 3 rows of merged data:\n{load_and_show_data(repo_id, newest_merged)}"
        else:
            return "Error: No merged data files found"
    except subprocess.CalledProcessError as e:
        return f"Error executing merge.py: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"
83
def load_latest_data():
    """Build a combined preview of the newest crawled and newest merged data.

    Each section is either a labelled top-3-rows preview or a fallback
    "not available" message; the two sections are joined by a blank line.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    sections = []

    # Section 1: latest crawled file.
    newest_crawled = get_newest_file(repo_id, "submissions/df_24")
    if newest_crawled:
        sections.append(
            f"Latest crawled data ({newest_crawled}):\n{load_and_show_data(repo_id, newest_crawled)}"
        )
    else:
        sections.append("No crawled data available")

    # Section 2: latest merged file.
    newest_merged = get_newest_file(repo_id, "merged_reddit_data")
    if newest_merged:
        sections.append(
            f"Latest merged data ({newest_merged}):\n{load_and_show_data(repo_id, newest_merged)}"
        )
    else:
        sections.append("No merged data available")

    return "\n\n".join(sections)
101
# Create Gradio interface.
# Layout: two action buttons stacked in one column, above a shared results box.
with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")

    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data")
            merge_button = gr.Button("Merge Data")

    # NOTE(review): load_latest_data() runs at import time to pre-fill the
    # textbox, which contacts the HuggingFace Hub before the UI launches —
    # confirm this startup cost is acceptable.
    output_text = gr.Textbox(label="Results", value=load_latest_data(), lines=20)

    # Button click handlers — both actions write their status text into the
    # same shared results box.
    praw_button.click(fn=praw_new_data, outputs=output_text)
    merge_button.click(fn=merge_data, outputs=output_text)

    gr.Markdown("""
    ### Instructions:
    1. Click 'Crawl New Data' to fetch new Reddit data
    2. Click 'Merge Data' to merge the latest datasets
    3. The results will show the top 3 rows of both the latest crawled and merged data
    """)

# Launch the interface only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()