Vera-ZWY committed
Commit 8b96174 • 1 Parent(s): 8050a0a

Update app.py

Files changed (1): app.py (+71, -51)

app.py CHANGED
@@ -2,22 +2,19 @@ import gradio as gr
 from datasets import load_dataset
 import pandas as pd
 import sys
-import os
-from datetime import datetime
 import subprocess
+from datetime import datetime
 from huggingface_hub import HfApi

 def get_newest_file(repo_id, prefix):
     """Get the newest file with given prefix from HuggingFace repo"""
     api = HfApi()
     files = api.list_repo_files(repo_id, repo_type="dataset")
-    # Filter files by prefix
     relevant_files = [f for f in files if f.startswith(prefix)]

     if not relevant_files:
         return None

-    # Sort files by modification time (newest first)
     file_dates = []
     for filename in relevant_files:
         try:
@@ -33,91 +30,114 @@ def get_newest_file(repo_id, prefix):
     newest_file = sorted(file_dates, key=lambda x: x[0], reverse=True)[0][1]
     return newest_file

-def load_and_show_data(repo_id, file_path):
-    """Load data from HuggingFace and return top 3 rows"""
+def load_data(repo_id, file_path):
+    """Load data from HuggingFace and return as DataFrame"""
     try:
         dataset = load_dataset(repo_id, data_files={'train': file_path}, split='train')
         df = pd.DataFrame(dataset)
-        return df.head(3).to_string()
+        return df.head(3)
     except Exception as e:
-        return f"Error loading data: {str(e)}"
+        return pd.DataFrame({'Error': [str(e)]})

 def praw_new_data():
     """Execute praw.py and show the latest data"""
     try:
         # Execute praw.py
         subprocess.run([sys.executable, "praw.py"], check=True)
-
-        # Get newest crawled file
-        repo_id = "Vera-ZWY/reddite2024elections_submissions"
-        newest_file = get_newest_file(repo_id, "submissions/df_24")
-
-        if newest_file:
-            return f"Successfully crawled new data!\n\nTop 3 rows of new data:\n{load_and_show_data(repo_id, newest_file)}"
-        else:
-            return "Error: No crawled data files found"
-    except subprocess.CalledProcessError as e:
-        return f"Error executing praw.py: {str(e)}"
+        success_message = "✅ Successfully crawled new data!"
     except Exception as e:
-        return f"Error: {str(e)}"
+        success_message = f"❌ Error executing praw.py: {str(e)}"
+
+    # Load and return latest data
+    repo_id = "Vera-ZWY/reddite2024elections_submissions"
+    newest_file = get_newest_file(repo_id, "submissions/df_")
+
+    if newest_file:
+        df = load_data(repo_id, newest_file)
+        return success_message, df, load_merged_data()[1]  # Return current merged data state
+    else:
+        return "No crawled data files found", pd.DataFrame(), load_merged_data()[1]

 def merge_data():
     """Execute merge.py and show the latest merged data"""
     try:
         # Execute merge.py
         subprocess.run([sys.executable, "merge.py"], check=True)
-
-        # Get newest merged file
-        repo_id = "Vera-ZWY/reddite2024elections_submissions"
-        old_merged = "submission/merged_reddit_data.csv"
-
-        if newest_merged:
-            return f"Successfully merged data!\n\nTop 3 rows of merged data:\n{load_and_show_data(repo_id, newest_merged)}"
-        else:
-            return "Error: No merged data files found"
-    except subprocess.CalledProcessError as e:
-        return f"Error executing merge.py: {str(e)}"
+        success_message = "✅ Successfully merged data!"
     except Exception as e:
-        return f"Error: {str(e)}"
+        success_message = f"❌ Error executing merge.py: {str(e)}"
+
+    # Load and return latest merged data
+    merged_df = load_merged_data()[1]
+    crawled_df = load_crawled_data()[1]
+    return success_message, crawled_df, merged_df

-def load_latest_data():
-    """Load and display the latest data from both sources"""
+def load_crawled_data():
+    """Load latest crawled data"""
     repo_id = "Vera-ZWY/reddite2024elections_submissions"
+    newest_file = get_newest_file(repo_id, "submissions/df_24")

-    # Get latest crawled data
-    newest_crawled = get_newest_file(repo_id, "submissions/df_24")
-    crawled_data = "No crawled data available"
-    if newest_crawled:
-        crawled_data = f"Latest crawled data ({newest_crawled}):\n{load_and_show_data(repo_id, newest_crawled)}"
+    if newest_file:
+        return f"Latest crawled data ({newest_file}):", load_data(repo_id, newest_file)
+    return "No crawled data available", pd.DataFrame()
+
+def load_merged_data():
+    """Load latest merged data"""
+    repo_id = "Vera-ZWY/reddite2024elections_submissions"
+    newest_merged = "submission/merged_reddit_data.csv"

-    # Get latest merged data
-    newest_merged = get_newest_file(repo_id, "merged_reddit_data")
-    merged_data = "No merged data available"
     if newest_merged:
-        merged_data = f"Latest merged data ({newest_merged}):\n{load_and_show_data(repo_id, newest_merged)}"
-
-    return f"{crawled_data}\n\n{merged_data}"
+        return f"Latest merged data ({newest_merged}):", load_data(repo_id, newest_merged)
+    return "No merged data available", pd.DataFrame()

 # Create Gradio interface
 with gr.Blocks(title="Reddit Data Processing") as iface:
     gr.Markdown("# Reddit Data Processing Interface")

+    # Status message for operations
+    status_text = gr.Textbox(label="Status", interactive=False)
+
+    with gr.Row():
+        with gr.Column():
+            praw_button = gr.Button("Crawl New Data", variant="primary")
+        with gr.Column():
+            merge_button = gr.Button("Merge Data", variant="primary")
+
     with gr.Row():
         with gr.Column():
-            praw_button = gr.Button("Crawl New Data")
-            merge_button = gr.Button("Merge Data")
+            gr.Markdown("### Latest Crawled Data (Top 3 Rows)")
+            crawled_table = gr.Dataframe(
+                headers=["title", "score", "id", "url", "comms_num", "created", "body", "subreddit"],
+                value=load_crawled_data()[1],
+                wrap=True
+            )

-    output_text = gr.Textbox(label="Results", value=load_latest_data(), lines=20)
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Latest Merged Data (Top 3 Rows)")
+            merged_table = gr.Dataframe(
+                headers=["title", "score", "id", "url", "num_comments", "created", "body", "content", "subreddit"],
+                value=load_merged_data()[1],
+                wrap=True
+            )

     # Button click handlers
-    praw_button.click(fn=praw_new_data, outputs=output_text)
-    merge_button.click(fn=merge_data, outputs=output_text)
+    praw_button.click(
+        fn=praw_new_data,
+        outputs=[status_text, crawled_table, merged_table]
+    )
+
+    merge_button.click(
+        fn=merge_data,
+        outputs=[status_text, crawled_table, merged_table]
+    )

     gr.Markdown("""
+    ## The full dataset is stored at https://huggingface.co/datasets/Vera-ZWY/reddite2024elections_submissions/
     ### Instructions:
     1. Click 'Crawl New Data' to fetch new Reddit data
     2. Click 'Merge Data' to merge the latest datasets
-    3. The results will show the top 3 rows of both the latest crawled and merged data
+    3. Tables will automatically update to show the latest data
     """)

     # Launch the interface
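A note on the collapsed hunk: the diff view hides the body of the try block inside get_newest_file (old lines 24-32), which evidently turns each relevant filename into a (date, filename) pair for the sorted(...) call that follows. Below is a minimal sketch of that idea, assuming filenames embed a date token such as submissions/df_20241105.csv; the filename pattern and the extract_date helper are hypothetical, not taken from this commit.

from datetime import datetime

def extract_date(filename):
    """Hypothetical helper: parse a YYYYMMDD token from a name like
    'submissions/df_20241105.csv'. The parsing logic actually committed
    is collapsed in the diff view and may differ."""
    stem = filename.rsplit("/", 1)[-1]               # 'df_20241105.csv'
    token = stem.removeprefix("df_").split(".")[0]   # '20241105'
    return datetime.strptime(token, "%Y%m%d")

# The hidden loop body then plausibly reads:
# for filename in relevant_files:
#     try:
#         file_dates.append((extract_date(filename), filename))
#     except ValueError:
#         continue  # skip filenames that carry no parseable date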
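The reworked handlers return a (status, crawled_df, merged_df) triple so that one click can refresh all three components named in outputs=[status_text, crawled_table, merged_table]. Here is a self-contained sketch of this standard Gradio pattern; the component and function names (demo, refresh) are illustrative, not from the commit.

import gradio as gr
import pandas as pd

def refresh():
    # A click handler returns one value per component listed in
    # `outputs`, in the same order.
    df = pd.DataFrame({"title": ["example"], "score": [42]})
    return "Refreshed successfully", df

with gr.Blocks() as demo:
    status = gr.Textbox(label="Status", interactive=False)
    table = gr.Dataframe(wrap=True)
    gr.Button("Refresh").click(fn=refresh, outputs=[status, table])

demo.launch()

Note how praw_new_data returns load_merged_data()[1] as its third value: re-sending the merged table's current contents keeps that table from being cleared when only the crawl runs.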
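The diff view also truncates the file right after the "# Launch the interface" comment on both sides; presumably the standard iface.launch() call follows, unchanged by this commit.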