Niharmahesh committed on
Commit
5ca4095
·
verified ·
1 Parent(s): 44680e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -38
app.py CHANGED
@@ -5,37 +5,13 @@ from huggingface_hub import HfApi
5
  import io
6
  from datetime import datetime, timedelta
7
  import time
 
 
8
 
9
  # Set page config for a wider layout and custom theme
10
  st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
11
 
12
- # Custom CSS for black background and styling
13
- st.markdown("""
14
- <style>
15
- .stApp {
16
- background-color: #000000;
17
- color: #FFFFFF;
18
- }
19
- .stButton>button {
20
- background-color: #4e79a7;
21
- color: white;
22
- }
23
- .stSelectbox, .stMultiSelect {
24
- color: #FFFFFF;
25
- }
26
- .stDataFrame {
27
- background-color: #1E1E1E;
28
- }
29
- .plotly-graph-div {
30
- background-color: #1E1E1E;
31
- }
32
- .big-font {
33
- font-size: 48px;
34
- font-weight: bold;
35
- text-align: center;
36
- }
37
- </style>
38
- """, unsafe_allow_html=True)
39
 
40
  # Hugging Face setup
41
  HF_TOKEN = st.secrets["HF_TOKEN"]
@@ -52,7 +28,7 @@ def load_and_concat_data():
52
  for file in csv_files:
53
  try:
54
  file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
55
- df = pd.read_csv(file_content)
56
  all_data.append(df)
57
  except Exception:
58
  pass # Silently skip files that can't be processed
@@ -72,13 +48,16 @@ def load_and_concat_data():
72
  # Drop duplicates and rows with NaT in date_posted
73
  filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
74
 
 
 
 
75
  return filtered_df
76
 
77
  @st.cache_data()
78
  def get_unique_values(df):
79
  return {
80
  'companies': df['company'].unique(),
81
- 'locations': df['location'].unique(),
82
  'job_types': df['job_type'].unique()
83
  }
84
 
@@ -93,6 +72,7 @@ def create_time_series(df):
93
  fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
94
  return fig
95
 
 
96
  def parse_locations(df):
97
  valid_locations = [
98
  "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
@@ -116,11 +96,27 @@ def parse_locations(df):
116
  "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
117
  ]
118
 
119
- df['parsed_location'] = df['location'].apply(lambda x: next((loc for loc in valid_locations if loc in x), 'Other'))
 
 
 
 
 
 
 
 
120
  return df
121
 
 
 
 
 
 
 
 
 
122
  def display_dashboard(df):
123
- df = parse_locations(df)
124
 
125
  col1, col2 = st.columns(2)
126
 
@@ -135,23 +131,20 @@ def display_dashboard(df):
135
  st.write(f"Job postings from {min_date} to {max_date}")
136
 
137
  with col2:
138
- top_companies = df['company'].value_counts().head(10)
139
  fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
140
  st.plotly_chart(fig, use_container_width=True)
141
 
142
  # Job Postings Over Time Chart
143
- fig_time_series = create_time_series(df)
144
  st.plotly_chart(fig_time_series, use_container_width=True)
145
 
146
  col3, col4 = st.columns(2)
147
 
148
  with col3:
149
- top_locations = df['parsed_location'].value_counts().head(10)
150
  fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
151
  st.plotly_chart(fig, use_container_width=True)
152
 
153
  with col4:
154
- top_job_titles = df['title'].value_counts().head(20)
155
  fig = create_chart(top_job_titles, top_job_titles.index, top_job_titles.values, "Top 20 Job Titles", ['#59a14f'])
156
  st.plotly_chart(fig, use_container_width=True)
157
 
@@ -177,7 +170,7 @@ def display_data_explorer(df):
177
  with col1:
178
  companies = st.multiselect("Select Companies", options=unique_values['companies'])
179
  with col2:
180
- locations = st.multiselect("Select Locations", options=df['parsed_location'].unique())
181
  with col3:
182
  job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
183
 
@@ -204,8 +197,6 @@ def main():
204
  st.error("No data available. Please check your dataset.")
205
  return
206
 
207
- df = parse_locations(df)
208
-
209
  # Sidebar for navigation
210
  st.sidebar.title("Navigation")
211
  page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])
 
5
  import io
6
  from datetime import datetime, timedelta
7
  import time
8
+ import pyarrow as pa
9
+ import pyarrow.parquet as pq
10
 
11
  # Set page config for a wider layout and custom theme
12
  st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
13
 
14
+ # [Your existing CSS styling code remains unchanged]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  # Hugging Face setup
17
  HF_TOKEN = st.secrets["HF_TOKEN"]
 
28
  for file in csv_files:
29
  try:
30
  file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
31
+ df = pd.read_csv(file_content, engine='pyarrow')
32
  all_data.append(df)
33
  except Exception:
34
  pass # Silently skip files that can't be processed
 
48
  # Drop duplicates and rows with NaT in date_posted
49
  filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
50
 
51
+ # Parse locations
52
+ filtered_df = parse_locations(filtered_df)
53
+
54
  return filtered_df
55
 
56
  @st.cache_data()
57
  def get_unique_values(df):
58
  return {
59
  'companies': df['company'].unique(),
60
+ 'locations': df['parsed_location'].unique(),
61
  'job_types': df['job_type'].unique()
62
  }
63
 
 
72
  fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
73
  return fig
74
 
75
+ @st.cache_data
76
  def parse_locations(df):
77
  valid_locations = [
78
  "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
 
96
  "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
97
  ]
98
 
99
+ # Handle NaN and non-string types before parsing
100
+ df['location'] = df['location'].fillna('').astype(str)
101
+
102
+ def parse_location(x):
103
+ if pd.isna(x) or not isinstance(x, str):
104
+ return 'Other'
105
+ return next((loc for loc in valid_locations if loc in x), 'Other')
106
+
107
+ df['parsed_location'] = df['location'].apply(parse_location)
108
  return df
109
 
110
+ @st.cache_data
111
+ def prepare_dashboard_data(df):
112
+ top_companies = df['company'].value_counts().head(10)
113
+ top_locations = df['parsed_location'].value_counts().head(10)
114
+ top_job_titles = df['title'].value_counts().head(20)
115
+ df_by_date = df.groupby('date_posted').size().reset_index(name='count')
116
+ return top_companies, top_locations, top_job_titles, df_by_date
117
+
118
  def display_dashboard(df):
119
+ top_companies, top_locations, top_job_titles, df_by_date = prepare_dashboard_data(df)
120
 
121
  col1, col2 = st.columns(2)
122
 
 
131
  st.write(f"Job postings from {min_date} to {max_date}")
132
 
133
  with col2:
 
134
  fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
135
  st.plotly_chart(fig, use_container_width=True)
136
 
137
  # Job Postings Over Time Chart
138
+ fig_time_series = create_time_series(df_by_date)
139
  st.plotly_chart(fig_time_series, use_container_width=True)
140
 
141
  col3, col4 = st.columns(2)
142
 
143
  with col3:
 
144
  fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
145
  st.plotly_chart(fig, use_container_width=True)
146
 
147
  with col4:
 
148
  fig = create_chart(top_job_titles, top_job_titles.index, top_job_titles.values, "Top 20 Job Titles", ['#59a14f'])
149
  st.plotly_chart(fig, use_container_width=True)
150
 
 
170
  with col1:
171
  companies = st.multiselect("Select Companies", options=unique_values['companies'])
172
  with col2:
173
+ locations = st.multiselect("Select Locations", options=unique_values['locations'])
174
  with col3:
175
  job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
176
 
 
197
  st.error("No data available. Please check your dataset.")
198
  return
199
 
 
 
200
  # Sidebar for navigation
201
  st.sidebar.title("Navigation")
202
  page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])