Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -5,37 +5,13 @@ from huggingface_hub import HfApi
|
|
5 |
import io
|
6 |
from datetime import datetime, timedelta
|
7 |
import time
|
|
|
|
|
8 |
|
9 |
# Set page config for a wider layout and custom theme
|
10 |
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
|
11 |
|
12 |
-
#
|
13 |
-
st.markdown("""
|
14 |
-
<style>
|
15 |
-
.stApp {
|
16 |
-
background-color: #000000;
|
17 |
-
color: #FFFFFF;
|
18 |
-
}
|
19 |
-
.stButton>button {
|
20 |
-
background-color: #4e79a7;
|
21 |
-
color: white;
|
22 |
-
}
|
23 |
-
.stSelectbox, .stMultiSelect {
|
24 |
-
color: #FFFFFF;
|
25 |
-
}
|
26 |
-
.stDataFrame {
|
27 |
-
background-color: #1E1E1E;
|
28 |
-
}
|
29 |
-
.plotly-graph-div {
|
30 |
-
background-color: #1E1E1E;
|
31 |
-
}
|
32 |
-
.big-font {
|
33 |
-
font-size: 48px;
|
34 |
-
font-weight: bold;
|
35 |
-
text-align: center;
|
36 |
-
}
|
37 |
-
</style>
|
38 |
-
""", unsafe_allow_html=True)
|
39 |
|
40 |
# Hugging Face setup
|
41 |
HF_TOKEN = st.secrets["HF_TOKEN"]
|
@@ -52,7 +28,7 @@ def load_and_concat_data():
|
|
52 |
for file in csv_files:
|
53 |
try:
|
54 |
file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
|
55 |
-
df = pd.read_csv(file_content)
|
56 |
all_data.append(df)
|
57 |
except Exception:
|
58 |
pass # Silently skip files that can't be processed
|
@@ -72,13 +48,16 @@ def load_and_concat_data():
|
|
72 |
# Drop duplicates and rows with NaT in date_posted
|
73 |
filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
|
74 |
|
|
|
|
|
|
|
75 |
return filtered_df
|
76 |
|
77 |
@st.cache_data()
|
78 |
def get_unique_values(df):
|
79 |
return {
|
80 |
'companies': df['company'].unique(),
|
81 |
-
'locations': df['
|
82 |
'job_types': df['job_type'].unique()
|
83 |
}
|
84 |
|
@@ -93,6 +72,7 @@ def create_time_series(df):
|
|
93 |
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
|
94 |
return fig
|
95 |
|
|
|
96 |
def parse_locations(df):
|
97 |
valid_locations = [
|
98 |
"New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
|
@@ -116,11 +96,27 @@ def parse_locations(df):
|
|
116 |
"San Bernardino, CA", "Boise, ID", "Birmingham, AL"
|
117 |
]
|
118 |
|
119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
return df
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
def display_dashboard(df):
|
123 |
-
|
124 |
|
125 |
col1, col2 = st.columns(2)
|
126 |
|
@@ -135,23 +131,20 @@ def display_dashboard(df):
|
|
135 |
st.write(f"Job postings from {min_date} to {max_date}")
|
136 |
|
137 |
with col2:
|
138 |
-
top_companies = df['company'].value_counts().head(10)
|
139 |
fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
|
140 |
st.plotly_chart(fig, use_container_width=True)
|
141 |
|
142 |
# Job Postings Over Time Chart
|
143 |
-
fig_time_series = create_time_series(
|
144 |
st.plotly_chart(fig_time_series, use_container_width=True)
|
145 |
|
146 |
col3, col4 = st.columns(2)
|
147 |
|
148 |
with col3:
|
149 |
-
top_locations = df['parsed_location'].value_counts().head(10)
|
150 |
fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
|
151 |
st.plotly_chart(fig, use_container_width=True)
|
152 |
|
153 |
with col4:
|
154 |
-
top_job_titles = df['title'].value_counts().head(20)
|
155 |
fig = create_chart(top_job_titles, top_job_titles.index, top_job_titles.values, "Top 20 Job Titles", ['#59a14f'])
|
156 |
st.plotly_chart(fig, use_container_width=True)
|
157 |
|
@@ -177,7 +170,7 @@ def display_data_explorer(df):
|
|
177 |
with col1:
|
178 |
companies = st.multiselect("Select Companies", options=unique_values['companies'])
|
179 |
with col2:
|
180 |
-
locations = st.multiselect("Select Locations", options=
|
181 |
with col3:
|
182 |
job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
|
183 |
|
@@ -204,8 +197,6 @@ def main():
|
|
204 |
st.error("No data available. Please check your dataset.")
|
205 |
return
|
206 |
|
207 |
-
df = parse_locations(df)
|
208 |
-
|
209 |
# Sidebar for navigation
|
210 |
st.sidebar.title("Navigation")
|
211 |
page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])
|
|
|
5 |
import io
|
6 |
from datetime import datetime, timedelta
|
7 |
import time
|
8 |
+
import pyarrow as pa
|
9 |
+
import pyarrow.parquet as pq
|
10 |
|
11 |
# Set page config for a wider layout and custom theme
|
12 |
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
|
13 |
|
14 |
+
# [Your existing CSS styling code remains unchanged]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
# Hugging Face setup
|
17 |
HF_TOKEN = st.secrets["HF_TOKEN"]
|
|
|
28 |
for file in csv_files:
|
29 |
try:
|
30 |
file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
|
31 |
+
df = pd.read_csv(file_content, engine='pyarrow')
|
32 |
all_data.append(df)
|
33 |
except Exception:
|
34 |
pass # Silently skip files that can't be processed
|
|
|
48 |
# Drop duplicates and rows with NaT in date_posted
|
49 |
filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
|
50 |
|
51 |
+
# Parse locations
|
52 |
+
filtered_df = parse_locations(filtered_df)
|
53 |
+
|
54 |
return filtered_df
|
55 |
|
56 |
@st.cache_data()
|
57 |
def get_unique_values(df):
|
58 |
return {
|
59 |
'companies': df['company'].unique(),
|
60 |
+
'locations': df['parsed_location'].unique(),
|
61 |
'job_types': df['job_type'].unique()
|
62 |
}
|
63 |
|
|
|
72 |
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
|
73 |
return fig
|
74 |
|
75 |
+
@st.cache_data
|
76 |
def parse_locations(df):
|
77 |
valid_locations = [
|
78 |
"New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
|
|
|
96 |
"San Bernardino, CA", "Boise, ID", "Birmingham, AL"
|
97 |
]
|
98 |
|
99 |
+
# Handle NaN and non-string types before parsing
|
100 |
+
df['location'] = df['location'].fillna('').astype(str)
|
101 |
+
|
102 |
+
def parse_location(x):
|
103 |
+
if pd.isna(x) or not isinstance(x, str):
|
104 |
+
return 'Other'
|
105 |
+
return next((loc for loc in valid_locations if loc in x), 'Other')
|
106 |
+
|
107 |
+
df['parsed_location'] = df['location'].apply(parse_location)
|
108 |
return df
|
109 |
|
110 |
+
@st.cache_data
|
111 |
+
def prepare_dashboard_data(df):
|
112 |
+
top_companies = df['company'].value_counts().head(10)
|
113 |
+
top_locations = df['parsed_location'].value_counts().head(10)
|
114 |
+
top_job_titles = df['title'].value_counts().head(20)
|
115 |
+
df_by_date = df.groupby('date_posted').size().reset_index(name='count')
|
116 |
+
return top_companies, top_locations, top_job_titles, df_by_date
|
117 |
+
|
118 |
def display_dashboard(df):
|
119 |
+
top_companies, top_locations, top_job_titles, df_by_date = prepare_dashboard_data(df)
|
120 |
|
121 |
col1, col2 = st.columns(2)
|
122 |
|
|
|
131 |
st.write(f"Job postings from {min_date} to {max_date}")
|
132 |
|
133 |
with col2:
|
|
|
134 |
fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
|
135 |
st.plotly_chart(fig, use_container_width=True)
|
136 |
|
137 |
# Job Postings Over Time Chart
|
138 |
+
fig_time_series = create_time_series(df_by_date)
|
139 |
st.plotly_chart(fig_time_series, use_container_width=True)
|
140 |
|
141 |
col3, col4 = st.columns(2)
|
142 |
|
143 |
with col3:
|
|
|
144 |
fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
|
145 |
st.plotly_chart(fig, use_container_width=True)
|
146 |
|
147 |
with col4:
|
|
|
148 |
fig = create_chart(top_job_titles, top_job_titles.index, top_job_titles.values, "Top 20 Job Titles", ['#59a14f'])
|
149 |
st.plotly_chart(fig, use_container_width=True)
|
150 |
|
|
|
170 |
with col1:
|
171 |
companies = st.multiselect("Select Companies", options=unique_values['companies'])
|
172 |
with col2:
|
173 |
+
locations = st.multiselect("Select Locations", options=unique_values['locations'])
|
174 |
with col3:
|
175 |
job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
|
176 |
|
|
|
197 |
st.error("No data available. Please check your dataset.")
|
198 |
return
|
199 |
|
|
|
|
|
200 |
# Sidebar for navigation
|
201 |
st.sidebar.title("Navigation")
|
202 |
page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])
|