Spaces:

Niharmahesh
/

job_easz

Running

App Files Files Community

Niharmahesh committed on Oct 26, 2024

Commit

5ca4095

verified ·

1 Parent(s): 44680e5

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -38

app.py CHANGED Viewed

@@ -5,37 +5,13 @@ from huggingface_hub import HfApi
 import io
 from datetime import datetime, timedelta
 import time
 # Set page config for a wider layout and custom theme
 st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
-# Custom CSS for black background and styling
-st.markdown("""
-<style>
-    .stApp {
-        background-color: #000000;
-        color: #FFFFFF;
-    }
-    .stButton>button {
-        background-color: #4e79a7;
-        color: white;
-    }
-    .stSelectbox, .stMultiSelect {
-        color: #FFFFFF;
-    }
-    .stDataFrame {
-        background-color: #1E1E1E;
-    }
-    .plotly-graph-div {
-        background-color: #1E1E1E;
-    }
-    .big-font {
-        font-size: 48px;
-        font-weight: bold;
-        text-align: center;
-    }
-</style>
-""", unsafe_allow_html=True)
 # Hugging Face setup
 HF_TOKEN = st.secrets["HF_TOKEN"]
@@ -52,7 +28,7 @@ def load_and_concat_data():
     for file in csv_files:
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = pd.read_csv(file_content)
             all_data.append(df)
         except Exception:
             pass  # Silently skip files that can't be processed
@@ -72,13 +48,16 @@ def load_and_concat_data():
     # Drop duplicates and rows with NaT in date_posted
     filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
     return filtered_df
 @st.cache_data()
 def get_unique_values(df):
     return {
         'companies': df['company'].unique(),
-        'locations': df['location'].unique(),
         'job_types': df['job_type'].unique()
     }
@@ -93,6 +72,7 @@ def create_time_series(df):
     fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
     return fig
 def parse_locations(df):
     valid_locations = [
         "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
@@ -116,11 +96,27 @@ def parse_locations(df):
         "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
     ]
-    df['parsed_location'] = df['location'].apply(lambda x: next((loc for loc in valid_locations if loc in x), 'Other'))
     return df
 def display_dashboard(df):
-    df = parse_locations(df)
     col1, col2 = st.columns(2)
@@ -135,23 +131,20 @@ def display_dashboard(df):
         st.write(f"Job postings from {min_date} to {max_date}")
     with col2:
-        top_companies = df['company'].value_counts().head(10)
         fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
         st.plotly_chart(fig, use_container_width=True)
     # Job Postings Over Time Chart
-    fig_time_series = create_time_series(df)
     st.plotly_chart(fig_time_series, use_container_width=True)
     col3, col4 = st.columns(2)
     with col3:
-        top_locations = df['parsed_location'].value_counts().head(10)
         fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
         st.plotly_chart(fig, use_container_width=True)
     with col4:
-        top_job_titles = df['title'].value_counts().head(20)
         fig = create_chart(top_job_titles, top_job_titles.index, top_job_titles.values, "Top 20 Job Titles", ['#59a14f'])
         st.plotly_chart(fig, use_container_width=True)
@@ -177,7 +170,7 @@ def display_data_explorer(df):
         with col1:
             companies = st.multiselect("Select Companies", options=unique_values['companies'])
         with col2:
-            locations = st.multiselect("Select Locations", options=df['parsed_location'].unique())
         with col3:
             job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
@@ -204,8 +197,6 @@ def main():
         st.error("No data available. Please check your dataset.")
         return
-    df = parse_locations(df)
     # Sidebar for navigation
     st.sidebar.title("Navigation")
     page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])

 import io
 from datetime import datetime, timedelta
 import time
+import pyarrow as pa
+import pyarrow.parquet as pq
 # Set page config for a wider layout and custom theme
 st.set_page_config(layout="wide", page_title="Job Listings Dashboard")
+# NOTE: the custom CSS styling block (black background, button/dataframe styling) was removed in this commit
 # Hugging Face setup
 HF_TOKEN = st.secrets["HF_TOKEN"]
     for file in csv_files:
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
+            df = pd.read_csv(file_content, engine='pyarrow')
             all_data.append(df)
         except Exception:
             pass  # Silently skip files that can't be processed
     # Drop duplicates and rows with NaT in date_posted
     filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
+    # Parse locations
+    filtered_df = parse_locations(filtered_df)
     return filtered_df
 @st.cache_data()
 def get_unique_values(df):
     return {
         'companies': df['company'].unique(),
+        'locations': df['parsed_location'].unique(),
         'job_types': df['job_type'].unique()
     }
     fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', font_color='#FFFFFF')
     return fig
+@st.cache_data
 def parse_locations(df):
     valid_locations = [
         "New York, NY", "San Francisco, CA", "Los Angeles, CA", "Chicago, IL", "Houston, TX",
         "San Bernardino, CA", "Boise, ID", "Birmingham, AL"
     ]
+    # Handle NaN and non-string types before parsing
+    df['location'] = df['location'].fillna('').astype(str)
+    def parse_location(x):
+        if pd.isna(x) or not isinstance(x, str):
+            return 'Other'
+        return next((loc for loc in valid_locations if loc in x), 'Other')
+    df['parsed_location'] = df['location'].apply(parse_location)
     return df
+@st.cache_data
+def prepare_dashboard_data(df):
+    top_companies = df['company'].value_counts().head(10)
+    top_locations = df['parsed_location'].value_counts().head(10)
+    top_job_titles = df['title'].value_counts().head(20)
+    df_by_date = df.groupby('date_posted').size().reset_index(name='count')
+    return top_companies, top_locations, top_job_titles, df_by_date
 def display_dashboard(df):
+    top_companies, top_locations, top_job_titles, df_by_date = prepare_dashboard_data(df)
     col1, col2 = st.columns(2)
         st.write(f"Job postings from {min_date} to {max_date}")
     with col2:
         fig = create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7'])
         st.plotly_chart(fig, use_container_width=True)
     # Job Postings Over Time Chart
+    fig_time_series = create_time_series(df_by_date)
     st.plotly_chart(fig_time_series, use_container_width=True)
     col3, col4 = st.columns(2)
     with col3:
         fig = create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b'])
         st.plotly_chart(fig, use_container_width=True)
     with col4:
         fig = create_chart(top_job_titles, top_job_titles.index, top_job_titles.values, "Top 20 Job Titles", ['#59a14f'])
         st.plotly_chart(fig, use_container_width=True)
         with col1:
             companies = st.multiselect("Select Companies", options=unique_values['companies'])
         with col2:
+            locations = st.multiselect("Select Locations", options=unique_values['locations'])
         with col3:
             job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
         st.error("No data available. Please check your dataset.")
         return
     # Sidebar for navigation
     st.sidebar.title("Navigation")
     page = st.sidebar.radio("Go to", ["Dashboard", "Data Explorer"])