Spaces:
Sleeping
Sleeping
File size: 5,616 Bytes
c1b3dbf d857800 e50d04a 888756a e50d04a d857800 ca2055e d857800 ca2055e d857800 ca2055e d857800 ca2055e d857800 ca2055e d857800 ca2055e ec8154b 888756a e50d04a d857800 ec8154b 888756a ec8154b 888756a e50d04a 888756a e9f4868 888756a e9f4868 888756a e9f4868 888756a d857800 888756a ca2055e 888756a e9f4868 ca89ad8 e9f4868 888756a e9f4868 888756a 9d8ab78 888756a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
# Load and clean data
def load_and_clean_data(file_path):
    """Read a CSV file and replace every missing value with a placeholder.

    Columns of dtype float64/int64 receive -1, datetime64[ns] columns receive
    1970-01-01, and anything still missing afterwards becomes the string
    "Unknown".

    Args:
        file_path: Location of the CSV file; handed straight to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame after all three fill passes.
    """
    frame = pd.read_csv(file_path)

    # (dtype selector, placeholder) pairs, applied in this order.
    typed_placeholders = (
        (['float64', 'int64'], -1),
        (['datetime64[ns]'], pd.Timestamp("1970-01-01")),
    )
    for dtypes, placeholder in typed_placeholders:
        columns = frame.select_dtypes(include=dtypes).columns
        frame[columns] = frame[columns].fillna(placeholder)

    # Whatever the typed passes did not cover gets a text marker.
    frame = frame.fillna("Unknown")
    return frame
# Load dataset
# The CSV is read once, when this module is executed; `data` is the
# module-level frame that filter_data() and the dropdown option lists read.
DATA_PATH = "TRK_13139_FY2021_2023.csv"  # relative path: resolved against the current working directory
data = load_and_clean_data(DATA_PATH)
def _is_unset(choice):
    """Return True when a dropdown value means "do not filter on this field"."""
    return choice is None or choice == "" or choice == "All"


def _parse_salary(raw):
    """Turn a salary bound into a float, or None when no bound was entered.

    Raises:
        ValueError: If the text is not a number after removing "$", "," and
            surrounding whitespace.
    """
    if raw is None:
        return None
    if isinstance(raw, str):
        # Accept inputs such as " $100,000 " typed into a free-text box.
        raw = raw.replace("$", "").replace(",", "").strip()
        if not raw:
            return None
    return float(raw)


# Function to filter the dataset
def filter_data(fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state, dataset=None):
    """Return the rows that satisfy every active filter.

    A categorical filter is inactive when its value is "All", None or an
    empty string; a salary bound is inactive when it is None or blank.

    Args:
        fiscal_year: Year to keep, as a number or text ("2021" and "2021.0"
            are both accepted), compared against the 'fiscal_year' column.
        employer: Exact value required in 'employer_name'.
        job_title: Exact value required in 'job_title'.
        country_of_birth: Exact value required in 'country_of_birth'.
        country_of_nationality: Exact value required in
            'country_of_nationality'.
        min_salary: Inclusive lower bound on 'wage_amt'.
        max_salary: Inclusive upper bound on 'wage_amt'.
        worksite_city: Exact value required in 'worksite_city'.
        worksite_state: Exact value required in 'worksite_state'.
        dataset: Frame to filter. Defaults to the module-level ``data``.

    Returns:
        A new DataFrame; the source frame is never returned itself.

    Raises:
        ValueError: If the fiscal year or a salary bound is not numeric.
    """
    source = data if dataset is None else dataset

    # Parse the free-text bounds first so bad input fails before any work.
    lower = _parse_salary(min_salary)
    upper = _parse_salary(max_salary)

    filtered = source

    if not _is_unset(fiscal_year):
        # Going through float() lets "2021.0" (a float column rendered as
        # text) select the same rows as "2021".
        year = int(float(fiscal_year))
        filtered = filtered[filtered['fiscal_year'] == year]

    exact_matches = (
        ('employer_name', employer),
        ('job_title', job_title),
        ('country_of_birth', country_of_birth),
        ('country_of_nationality', country_of_nationality),
        ('worksite_city', worksite_city),
        ('worksite_state', worksite_state),
    )
    for column, wanted in exact_matches:
        if not _is_unset(wanted):
            filtered = filtered[filtered[column] == wanted]

    if lower is not None:
        filtered = filtered[filtered['wage_amt'] >= lower]
    if upper is not None:
        filtered = filtered[filtered['wage_amt'] <= upper]

    # Boolean indexing already produces new frames, so a copy is only needed
    # when no filter ran at all.
    if filtered is source:
        filtered = source.copy()
    return filtered
def _draw_no_data(ax):
    """Write a centred notice on *ax* in place of a chart."""
    ax.text(
        0.5, 0.5, "No data for the selected filters",
        ha="center", va="center", transform=ax.transAxes,
    )


# Function to generate insights and visualizations
def generate_visuals(filtered_data):
    """Build the three dashboard charts for an already-filtered dataset.

    Each chart is drawn on its own explicit figure/axes pair rather than on
    pyplot's implicit "current figure", and every figure is closed before
    returning so repeated calls do not accumulate open figures.

    Args:
        filtered_data: DataFrame providing the 'gender', 'country_of_birth'
            and 'wage_amt' columns. May be empty, in which case each chart
            shows a "no data" notice instead of raising.

    Returns:
        A 3-tuple of ``gr.Plot`` components: gender distribution, top 10
        countries of birth, salary distribution by gender.
    """
    has_rows = not filtered_data.empty
    figures = []

    def new_axes():
        fig, ax = plt.subplots(figsize=(12, 8))
        figures.append(fig)  # tracked so the finally-clause can close it
        return fig, ax

    try:
        # Gender Distribution Bar Chart
        gender_fig, ax = new_axes()
        if has_rows:
            sns.countplot(
                data=filtered_data,
                x='gender',
                order=filtered_data['gender'].value_counts().index,
                ax=ax,
            )
            ax.tick_params(axis='x', labelrotation=45)
        else:
            _draw_no_data(ax)
        ax.set_title("Gender Distribution")
        ax.set_xlabel("Gender")
        ax.set_ylabel("Count")
        gender_fig.tight_layout()

        # Country of Birth Distribution
        country_fig, ax = new_axes()
        if has_rows:
            top_countries = filtered_data['country_of_birth'].value_counts().head(10)
            top_countries.plot(kind='bar', color='skyblue', ax=ax)
        else:
            _draw_no_data(ax)
        ax.set_title("Top 10 Countries of Birth")
        ax.set_xlabel("Country")
        ax.set_ylabel("Count")
        country_fig.tight_layout()

        # Salary by Gender Histogram
        salary_fig, ax = new_axes()
        if has_rows:
            sns.histplot(
                data=filtered_data,
                x='wage_amt',
                hue='gender',
                multiple="stack",
                kde=True,
                ax=ax,
            )
        else:
            _draw_no_data(ax)
        ax.set_title("Salary Distribution by Gender")
        ax.set_xlabel("Salary (USD)")
        ax.set_ylabel("Frequency")
        salary_fig.tight_layout()

        gender_chart = gr.Plot(gender_fig)
        country_chart = gr.Plot(country_fig)
        salary_gender_hist = gr.Plot(salary_fig)
        return gender_chart, country_chart, salary_gender_hist
    finally:
        # Closing only removes pyplot's own reference; the Figure objects
        # held by the returned components stay valid.
        for fig in figures:
            plt.close(fig)
# Gradio Interface
def app(fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state):
    """Filter the dataset with the given selections and chart the result.

    Returns:
        A 4-tuple: the filtered rows followed by the gender, country and
        salary charts produced by ``generate_visuals``.
    """
    selection = filter_data(
        fiscal_year=fiscal_year,
        employer=employer,
        job_title=job_title,
        country_of_birth=country_of_birth,
        country_of_nationality=country_of_nationality,
        min_salary=min_salary,
        max_salary=max_salary,
        worksite_city=worksite_city,
        worksite_state=worksite_state,
    )
    charts = generate_visuals(selection)
    gender_plot, country_plot, salary_plot = charts
    return selection, gender_plot, country_plot, salary_plot
# Dropdown options
def _dropdown_choices(column, as_text=False):
    """Return ["All"] followed by the sorted distinct values of data[column].

    With ``as_text=True`` the distinct values are converted to strings
    before sorting.
    """
    distinct = data[column].dropna().unique()
    if as_text:
        distinct = distinct.astype(str)
    return ["All"] + sorted(distinct.tolist())


fiscal_years = _dropdown_choices('fiscal_year', as_text=True)
employers = _dropdown_choices('employer_name')
job_titles = _dropdown_choices('job_title')
countries_of_birth = _dropdown_choices('country_of_birth')
countries_of_nationality = _dropdown_choices('country_of_nationality')
worksite_cities = _dropdown_choices('worksite_city')
worksite_states = _dropdown_choices('worksite_state')
# Gradio components
# Every dropdown starts on "All" (the first entry of each choices list) so
# the callback receives an explicit "no filter" value instead of relying on
# the Gradio version's default for an unset dropdown.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("### Data Exploration Dashboard")
    with gr.Row():
        fiscal_year = gr.Dropdown(label="Fiscal Year", choices=fiscal_years, value="All")
        employer = gr.Dropdown(label="Employer", choices=employers, value="All")
        job_title = gr.Dropdown(label="Job Title", choices=job_titles, value="All")
    with gr.Row():
        country_of_birth = gr.Dropdown(label="Country of Birth", choices=countries_of_birth, value="All")
        country_of_nationality = gr.Dropdown(label="Country of Nationality", choices=countries_of_nationality, value="All")
    with gr.Row():
        # Free-text bounds; an empty box means "no bound".
        min_salary = gr.Textbox(label="Min Salary (USD)")
        max_salary = gr.Textbox(label="Max Salary (USD)")
    with gr.Row():
        worksite_city = gr.Dropdown(label="Worksite City", choices=worksite_cities, value="All")
        worksite_state = gr.Dropdown(label="Worksite State", choices=worksite_states, value="All")
    with gr.Row():
        apply_filters = gr.Button("Apply Filters")
    # Output components
    output_table = gr.Dataframe(label="Filtered Data")
    gender_chart = gr.Plot()
    country_chart = gr.Plot()
    salary_gender_hist = gr.Plot()
    # The inputs list must follow app()'s parameter order, and the outputs
    # list must follow the order of the tuple app() returns.
    apply_filters.click(
        app,
        inputs=[fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state],
        outputs=[output_table, gender_chart, country_chart, salary_gender_hist]
    )
demo.launch()
|