File size: 5,616 Bytes
c1b3dbf
 
d857800
 
 
 
 
 
e50d04a
888756a
 
 
 
 
 
 
e50d04a
d857800
 
ca2055e
d857800
 
 
 
ca2055e
d857800
ca2055e
 
 
d857800
 
ca2055e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d857800
 
ca2055e
d857800
ca2055e
ec8154b
888756a
 
 
 
 
e50d04a
d857800
ec8154b
888756a
ec8154b
888756a
 
 
 
e50d04a
888756a
e9f4868
 
 
 
 
 
 
 
 
888756a
e9f4868
888756a
 
 
 
e9f4868
 
888756a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d857800
888756a
 
 
ca2055e
888756a
 
e9f4868
ca89ad8
 
 
e9f4868
888756a
 
 
 
e9f4868
888756a
9d8ab78
888756a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns

# Load and clean data
def load_and_clean_data(file_path):
    """Read a CSV file and replace missing values with placeholders.

    Placeholders are chosen by column dtype:

    * ``float64`` / ``int64`` columns -> ``-1``
    * ``datetime64[ns]`` columns      -> ``1970-01-01``
    * anything still missing          -> ``"Unknown"``

    Args:
        file_path: Whatever ``pandas.read_csv`` accepts as its first
            argument.

    Returns:
        The loaded and cleaned ``DataFrame``.
    """
    frame = pd.read_csv(file_path)

    # (dtype selector, placeholder) pairs, applied in this order.
    typed_placeholders = (
        (['float64', 'int64'], -1),
        (['datetime64[ns]'], pd.Timestamp("1970-01-01")),
    )
    for dtype_names, placeholder in typed_placeholders:
        columns = frame.select_dtypes(include=dtype_names).columns
        frame[columns] = frame[columns].fillna(placeholder)

    # Whatever is still missing in the remaining columns gets a text marker.
    frame.fillna("Unknown", inplace=True)
    return frame

# Load the dataset once, at import time. The filter function and the dropdown
# option lists further down all read from this module-level `data` frame.
# NOTE: the path is relative, so it is resolved against the current working
# directory of the process.
DATA_PATH = "TRK_13139_FY2021_2023.csv"
data = load_and_clean_data(DATA_PATH)

# Helpers and entry point for filtering the dataset
def _is_unset(choice):
    """Return True when a dropdown value means "do not filter on this field".

    ``None`` (nothing selected), blank strings and the literal ``"All"`` all
    count as unset.
    """
    if choice is None:
        return True
    if isinstance(choice, str):
        return choice == "All" or not choice.strip()
    return False


def _parse_salary_bound(value):
    """Convert a salary bound to ``float``; return ``None`` when it is blank.

    Raises:
        ValueError: If ``value`` is non-blank text that is not a number.
    """
    if value is None:
        return None
    if isinstance(value, str) and not value.strip():
        return None
    return float(value)


def filter_data(fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state, dataset=None):
    """Return the rows that match every active filter.

    A categorical filter is inactive when its value is ``"All"``, ``None`` or
    blank. A salary bound is inactive when it is ``None`` or blank; a bound of
    ``0`` is a real bound and is applied.

    Args:
        fiscal_year: Year to keep (compared as an integer against the
            ``fiscal_year`` column), or ``"All"``.
        employer: Exact ``employer_name`` to keep, or ``"All"``.
        job_title: Exact ``job_title`` to keep, or ``"All"``.
        country_of_birth: Exact ``country_of_birth`` to keep, or ``"All"``.
        country_of_nationality: Exact ``country_of_nationality`` to keep,
            or ``"All"``.
        min_salary: Inclusive lower bound on ``wage_amt`` (number or numeric
            text), or blank.
        max_salary: Inclusive upper bound on ``wage_amt`` (number or numeric
            text), or blank.
        worksite_city: Exact ``worksite_city`` to keep, or ``"All"``.
        worksite_state: Exact ``worksite_state`` to keep, or ``"All"``.
        dataset: Frame to filter. Defaults to the module-level ``data``.

    Returns:
        A new ``DataFrame``; the source frame is never returned itself.

    Raises:
        ValueError: If the fiscal year or a salary bound is not numeric.
    """
    source = data if dataset is None else dataset
    filtered = source

    if not _is_unset(fiscal_year):
        # Going through float() also accepts labels such as "2021.0".
        filtered = filtered[filtered['fiscal_year'] == int(float(fiscal_year))]

    exact_match_filters = (
        ('employer_name', employer),
        ('job_title', job_title),
        ('country_of_birth', country_of_birth),
        ('country_of_nationality', country_of_nationality),
        ('worksite_city', worksite_city),
        ('worksite_state', worksite_state),
    )
    for column, choice in exact_match_filters:
        if not _is_unset(choice):
            filtered = filtered[filtered[column] == choice]

    lower_bound = _parse_salary_bound(min_salary)
    if lower_bound is not None:
        filtered = filtered[filtered['wage_amt'] >= lower_bound]

    upper_bound = _parse_salary_bound(max_salary)
    if upper_bound is not None:
        filtered = filtered[filtered['wage_amt'] <= upper_bound]

    # Boolean indexing already yields a new frame, so a copy is only needed
    # when no filter ran; this avoids duplicating the full dataset per call.
    return filtered.copy() if filtered is source else filtered

# Functions to generate insights and visualizations
def _render_chart(draw, title, xlabel, ylabel, rotate_xticks=False):
    """Draw one chart on its own figure and return it wrapped in ``gr.Plot``.

    Args:
        draw: Callable that receives the target axes and plots onto them, or
            ``None`` to render a "no data" placeholder instead.
        title: Axes title.
        xlabel: X-axis label.
        ylabel: Y-axis label.
        rotate_xticks: When true, rotate the x tick labels by 45 degrees.

    Returns:
        A ``gr.Plot`` holding the finished figure.
    """
    # An explicit figure/axes pair avoids relying on pyplot's implicit
    # "current figure", which is shared process-wide.
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        if draw is None:
            ax.text(0.5, 0.5, "No data for the selected filters",
                    ha="center", va="center", transform=ax.transAxes)
        else:
            draw(ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        if rotate_xticks:
            ax.tick_params(axis="x", labelrotation=45)
        fig.tight_layout()
        return gr.Plot(fig)
    finally:
        # Drop pyplot's reference to the figure; otherwise every call leaves
        # its figures registered and they pile up over the app's lifetime.
        plt.close(fig)


def generate_visuals(filtered_data):
    """Build the three dashboard charts for an already-filtered frame.

    Reads the ``gender``, ``country_of_birth`` and ``wage_amt`` columns. When
    the frame has no rows, each chart shows a placeholder message instead of
    handing empty data to the plotting calls.

    Args:
        filtered_data: ``DataFrame`` with the rows to visualise.

    Returns:
        A tuple ``(gender_chart, country_chart, salary_gender_hist)`` of
        ``gr.Plot`` objects.
    """
    has_rows = not filtered_data.empty

    def draw_gender(ax):
        # Bars ordered from most to least frequent gender value.
        sns.countplot(data=filtered_data, x='gender',
                      order=filtered_data['gender'].value_counts().index, ax=ax)

    def draw_countries(ax):
        top_countries = filtered_data['country_of_birth'].value_counts().head(10)
        top_countries.plot(kind='bar', color='skyblue', ax=ax)

    def draw_salary(ax):
        sns.histplot(data=filtered_data, x='wage_amt', hue='gender',
                     multiple="stack", kde=True, ax=ax)

    gender_chart = _render_chart(
        draw_gender if has_rows else None,
        "Gender Distribution", "Gender", "Count", rotate_xticks=True,
    )
    country_chart = _render_chart(
        draw_countries if has_rows else None,
        "Top 10 Countries of Birth", "Country", "Count",
    )
    salary_gender_hist = _render_chart(
        draw_salary if has_rows else None,
        "Salary Distribution by Gender", "Salary (USD)", "Frequency",
    )

    return gender_chart, country_chart, salary_gender_hist

# Gradio Interface
def app(fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state):
    """Button callback: filter the dataset, then chart the result.

    All arguments are forwarded unchanged to ``filter_data``.

    Returns:
        A 4-tuple: the filtered frame followed by the three plots produced
        by ``generate_visuals`` (gender, country of birth, salary by gender).
    """
    subset = filter_data(
        fiscal_year=fiscal_year,
        employer=employer,
        job_title=job_title,
        country_of_birth=country_of_birth,
        country_of_nationality=country_of_nationality,
        min_salary=min_salary,
        max_salary=max_salary,
        worksite_city=worksite_city,
        worksite_state=worksite_state,
    )
    by_gender, by_country, salary_hist = generate_visuals(subset)
    return subset, by_gender, by_country, salary_hist

# Dropdown options: "All" first, then the sorted distinct values of a column.
def _dropdown_choices(column, as_text=False):
    """Return ``["All"]`` plus the sorted distinct non-null values of ``column``.

    With ``as_text=True`` the values are converted to strings before sorting.
    """
    distinct = data[column].dropna().unique()
    if as_text:
        distinct = distinct.astype(str)
    return ["All"] + sorted(distinct.tolist())


fiscal_years = _dropdown_choices('fiscal_year', as_text=True)
employers = _dropdown_choices('employer_name')
job_titles = _dropdown_choices('job_title')
countries_of_birth = _dropdown_choices('country_of_birth')
countries_of_nationality = _dropdown_choices('country_of_nationality')
worksite_cities = _dropdown_choices('worksite_city')
worksite_states = _dropdown_choices('worksite_state')

# Gradio components: filter controls on top, results (table + charts) below.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("### Data Exploration Dashboard")

    # Every dropdown starts on "All", so an untouched dropdown sends the
    # explicit "no filter" choice to the callback rather than None.
    with gr.Row():
        fiscal_year = gr.Dropdown(label="Fiscal Year", choices=fiscal_years, value="All")
        employer = gr.Dropdown(label="Employer", choices=employers, value="All")
        job_title = gr.Dropdown(label="Job Title", choices=job_titles, value="All")

    with gr.Row():
        country_of_birth = gr.Dropdown(label="Country of Birth", choices=countries_of_birth, value="All")
        country_of_nationality = gr.Dropdown(label="Country of Nationality", choices=countries_of_nationality, value="All")

    # Salary bounds are free text; leaving a box empty disables that bound.
    with gr.Row():
        min_salary = gr.Textbox(label="Min Salary (USD)")
        max_salary = gr.Textbox(label="Max Salary (USD)")

    with gr.Row():
        worksite_city = gr.Dropdown(label="Worksite City", choices=worksite_cities, value="All")
        worksite_state = gr.Dropdown(label="Worksite State", choices=worksite_states, value="All")

    with gr.Row():
        apply_filters = gr.Button("Apply Filters")

    # Output components
    output_table = gr.Dataframe(label="Filtered Data")
    gender_chart = gr.Plot()
    country_chart = gr.Plot()
    salary_gender_hist = gr.Plot()

    # The order of `inputs` must match the parameter order of `app`, and the
    # order of `outputs` must match the tuple it returns.
    apply_filters.click(
        app,
        inputs=[fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state],
        outputs=[output_table, gender_chart, country_chart, salary_gender_hist]
    )

demo.launch()