# Hugging Face Space: LeonceNsh/h1b_visa_trends (status: Sleeping)
# File size: 5,616 Bytes

import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns

# Load and clean data
def load_and_clean_data(file_path):
    """Read a CSV file and replace every missing value with a placeholder.

    Placeholders are chosen by column dtype:

    * ``float64`` / ``int64`` columns -> ``-1``
    * ``datetime64[ns]`` columns -> ``1970-01-01``
    * every remaining column -> ``"Unknown"``

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame with the placeholders filled in.
    """
    frame = pd.read_csv(file_path)

    # Resolve both column groups up front, before any value is replaced.
    numeric = frame.select_dtypes(include=['float64', 'int64']).columns
    temporal = frame.select_dtypes(include=['datetime64[ns]']).columns

    typed_placeholders = (
        (numeric, -1),
        (temporal, pd.Timestamp("1970-01-01")),
    )
    for columns, placeholder in typed_placeholders:
        frame[columns] = frame[columns].fillna(placeholder)

    # Whatever is still missing lives in a non-numeric, non-datetime column.
    frame.fillna("Unknown", inplace=True)

    return frame

# Load dataset
# Relative path: it is resolved against the process's current working
# directory when this module is executed.
DATA_PATH = "TRK_13139_FY2021_2023.csv"
# Loaded once at import time. filter_data() and the dropdown option lists
# further down read this module-level DataFrame.
data = load_and_clean_data(DATA_PATH)

# Function to filter the dataset
def _is_unset(choice):
    """Return True when a dropdown value means "do not filter on this field"."""
    return choice is None or choice == "" or choice == "All"


def _salary_bound(raw, label):
    """Convert a salary input to ``float``; return None when it is blank."""
    if raw is None:
        return None
    if isinstance(raw, str):
        raw = raw.strip()
        if not raw:
            return None
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(f"{label} must be a number, got {raw!r}") from err


def filter_data(fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state, source_df=None):
    """Return the rows that satisfy every active filter.

    A dropdown value of ``"All"``, ``""`` or ``None`` (what Gradio sends when
    nothing is selected) leaves that field unfiltered. Salary bounds are
    inclusive and are skipped when blank.

    Args:
        fiscal_year: Year to match against ``fiscal_year``. Accepts an int or
            a string such as ``"2022"`` or ``"2022.0"``.
        employer: Exact match for ``employer_name``.
        job_title: Exact match for ``job_title``.
        country_of_birth: Exact match for ``country_of_birth``.
        country_of_nationality: Exact match for ``country_of_nationality``.
        min_salary: Lower bound for ``wage_amt`` (string or number).
        max_salary: Upper bound for ``wage_amt`` (string or number).
        worksite_city: Exact match for ``worksite_city``.
        worksite_state: Exact match for ``worksite_state``.
        source_df: DataFrame to filter. Defaults to the module-level ``data``.

    Returns:
        A new DataFrame holding the matching rows; the source is not modified.

    Raises:
        ValueError: If a salary bound or the fiscal year is not numeric.
    """
    frame = data if source_df is None else source_df

    # Accumulate one boolean mask and index once at the end, instead of
    # copying the whole dataset up front and re-slicing it per filter.
    keep = pd.Series(True, index=frame.index, dtype=bool)

    if not _is_unset(fiscal_year):
        # Go through float() so a choice rendered as "2022.0" is accepted.
        keep &= frame['fiscal_year'] == int(float(fiscal_year))

    exact_matches = (
        ('employer_name', employer),
        ('job_title', job_title),
        ('country_of_birth', country_of_birth),
        ('country_of_nationality', country_of_nationality),
        ('worksite_city', worksite_city),
        ('worksite_state', worksite_state),
    )
    for column, wanted in exact_matches:
        if not _is_unset(wanted):
            keep &= frame[column] == wanted

    lower = _salary_bound(min_salary, "Min salary")
    if lower is not None:
        keep &= frame['wage_amt'] >= lower

    upper = _salary_bound(max_salary, "Max salary")
    if upper is not None:
        keep &= frame['wage_amt'] <= upper

    # Boolean-mask indexing always yields a new DataFrame.
    return frame[keep]

# Function to generate insights and visualizations
def generate_visuals(filtered_data):
    """Build the three summary charts for an already-filtered dataset.

    The charts are: a count of rows per ``gender``, the ten most frequent
    ``country_of_birth`` values, and a stacked ``wage_amt`` histogram split
    by ``gender``.

    Figures are built with ``matplotlib.figure.Figure`` instead of
    ``plt.figure()``. Figures created through pyplot stay registered until
    they are explicitly closed, so opening three per request would accumulate
    them for the lifetime of the server; pyplot's "current figure" is also
    process-wide state that ``plt.gcf()`` would read back.

    When ``filtered_data`` has no rows, each chart shows a placeholder
    message instead of attempting to plot empty data.

    Args:
        filtered_data: DataFrame with ``gender``, ``country_of_birth`` and
            ``wage_amt`` columns.

    Returns:
        A ``(gender_chart, country_chart, salary_gender_hist)`` tuple of
        ``gr.Plot`` components.
    """
    # Local import: the file's top-level imports only bring in pyplot.
    from matplotlib.figure import Figure

    has_rows = not filtered_data.empty

    def new_axes():
        # One standalone figure with a single axes; never touches pyplot.
        fig = Figure(figsize=(12, 8))
        return fig, fig.subplots()

    def finish(fig, ax, title, xlabel, ylabel):
        # Labels are applied after plotting because seaborn/pandas set their
        # own axis labels while drawing.
        if not has_rows:
            ax.text(
                0.5, 0.5, "No data matches the selected filters",
                ha="center", va="center", transform=ax.transAxes,
            )
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        fig.tight_layout()
        return gr.Plot(fig)

    # Gender Distribution Bar Chart
    fig, ax = new_axes()
    if has_rows:
        sns.countplot(
            data=filtered_data,
            x='gender',
            order=filtered_data['gender'].value_counts().index,
            ax=ax,
        )
        ax.tick_params(axis='x', labelrotation=45)
    gender_chart = finish(fig, ax, "Gender Distribution", "Gender", "Count")

    # Country of Birth Distribution
    fig, ax = new_axes()
    if has_rows:
        top_countries = filtered_data['country_of_birth'].value_counts().head(10)
        top_countries.plot(kind='bar', color='skyblue', ax=ax)
    country_chart = finish(fig, ax, "Top 10 Countries of Birth", "Country", "Count")

    # Salary by Gender Histogram
    # NOTE(review): load_and_clean_data fills missing numeric values with -1,
    # so missing wages would show up here as a -1 bin - confirm whether they
    # should be excluded.
    fig, ax = new_axes()
    if has_rows:
        sns.histplot(
            data=filtered_data,
            x='wage_amt',
            hue='gender',
            multiple="stack",
            kde=True,
            ax=ax,
        )
    salary_gender_hist = finish(
        fig, ax, "Salary Distribution by Gender", "Salary (USD)", "Frequency"
    )

    return gender_chart, country_chart, salary_gender_hist

# Gradio Interface
def app(fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state):
    """Gradio click handler: filter the dataset, then chart the result.

    All arguments are forwarded unchanged to ``filter_data``.

    Returns:
        A 4-tuple of the filtered DataFrame followed by the gender chart,
        the country-of-birth chart and the salary histogram.
    """
    subset = filter_data(
        fiscal_year,
        employer,
        job_title,
        country_of_birth,
        country_of_nationality,
        min_salary,
        max_salary,
        worksite_city,
        worksite_state,
    )
    by_gender, by_country, by_salary = generate_visuals(subset)
    return (subset, by_gender, by_country, by_salary)

# Dropdown options
def _dropdown_choices(column, as_text=False):
    """Return ``"All"`` followed by the sorted distinct values of ``data[column]``.

    With ``as_text=True`` the distinct values are converted to strings
    before sorting.
    """
    distinct = data[column].dropna().unique()
    if as_text:
        distinct = distinct.astype(str)
    return ["All"] + sorted(distinct.tolist())


# Fiscal years are offered as strings; every other field keeps its raw values.
fiscal_years = _dropdown_choices('fiscal_year', as_text=True)
employers = _dropdown_choices('employer_name')
job_titles = _dropdown_choices('job_title')
countries_of_birth = _dropdown_choices('country_of_birth')
countries_of_nationality = _dropdown_choices('country_of_nationality')
worksite_cities = _dropdown_choices('worksite_city')
worksite_states = _dropdown_choices('worksite_state')

# Gradio components
# Layout: one row per group of filters, an "Apply Filters" button, then the
# filtered table and the three charts.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("### Data Exploration Dashboard")

    # Every dropdown starts on "All" so that a click before any selection
    # sends the no-filter sentinel the handler checks for, not None.
    with gr.Row():
        fiscal_year = gr.Dropdown(label="Fiscal Year", choices=fiscal_years, value="All")
        employer = gr.Dropdown(label="Employer", choices=employers, value="All")
        job_title = gr.Dropdown(label="Job Title", choices=job_titles, value="All")

    with gr.Row():
        country_of_birth = gr.Dropdown(label="Country of Birth", choices=countries_of_birth, value="All")
        country_of_nationality = gr.Dropdown(label="Country of Nationality", choices=countries_of_nationality, value="All")

    # Salary bounds are free text; an empty box means "no bound".
    with gr.Row():
        min_salary = gr.Textbox(label="Min Salary (USD)")
        max_salary = gr.Textbox(label="Max Salary (USD)")

    with gr.Row():
        worksite_city = gr.Dropdown(label="Worksite City", choices=worksite_cities, value="All")
        worksite_state = gr.Dropdown(label="Worksite State", choices=worksite_states, value="All")

    with gr.Row():
        apply_filters = gr.Button("Apply Filters")

    # Output components
    output_table = gr.Dataframe(label="Filtered Data")
    gender_chart = gr.Plot()
    country_chart = gr.Plot()
    salary_gender_hist = gr.Plot()

    # The input order must match the positional parameters of app(); the
    # output order must match the tuple it returns.
    apply_filters.click(
        app,
        inputs=[fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state],
        outputs=[output_table, gender_chart, country_chart, salary_gender_hist]
    )

demo.launch()