import pandas as pd import gradio as gr import matplotlib.pyplot as plt import seaborn as sns # Load and clean data def load_and_clean_data(file_path): data = pd.read_csv(file_path) # Handle missing values explicitly numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns datetime_cols = data.select_dtypes(include=['datetime64[ns]']).columns data[numeric_cols] = data[numeric_cols].fillna(-1) # Placeholder for numeric columns data[datetime_cols] = data[datetime_cols].fillna(pd.Timestamp("1970-01-01")) # Placeholder for datetime data.fillna("Unknown", inplace=True) # Remaining columns return data # Load dataset DATA_PATH = "TRK_13139_FY2021_2023.csv" data = load_and_clean_data(DATA_PATH) # Function to filter the dataset def filter_data(fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state): filtered = data.copy() if fiscal_year != "All": filtered = filtered[filtered['fiscal_year'] == int(fiscal_year)] if employer != "All": filtered = filtered[filtered['employer_name'] == employer] if job_title != "All": filtered = filtered[filtered['job_title'] == job_title] if country_of_birth != "All": filtered = filtered[filtered['country_of_birth'] == country_of_birth] if country_of_nationality != "All": filtered = filtered[filtered['country_of_nationality'] == country_of_nationality] if min_salary: filtered = filtered[filtered['wage_amt'] >= float(min_salary)] if max_salary: filtered = filtered[filtered['wage_amt'] <= float(max_salary)] if worksite_city != "All": filtered = filtered[filtered['worksite_city'] == worksite_city] if worksite_state != "All": filtered = filtered[filtered['worksite_state'] == worksite_state] return filtered # Function to generate insights and visualizations def generate_visuals(filtered_data): # Gender Distribution Bar Chart plt.figure(figsize=(12, 8)) sns.countplot(data=filtered_data, x='gender', order=filtered_data['gender'].value_counts().index) plt.title("Gender Distribution") plt.xlabel("Gender") plt.ylabel("Count") plt.xticks(rotation=45) plt.tight_layout() gender_chart = gr.Plot(plt.gcf()) # Country of Birth Distribution plt.figure(figsize=(12, 8)) filtered_data['country_of_birth'].value_counts().head(10).plot(kind='bar', color='skyblue') plt.title("Top 10 Countries of Birth") plt.xlabel("Country") plt.ylabel("Count") plt.tight_layout() country_chart = gr.Plot(plt.gcf()) # Salary by Gender Histogram plt.figure(figsize=(12, 8)) sns.histplot(data=filtered_data, x='wage_amt', hue='gender', multiple="stack", kde=True) plt.title("Salary Distribution by Gender") plt.xlabel("Salary (USD)") plt.ylabel("Frequency") plt.tight_layout() salary_gender_hist = gr.Plot(plt.gcf()) return gender_chart, country_chart, salary_gender_hist # Gradio Interface def app(fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state): filtered = filter_data(fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state) gender_chart, country_chart, salary_gender_hist = generate_visuals(filtered) return filtered, gender_chart, country_chart, salary_gender_hist # Dropdown options fiscal_years = ["All"] + sorted(data['fiscal_year'].dropna().unique().astype(str).tolist()) employers = ["All"] + sorted(data['employer_name'].dropna().unique().tolist()) job_titles = ["All"] + sorted(data['job_title'].dropna().unique().tolist()) countries_of_birth = ["All"] + sorted(data['country_of_birth'].dropna().unique().tolist()) countries_of_nationality = ["All"] + sorted(data['country_of_nationality'].dropna().unique().tolist()) worksite_cities = ["All"] + sorted(data['worksite_city'].dropna().unique().tolist()) worksite_states = ["All"] + sorted(data['worksite_state'].dropna().unique().tolist()) # Gradio components with gr.Blocks() as demo: with gr.Row(): gr.Markdown("### Data Exploration Dashboard") with gr.Row(): fiscal_year = gr.Dropdown(label="Fiscal Year", choices=fiscal_years) employer = gr.Dropdown(label="Employer", choices=employers) job_title = gr.Dropdown(label="Job Title", choices=job_titles) with gr.Row(): country_of_birth = gr.Dropdown(label="Country of Birth", choices=countries_of_birth) country_of_nationality = gr.Dropdown(label="Country of Nationality", choices=countries_of_nationality) with gr.Row(): min_salary = gr.Textbox(label="Min Salary (USD)") max_salary = gr.Textbox(label="Max Salary (USD)") with gr.Row(): worksite_city = gr.Dropdown(label="Worksite City", choices=worksite_cities) worksite_state = gr.Dropdown(label="Worksite State", choices=worksite_states) with gr.Row(): apply_filters = gr.Button("Apply Filters") # Output components output_table = gr.Dataframe(label="Filtered Data") gender_chart = gr.Plot() country_chart = gr.Plot() salary_gender_hist = gr.Plot() apply_filters.click( app, inputs=[fiscal_year, employer, job_title, country_of_birth, country_of_nationality, min_salary, max_salary, worksite_city, worksite_state], outputs=[output_table, gender_chart, country_chart, salary_gender_hist] ) demo.launch()