Create app.py
app.py ADDED
@@ -0,0 +1,195 @@
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

def load_embeddings(embeddings_file_path):
    # Load county embeddings indexed by place, impute missing values,
    # and fit a PCA projection that is reused for the model features.
    county_embeddings = pd.read_csv(embeddings_file_path).set_index('place')
    numeric_cols = county_embeddings.select_dtypes(include=['number']).columns
    county_embeddings_numeric = county_embeddings[numeric_cols]
    imputer = SimpleImputer(strategy='mean')
    county_embeddings_imputed = imputer.fit_transform(county_embeddings_numeric)
    pca = PCA(n_components=330)
    pca.fit(county_embeddings_imputed)
    county_embeddings_pca = pca.transform(county_embeddings_imputed)
    return county_embeddings, county_embeddings_pca, pca, imputer

def load_unemployment_data(unemployment_file_path):
    # Reshape the wide unemployment table (one column per date) into long format
    # with one row per place/date pair.
    unemployment_data = pd.read_csv(unemployment_file_path).set_index('place')
    unemployment_long = unemployment_data.reset_index().melt(
        id_vars='place', var_name='date', value_name='unemployment_rate'
    )
    return unemployment_long

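# Expected input layout (an assumption inferred from the parsing above; the
# column names and dates below are only illustrative):
#   county embeddings CSV:  place, feature_1, feature_2, ..., feature_n
#   unemployment CSV:       place, 2020-01, 2020-02, ...  (wide format, melted
#                           into place/date/unemployment_rate rows)
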
def preprocess_data(county_embeddings, county_embeddings_pca, unemployment_long, pca, imputer):
    # Prepare data for modeling
    X = unemployment_long.drop('unemployment_rate', axis=1)
    y = unemployment_long['unemployment_rate']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Merge embeddings
    county_embeddings.index = county_embeddings.index.astype(str)
    X_train['place'] = X_train['place'].astype(str)
    X_test['place'] = X_test['place'].astype(str)

    X_train = X_train.merge(county_embeddings, left_on='place', right_index=True, how='left')
    X_test = X_test.merge(county_embeddings, left_on='place', right_index=True, how='left')

    # Remove non-numeric columns
    numeric_cols_train = X_train.select_dtypes(include=['number']).columns
    X_train_numeric = X_train[numeric_cols_train]
    numeric_cols_test = X_test.select_dtypes(include=['number']).columns
    X_test_numeric = X_test[numeric_cols_test]

    # Impute missing values
    X_train_imputed = imputer.transform(X_train_numeric)
    X_test_imputed = imputer.transform(X_test_numeric)

    # Apply PCA
    X_train_pca = pca.transform(X_train_imputed)
    X_test_pca = pca.transform(X_test_imputed)

    return X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train

def train_and_evaluate_models(X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train):
    # Define models
    # (the GPU settings on XGBoost and CatBoost assume a CUDA-capable runtime;
    #  drop tree_method='gpu_hist' and task_type="GPU" to train on CPU)
    models = {
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=100, random_state=42, tree_method='gpu_hist'),
        "Ridge Regression": Ridge(alpha=1.0),
        "CatBoost": CatBoostRegressor(iterations=100, random_seed=42, task_type="GPU")
    }

    results = {}
    feature_importances = {}

    for name, model in models.items():
        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        # Cast to plain floats so the metrics serialize cleanly in the JSON output
        results[name] = {'RMSE': float(rmse), 'R-squared': float(r2)}

        # Feature importances
        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
            feature_importances[name] = importances

    return results, feature_importances, numeric_cols_train

def plot_feature_importance(importances, feature_names, model_name):
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False).head(20)

    fig = plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
    plt.title(f'{model_name} Feature Importance')
    plt.tight_layout()
    # Keep a handle on the figure before closing it; calling plt.gcf() after
    # plt.close() would return a new, empty figure.
    plt.close(fig)
    return fig

def plot_metrics(results):
    metrics_df = pd.DataFrame(results).T.reset_index().rename(columns={'index': 'Model'})

    # Capture each figure before closing it; plt.gcf() after plt.close()
    # would return a blank canvas.
    rmse_plot = plt.figure(figsize=(8, 6))
    sns.barplot(x='Model', y='RMSE', data=metrics_df)
    plt.title('RMSE for Each Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.close(rmse_plot)

    r2_plot = plt.figure(figsize=(8, 6))
    sns.barplot(x='Model', y='R-squared', data=metrics_df)
    plt.title('R-squared for Each Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.close(r2_plot)

    return rmse_plot, r2_plot

def main(embeddings_file_path, unemployment_file_path):
    # Load data
    county_embeddings, county_embeddings_pca, pca, imputer = load_embeddings(embeddings_file_path)
    unemployment_long = load_unemployment_data(unemployment_file_path)

    # Preprocess data
    X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train = preprocess_data(
        county_embeddings, county_embeddings_pca, unemployment_long, pca, imputer
    )

    # Train and evaluate models
    results, feature_importances, feature_names = train_and_evaluate_models(
        X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train
    )

    # Plot metrics
    rmse_plot, r2_plot = plot_metrics(results)

    # Plot feature importance for models that have it
    feature_importance_plots = {}
    for model_name, importances in feature_importances.items():
        fig = plot_feature_importance(importances, [f'PC{i+1}' for i in range(len(importances))], model_name)
        feature_importance_plots[model_name] = fig

    return results, rmse_plot, r2_plot, feature_importance_plots

def gradio_app():
    with gr.Blocks() as demo:
        gr.Markdown("# County-Level Unemployment Rate Forecasting")
        gr.Markdown("Upload county embeddings and unemployment data to train models and visualize results.")

        with gr.Row():
            embeddings_file = gr.File(label="Upload County Embeddings CSV")
            unemployment_file = gr.File(label="Upload Unemployment Data CSV")

        run_button = gr.Button("Run Analysis")

        output_results = gr.JSON(label="Model Performance Metrics")
        output_rmse_plot = gr.Plot(label="RMSE Comparison")
        output_r2_plot = gr.Plot(label="R-squared Comparison")
        output_feature_importance = gr.Plot(label="Feature Importances")

        def run_analysis(embeddings_file, unemployment_file):
            if embeddings_file is None or unemployment_file is None:
                return gr.update(value="Please upload both embeddings and unemployment data files."), None, None, None

            # Read files
            embeddings_file_path = embeddings_file.name
            unemployment_file_path = unemployment_file.name

            # Run main analysis
            results, rmse_plot, r2_plot, feature_importance_plots = main(embeddings_file_path, unemployment_file_path)

            # For simplicity, display feature importance of Random Forest (if available)
            fi_plot = None
            if 'Random Forest' in feature_importance_plots:
                fi_plot = feature_importance_plots['Random Forest']

            return results, rmse_plot, r2_plot, fi_plot

        run_button.click(
            run_analysis,
            inputs=[embeddings_file, unemployment_file],
            outputs=[output_results, output_rmse_plot, output_r2_plot, output_feature_importance]
        )

    demo.launch()


if __name__ == "__main__":
    gradio_app()
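For the Space to build with these imports, a requirements.txt alongside app.py would need to list the packages used above. A minimal sketch, with versions left unpinned:

gradio
pandas
numpy
matplotlib
seaborn
scikit-learn
xgboost
catboost

Run locally, python app.py launches the same interface through demo.launch().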