LeonceNsh committed on
Commit fae0679 · verified · 1 Parent(s): 80e725c

Update app.py

Files changed (1): app.py +78 -186
app.py CHANGED
@@ -1,191 +1,83 @@
 import gradio as gr
-import cudf
-import cupy as cp
-import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns

-from cuml.preprocessing.imputation import SimpleImputer
-from cuml.decomposition import PCA
-from cuml.model_selection import train_test_split
-from cuml.metrics import mean_squared_error, r2_score
-from cuml.ensemble import RandomForestRegressor
-from cuml.linear_model import Ridge
-
-import xgboost as xgb
-from catboost import CatBoostRegressor
-
-import warnings
-warnings.filterwarnings('ignore')
-
-# Load datasets (Assuming the datasets are in the same directory)
-def load_embeddings():
-    embeddings_file_path = 'county_embeddings.csv'
-    county_embeddings = cudf.read_csv(embeddings_file_path).set_index('place')
-    county_embeddings = county_embeddings.head(1000)  # Optional: limit data size for testing
-    numeric_cols = county_embeddings.select_dtypes(include=['float64', 'int64']).columns
-    county_embeddings_numeric = county_embeddings[numeric_cols]
-    imputer = SimpleImputer(strategy='mean')
-    county_embeddings_imputed = imputer.fit_transform(county_embeddings_numeric)
-    pca = PCA(n_components=330)
-    pca.fit(county_embeddings_imputed)
-    county_embeddings_pca = pca.transform(county_embeddings_imputed)
-    return county_embeddings, county_embeddings_pca, pca, imputer
-
-def load_unemployment_data():
-    unemployment_file_path = 'county_unemployment.csv'
-    unemployment_data = cudf.read_csv(unemployment_file_path).set_index('place')
-    unemployment_data = unemployment_data.head(1000)  # Optional: limit data size for testing
-    unemployment_long = unemployment_data.reset_index().melt(id_vars='place', var_name='date', value_name='unemployment_rate')
-    return unemployment_long
-
-def preprocess_data(county_embeddings, county_embeddings_pca, unemployment_long, pca, imputer):
-    # Prepare data for modeling
-    X = unemployment_long.drop('unemployment_rate', axis=1)
-    y = unemployment_long['unemployment_rate']
-
-    # Split the data
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-    # Merge embeddings
-    county_embeddings.index = county_embeddings.index.astype('str')
-    X_train['place'] = X_train['place'].astype('str')
-    X_test['place'] = X_test['place'].astype('str')
-
-    X_train = X_train.merge(county_embeddings, left_on='place', right_index=True, how='left')
-    X_test = X_test.merge(county_embeddings, left_on='place', right_index=True, how='left')
-
-    # Remove non-numeric columns
-    numeric_cols_train = X_train.select_dtypes(include=['float64', 'int64']).columns
-    X_train_numeric = X_train[numeric_cols_train]
-    numeric_cols_test = X_test.select_dtypes(include=['float64', 'int64']).columns
-    X_test_numeric = X_test[numeric_cols_test]
-
-    # Impute missing values
-    X_train_imputed = imputer.transform(X_train_numeric)
-    X_test_imputed = imputer.transform(X_test_numeric)
-
-    # Apply PCA
-    X_train_pca = pca.transform(X_train_imputed)
-    X_test_pca = pca.transform(X_test_imputed)
-
-    # Convert labels to GPU arrays
-    y_train = y_train.to_cupy()
-    y_test = y_test.to_cupy()
-
-    return X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train
-
-def train_and_evaluate_models(X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train, selected_models):
-    # Define models
-    all_models = {
-        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
-        "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=42, tree_method='gpu_hist', gpu_id=0),
-        "Ridge Regression": Ridge(alpha=1.0),
-        "CatBoost": CatBoostRegressor(iterations=100, random_seed=42, task_type="GPU", devices='0')
-    }
-
-    # Filter selected models
-    models = {name: model for name, model in all_models.items() if name in selected_models}
-
-    results = {}
-    feature_importances = {}
-
-    for name, model in models.items():
-        if name == "XGBoost":
-            model.fit(cp.asnumpy(X_train_pca), cp.asnumpy(y_train))
-            y_pred = model.predict(cp.asnumpy(X_test_pca))
-            y_pred = cp.asarray(y_pred)
-        elif name == "CatBoost":
-            model.fit(cp.asnumpy(X_train_pca), cp.asnumpy(y_train), verbose=False)
-            y_pred = model.predict(cp.asnumpy(X_test_pca))
-            y_pred = cp.asarray(y_pred)
-        else:
-            model.fit(X_train_pca, y_train)
-            y_pred = model.predict(X_test_pca)
-
-        # Compute metrics
-        rmse = cp.sqrt(mean_squared_error(y_test, y_pred)).get()
-        r2 = r2_score(y_test, y_pred).get()
-        results[name] = {'RMSE': rmse, 'R-squared': r2}
-
-        # Feature importances
-        if hasattr(model, 'feature_importances_'):
-            importances = model.feature_importances_
-            if isinstance(importances, cp.ndarray):
-                importances = cp.asnumpy(importances)
-            feature_importances[name] = importances
-
-    return results, feature_importances, numeric_cols_train
-
-def plot_feature_importance(importances, feature_names, model_name):
-    feature_importance_df = pd.DataFrame({'Feature': feature_names[:len(importances)], 'Importance': importances})
-    feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False).head(20)
-
-    plt.figure(figsize=(10, 8))
-    sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
-    plt.title(f'{model_name} Feature Importance')
-    plt.tight_layout()
-    plt.close()
-    return plt.gcf()
-
-def plot_metrics(results):
-    metrics_df = pd.DataFrame(results).T.reset_index().rename(columns={'index': 'Model'})
-
-    plt.figure(figsize=(8, 6))
-    sns.barplot(x='Model', y='RMSE', data=metrics_df)
-    plt.title('RMSE for Each Model')
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-    plt.close()
-    rmse_plot = plt.gcf()
-
-    plt.figure(figsize=(8, 6))
-    sns.barplot(x='Model', y='R-squared', data=metrics_df)
-    plt.title('R-squared for Each Model')
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-    plt.close()
-    r2_plot = plt.gcf()
-
-    return rmse_plot, r2_plot
-
-def main(selected_models):
-    # Load data
-    county_embeddings, county_embeddings_pca, pca, imputer = load_embeddings()
-    unemployment_long = load_unemployment_data()
-
-    # Preprocess data
-    X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train = preprocess_data(
-        county_embeddings, county_embeddings_pca, unemployment_long, pca, imputer
-    )
-
-    # Train and evaluate models
-    results, feature_importances, feature_names = train_and_evaluate_models(
-        X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train, selected_models
-    )
-
-    # Plot metrics
-    rmse_plot, r2_plot = plot_metrics(results)
-
-    # Plot feature importance for models that have it
-    feature_importance_plots = {}
-    for model_name, importances in feature_importances.items():
-        fig = plot_feature_importance(importances, [f'PC{i+1}' for i in range(len(importances))], model_name)
-        feature_importance_plots[model_name] = fig
-
-    return results, rmse_plot, r2_plot, feature_importance_plots
-
-def gradio_app():
     with gr.Blocks() as demo:
-        gr.Markdown("<h1 style='text-align: center'>County-Level Unemployment Rate Forecasting</h1>")
-        gr.Markdown("This app forecasts county-level unemployment rates using various machine learning models with GPU acceleration.")
-
-        with gr.Row():
-            with gr.Column(scale=1):
-                gr.Markdown("### Select Models to Train")
-                model_choices = ["Random Forest", "XGBoost", "Ridge Regression", "CatBoost"]
-                selected_models = gr.CheckboxGroup(choices=model_choices, value=model_choices, label="Models")
-
-                run_button = gr.Button("Run Analysis")
-
 import gradio as gr
+import pandas as pd
+import geopandas as gpd
 import matplotlib.pyplot as plt
 import seaborn as sns
+from timesfm import TimesFm, TimesFmHparams, TimesFmCheckpoint
+from sklearn.ensemble import GradientBoostingRegressor
+import numpy as np

+# GPU-optimized TimesFM setup
+timesfm_backend = "gpu"
+timesfm_model_config = TimesFmHparams(
+    context_len=512,
+    horizon_len=128,
+    per_core_batch_size=128,
+    backend=timesfm_backend,
+)
+timesfm_model = TimesFm(
+    hparams=timesfm_model_config,
+    checkpoint=TimesFmCheckpoint(huggingface_repo_id="google/timesfm-1.0-200m-pytorch")
+)
+
+# Calculate HHI (sum of squared market shares) per market from an uploaded CSV
+def calculate_hhi(file, market_col, id_col, weight_col):
+    df = pd.read_csv(file.name)
+    df['denominator'] = df.groupby(market_col)[weight_col].transform('sum')
+    df['numerator'] = df.groupby([market_col, id_col])[weight_col].transform('sum')
+    df['market_share'] = 100 * (df['numerator'] / df['denominator'])
+    df['market_share_sq'] = df['market_share'] ** 2
+    # Sum one squared share per (market, firm) pair; summing raw rows would
+    # double-count firms that appear on multiple rows
+    firm_shares = df.drop_duplicates(subset=[market_col, id_col])
+    hhi = firm_shares.groupby(market_col)['market_share_sq'].sum()
+    return hhi.reset_index(name='hhi')
+
+# Visualize HHI on a county map
+def plot_hhi_map(hhi_csv, shapefile):
+    hhi_df = pd.read_csv(hhi_csv.name)
+    gdf = gpd.read_file(shapefile.name)
+    # The first column of the HHI CSV is the market identifier; the shapefile
+    # is assumed to carry a matching 'fips_code' attribute
+    gdf = gdf.merge(hhi_df, left_on='fips_code', right_on=hhi_df.columns[0], how='left')
+    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
+    gdf.plot(column='hhi', cmap='RdBu', legend=True, ax=ax, missing_kwds={"color": "lightgrey"})
+    ax.set_title("HHI by County")
+    return fig
+
+# Forecast each series with TimesFM; returns the point forecast
+def forecast(file, forecast_steps):
+    df = pd.read_csv(file.name).set_index('place')
+    # Use the most recent observations as history, capped at the model's context_len
+    history = df.iloc[:, -512:]
+    # TimesFm.forecast returns (point_forecast, quantile_forecast)
+    point_forecast, _ = timesfm_model.forecast(inputs=history.values)
+    return pd.DataFrame(point_forecast[:, :int(forecast_steps)], index=history.index)
+
+# Gradio app interface
+def gradio_interface():
     with gr.Blocks() as demo:
+        gr.Markdown("### Healthcare Network Analysis and Forecasting")
+
+        with gr.Tab("Upload Embeddings"):
+            file_upload = gr.File(label="Upload Embeddings (CSV)")
+            market_col = gr.Textbox(label="Market Column")
+            id_col = gr.Textbox(label="ID Column")
+            weight_col = gr.Textbox(label="Weight Column")
+            hhi_results = gr.DataFrame(label="HHI Results")
+            calculate_button = gr.Button("Calculate HHI")
+            calculate_button.click(
+                calculate_hhi,
+                inputs=[file_upload, market_col, id_col, weight_col],
+                outputs=hhi_results
+            )
+
+        with gr.Tab("Visualize Map"):
+            hhi_csv = gr.File(label="Upload HHI CSV")
+            shapefile = gr.File(label="Upload Shapefile")
+            map_plot = gr.Plot(label="HHI Map")
+            plot_button = gr.Button("Generate Map")
+            plot_button.click(plot_hhi_map, inputs=[hhi_csv, shapefile], outputs=map_plot)
+
+        with gr.Tab("Forecasting"):
+            forecast_file = gr.File(label="Upload Historical Data (CSV)")
+            forecast_steps = gr.Slider(minimum=1, maximum=24, step=1, label="Forecast Steps")
+            forecast_results = gr.DataFrame(label="Forecasted Data")
+            forecast_button = gr.Button("Forecast")
+            forecast_button.click(forecast, inputs=[forecast_file, forecast_steps], outputs=forecast_results)
+
+    return demo
+
+# Run app
+if __name__ == "__main__":
+    gradio_interface().launch()
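
For reference on what `calculate_hhi` computes: the Herfindahl-Hirschman Index is the sum of squared percentage market shares within each market, ranging from near 0 (fragmented market) to 10,000 (single firm). A minimal sketch of the same arithmetic on toy data, with illustrative column names (`market`, `firm`, `beds`) that are not from this commit:

import pandas as pd

# Market '01001': one firm holds all weight -> HHI = 100^2 = 10,000.
# Market '01003': two firms split 60/40    -> HHI = 60^2 + 40^2 = 5,200.
toy = pd.DataFrame({
    'market': ['01001', '01001', '01003', '01003'],
    'firm':   ['A', 'A', 'B', 'C'],
    'beds':   [30, 70, 60, 40],
})

firm_totals = toy.groupby(['market', 'firm'])['beds'].sum()
market_totals = toy.groupby('market')['beds'].sum()
shares = firm_totals.div(market_totals, level='market') * 100
hhi = (shares ** 2).groupby(level='market').sum()
print(hhi)  # 01001 -> 10000.0, 01003 -> 5200.0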