Spaces:
Running
Running
hlnicholls
commited on
Commit
·
8a6cf88
1
Parent(s):
15625ec
feat: updated interace
Browse files- __pycache__/dynamic_shap_plot.cpython-38.pyc +0 -0
- __pycache__/dynamic_shap_plots.cpython-38.pyc +0 -0
- __pycache__/shap_plots.cpython-38.pyc +0 -0
- app.py +132 -101
- dynamic_shap_plot.py +0 -118
- dynamic_shap_plots.py +346 -0
- requirements.txt +1 -0
- shap_plots.py +729 -729
__pycache__/dynamic_shap_plot.cpython-38.pyc
ADDED
Binary file (3.08 kB). View file
|
|
__pycache__/dynamic_shap_plots.cpython-38.pyc
ADDED
Binary file (8.14 kB). View file
|
|
__pycache__/shap_plots.cpython-38.pyc
ADDED
Binary file (16.9 kB). View file
|
|
app.py
CHANGED
@@ -7,11 +7,12 @@ import sklearn
|
|
7 |
import catboost
|
8 |
import shap
|
9 |
from shap_plots import shap_summary_plot
|
10 |
-
from
|
11 |
import plotly.tools as tls
|
12 |
-
|
13 |
-
import matplotlib
|
14 |
import plotly.graph_objs as go
|
|
|
15 |
try:
|
16 |
import matplotlib.pyplot as pl
|
17 |
from matplotlib.colors import LinearSegmentedColormap
|
@@ -21,133 +22,163 @@ except ImportError:
|
|
21 |
|
22 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
23 |
|
24 |
-
seed=
|
25 |
|
26 |
annotations = pd.read_csv("all_genes_merged_ml_data.csv")
|
27 |
-
# TODO remove this placeholder when imputation is finished:
|
28 |
annotations.fillna(0, inplace=True)
|
29 |
annotations = annotations.set_index("Gene")
|
30 |
|
31 |
-
|
32 |
-
model_path = "best_model_fitted.pkl" # Update this path if your model is stored elsewhere
|
33 |
with open(model_path, 'rb') as file:
|
34 |
catboost_model = pickle.load(file)
|
35 |
|
36 |
-
# For a multi-class classification model, obtaining probabilities per class
|
37 |
probabilities = catboost_model.predict_proba(annotations)
|
38 |
-
|
39 |
-
# Creating a DataFrame for these probabilities
|
40 |
-
# Assuming classes are ordered as 'most likely', 'probable', and 'least likely' in the model
|
41 |
-
prob_df = pd.DataFrame(probabilities,
|
42 |
-
index=annotations.index,
|
43 |
-
columns=['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely'])
|
44 |
-
|
45 |
-
# Dynamically including all original features from annotations plus the new probability columns
|
46 |
df_total = pd.concat([prob_df, annotations], axis=1)
|
47 |
|
|
|
|
|
|
|
|
|
48 |
|
49 |
st.title('Blood Pressure Gene Prioritisation Post-GWAS')
|
50 |
-
st.markdown("""
|
51 |
-
A machine learning pipeline for predicting disease-causing genes post-genome-wide association study in blood pressure.
|
52 |
-
|
53 |
-
|
54 |
-
""")
|
55 |
-
|
56 |
-
collect_genes = lambda x : [str(i) for i in re.split(",|,\s+|\s+", x) if i != ""]
|
57 |
|
|
|
|
|
58 |
input_gene_list = st.text_input("Input a list of multiple HGNC genes (enter comma separated):")
|
59 |
gene_list = collect_genes(input_gene_list)
|
60 |
explainer = shap.TreeExplainer(catboost_model)
|
61 |
|
62 |
@st.cache_data
|
63 |
def convert_df(df):
|
64 |
-
|
65 |
|
66 |
probability_columns = ['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely']
|
67 |
features_list = [column for column in df_total.columns if column not in probability_columns]
|
68 |
features = df_total[features_list]
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
"text/csv",
|
88 |
-
key='download-csv'
|
89 |
-
)
|
90 |
-
|
91 |
-
# For SHAP values, assuming explainer is already fitted to your model
|
92 |
-
df_shap = df.drop(columns=probability_columns + ['Gene']) # Exclude non-feature columns
|
93 |
-
shap_values = explainer.shap_values(df_shap)
|
94 |
-
|
95 |
-
# Handle multiclass scenario: SHAP values will be a list of matrices, one per class
|
96 |
-
# Plotting the summary plot for the first class as an example
|
97 |
-
# You may loop through each class or handle it differently based on your needs
|
98 |
-
class_index = 0 # Example: plotting for the first class
|
99 |
-
shap.summary_plot(shap_values[class_index], df_shap, show=False)
|
100 |
-
st.pyplot(bbox_inches='tight')
|
101 |
-
st.caption("SHAP Summary Plot of All Input Genes")
|
102 |
-
|
103 |
-
else:
|
104 |
-
pass
|
105 |
|
|
|
|
|
106 |
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
-
if input_gene:
|
120 |
-
if ' ' in input_gene or ',' in input_gene:
|
121 |
-
st.write('Input Error: Please input only a single HGNC gene name with no white spaces or commas.')
|
122 |
else:
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
-
|
141 |
-
|
142 |
-
""
|
143 |
-
|
144 |
-
|
145 |
-
df_total_output['Gene'] = df_total_output.index
|
146 |
-
df_total_output.reset_index(drop=True, inplace=True)
|
147 |
-
#df_total_output = df_total_output[['Gene','XGB_Score', 'mousescore_Exomiser',
|
148 |
-
# 'SDI', 'Liver_GTExTPM', 'pLI_ExAC',
|
149 |
-
# 'HIPred',
|
150 |
-
# 'Cells - EBV-transformed lymphocytes_GTExTPM',
|
151 |
-
# 'Pituitary_GTExTPM',
|
152 |
-
# 'IPA_BP_annotation']]
|
153 |
-
st.dataframe(df_total_output)
|
|
|
7 |
import catboost
|
8 |
import shap
|
9 |
from shap_plots import shap_summary_plot
|
10 |
+
from dynamic_shap_plots import matplotlib_to_plotly, summary_plot_plotly_fig
|
11 |
import plotly.tools as tls
|
12 |
+
from dash import dcc
|
13 |
+
import matplotlib.pyplot as plt
|
14 |
import plotly.graph_objs as go
|
15 |
+
|
16 |
try:
|
17 |
import matplotlib.pyplot as pl
|
18 |
from matplotlib.colors import LinearSegmentedColormap
|
|
|
22 |
|
23 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
24 |
|
25 |
+
seed = 0
|
26 |
|
27 |
annotations = pd.read_csv("all_genes_merged_ml_data.csv")
|
|
|
28 |
annotations.fillna(0, inplace=True)
|
29 |
annotations = annotations.set_index("Gene")
|
30 |
|
31 |
+
model_path = "best_model_fitted.pkl"
|
|
|
32 |
with open(model_path, 'rb') as file:
|
33 |
catboost_model = pickle.load(file)
|
34 |
|
|
|
35 |
probabilities = catboost_model.predict_proba(annotations)
|
36 |
+
prob_df = pd.DataFrame(probabilities, index=annotations.index, columns=['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
df_total = pd.concat([prob_df, annotations], axis=1)
|
38 |
|
39 |
+
# Create tabs for navigation
|
40 |
+
with st.sidebar:
|
41 |
+
st.sidebar.title("Navigation")
|
42 |
+
tab = st.sidebar.radio("Go to", ("Gene Prioritisation", "Interactive SHAP Plot", "Supervised SHAP Clustering"))
|
43 |
|
44 |
st.title('Blood Pressure Gene Prioritisation Post-GWAS')
|
45 |
+
st.markdown("""A machine learning pipeline for predicting disease-causing genes post-genome-wide association study in blood pressure.""")
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
+
# Define a function to collect genes from input
|
48 |
+
collect_genes = lambda x: [str(i) for i in re.split(",|,\s+|\s+", x) if i != ""]
|
49 |
input_gene_list = st.text_input("Input a list of multiple HGNC genes (enter comma separated):")
|
50 |
gene_list = collect_genes(input_gene_list)
|
51 |
explainer = shap.TreeExplainer(catboost_model)
|
52 |
|
53 |
@st.cache_data
|
54 |
def convert_df(df):
|
55 |
+
return df.to_csv(index=False).encode('utf-8')
|
56 |
|
57 |
probability_columns = ['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely']
|
58 |
features_list = [column for column in df_total.columns if column not in probability_columns]
|
59 |
features = df_total[features_list]
|
60 |
|
61 |
+
# Page 1: Gene Prioritisation
|
62 |
+
if tab == "Gene Prioritisation":
|
63 |
+
if len(gene_list) > 1:
|
64 |
+
df = df_total[df_total.index.isin(gene_list)]
|
65 |
+
df['Gene'] = df.index
|
66 |
+
df.reset_index(drop=True, inplace=True)
|
67 |
+
|
68 |
+
required_columns = ['Gene'] + probability_columns + [column for column in df.columns if column not in probability_columns and column != 'Gene']
|
69 |
+
df = df[required_columns]
|
70 |
+
st.dataframe(df)
|
71 |
+
|
72 |
+
output = df[['Gene'] + probability_columns]
|
73 |
+
csv = convert_df(output)
|
74 |
+
st.download_button("Download Gene Prioritisation", csv, "bp_gene_prioritisation.csv", "text/csv", key='download-csv')
|
75 |
+
|
76 |
+
df_shap = df.drop(columns=probability_columns + ['Gene'])
|
77 |
+
shap_values = explainer.shap_values(df_shap)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
+
col1, col2 = st.columns(2)
|
80 |
+
class_names = ["Most likely", "Probable", "Least likely"]
|
81 |
|
82 |
+
with col1:
|
83 |
+
st.subheader("Global SHAP Summary Plot")
|
84 |
+
shap.summary_plot(shap_values, df_shap, plot_type="bar", class_names=class_names)
|
85 |
+
st.pyplot(bbox_inches='tight', clear_figure=True)
|
86 |
|
87 |
+
with col2:
|
88 |
+
st.subheader(f"{class_names[0]} Gene Prediction")
|
89 |
+
shap.summary_plot(shap_values[0], df_shap)
|
90 |
+
st.pyplot(bbox_inches='tight', clear_figure=True)
|
91 |
+
|
92 |
+
col3, col4 = st.columns(2)
|
93 |
+
|
94 |
+
with col3:
|
95 |
+
st.subheader(f"{class_names[1]} Gene Prediction")
|
96 |
+
shap.summary_plot(shap_values[1], df_shap)
|
97 |
+
st.pyplot(bbox_inches='tight', clear_figure=True)
|
98 |
+
|
99 |
+
with col4:
|
100 |
+
st.subheader(f"{class_names[2]} Gene Prediction")
|
101 |
+
shap.summary_plot(shap_values[2], df_shap)
|
102 |
+
st.pyplot(bbox_inches='tight', clear_figure=True)
|
103 |
|
|
|
|
|
|
|
104 |
else:
|
105 |
+
pass
|
106 |
+
|
107 |
+
input_gene = st.text_input("Input an individual HGNC gene:")
|
108 |
+
if input_gene:
|
109 |
+
df2 = df_total[df_total.index == input_gene]
|
110 |
+
class_names = ["Most likely", "Probable", "Least likely"]
|
111 |
+
if not df2.empty:
|
112 |
+
df2['Gene'] = df2.index
|
113 |
+
df2.reset_index(drop=True, inplace=True)
|
114 |
|
115 |
+
required_columns = ['Gene'] + probability_columns + [col for col in df2.columns if col not in probability_columns and col != 'Gene']
|
116 |
+
df2 = df2[required_columns]
|
117 |
+
st.dataframe(df2)
|
118 |
+
|
119 |
+
if ' ' in input_gene or ',' in input_gene:
|
120 |
+
st.write('Input Error: Please input only a single HGNC gene name with no white spaces or commas.')
|
121 |
+
else:
|
122 |
+
df2_shap = df_total.loc[[input_gene], [col for col in df_total.columns if col not in probability_columns + ['Gene']]]
|
123 |
+
print(df2_shap.columns)
|
124 |
+
shap_values = explainer.shap_values(df2_shap)
|
125 |
+
shap.getjs()
|
126 |
+
|
127 |
+
for i in range(3):
|
128 |
+
st.subheader(f"Force Plot for {class_names[i]} Prediction")
|
129 |
+
force_plot = shap.force_plot(
|
130 |
+
explainer.expected_value[i],
|
131 |
+
shap_values[i],
|
132 |
+
df2_shap,
|
133 |
+
matplotlib=True,
|
134 |
+
show=False
|
135 |
+
)
|
136 |
+
st.pyplot(fig=force_plot)
|
137 |
+
else:
|
138 |
+
st.write("Gene not found in the dataset.")
|
139 |
+
else:
|
140 |
+
pass
|
141 |
+
|
142 |
+
st.markdown("""
|
143 |
+
### Total Gene Prioritisation Results for All Genes:
|
144 |
+
""")
|
145 |
+
|
146 |
+
df_total_output = df_total
|
147 |
+
df_total_output['Gene'] = df_total_output.index
|
148 |
+
#df_total_output.reset_index(drop=True, inplace=True)
|
149 |
+
st.dataframe(df_total_output)
|
150 |
+
csv = convert_df(df_total_output)
|
151 |
+
st.download_button("Download Gene Prioritisation", csv, "all_genes_bp_prioritisation.csv", "text/csv", key='download-all-csv')
|
152 |
+
|
153 |
+
# Page 2: Interactive SHAP Plot
|
154 |
+
|
155 |
+
elif tab == "Interactive SHAP Plot":
|
156 |
+
st.title("Interactive SHAP Plot")
|
157 |
+
if len(gene_list) > 1:
|
158 |
+
df = df_total[df_total.index.isin(gene_list)]
|
159 |
+
df['Gene'] = df.index
|
160 |
+
df.reset_index(drop=True, inplace=True)
|
161 |
+
|
162 |
+
required_columns = ['Gene'] + probability_columns + [column for column in df.columns if column not in probability_columns and column != 'Gene']
|
163 |
+
df = df[required_columns]
|
164 |
+
st.dataframe(df)
|
165 |
+
|
166 |
+
output = df[['Gene'] + probability_columns]
|
167 |
+
csv = convert_df(output)
|
168 |
+
st.download_button("Download Gene Prioritisation", csv, "bp_gene_prioritisation.csv", "text/csv", key='download-csv')
|
169 |
+
|
170 |
+
df_shap = df.drop(columns=probability_columns + ['Gene'])
|
171 |
+
shap_values = explainer.shap_values(df_shap)
|
172 |
+
|
173 |
+
# Use shap's summary_plot function for interactivity
|
174 |
+
# summary_plot = shap.summary_plot(shap_values[0], df_shap, plot_type='interactive', max_display=10)
|
175 |
+
summary_plot = summary_plot_plotly_fig(df_shap, shap_values[0], max_display=10)
|
176 |
+
st.pyplot(summary_plot)
|
177 |
+
st.caption("SHAP Summary Plot of All Input Genes")
|
178 |
+
|
179 |
|
180 |
+
# Page 3: Supervised SHAP Clustering
|
181 |
+
elif tab == "Supervised SHAP Clustering":
|
182 |
+
st.title("Supervised SHAP Clustering")
|
183 |
+
# Add your code here to implement supervised SHAP clustering
|
184 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dynamic_shap_plot.py
DELETED
@@ -1,118 +0,0 @@
|
|
1 |
-
from shap_plots import shap_summary_plot, shap_dependence_plot
|
2 |
-
import plotly.tools as tls
|
3 |
-
import dash_core_components as dcc
|
4 |
-
import pandas as pd
|
5 |
-
import numpy as np
|
6 |
-
import xgboost
|
7 |
-
import shap
|
8 |
-
import matplotlib
|
9 |
-
import plotly.graph_objs as go
|
10 |
-
try:
|
11 |
-
import matplotlib.pyplot as pl
|
12 |
-
from matplotlib.colors import LinearSegmentedColormap
|
13 |
-
from matplotlib.ticker import MaxNLocator
|
14 |
-
except ImportError:
|
15 |
-
pass
|
16 |
-
from sklearn import preprocessing
|
17 |
-
|
18 |
-
cdict1 = {
|
19 |
-
'red': ((0.0, 0.11764705882352941, 0.11764705882352941),
|
20 |
-
(1.0, 0.9607843137254902, 0.9607843137254902)),
|
21 |
-
|
22 |
-
'green': ((0.0, 0.5333333333333333, 0.5333333333333333),
|
23 |
-
(1.0, 0.15294117647058825, 0.15294117647058825)),
|
24 |
-
|
25 |
-
'blue': ((0.0, 0.8980392156862745, 0.8980392156862745),
|
26 |
-
(1.0, 0.3411764705882353, 0.3411764705882353)),
|
27 |
-
|
28 |
-
'alpha': ((0.0, 1, 1),
|
29 |
-
(0.5, 1, 1),
|
30 |
-
(1.0, 1, 1))
|
31 |
-
} # #1E88E5 -> #ff0052
|
32 |
-
red_blue = LinearSegmentedColormap('RedBlue', cdict1)
|
33 |
-
|
34 |
-
def matplotlib_to_plotly(cmap, pl_entries):
|
35 |
-
h = 1.0/(pl_entries-1)
|
36 |
-
pl_colorscale = []
|
37 |
-
|
38 |
-
for k in range(pl_entries):
|
39 |
-
C = list(map(np.uint8, np.array(cmap(k*h)[:3])*255))
|
40 |
-
pl_colorscale.append([k*h, 'rgb'+str((C[0], C[1], C[2]))])
|
41 |
-
|
42 |
-
return pl_colorscale
|
43 |
-
|
44 |
-
red_blue = matplotlib_to_plotly(red_blue, 255)
|
45 |
-
|
46 |
-
def summary_plot_plotly_fig(shap_values, df_shap, feature_names, max_display = 8):
|
47 |
-
#data = pd.read_csv(dataset, encoding="ISO-8859-1")
|
48 |
-
#X = data.drop(['target column'], axis=1)
|
49 |
-
|
50 |
-
#y = data[target]
|
51 |
-
#y = y/max(y)
|
52 |
-
|
53 |
-
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
|
54 |
-
|
55 |
-
#X_train.fillna((-999), inplace=True)
|
56 |
-
#X_test.fillna((-999), inplace=True)
|
57 |
-
|
58 |
-
#_, shap_values, feature_names = train_model_and_return_shap_values(X, y, target)
|
59 |
-
|
60 |
-
mpl_fig = shap_summary_plot(shap_values, df_shap, feature_names=feature_names, max_display=20)
|
61 |
-
|
62 |
-
plotly_fig = tls.mpl_to_plotly(mpl_fig)
|
63 |
-
|
64 |
-
plotly_fig['layout'] = {'xaxis': {'title': 'SHAP value (impact on model output)'}}
|
65 |
-
|
66 |
-
feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0)[:-1])
|
67 |
-
feature_order = feature_order[-min(max_display, len(feature_order)):]
|
68 |
-
text = [df_shap.index[i] for i in df_shap.index]
|
69 |
-
text = iter(text)
|
70 |
-
|
71 |
-
for i in range(1, len(plotly_fig['data']), 2):
|
72 |
-
t = text.__next__()
|
73 |
-
plotly_fig['data'][i]['name'] = ''
|
74 |
-
plotly_fig['data'][i]['text'] = t
|
75 |
-
plotly_fig['data'][i]['hoverinfo'] = 'text'
|
76 |
-
#plotly_fig['data'][i]['text'] = df_shap.index
|
77 |
-
plotly_fig['data'][i]['y'] = feature_names[feature_order]
|
78 |
-
|
79 |
-
|
80 |
-
colorbar_trace = go.Scatter(x=[None],
|
81 |
-
y=[None],
|
82 |
-
mode='markers',
|
83 |
-
marker=dict(
|
84 |
-
colorscale=red_blue,
|
85 |
-
showscale=True,
|
86 |
-
cmin=-5,
|
87 |
-
cmax=5,
|
88 |
-
colorbar=dict(thickness=5, tickvals=[-5, 5], ticktext=['Low', 'High'], outlinewidth=0)
|
89 |
-
),
|
90 |
-
hoverinfo='none'
|
91 |
-
)
|
92 |
-
|
93 |
-
plotly_fig['layout']['showlegend'] = False
|
94 |
-
plotly_fig['layout']['hovermode'] = 'closest'
|
95 |
-
plotly_fig['layout']['height']=600
|
96 |
-
plotly_fig['layout']['width']=500
|
97 |
-
|
98 |
-
plotly_fig['layout']['xaxis'].update(zeroline=True, showline=True, ticklen=4, showgrid=False)
|
99 |
-
plotly_fig['layout']['yaxis'].update(dict(visible=True))
|
100 |
-
plotly_fig.add_trace(colorbar_trace)
|
101 |
-
plotly_fig.layout.update(
|
102 |
-
annotations=[dict(
|
103 |
-
x=1.18,
|
104 |
-
align="right",
|
105 |
-
valign="top",
|
106 |
-
text='Gene',
|
107 |
-
showarrow=False,
|
108 |
-
xref="paper",
|
109 |
-
yref="paper",
|
110 |
-
xanchor="right",
|
111 |
-
yanchor="middle",
|
112 |
-
textangle=-90,
|
113 |
-
font=dict(family='Calibri', size=14)
|
114 |
-
)
|
115 |
-
],
|
116 |
-
margin=dict(t=20)
|
117 |
-
)
|
118 |
-
return plotly_fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dynamic_shap_plots.py
ADDED
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from shap_plots import shap_summary_plot, shap_dependence_plot
|
2 |
+
import plotly.tools as tls
|
3 |
+
import dash_core_components as dcc
|
4 |
+
import pandas as pd
|
5 |
+
from sklearn.model_selection import train_test_split
|
6 |
+
import numpy as np
|
7 |
+
import xgboost
|
8 |
+
import shap
|
9 |
+
import matplotlib
|
10 |
+
import plotly.graph_objs as go
|
11 |
+
try:
|
12 |
+
import matplotlib.pyplot as pl
|
13 |
+
from matplotlib.colors import LinearSegmentedColormap
|
14 |
+
from matplotlib.ticker import MaxNLocator
|
15 |
+
except ImportError:
|
16 |
+
pass
|
17 |
+
from sklearn import preprocessing
|
18 |
+
|
19 |
+
cdict1 = {
|
20 |
+
'red': ((0.0, 0.11764705882352941, 0.11764705882352941),
|
21 |
+
(1.0, 0.9607843137254902, 0.9607843137254902)),
|
22 |
+
|
23 |
+
'green': ((0.0, 0.5333333333333333, 0.5333333333333333),
|
24 |
+
(1.0, 0.15294117647058825, 0.15294117647058825)),
|
25 |
+
|
26 |
+
'blue': ((0.0, 0.8980392156862745, 0.8980392156862745),
|
27 |
+
(1.0, 0.3411764705882353, 0.3411764705882353)),
|
28 |
+
|
29 |
+
'alpha': ((0.0, 1, 1),
|
30 |
+
(0.5, 1, 1),
|
31 |
+
(1.0, 1, 1))
|
32 |
+
} # #1E88E5 -> #ff0052
|
33 |
+
red_blue = LinearSegmentedColormap('RedBlue', cdict1)
|
34 |
+
|
35 |
+
def matplotlib_to_plotly(cmap, pl_entries):
|
36 |
+
h = 1.0/(pl_entries-1)
|
37 |
+
pl_colorscale = []
|
38 |
+
|
39 |
+
for k in range(pl_entries):
|
40 |
+
C = list(map(np.uint8, np.array(cmap(k*h)[:3])*255))
|
41 |
+
pl_colorscale.append([k*h, 'rgb'+str((C[0], C[1], C[2]))])
|
42 |
+
|
43 |
+
return pl_colorscale
|
44 |
+
|
45 |
+
red_blue = matplotlib_to_plotly(red_blue, 255)
|
46 |
+
|
47 |
+
def summary_plot_plotly_fig(dataset, shap_values, target='target column', max_display = 20):
|
48 |
+
feature_names=dataset.columns
|
49 |
+
mpl_fig = shap_summary_plot(shap_values, dataset, feature_names=feature_names, max_display=20)
|
50 |
+
|
51 |
+
plotly_fig = tls.mpl_to_plotly(mpl_fig)
|
52 |
+
|
53 |
+
plotly_fig['layout'] = {'xaxis': {'title': 'SHAP value (impact on model output)'}}
|
54 |
+
|
55 |
+
feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0)[:-1])
|
56 |
+
feature_order = feature_order[-min(max_display, len(feature_order)):]
|
57 |
+
text = [feature_names[i] for i in feature_order]
|
58 |
+
text = iter(text)
|
59 |
+
|
60 |
+
for i in range(1, len(plotly_fig['data']), 2):
|
61 |
+
t = text.__next__()
|
62 |
+
plotly_fig['data'][i]['name'] = ''
|
63 |
+
plotly_fig['data'][i]['text'] = t
|
64 |
+
plotly_fig['data'][i]['hoverinfo'] = 'text'
|
65 |
+
|
66 |
+
colorbar_trace = go.Scatter(x=[None],
|
67 |
+
y=[None],
|
68 |
+
mode='markers',
|
69 |
+
marker=dict(
|
70 |
+
colorscale=red_blue,
|
71 |
+
showscale=True,
|
72 |
+
cmin=-5,
|
73 |
+
cmax=5,
|
74 |
+
colorbar=dict(thickness=5, tickvals=[-5, 5], ticktext=['Low', 'High'], outlinewidth=0)
|
75 |
+
),
|
76 |
+
hoverinfo='none'
|
77 |
+
)
|
78 |
+
|
79 |
+
plotly_fig['layout']['showlegend'] = False
|
80 |
+
plotly_fig['layout']['hovermode'] = 'closest'
|
81 |
+
plotly_fig['layout']['height']=600
|
82 |
+
plotly_fig['layout']['width']=500
|
83 |
+
|
84 |
+
plotly_fig['layout']['xaxis'].update(zeroline=True, showline=True, ticklen=4, showgrid=False)
|
85 |
+
plotly_fig['layout']['yaxis'].update(dict(visible=False))
|
86 |
+
plotly_fig.add_trace(colorbar_trace)
|
87 |
+
plotly_fig.layout.update(
|
88 |
+
annotations=[dict(
|
89 |
+
x=1.18,
|
90 |
+
align="right",
|
91 |
+
valign="top",
|
92 |
+
text='Feature value',
|
93 |
+
showarrow=False,
|
94 |
+
xref="paper",
|
95 |
+
yref="paper",
|
96 |
+
xanchor="right",
|
97 |
+
yanchor="middle",
|
98 |
+
textangle=-90,
|
99 |
+
font=dict(family='Calibri', size=14)
|
100 |
+
)
|
101 |
+
],
|
102 |
+
margin=dict(t=20)
|
103 |
+
)
|
104 |
+
return plotly_fig
|
105 |
+
|
106 |
+
def train_model_and_return_shap_values(X, y, target):
|
107 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
|
108 |
+
|
109 |
+
X_train.fillna((-999), inplace=True)
|
110 |
+
X_test.fillna((-999), inplace=True)
|
111 |
+
|
112 |
+
# Some of values are float or integer and some object. This is why we need to cast them:
|
113 |
+
for f in X_train.columns:
|
114 |
+
if X_train[f].dtype=='object':
|
115 |
+
lbl = preprocessing.LabelEncoder()
|
116 |
+
lbl.fit(list(X_train[f].values))
|
117 |
+
X_train[f] = lbl.transform(list(X_train[f].values))
|
118 |
+
|
119 |
+
for f in X_test.columns:
|
120 |
+
if X_test[f].dtype=='object':
|
121 |
+
lbl = preprocessing.LabelEncoder()
|
122 |
+
lbl.fit(list(X_test[f].values))
|
123 |
+
X_test[f] = lbl.transform(list(X_test[f].values))
|
124 |
+
|
125 |
+
X_train=np.array(X_train)
|
126 |
+
X_test=np.array(X_test)
|
127 |
+
X_train = X_train.astype(float)
|
128 |
+
X_test = X_test.astype(float)
|
129 |
+
|
130 |
+
d_train = xgboost.DMatrix(X_train, label=y_train, feature_names=list(X))
|
131 |
+
d_test = xgboost.DMatrix(X_test, label=y_test, feature_names=list(X))
|
132 |
+
|
133 |
+
# train the model
|
134 |
+
params = {
|
135 |
+
"eta": 0.01,
|
136 |
+
"subsample": 0.5,
|
137 |
+
"base_score": np.mean(y_train),
|
138 |
+
"silent": 1
|
139 |
+
}
|
140 |
+
|
141 |
+
model = xgboost.train(params, d_train, 5000, evals = [(d_test, "test")], verbose_eval=None, early_stopping_rounds=50)
|
142 |
+
feature_names = model.feature_names
|
143 |
+
shap_values = shap.TreeExplainer(model).shap_values(pd.DataFrame(X_train, columns=X.columns))
|
144 |
+
return model, shap_values, feature_names
|
145 |
+
|
146 |
+
def dependence_plot_to_plotly_fig(dataset, target='target column', max_display=10):
|
147 |
+
data = pd.read_csv(dataset, encoding="ISO-8859-1")
|
148 |
+
X = data.drop(['target column'], axis=1)
|
149 |
+
y = data[target]
|
150 |
+
y = y/max(y)
|
151 |
+
|
152 |
+
xgb_full = xgboost.DMatrix(X, label=y)
|
153 |
+
|
154 |
+
# create a train/test split
|
155 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
|
156 |
+
xgb_train = xgboost.DMatrix(X_train, label=y_train)
|
157 |
+
xgb_test = xgboost.DMatrix(X_test, label=y_test)
|
158 |
+
|
159 |
+
# use validation set to choose # of trees
|
160 |
+
params = {
|
161 |
+
# "eta": 0.002,
|
162 |
+
# "max_depth": 3,
|
163 |
+
# "subsample": 0.5,
|
164 |
+
"silent": 1
|
165 |
+
}
|
166 |
+
model_train = xgboost.train(params, xgb_train, 3000, evals = [(xgb_test, "test")], verbose_eval=None)
|
167 |
+
|
168 |
+
# train final model on the full data set
|
169 |
+
params = {
|
170 |
+
# "eta": 0.002,
|
171 |
+
# "max_depth": 3,
|
172 |
+
# "subsample": 0.5,
|
173 |
+
"silent": 1
|
174 |
+
}
|
175 |
+
model = xgboost.train(params, xgb_full, 1500, evals = [(xgb_full, "test")], verbose_eval=None)
|
176 |
+
features = model.feature_names
|
177 |
+
shap_values = shap.TreeExplainer(model).shap_values(X)
|
178 |
+
|
179 |
+
feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0)[:-1])
|
180 |
+
feature_order = feature_order[-min(max_display, len(feature_order)):]
|
181 |
+
features = [features[i] for i in feature_order[::-1]]
|
182 |
+
|
183 |
+
lis = []
|
184 |
+
for i in features:
|
185 |
+
mpl_fig, interaction_index = shap_dependence_plot(i, shap_values, X)
|
186 |
+
plotly_fig = tls.mpl_to_plotly(mpl_fig)
|
187 |
+
|
188 |
+
# The x-tick labels start by default from 0, which is not necessarily the min value of the feature.
|
189 |
+
# So, we need to increment the x-tick labels by 1. But while doing so, the y-axis gets shifted.
|
190 |
+
# To prevent that, we need to manually control the x-axis range from r_min to r_max
|
191 |
+
new_x = []
|
192 |
+
for j in plotly_fig['data'][0]['x']:
|
193 |
+
new_x.append(j)
|
194 |
+
|
195 |
+
r_min = min(plotly_fig['data'][0]['x'])
|
196 |
+
r_max = max(plotly_fig['data'][0]['x'])
|
197 |
+
|
198 |
+
plotly_fig['layout']['xaxis'].update(range=[r_min-1, r_max+1])
|
199 |
+
plotly_fig['data'][0]['x'] = tuple(new_x)
|
200 |
+
|
201 |
+
# Define the colorbar
|
202 |
+
colorbar_trace = go.Scatter(x=[None],
|
203 |
+
y=[None],
|
204 |
+
mode='markers',
|
205 |
+
marker=dict(
|
206 |
+
colorscale=red_blue,
|
207 |
+
showscale=True,
|
208 |
+
colorbar=dict(thickness=5, outlinewidth=0),
|
209 |
+
color=[min(X[X.columns[interaction_index]]), max(X[X.columns[interaction_index]])],
|
210 |
+
),
|
211 |
+
hoverinfo='none'
|
212 |
+
)
|
213 |
+
|
214 |
+
plotly_fig['layout']['showlegend'] = False
|
215 |
+
plotly_fig['layout']['hovermode'] = 'closest'
|
216 |
+
plotly_fig['layout']['height']=380
|
217 |
+
plotly_fig['layout']['width']=450
|
218 |
+
plotly_fig['layout']['xaxis'].update(zeroline=True,
|
219 |
+
showline=True,
|
220 |
+
ticklen=4,
|
221 |
+
showgrid=False,
|
222 |
+
tickmode='linear')
|
223 |
+
title = plotly_fig['layout']['yaxis']['title']
|
224 |
+
plotly_fig['layout']['yaxis'].update(title=title.split(' -')[0])
|
225 |
+
|
226 |
+
plotly_fig.add_trace(colorbar_trace)
|
227 |
+
plotly_fig.layout.update(
|
228 |
+
annotations=[dict(
|
229 |
+
x=1.23,
|
230 |
+
align="right",
|
231 |
+
valign="top",
|
232 |
+
text=X.columns[interaction_index],
|
233 |
+
showarrow=False,
|
234 |
+
xref="paper",
|
235 |
+
yref="paper",
|
236 |
+
xanchor="right",
|
237 |
+
yanchor="middle",
|
238 |
+
textangle=-90,
|
239 |
+
font=dict(family='Calibri', size=14)
|
240 |
+
)
|
241 |
+
],
|
242 |
+
margin=dict(t=50, b=50, l=50, r=80)
|
243 |
+
)
|
244 |
+
lis.append(plotly_fig)
|
245 |
+
return lis, features
|
246 |
+
|
247 |
+
def interaction_plot_to_plotly_fig(dataset, target_col='target column', max_display=10):
|
248 |
+
data = pd.read_csv(dataset, encoding="ISO-8859-1")
|
249 |
+
X = data.drop(['target column'], axis=1)
|
250 |
+
y = data[target_col]
|
251 |
+
y = y/max(y)
|
252 |
+
|
253 |
+
xgb_full = xgboost.DMatrix(X, label=y)
|
254 |
+
|
255 |
+
# create a train/test split
|
256 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
|
257 |
+
xgb_train = xgboost.DMatrix(X_train, label=y_train)
|
258 |
+
xgb_test = xgboost.DMatrix(X_test, label=y_test)
|
259 |
+
|
260 |
+
# use validation set to choose # of trees
|
261 |
+
params = {
|
262 |
+
# "eta": 0.002,
|
263 |
+
# "max_depth": 3,
|
264 |
+
# "subsample": 0.5,
|
265 |
+
"silent": 1
|
266 |
+
}
|
267 |
+
model_train = xgboost.train(params, xgb_train, 3000, evals = [(xgb_test, "test")], verbose_eval=None)
|
268 |
+
|
269 |
+
# train final model on the full data set
|
270 |
+
params = {
|
271 |
+
# "eta": 0.002,
|
272 |
+
# "max_depth": 3,
|
273 |
+
# "subsample": 0.5,
|
274 |
+
"silent": 1
|
275 |
+
}
|
276 |
+
model = xgboost.train(params, xgb_full, 1500, evals = [(xgb_full, "test")], verbose_eval=None)
|
277 |
+
features = model.feature_names
|
278 |
+
shap_values = shap.TreeExplainer(model).shap_values(X)
|
279 |
+
|
280 |
+
feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0)[:-1])
|
281 |
+
feature_order = feature_order[-min(max_display, len(feature_order)):]
|
282 |
+
features = [features[i] for i in feature_order[::-1]]
|
283 |
+
|
284 |
+
shap_interaction_values = shap.TreeExplainer(model).shap_interaction_values(X)
|
285 |
+
|
286 |
+
lis = []
|
287 |
+
for i in features:
|
288 |
+
for j in features:
|
289 |
+
mpl_fig = pl.figure()
|
290 |
+
ax = mpl_fig.add_subplot(111)
|
291 |
+
_, interaction_index = shap_dependence_plot ( (i, j), shap_interaction_values, X.iloc[:2000,:] )
|
292 |
+
plotly_fig = tls.mpl_to_plotly(mpl_fig)
|
293 |
+
|
294 |
+
r_min = min(plotly_fig['data'][0]['x'])
|
295 |
+
r_max = max(plotly_fig['data'][0]['x'])
|
296 |
+
|
297 |
+
plotly_fig['layout']['xaxis'].update(range=[r_min-1, r_max+1])
|
298 |
+
plotly_fig['layout']['showlegend'] = False
|
299 |
+
plotly_fig['layout']['hovermode'] = 'closest'
|
300 |
+
plotly_fig['layout']['height']=380
|
301 |
+
plotly_fig['layout']['width']=450
|
302 |
+
plotly_fig['layout']['xaxis'].update(zeroline=True,
|
303 |
+
showline=True,
|
304 |
+
ticklen=4,
|
305 |
+
showgrid=False,
|
306 |
+
tickmode='linear')
|
307 |
+
plotly_fig['layout']['yaxis'].update(showline=True)
|
308 |
+
|
309 |
+
if i!=j:
|
310 |
+
# plotly_fig['layout']['height']=380
|
311 |
+
plotly_fig['layout']['width']=480
|
312 |
+
plotly_fig['layout']['yaxis']['title'] = "SHAP interaction value for {} and {}".format(i.split('-')[0], j.split('-')[0])
|
313 |
+
# Define the colorbar
|
314 |
+
colorbar_trace = go.Scatter(x=[None],
|
315 |
+
y=[None],
|
316 |
+
mode='markers',
|
317 |
+
marker=dict(
|
318 |
+
colorscale=red_blue,
|
319 |
+
showscale=True,
|
320 |
+
colorbar=dict(thickness=5, outlinewidth=0),
|
321 |
+
color=[min(X[X.columns[interaction_index]]), max(X[X.columns[interaction_index]])],
|
322 |
+
),
|
323 |
+
hoverinfo='none'
|
324 |
+
)
|
325 |
+
plotly_fig.add_trace(colorbar_trace)
|
326 |
+
plotly_fig.layout.update(
|
327 |
+
annotations=[dict(
|
328 |
+
x=1.23,
|
329 |
+
align="right",
|
330 |
+
valign="top",
|
331 |
+
text=X.columns[interaction_index],
|
332 |
+
showarrow=False,
|
333 |
+
xref="paper",
|
334 |
+
yref="paper",
|
335 |
+
xanchor="right",
|
336 |
+
yanchor="middle",
|
337 |
+
textangle=-90,
|
338 |
+
font=dict(family='Calibri', size=14)
|
339 |
+
)
|
340 |
+
],
|
341 |
+
margin=dict(t=30, b=30, l=60, r=80)
|
342 |
+
)
|
343 |
+
else:
|
344 |
+
plotly_fig['layout']['yaxis']['title'] = "SHAP main effect value for {}".format(i.split('-')[0])
|
345 |
+
lis.append(plotly_fig)
|
346 |
+
return lis, features
|
requirements.txt
CHANGED
@@ -3,6 +3,7 @@ numpy==1.23.4
|
|
3 |
altair==5.1.2
|
4 |
scikit-learn==1.1.3
|
5 |
pandas
|
|
|
6 |
xgboost==1.3.3
|
7 |
shap==0.41.0
|
8 |
plotly
|
|
|
3 |
altair==5.1.2
|
4 |
scikit-learn==1.1.3
|
5 |
pandas
|
6 |
+
catboost
|
7 |
xgboost==1.3.3
|
8 |
shap==0.41.0
|
9 |
plotly
|
shap_plots.py
CHANGED
@@ -1,730 +1,730 @@
|
|
1 |
-
import warnings
|
2 |
-
import iml
|
3 |
-
import numpy as np
|
4 |
-
from iml import Instance, Model
|
5 |
-
from iml.datatypes import DenseData
|
6 |
-
from iml.explanations import AdditiveExplanation
|
7 |
-
from iml.links import IdentityLink
|
8 |
-
from scipy.stats import gaussian_kde
|
9 |
-
import matplotlib
|
10 |
-
try:
|
11 |
-
import matplotlib.pyplot as pl
|
12 |
-
from matplotlib.colors import LinearSegmentedColormap
|
13 |
-
from matplotlib.ticker import MaxNLocator
|
14 |
-
|
15 |
-
cdict1 = {
|
16 |
-
'red': ((0.0, 0.11764705882352941, 0.11764705882352941),
|
17 |
-
(1.0, 0.9607843137254902, 0.9607843137254902)),
|
18 |
-
|
19 |
-
'green': ((0.0, 0.5333333333333333, 0.5333333333333333),
|
20 |
-
(1.0, 0.15294117647058825, 0.15294117647058825)),
|
21 |
-
|
22 |
-
'blue': ((0.0, 0.8980392156862745, 0.8980392156862745),
|
23 |
-
(1.0, 0.3411764705882353, 0.3411764705882353)),
|
24 |
-
|
25 |
-
'alpha': ((0.0, 1, 1),
|
26 |
-
(0.5, 0.3, 0.3),
|
27 |
-
(1.0, 1, 1))
|
28 |
-
} # #1E88E5 -> #ff0052
|
29 |
-
red_blue = LinearSegmentedColormap('RedBlue', cdict1)
|
30 |
-
|
31 |
-
cdict1 = {
|
32 |
-
'red': ((0.0, 0.11764705882352941, 0.11764705882352941),
|
33 |
-
(1.0, 0.9607843137254902, 0.9607843137254902)),
|
34 |
-
|
35 |
-
'green': ((0.0, 0.5333333333333333, 0.5333333333333333),
|
36 |
-
(1.0, 0.15294117647058825, 0.15294117647058825)),
|
37 |
-
|
38 |
-
'blue': ((0.0, 0.8980392156862745, 0.8980392156862745),
|
39 |
-
(1.0, 0.3411764705882353, 0.3411764705882353)),
|
40 |
-
|
41 |
-
'alpha': ((0.0, 1, 1),
|
42 |
-
(0.5, 1, 1),
|
43 |
-
(1.0, 1, 1))
|
44 |
-
} # #1E88E5 -> #ff0052
|
45 |
-
red_blue_solid = LinearSegmentedColormap('RedBlue', cdict1)
|
46 |
-
except ImportError:
|
47 |
-
pass
|
48 |
-
|
49 |
-
labels = {
|
50 |
-
'MAIN_EFFECT': "SHAP main effect value for\n%s",
|
51 |
-
'INTERACTION_VALUE': "SHAP interaction value",
|
52 |
-
'INTERACTION_EFFECT': "SHAP interaction value for\n%s and %s",
|
53 |
-
'VALUE': "SHAP value (impact on model output)",
|
54 |
-
'VALUE_FOR': "SHAP value for\n%s",
|
55 |
-
'PLOT_FOR': "SHAP plot for %s",
|
56 |
-
'FEATURE': "Feature %s",
|
57 |
-
'FEATURE_VALUE': "Feature value",
|
58 |
-
'FEATURE_VALUE_LOW': "Low",
|
59 |
-
'FEATURE_VALUE_HIGH': "High",
|
60 |
-
'JOINT_VALUE': "Joint SHAP value"
|
61 |
-
}
|
62 |
-
|
63 |
-
def shap_summary_plot(shap_values, features=None, feature_names=None, max_display=None, plot_type="dot",
|
64 |
-
color=None, axis_color="#333333", title=None, alpha=1, show=True, sort=True,
|
65 |
-
color_bar=True, auto_size_plot=True, layered_violin_max_num_bins=20):
|
66 |
-
"""Create a SHAP summary plot, colored by feature values when they are provided.
|
67 |
-
|
68 |
-
Parameters
|
69 |
-
----------
|
70 |
-
shap_values : numpy.array
|
71 |
-
Matrix of SHAP values (# samples x # features)
|
72 |
-
|
73 |
-
features : numpy.array or pandas.DataFrame or list
|
74 |
-
Matrix of feature values (# samples x # features) or a feature_names list as shorthand
|
75 |
-
|
76 |
-
feature_names : list
|
77 |
-
Names of the features (length # features)
|
78 |
-
|
79 |
-
max_display : int
|
80 |
-
How many top features to include in the plot (default is 20, or 7 for interaction plots)
|
81 |
-
|
82 |
-
plot_type : "dot" (default) or "violin"
|
83 |
-
What type of summary plot to produce
|
84 |
-
"""
|
85 |
-
|
86 |
-
assert len(shap_values.shape) != 1, "Summary plots need a matrix of shap_values, not a vector."
|
87 |
-
|
88 |
-
# default color:
|
89 |
-
if color is None:
|
90 |
-
color = "coolwarm" if plot_type == 'layered_violin' else "#ff0052"
|
91 |
-
|
92 |
-
# convert from a DataFrame or other types
|
93 |
-
if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
|
94 |
-
if feature_names is None:
|
95 |
-
feature_names = features.columns
|
96 |
-
features = features.values
|
97 |
-
elif str(type(features)) == "<class 'list'>":
|
98 |
-
if feature_names is None:
|
99 |
-
feature_names = features
|
100 |
-
features = None
|
101 |
-
elif (features is not None) and len(features.shape) == 1 and feature_names is None:
|
102 |
-
feature_names = features
|
103 |
-
features = None
|
104 |
-
|
105 |
-
if feature_names is None:
|
106 |
-
feature_names = [labels['FEATURE'] % str(i) for i in range(shap_values.shape[1] - 1)]
|
107 |
-
|
108 |
-
mpl_fig = pl.figure(figsize=(1.5 * max_display + 1, 1 * max_display + 1))
|
109 |
-
|
110 |
-
# plotting SHAP interaction values
|
111 |
-
if len(shap_values.shape) == 3:
|
112 |
-
if max_display is None:
|
113 |
-
max_display = 7
|
114 |
-
else:
|
115 |
-
max_display = min(len(feature_names), max_display)
|
116 |
-
|
117 |
-
sort_inds = np.argsort(-np.abs(shap_values[:, :-1, :-1].sum(1)).sum(0))
|
118 |
-
|
119 |
-
# get plotting limits
|
120 |
-
delta = 1.0 / (shap_values.shape[1] ** 2)
|
121 |
-
slow = np.nanpercentile(shap_values, delta)
|
122 |
-
shigh = np.nanpercentile(shap_values, 100 - delta)
|
123 |
-
v = max(abs(slow), abs(shigh))
|
124 |
-
slow = -0.2
|
125 |
-
shigh = 0.2
|
126 |
-
|
127 |
-
# mpl_fig = pl.figure(figsize=(1.5 * max_display + 1, 1 * max_display + 1))
|
128 |
-
ax = mpl_fig.subplot(1, max_display, 1)
|
129 |
-
proj_shap_values = shap_values[:, sort_inds[0], np.hstack((sort_inds, len(sort_inds)))]
|
130 |
-
proj_shap_values[:, 1:] *= 2 # because off diag effects are split in half
|
131 |
-
shap_summary_plot(
|
132 |
-
proj_shap_values, features[:, sort_inds],
|
133 |
-
feature_names=feature_names[sort_inds],
|
134 |
-
sort=False, show=False, color_bar=False,
|
135 |
-
auto_size_plot=False,
|
136 |
-
max_display=max_display
|
137 |
-
)
|
138 |
-
pl.xlim((slow, shigh))
|
139 |
-
pl.xlabel("")
|
140 |
-
title_length_limit = 11
|
141 |
-
pl.title(shorten_text(feature_names[sort_inds[0]], title_length_limit))
|
142 |
-
for i in range(1, max_display):
|
143 |
-
ind = sort_inds[i]
|
144 |
-
pl.subplot(1, max_display, i + 1)
|
145 |
-
proj_shap_values = shap_values[:, ind, np.hstack((sort_inds, len(sort_inds)))]
|
146 |
-
proj_shap_values *= 2
|
147 |
-
proj_shap_values[:, i] /= 2 # because only off diag effects are split in half
|
148 |
-
shap_summary_plot(
|
149 |
-
proj_shap_values, features[:, sort_inds],
|
150 |
-
sort=False,
|
151 |
-
feature_names=
|
152 |
-
show=False,
|
153 |
-
color_bar=False,
|
154 |
-
auto_size_plot=False,
|
155 |
-
max_display=max_display
|
156 |
-
)
|
157 |
-
pl.xlim((slow, shigh))
|
158 |
-
pl.xlabel("")
|
159 |
-
if i == max_display // 2:
|
160 |
-
pl.xlabel(labels['INTERACTION_VALUE'])
|
161 |
-
pl.title(shorten_text(feature_names[ind], title_length_limit))
|
162 |
-
pl.tight_layout(pad=0, w_pad=0, h_pad=0.0)
|
163 |
-
pl.subplots_adjust(hspace=0, wspace=0.1)
|
164 |
-
# if show:
|
165 |
-
# # pl.show()
|
166 |
-
return mpl_fig
|
167 |
-
|
168 |
-
if max_display is None:
|
169 |
-
max_display = 20
|
170 |
-
|
171 |
-
if sort:
|
172 |
-
# order features by the sum of their effect magnitudes
|
173 |
-
feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0)[:-1])
|
174 |
-
feature_order = feature_order[-min(max_display, len(feature_order)):]
|
175 |
-
else:
|
176 |
-
feature_order = np.flip(np.arange(min(max_display, shap_values.shape[1] - 1)), 0)
|
177 |
-
|
178 |
-
row_height = 0.4
|
179 |
-
if auto_size_plot:
|
180 |
-
pl.gcf().set_size_inches(8, len(feature_order) * row_height + 1.5)
|
181 |
-
pl.axvline(x=0, color="#999999", zorder=-1)
|
182 |
-
|
183 |
-
if plot_type == "dot":
|
184 |
-
for pos, i in enumerate(feature_order):
|
185 |
-
pl.axhline(y=pos, color="#cccccc", lw=0.5, dashes=(1, 5), zorder=-1)
|
186 |
-
shaps = shap_values[:, i]
|
187 |
-
values = None if features is None else features[:, i]
|
188 |
-
inds = np.arange(len(shaps))
|
189 |
-
np.random.shuffle(inds)
|
190 |
-
if values is not None:
|
191 |
-
values = values[inds]
|
192 |
-
shaps = shaps[inds]
|
193 |
-
colored_feature = True
|
194 |
-
try:
|
195 |
-
values = np.array(values, dtype=np.float64) # make sure this can be numeric
|
196 |
-
except:
|
197 |
-
colored_feature = False
|
198 |
-
N = len(shaps)
|
199 |
-
# hspacing = (np.max(shaps) - np.min(shaps)) / 200
|
200 |
-
# curr_bin = []
|
201 |
-
nbins = 100
|
202 |
-
quant = np.round(nbins * (shaps - np.min(shaps)) / (np.max(shaps) - np.min(shaps) + 1e-8))
|
203 |
-
inds = np.argsort(quant + np.random.randn(N) * 1e-6)
|
204 |
-
layer = 0
|
205 |
-
last_bin = -1
|
206 |
-
ys = np.zeros(N)
|
207 |
-
for ind in inds:
|
208 |
-
if quant[ind] != last_bin:
|
209 |
-
layer = 0
|
210 |
-
ys[ind] = np.ceil(layer / 2) * ((layer % 2) * 2 - 1)
|
211 |
-
layer += 1
|
212 |
-
last_bin = quant[ind]
|
213 |
-
ys *= 0.9 * (row_height / np.max(ys + 1))
|
214 |
-
|
215 |
-
if features is not None and colored_feature:
|
216 |
-
# trim the color range, but prevent the color range from collapsing
|
217 |
-
vmin = np.nanpercentile(values, 5)
|
218 |
-
vmax = np.nanpercentile(values, 95)
|
219 |
-
if vmin == vmax:
|
220 |
-
vmin = np.nanpercentile(values, 1)
|
221 |
-
vmax = np.nanpercentile(values, 99)
|
222 |
-
if vmin == vmax:
|
223 |
-
vmin = np.min(values)
|
224 |
-
vmax = np.max(values)
|
225 |
-
|
226 |
-
assert features.shape[0] == len(shaps), "Feature and SHAP matrices must have the same number of rows!"
|
227 |
-
nan_mask = np.isnan(values)
|
228 |
-
pl.scatter(shaps[nan_mask], pos + ys[nan_mask], color="#777777", vmin=vmin,
|
229 |
-
vmax=vmax, s=16, alpha=alpha, linewidth=0,
|
230 |
-
zorder=3, rasterized=len(shaps) > 500)
|
231 |
-
pl.scatter(shaps[np.invert(nan_mask)], pos + ys[np.invert(nan_mask)],
|
232 |
-
cmap=red_blue, vmin=vmin, vmax=vmax, s=16,
|
233 |
-
c=values[np.invert(nan_mask)], alpha=alpha, linewidth=0,
|
234 |
-
zorder=3, rasterized=len(shaps) > 500)
|
235 |
-
else:
|
236 |
-
|
237 |
-
pl.scatter(shaps, pos + ys, s=16, alpha=alpha, linewidth=0, zorder=3,
|
238 |
-
color=color if colored_feature else "#777777", rasterized=len(shaps) > 500)
|
239 |
-
|
240 |
-
elif plot_type == "violin":
|
241 |
-
for pos, i in enumerate(feature_order):
|
242 |
-
pl.axhline(y=pos, color="#cccccc", lw=0.5, dashes=(1, 5), zorder=-1)
|
243 |
-
|
244 |
-
if features is not None:
|
245 |
-
global_low = np.nanpercentile(shap_values[:, :len(feature_names)].flatten(), 1)
|
246 |
-
global_high = np.nanpercentile(shap_values[:, :len(feature_names)].flatten(), 99)
|
247 |
-
for pos, i in enumerate(feature_order):
|
248 |
-
shaps = shap_values[:, i]
|
249 |
-
shap_min, shap_max = np.min(shaps), np.max(shaps)
|
250 |
-
rng = shap_max - shap_min
|
251 |
-
xs = np.linspace(np.min(shaps) - rng * 0.2, np.max(shaps) + rng * 0.2, 100)
|
252 |
-
if np.std(shaps) < (global_high - global_low) / 100:
|
253 |
-
ds = gaussian_kde(shaps + np.random.randn(len(shaps)) * (global_high - global_low) / 100)(xs)
|
254 |
-
else:
|
255 |
-
ds = gaussian_kde(shaps)(xs)
|
256 |
-
ds /= np.max(ds) * 3
|
257 |
-
|
258 |
-
values = features[:, i]
|
259 |
-
window_size = max(10, len(values) // 20)
|
260 |
-
smooth_values = np.zeros(len(xs) - 1)
|
261 |
-
sort_inds = np.argsort(shaps)
|
262 |
-
trailing_pos = 0
|
263 |
-
leading_pos = 0
|
264 |
-
running_sum = 0
|
265 |
-
back_fill = 0
|
266 |
-
for j in range(len(xs) - 1):
|
267 |
-
|
268 |
-
while leading_pos < len(shaps) and xs[j] >= shaps[sort_inds[leading_pos]]:
|
269 |
-
running_sum += values[sort_inds[leading_pos]]
|
270 |
-
leading_pos += 1
|
271 |
-
if leading_pos - trailing_pos > 20:
|
272 |
-
running_sum -= values[sort_inds[trailing_pos]]
|
273 |
-
trailing_pos += 1
|
274 |
-
if leading_pos - trailing_pos > 0:
|
275 |
-
smooth_values[j] = running_sum / (leading_pos - trailing_pos)
|
276 |
-
for k in range(back_fill):
|
277 |
-
smooth_values[j - k - 1] = smooth_values[j]
|
278 |
-
else:
|
279 |
-
back_fill += 1
|
280 |
-
|
281 |
-
vmin = np.nanpercentile(values, 5)
|
282 |
-
vmax = np.nanpercentile(values, 95)
|
283 |
-
if vmin == vmax:
|
284 |
-
vmin = np.nanpercentile(values, 1)
|
285 |
-
vmax = np.nanpercentile(values, 99)
|
286 |
-
if vmin == vmax:
|
287 |
-
vmin = np.min(values)
|
288 |
-
vmax = np.max(values)
|
289 |
-
pl.scatter(shaps, np.ones(shap_values.shape[0]) * pos, s=9, cmap=red_blue_solid, vmin=vmin, vmax=vmax,
|
290 |
-
c=values, alpha=alpha, linewidth=0, zorder=1)
|
291 |
-
# smooth_values -= nxp.nanpercentile(smooth_values, 5)
|
292 |
-
# smooth_values /= np.nanpercentile(smooth_values, 95)
|
293 |
-
smooth_values -= vmin
|
294 |
-
if vmax - vmin > 0:
|
295 |
-
smooth_values /= vmax - vmin
|
296 |
-
for i in range(len(xs) - 1):
|
297 |
-
if ds[i] > 0.05 or ds[i + 1] > 0.05:
|
298 |
-
pl.fill_between([xs[i], xs[i + 1]], [pos + ds[i], pos + ds[i + 1]],
|
299 |
-
[pos - ds[i], pos - ds[i + 1]], color=red_blue_solid(smooth_values[i]),
|
300 |
-
zorder=2)
|
301 |
-
|
302 |
-
else:
|
303 |
-
parts = pl.violinplot(shap_values[:, feature_order], range(len(feature_order)), points=200, vert=False,
|
304 |
-
widths=0.7,
|
305 |
-
showmeans=False, showextrema=False, showmedians=False)
|
306 |
-
|
307 |
-
for pc in parts['bodies']:
|
308 |
-
pc.set_facecolor(color)
|
309 |
-
pc.set_edgecolor('none')
|
310 |
-
pc.set_alpha(alpha)
|
311 |
-
|
312 |
-
elif plot_type == "layered_violin": # courtesy of @kodonnell
|
313 |
-
num_x_points = 200
|
314 |
-
bins = np.linspace(0, features.shape[0], layered_violin_max_num_bins + 1).round(0).astype(
|
315 |
-
'int') # the indices of the feature data corresponding to each bin
|
316 |
-
shap_min, shap_max = np.min(shap_values[:, :-1]), np.max(shap_values[:, :-1])
|
317 |
-
x_points = np.linspace(shap_min, shap_max, num_x_points)
|
318 |
-
|
319 |
-
# loop through each feature and plot:
|
320 |
-
for pos, ind in enumerate(feature_order):
|
321 |
-
# decide how to handle: if #unique < layered_violin_max_num_bins then split by unique value, otherwise use bins/percentiles.
|
322 |
-
# to keep simpler code, in the case of uniques, we just adjust the bins to align with the unique counts.
|
323 |
-
feature = features[:, ind]
|
324 |
-
unique, counts = np.unique(feature, return_counts=True)
|
325 |
-
if unique.shape[0] <= layered_violin_max_num_bins:
|
326 |
-
order = np.argsort(unique)
|
327 |
-
thesebins = np.cumsum(counts[order])
|
328 |
-
thesebins = np.insert(thesebins, 0, 0)
|
329 |
-
else:
|
330 |
-
thesebins = bins
|
331 |
-
nbins = thesebins.shape[0] - 1
|
332 |
-
# order the feature data so we can apply percentiling
|
333 |
-
order = np.argsort(feature)
|
334 |
-
# x axis is located at y0 = pos, with pos being there for offset
|
335 |
-
y0 = np.ones(num_x_points) * pos
|
336 |
-
# calculate kdes:
|
337 |
-
ys = np.zeros((nbins, num_x_points))
|
338 |
-
for i in range(nbins):
|
339 |
-
# get shap values in this bin:
|
340 |
-
shaps = shap_values[order[thesebins[i]:thesebins[i + 1]], ind]
|
341 |
-
# if there's only one element, then we can't
|
342 |
-
if shaps.shape[0] == 1:
|
343 |
-
warnings.warn(
|
344 |
-
"not enough data in bin #%d for feature %s, so it'll be ignored. Try increasing the number of records to plot."
|
345 |
-
% (i, feature_names[ind]))
|
346 |
-
# to ignore it, just set it to the previous y-values (so the area between them will be zero). Not ys is already 0, so there's
|
347 |
-
# nothing to do if i == 0
|
348 |
-
if i > 0:
|
349 |
-
ys[i, :] = ys[i - 1, :]
|
350 |
-
continue
|
351 |
-
# save kde of them: note that we add a tiny bit of gaussian noise to avoid singular matrix errors
|
352 |
-
ys[i, :] = gaussian_kde(shaps + np.random.normal(loc=0, scale=0.001, size=shaps.shape[0]))(x_points)
|
353 |
-
# scale it up so that the 'size' of each y represents the size of the bin. For continuous data this will
|
354 |
-
# do nothing, but when we've gone with the unqique option, this will matter - e.g. if 99% are male and 1%
|
355 |
-
# female, we want the 1% to appear a lot smaller.
|
356 |
-
size = thesebins[i + 1] - thesebins[i]
|
357 |
-
bin_size_if_even = features.shape[0] / nbins
|
358 |
-
relative_bin_size = size / bin_size_if_even
|
359 |
-
ys[i, :] *= relative_bin_size
|
360 |
-
# now plot 'em. We don't plot the individual strips, as this can leave whitespace between them.
|
361 |
-
# instead, we plot the full kde, then remove outer strip and plot over it, etc., to ensure no
|
362 |
-
# whitespace
|
363 |
-
ys = np.cumsum(ys, axis=0)
|
364 |
-
width = 0.8
|
365 |
-
scale = ys.max() * 2 / width # 2 is here as we plot both sides of x axis
|
366 |
-
for i in range(nbins - 1, -1, -1):
|
367 |
-
y = ys[i, :] / scale
|
368 |
-
c = pl.get_cmap(color)(i / (
|
369 |
-
nbins - 1)) if color in pl.cm.datad else color # if color is a cmap, use it, otherwise use a color
|
370 |
-
pl.fill_between(x_points, pos - y, pos + y, facecolor=c)
|
371 |
-
pl.xlim(shap_min, shap_max)
|
372 |
-
|
373 |
-
# draw the color bar
|
374 |
-
if color_bar and features is not None and (plot_type != "layered_violin" or color in pl.cm.datad):
|
375 |
-
import matplotlib.cm as cm
|
376 |
-
m = cm.ScalarMappable(cmap=red_blue_solid if plot_type != "layered_violin" else pl.get_cmap(color))
|
377 |
-
m.set_array([0, 1])
|
378 |
-
cb = pl.colorbar(m, ticks=[0, 1], aspect=1000)
|
379 |
-
cb.set_ticklabels([labels['FEATURE_VALUE_LOW'], labels['FEATURE_VALUE_HIGH']])
|
380 |
-
cb.set_label(labels['FEATURE_VALUE'], size=12, labelpad=0)
|
381 |
-
cb.ax.tick_params(labelsize=11, length=0)
|
382 |
-
cb.set_alpha(1)
|
383 |
-
cb.outline.set_visible(False)
|
384 |
-
bbox = cb.ax.get_window_extent().transformed(pl.gcf().dpi_scale_trans.inverted())
|
385 |
-
cb.ax.set_aspect((bbox.height - 0.9) * 20)
|
386 |
-
# cb.draw_all()
|
387 |
-
|
388 |
-
pl.gca().xaxis.set_ticks_position('bottom')
|
389 |
-
pl.gca().yaxis.set_ticks_position('none')
|
390 |
-
pl.gca().spines['right'].set_visible(False)
|
391 |
-
pl.gca().spines['top'].set_visible(False)
|
392 |
-
pl.gca().spines['left'].set_visible(False)
|
393 |
-
pl.gca().tick_params(color=axis_color, labelcolor=axis_color)
|
394 |
-
pl.yticks(range(len(feature_order)), [feature_names[i] for i in feature_order], fontsize=13)
|
395 |
-
pl.gca().tick_params('y', length=20, width=0.5, which='major')
|
396 |
-
pl.gca().tick_params('x', labelsize=11)
|
397 |
-
pl.ylim(-1, len(feature_order))
|
398 |
-
pl.xlabel(labels['VALUE'], fontsize=13)
|
399 |
-
pl.tight_layout()
|
400 |
-
# if show:
|
401 |
-
# pl.show()
|
402 |
-
return mpl_fig
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
def approx_interactions(index, shap_values, X):
|
410 |
-
""" Order other features by how much interaction they seem to have with the feature at the given index.
|
411 |
-
|
412 |
-
This just bins the SHAP values for a feature along that feature's value. For true Shapley interaction
|
413 |
-
index values for SHAP see the interaction_contribs option implemented in XGBoost.
|
414 |
-
"""
|
415 |
-
|
416 |
-
if X.shape[0] > 10000:
|
417 |
-
a = np.arange(X.shape[0])
|
418 |
-
np.random.shuffle(a)
|
419 |
-
inds = a[:10000]
|
420 |
-
else:
|
421 |
-
inds = np.arange(X.shape[0])
|
422 |
-
|
423 |
-
x = X[inds, index]
|
424 |
-
srt = np.argsort(x)
|
425 |
-
shap_ref = shap_values[inds, index]
|
426 |
-
shap_ref = shap_ref[srt]
|
427 |
-
inc = max(min(int(len(x) / 10.0), 50), 1)
|
428 |
-
interactions = []
|
429 |
-
for i in range(X.shape[1]):
|
430 |
-
val_other = X[inds, i][srt].astype(np.float)
|
431 |
-
v = 0.0
|
432 |
-
if not (i == index or np.sum(np.abs(val_other)) < 1e-8):
|
433 |
-
for j in range(0, len(x), inc):
|
434 |
-
if np.std(val_other[j:j + inc]) > 0 and np.std(shap_ref[j:j + inc]) > 0:
|
435 |
-
v += abs(np.corrcoef(shap_ref[j:j + inc], val_other[j:j + inc])[0, 1])
|
436 |
-
interactions.append(v)
|
437 |
-
|
438 |
-
return np.argsort(-np.abs(interactions))
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
def shap_dependence_plot(ind, shap_values, features, feature_names=None, display_features=None,
|
447 |
-
interaction_index="auto", color="#1E88E5", axis_color="#333333",
|
448 |
-
dot_size=16, alpha=1, title=None, show=True):
|
449 |
-
"""
|
450 |
-
Create a SHAP dependence plot, colored by an interaction feature.
|
451 |
-
|
452 |
-
Parameters
|
453 |
-
----------
|
454 |
-
ind : int
|
455 |
-
Index of the feature to plot.
|
456 |
-
|
457 |
-
shap_values : numpy.array
|
458 |
-
Matrix of SHAP values (# samples x # features)
|
459 |
-
|
460 |
-
features : numpy.array or pandas.DataFrame
|
461 |
-
(The remainder of the removed shap_plots.py — old lines ~461-730, the tail of shap_dependence_plot — is omitted here: the deleted content is line-for-line identical to the corresponding part of the re-added file below.)
import warnings
import iml
import numpy as np
from iml import Instance, Model
from iml.datatypes import DenseData
from iml.explanations import AdditiveExplanation
from iml.links import IdentityLink
from scipy.stats import gaussian_kde
import matplotlib
try:
    import matplotlib.pyplot as pl
    from matplotlib.colors import LinearSegmentedColormap
    from matplotlib.ticker import MaxNLocator

    cdict1 = {
        'red': ((0.0, 0.11764705882352941, 0.11764705882352941),
                (1.0, 0.9607843137254902, 0.9607843137254902)),

        'green': ((0.0, 0.5333333333333333, 0.5333333333333333),
                  (1.0, 0.15294117647058825, 0.15294117647058825)),

        'blue': ((0.0, 0.8980392156862745, 0.8980392156862745),
                 (1.0, 0.3411764705882353, 0.3411764705882353)),

        'alpha': ((0.0, 1, 1),
                  (0.5, 0.3, 0.3),
                  (1.0, 1, 1))
    }  # #1E88E5 -> #ff0052
    red_blue = LinearSegmentedColormap('RedBlue', cdict1)

    cdict1 = {
        'red': ((0.0, 0.11764705882352941, 0.11764705882352941),
                (1.0, 0.9607843137254902, 0.9607843137254902)),

        'green': ((0.0, 0.5333333333333333, 0.5333333333333333),
                  (1.0, 0.15294117647058825, 0.15294117647058825)),

        'blue': ((0.0, 0.8980392156862745, 0.8980392156862745),
                 (1.0, 0.3411764705882353, 0.3411764705882353)),

        'alpha': ((0.0, 1, 1),
                  (0.5, 1, 1),
                  (1.0, 1, 1))
    }  # #1E88E5 -> #ff0052
    red_blue_solid = LinearSegmentedColormap('RedBlue', cdict1)
except ImportError:
    pass

labels = {
    'MAIN_EFFECT': "SHAP main effect value for\n%s",
    'INTERACTION_VALUE': "SHAP interaction value",
    'INTERACTION_EFFECT': "SHAP interaction value for\n%s and %s",
    'VALUE': "SHAP value (impact on model output)",
    'VALUE_FOR': "SHAP value for\n%s",
    'PLOT_FOR': "SHAP plot for %s",
    'FEATURE': "Feature %s",
    'FEATURE_VALUE': "Feature value",
    'FEATURE_VALUE_LOW': "Low",
    'FEATURE_VALUE_HIGH': "High",
    'JOINT_VALUE': "Joint SHAP value"
}
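The two colormaps above differ only in their alpha ramp: red_blue fades towards transparency in the middle of its range, while red_blue_solid stays fully opaque. A throwaway check (run wherever the module's names are importable; the printed alpha of roughly 0.3 versus 1.0 is the only difference):

# compare the alpha channel of the two colormaps at mid-range
from shap_plots import red_blue, red_blue_solid
print(red_blue(0.5))        # RGBA tuple, alpha ~0.3 (transparent mid-range)
print(red_blue_solid(0.5))  # RGBA tuple, alpha = 1.0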
def shap_summary_plot(shap_values, features=None, feature_names=None, max_display=None, plot_type="dot",
                      color=None, axis_color="#333333", title=None, alpha=1, show=True, sort=True,
                      color_bar=True, auto_size_plot=True, layered_violin_max_num_bins=20):
    """Create a SHAP summary plot, colored by feature values when they are provided.

    Parameters
    ----------
    shap_values : numpy.array
        Matrix of SHAP values (# samples x # features)

    features : numpy.array or pandas.DataFrame or list
        Matrix of feature values (# samples x # features) or a feature_names list as shorthand

    feature_names : list
        Names of the features (length # features)

    max_display : int
        How many top features to include in the plot (default is 20, or 7 for interaction plots)

    plot_type : "dot" (default) or "violin"
        What type of summary plot to produce
    """

    assert len(shap_values.shape) != 1, "Summary plots need a matrix of shap_values, not a vector."

    # default color:
    if color is None:
        color = "coolwarm" if plot_type == 'layered_violin' else "#ff0052"

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = features.columns
        features = features.values
    elif str(type(features)) == "<class 'list'>":
        if feature_names is None:
            feature_names = features
        features = None
    elif (features is not None) and len(features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    if feature_names is None:
        feature_names = [labels['FEATURE'] % str(i) for i in range(shap_values.shape[1] - 1)]

    mpl_fig = pl.figure(figsize=(1.5 * max_display + 1, 1 * max_display + 1))

    # plotting SHAP interaction values
    if len(shap_values.shape) == 3:
        if max_display is None:
            max_display = 7
        else:
            max_display = min(len(feature_names), max_display)

        sort_inds = np.argsort(-np.abs(shap_values[:, :-1, :-1].sum(1)).sum(0))

        # get plotting limits
        delta = 1.0 / (shap_values.shape[1] ** 2)
        slow = np.nanpercentile(shap_values, delta)
        shigh = np.nanpercentile(shap_values, 100 - delta)
        v = max(abs(slow), abs(shigh))
        slow = -0.2
        shigh = 0.2

        # mpl_fig = pl.figure(figsize=(1.5 * max_display + 1, 1 * max_display + 1))
        ax = mpl_fig.subplot(1, max_display, 1)
        proj_shap_values = shap_values[:, sort_inds[0], np.hstack((sort_inds, len(sort_inds)))]
        proj_shap_values[:, 1:] *= 2  # because off diag effects are split in half
        shap_summary_plot(
            proj_shap_values, features[:, sort_inds],
            feature_names=feature_names[sort_inds],
            sort=False, show=False, color_bar=False,
            auto_size_plot=False,
            max_display=max_display
        )
        pl.xlim((slow, shigh))
        pl.xlabel("")
        title_length_limit = 11
        pl.title(shorten_text(feature_names[sort_inds[0]], title_length_limit))
        for i in range(1, max_display):
            ind = sort_inds[i]
            pl.subplot(1, max_display, i + 1)
            proj_shap_values = shap_values[:, ind, np.hstack((sort_inds, len(sort_inds)))]
            proj_shap_values *= 2
            proj_shap_values[:, i] /= 2  # because only off diag effects are split in half
            shap_summary_plot(
                proj_shap_values, features[:, sort_inds],
                sort=False,
                feature_names=["" for i in range(features.shape[1])],
                show=False,
                color_bar=False,
                auto_size_plot=False,
                max_display=max_display
            )
            pl.xlim((slow, shigh))
            pl.xlabel("")
            if i == max_display // 2:
                pl.xlabel(labels['INTERACTION_VALUE'])
            pl.title(shorten_text(feature_names[ind], title_length_limit))
        pl.tight_layout(pad=0, w_pad=0, h_pad=0.0)
        pl.subplots_adjust(hspace=0, wspace=0.1)
        # if show:
        #     # pl.show()
        return mpl_fig

    if max_display is None:
        max_display = 20

    if sort:
        # order features by the sum of their effect magnitudes
        feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0)[:-1])
        feature_order = feature_order[-min(max_display, len(feature_order)):]
    else:
        feature_order = np.flip(np.arange(min(max_display, shap_values.shape[1] - 1)), 0)

    row_height = 0.4
    if auto_size_plot:
        pl.gcf().set_size_inches(8, len(feature_order) * row_height + 1.5)
    pl.axvline(x=0, color="#999999", zorder=-1)

    if plot_type == "dot":
        for pos, i in enumerate(feature_order):
            pl.axhline(y=pos, color="#cccccc", lw=0.5, dashes=(1, 5), zorder=-1)
            shaps = shap_values[:, i]
            values = None if features is None else features[:, i]
            inds = np.arange(len(shaps))
            np.random.shuffle(inds)
            if values is not None:
                values = values[inds]
            shaps = shaps[inds]
            colored_feature = True
            try:
                values = np.array(values, dtype=np.float64)  # make sure this can be numeric
            except:
                colored_feature = False
            N = len(shaps)
            # hspacing = (np.max(shaps) - np.min(shaps)) / 200
            # curr_bin = []
            nbins = 100
            quant = np.round(nbins * (shaps - np.min(shaps)) / (np.max(shaps) - np.min(shaps) + 1e-8))
            inds = np.argsort(quant + np.random.randn(N) * 1e-6)
            layer = 0
            last_bin = -1
            ys = np.zeros(N)
            for ind in inds:
                if quant[ind] != last_bin:
                    layer = 0
                ys[ind] = np.ceil(layer / 2) * ((layer % 2) * 2 - 1)
                layer += 1
                last_bin = quant[ind]
            ys *= 0.9 * (row_height / np.max(ys + 1))

            if features is not None and colored_feature:
                # trim the color range, but prevent the color range from collapsing
                vmin = np.nanpercentile(values, 5)
                vmax = np.nanpercentile(values, 95)
                if vmin == vmax:
                    vmin = np.nanpercentile(values, 1)
                    vmax = np.nanpercentile(values, 99)
                    if vmin == vmax:
                        vmin = np.min(values)
                        vmax = np.max(values)

                assert features.shape[0] == len(shaps), "Feature and SHAP matrices must have the same number of rows!"
                nan_mask = np.isnan(values)
                pl.scatter(shaps[nan_mask], pos + ys[nan_mask], color="#777777", vmin=vmin,
                           vmax=vmax, s=16, alpha=alpha, linewidth=0,
                           zorder=3, rasterized=len(shaps) > 500)
                pl.scatter(shaps[np.invert(nan_mask)], pos + ys[np.invert(nan_mask)],
                           cmap=red_blue, vmin=vmin, vmax=vmax, s=16,
                           c=values[np.invert(nan_mask)], alpha=alpha, linewidth=0,
                           zorder=3, rasterized=len(shaps) > 500)
            else:

                pl.scatter(shaps, pos + ys, s=16, alpha=alpha, linewidth=0, zorder=3,
                           color=color if colored_feature else "#777777", rasterized=len(shaps) > 500)

    elif plot_type == "violin":
        for pos, i in enumerate(feature_order):
            pl.axhline(y=pos, color="#cccccc", lw=0.5, dashes=(1, 5), zorder=-1)

        if features is not None:
            global_low = np.nanpercentile(shap_values[:, :len(feature_names)].flatten(), 1)
            global_high = np.nanpercentile(shap_values[:, :len(feature_names)].flatten(), 99)
            for pos, i in enumerate(feature_order):
                shaps = shap_values[:, i]
                shap_min, shap_max = np.min(shaps), np.max(shaps)
                rng = shap_max - shap_min
                xs = np.linspace(np.min(shaps) - rng * 0.2, np.max(shaps) + rng * 0.2, 100)
                if np.std(shaps) < (global_high - global_low) / 100:
                    ds = gaussian_kde(shaps + np.random.randn(len(shaps)) * (global_high - global_low) / 100)(xs)
                else:
                    ds = gaussian_kde(shaps)(xs)
                ds /= np.max(ds) * 3

                values = features[:, i]
                window_size = max(10, len(values) // 20)
                smooth_values = np.zeros(len(xs) - 1)
                sort_inds = np.argsort(shaps)
                trailing_pos = 0
                leading_pos = 0
                running_sum = 0
                back_fill = 0
                for j in range(len(xs) - 1):

                    while leading_pos < len(shaps) and xs[j] >= shaps[sort_inds[leading_pos]]:
                        running_sum += values[sort_inds[leading_pos]]
                        leading_pos += 1
                        if leading_pos - trailing_pos > 20:
                            running_sum -= values[sort_inds[trailing_pos]]
                            trailing_pos += 1
                    if leading_pos - trailing_pos > 0:
                        smooth_values[j] = running_sum / (leading_pos - trailing_pos)
                        for k in range(back_fill):
                            smooth_values[j - k - 1] = smooth_values[j]
                    else:
                        back_fill += 1

                vmin = np.nanpercentile(values, 5)
                vmax = np.nanpercentile(values, 95)
                if vmin == vmax:
                    vmin = np.nanpercentile(values, 1)
                    vmax = np.nanpercentile(values, 99)
                    if vmin == vmax:
                        vmin = np.min(values)
                        vmax = np.max(values)
                pl.scatter(shaps, np.ones(shap_values.shape[0]) * pos, s=9, cmap=red_blue_solid, vmin=vmin, vmax=vmax,
                           c=values, alpha=alpha, linewidth=0, zorder=1)
                # smooth_values -= nxp.nanpercentile(smooth_values, 5)
                # smooth_values /= np.nanpercentile(smooth_values, 95)
                smooth_values -= vmin
                if vmax - vmin > 0:
                    smooth_values /= vmax - vmin
                for i in range(len(xs) - 1):
                    if ds[i] > 0.05 or ds[i + 1] > 0.05:
                        pl.fill_between([xs[i], xs[i + 1]], [pos + ds[i], pos + ds[i + 1]],
                                        [pos - ds[i], pos - ds[i + 1]], color=red_blue_solid(smooth_values[i]),
                                        zorder=2)

        else:
            parts = pl.violinplot(shap_values[:, feature_order], range(len(feature_order)), points=200, vert=False,
                                  widths=0.7,
                                  showmeans=False, showextrema=False, showmedians=False)

            for pc in parts['bodies']:
                pc.set_facecolor(color)
                pc.set_edgecolor('none')
                pc.set_alpha(alpha)

    elif plot_type == "layered_violin":  # courtesy of @kodonnell
        num_x_points = 200
        bins = np.linspace(0, features.shape[0], layered_violin_max_num_bins + 1).round(0).astype(
            'int')  # the indices of the feature data corresponding to each bin
        shap_min, shap_max = np.min(shap_values[:, :-1]), np.max(shap_values[:, :-1])
        x_points = np.linspace(shap_min, shap_max, num_x_points)

        # loop through each feature and plot:
        for pos, ind in enumerate(feature_order):
            # decide how to handle: if #unique < layered_violin_max_num_bins then split by unique value, otherwise use bins/percentiles.
            # to keep simpler code, in the case of uniques, we just adjust the bins to align with the unique counts.
            feature = features[:, ind]
            unique, counts = np.unique(feature, return_counts=True)
            if unique.shape[0] <= layered_violin_max_num_bins:
                order = np.argsort(unique)
                thesebins = np.cumsum(counts[order])
                thesebins = np.insert(thesebins, 0, 0)
            else:
                thesebins = bins
            nbins = thesebins.shape[0] - 1
            # order the feature data so we can apply percentiling
            order = np.argsort(feature)
            # x axis is located at y0 = pos, with pos being there for offset
            y0 = np.ones(num_x_points) * pos
            # calculate kdes:
            ys = np.zeros((nbins, num_x_points))
            for i in range(nbins):
                # get shap values in this bin:
                shaps = shap_values[order[thesebins[i]:thesebins[i + 1]], ind]
                # if there's only one element, then we can't
                if shaps.shape[0] == 1:
                    warnings.warn(
                        "not enough data in bin #%d for feature %s, so it'll be ignored. Try increasing the number of records to plot."
                        % (i, feature_names[ind]))
                    # to ignore it, just set it to the previous y-values (so the area between them will be zero). Not ys is already 0, so there's
                    # nothing to do if i == 0
                    if i > 0:
                        ys[i, :] = ys[i - 1, :]
                    continue
                # save kde of them: note that we add a tiny bit of gaussian noise to avoid singular matrix errors
                ys[i, :] = gaussian_kde(shaps + np.random.normal(loc=0, scale=0.001, size=shaps.shape[0]))(x_points)
                # scale it up so that the 'size' of each y represents the size of the bin. For continuous data this will
                # do nothing, but when we've gone with the unqique option, this will matter - e.g. if 99% are male and 1%
                # female, we want the 1% to appear a lot smaller.
                size = thesebins[i + 1] - thesebins[i]
                bin_size_if_even = features.shape[0] / nbins
                relative_bin_size = size / bin_size_if_even
                ys[i, :] *= relative_bin_size
            # now plot 'em. We don't plot the individual strips, as this can leave whitespace between them.
            # instead, we plot the full kde, then remove outer strip and plot over it, etc., to ensure no
            # whitespace
            ys = np.cumsum(ys, axis=0)
            width = 0.8
            scale = ys.max() * 2 / width  # 2 is here as we plot both sides of x axis
            for i in range(nbins - 1, -1, -1):
                y = ys[i, :] / scale
                c = pl.get_cmap(color)(i / (
                    nbins - 1)) if color in pl.cm.datad else color  # if color is a cmap, use it, otherwise use a color
                pl.fill_between(x_points, pos - y, pos + y, facecolor=c)
        pl.xlim(shap_min, shap_max)

    # draw the color bar
    if color_bar and features is not None and (plot_type != "layered_violin" or color in pl.cm.datad):
        import matplotlib.cm as cm
        m = cm.ScalarMappable(cmap=red_blue_solid if plot_type != "layered_violin" else pl.get_cmap(color))
        m.set_array([0, 1])
        cb = pl.colorbar(m, ticks=[0, 1], aspect=1000)
        cb.set_ticklabels([labels['FEATURE_VALUE_LOW'], labels['FEATURE_VALUE_HIGH']])
        cb.set_label(labels['FEATURE_VALUE'], size=12, labelpad=0)
        cb.ax.tick_params(labelsize=11, length=0)
        cb.set_alpha(1)
        cb.outline.set_visible(False)
        bbox = cb.ax.get_window_extent().transformed(pl.gcf().dpi_scale_trans.inverted())
        cb.ax.set_aspect((bbox.height - 0.9) * 20)
        # cb.draw_all()

    pl.gca().xaxis.set_ticks_position('bottom')
    pl.gca().yaxis.set_ticks_position('none')
    pl.gca().spines['right'].set_visible(False)
    pl.gca().spines['top'].set_visible(False)
    pl.gca().spines['left'].set_visible(False)
    pl.gca().tick_params(color=axis_color, labelcolor=axis_color)
    pl.yticks(range(len(feature_order)), [feature_names[i] for i in feature_order], fontsize=13)
    pl.gca().tick_params('y', length=20, width=0.5, which='major')
    pl.gca().tick_params('x', labelsize=11)
    pl.ylim(-1, len(feature_order))
    pl.xlabel(labels['VALUE'], fontsize=13)
    pl.tight_layout()
    # if show:
    #     pl.show()
    return mpl_fig
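shap_summary_plot returns the matplotlib figure instead of showing it, which is what lets the app hand it to Plotly for interactive display. Two points worth noting when calling it: the SHAP matrix is expected in the old layout with one extra trailing column (the expected value), since the code drops the last column with [:-1] indexing; and max_display should be passed explicitly, because the figure size is computed from it before the default of 20 is applied. A minimal usage sketch, assuming a classic shap/NumPy/matplotlib stack compatible with this module and a placeholder scikit-learn model (not the app's CatBoost model):

import numpy as np
import shap
import plotly.tools as tls
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from shap_plots import shap_summary_plot

# placeholder data and model, for illustration only
X, y = load_diabetes(return_X_y=True, as_frame=True)
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)                        # (n_samples, n_features)
expected = float(np.atleast_1d(explainer.expected_value)[0])
shap_matrix = np.hstack([shap_values, np.full((X.shape[0], 1), expected)])  # trailing column is ignored via [:-1]

fig = shap_summary_plot(shap_matrix, X, feature_names=X.columns, max_display=10, show=False)
plotly_fig = tls.mpl_to_plotly(fig)   # conversion is lossy; e.g. the colour bar may be dropped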
def approx_interactions(index, shap_values, X):
    """ Order other features by how much interaction they seem to have with the feature at the given index.

    This just bins the SHAP values for a feature along that feature's value. For true Shapley interaction
    index values for SHAP see the interaction_contribs option implemented in XGBoost.
    """

    if X.shape[0] > 10000:
        a = np.arange(X.shape[0])
        np.random.shuffle(a)
        inds = a[:10000]
    else:
        inds = np.arange(X.shape[0])

    x = X[inds, index]
    srt = np.argsort(x)
    shap_ref = shap_values[inds, index]
    shap_ref = shap_ref[srt]
    inc = max(min(int(len(x) / 10.0), 50), 1)
    interactions = []
    for i in range(X.shape[1]):
        val_other = X[inds, i][srt].astype(np.float)
        v = 0.0
        if not (i == index or np.sum(np.abs(val_other)) < 1e-8):
            for j in range(0, len(x), inc):
                if np.std(val_other[j:j + inc]) > 0 and np.std(shap_ref[j:j + inc]) > 0:
                    v += abs(np.corrcoef(shap_ref[j:j + inc], val_other[j:j + inc])[0, 1])
        interactions.append(v)

    return np.argsort(-np.abs(interactions))
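approx_interactions is a heuristic: it sorts the samples by the chosen feature, walks them in bins, and sums the per-bin correlation between that feature's SHAP values and every other feature, returning feature indices ranked from strongest to weakest apparent interaction. Note that the np.float alias used here (and again in shap_dependence_plot below) was removed in NumPy 1.24, so this module needs an older NumPy or a small patch. Continuing the hypothetical sketch above:

# rank candidate interaction partners for the first feature (plain SHAP matrix, no bias column)
ranked = approx_interactions(0, shap_values, X.values)
print("strongest apparent partner:", X.columns[ranked[0]])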
def shap_dependence_plot(ind, shap_values, features, feature_names=None, display_features=None,
                         interaction_index="auto", color="#1E88E5", axis_color="#333333",
                         dot_size=16, alpha=1, title=None, show=True):
    """
    Create a SHAP dependence plot, colored by an interaction feature.

    Parameters
    ----------
    ind : int
        Index of the feature to plot.

    shap_values : numpy.array
        Matrix of SHAP values (# samples x # features)

    features : numpy.array or pandas.DataFrame
        Matrix of feature values (# samples x # features)

    feature_names : list
        Names of the features (length # features)

    display_features : numpy.array or pandas.DataFrame
        Matrix of feature values for visual display (such as strings instead of coded values)

    interaction_index : "auto", None, or int
        The index of the feature used to color the plot.
    """

    # convert from DataFrames if we got any
    if str(type(features)).endswith("'pandas.core.frame.DataFrame'>"):
        if feature_names is None:
            feature_names = features.columns
        features = features.values
    if str(type(display_features)).endswith("'pandas.core.frame.DataFrame'>"):
        if feature_names is None:
            feature_names = display_features.columns
        display_features = display_features.values
    elif display_features is None:
        display_features = features

    if feature_names is None:
        feature_names = [labels['FEATURE'] % str(i) for i in range(shap_values.shape[1] - 1)]

    # allow vectors to be passed
    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, len(shap_values), 1)
    if len(features.shape) == 1:
        features = np.reshape(features, len(features), 1)

    def convert_name(ind):
        if type(ind) == str:
            nzinds = np.where(feature_names == ind)[0]
            if len(nzinds) == 0:
                print("Could not find feature named: " + ind)
                return None
            else:
                return nzinds[0]
        else:
            return ind

    ind = convert_name(ind)

    mpl_fig = pl.gcf()
    ax = mpl_fig.gca()

    # plotting SHAP interaction values
    if len(shap_values.shape) == 3 and len(ind) == 2:
        ind1 = convert_name(ind[0])
        ind2 = convert_name(ind[1])
        if ind1 == ind2:
            proj_shap_values = shap_values[:, ind2, :]
        else:
            proj_shap_values = shap_values[:, ind2, :] * 2  # off-diag values are split in half

        # TODO: remove recursion; generally the functions should be shorter for more maintainable code
        return shap_dependence_plot(
            ind1, proj_shap_values, features, feature_names=feature_names,
            interaction_index=ind2, display_features=display_features, show=False
        )

    assert shap_values.shape[0] == features.shape[0], \
        "'shap_values' and 'features' values must have the same number of rows!"
    assert shap_values.shape[1] == features.shape[1], \
        "'shap_values' must have the same number of columns as 'features'!"

    # get both the raw and display feature values
    xv = features[:, ind]
    xd = display_features[:, ind]
    s = shap_values[:, ind]
    if type(xd[0]) == str:
        name_map = {}
        for i in range(len(xv)):
            name_map[xd[i]] = xv[i]
        xnames = list(name_map.keys())

    # allow a single feature name to be passed alone
    if type(feature_names) == str:
        feature_names = [feature_names]
    name = feature_names[ind]

    # guess what other feature as the stongest interaction with the plotted feature
    if interaction_index == "auto":
        interaction_index = approx_interactions(ind, shap_values, features)[0]
    interaction_index = convert_name(interaction_index)
    categorical_interaction = False

    # get both the raw and display color values
    if interaction_index is not None:
        cv = features[:, interaction_index]
        cd = display_features[:, interaction_index]
        clow = np.nanpercentile(features[:, interaction_index].astype(np.float), 5)
        chigh = np.nanpercentile(features[:, interaction_index].astype(np.float), 95)
        if type(cd[0]) == str:
            cname_map = {}
            for i in range(len(cv)):
                cname_map[cd[i]] = cv[i]
            cnames = list(cname_map.keys())
            categorical_interaction = True
        elif clow % 1 == 0 and chigh % 1 == 0 and len(set(features[:, interaction_index])) < 50:
            categorical_interaction = True

    # discritize colors for categorical features
    color_norm = None
    if categorical_interaction and clow != chigh:
        bounds = np.linspace(clow, chigh, chigh - clow + 2)
        color_norm = matplotlib.colors.BoundaryNorm(bounds, red_blue.N)

    # the actual scatter plot, TODO: adapt the dot_size to the number of data points?
    if interaction_index is not None:
        pl.scatter(xv, s, s=dot_size, linewidth=0, c=features[:, interaction_index], cmap=red_blue,
                   alpha=alpha, vmin=clow, vmax=chigh, norm=color_norm, rasterized=len(xv) > 500)
    else:
        pl.scatter(xv, s, s=dot_size, linewidth=0, color="#1E88E5",
                   alpha=alpha, rasterized=len(xv) > 500)

    if interaction_index != ind and interaction_index is not None:
        # draw the color bar
        if type(cd[0]) == str:
            tick_positions = [cname_map[n] for n in cnames]
            if len(tick_positions) == 2:
                tick_positions[0] -= 0.25
                tick_positions[1] += 0.25
            cb = pl.colorbar(ticks=tick_positions)
            cb.set_ticklabels(cnames)
        else:
            cb = pl.colorbar()

        cb.set_label(feature_names[interaction_index], size=13)
        cb.ax.tick_params(labelsize=11)
        if categorical_interaction:
            cb.ax.tick_params(length=0)
        cb.set_alpha(1)
        cb.outline.set_visible(False)
        bbox = cb.ax.get_window_extent().transformed(pl.gcf().dpi_scale_trans.inverted())
        cb.ax.set_aspect((bbox.height - 0.7) * 20)

    # make the plot more readable
    if interaction_index != ind:
        pl.gcf().set_size_inches(7.5, 5)
    else:
        pl.gcf().set_size_inches(6, 5)
    # pl.xlabel(name, color=axis_color, fontsize=13)
    # pl.ylabel(labels['VALUE_FOR'] % name, color=axis_color, fontsize=13)
    if title is not None:
        pl.title(title, color=axis_color, fontsize=13)
    pl.gca().xaxis.set_ticks_position('bottom')
    pl.gca().yaxis.set_ticks_position('left')
    pl.gca().spines['right'].set_visible(False)
    pl.gca().spines['top'].set_visible(False)
    pl.gca().tick_params(color=axis_color, labelcolor=axis_color, labelsize=11)
    for spine in pl.gca().spines.values():
        spine.set_edgecolor(axis_color)
    if type(xd[0]) == str:
        pl.xticks([name_map[n] for n in xnames], xnames, rotation='vertical', fontsize=11)
    # if show:
    #     pl.show()


    if ind1 == ind2:
        pl.ylabel(labels['MAIN_EFFECT'] % feature_names[ind1])
    else:
        pl.ylabel(labels['INTERACTION_EFFECT'] % (feature_names[ind1], feature_names[ind2]))

    return mpl_fig, interaction_index


    # # if show:
    # # pl.show()
    # return
    # return mpl_fig

    # assert shap_values.shape[0] == features.shape[0], "'shap_values' and 'features' values must have the same number of rows!"
    # assert shap_values.shape[1] == features.shape[1] + 1, "'shap_values' must have one more column than 'features'!"

    # get both the raw and display feature values
    xv = features[:, ind]
    xd = display_features[:, ind]
    s = shap_values[:, ind]
    if type(xd[0]) == str:
        name_map = {}
        for i in range(len(xv)):
            name_map[xd[i]] = xv[i]
        xnames = list(name_map.keys())

    # allow a single feature name to be passed alone
    if type(feature_names) == str:
        feature_names = [feature_names]
    name = feature_names[ind]

    # guess what other feature as the stongest interaction with the plotted feature
    if interaction_index == "auto":
        interaction_index = approx_interactions(ind, shap_values, features)[0]
    interaction_index = convert_name(interaction_index)
    categorical_interaction = False

    # get both the raw and display color values
    if interaction_index is not None:
        cv = features[:, interaction_index]
        cd = display_features[:, interaction_index]
        clow = np.nanpercentile(features[:, interaction_index].astype(np.float), 5)
        chigh = np.nanpercentile(features[:, interaction_index].astype(np.float), 95)
        if type(cd[0]) == str:
            cname_map = {}
            for i in range(len(cv)):
                cname_map[cd[i]] = cv[i]
            cnames = list(cname_map.keys())
            categorical_interaction = True
        elif clow % 1 == 0 and chigh % 1 == 0 and len(set(features[:, interaction_index])) < 50:
            categorical_interaction = True

    # discritize colors for categorical features
    color_norm = None
    if categorical_interaction and clow != chigh:
        bounds = np.linspace(clow, chigh, chigh - clow + 2)
        color_norm = matplotlib.colors.BoundaryNorm(bounds, red_blue.N)

    # the actual scatter plot, TODO: adapt the dot_size to the number of data points?
    if interaction_index is not None:
        pl.scatter(xv, s, s=dot_size, linewidth=0, c=features[:, interaction_index], cmap=red_blue,
                   alpha=alpha, vmin=clow, vmax=chigh, norm=color_norm, rasterized=len(xv) > 500)
    else:
        pl.scatter(xv, s, s=dot_size, linewidth=0, color="#1E88E5",
                   alpha=alpha, rasterized=len(xv) > 500)

    if interaction_index != ind and interaction_index is not None:
        # draw the color bar
        if type(cd[0]) == str:
            tick_positions = [cname_map[n] for n in cnames]
            if len(tick_positions) == 2:
                tick_positions[0] -= 0.25
                tick_positions[1] += 0.25
            cb = pl.colorbar(ticks=tick_positions)
            cb.set_ticklabels(cnames)
        else:
            cb = pl.colorbar()

        cb.set_label(feature_names[interaction_index], size=13)
        cb.ax.tick_params(labelsize=11)
        if categorical_interaction:
            cb.ax.tick_params(length=0)
        cb.set_alpha(1)
        cb.outline.set_visible(False)
        bbox = cb.ax.get_window_extent().transformed(pl.gcf().dpi_scale_trans.inverted())
        cb.ax.set_aspect((bbox.height - 0.7) * 20)

    # make the plot more readable
    if interaction_index != ind:
        pl.gcf().set_size_inches(7.5, 5)
    else:
        pl.gcf().set_size_inches(6, 5)
    pl.xlabel(name, color=axis_color, fontsize=13)
    pl.ylabel(labels['VALUE_FOR'] % name, color=axis_color, fontsize=13)
    if title is not None:
        pl.title(title, color=axis_color, fontsize=13)
    pl.gca().xaxis.set_ticks_position('bottom')
    pl.gca().yaxis.set_ticks_position('left')
    pl.gca().spines['right'].set_visible(False)
    pl.gca().spines['top'].set_visible(False)
    pl.gca().tick_params(color=axis_color, labelcolor=axis_color, labelsize=11)
    for spine in pl.gca().spines.values():
        spine.set_edgecolor(axis_color)
    if type(xd[0]) == str:
        pl.xticks([name_map[n] for n in xnames], xnames, rotation='vertical', fontsize=11)
    # if show:
    #     pl.show()
    return mpl_fig, interaction_index
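shap_dependence_plot returns the matplotlib figure together with the interaction feature it picked, so the caller can both render the plot and report which feature drove the colouring. As committed, the trailing ylabel block refers to ind1 and ind2, which are only bound inside the interaction branch, so a plain 2-D call raises a NameError after the scatter and colour bar have already been drawn; everything after the first `return mpl_fig, interaction_index` is unreachable code left over from an earlier version. A defensive call sketch (hypothetical, continuing the example above and assuming the older NumPy this module expects):

import matplotlib.pyplot as plt
try:
    fig, picked = shap_dependence_plot(0, shap_values, X.values,
                                       feature_names=list(X.columns), show=False)
except NameError:
    # the scatter and colour bar are already on the current figure at this point
    fig, picked = plt.gcf(), None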