File size: 6,134 Bytes
f60ce93
7d04c1c
0059ef7
 
df058d7
bcf8eca
df058d7
a2e3d4b
ef3a227
1ededbc
a828d88
 
 
 
 
 
 
 
 
 
 
9ccc625
33c190e
7944a63
0059ef7
df058d7
 
 
66ad10a
0059ef7
df058d7
 
 
 
0059ef7
df058d7
 
4b60c06
df058d7
 
 
 
 
4b60c06
df058d7
 
45b475d
 
659d788
f60ce93
 
cbf0145
f60ce93
df058d7
a9b361d
c73e4be
a044018
cbf0145
a2e3d4b
df058d7
d53ce83
e502fb9
3592cb3
 
 
df058d7
 
 
a828d88
ff8cf9b
330195f
df058d7
5ef2581
df058d7
 
 
 
77897c3
df058d7
 
 
1b01b6f
3592cb3
5e24005
3592cb3
1b01b6f
3592cb3
 
 
df058d7
 
 
5e24005
df058d7
 
 
 
 
 
 
5464374
df058d7
a2e3d4b
 
 
 
cbf0145
a2e3d4b
33c190e
8fb59d9
df058d7
 
 
 
 
 
a2e3d4b
9ccc625
 
9786ddd
89c1ddc
9786ddd
df058d7
 
 
 
 
 
 
 
 
 
 
 
 
 
330195f
 
a044018
 
89c1ddc
a044018
 
2ab2dc2
 
8fb59d9
df058d7
 
 
 
 
 
8fb59d9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import streamlit as st
import re
import numpy as np
import pandas as pd
import pickle
import sklearn
import catboost
import shap
from shap_plots import shap_summary_plot
from dynamic_shap_plot import matplotlib_to_plotly, summary_plot_plotly_fig
import plotly.tools as tls
import dash_core_components as dcc
import matplotlib
import plotly.graph_objs as go
try:
    import matplotlib.pyplot as pl
    from matplotlib.colors import LinearSegmentedColormap
    from matplotlib.ticker import MaxNLocator
except ImportError:
    pass

# Silence the "global pyplot figure" deprecation notice on Streamlit releases
# that still know this option.
try:
    st.set_option('deprecation.showPyplotGlobalUse', False)
except st.errors.StreamlitAPIException:
    # NOTE(review): newer Streamlit releases appear to reject this key as an
    # unrecognised config option; the setting is cosmetic, so carry on.
    pass

seed = 42


@st.cache_data
def _load_annotations(csv_path):
    """Read the gene annotation table, zero-fill gaps and index it by gene.

    Cached so the CSV is parsed once rather than on every script rerun.
    The cache is keyed on the path, so restart the app (or clear the cache)
    after replacing the file on disk.

    Args:
        csv_path: Path of the CSV file; it must contain a "Gene" column.

    Returns:
        pandas.DataFrame indexed by "Gene".
    """
    table = pd.read_csv(csv_path)
    # TODO remove this placeholder when imputation is finished:
    table = table.fillna(0)
    return table.set_index("Gene")


@st.cache_resource
def _load_model(pickle_path):
    """Unpickle the fitted classifier stored at *pickle_path*.

    Cached as a shared resource so the model is deserialised only once.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only ever point this at a model file you produced or fully trust.
    with open(pickle_path, 'rb') as file:
        return pickle.load(file)


annotations = _load_annotations("all_genes_merged_ml_data.csv")

# Read in best_model_fitted.pkl as catboost_model
model_path = "best_model_fitted.pkl"  # Update this path if your model is stored elsewhere
catboost_model = _load_model(model_path)

# For a multi-class classification model, obtaining probabilities per class
probabilities = catboost_model.predict_proba(annotations)

# Creating a DataFrame for these probabilities
# Assuming classes are ordered as 'most likely', 'probable', and 'least likely' in the model
# (TODO confirm against the fitted model's class order).
prob_df = pd.DataFrame(
    probabilities,
    index=annotations.index,
    columns=['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely'],
)

# Probability columns first, followed by every original annotation feature.
df_total = pd.concat([prob_df, annotations], axis=1)


# Page heading followed by a short description of what the app does.
page_title = 'Blood Pressure Gene Prioritisation Post-GWAS'
page_intro = """
A machine learning pipeline for predicting disease-causing genes post-genome-wide association study in blood pressure.


"""
st.title(page_title)
st.markdown(page_intro)

def collect_genes(x):
    """Split free-text input into a list of gene symbols.

    Commas and any run of whitespace act as separators. Empty tokens, which
    arise from repeated, leading or trailing separators, are discarded.

    Args:
        x: Raw text typed by the user, e.g. ``"ACE, AGT NOS3"``.

    Returns:
        list[str]: The non-empty tokens, in input order.
    """
    # Raw string on purpose: "\s" inside a plain literal is an invalid escape
    # sequence. One character class replaces the old ",|,\s+|\s+" alternation.
    return [token for token in re.split(r"[,\s]+", x) if token]

# Free-text box for several gene symbols; the raw text is tokenised by
# collect_genes into a list of symbols.
input_gene_list = st.text_input(
    label="Input a list of multiple HGNC genes (enter comma separated):",
)
gene_list = collect_genes(input_gene_list)

# SHAP explainer built from the loaded model; both the multi-gene and the
# single-gene sections below use it.
explainer = shap.TreeExplainer(model=catboost_model)

@st.cache_data
def convert_df(df):
    """Serialise *df* to CSV text without the index and return UTF-8 bytes.

    Args:
        df: The DataFrame to export.

    Returns:
        bytes: The CSV content encoded as UTF-8.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')

probability_columns = ['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely']
features_list = [column for column in df_total.columns if column not in probability_columns]
features = df_total[features_list]

# Multi-gene section: only runs once at least two symbols were entered.
if len(gene_list) > 1:
    # .copy() yields an independent frame, so adding the 'Gene' column below
    # does not write into a slice of df_total (SettingWithCopyWarning).
    df = df_total[df_total.index.isin(gene_list)].copy()
    df['Gene'] = df.index  # Ensure 'Gene' is a column if it's not already
    df.reset_index(drop=True, inplace=True)

    if df.empty:
        # Nothing matched: stop here instead of handing an empty frame to SHAP.
        st.warning("None of the input genes were found in the annotation table.")
    else:
        # Including Gene, probability columns, and all other features
        required_columns = ['Gene'] + probability_columns + [
            col for col in df.columns if col not in probability_columns and col != 'Gene'
        ]
        df = df[required_columns]
        st.dataframe(df)

        # Download contains only the gene symbols and their class probabilities.
        output = df[['Gene'] + probability_columns]
        csv = convert_df(output)
        st.download_button(
            "Download Gene Prioritisation",
            csv,
            "bp_gene_prioritisation.csv",
            "text/csv",
            key='download-csv',
        )

        # SHAP is computed on the feature columns only.
        df_shap = df.drop(columns=probability_columns + ['Gene'])
        shap_values = explainer.shap_values(df_shap)

        # Summary plot for the first class only; loop over classes if needed.
        class_index = 0
        # Older SHAP releases return one (samples, features) matrix per class
        # in a list; newer ones return a single array with the class on the
        # last axis. Support both layouts.
        if isinstance(shap_values, list):
            class_shap_values = shap_values[class_index]
        else:
            shap_array = np.asarray(shap_values)
            if shap_array.ndim == 3:
                class_shap_values = shap_array[:, :, class_index]
            else:
                class_shap_values = shap_array

        shap.summary_plot(class_shap_values, df_shap, show=False)
        # Pass the figure explicitly rather than relying on the implicit
        # global figure; clear_figure=True matches the no-figure default.
        st.pyplot(pl.gcf(), clear_figure=True, bbox_inches='tight')
        st.caption("SHAP Summary Plot of All Input Genes")


# Single-gene section: probability/feature row plus a SHAP waterfall plot.
# Surrounding whitespace (e.g. from copy/paste) is stripped before lookup.
input_gene = st.text_input("Input an individual HGNC gene:").strip()

# .copy() so that adding the 'Gene' column does not write into a slice of df_total.
df2 = df_total[df_total.index == input_gene].copy()
df2['Gene'] = df2.index
df2.reset_index(drop=True, inplace=True)

# Gene first, then the model's probability columns, then every feature.
probability_columns = ['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely']
required_columns = ['Gene'] + probability_columns + [
    col for col in df2.columns if col not in probability_columns and col != 'Gene'
]
df2 = df2[required_columns]
st.dataframe(df2)

if input_gene:
    if ' ' in input_gene or ',' in input_gene:
        st.write('Input Error: Please input only a single HGNC gene name with no white spaces or commas.')
    elif input_gene not in df_total.index:
        # .loc with an unknown label raises KeyError, so test membership first.
        st.warning(f"Gene '{input_gene}' was not found in the annotation table.")
    else:
        feature_columns = [col for col in df_total.columns if col not in probability_columns + ['Gene']]
        df2_shap = df_total.loc[[input_gene], feature_columns]
        shap_values = explainer.shap_values(df2_shap)

        # Index of the most probable class. predict() returns class labels,
        # which need not be usable as positions, so take the argmax of the
        # probability row instead.
        predicted_class_index = int(np.argmax(catboost_model.predict_proba(df2_shap)[0]))

        # SHAP may return a per-class list (older releases) or one array with
        # the class on the last axis (newer releases); support both layouts.
        if isinstance(shap_values, list):
            class_shap_values = np.asarray(shap_values[predicted_class_index])
        else:
            shap_array = np.asarray(shap_values)
            if shap_array.ndim == 3:
                class_shap_values = shap_array[:, :, predicted_class_index]
            else:
                class_shap_values = shap_array

        expected_values = np.atleast_1d(explainer.expected_value)
        if expected_values.size > 1:
            class_expected_value = float(expected_values[predicted_class_index])
        else:
            class_expected_value = float(expected_values[0])

        # shap.plots.waterfall needs an Explanation object, not a bare array,
        # so wrap the first row with its base value and feature data.
        explanation = shap.Explanation(
            values=class_shap_values[0],
            base_values=class_expected_value,
            data=df2_shap.iloc[0].to_numpy(),
            feature_names=list(df2_shap.columns),
        )
        shap.plots.waterfall(explanation, max_display=10, show=False)
        # Pass the figure explicitly rather than relying on the implicit
        # global figure; clear_figure=True matches the no-figure default.
        st.pyplot(pl.gcf(), clear_figure=True, bbox_inches='tight')

st.markdown("""
### Total Gene Prioritisation Results:
""")

# Build the display table as a new frame. Plain assignment followed by in-place
# edits would alias df_total and silently replace its gene index.
# Turning the index into a leading 'Gene' column matches the layout of the
# multi-gene and single-gene tables above.
df_total_output = df_total.rename_axis("Gene").reset_index()
# NOTE: legacy column selection kept for reference (currently disabled):
#df_total_output = df_total_output[['Gene','XGB_Score', 'mousescore_Exomiser',
# 'SDI', 'Liver_GTExTPM',  'pLI_ExAC',
# 'HIPred',
# 'Cells - EBV-transformed lymphocytes_GTExTPM',
# 'Pituitary_GTExTPM',
# 'IPA_BP_annotation']]
st.dataframe(df_total_output)