Spaces:
Sleeping
Sleeping
File size: 6,134 Bytes
f60ce93 7d04c1c 0059ef7 df058d7 bcf8eca df058d7 a2e3d4b ef3a227 1ededbc a828d88 9ccc625 33c190e 7944a63 0059ef7 df058d7 66ad10a 0059ef7 df058d7 0059ef7 df058d7 4b60c06 df058d7 4b60c06 df058d7 45b475d 659d788 f60ce93 cbf0145 f60ce93 df058d7 a9b361d c73e4be a044018 cbf0145 a2e3d4b df058d7 d53ce83 e502fb9 3592cb3 df058d7 a828d88 ff8cf9b 330195f df058d7 5ef2581 df058d7 77897c3 df058d7 1b01b6f 3592cb3 5e24005 3592cb3 1b01b6f 3592cb3 df058d7 5e24005 df058d7 5464374 df058d7 a2e3d4b cbf0145 a2e3d4b 33c190e 8fb59d9 df058d7 a2e3d4b 9ccc625 9786ddd 89c1ddc 9786ddd df058d7 330195f a044018 89c1ddc a044018 2ab2dc2 8fb59d9 df058d7 8fb59d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import streamlit as st
import re
import numpy as np
import pandas as pd
import pickle
import sklearn
import catboost
import shap
from shap_plots import shap_summary_plot
from dynamic_shap_plot import matplotlib_to_plotly, summary_plot_plotly_fig
import plotly.tools as tls
import dash_core_components as dcc
import matplotlib
import plotly.graph_objs as go
try:
import matplotlib.pyplot as pl
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.ticker import MaxNLocator
except ImportError:
pass
# NOTE(review): this option only silences the warning Streamlit shows when
# st.pyplot() is called without a figure. Recent Streamlit releases may no
# longer recognise it -- confirm against the pinned Streamlit version.
st.set_option('deprecation.showPyplotGlobalUse', False)
seed = 42

ANNOTATIONS_PATH = "all_genes_merged_ml_data.csv"
model_path = "best_model_fitted.pkl"  # Update this path if your model is stored elsewhere


@st.cache_data
def _load_annotations(csv_path):
    """Read the per-gene feature table and index it by the "Gene" column.

    Cached so the CSV is parsed once instead of on every Streamlit rerun.
    """
    table = pd.read_csv(csv_path)
    # TODO remove this placeholder when imputation is finished:
    table = table.fillna(0)
    return table.set_index("Gene")


@st.cache_resource
def _load_model(path):
    """Unpickle the fitted model once and reuse the same object across reruns.

    SECURITY: pickle.load can execute arbitrary code contained in the file.
    Only load model files that are shipped with (and trusted by) this app.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


@st.cache_data
def _predict_probabilities(_model, gene_features):
    """Return the per-class probabilities for every row of *gene_features*.

    ``_model`` carries a leading underscore so Streamlit skips hashing it;
    the cache is keyed on the feature table only.
    """
    return _model.predict_proba(gene_features)


annotations = _load_annotations(ANNOTATIONS_PATH)
catboost_model = _load_model(model_path)

# For a multi-class classification model, obtaining probabilities per class
probabilities = _predict_probabilities(catboost_model, annotations)

# Creating a DataFrame for these probabilities
# Assuming classes are ordered as 'most likely', 'probable', and 'least likely' in the model
prob_df = pd.DataFrame(
    probabilities,
    index=annotations.index,
    columns=['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely'],
)

# Dynamically including all original features from annotations plus the new probability columns
df_total = pd.concat([prob_df, annotations], axis=1)
# Page header: the app title followed by a one-paragraph description.
_PAGE_TITLE = 'Blood Pressure Gene Prioritisation Post-GWAS'
_PAGE_INTRO = """
A machine learning pipeline for predicting disease-causing genes post-genome-wide association study in blood pressure.
"""

st.title(_PAGE_TITLE)
st.markdown(_PAGE_INTRO)
# One or more commas and/or whitespace characters act as a single separator.
_GENE_SEPARATORS = re.compile(r"[,\s]+")


def collect_genes(x):
    """Split free-text input into a list of gene symbols.

    Symbols may be separated by commas, whitespace, or any mix of the two.
    Empty fragments produced by leading, trailing or repeated separators
    are dropped, so an empty or separator-only input yields an empty list.
    """
    return [symbol for symbol in _GENE_SEPARATORS.split(x) if symbol]
input_gene_list = st.text_input("Input a list of multiple HGNC genes (enter comma separated):")
gene_list = collect_genes(input_gene_list)


@st.cache_resource
def _build_explainer(_model):
    """Create the SHAP tree explainer once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching the explainer would be rebuilt each time. The leading underscore
    on ``_model`` tells Streamlit not to hash the argument.
    """
    return shap.TreeExplainer(_model)


explainer = _build_explainer(catboost_model)
@st.cache_data
def convert_df(df):
    """Serialise *df* to CSV (without the index) and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
probability_columns = ['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely']
features_list = [column for column in df_total.columns if column not in probability_columns]
features = df_total[features_list]

# Multi-gene view: only shown once two or more gene symbols have been entered.
if len(gene_list) > 1:
    # Copy the selection so that adding the 'Gene' column below writes to an
    # independent frame rather than to a slice of df_total (this is what
    # triggers pandas' SettingWithCopyWarning).
    df = df_total[df_total.index.isin(gene_list)].copy()
    df['Gene'] = df.index  # Ensure 'Gene' is a column if it's not already
    df.reset_index(drop=True, inplace=True)

    # Column order: Gene, the class probabilities, then every remaining feature.
    required_columns = ['Gene'] + probability_columns + [
        col for col in df.columns if col not in probability_columns and col != 'Gene'
    ]
    df = df[required_columns]
    st.dataframe(df)

    if df.empty:
        # Nothing to download or explain when no entered symbol is in the table.
        st.write('None of the input genes were found in the dataset.')
    else:
        # Download contains the genes with their probabilities only.
        output = df[['Gene'] + probability_columns]
        csv = convert_df(output)
        st.download_button(
            "Download Gene Prioritisation",
            csv,
            "bp_gene_prioritisation.csv",
            "text/csv",
            key='download-csv'
        )

        # SHAP values are computed on the feature columns only.
        df_shap = df.drop(columns=probability_columns + ['Gene'])
        shap_values = explainer.shap_values(df_shap)

        # Plot the summary for the first class as an example; loop over the
        # classes here if every class should be shown.
        class_index = 0
        # NOTE(review): depending on the installed shap version, multiclass
        # output is either a list with one (n_samples, n_features) matrix per
        # class or a single (n_samples, n_features, n_classes) array. Both are
        # handled; confirm against the pinned shap version.
        if isinstance(shap_values, list):
            class_shap_values = shap_values[class_index]
        elif np.ndim(shap_values) == 3:
            class_shap_values = shap_values[:, :, class_index]
        else:
            class_shap_values = shap_values

        shap.summary_plot(class_shap_values, df_shap, show=False)
        # Pass the figure explicitly instead of relying on the deprecated
        # implicit global figure; clear_figure=True keeps the old behaviour of
        # clearing it so later plots start from a clean canvas.
        st.pyplot(pl.gcf(), bbox_inches='tight', clear_figure=True)
        st.caption("SHAP Summary Plot of All Input Genes")
input_gene = st.text_input("Input an individual HGNC gene:")

# Copy the selection so that adding the 'Gene' column writes to an independent
# frame rather than to a slice of df_total.
df2 = df_total[df_total.index == input_gene].copy()
df2['Gene'] = df2.index
df2.reset_index(drop=True, inplace=True)

# Column order: Gene, the class probabilities, then every remaining feature.
probability_columns = ['Probability_Most_Likely', 'Probability_Probable', 'Probability_Least_Likely']
required_columns = ['Gene'] + probability_columns + [
    col for col in df2.columns if col not in probability_columns and col != 'Gene'
]
df2 = df2[required_columns]
st.dataframe(df2)

if input_gene:
    if ' ' in input_gene or ',' in input_gene:
        st.write('Input Error: Please input only a single HGNC gene name with no white spaces or commas.')
    else:
        feature_columns = [col for col in df_total.columns if col not in probability_columns + ['Gene']]
        # Select with a boolean mask: .loc[[input_gene], ...] raises KeyError
        # for a symbol that is not in the index, so the emptiness check below
        # could never run for unknown genes.
        df2_shap = df_total.loc[df_total.index == input_gene, feature_columns]

        if df2_shap.shape[0] > 0:
            # Explain a single row (the first match if the symbol is duplicated).
            df2_shap = df2_shap.iloc[[0]]
            shap_values = explainer.shap_values(df2_shap)

            # Use the position of the highest probability as the class index;
            # predict() returns the class *label*, which is only usable as an
            # index when the labels happen to be 0..n-1.
            predicted_class_index = int(np.argmax(catboost_model.predict_proba(df2_shap)[0]))

            # NOTE(review): depending on the installed shap version, multiclass
            # output is either a list with one (n_samples, n_features) matrix
            # per class or a single (n_samples, n_features, n_classes) array.
            if isinstance(shap_values, list):
                row_shap_values = shap_values[predicted_class_index][0]
            elif np.ndim(shap_values) == 3:
                row_shap_values = shap_values[0, :, predicted_class_index]
            else:
                row_shap_values = shap_values[0]
            class_expected_value = np.ravel(explainer.expected_value)[predicted_class_index]

            # shap.plots.waterfall only accepts an Explanation object, so wrap
            # the selected class's values together with its base value.
            explanation = shap.Explanation(
                values=row_shap_values,
                base_values=class_expected_value,
                data=df2_shap.iloc[0].to_numpy(),
                feature_names=list(df2_shap.columns),
            )
            shap.plots.waterfall(explanation, max_display=10, show=False)
            # Pass the figure explicitly instead of relying on the deprecated
            # implicit global figure; clear it afterwards as before.
            st.pyplot(pl.gcf(), bbox_inches='tight', clear_figure=True)
        else:
            st.write('Input gene was not found in the dataset.')
st.markdown("""
### Total Gene Prioritisation Results:
""")

# Build the display table without touching df_total: plain assignment would
# only alias it, so adding the column and resetting the index in place would
# silently rewrite df_total as well.
df_total_output = df_total.assign(Gene=df_total.index).reset_index(drop=True)

# Previously considered column subset, kept for reference:
#df_total_output = df_total_output[['Gene','XGB_Score', 'mousescore_Exomiser',
# 'SDI', 'Liver_GTExTPM', 'pLI_ExAC',
# 'HIPred',
# 'Cells - EBV-transformed lymphocytes_GTExTPM',
# 'Pituitary_GTExTPM',
# 'IPA_BP_annotation']]
st.dataframe(df_total_output)
|