Spaces:
Sleeping
Sleeping
File size: 2,745 Bytes
f60ce93 7d04c1c 0059ef7 bcf8eca bf91270 a2e3d4b 7944a63 0059ef7 9b79169 0059ef7 4d92e12 4b60c06 0059ef7 4b60c06 5c158f1 4b60c06 21cbf62 0059ef7 9b7ebe5 f60ce93 21cbf62 f60ce93 060dcc2 45b475d 659d788 f60ce93 a9b361d bd5c918 a044018 a2e3d4b d53ce83 ff8cf9b 330195f a2e3d4b 330195f a044018 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import sklearn
import streamlit as st
import streamlit.components.v1 as components
import xgboost
# Fixed seed passed to the model so repeated runs train identically.
seed = 42

# Annotation table to be scored; rows are indexed by the "Gene" column.
data = pd.read_csv("annotations_dataset.csv")
data = data.set_index("Gene")

# Training set: selected feature columns plus the categorical "BPlabel".
training_data = pd.read_csv("./selected_features_training_data.csv", header=0)

# Replace "[", "]" and "<" in column names with "_".
# The original referenced `regex` without ever defining it (NameError);
# the pattern below covers exactly the three characters the original
# membership test looked for.
# NOTE(review): presumably needed because XGBoost rejects these characters
# in feature names - confirm against the installed version.
regex = re.compile(r"\[|\]|<")
training_data.columns = [
    regex.sub("_", col) if isinstance(col, str) else col
    for col in training_data.columns.values
]

# Encode the categorical label as the numeric regression target.
training_data["BPlabel_encoded"] = training_data["BPlabel"].map(
    {"most likely": 1, "probable": 0.75, "least likely": 0.1}
)

# Y is the encoded target; X is every remaining column (the model features).
Y = training_data["BPlabel_encoded"]
X = training_data.drop(columns=["BPlabel_encoded", "BPlabel"])
# Hyperparameters for the gradient-boosted regressor, gathered in one place
# so they can be read (and tweaked) at a glance.
xgb_params = {
    "n_estimators": 40,
    "learning_rate": 0.2,
    "max_depth": 4,
    "reg_alpha": 1,
    "reg_lambda": 1,
    "random_state": seed,
    "objective": "reg:squarederror",
}

# Build the regressor and fit it on the feature matrix X against target Y.
xgb = xgboost.XGBRegressor(**xgb_params)
xgb.fit(X, Y)
# Score every row of the annotation table and round to two decimals.
predictions = [round(score, 2) for score in xgb.predict(data)]
output = pd.Series(data=predictions, index=data.index, name="XGB_Score")

# Attach the score column to the annotations; both share `data`'s index.
df_total = pd.concat([data, output], axis=1)

# The original followed this with
#     df_total.rename_axis('Gene').reset_index()
# whose result was discarded, so it never had any effect. It is dropped
# rather than assigned: the gene lookups further down filter on
# `df_total.index`, which therefore has to remain the gene index.

# Keep only the columns shown to the user, score first.
df_total = df_total[
    [
        "XGB_Score",
        "mousescore_Exomiser",
        "SDI",
        "Liver_GTExTPM",
        "pLI_ExAC",
        "HIPred",
        "Cells - EBV-transformed lymphocytes_GTExTPM",
        "Pituitary_GTExTPM",
        "IPA_BP_annotation",
    ]
]
# Page header: app title followed by a one-paragraph description.
app_title = 'Blood Pressure Gene Prioritisation Post-GWAS'
app_description = (
    "\nA machine learning pipeline for predicting disease-causing genes "
    "post-genome-wide association study in blood pressure.\n"
)

st.title(app_title)
st.markdown(app_description)
def collect_genes(x: str) -> list:
    """Split a comma-separated string into a list of gene symbols.

    Whitespace around each symbol is removed and empty entries (from blank
    input, doubled commas or trailing commas) are dropped. The previous
    regex ``",|, "`` always matched the bare comma first, so symbols typed
    after ", " kept a leading space and could never match the gene index.

    Args:
        x: Raw text typed by the user, e.g. ``"AGT, ACE,NOS3"``.

    Returns:
        The cleaned symbols in input order; an empty list for blank input.
    """
    stripped = (symbol.strip() for symbol in x.split(","))
    return [symbol for symbol in stripped if symbol]
# Multi-gene lookup: show the result rows for a comma-separated list of
# genes and a SHAP summary plot explaining their scores.
input_gene_list = st.text_input("Input list of HGNC genes (enter comma separated):")
gene_list = collect_genes(input_gene_list)

# Tree-based SHAP explainer for the fitted model.
explainer = shap.TreeExplainer(xgb)

if len(gene_list) > 1:
    selected = df_total.index.isin(gene_list)
    df = df_total[selected]
    st.dataframe(df)

    if df.empty:
        # Nothing to explain; SHAP cannot be computed on zero rows.
        st.warning("None of the input genes were found in the dataset.")
    else:
        # Explain the rows of `data` - the frame the model actually scored -
        # rather than the display frame, which carries the extra XGB_Score
        # column. `df_total` was built from `data` row-for-row, so the same
        # boolean mask selects the matching feature rows.
        features = data[selected]
        shap_values = explainer.shap_values(features)

        # summary_plot draws onto the current matplotlib figure and returns
        # nothing useful; show=False keeps the figure open so that it can be
        # handed to Streamlit instead of being displayed and discarded.
        shap.summary_plot(shap_values, features, show=False)
        st.caption("SHAP Summary Plot of All Input Genes")
        st.pyplot(plt.gcf(), clear_figure=True)
# Single-gene lookup: show the result row for one gene and a SHAP force
# plot explaining its score.
input_gene = st.text_input("Input individual HGNC gene:").strip()
df2 = df_total[df_total.index == input_gene]
st.dataframe(df2)

# Size of the embedded force-plot frame, in pixels.
# NOTE(review): the original referenced these names without defining them
# anywhere in this file; the values here are a starting point - adjust.
output_height = 200
output_width = 1000

# The original tested len(input_gene) == 1, i.e. a one-character gene name;
# what matters is whether the lookup actually found the gene.
if not df2.empty:
    # Explain the rows of `data` (what the model scored), not the display
    # frame, which carries the extra XGB_Score column.
    gene_features = data[data.index == input_gene]

    # shap_values() returns a plain array, so it is passed as-is (the
    # original read a non-existent `.values` attribute from it).
    shap_values = explainer.shap_values(gene_features)
    force_plot = shap.force_plot(
        explainer.expected_value,
        shap_values,
        gene_features,
    )

    # The force plot is a JavaScript visualisation: bundle SHAP's JS with the
    # plot's HTML and embed the result. shap.initjs() is not called because
    # it targets notebook front-ends and does nothing for a Streamlit page.
    force_plot_html = f"<head>{shap.getjs()}</head><body>{force_plot.html()}</body>"
    components.html(
        force_plot_html,
        height=output_height,
        width=output_width,
        scrolling=True,
    )
# Footer section: heading followed by the complete scored table.
total_results_heading = "\nTotal Gene Prioritisation Results:\n"
st.markdown(total_results_heading)
st.dataframe(df_total)
|