Spaces:
Sleeping
Sleeping
Commit
·
16c6962
1
Parent(s):
e37c044
Upload 5 files
Browse files- app.py +378 -0
- env.yml +111 -0
- functions/__pycache__/pathway_analyses.cpython-39.pyc +0 -0
- functions/pathway_analyses.py +1015 -0
- pathway_databases/GO_Biological_Process_2021.txt +0 -0
app.py
ADDED
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import streamlit as st
|
5 |
+
|
6 |
+
import scanpy as sc
|
7 |
+
|
8 |
+
#import mpld3
|
9 |
+
import matplotlib.pyplot as plt
|
10 |
+
import seaborn as sns
|
11 |
+
|
12 |
+
|
13 |
+
import streamlit.components.v1 as components
|
14 |
+
|
15 |
+
from IPython.display import Markdown as md
|
16 |
+
|
17 |
+
from functions import pathway_analyses
|
18 |
+
|
19 |
+
# SMALL_SIZE = 2
|
20 |
+
# MEDIUM_SIZE = 2
|
21 |
+
# BIGGER_SIZE = 2
|
22 |
+
|
23 |
+
# plt.rc('font', size=SMALL_SIZE) # controls default text sizes
|
24 |
+
# plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
|
25 |
+
# plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
|
26 |
+
# plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
|
27 |
+
# plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
|
28 |
+
# plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
|
29 |
+
# plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
|
30 |
+
|
31 |
+
sc.settings.set_figure_params(dpi=80, facecolor='white')
|
32 |
+
|
33 |
+
#disable st.pyplot warning
|
34 |
+
st.set_page_config(layout="wide")
|
35 |
+
st.markdown(
|
36 |
+
"""
|
37 |
+
<style>
|
38 |
+
.streamlit-expanderHeader {
|
39 |
+
font-size: x-large;
|
40 |
+
}
|
41 |
+
</style>
|
42 |
+
""",
|
43 |
+
unsafe_allow_html=True,
|
44 |
+
)
|
45 |
+
m=st.markdown("""
|
46 |
+
|
47 |
+
<style>
|
48 |
+
|
49 |
+
div.stTitle {
|
50 |
+
|
51 |
+
font-size:40px;
|
52 |
+
|
53 |
+
}
|
54 |
+
|
55 |
+
</style>"""
|
56 |
+
,unsafe_allow_html=True)
|
57 |
+
|
58 |
+
st.set_option('deprecation.showPyplotGlobalUse', False)
|
59 |
+
|
60 |
+
#load Data
|
61 |
+
cwd=os.getcwd()+'/'#+'data/'
|
62 |
+
@st.cache_resource
def _load_annotated_data():
    """Read the annotated AnnData file from disk.

    Cached with ``st.cache_resource`` so the object is read once and the same
    instance is handed to every session instead of being copied per call.
    """
    return sc.read_h5ad(cwd+'multiregion_brainaging_annotated.h5ad')


@st.cache_data
def _load_go_gene_sets():
    """Build the GO Biological Process table: one column per pathway.

    Returns
    -------
    pandas.DataFrame
        Columns are pathway names, rows are gene slots; shorter gene sets are
        padded with missing values by ``DataFrame.from_dict``.
    """
    # NOTE(review): read_pathways is presumably returning a DataFrame whose
    # column 0 holds the pathway name -- confirm against pathway_analyses.
    bp = pathway_analyses.read_pathways('pathway_databases/GO_Biological_Process_2021.txt')

    go_bp_paths = bp.set_index(0)
    go_bp_paths.fillna("", inplace=True)
    go_bp_paths_dict = go_bp_paths.to_dict(orient='index')

    # Drop the empty-string padding so each pathway keeps only real entries.
    gene_set_by_path = {
        key: [val for val in value.values() if val != ""]
        for key, value in go_bp_paths_dict.items()
    }
    return pd.DataFrame.from_dict(gene_set_by_path, orient='index').transpose()


def get_data():
    """Populate ``st.session_state`` with everything the tabs read.

    Keys written (only when missing): ``adata_annot``, ``genes_list``,
    ``cell_type``, ``broad_type``, ``path_ways`` and ``go_table``.

    This function is deliberately NOT cached itself: a cached function's body
    is skipped on a cache hit, which would leave a new session's state empty.
    The expensive loads are cached in the two helpers above instead.
    """
    state = st.session_state

    # Always bind the dataset locally so the derived lists below can be built
    # even when 'adata_annot' is already present in the session.
    adata_annot = _load_annotated_data()
    if 'adata_annot' not in state:
        state['adata_annot'] = adata_annot
    if 'genes_list' not in state:
        state['genes_list'] = sorted(adata_annot.var.index.unique())
    if 'cell_type' not in state:
        state['cell_type'] = sorted(adata_annot.obs.new_anno.unique())
    if 'broad_type' not in state:
        state['broad_type'] = sorted(adata_annot.obs.broad_celltype.unique())

    # Also load GO terms (only the GO Biological Process database is used).
    if 'go_table' not in state:
        gene_set_by_path = _load_go_gene_sets()
        state['path_ways'] = gene_set_by_path.columns
        state['go_table'] = gene_set_by_path
|
105 |
+
#done load Data
|
106 |
+
|
107 |
+
|
108 |
+
|
109 |
+
#st.title('Single nuclei atlas of human aging in brain regions')
|
110 |
+
st.title('Brain Age Browser')

# Fill st.session_state before any widget below reads from it.
get_data()

# One tab per view; the names tab1, tab2 and readme are used by the
# `with` blocks further down.
tab1, tab2, readme = st.tabs(["Gene Expression by CellType", "Age associations for multiple genes", "README"])
|
118 |
+
def _render_umap(adata, figsize, **umap_kwargs):
    """Draw one UMAP into a fresh figure, show it in Streamlit, then close it.

    Closing the figure after ``st.pyplot`` keeps matplotlib from accumulating
    open figures on every rerun of the script.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sc.pl.umap(adata, frameon=False, show=False, ax=ax, **umap_kwargs)
    st.pyplot(fig)
    plt.close(fig)


with tab1:
    # Gene / cell-type pickers; nothing is plotted until 'Go' is pressed.
    with st.form(key='columns_in_form'):
        c1, c2 = st.columns(2)
        with c1:
            selected_gene = st.selectbox(
                'Please select a gene',
                st.session_state['genes_list'])
        with c2:
            selected_celltype = st.selectbox(
                'Please select CellType',
                st.session_state['cell_type']
            )

        Updated = st.form_submit_button(label='Go')

    # NOTE(review): the original indentation was lost; the results are rendered
    # below the form here -- confirm this matches the intended layout.
    if Updated and selected_gene is not None and selected_celltype is not None:
        adata_all = st.session_state['adata_annot']

        # Top row: annotation overview (left) and the selected gene (right).
        col1, col2 = st.columns([1, 1])
        with col1:
            _render_umap(adata_all, (5, 5), color='new_anno', title='',
                         legend_loc='on data', legend_fontsize='8')
        with col2:
            _render_umap(adata_all, (5, 5), color=selected_gene,
                         title=selected_gene, legend_loc='best',
                         legend_fontsize='xx-small')

        dot_size = .05
        # One row per age group: all cells on the left, the selected cell type
        # on the right. Only the 'Old' row draws a colorbar.
        for age_label, age_value, colorbar_loc in (
            ('Young', 'young', None),
            ('Old', 'old', 'bottom'),
        ):
            adata_age = adata_all[adata_all.obs['Age_group'] == age_value]
            adata_age_celltype = adata_age[adata_age.obs['new_anno'] == selected_celltype]
            gene_plot_kwargs = dict(
                color=selected_gene, title="", legend_loc='right margin',
                color_map='viridis', size=dot_size,
                legend_fontsize='xx-small', colorbar_loc=colorbar_loc,
            )

            col1, col2 = st.columns([1, 1])
            with col1:
                # Left panel shows every cell of the age group, so its title
                # must not mention the selected cell type.
                str_title = age_label + ': ' + selected_gene
                st.markdown("# {} ".format(str_title))
                _render_umap(adata_age, (1, 1), **gene_plot_kwargs)
            with col2:
                str_title = age_label + ': ' + selected_gene + " (" + selected_celltype + ")"
                st.markdown("# {} ".format(str_title))
                _render_umap(adata_age_celltype, (1, 1), **gene_plot_kwargs)
|
219 |
+
|
220 |
+
# fig, ax = plt.subplots(3,2)
|
221 |
+
# sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data', frameon=False,show=False, ax=ax[0,0])
|
222 |
+
# sc.pl.umap(st.session_state['adata_annot'], color=selected_gene, title=selected_gene, legend_loc='best', frameon=False,show=False, ax=ax[0,1],vmax='p99')
|
223 |
+
|
224 |
+
# #Subset Younv and Old
|
225 |
+
# adata_Young = st.session_state['adata_annot'][st.session_state['adata_annot'].obs['Age_group']=='young']
|
226 |
+
# adata_Old = st.session_state['adata_annot'][st.session_state['adata_annot'].obs['Age_group']=='old']
|
227 |
+
# sc.pl.umap(adata_Young, color=selected_gene, title="Young: "+selected_gene, legend_loc='right margin', color_map='viridis',frameon=False,show=False, ax=ax[1,0])
|
228 |
+
# sc.pl.umap(adata_Old, color=selected_gene, title="Old: "+selected_gene, legend_loc='right margin', color_map='viridis', frameon=False,show=False, ax=ax[2,0])
|
229 |
+
|
230 |
+
# # #Young/Old but for cell_type
|
231 |
+
# # adata_YoungAst = adata_Young[adata_Young.obs['broad_celltype']==selected_celltype]
|
232 |
+
# # adata_OldAst = adata_Old[adata_Old.obs['broad_celltype']==selected_celltype]
|
233 |
+
|
234 |
+
# #Young/Old but for cell_type
|
235 |
+
# adata_YoungAst = adata_Young[adata_Young.obs['new_anno']==selected_celltype]
|
236 |
+
# adata_OldAst = adata_Old[adata_Old.obs['new_anno']==selected_celltype]
|
237 |
+
|
238 |
+
|
239 |
+
# sc.pl.umap(adata_YoungAst, color=selected_gene, title=selected_celltype, legend_loc='right margin', color_map='viridis', frameon=False,show=False, ax=ax[1,1])
|
240 |
+
# sc.pl.umap(adata_OldAst, color=selected_gene, title=selected_celltype, legend_loc='right margin', color_map='viridis', frameon=False,show=False, ax=ax[2,1])
|
241 |
+
|
242 |
+
# #sc.pl.umap(st.session_state['adata_annot'], color='Brain_region', title='Brain Region', legend_loc='right margin', frameon=False,show=False, ax=ax[1,1])
|
243 |
+
# #sc.pl.umap(st.session_state['adata_annot'], color='Age_group', title='Age Group', legend_loc='right margin', frameon=False,show=False, ax=ax[2,0])
|
244 |
+
# #sc.pl.umap(st.session_state['adata_annot'], color=selected_celltype, title=selected_celltype, legend_loc='on data', frameon=False,show=False, ax=ax[2,1])
|
245 |
+
|
246 |
+
|
247 |
+
# st.pyplot(plt.gcf().set_size_inches(15, 30))
|
248 |
+
|
249 |
+
|
250 |
+
|
251 |
+
with tab2:
    # Either a hand-picked gene list or every gene of one GO term is plotted,
    # depending on the radio choice.
    with st.form(key='multiselect_form'):
        c1, c2, c3 = st.columns([4, 4, 2])
        with c1:
            multi_genes = st.multiselect(
                'Select Genes List',
                st.session_state['genes_list'])
        with c2:
            go_term = st.selectbox(
                'Select GO Term',
                st.session_state['path_ways'])
        with c3:
            Choice = st.radio(
                "",
                ('Gene Set', 'GO Term'))

        Updated_tab2 = st.form_submit_button(label='Show Results')

    if Updated_tab2:
        adata_all = st.session_state['adata_annot']

        if Choice == 'Gene Set':
            requested_genes = sorted(multi_genes)
        else:
            requested_genes = list(st.session_state['go_table'].loc[:, go_term].dropna().values)

        # Keep only genes present in the dataset; the plotting calls below are
        # given var names and cannot draw genes the dataset does not contain.
        known_genes = set(adata_all.var_names)
        multi_genes = [gene for gene in requested_genes if gene in known_genes]
        skipped_genes = [gene for gene in requested_genes if gene not in known_genes]
        if skipped_genes:
            st.warning('Genes not found in the dataset were skipped: ' + ', '.join(map(str, skipped_genes)))

        if not multi_genes:
            # st.multiselect returns an empty list (never None) when nothing
            # is chosen, so the empty case has to be handled explicitly.
            st.warning('Please select at least one gene that is present in the dataset.')
        else:
            col1, col2 = st.columns([1, 1])
            with col1:
                # scanpy creates its own figure here; grab it via plt.gcf().
                sc.pl.dotplot(adata_all, multi_genes, 'new_anno', size_title='Fraction of\n Expressing Cells', colorbar_title='Mean\nExpression', cmap='BuPu', swap_axes=True, show=False, vmax=5)
                dotplot_fig = plt.gcf()
                st.pyplot(dotplot_fig)
                plt.close(dotplot_fig)
            with col2:
                # show=False keeps the figure current so it can be handed to
                # Streamlit, matching the dotplot call above.
                sc.pl.heatmap(adata_all, multi_genes, groupby='new_anno', vmin=-1, vmax=1, cmap='BuPu', dendrogram=True, swap_axes=True, show=False)
                heatmap_fig = plt.gcf()
                st.pyplot(heatmap_fig)
                plt.close(heatmap_fig)
|
313 |
+
|
314 |
+
|
315 |
+
#######
|
316 |
+
|
317 |
+
#multi_genes=['WNT3', 'VPS13C', 'VAMP4', 'UBTF', 'UBAP2', 'TMEM175', 'TMEM163', 'SYT17', 'STK39', 'SPPL2B', 'SIPA1L2', 'SH3GL2', 'SCARB2', 'SCAF11', 'RPS6KL1', 'RPS12', 'RIT2', 'RIMS1', 'RETREG3', 'PMVK', 'PAM', 'NOD2', 'MIPOL1', 'MEX3C', 'MED12L', 'MCCC1', 'MBNL2', 'MAPT', 'LRRK2', 'KRTCAP2', 'KCNS3', 'KCNIP3', 'ITGA8', 'IP6K2', 'GPNMB', 'GCH1', 'GBA', 'FYN', 'FCGR2A', 'FBRSL1', 'FAM49B', 'FAM171A2', 'ELOVL7', 'DYRK1A', 'DNAH17', 'DLG2', 'CTSB', 'CRLS1', 'CRHR1', 'CLCN3', 'CHRNB1', 'CAMK2D', 'CAB39L', 'BRIP1', 'BIN3', 'ASXL3', 'SNCA']
|
318 |
+
#multi_genes=np.sort(multi_genes)
|
319 |
+
# fig, ax1 = plt.subplots(1,2)
|
320 |
+
# sc.pl.dotplot(st.session_state['adata_annot'], multi_genes,'new_anno',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='RdBu_r',show=False, ax=ax1[0])
|
321 |
+
# st.pyplot(plt.gcf().set_size_inches(10, 10))
|
322 |
+
# fig, ax2 = plt.subplots(1,2)
|
323 |
+
# ax2=sc.pl.heatmap(st.session_state['adata_annot'], multi_genes, 'new_anno', vmin=-1, vmax=1, cmap='RdBu_r', dendrogram=True, swap_axes=True)
|
324 |
+
# st.pyplot(plt.gcf().set_size_inches(10, 10))
|
325 |
+
#ax[0]=sc.pl.dotplot(st.session_state['adata_annot'],multi_genes,'new_anno',show=False)
|
326 |
+
#fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20,4), gridspec_kw={'wspace':0.9})
|
327 |
+
|
328 |
+
|
329 |
+
|
330 |
+
#commented these-working ones
|
331 |
+
|
332 |
+
# fig, (ax1) = plt.subplots(1, 1, figsize=(20,4), gridspec_kw={'wspace':0.9})
|
333 |
+
# #ax = plt.subplot()
|
334 |
+
# ax1_dict=sc.pl.dotplot(st.session_state['adata_annot'], multi_genes,'new_anno',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='BuPu',swap_axes=True,show=False, ax=ax1,vmax=5)
|
335 |
+
# #ax_dict=sc.pl.dotplot(st.session_state['adata_annot'], multi_genes,'new_anno',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='RdBu_r',swap_axes=True,show=False, ax=ax)
|
336 |
+
# st.pyplot(plt.gcf().set_size_inches(10, 15))
|
337 |
+
# #ax2_dict=sc.pl.dotplot(st.session_state['adata_annot'], multi_genes,'Sex',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='RdBu_r',swap_axes=True,show=False, ax=ax2)
|
338 |
+
# fig, (ax2) = plt.subplots(1, 1, figsize=(20,4), gridspec_kw={'wspace':0.9})
|
339 |
+
# #ax2_dict=sc.pl.matrixplot(st.session_state['adata_annot'], multi_genes, 'new_anno', vmin=-1, vmax=1, show=False, cmap='BuPu',dendrogram=True, swap_axes=True, ax=ax2)
|
340 |
+
|
341 |
+
# #sc.pl.heatmap(adata_annot, genes_lst, groupby='new_anno', vmin=-1, vmax=1, cmap='RdBu_r', dendrogram=True, swap_axes=True, figsize=(11,4))
|
342 |
+
# ax2_dict=sc.pl.heatmap(st.session_state['adata_annot'], multi_genes, groupby='new_anno', vmin=-1, vmax=1, cmap='BuPu', dendrogram=True, swap_axes=True)#,ax=ax2)
|
343 |
+
|
344 |
+
# st.pyplot(plt.gcf().set_size_inches(10, 15))
|
345 |
+
|
346 |
+
|
347 |
+
with readme:
    # NOTE(review): the help text below talks about sgRNA reference guides and
    # GRCh38/LR fasta lookups, not about the gene-expression tabs of this app
    # -- confirm it belongs here.
    expander = st.expander("How to use this app")
    usage_notes = (
        'Please select **Results Menue** checkbox from the sidebar',
        'Select a Gene from the dropdown list',
        'A table showing all reference gudies from three LISTS will appear in the main panel',
        'To see results for each of the selected reference guide from ListA, ListB and ListC, Please select respective checkbox',
        'Results are shown as two tables, **MATCHED** and **MUTATED** guides tables and **NOT FOUND** table if guides are not found in GRCh38 and LR reference fasta files',
        '**MATCHED** guides table shows the genomic postion in GRCh38 and LR Fasta file along other fields. **If a guide is found in GRCh38 but not in LR fasta, then corresponding columns will be NA**',
        '**MUTATED** guides table shows the genomic postion in GRCh38 and LR Fasta file along other fields. **If a guide is found in GRCh38 but not in LR fasta, then corresponding columns will be NA**',
    )
    for usage_note in usage_notes:
        expander.markdown(usage_note)

    expander1 = st.expander('Introduction')
    expander1.markdown(
        """ This app helps navigate all probable genomic **miss-matched/Mutations (upto 2 bp)** for a given sgRNA (from 3 lists of CRISPRi dual sgRNA libraries) in GRCh38 reference fasta and a Reference fasta generated from BAM generated against KOLF2.1J longread data.
        """
    )
    intro_notes = (
        'Merged bam file was converted to fasta file using following steps:',
        '- samtools mpileup to generate bcf file',
        '- bcftools to generate vcf file',
        '- bcftools consensus to generate fasta file',
        'A GPU based [Cas-OFFinder](http://www.rgenome.net/cas-offinder/) tool was used to find off-target sequences (upto 2 miss-matched) for each geiven reference guide against GRCh38 and LR fasta references.',
    )
    for intro_note in intro_notes:
        expander1.markdown(intro_note)
|
369 |
+
|
370 |
+
css = '''
|
371 |
+
<style>
|
372 |
+
.stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
|
373 |
+
font-size:1.5rem;
|
374 |
+
}
|
375 |
+
</style>
|
376 |
+
'''
|
377 |
+
|
378 |
+
st.markdown(css, unsafe_allow_html=True)
|
env.yml
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: scBrainApp
|
2 |
+
channels:
|
3 |
+
- conda-forge
|
4 |
+
- bioconda
|
5 |
+
- defaults
|
6 |
+
dependencies:
|
7 |
+
- ca-certificates=2023.5.7=h8857fd0_0
|
8 |
+
- libcxx=16.0.5=hd57cbcb_0
|
9 |
+
- libffi=3.3=h046ec9c_2
|
10 |
+
- libsqlite=3.42.0=h58db7d2_0
|
11 |
+
- libzlib=1.2.13=h8a1eda9_5
|
12 |
+
- ncurses=6.4=hf0c8a7f_0
|
13 |
+
- openssl=1.1.1u=h8a1eda9_0
|
14 |
+
- pip=23.1.2=pyhd8ed1ab_0
|
15 |
+
- python=3.9.0=h4f09611_5_cpython
|
16 |
+
- readline=8.2=h9e318b2_1
|
17 |
+
- setuptools=67.7.2=pyhd8ed1ab_0
|
18 |
+
- sqlite=3.42.0=h2b0dec6_0
|
19 |
+
- tk=8.6.12=h5dbffcc_0
|
20 |
+
- wheel=0.40.0=pyhd8ed1ab_0
|
21 |
+
- xz=5.2.6=h775f41a_0
|
22 |
+
- zlib=1.2.13=h8a1eda9_5
|
23 |
+
- pip:
|
24 |
+
- altair==5.0.1
|
25 |
+
- anndata==0.9.1
|
26 |
+
- appnope==0.1.3
|
27 |
+
- asttokens==2.2.1
|
28 |
+
- attrs==23.1.0
|
29 |
+
- backcall==0.2.0
|
30 |
+
- blinker==1.6.2
|
31 |
+
- cachetools==5.3.1
|
32 |
+
- certifi==2023.5.7
|
33 |
+
- charset-normalizer==3.1.0
|
34 |
+
- click==8.1.3
|
35 |
+
- contourpy==1.0.7
|
36 |
+
- cycler==0.11.0
|
37 |
+
- decorator==5.1.1
|
38 |
+
- decoupler==1.4.0
|
39 |
+
- executing==1.2.0
|
40 |
+
- fonttools==4.40.0
|
41 |
+
- gitdb==4.0.10
|
42 |
+
- gitpython==3.1.31
|
43 |
+
- h5py==3.8.0
|
44 |
+
- idna==3.4
|
45 |
+
- importlib-metadata==6.6.0
|
46 |
+
- importlib-resources==5.12.0
|
47 |
+
- ipython==8.14.0
|
48 |
+
- jedi==0.18.2
|
49 |
+
- jinja2==3.1.2
|
50 |
+
- joblib==1.2.0
|
51 |
+
- jsonschema==4.17.3
|
52 |
+
- kiwisolver==1.4.4
|
53 |
+
- llvmlite==0.40.1rc1
|
54 |
+
- markdown-it-py==3.0.0
|
55 |
+
- markupsafe==2.1.3
|
56 |
+
- matplotlib==3.7.1
|
57 |
+
- matplotlib-inline==0.1.6
|
58 |
+
- mdurl==0.1.2
|
59 |
+
- natsort==8.3.1
|
60 |
+
- networkx==3.1
|
61 |
+
- numba==0.57.0
|
62 |
+
- numpy==1.24.3
|
63 |
+
- packaging==23.1
|
64 |
+
- pandas==2.0.2
|
65 |
+
- parso==0.8.3
|
66 |
+
- patsy==0.5.3
|
67 |
+
- pexpect==4.8.0
|
68 |
+
- pickleshare==0.7.5
|
69 |
+
- pillow==9.5.0
|
70 |
+
- prompt-toolkit==3.0.38
|
71 |
+
- protobuf==4.23.2
|
72 |
+
- ptyprocess==0.7.0
|
73 |
+
- pure-eval==0.2.2
|
74 |
+
- pyarrow==12.0.1
|
75 |
+
- pydeck==0.8.1b0
|
76 |
+
- pygments==2.15.1
|
77 |
+
- pympler==1.0.1
|
78 |
+
- pynndescent==0.5.10
|
79 |
+
- pyparsing==3.0.9
|
80 |
+
- pyrsistent==0.19.3
|
81 |
+
- python-dateutil==2.8.2
|
82 |
+
- pytz==2023.3
|
83 |
+
- pytz-deprecation-shim==0.1.0.post0
|
84 |
+
- requests==2.31.0
|
85 |
+
- rich==13.4.2
|
86 |
+
- scanpy==1.9.3
|
87 |
+
- scikit-learn==1.2.2
|
88 |
+
- scipy==1.10.1
|
89 |
+
- seaborn==0.12.2
|
90 |
+
- session-info==1.0.0
|
91 |
+
- six==1.16.0
|
92 |
+
- smmap==5.0.0
|
93 |
+
- stack-data==0.6.2
|
94 |
+
- statsmodels==0.14.0
|
95 |
+
- stdlib-list==0.8.0
|
96 |
+
- streamlit==1.23.1
|
97 |
+
- tenacity==8.2.2
|
98 |
+
- threadpoolctl==3.1.0
|
99 |
+
- toml==0.10.2
|
100 |
+
- toolz==0.12.0
|
101 |
+
- tornado==6.3.2
|
102 |
+
- tqdm==4.65.0
|
103 |
+
- traitlets==5.9.0
|
104 |
+
- typing-extensions==4.6.3
|
105 |
+
- tzdata==2023.3
|
106 |
+
- tzlocal==4.3
|
107 |
+
- umap-learn==0.5.3
|
108 |
+
- urllib3==2.0.3
|
109 |
+
- validators==0.20.0
|
110 |
+
- wcwidth==0.2.6
|
111 |
+
- zipp==3.15.0
|
functions/__pycache__/pathway_analyses.cpython-39.pyc
ADDED
Binary file (26.4 kB). View file
|
|
functions/pathway_analyses.py
ADDED
@@ -0,0 +1,1015 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import scipy
|
3 |
+
import warnings
|
4 |
+
#import anndata2ri
|
5 |
+
import pandas as pd
|
6 |
+
import scanpy as sc
|
7 |
+
import numpy as np
|
8 |
+
import seaborn as sb
|
9 |
+
import decoupler as dc
|
10 |
+
from scipy import sparse
|
11 |
+
from anndata import AnnData
|
12 |
+
from tabnanny import verbose
|
13 |
+
import matplotlib.pyplot as plt
|
14 |
+
#from gsva_prep import prep_gsva
|
15 |
+
from typing import Optional, Union
|
16 |
+
from matplotlib.pyplot import rcParams
|
17 |
+
#from statsmodels.stats.multitest import multipletests
|
18 |
+
#from sklearn.model_selection import train_test_split
|
19 |
+
#from rpy2.robjects.conversion import localconverter
|
20 |
+
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
def rescale_matrix(S, log_scale=False):
    """
    Rescale every column of a count matrix to the median column total.

    Each column is divided by its own sum and multiplied by the median of
    all column sums, so that all columns end up with a comparable total.
    Columns whose sum is zero are left as zeros (their divisor is set to 1).

    Parameters
    ----------
    S : np.ndarray, scipy sparse matrix or pandas.DataFrame
        Matrix with read counts (gene x cell). Sparse input of any format
        is densified before rescaling.
    log_scale : bool, optional (default: False)
        Whether to apply ``log1p`` to the rescaled matrix.

    Returns
    -------
    B : np.ndarray
        Rescaled (and optionally log-transformed) dense matrix.

    Raises
    ------
    ValueError
        If ``S`` is not a DataFrame, ndarray or scipy sparse matrix.
    """
    if isinstance(S, pd.DataFrame):
        S = S.values
    elif sparse.issparse(S):
        # Accept CSR, CSC, COO, ... rather than CSR only.
        S = S.toarray()
    elif not isinstance(S, np.ndarray):
        raise ValueError('Input S must be a pandas.DataFrame, numpy.ndarray or scipy.sparse.csr_matrix')

    cs = np.sum(S, axis=0)
    # Avoid division by zero for empty columns.
    cs[cs == 0] = 1
    B = np.median(cs) * (S / cs)
    if log_scale:
        B = np.log1p(B)
    return B
|
55 |
+
|
56 |
+
def normalize_default(adata, log_scale=True):
    """
    Normalize an expression matrix by per-cell total count, scaled to the median.

    Counts are read from ``adata.layers['counts']`` when that layer exists,
    otherwise from ``adata.X``. The result overwrites ``adata.X`` in place.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    log_scale : bool, optional (default: True)
        Whether to log-transform the rescaled matrix.

    Returns
    -------
    adata : AnnData
        The same object, with ``adata.X`` replaced by the normalized
        (dense) matrix.
    """
    if 'counts' in adata.layers.keys():
        print('normalizaing data using count data in .layers["counts] ')
        S = adata.layers['counts']
    else:
        print('normaling data using count data in .X')
        S = adata.X

    # Densify here so that any sparse format (not only CSR) is accepted.
    if sparse.issparse(S):
        S = S.toarray()

    # AnnData matrices are stored cell x gene, whereas rescale_matrix
    # normalizes the columns of a gene x cell matrix. Transpose on the way in
    # and out so the library-size normalization is applied per cell.
    B = rescale_matrix(np.asarray(S).T, log_scale=log_scale).T
    adata.X = B
    return adata
|
81 |
+
|
82 |
+
|
83 |
+
def normalize_matrix(
    X: Union[np.ndarray, sparse.spmatrix],
    top_features_frac: float = 1.0,
    scale_factor: Union[str, float, int, np.ndarray, None] = "median",
    transformation: Union[str, None] = "log",
    anchor_features: Union[np.ndarray, None] = None,
) -> Union[np.ndarray, sparse.spmatrix]:
    """
    Library-size normalize the rows of ``X``, rescale, and transform.

    Parameters
    ----------
    X : np.ndarray or scipy sparse matrix
        Matrix whose rows are normalized (each row is divided by its
        library size).
    top_features_frac : float, optional (default: 1.0)
        When below 1.0 (and ``anchor_features`` is None), library sizes are
        computed only from columns that are non-zero in more than
        ``1 - top_features_frac`` of the rows.
    scale_factor : "median", number, np.ndarray or None
        ``"median"`` multiplies by the median over rows of
        ``sum(X) / sum(X_normalized)``; a number or array is multiplied in
        directly; ``None`` applies no rescaling.
    transformation : "log", "tukey" or None
        ``"log"`` applies ``log1p``; ``"tukey"`` maps every non-zero value
        ``v`` to ``sqrt(v) + sqrt(1 + v)``; anything else returns the scaled
        matrix unchanged.
    anchor_features : np.ndarray or None
        Column selector used to compute library sizes; takes precedence
        over ``top_features_frac``.

    Returns
    -------
    np.ndarray or scipy.sparse.csc_matrix
        Dense input gives a dense result; sparse input gives a CSC matrix.

    Raises
    ------
    ValueError
        If ``scale_factor`` is a string other than ``"median"``.
    TypeError
        If ``scale_factor`` has an unsupported type.
    """
    X = X.astype(dtype=np.float64)
    is_sparse = sparse.issparse(X)
    if is_sparse:
        # CSR supports the column indexing used below for any input format.
        X = X.tocsr()

    # Which features (i.e. genes) should we use to compute library sizes?
    if anchor_features is not None:
        lib_source = X[:, anchor_features]
    elif top_features_frac < 1.0:
        universality = np.asarray((X > 0).mean(axis=0)).ravel()
        selected_features = np.flatnonzero(universality > (1 - top_features_frac))
        lib_source = X[:, selected_features]
    else:
        lib_source = X

    # Note: mean as opposed to sum.
    # Always a column vector: a flat vector would silently broadcast along
    # the wrong axis whenever X happens to be square.
    lib_sizes = np.asarray(lib_source.mean(axis=1), dtype=np.float64).reshape(-1, 1)
    # Empty rows stay all-zero instead of turning into NaN/inf.
    lib_sizes[lib_sizes == 0] = 1.0

    # Normalize library sizes
    if is_sparse:
        X_scaled = X.multiply(1.0 / lib_sizes).tocsr()
    else:
        X_scaled = X / lib_sizes

    # Scale normalized rows
    if scale_factor is None:
        X_scaled_norm = X_scaled
    elif isinstance(scale_factor, str):
        if scale_factor != "median":
            raise ValueError("Unknown scale_factor '" + scale_factor + "'; expected 'median'.")
        raw_totals = np.asarray(X.sum(axis=1), dtype=np.float64).ravel()
        scaled_totals = np.asarray(X_scaled.sum(axis=1), dtype=np.float64).ravel()
        with np.errstate(divide="ignore", invalid="ignore"):
            ratios = raw_totals / scaled_totals
        # Rows without any counts give 0/0 and are ignored.
        finite_ratios = ratios[np.isfinite(ratios)]
        kappa = np.median(finite_ratios) if finite_ratios.size else 1.0
        X_scaled_norm = X_scaled * kappa
    elif isinstance(scale_factor, np.ndarray):
        # Multiply in both representations (the dense path used to divide,
        # contradicting the sparse path and the scalar case).
        if is_sparse:
            X_scaled_norm = X_scaled.multiply(scale_factor)
        else:
            X_scaled_norm = X_scaled * scale_factor
    elif isinstance(scale_factor, (int, float, np.integer, np.floating)):
        X_scaled_norm = X_scaled * scale_factor
    else:
        raise TypeError("scale_factor must be 'median', a number, a numpy array or None.")

    # For compatibility with C
    if sparse.issparse(X_scaled_norm):
        X_scaled_norm = sparse.csc_matrix(X_scaled_norm)

    # Post-transformation
    if transformation == "log":
        if sparse.issparse(X_scaled_norm):
            X_scaled_norm_trans = X_scaled_norm.log1p()
        else:
            X_scaled_norm_trans = np.log1p(X_scaled_norm)
    elif transformation == "tukey":
        if sparse.issparse(X_scaled_norm):
            # Drop explicit zeros so only true non-zeros are transformed.
            X_scaled_norm.eliminate_zeros()
            vv = X_scaled_norm.data
            X_scaled_norm.data = np.sqrt(vv) + np.sqrt(1 + vv)
        else:
            X_scaled_norm[X_scaled_norm < 0] = 0
            nonzero = X_scaled_norm != 0
            vv = X_scaled_norm[nonzero]
            X_scaled_norm[nonzero] = np.sqrt(vv) + np.sqrt(1 + vv)
        # The transformed matrix was previously never bound to the return
        # variable, so this branch always failed.
        X_scaled_norm_trans = X_scaled_norm
    else:
        X_scaled_norm_trans = X_scaled_norm

    return X_scaled_norm_trans
|
159 |
+
|
160 |
+
|
161 |
+
def normalize_actionet(
    adata: AnnData,
    layer_key: Optional[str] = None,
    layer_key_out: Optional[str] = None,
    top_features_frac: float = 1.0,
    scale_factor: Union[str, float, int, np.ndarray, None] = "median",
    transformation: Union[str, None] = "log",
    anchor_features: Union[np.ndarray, None] = None,
    copy: Optional[bool] = False,
) -> Optional[AnnData]:
    """
    Normalize an AnnData matrix with ``normalize_matrix`` and record the method.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    layer_key : str, optional
        Layer to read the input from. When None, the layer named in
        ``adata.uns['metadata']['input_assay']`` is used if present,
        otherwise ``adata.X``.
    layer_key_out : str, optional
        Layer to write the result to. When None the result replaces ``adata.X``.
    top_features_frac, scale_factor, transformation, anchor_features
        Forwarded unchanged to ``normalize_matrix``.
    copy : bool, optional (default: False)
        Work on (and return) a copy instead of modifying ``adata`` in place.

    Returns
    -------
    AnnData or None
        The normalized copy when ``copy`` is true, otherwise None.

    Raises
    ------
    ValueError
        If ``layer_key`` is given but is not a layer of ``adata``.

    Notes
    -----
    ``adata.uns['metadata']`` is replaced by a fresh dict holding only
    ``norm_method`` and ``default_assay``.
    """
    adata = adata.copy() if copy else adata

    # Tolerate a missing "metadata" entry: the earlier lookup was guarded by a
    # misspelled key and then indexed uns["metadata"] unconditionally.
    metadata = adata.uns["metadata"] if "metadata" in adata.uns.keys() else {}

    if "norm_method" in metadata.keys():  # Already normalized? leave it alone!
        warnings.warn("AnnData object is prenormalized. Please make sure to use the right assay.")

    if layer_key is None and "input_assay" in metadata.keys():
        layer_key = metadata["input_assay"]

    if layer_key is not None:
        if layer_key not in adata.layers.keys():
            raise ValueError("Did not find adata.layers['" + layer_key + "']. ")
        S = adata.layers[layer_key]
    else:
        S = adata.X

    # Counts are expected to be non-negative integers. Checked vectorized so
    # the cost does not depend on the largest count and an all-zero sparse
    # matrix (no stored values) is handled.
    values = S.data if sparse.issparse(S) else np.asarray(S).ravel()
    looks_like_counts = (values >= 0) & (values == np.floor(values))
    if values.size and not np.all(looks_like_counts):
        warnings.warn("Input [count] assay has non-integer values, which looks like a normalized matrix. Please make sure to use the right assay.")

    S = normalize_matrix(
        S,
        anchor_features=anchor_features,
        top_features_frac=top_features_frac,
        scale_factor=scale_factor,
        transformation=transformation,
    )

    adata.uns["metadata"] = {}
    adata.uns["metadata"]["norm_method"] = "default_top%.2f_%s" % (
        top_features_frac,
        transformation,
    )

    if layer_key_out is not None:
        adata.uns["metadata"]["default_assay"] = layer_key_out
        adata.layers[layer_key_out] = S
    else:
        adata.uns["metadata"]["default_assay"] = None
        adata.X = S

    return adata if copy else None
|
219 |
+
|
220 |
+
def read_pathways(filename):
    """
    Read a tab-separated pathway file whose rows have differing field counts.

    The file is scanned once to find the widest row, so that shorter rows
    can be padded with NaN instead of making the parser fail.

    Parameters
    ----------
    filename : str or path-like
        Path to the tab-delimited file (no header row).

    Returns
    -------
    pandas.DataFrame
        One row per line, columns labelled ``0 .. max_fields - 1``.
        An empty file yields an empty DataFrame.
    """
    # Stream the file instead of loading every line into memory.
    with open(filename, 'r') as handle:
        max_fields = max((line.count("\t") + 1 for line in handle), default=0)

    if max_fields == 0:
        return pd.DataFrame()

    column_names = list(range(max_fields))
    return pd.read_csv(filename, header=None, delimiter="\t", names=column_names)
|
226 |
+
|
227 |
+
|
228 |
+
|
229 |
+
def filter_expressed_genes_by_celltype(adata: AnnData,
                                       threshold: float=0.05,
                                       filter_genes_from: str='singlecell',
                                       subject_id: str='Subject'):
    """
    Select, for every cell type, the genes that pass an expression threshold.

    Parameters:
    -----------
    adata : AnnData object
        Annotated data matrix with an ``obs['cell_type']`` column and a
        ``'counts'`` layer.
    threshold : float, optional (default=0.05)
        Proportion used for filtering. In ``'singlecell'`` mode a gene must be
        detected in at least ``round(threshold * n_cells)`` cells of the cell
        type; in ``'pseudobulk'`` mode it is passed as ``min_prop`` to
        ``decoupler.filter_by_prop``.
    filter_genes_from: str, optional (default=`singlecell`)
        Whether to filter genes that meet threshold in pseudobulk data or singlecell data.
    subject_id (str): a string indicating the column containing individual identifiers.

    Returns:
    --------
    expressed_genes_per_celltype : pandas DataFrame
        One column per cell type holding the names of the retained genes.
        Columns are padded with missing values to the length of the longest one.

    Raises:
    -------
    ValueError
        If ``filter_genes_from`` is neither ``'pseudobulk'`` nor ``'singlecell'``
        (previously this silently produced an empty result).
    """
    if filter_genes_from not in ('pseudobulk', 'singlecell'):
        raise ValueError("filter_genes_from must be 'pseudobulk' or 'singlecell', got '" + str(filter_genes_from) + "'")

    expressed_genes_per_celltype = {}

    if filter_genes_from == 'pseudobulk':
        # Get pseudo-bulk profile
        adata = dc.get_pseudobulk(adata,
                                  sample_col=subject_id,
                                  groups_col='cell_type',
                                  layer='counts',
                                  mode='sum',
                                  min_cells=0,
                                  min_counts=0
                                  )
        for cell_type in adata.obs.cell_type.unique():
            expressed_genes_per_celltype[cell_type] = dc.filter_by_prop(adata[adata.obs['cell_type'] == cell_type],
                                                                        min_prop=threshold)
    else:
        for cell_type in adata.obs.cell_type.unique():
            # Subset once and reuse it for both the cell count and the counts layer.
            subset = adata[adata.obs['cell_type'] == cell_type]

            # Minimum number of cells a gene must be detected in.
            num_cells = round(threshold * subset.n_obs)

            gene_mask, _ = sc.pp.filter_genes(subset.layers['counts'],
                                              min_cells=num_cells, inplace=False)
            expressed_genes_per_celltype[cell_type] = list(adata.var_names[gene_mask])

    # Dict of (unequal-length) gene lists -> DataFrame with one column per cell type.
    return pd.DataFrame.from_dict(expressed_genes_per_celltype, orient='index').transpose()
|
296 |
+
|
297 |
+
|
298 |
+
def filter_lowly_exp_genes(expressed: pd.DataFrame,
                           all_paths: pd.DataFrame,
                           threshold: float = 0.33):
    """
    Keep, per cell type, the pathways with enough expressed member genes.

    A pathway is retained for a cell type when the fraction of its member
    genes found among that cell type's expressed genes is strictly greater
    than ``threshold``. Missing values used to pad either table to a
    rectangular shape are ignored; counting them previously distorted the
    fraction (NaN matches NaN in ``isin`` and padding inflated the pathway
    size).

    Parameters:
    -----------
    expressed: pandas.DataFrame
        A DataFrame of expressed genes with cell types as columns and gene IDs as rows.
    all_paths: pandas.DataFrame
        A DataFrame of gene sets with pathways as columns and gene IDs as rows.
    threshold: float, optional (default=0.33)
        A proportion threshold used to filter gene sets based on their expression in each cell type.

    Returns:
    --------
    gene_set_per_celltype: dict of pandas.DataFrame
        A dictionary of gene sets per cell type, with cell type names as keys and gene set dataframes as values.
        Each gene set dataframe has three columns: 'description', 'member', and 'name'.
        'name' is the pathway column label, 'description' its last
        space-separated token, and 'member' one expressed gene of the pathway.
    """
    columns = ['description', 'member', 'name']

    # Pathway membership does not depend on the cell type: strip padding once.
    pathway_members = {pathway: all_paths[pathway].dropna() for pathway in all_paths.columns}

    gene_set_per_celltype = {}

    for cell_type in expressed.columns:
        expressed_genes = expressed[cell_type].dropna().tolist()

        records = []
        for pathway, members in pathway_members.items():
            if members.empty:
                continue

            hits = members[members.isin(expressed_genes)]
            if len(hits) / len(members) <= threshold:
                continue

            description = str(pathway).split(" ")[-1]
            # Sorted so the output order is reproducible between runs.
            for gene in sorted(set(hits), key=str):
                records.append({'description': description, 'member': gene, 'name': pathway})

        # Build each frame in one go instead of concatenating inside the loop.
        gene_set_per_celltype[cell_type] = pd.DataFrame(records, columns=columns)

    return gene_set_per_celltype
|
351 |
+
|
352 |
+
|
353 |
+
def get_ind_level_ave(adata: AnnData, subject_id: str = 'Subject', method: str = "agg_x_num",
                      expressed_genes_per_celltype: dict = {}, filter_genes_at_threshold: bool = True):
    """
    Get averaged expression data for each cell type and individual in an AnnData object.

    Args:

        adata (AnnData): An AnnData object with an ``obs['cell_type']`` column.
        subject_id (str): a string indicating the column containing individual identifiers.
        method (str): ``"agg_x_norm"`` sums the ``'counts'`` layer per individual
            (pseudobulk) and then normalizes with ``normalize_actionet``;
            ``"norm_x_agg"`` averages the values already in ``adata.X`` per
            individual. The default ``"agg_x_num"`` is accepted as a synonym of
            ``"agg_x_norm"``.
        expressed_genes_per_celltype: mapping (dict or DataFrame) from cell type
            to the genes to keep for that cell type. Missing values are ignored.
        filter_genes_at_threshold (bool): A boolean indicating whether to restrict
            each cell type to ``expressed_genes_per_celltype``. The default is True.

    Returns:

        Dictionary: A dictionary of data frames (genes x individuals) with
        averaged expression data for each cell type.

    Raises:

        ValueError: if ``method`` is not one of the supported names.
    """
    # "agg_x_num" is the signature default but matched no branch, so a call
    # with default arguments failed on an unbound result. It is presumably a
    # misspelling of "agg_x_norm" and is treated as such.
    if method in ("agg_x_norm", "agg_x_num"):
        return _ind_ave_aggregate_then_normalize(adata, subject_id, expressed_genes_per_celltype,
                                                 filter_genes_at_threshold)
    if method == 'norm_x_agg':
        return _ind_ave_normalize_then_aggregate(adata, subject_id, expressed_genes_per_celltype,
                                                 filter_genes_at_threshold)
    raise ValueError("method must be 'agg_x_norm' or 'norm_x_agg', got '" + str(method) + "'")


def _expressed_gene_mask(var_names, expressed_genes_per_celltype, cell_type):
    """Return a boolean mask over ``var_names`` marking the genes kept for ``cell_type``."""
    # Gene lists may be NaN-padded DataFrame columns; drop the padding.
    genes = pd.Series(expressed_genes_per_celltype[cell_type]).dropna()
    return var_names.isin(genes.tolist())


def _ind_ave_aggregate_then_normalize(adata, subject_id, expressed_genes_per_celltype,
                                      filter_genes_at_threshold):
    """Pseudobulk the counts per individual within each cell type, then normalize."""
    avs_logcounts_cellxind = {}

    for cell_type in adata.obs.cell_type.unique():
        # Always restrict to the current cell type; the unfiltered path used to
        # pseudobulk the whole object and file it under every cell type.
        adata_temp = adata[adata.obs.cell_type == cell_type].copy()

        if filter_genes_at_threshold:
            keep = _expressed_gene_mask(adata_temp.var_names, expressed_genes_per_celltype, cell_type)
            adata_temp = adata_temp[:, keep]

        # Get pseudo-bulk profile
        pdata = dc.get_pseudobulk(adata_temp, sample_col=subject_id, groups_col='cell_type', layer='counts',
                                  mode='sum', min_cells=0, min_counts=0)

        # Normalize and log transform
        pdata.layers['counts'] = pdata.X
        pdata = normalize_actionet(pdata, layer_key='counts', layer_key_out=None,
                                   top_features_frac=1.0, scale_factor="median",
                                   transformation="log", anchor_features=None, copy=True)

        values = pdata.X
        if sparse.issparse(values):
            values = values.toarray()

        # Store the log-normalized expression data for each individual (genes x individuals)
        avs_logcounts_cellxind[cell_type] = pd.DataFrame(values.T, columns=pdata.obs[subject_id],
                                                         index=pdata.var_names)

        del adata_temp, pdata

    return avs_logcounts_cellxind


def _ind_ave_normalize_then_aggregate(adata, subject_id, expressed_genes_per_celltype,
                                      filter_genes_at_threshold):
    """Average the values in ``adata.X`` per individual within each cell type."""
    cell_types = adata.obs['cell_type'].astype(str).to_numpy()
    subjects = adata.obs[subject_id].astype(str).to_numpy()

    avs_by_ind_out = {}

    # Cell types and individuals are grouped directly from the two label
    # vectors. Joining them into "celltype_individual" strings and splitting
    # on "_" again broke names containing an underscore, and pivoting on the
    # cell names re-sorted the cells relative to the rows of adata.X.
    for cell_type in np.unique(cell_types):
        in_type = cell_types == cell_type
        subject_names, subject_codes = np.unique(subjects[in_type], return_inverse=True)
        subject_codes = np.asarray(subject_codes).ravel()
        n_cells = int(in_type.sum())

        # (individuals x cells) membership matrix, rows of X kept in their own order.
        indicator = sparse.csr_matrix(
            (np.ones(n_cells), (subject_codes, np.arange(n_cells))),
            shape=(len(subject_names), n_cells),
        )

        summed = indicator @ adata.X[in_type]
        if sparse.issparse(summed):
            summed = summed.toarray()

        cells_per_subject = np.bincount(subject_codes, minlength=len(subject_names))
        means = np.asarray(summed) / cells_per_subject[:, None]

        df = pd.DataFrame(means.T, index=adata.var_names, columns=subject_names)

        if filter_genes_at_threshold:
            df = df.loc[_expressed_gene_mask(adata.var_names, expressed_genes_per_celltype, cell_type)]

        avs_by_ind_out[cell_type] = df

    return avs_by_ind_out
|
501 |
+
|
502 |
+
|
503 |
+
def plot_and_select_top_deps(all_pathways: pd.DataFrame,
                             list_of_paths_to_annotate: list = [],
                             save_name='cell_type_specific',
                             save_prefix: str = 'mathys_pfc',
                             filter: bool=False,
                             cell_type_specific: bool = True,
                             test_name: str = ''):
    """
    Draw a pathway x cell-type heatmap of ``score_adj`` and save it as a PDF.

    Parameters
    ----------
    all_pathways : pandas.DataFrame
        Long-format pathway results. The columns 'pathway', 'celltype',
        'score_adj', 'shortened' and 'highlight' are read; rows are grouped
        by the frame's index.
    list_of_paths_to_annotate : list, optional
        'shortened' names to keep when ``filter`` is true. Not modified.
    save_name, save_prefix : str
        Parts of the output file name.
    filter : bool, optional (default: False)
        If true, plot only pathways listed in ``list_of_paths_to_annotate``;
        otherwise plot every pathway whose 'shortened' name is not 'None'.
    cell_type_specific : bool, optional (default: True)
        If true, keep pathways reported for exactly one cell type;
        otherwise keep pathways reported for more than one.
    test_name : str
        Comparison name; used in the title and as the sub-directory of
        ``results/`` the figure is written to.

    Returns
    -------
    None
        The figure is written to
        ``results/{test_name}/{save_prefix}_{filtered|all}_{save_name}_diff_exp_paths.pdf``.
    """
    xticks = ['Excitatory', 'Inhibitory', 'Astrocyte', 'Oligodendrocyte', 'OPC', 'Microglia', 'Endothelial']

    # Collect, per pathway, the cell types it was reported in.
    collated_df = all_pathways.groupby(all_pathways.index).agg({'celltype': list, 'shortened': list,
                                                                'highlight': list})
    n_celltypes = collated_df["celltype"].apply(len)
    df = collated_df[n_celltypes == 1] if cell_type_specific else collated_df[n_celltypes > 1]

    # create pathway by cell type pivot table
    scores_table = pd.pivot_table(all_pathways, values='score_adj', index='pathway', columns='celltype')
    scores_table = scores_table.loc[df.index]
    scores_table['shortened'] = df.shortened.apply(lambda x: x[0])
    scores_table['highlight'] = df.highlight.apply(lambda x: x[0])
    scores_table.sort_values(by=list(all_pathways.celltype.unique()), inplace=True)

    # drop pathways with same shortened names
    scores_table = scores_table.drop_duplicates(subset='shortened', keep='first')

    if filter:
        # select only pathways that should be visualized
        scores_table = scores_table[scores_table.shortened.isin(list_of_paths_to_annotate)]
        file_tag = 'filtered'
        scope = 'Select Cell-type-specific' if cell_type_specific else 'Select Shared'
        style = {'linewidths': 0.15, 'cbar_shrink': 0.2, 'labelsize': 4, 'tick_width': 0.5}
    else:
        scores_table = scores_table[scores_table.shortened != 'None']
        file_tag = 'all'
        scope = 'All Cell-type-specific' if cell_type_specific else 'All Broad'
        style = {'linewidths': 0.07, 'cbar_shrink': 0.1, 'labelsize': 2, 'tick_width': 0.25}

    yticklabels = scores_table['shortened']

    # Order columns by cell type; reindex leaves an empty column for a cell
    # type absent from the data instead of raising KeyError.
    scores_table = scores_table.reindex(columns=xticks)

    if scores_table.empty:
        # A figure of height zero cannot be created.
        warnings.warn("No pathways left to plot; no figure was written.")
        return

    title = f'{scope} Pathways in {test_name.split("_")[0]}- vs {test_name.split("_")[-1]}-pathology'
    out_path = f'results/{test_name}/{save_prefix}_{file_tag}_{save_name}_diff_exp_paths.pdf'

    _draw_pathway_heatmap(scores_table, xticks, yticklabels, title, out_path, **style)

    return


def _draw_pathway_heatmap(scores_table, xticks, yticklabels, title, out_path,
                          linewidths, cbar_shrink, labelsize, tick_width):
    """Render one score heatmap with the shared styling and save it to ``out_path``."""
    import os

    n_rows = len(scores_table)

    # tight_layout is the layout actually applied; it is not combined with a
    # constrained-layout engine, which it is incompatible with.
    fig, ax1 = plt.subplots(1, 1, figsize=(0.5, n_rows*0.095), sharex=False)

    g1 = sb.heatmap(scores_table, cmap='bwr', center=0, vmin=-2.5, vmax=2.5,
                    linewidths=linewidths, linecolor='black', cbar_kws={'shrink': cbar_shrink},
                    ax=ax1, xticklabels=xticks, yticklabels=yticklabels)

    # The colorbar is the last axes added to the figure.
    cax = g1.figure.axes[-1]

    g1.set_title(title, fontsize=3)
    g1.set_ylabel('')
    g1.set_xlabel('')

    ax1.tick_params(axis='both', which='major', labelsize=labelsize, length=1.5, width=tick_width)
    cax.tick_params(labelsize=4, length=1.5, width=tick_width, which="major")

    plt.tight_layout()

    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path, bbox_inches='tight')
    plt.show(block=False)
|
687 |
+
|
688 |
+
|
689 |
+
def multi_study_pathway_overlap(pathway_scores: dict = None,
                                filtered_pathways: list = None,
                                cell_types: list = None,
                                test_name: str = 'ad_vs_no',
                                top_n: int = 10,
                                pathways: list = None,
                                filter: bool = False,
                                save_suffix: str = 'ad_vs_no',
                                method: str = 'cell_type_overlap'):
    """
    Plot side-by-side heatmaps of the pathways shared across several studies.

    One heatmap panel is drawn per study, showing the `score_adj` value of every
    shared pathway (rows) in every requested cell type (columns). The figure is
    saved under `results/{test_name}/` and the per-study pivoted score tables
    are returned.

    Parameters:
    -----------
    pathway_scores : dict
        Mapping `study -> {test_name -> DataFrame}`. Each DataFrame must provide
        the columns `celltype`, `pathway` and `score_adj`. The mapping is not
        modified; filtering is done on local views.
    filtered_pathways : list, optional
        Pathways to keep when `filter` is True. Default is no pathways.
    cell_types : list, optional
        Cell types to include (and their column order). Default is ["Excitatory",
        "Inhibitory", "Astrocyte", "Microglia", "Oligodendrocyte", "OPC", "Endothelial"].
    test_name : str, optional
        Key of the comparison to plot inside each study. Default is 'ad_vs_no'.
    top_n : int, optional
        Currently unused; kept for interface compatibility.
    pathways : list, optional
        Currently unused; kept for interface compatibility.
    filter : bool, optional
        If True, only shared pathways that are also in `filtered_pathways` are
        plotted and returned. Default is False.
    save_suffix : str, optional
        Currently unused; kept for interface compatibility.
    method : str, optional
        'cell_type_overlap' collects, for each cell type, the pathways present in
        every study for that cell type and takes the union over cell types.
        'global_overlap' keeps pathways present in every study regardless of cell
        type. Default is 'cell_type_overlap'.

    Returns:
    --------
    filtered_scores : dict
        Mapping `study -> DataFrame` (index: pathway, columns: `cell_types`) with
        the scores of the shared pathways.

    Raises:
    -------
    ValueError
        If `pathway_scores` is empty or `method` is not one of the supported values.

    Examples:
    ---------
    >>> multi_study_pathway_overlap(pathway_scores, filtered_pathways=['pathway1', 'pathway2'],
                                    cell_types=['Excitatory', 'Astrocyte'], test_name='ad_vs_no', filter=True)
    """

    if not pathway_scores:
        raise ValueError("pathway_scores must contain at least one study.")

    if cell_types is None:
        cell_types = ["Excitatory", "Inhibitory", "Astrocyte",
                      "Microglia", "Oligodendrocyte", "OPC", "Endothelial"]
    # A list is required: DataFrame[...] treats a tuple as a single key.
    cell_types = list(cell_types)
    filtered_pathways = [] if filtered_pathways is None else filtered_pathways

    studies = list(pathway_scores)
    n_studies = len(studies)

    # Restrict every study to the requested cell types without touching the
    # caller's dictionary.
    tables = {}
    for study in studies:
        table = pathway_scores[study][test_name]
        tables[study] = table[table.celltype.isin(cell_types)]

    # Plain set intersections: no code strings are built from study names.
    if method == "cell_type_overlap":
        overlap = set()
        for cell_type in cell_types:
            per_study = [set(table[table.celltype == cell_type].pathway)
                         for table in tables.values()]
            overlap |= set.intersection(*per_study)
    elif method == "global_overlap":
        overlap = set.intersection(*(set(table.pathway) for table in tables.values()))
    else:
        raise ValueError(
            f"Unknown method {method!r}; expected 'cell_type_overlap' or 'global_overlap'.")

    # `overlap` is a set, so a pathway shared in several cell types is counted
    # once and the figure height matches the number of heatmap rows.
    if filter:
        n_rows = len(set(filtered_pathways) & overlap)
    else:
        n_rows = len(overlap)

    # One panel per study; the last panel is slightly wider to host the colorbar.
    # For three studies this reproduces the 3.5 inch wide [0.85, 0.85, 1] layout.
    fig, axs = plt.subplots(1, n_studies, figsize=(3.5 * n_studies / 3, n_rows*0.095),
                            gridspec_kw={'width_ratios': [0.85] * (n_studies - 1) + [1]},
                            sharex=False, sharey=True, layout='constrained', squeeze=False)
    axs = axs[0]  # squeeze=False always returns a 2-D array, even for one study
    fig.tight_layout()

    overlap_list = list(overlap)
    filtered_scores = {}
    shortened_names = {}

    for i, study in enumerate(studies):
        shared = tables[study][tables[study].pathway.isin(overlap_list)]
        scores = pd.pivot_table(shared, values='score_adj', index='pathway', columns='celltype')
        scores = scores[cell_types]

        if filter:
            scores = scores.loc[scores.index.isin(filtered_pathways)]

        filtered_scores[study] = scores

        # Drop the last space-separated token of each pathway name for display.
        shortened_names[study] = [' '.join(name.split(" ")[:-1]) for name in scores.index]

        # Only the last panel carries the colorbar.
        cbar = (i == n_studies - 1)
        g1 = sb.heatmap(scores, cmap='bwr', center=0, vmin=-2.5, vmax=2.5, robust=False, annot=None, fmt='.1g',
                        linewidths=0.015, linecolor='black', annot_kws=None, cbar_kws={'shrink': 0.2}, cbar=cbar,
                        cbar_ax=None, square=False, ax=axs[i], xticklabels=cell_types,
                        yticklabels=shortened_names[study], mask=None,)

        axs[i].tick_params(axis='both', which='major', labelsize=2.5, length=1.5, width=0.5)

        g1.set_title(study.split('_')[-1].upper(), fontsize=3)
        g1.set_ylabel('', fontsize=4)
        g1.set_xlabel('')

    # The colorbar is the last axes added to the figure; style its ticks once.
    cax = fig.axes[-1]
    cax.tick_params(labelsize=4, length=1.5, width=0.5, which="major")

    plt.suptitle(f"{test_name.split('_')[0].capitalize()}- vs {test_name.split('_')[-1]}-pathology", fontsize=4)

    if filter:
        plt.savefig(f'results/{test_name}/multi_study_pathway_overlap_filtered.pdf', bbox_inches='tight')
    else:
        plt.savefig(f'results/{test_name}/multi_study_pathway_overlap_all.pdf', bbox_inches='tight')
    plt.show(block=False)

    return filtered_scores
|
816 |
+
|
817 |
+
|
818 |
+
def save_plot(fig, ax, save):
    """
    Save `fig` to `save` when a target path is given.

    Parameters
    ----------
    fig : Figure, None
        Object exposing `savefig`; required when `save` is not None.
    ax : Axes, None
        Axes the plot was drawn on; required when `save` is not None.
    save : str, None
        Output path. If None nothing happens.

    Raises
    ------
    ValueError
        If `save` is given but `ax` or `fig` is None (`ax` is checked first).
    """
    # Nothing requested: leave silently.
    if save is None:
        return

    if ax is None:
        raise ValueError("ax is None, cannot save figure.")

    if fig is None:
        raise ValueError("fig is None, cannot save figure.")

    fig.savefig(save, bbox_inches='tight')
|
827 |
+
|
828 |
+
|
829 |
+
def check_if_matplotlib(return_mpl=False):
    """
    Import matplotlib lazily and return the requested module.

    Parameters
    ----------
    return_mpl : bool
        If True return the top-level `matplotlib` module, otherwise return
        `matplotlib.pyplot`.

    Raises
    ------
    ImportError
        If the import fails for any reason.
    """
    try:
        if return_mpl:
            import matplotlib as module
        else:
            import matplotlib.pyplot as module
    except Exception:
        raise ImportError('matplotlib is not installed. Please install it with: pip install matplotlib')
    return module
|
842 |
+
|
843 |
+
|
844 |
+
def check_if_seaborn():
    """
    Import seaborn lazily and return the module.

    Raises
    ------
    ImportError
        If the import fails for any reason.
    """
    try:
        import seaborn
    except Exception:
        raise ImportError('seaborn is not installed. Please install it with: pip install seaborn')
    else:
        return seaborn
|
850 |
+
|
851 |
+
|
852 |
+
def check_if_adjustText():
    """
    Import adjustText lazily and return the module.

    Raises
    ------
    ImportError
        If the import fails for any reason.
    """
    try:
        import adjustText
    except Exception:
        raise ImportError('adjustText is not installed. Please install it with: pip install adjustText')
    else:
        return adjustText
|
858 |
+
|
859 |
+
|
860 |
+
def filter_limits(df, sign_limit=None, lFCs_limit=None):
    """
    Keep only the rows of `df` that lie strictly inside the given limits.

    A row is kept when `pvals < |sign_limit|` and `|logFCs| < |lFCs_limit|`.
    Both comparisons are strict, so values equal to a limit (including infinite
    values when no limit is given) are removed.

    Parameters
    ----------
    df : pd.DataFrame
        Input table with the columns `pvals` and `logFCs`.
    sign_limit : float, None
        Upper bound for the `pvals` column. If None, defaults to infinity.
    lFCs_limit : float, None
        Upper bound for the absolute value of `logFCs`. If None, defaults to infinity.

    Returns
    -------
    pd.DataFrame
        The filtered DataFrame.
    """
    # A missing limit means "unbounded"; the sign of a given limit is ignored.
    max_sign = np.abs(np.inf if sign_limit is None else sign_limit)
    max_lfcs = np.abs(np.inf if lFCs_limit is None else lFCs_limit)

    keep = (df['pvals'] < max_sign) & (np.abs(df['logFCs']) < max_lfcs)
    return df.loc[keep]
|
892 |
+
|
893 |
+
|
894 |
+
def plot_volcano(data, x, y, x_label=None, y_label='-log10(pvals)', annotate=True,
                 annot_by='top', names=None,
                 top=5, sign_thr=0.05, lFCs_thr=0.5, sign_limit=None, lFCs_limit=None,
                 figsize=(7, 5), dpi=100, ax=None, return_fig=False, save=None,
                 fontsizes=None):
    """
    Plot logFC and p-values from a long formatted data-frame as a volcano plot.

    Parameters
    ----------
    data : pd.DataFrame
        Results of DEA in long format. Its index provides the annotation labels.
    x : str
        Column name of data storing the logFCs.
    y : str
        Column name of data storing the p-values.
    x_label : str, None
        Alternate name for the logFC axis. If None, defaults to x.
    y_label : str, None
        Alternate name for the p-value axis. If None, defaults to y.
    annotate : bool
        Whether to annotate labels.
    annot_by : str
        Determines which significant features are annotated. If 'top', the `top`
        features with the largest -log10 p-values are annotated. If 'name', only
        the features listed in `names` are annotated. Any other value leaves all
        significant features annotated.
    names : list, None
        Feature names to annotate. Only used if annot_by is 'name'.
    top : int
        Number of top differentially expressed features to show.
    sign_thr : float
        Significance threshold for p-values.
    lFCs_thr : float
        Significance threshold for logFCs.
    sign_limit : float
        Limit of p-values to plot in -log10.
    lFCs_limit : float
        Limit of logFCs to plot in absolute value.
    figsize : tuple
        Figure size.
    dpi : int
        DPI resolution of figure.
    ax : Axes, None
        A matplotlib axes object. If None a new figure is created.
    return_fig : bool
        Whether to return a Figure object or not.
    save : str, None
        Path to where to save the plot. Infer the filetype if ending on {`.pdf`, `.png`, `.svg`}.
    fontsizes : dict, None
        Font sizes; the key 'on_plot' sets the annotation size (default 4).

    Returns
    -------
    fig : Figure, None
        If return_fig, returns the Figure created here (None when `ax` was supplied).
    """

    if x_label is None:
        x_label = x

    if y_label is None:
        y_label = y

    names = [] if names is None else names
    label_size = 4 if fontsizes is None else fontsizes.get('on_plot', 4)

    # Load plotting packages
    plt = check_if_matplotlib()
    at = check_if_adjustText()

    # Transform sign_thr to the -log10 scale used on the y axis
    sign_thr = -np.log10(sign_thr)

    # Extract df; work on a copy so the caller's table is left untouched
    df = data.copy()
    df['logFCs'] = df[x]
    df['pvals'] = -np.log10(df[y])

    # Filter by limits
    df = filter_limits(df, sign_limit=sign_limit, lFCs_limit=lFCs_limit)

    # Define color by up or down regulation and significance
    df['weight'] = 'gray'
    up_msk = (df['logFCs'] >= lFCs_thr) & (df['pvals'] >= sign_thr)
    dw_msk = (df['logFCs'] <= -lFCs_thr) & (df['pvals'] >= sign_thr)
    df.loc[up_msk, 'weight'] = '#D62728'
    df.loc[dw_msk, 'weight'] = '#1F77B4'

    # Plot. When `ax` is supplied no figure is created here, so `fig` stays None.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)

    # Marker size shrinks with the number of points; max() avoids a division by
    # zero when no row survives the filtering.
    n = df.shape[0]
    size = 120000 / (100 * max(n, 1))

    df.plot.scatter(x='logFCs', y='pvals', c='weight', sharex=False, ax=ax, s=size)

    # Draw sign lines
    ax.axhline(y=sign_thr, linestyle='--', color="black")
    ax.axvline(x=lFCs_thr, linestyle='--', color="black")
    ax.axvline(x=-lFCs_thr, linestyle='--', color="black")

    # Significant features, most significant first
    signs = df[up_msk | dw_msk].sort_values('pvals', ascending=False)

    # Add labels
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)

    if annotate:
        if annot_by == 'top':
            signs = signs.iloc[:top]
        elif annot_by == 'name':
            signs = signs.loc[signs.index.isin(names)]

        texts = [
            ax.text(lfc, pval, label, fontsize=label_size)
            for lfc, pval, label in zip(signs['logFCs'], signs['pvals'], signs.index)
        ]
        if texts:
            at.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'), ax=ax)

    save_plot(fig, ax, save)

    if return_fig:
        return fig
|
pathway_databases/GO_Biological_Process_2021.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|