syedislamuddin commited on
Commit
16c6962
·
1 Parent(s): e37c044

Upload 5 files

Browse files
app.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import streamlit as st
5
+
6
+ import scanpy as sc
7
+
8
+ #import mpld3
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+
12
+
13
+ import streamlit.components.v1 as components
14
+
15
+ from IPython.display import Markdown as md
16
+
17
+ from functions import pathway_analyses
18
+
19
+ # SMALL_SIZE = 2
20
+ # MEDIUM_SIZE = 2
21
+ # BIGGER_SIZE = 2
22
+
23
+ # plt.rc('font', size=SMALL_SIZE) # controls default text sizes
24
+ # plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
25
+ # plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
26
+ # plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
27
+ # plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
28
+ # plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
29
+ # plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
30
+
31
+ sc.settings.set_figure_params(dpi=80, facecolor='white')
32
+
33
+ #disable st.pyplot warning
34
+ st.set_page_config(layout="wide")
35
+ st.markdown(
36
+ """
37
+ <style>
38
+ .streamlit-expanderHeader {
39
+ font-size: x-large;
40
+ }
41
+ </style>
42
+ """,
43
+ unsafe_allow_html=True,
44
+ )
45
+ m=st.markdown("""
46
+
47
+ <style>
48
+
49
+ div.stTitle {
50
+
51
+ font-size:40px;
52
+
53
+ }
54
+
55
+ </style>"""
56
+ ,unsafe_allow_html=True)
57
+
58
+ st.set_option('deprecation.showPyplotGlobalUse', False)
59
+
60
+ #load Data
61
+ cwd=os.getcwd()+'/'#+'data/'
62
@st.cache_resource
def _load_annotated_data():
    """Read the annotated AnnData file once per server process.

    ``st.cache_resource`` hands the same object to every session, so the
    file is not re-read for each new browser tab.
    """
    return sc.read_h5ad(cwd+'multiregion_brainaging_annotated.h5ad')


@st.cache_data
def _load_go_gene_sets():
    """Build a table with one column per GO Biological Process pathway.

    Each column holds the genes listed for that pathway; shorter pathways
    are padded with missing values by the DataFrame constructor.
    """
    bp = pathway_analyses.read_pathways('pathway_databases/GO_Biological_Process_2021.txt')

    # First column is the pathway name; remaining columns are its genes.
    go_bp_paths = bp.set_index(0)
    go_bp_paths.fillna("", inplace=True)
    go_bp_paths_dict = go_bp_paths.to_dict(orient='index')

    # Drop the "" padding so every pathway keeps only its real gene names.
    gene_set_by_path = {
        key: [val for val in value.values() if val != ""]
        for key, value in go_bp_paths_dict.items()
    }
    return pd.DataFrame.from_dict(gene_set_by_path, orient='index').transpose()


def get_data():
    """Populate ``st.session_state`` with everything the tabs read.

    Keys written (only when missing): ``adata_annot``, ``genes_list``,
    ``cell_type``, ``broad_type``, ``path_ways`` and ``go_table``.

    This function is intentionally NOT cached itself: its whole purpose is
    the per-session side effect on ``st.session_state``. Caching it made a
    second session skip the body and then fail on the missing keys. The
    expensive work is cached in the two private loaders instead.
    """
    state = st.session_state

    if 'adata_annot' not in state:
        state['adata_annot'] = _load_annotated_data()
    # Always read back from the session so later branches never depend on a
    # local that was only bound inside the branch above.
    adata_annot = state['adata_annot']

    if 'genes_list' not in state:
        state['genes_list'] = sorted(adata_annot.var.index.unique())
    if 'cell_type' not in state:
        state['cell_type'] = sorted(adata_annot.obs.new_anno.unique())
    if 'broad_type' not in state:
        state['broad_type'] = sorted(adata_annot.obs.broad_celltype.unique())

    # Also load GO terms
    if 'go_table' not in state:
        gene_set_by_path = _load_go_gene_sets()
        state['path_ways'] = gene_set_by_path.columns
        state['go_table'] = gene_set_by_path
105
+ #done load Data
106
+
107
+
108
+
109
+ #st.title('Single nuclei atlas of human aging in brain regions')
110
+ st.title('Brain Age Browser')
111
+
112
+ #genes_list,adata_annot=get_data()
113
+
114
+ get_data()
115
+
116
+ tab1, tab2,readme = st.tabs(["Gene Expression by CellType", "Age associations for multiple genes", "README"])
117
+ data = np.random.randn(10, 1)
118
+ with tab1:
119
+ with st.form(key='columns_in_form'):
120
+ #c1, c2, c3 = st.columns([4,4,2])
121
+ c1, c2 = st.columns(2)
122
+ with c1:
123
+ selected_gene = st.selectbox(
124
+ 'Please select a gene',
125
+ st.session_state['genes_list'])
126
+ with c2:
127
+ selected_celltype = st.selectbox(
128
+ 'Please select CellType',
129
+ st.session_state['cell_type']
130
+ )
131
+ # with c3:
132
+ # plot_choice = st.checkbox(
133
+ # "Which Plots",
134
+ # ('Gene','Old/Young'))
135
+
136
+
137
+
138
+ Updated=st.form_submit_button(label = 'Go')
139
+ if not isinstance(selected_gene, type(None)) and not isinstance(selected_celltype, type(None)) and Updated:
140
+
141
+
142
+ # fig11, axx1 = plt.subplots()
143
+ # sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data',legend_fontsize='4', frameon=False,show=False, ax=axx1)
144
+ # st.pyplot(plt.gcf().set_size_inches(4, 4))
145
+
146
+ col1,col2= st.columns([1,1])
147
+ with col1:
148
+ fig11, axx11 = plt.subplots(figsize=(5,5))
149
+ sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data',legend_fontsize='8', frameon=False,show=False, ax=axx11)
150
+ st.pyplot(fig11)
151
+
152
+ with col2:
153
+ fig12, axx12 = plt.subplots(figsize=(5,5))
154
+ #sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data', frameon=False,show=False, ax=axx2)
155
+ sc.pl.umap(st.session_state['adata_annot'], color=selected_gene, title=selected_gene, legend_loc='best', frameon=False,show=False,legend_fontsize='xx-small', ax=axx12)#,vmax='p99')
156
+ #plt.xticks(rotation = 45)
157
+ st.pyplot(fig12)
158
+
159
+ #Subset Younv and Old
160
+ adata_Young = st.session_state['adata_annot'][st.session_state['adata_annot'].obs['Age_group']=='young']
161
+ adata_Old = st.session_state['adata_annot'][st.session_state['adata_annot'].obs['Age_group']=='old']
162
+
163
+ #Young/Old but for cell_type
164
+ adata_YoungAst = adata_Young[adata_Young.obs['new_anno']==selected_celltype]
165
+ adata_OldAst = adata_Old[adata_Old.obs['new_anno']==selected_celltype]
166
+
167
+ # # #Young/Old but for cell_type
168
+ # # adata_YoungAst = adata_Young[adata_Young.obs['broad_celltype']==selected_celltype]
169
+ # # adata_OldAst = adata_Old[adata_Old.obs['broad_celltype']==selected_celltype]
170
+
171
+ #Young
172
+ dot_size=.05
173
+ col1,col2= st.columns([1,1])
174
+
175
+ with col1:
176
+
177
+ #st.markdown('<div style="text-align: center;">**Young**</div>', unsafe_allow_html=True)
178
+ str_title='Young: '+selected_gene
179
+ #st.markdown("<h3 style='text-align: center; color: red;'>str_title</h3>", unsafe_allow_html=True)
180
+ st.markdown("# {} ".format(str_title))#,align_text='center')
181
+ #md("# {} ".format(str_title))
182
+
183
+ fig21, axx21 = plt.subplots(figsize=(1,1))
184
+ #sc.pl.umap(adata_Young, color=selected_gene, title="Young: "+selected_gene, legend_loc='right margin', color_map='viridis',frameon=False,show=False,size=dot_size, legend_fontsize='4',colorbar_loc=None,ax=axx21)
185
+ sc.pl.umap(adata_Young, color=selected_gene, title="", legend_loc='right margin', color_map='viridis',frameon=False,show=False,size=dot_size, legend_fontsize='xx-small',colorbar_loc=None,ax=axx21)
186
+ #st.pyplot(fig21)
187
+ st.pyplot(plt.gcf())
188
+ with col2:
189
+ str_title='Young: '+selected_gene+" ("+selected_celltype+")"
190
+ st.markdown("# {} ".format(str_title))#,align_text='center')
191
+ fig22, axx22 = plt.subplots(figsize=(1,1))
192
+ #sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data', frameon=False,show=False, ax=axx2)
193
+ #sc.pl.umap(st.session_state['adata_annot'], color=selected_gene, title=selected_gene, legend_loc='best', frameon=False,show=False, ax=axx2)#,vmax='p99')
194
+ sc.pl.umap(adata_YoungAst, color=selected_gene, title="", legend_loc='right margin', color_map='viridis', frameon=False,show=False,size=dot_size,legend_fontsize='xx-small',colorbar_loc=None, ax=axx22)
195
+ #sc.pl.umap(adata_Old, color=selected_gene, title="Old: "+selected_gene, legend_loc='right margin', color_map='viridis', frameon=False,show=False, ax=axx22)
196
+ #plt.xticks(rotation = 45)
197
+ #st.pyplot(fig22)
198
+ st.pyplot(plt.gcf())
199
+ #Old
200
+ col1,col2= st.columns([1,1])
201
+ with col1:
202
+ str_title='Old: '+selected_gene+" ("+selected_celltype+")"
203
+ st.markdown("# {} ".format(str_title))#,align_text='center')
204
+
205
+ fig31, axx31 = plt.subplots(figsize=(1,1))
206
+ #sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data',legend_fontsize='8', frameon=False,show=False, ax=axx1)
207
+ sc.pl.umap(adata_Old, color=selected_gene, title="", legend_loc='right margin', color_map='viridis', frameon=False,show=False,size=dot_size,legend_fontsize='xx-small', colorbar_loc="bottom",ax=axx31)
208
+ st.pyplot(fig31)
209
+ with col2:
210
+ str_title='Old: '+selected_gene+" ("+selected_celltype+")"
211
+ st.markdown("# {} ".format(str_title))#,align_text='center')
212
+
213
+ fig32, axx32 = plt.subplots(figsize=(1,1))
214
+ #sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data', frameon=False,show=False, ax=axx2)
215
+ #sc.pl.umap(st.session_state['adata_annot'], color=selected_gene, title=selected_gene, legend_loc='best', frameon=False,show=False, ax=axx2)#,vmax='p99')
216
+ sc.pl.umap(adata_OldAst, color=selected_gene, title="", legend_loc='right margin', color_map='viridis', frameon=False,show=False,size=dot_size,legend_fontsize='xx-small', colorbar_loc="bottom",ax=axx32)
217
+ #plt.xticks(rotation = 45)
218
+ st.pyplot(fig32)
219
+
220
+ # fig, ax = plt.subplots(3,2)
221
+ # sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data', frameon=False,show=False, ax=ax[0,0])
222
+ # sc.pl.umap(st.session_state['adata_annot'], color=selected_gene, title=selected_gene, legend_loc='best', frameon=False,show=False, ax=ax[0,1],vmax='p99')
223
+
224
+ # #Subset Younv and Old
225
+ # adata_Young = st.session_state['adata_annot'][st.session_state['adata_annot'].obs['Age_group']=='young']
226
+ # adata_Old = st.session_state['adata_annot'][st.session_state['adata_annot'].obs['Age_group']=='old']
227
+ # sc.pl.umap(adata_Young, color=selected_gene, title="Young: "+selected_gene, legend_loc='right margin', color_map='viridis',frameon=False,show=False, ax=ax[1,0])
228
+ # sc.pl.umap(adata_Old, color=selected_gene, title="Old: "+selected_gene, legend_loc='right margin', color_map='viridis', frameon=False,show=False, ax=ax[2,0])
229
+
230
+ # # #Young/Old but for cell_type
231
+ # # adata_YoungAst = adata_Young[adata_Young.obs['broad_celltype']==selected_celltype]
232
+ # # adata_OldAst = adata_Old[adata_Old.obs['broad_celltype']==selected_celltype]
233
+
234
+ # #Young/Old but for cell_type
235
+ # adata_YoungAst = adata_Young[adata_Young.obs['new_anno']==selected_celltype]
236
+ # adata_OldAst = adata_Old[adata_Old.obs['new_anno']==selected_celltype]
237
+
238
+
239
+ # sc.pl.umap(adata_YoungAst, color=selected_gene, title=selected_celltype, legend_loc='right margin', color_map='viridis', frameon=False,show=False, ax=ax[1,1])
240
+ # sc.pl.umap(adata_OldAst, color=selected_gene, title=selected_celltype, legend_loc='right margin', color_map='viridis', frameon=False,show=False, ax=ax[2,1])
241
+
242
+ # #sc.pl.umap(st.session_state['adata_annot'], color='Brain_region', title='Brain Region', legend_loc='right margin', frameon=False,show=False, ax=ax[1,1])
243
+ # #sc.pl.umap(st.session_state['adata_annot'], color='Age_group', title='Age Group', legend_loc='right margin', frameon=False,show=False, ax=ax[2,0])
244
+ # #sc.pl.umap(st.session_state['adata_annot'], color=selected_celltype, title=selected_celltype, legend_loc='on data', frameon=False,show=False, ax=ax[2,1])
245
+
246
+
247
+ # st.pyplot(plt.gcf().set_size_inches(15, 30))
248
+
249
+
250
+
251
+ with tab2:
252
+ with st.form(key='multiselect_form'):
253
+ c1, c2, c3 = st.columns([4,4,2])
254
+ with c1:
255
+ multi_genes = st.multiselect(
256
+ 'Select Genes List',
257
+ st.session_state['genes_list'])
258
+ with c2:
259
+ go_term = st.selectbox(
260
+ 'Select GO Term',
261
+ st.session_state['path_ways'])
262
+ with c3:
263
+ Choice = st.radio(
264
+ "",
265
+ ('Gene Set','GO Term'))
266
+
267
+ Updated_tab2=st.form_submit_button(label = 'Show Results')
268
+ if not isinstance(multi_genes, type(None)) and Updated_tab2:
269
+ if Choice=='Gene Set':
270
+ multi_genes = np.sort(multi_genes)
271
+ else:
272
+ multi_genes=st.session_state['go_table'].loc[:,go_term]
273
+ multi_genes=multi_genes.dropna().values
274
+ #multi_genes=['WNT3', 'VPS13C', 'VAMP4', 'UBTF', 'UBAP2', 'TMEM175', 'TMEM163', 'SYT17', 'STK39', 'SPPL2B', 'SIPA1L2', 'SH3GL2', 'SCARB2', 'SCAF11', 'RPS6KL1', 'RPS12', 'RIT2', 'RIMS1', 'RETREG3', 'PMVK', 'PAM', 'NOD2', 'MIPOL1', 'MEX3C', 'MED12L', 'MCCC1', 'MBNL2', 'MAPT', 'LRRK2', 'KRTCAP2', 'KCNS3', 'KCNIP3', 'ITGA8', 'IP6K2', 'GPNMB', 'GCH1', 'GBA', 'FYN', 'FCGR2A', 'FBRSL1', 'FAM49B', 'FAM171A2', 'ELOVL7', 'DYRK1A', 'DNAH17', 'DLG2', 'CTSB', 'CRLS1', 'CRHR1', 'CLCN3', 'CHRNB1', 'CAMK2D', 'CAB39L', 'BRIP1', 'BIN3', 'ASXL3', 'SNCA']
275
+ #########
276
+
277
+ #sns.clustermap(st.session_state['adata_annot'], figsize=(14,12),
278
+ # pivot_kws={'index': 'country',
279
+ # 'columns': 'year',
280
+ # 'values': 'lifeExp'})
281
+ # col1,col2= st.columns([1,1])
282
+ # #fig_szx=2*len(st.session_state['cell_type'])
283
+ # #fig_szy=100*len(multi_genes)
284
+ # with col1:
285
+ # figa, axxaa = plt.subplots(figsize=(5, 5))
286
+ # #sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data',legend_fontsize='8', frameon=False,show=False, ax=axx11)
287
+ # axxaa=sc.pl.clustermap(st.session_state['adata_annot'], obs_keys=multi_genes) #,'new_anno',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='BuPu',swap_axes=True,show=False,vmax=5)
288
+ # #st.pyplot(fig11)
289
+ # #st.pyplot(plt.gcf().set_size_inches(fig_szx, fig_szy))
290
+ # st.pyplot(plt.gcf())
291
+
292
+
293
+
294
+ col1,col2= st.columns([1,1])
295
+ #fig_szx=2*len(st.session_state['cell_type'])
296
+ #fig_szy=100*len(multi_genes)
297
+ with col1:
298
+ fig11, axx11 = plt.subplots(figsize=(5, 5))
299
+ #sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data',legend_fontsize='8', frameon=False,show=False, ax=axx11)
300
+ axx11=sc.pl.dotplot(st.session_state['adata_annot'], multi_genes,'new_anno',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='BuPu',swap_axes=True,show=False,vmax=5)
301
+ #st.pyplot(fig11)
302
+ #st.pyplot(plt.gcf().set_size_inches(fig_szx, fig_szy))
303
+ st.pyplot(plt.gcf())
304
+ with col2:
305
+ fig12, axx12 = plt.subplots(figsize=(5, 5))
306
+ #sc.pl.umap(st.session_state['adata_annot'], color='new_anno', title='', legend_loc='on data', frameon=False,show=False, ax=axx2)
307
+ #sc.pl.umap(st.session_state['adata_annot'], color=selected_gene, title=selected_gene, legend_loc='best', frameon=False,show=False,legend_fontsize='xx-small', ax=axx12)#,vmax='p99')
308
+ axx12=sc.pl.heatmap(st.session_state['adata_annot'], multi_genes, groupby='new_anno', vmin=-1, vmax=1, cmap='BuPu', dendrogram=True, swap_axes=True)#,ax=ax2)
309
+ #plt.xticks(rotation = 45)
310
+ #st.pyplot(fig12)
311
+ #st.pyplot(plt.gcf().set_size_inches(fig_szx, fig_szy))
312
+ st.pyplot(plt.gcf())
313
+
314
+
315
+ #######
316
+
317
+ #multi_genes=['WNT3', 'VPS13C', 'VAMP4', 'UBTF', 'UBAP2', 'TMEM175', 'TMEM163', 'SYT17', 'STK39', 'SPPL2B', 'SIPA1L2', 'SH3GL2', 'SCARB2', 'SCAF11', 'RPS6KL1', 'RPS12', 'RIT2', 'RIMS1', 'RETREG3', 'PMVK', 'PAM', 'NOD2', 'MIPOL1', 'MEX3C', 'MED12L', 'MCCC1', 'MBNL2', 'MAPT', 'LRRK2', 'KRTCAP2', 'KCNS3', 'KCNIP3', 'ITGA8', 'IP6K2', 'GPNMB', 'GCH1', 'GBA', 'FYN', 'FCGR2A', 'FBRSL1', 'FAM49B', 'FAM171A2', 'ELOVL7', 'DYRK1A', 'DNAH17', 'DLG2', 'CTSB', 'CRLS1', 'CRHR1', 'CLCN3', 'CHRNB1', 'CAMK2D', 'CAB39L', 'BRIP1', 'BIN3', 'ASXL3', 'SNCA']
318
+ #multi_genes=np.sort(multi_genes)
319
+ # fig, ax1 = plt.subplots(1,2)
320
+ # sc.pl.dotplot(st.session_state['adata_annot'], multi_genes,'new_anno',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='RdBu_r',show=False, ax=ax1[0])
321
+ # st.pyplot(plt.gcf().set_size_inches(10, 10))
322
+ # fig, ax2 = plt.subplots(1,2)
323
+ # ax2=sc.pl.heatmap(st.session_state['adata_annot'], multi_genes, 'new_anno', vmin=-1, vmax=1, cmap='RdBu_r', dendrogram=True, swap_axes=True)
324
+ # st.pyplot(plt.gcf().set_size_inches(10, 10))
325
+ #ax[0]=sc.pl.dotplot(st.session_state['adata_annot'],multi_genes,'new_anno',show=False)
326
+ #fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20,4), gridspec_kw={'wspace':0.9})
327
+
328
+
329
+
330
+ #commented these-working ones
331
+
332
+ # fig, (ax1) = plt.subplots(1, 1, figsize=(20,4), gridspec_kw={'wspace':0.9})
333
+ # #ax = plt.subplot()
334
+ # ax1_dict=sc.pl.dotplot(st.session_state['adata_annot'], multi_genes,'new_anno',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='BuPu',swap_axes=True,show=False, ax=ax1,vmax=5)
335
+ # #ax_dict=sc.pl.dotplot(st.session_state['adata_annot'], multi_genes,'new_anno',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='RdBu_r',swap_axes=True,show=False, ax=ax)
336
+ # st.pyplot(plt.gcf().set_size_inches(10, 15))
337
+ # #ax2_dict=sc.pl.dotplot(st.session_state['adata_annot'], multi_genes,'Sex',size_title='Fraction of\n Expressing Cells',colorbar_title='Mean\nExpression',cmap='RdBu_r',swap_axes=True,show=False, ax=ax2)
338
+ # fig, (ax2) = plt.subplots(1, 1, figsize=(20,4), gridspec_kw={'wspace':0.9})
339
+ # #ax2_dict=sc.pl.matrixplot(st.session_state['adata_annot'], multi_genes, 'new_anno', vmin=-1, vmax=1, show=False, cmap='BuPu',dendrogram=True, swap_axes=True, ax=ax2)
340
+
341
+ # #sc.pl.heatmap(adata_annot, genes_lst, groupby='new_anno', vmin=-1, vmax=1, cmap='RdBu_r', dendrogram=True, swap_axes=True, figsize=(11,4))
342
+ # ax2_dict=sc.pl.heatmap(st.session_state['adata_annot'], multi_genes, groupby='new_anno', vmin=-1, vmax=1, cmap='BuPu', dendrogram=True, swap_axes=True)#,ax=ax2)
343
+
344
+ # st.pyplot(plt.gcf().set_size_inches(10, 15))
345
+
346
+
347
+ with readme:
348
+ expander = st.expander("How to use this app")
349
+ #st.header('How to use this app')
350
+ expander.markdown('Please select **Results Menue** checkbox from the sidebar')
351
+ expander.markdown('Select a Gene from the dropdown list')
352
+ expander.markdown('A table showing all reference gudies from three LISTS will appear in the main panel')
353
+ expander.markdown('To see results for each of the selected reference guide from ListA, ListB and ListC, Please select respective checkbox')
354
+ expander.markdown('Results are shown as two tables, **MATCHED** and **MUTATED** guides tables and **NOT FOUND** table if guides are not found in GRCh38 and LR reference fasta files')
355
+ expander.markdown('**MATCHED** guides table shows the genomic postion in GRCh38 and LR Fasta file along other fields. **If a guide is found in GRCh38 but not in LR fasta, then corresponding columns will be NA**')
356
+ expander.markdown('**MUTATED** guides table shows the genomic postion in GRCh38 and LR Fasta file along other fields. **If a guide is found in GRCh38 but not in LR fasta, then corresponding columns will be NA**')
357
+
358
+ expander1 = st.expander('Introduction')
359
+
360
+ expander1.markdown(
361
+ """ This app helps navigate all probable genomic **miss-matched/Mutations (upto 2 bp)** for a given sgRNA (from 3 lists of CRISPRi dual sgRNA libraries) in GRCh38 reference fasta and a Reference fasta generated from BAM generated against KOLF2.1J longread data.
362
+ """
363
+ )
364
+ expander1.markdown('Merged bam file was converted to fasta file using following steps:')
365
+ expander1.markdown('- samtools mpileup to generate bcf file')
366
+ expander1.markdown('- bcftools to generate vcf file')
367
+ expander1.markdown('- bcftools consensus to generate fasta file')
368
+ expander1.markdown('A GPU based [Cas-OFFinder](http://www.rgenome.net/cas-offinder/) tool was used to find off-target sequences (upto 2 miss-matched) for each geiven reference guide against GRCh38 and LR fasta references.')
369
+
370
+ css = '''
371
+ <style>
372
+ .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
373
+ font-size:1.5rem;
374
+ }
375
+ </style>
376
+ '''
377
+
378
+ st.markdown(css, unsafe_allow_html=True)
env.yml ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: scBrainApp
2
+ channels:
3
+ - conda-forge
4
+ - bioconda
5
+ - defaults
6
+ dependencies:
7
+ - ca-certificates=2023.5.7=h8857fd0_0
8
+ - libcxx=16.0.5=hd57cbcb_0
9
+ - libffi=3.3=h046ec9c_2
10
+ - libsqlite=3.42.0=h58db7d2_0
11
+ - libzlib=1.2.13=h8a1eda9_5
12
+ - ncurses=6.4=hf0c8a7f_0
13
+ - openssl=1.1.1u=h8a1eda9_0
14
+ - pip=23.1.2=pyhd8ed1ab_0
15
+ - python=3.9.0=h4f09611_5_cpython
16
+ - readline=8.2=h9e318b2_1
17
+ - setuptools=67.7.2=pyhd8ed1ab_0
18
+ - sqlite=3.42.0=h2b0dec6_0
19
+ - tk=8.6.12=h5dbffcc_0
20
+ - wheel=0.40.0=pyhd8ed1ab_0
21
+ - xz=5.2.6=h775f41a_0
22
+ - zlib=1.2.13=h8a1eda9_5
23
+ - pip:
24
+ - altair==5.0.1
25
+ - anndata==0.9.1
26
+ - appnope==0.1.3
27
+ - asttokens==2.2.1
28
+ - attrs==23.1.0
29
+ - backcall==0.2.0
30
+ - blinker==1.6.2
31
+ - cachetools==5.3.1
32
+ - certifi==2023.5.7
33
+ - charset-normalizer==3.1.0
34
+ - click==8.1.3
35
+ - contourpy==1.0.7
36
+ - cycler==0.11.0
37
+ - decorator==5.1.1
38
+ - decoupler==1.4.0
39
+ - executing==1.2.0
40
+ - fonttools==4.40.0
41
+ - gitdb==4.0.10
42
+ - gitpython==3.1.31
43
+ - h5py==3.8.0
44
+ - idna==3.4
45
+ - importlib-metadata==6.6.0
46
+ - importlib-resources==5.12.0
47
+ - ipython==8.14.0
48
+ - jedi==0.18.2
49
+ - jinja2==3.1.2
50
+ - joblib==1.2.0
51
+ - jsonschema==4.17.3
52
+ - kiwisolver==1.4.4
53
+ - llvmlite==0.40.1rc1
54
+ - markdown-it-py==3.0.0
55
+ - markupsafe==2.1.3
56
+ - matplotlib==3.7.1
57
+ - matplotlib-inline==0.1.6
58
+ - mdurl==0.1.2
59
+ - natsort==8.3.1
60
+ - networkx==3.1
61
+ - numba==0.57.0
62
+ - numpy==1.24.3
63
+ - packaging==23.1
64
+ - pandas==2.0.2
65
+ - parso==0.8.3
66
+ - patsy==0.5.3
67
+ - pexpect==4.8.0
68
+ - pickleshare==0.7.5
69
+ - pillow==9.5.0
70
+ - prompt-toolkit==3.0.38
71
+ - protobuf==4.23.2
72
+ - ptyprocess==0.7.0
73
+ - pure-eval==0.2.2
74
+ - pyarrow==12.0.1
75
+ - pydeck==0.8.1b0
76
+ - pygments==2.15.1
77
+ - pympler==1.0.1
78
+ - pynndescent==0.5.10
79
+ - pyparsing==3.0.9
80
+ - pyrsistent==0.19.3
81
+ - python-dateutil==2.8.2
82
+ - pytz==2023.3
83
+ - pytz-deprecation-shim==0.1.0.post0
84
+ - requests==2.31.0
85
+ - rich==13.4.2
86
+ - scanpy==1.9.3
87
+ - scikit-learn==1.2.2
88
+ - scipy==1.10.1
89
+ - seaborn==0.12.2
90
+ - session-info==1.0.0
91
+ - six==1.16.0
92
+ - smmap==5.0.0
93
+ - stack-data==0.6.2
94
+ - statsmodels==0.14.0
95
+ - stdlib-list==0.8.0
96
+ - streamlit==1.23.1
97
+ - tenacity==8.2.2
98
+ - threadpoolctl==3.1.0
99
+ - toml==0.10.2
100
+ - toolz==0.12.0
101
+ - tornado==6.3.2
102
+ - tqdm==4.65.0
103
+ - traitlets==5.9.0
104
+ - typing-extensions==4.6.3
105
+ - tzdata==2023.3
106
+ - tzlocal==4.3
107
+ - umap-learn==0.5.3
108
+ - urllib3==2.0.3
109
+ - validators==0.20.0
110
+ - wcwidth==0.2.6
111
+ - zipp==3.15.0
functions/__pycache__/pathway_analyses.cpython-39.pyc ADDED
Binary file (26.4 kB). View file
 
functions/pathway_analyses.py ADDED
@@ -0,0 +1,1015 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import scipy
3
+ import warnings
4
+ #import anndata2ri
5
+ import pandas as pd
6
+ import scanpy as sc
7
+ import numpy as np
8
+ import seaborn as sb
9
+ import decoupler as dc
10
+ from scipy import sparse
11
+ from anndata import AnnData
12
+ from tabnanny import verbose
13
+ import matplotlib.pyplot as plt
14
+ #from gsva_prep import prep_gsva
15
+ from typing import Optional, Union
16
+ from matplotlib.pyplot import rcParams
17
+ #from statsmodels.stats.multitest import multipletests
18
+ #from sklearn.model_selection import train_test_split
19
+ #from rpy2.robjects.conversion import localconverter
20
+
21
+
22
+
23
+
24
def rescale_matrix(S, log_scale=False):
    """
    Rescale every column of a count matrix to the median column total

    Each column is divided by its own sum and multiplied by the median of
    all column sums. Columns whose sum is zero are divided by 1 instead, so
    they stay all-zero rather than producing NaN.

    Parameters
    ----------
    S : np.ndarray, scipy.sparse matrix (any format) or pandas.DataFrame
        Matrix with read counts (gene x cell)
    log_scale : bool, optional (default: False)
        Whether to apply log1p to the rescaled matrix

    Returns
    -------
    B : np.ndarray
        Dense rescaled (and optionally log1p-transformed) matrix

    Raises
    ------
    ValueError
        If S is not a DataFrame, ndarray or scipy sparse matrix.
    """
    if isinstance(S, pd.DataFrame):
        S = S.values
    elif scipy.sparse.issparse(S):
        # Accept every sparse format (csr, csc, coo, ...), not only csr.
        S = S.toarray()
    elif not isinstance(S, np.ndarray):
        raise ValueError('Input S must be a pandas.DataFrame, numpy.ndarray or scipy.sparse matrix')

    cs = np.sum(S, axis=0)
    cs[cs == 0] = 1  # avoid 0/0 for empty columns
    B = np.median(cs) * (S / cs)
    if log_scale:
        B = np.log1p(B)
    return B
55
+
56
def normalize_default(adata, log_scale=True):
    """
    Replace ``adata.X`` with the output of ``rescale_matrix``

    The input matrix is ``adata.layers['counts']`` when that layer exists,
    otherwise ``adata.X``. The object is modified in place and also returned.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    log_scale : bool, optional (default: True)
        Forwarded to ``rescale_matrix``.

    Returns
    -------
    adata : AnnData
        The same object, with ``.X`` overwritten.
    """
    # NOTE(review): rescale_matrix documents its input as gene x cell and
    # normalizes per column -- confirm the orientation of the matrix passed here.
    has_counts_layer = 'counts' in adata.layers.keys()
    if has_counts_layer:
        print('normalizaing data using count data in .layers["counts] ')
        counts = adata.layers['counts']
    else:
        print('normaling data using count data in .X')
        counts = adata.X
    adata.X = rescale_matrix(counts, log_scale=log_scale)
    return adata
81
+
82
+
83
def normalize_matrix(
    X: Union[np.ndarray, sparse.spmatrix],
    top_features_frac: float = 1.0,
    scale_factor: Union[str, float, int, np.ndarray, None] = "median",
    transformation: Union[str, None] = "log",
    anchor_features: Union[np.ndarray, None] = None,
) -> Union[np.ndarray, sparse.spmatrix]:
    """
    Library-size normalize the rows of a matrix, rescale, then transform

    Parameters
    ----------
    X : np.ndarray or scipy sparse matrix
        Input matrix; each row is divided by its own library size.
    top_features_frac : float, optional (default: 1.0)
        When below 1.0 (and ``anchor_features`` is None), library sizes use
        only columns whose fraction of positive entries exceeds
        ``1 - top_features_frac``.
    scale_factor : "median", number, np.ndarray or None
        "median" multiplies by the median of row-sum(X) / row-sum(scaled X);
        a number multiplies by that number; ``None`` applies no rescaling.
        An array is multiplied in for sparse input and divided out for
        dense input.
    transformation : "log", "tukey" or anything else
        "log" applies log1p; "tukey" maps non-zero v to sqrt(v) + sqrt(1 + v);
        any other value returns the rescaled matrix untransformed.
    anchor_features : np.ndarray or None
        Column selector used for library sizes; overrides ``top_features_frac``.

    Returns
    -------
    np.ndarray or scipy sparse matrix (CSC when the input was sparse)

    Raises
    ------
    ValueError
        If ``scale_factor`` is not one of the supported kinds.
    """
    X = X.astype(dtype=np.float64)  # astype copies, so the caller's data is untouched

    # Which features (i.e. genes) should we use to compute library sizes?
    # Note: mean as opposed to sum
    if anchor_features is not None:
        lib_sizes = np.array(np.mean(X[:, anchor_features], axis=1))
    elif top_features_frac < 1.0:
        universality = np.array(np.mean(X > 0, axis=0))
        selected_features = np.flatnonzero(universality > (1 - top_features_frac))
        lib_sizes = np.array(np.mean(X[:, selected_features], axis=1))
    else:
        lib_sizes = np.array(np.mean(X, axis=1))

    # Always use a column vector. A 1-D vector divides along the wrong axis
    # without raising when X happens to be square.
    lib_sizes = np.reshape(lib_sizes, (-1, 1))

    # Normalize library sizes
    if sparse.issparse(X):
        X_scaled = X.multiply(1 / lib_sizes)
    else:
        X_scaled = X / lib_sizes

    # Scale normalized rows. Check the type before comparing with "median"
    # so an ndarray scale_factor is never compared element-wise to a string.
    if isinstance(scale_factor, str):
        if scale_factor != "median":
            raise ValueError("Unsupported scale_factor: %r" % (scale_factor,))
        kappa = np.median(np.array(np.sum(X, axis=1) / np.sum(X_scaled, axis=1)))
        X_scaled_norm = X_scaled * kappa
    elif scale_factor is None:
        X_scaled_norm = X_scaled
    elif isinstance(scale_factor, np.ndarray):
        # NOTE(review): sparse multiplies but dense divides -- preserved as
        # found; confirm which of the two is intended.
        if sparse.issparse(X_scaled):
            X_scaled_norm = X_scaled.multiply(scale_factor)
        else:
            X_scaled_norm = X_scaled / scale_factor
    elif isinstance(scale_factor, (int, float, np.integer, np.floating)):
        X_scaled_norm = X_scaled * scale_factor
    else:
        raise ValueError("Unsupported scale_factor: %r" % (scale_factor,))

    # For compatibility with C
    if sparse.issparse(X_scaled_norm):
        X_scaled_norm = sparse.csc_matrix(X_scaled_norm)

    # Post-transformation
    if transformation == "log":
        return np.log1p(X_scaled_norm)

    if transformation == "tukey":
        if sparse.issparse(X_scaled_norm):
            # Transform stored non-zeros only; drop explicit zeros first so
            # they remain zero instead of becoming sqrt(0) + sqrt(1).
            X_scaled_norm.eliminate_zeros()
            stored = X_scaled_norm.data
            X_scaled_norm.data = np.sqrt(stored) + np.sqrt(1 + stored)
        else:
            X_scaled_norm[X_scaled_norm < 0] = 0
            nonzero = X_scaled_norm != 0
            vv = X_scaled_norm[nonzero]
            X_scaled_norm[nonzero] = np.sqrt(vv) + np.sqrt(1 + vv)
        return X_scaled_norm

    return X_scaled_norm
159
+
160
+
161
def normalize_actionet(
    adata: AnnData,
    layer_key: Optional[str] = None,
    layer_key_out: Optional[str] = None,
    top_features_frac: float = 1.0,
    scale_factor: Union[str, float, int, np.ndarray, None] = "median",
    transformation: Union[str, None] = "log",
    anchor_features: Union[np.ndarray, None] = None,
    copy: Optional[bool] = False,
) -> Optional[AnnData]:
    """
    Normalize an AnnData matrix with ``normalize_matrix`` and record the method

    Parameters
    ----------
    adata : AnnData
        Object to normalize; modified in place unless ``copy`` is true.
    layer_key : str, optional
        Layer to read. When None, ``adata.uns["metadata"]["input_assay"]`` is
        used if present, otherwise ``adata.X``.
    layer_key_out : str, optional
        Layer to write the result to. When None the result replaces ``adata.X``.
    top_features_frac, scale_factor, transformation, anchor_features
        Forwarded unchanged to ``normalize_matrix``.
    copy : bool, optional (default: False)
        Work on (and return) a copy instead of modifying ``adata``.

    Returns
    -------
    AnnData or None
        The normalized copy when ``copy`` is true, otherwise None.

    Raises
    ------
    ValueError
        If the requested input layer does not exist.
    """
    adata = adata.copy() if copy else adata

    # Tolerate objects that carry no "metadata" entry yet.
    metadata = adata.uns.get("metadata", {})
    if "norm_method" in metadata:  # Already normalized? warn, but still proceed.
        warnings.warn("AnnData object is prenormalized. Please make sure to use the right assay.")

    if layer_key is None and "input_assay" in metadata:
        layer_key = metadata["input_assay"]

    if layer_key is not None:
        if layer_key not in adata.layers.keys():
            raise ValueError("Did not find adata.layers['" + layer_key + "']. ")
        S = adata.layers[layer_key]
    else:
        S = adata.X

    # Raw counts should be non-negative whole numbers. Checked vectorized so
    # no Python set of every value is built, and an empty matrix is handled.
    values = S.data if sparse.issparse(S) else np.asarray(S).ravel()
    if values.size > 0 and np.any((values < 0) | (values != np.floor(values))):
        warnings.warn("Input [count] assay has non-integer values, which looks like a normalized matrix. Please make sure to use the right assay.")

    S = normalize_matrix(
        S,
        anchor_features=anchor_features,
        top_features_frac=top_features_frac,
        scale_factor=scale_factor,
        transformation=transformation,
    )

    # Metadata is reset (not merged), as in the original implementation.
    adata.uns["metadata"] = {}
    adata.uns["metadata"]["norm_method"] = "default_top%.2f_%s" % (
        top_features_frac,
        transformation,
    )

    if layer_key_out is not None:
        adata.uns["metadata"]["default_assay"] = layer_key_out
        adata.layers[layer_key_out] = S
    else:
        adata.uns["metadata"]["default_assay"] = None
        adata.X = S

    return adata if copy else None
219
+
220
def read_pathways(filename):
    """
    Load a ragged tab-separated pathway file into a DataFrame.

    Rows may have different numbers of fields, so the file is scanned once to
    find the widest row; the table is then read with integer column names
    ``0 .. width-1`` and shorter rows are padded with missing values.
    """
    with open(filename, 'r') as handle:
        field_counts = [len(row.split("\t")) for row in handle.readlines()]
    widest = max(field_counts)
    column_names = list(range(0, widest))
    # Explicit names stop pandas from inferring the width from the first row.
    return pd.read_csv(filename, header=None, delimiter="\t", names=column_names)
226
+
227
+
228
+
229
def filter_expressed_genes_by_celltype(adata: AnnData,
                                       threshold: float=0.05,
                                       filter_genes_from: str='singlecell',
                                       subject_id: str='Subject'):
    """
    Select, for every cell type, the genes that pass an expression threshold.

    Parameters:
    -----------
    adata : AnnData object
        Annotated data matrix whose observations are cells (``adata.obs`` must
        contain a ``cell_type`` column) and whose variables are genes. Raw
        counts are read from ``adata.layers['counts']``.
    threshold : float, optional (default=0.05)
        For ``'singlecell'``: fraction of the cells of a cell type in which a gene
        must be detected (converted to a cell count with ``round``).
        For ``'pseudobulk'``: passed as ``min_prop`` to ``dc.filter_by_prop``.
    filter_genes_from: str, optional (default=`singlecell`)
        ``'singlecell'`` to filter on the single-cell counts, ``'pseudobulk'`` to
        filter on per-individual pseudobulk profiles.
    subject_id (str): name of the ``adata.obs`` column holding individual identifiers
        (only used for ``'pseudobulk'``).

    Returns:
    --------
    expressed_genes_per_celltype : pandas DataFrame
        One column per cell type listing the retained gene names. Columns of
        cell types with fewer retained genes are padded with missing values.

    Raises:
    -------
    ValueError
        If ``filter_genes_from`` is neither ``'singlecell'`` nor ``'pseudobulk'``.
    """
    if filter_genes_from not in ('pseudobulk', 'singlecell'):
        # Previously an unknown option silently produced an empty table.
        raise ValueError(
            "filter_genes_from must be 'pseudobulk' or 'singlecell', got "
            + repr(filter_genes_from)
        )

    expressed_genes_per_celltype = {}

    if filter_genes_from == 'pseudobulk':
        # Get pseudo-bulk profile
        pdata = dc.get_pseudobulk(adata,
                                  sample_col=subject_id,
                                  groups_col='cell_type',
                                  layer='counts',
                                  mode='sum',
                                  min_cells=0,
                                  min_counts=0
                                  )
        for cell_type in pdata.obs.cell_type.unique():
            expressed_genes_per_celltype[cell_type] = dc.filter_by_prop(
                pdata[pdata.obs['cell_type'] == cell_type],
                min_prop=threshold)

    else:
        for cell_type in adata.obs.cell_type.unique():
            # Subset once and reuse it for both the cell count and the counts layer.
            cells = adata[adata.obs['cell_type'] == cell_type]

            # Minimum number of cells a gene must be detected in.
            num_cells = round(threshold * len(cells))

            gene_mask, _ = sc.pp.filter_genes(cells.layers['counts'],
                                              min_cells=num_cells, inplace=False)
            expressed_genes_per_celltype[cell_type] = list(adata.var_names[gene_mask])

    # orient='index' + transpose tolerates gene lists of different lengths.
    return pd.DataFrame.from_dict(expressed_genes_per_celltype, orient='index').transpose()
296
+
297
+
298
def filter_lowly_exp_genes(expressed: pd.DataFrame,
                           all_paths: pd.DataFrame,
                           threshold: float = 0.33):

    """
    Keep, per cell type, the pathways whose genes are sufficiently expressed.

    Parameters:
    -----------
    expressed: pandas.DataFrame
        Expressed genes with cell types as columns and gene IDs as values.
        Missing values (padding of shorter columns) are ignored.
    all_paths: pandas.DataFrame
        Gene sets with pathway names as columns and gene IDs as values.
        Missing values (padding of shorter pathways) are ignored.
    threshold: float, optional (default=0.33)
        A pathway is kept for a cell type when the fraction of its genes that
        are expressed in that cell type is strictly greater than this value.

    Returns:
    --------
    gene_set_per_celltype: dict of pandas.DataFrame
        Cell type name -> long-format table with the columns 'description'
        (last space-separated token of the pathway name), 'member' (an expressed
        gene of the pathway, in pathway order) and 'name' (the pathway name).
        A cell type without any retained pathway maps to an empty table with
        the same three columns.
    """
    columns = ['description', 'member', 'name']

    # Strip the padding once: a DataFrame of ragged gene lists is padded with
    # nulls, and those must count neither as pathway members nor as matches.
    pathway_genes = {path: all_paths[path].dropna() for path in all_paths.columns}

    gene_set_per_celltype = {}

    for cell_type in expressed.columns:
        expressed_genes = set(expressed[cell_type].dropna())

        frames = []
        for pathway, genes in pathway_genes.items():
            if len(genes) == 0:
                continue

            is_expressed = genes.isin(expressed_genes)
            if not is_expressed.sum() / len(genes) > threshold:
                continue

            # Unique expressed members, kept in pathway order so the output is
            # reproducible between runs.
            members = genes[is_expressed].drop_duplicates().tolist()
            if not members:
                continue

            frames.append(pd.DataFrame({'description': pathway.split(" ")[-1],
                                        'member': members,
                                        'name': pathway}))

        # Concatenate once instead of growing a frame inside the loop.
        if frames:
            table = pd.concat(frames, join='outer', ignore_index=True)
        else:
            table = pd.DataFrame(columns=columns)

        gene_set_per_celltype[cell_type] = table[columns]

    return gene_set_per_celltype
351
+
352
+
353
def get_ind_level_ave(adata: AnnData, subject_id: str = 'Subject', method: str = "agg_x_num",
                      expressed_genes_per_celltype: dict = {}, filter_genes_at_threshold: bool = True):
    """
    Get individual-level expression profiles for each cell type in an AnnData object.

    Args:

        adata (AnnData): AnnData object whose observations are cells; ``adata.obs`` must
            contain ``cell_type`` and the ``subject_id`` column.
        subject_id (str): name of the ``adata.obs`` column holding individual identifiers.
        method (str): aggregation strategy.
            ``"agg_x_norm"`` sums ``adata.layers['counts']`` per individual
            (pseudobulk) and then normalizes with ``normalize_actionet``.
            ``"norm_x_agg"`` averages the values already stored in ``adata.X``
            per individual.
            The default ``"agg_x_num"`` is accepted as an alias of ``"agg_x_norm"``
            (it previously matched no branch and made the call fail).
        expressed_genes_per_celltype: mapping (dict or DataFrame) from cell type to the genes
            to keep for that cell type. Missing values are ignored.
        filter_genes_at_threshold (bool): whether to restrict each cell type to the genes in
            ``expressed_genes_per_celltype``. The default is True.

    Returns:

        Dictionary: cell type -> DataFrame (genes x individuals).

    Raises:

        ValueError: if ``method`` is not one of the supported values.

    """

    def _genes_for(cell_type):
        # Columns of a ragged gene table are padded with nulls; drop them so
        # they are never used as labels.
        return pd.Series(expressed_genes_per_celltype[cell_type]).dropna().tolist()

    if method in ("agg_x_norm", "agg_x_num"):

        avs_logcounts_cellxind = {}
        for cell_type in adata.obs.cell_type.unique():

            # Always restrict to the current cell type; only the gene filter is optional.
            adata_temp = adata[adata.obs.cell_type == cell_type].copy()
            if filter_genes_at_threshold:
                adata_temp = adata_temp[:, adata_temp.var_names.isin(_genes_for(cell_type))]

            # Get pseudo-bulk profile
            pdata = dc.get_pseudobulk(adata_temp, sample_col=subject_id, groups_col='cell_type',
                                      layer='counts', mode='sum', min_cells=0, min_counts=0)

            # Normalize and log transform
            pdata.layers['counts'] = pdata.X
            pdata = normalize_actionet(pdata, layer_key='counts', layer_key_out=None,
                                       top_features_frac=1.0, scale_factor="median",
                                       transformation="log", anchor_features=None, copy=True)

            avs_logcounts_cellxind[cell_type] = pd.DataFrame(pdata.X.T,
                                                             columns=pdata.obs[subject_id],
                                                             index=pdata.var_names)

    elif method == 'norm_x_agg':

        meta = adata.obs

        # One (cell type, individual) key per cell, kept in the row order of
        # adata.X. The two parts stay separate columns so identifiers that
        # contain underscores cannot be split incorrectly later.
        keys = pd.DataFrame({
            'cell_type': meta['cell_type'].astype(str).to_numpy(),
            'individual': meta[subject_id].astype(str).to_numpy(),
        })
        group_codes = keys.groupby(['cell_type', 'individual'], sort=True).ngroup().to_numpy()
        groups = (keys.drop_duplicates()
                      .sort_values(['cell_type', 'individual'])
                      .reset_index(drop=True))

        # cells x groups indicator matrix whose rows follow adata's cell order.
        n_cells = len(keys)
        indicator = np.zeros((n_cells, len(groups)), dtype=float)
        indicator[np.arange(n_cells), group_codes] = 1.0

        # genes x groups sums, then divide by the number of cells per group.
        summed = np.asarray(adata.X.T @ indicator)
        ncells = indicator.sum(axis=0)
        averages = summed / ncells

        group_celltypes = groups['cell_type'].to_numpy()
        group_individuals = groups['individual'].to_numpy()

        avs_logcounts_cellxind = {}
        for cell_type in np.unique(group_celltypes):
            selected = group_celltypes == cell_type
            table = pd.DataFrame(averages[:, selected],
                                 index=adata.var_names,
                                 columns=group_individuals[selected])

            if filter_genes_at_threshold:
                table = table.loc[_genes_for(cell_type), :]

            avs_logcounts_cellxind[cell_type] = table

    else:
        raise ValueError(
            "method must be 'agg_x_norm' (alias 'agg_x_num') or 'norm_x_agg', got "
            + repr(method)
        )

    return avs_logcounts_cellxind
501
+
502
+
503
def plot_and_select_top_deps(all_pathways: pd.DataFrame,
                             list_of_paths_to_annotate: list = [],
                             save_name='cell_type_specific',
                             save_prefix: str = 'mathys_pfc',
                             filter: bool=False,
                             cell_type_specific: bool = True,
                             test_name: str = ''):
    """
    Draw a pathway-by-cell-type heatmap of adjusted scores and save it as a PDF.

    Parameters
    ----------
    all_pathways : pd.DataFrame
        Long-format pathway results indexed by pathway. The columns 'pathway',
        'celltype', 'score_adj', 'logFC', 'P.Value', 'shortened' and 'highlight'
        are read.
    list_of_paths_to_annotate : list
        Shortened pathway names to keep when ``filter`` is True.
    save_name, save_prefix : str
        Components of the output file name.
    filter : bool
        True: plot only pathways whose shortened name is in
        ``list_of_paths_to_annotate``. False: plot every pathway whose shortened
        name is not the string 'None'.
    cell_type_specific : bool
        True: keep pathways reported for exactly one cell type.
        False: keep pathways reported for more than one cell type.
    test_name : str
        Comparison label; used for the title and as the sub-directory of ``results/``.

    Returns
    -------
    None
        The figure is written to
        ``results/{test_name}/{save_prefix}_{filtered|all}_{save_name}_diff_exp_paths.pdf``
        and shown without blocking.
    """
    xticks = ['Excitatory', 'Inhibitory', 'Astrocyte', 'Oligodendrocyte', 'OPC', 'Microglia', 'Endothelial']

    # Collect, per pathway, the list of cell types (and other fields) it was reported for.
    collated_df = pd.DataFrame(all_pathways.groupby(all_pathways.index).agg({'score_adj': list, 'celltype': list,
                                                                             'logFC': list, 'P.Value': list,
                                                                             'shortened': list, 'highlight': list}))
    n_celltypes = collated_df["celltype"].apply(len)
    mask = n_celltypes == 1 if cell_type_specific else n_celltypes > 1
    df = collated_df[mask]

    # create pathway by cell type pivot table
    scores_table = pd.pivot_table(all_pathways, values='score_adj', index='pathway', columns='celltype')
    scores_table = scores_table.loc[df.index]
    scores_table['shortened'] = df.shortened.apply(lambda x: x[0])
    scores_table['highlight'] = df.highlight.apply(lambda x: x[0])
    scores_table.sort_values(by=list(all_pathways.celltype.unique()), inplace=True)

    # drop pathways with same shortened names
    scores_table = scores_table.drop_duplicates(subset='shortened', keep='first')

    # The four original variants differ only in row selection, a few style
    # constants, and the title / file-name wording.
    if filter:
        scores_table = scores_table[scores_table.shortened.isin(list_of_paths_to_annotate)]
        scope, file_tag = 'Select', 'filtered'
        linewidths, cbar_shrink, label_size, tick_width = 0.15, 0.2, 4, 0.5
    else:
        scores_table = scores_table[scores_table.shortened != 'None']
        scope, file_tag = 'All', 'all'
        linewidths, cbar_shrink, label_size, tick_width = 0.07, 0.1, 2, 0.25

    if cell_type_specific:
        kind = 'Cell-type-specific'
    else:
        kind = 'Shared' if filter else 'Broad'

    yticklabels = scores_table['shortened']

    # order table by cell type name (also drops the helper columns)
    scores_table = scores_table[xticks]
    n_rows = len(scores_table)

    fig, ax1 = plt.subplots(1, 1, figsize=(0.5, n_rows*0.095), sharex=False, layout='constrained')
    fig.tight_layout()

    g1 = sb.heatmap(scores_table, cmap='bwr', center=0, vmin=-2.5, vmax=2.5, robust=False, annot=None, fmt='.1g',
                    linewidths=linewidths, linecolor='black', annot_kws=None, cbar_kws={'shrink': cbar_shrink},
                    cbar_ax=None, square=False, ax=ax1, xticklabels=xticks, yticklabels=yticklabels, mask=None,)

    # The colorbar is the last axes added to the figure.
    cax = g1.figure.axes[-1]

    g1.set_title(f'{scope} {kind} Pathways in {test_name.split("_")[0]}- vs {test_name.split("_")[-1]}-pathology',
                 fontsize=3)
    g1.set_ylabel('')
    g1.set_xlabel('')

    ax1.tick_params(axis='both', which='major', labelsize=label_size, length=1.5, width=tick_width)
    cax.tick_params(labelsize=4, length=1.5, width=tick_width, which="major")

    plt.tight_layout()
    plt.savefig(f'results/{test_name}/{save_prefix}_{file_tag}_{save_name}_diff_exp_paths.pdf', bbox_inches='tight')
    plt.show(block=False)

    return
687
+
688
+
689
def multi_study_pathway_overlap(pathway_scores: dict = {},
                                filtered_pathways: list = [],
                                cell_types: list = ["Excitatory", "Inhibitory", "Astrocyte",
                                                    "Microglia", "Oligodendrocyte", "OPC", "Endothelial"],
                                test_name: str = 'ad_vs_no',
                                top_n: int = 10,
                                pathways: list = [],
                                filter: bool = False,
                                save_suffix: str = 'ad_vs_no',
                                method: str = 'cell_type_overlap'):

    """
    Plot, side by side, the adjusted scores of pathways shared by several studies.

    One heatmap (pathway x cell type) is drawn per study; only the last one
    carries a colorbar. The figure is saved under ``results/{test_name}/``.

    Parameters:
    -----------
    pathway_scores : dict
        ``pathway_scores[study][test_name]`` is a long-format DataFrame with the
        columns 'pathway', 'celltype' and 'score_adj'. The input is not modified.
    filtered_pathways : list, optional
        Pathway names to restrict the plot to when ``filter`` is True.
    cell_types : list, optional
        Cell types to include, in plotting order.
    test_name : str, optional
        The comparison to plot. Default is 'ad_vs_no'.
    top_n : int, optional
        Currently unused.
    pathways : list, optional
        Currently unused.
    filter : bool, optional
        If True, keep only overlapping pathways that are also in ``filtered_pathways``.
    save_suffix : str, optional
        Currently unused.
    method : str, optional
        'cell_type_overlap': a pathway is kept if, for at least one cell type, it is
        present for that cell type in every study.
        'global_overlap': a pathway is kept if it is present in every study.

    Returns:
    --------
    filtered_scores : dict
        study -> pivot table (pathway x cell type) of 'score_adj' for the retained pathways.

    Raises:
    -------
    ValueError
        If ``pathway_scores`` is empty or ``method`` is not recognised.

    Examples:
    ---------
    >>> multi_study_pathway_overlap(pathway_scores, filtered_pathways=['pathway1', 'pathway2'],
                                    cell_types=['Excitatory', 'Astrocyte'], test_name='ad_vs_no', filter=True)
    """

    studies = list(pathway_scores.keys())
    if not studies:
        raise ValueError("pathway_scores is empty; at least one study is required.")

    # Work on local, cell-type-restricted tables so the caller's dict is untouched.
    scores = {}
    for study in studies:
        table = pathway_scores[study][test_name]
        scores[study] = table[table.celltype.isin(cell_types)]

    # Plain set intersections replace the string-built eval() expressions.
    if method == "cell_type_overlap":
        overlap = []
        for cell_type in cell_types:
            per_study = [set(scores[study][scores[study].celltype == cell_type].pathway)
                         for study in studies]
            overlap.extend(set.intersection(*per_study))
    elif method == "global_overlap":
        overlap = list(set.intersection(*(set(scores[study].pathway) for study in studies)))
    else:
        raise ValueError("method must be 'cell_type_overlap' or 'global_overlap', got " + repr(method))

    # A pathway shared in several cell types must be counted once: each
    # pathway is a single heatmap row.
    overlap = list(dict.fromkeys(overlap))

    if filter:
        n_rows = len(set(filtered_pathways) & set(overlap))
    else:
        n_rows = len(overlap)

    # One panel per study; the last panel is slightly wider to host the colorbar.
    n_studies = len(studies)
    fig, axs = plt.subplots(1, n_studies, figsize=(3.5, n_rows*0.095),
                            gridspec_kw={'width_ratios': [0.85] * (n_studies - 1) + [1]},
                            sharex=False, sharey=True, layout='constrained', squeeze=False)
    axs = axs[0]
    fig.tight_layout()

    filtered_scores = {}
    shortened_names = {}

    for i, study in enumerate(studies):
        kept = scores[study][scores[study].pathway.isin(overlap)]
        kept = pd.pivot_table(kept, values='score_adj', index='pathway', columns='celltype')
        kept = kept[cell_types]

        if filter:
            kept = kept.loc[kept.index.isin(filtered_pathways)]

        filtered_scores[study] = kept

        # Drop the trailing token of each pathway name for the tick labels.
        shortened_names[study] = [' '.join(name.split(" ")[:-1]) for name in kept.index]

        show_cbar = i == n_studies - 1
        g1 = sb.heatmap(kept, cmap='bwr', center=0, vmin=-2.5, vmax=2.5, robust=False, annot=None, fmt='.1g',
                        linewidths=0.015, linecolor='black', annot_kws=None, cbar_kws={'shrink': 0.2}, cbar=show_cbar,
                        cbar_ax=None, square=False, ax=axs[i], xticklabels=cell_types,
                        yticklabels=shortened_names[study], mask=None,)

        axs[i].tick_params(axis='both', which='major', labelsize=2.5, length=1.5, width=0.5)

        g1.set_title(study.split('_')[-1].upper(), fontsize=3)
        g1.set_ylabel('', fontsize=4)
        g1.set_xlabel('')

        if show_cbar:
            # The colorbar is the last axes added to the figure.
            cax = g1.figure.axes[-1]
            cax.tick_params(labelsize=4, length=1.5, width=0.5, which="major")

    plt.suptitle(f"{test_name.split('_')[0].capitalize()}- vs {test_name.split('_')[-1]}-pathology", fontsize=4)

    if filter:
        plt.savefig(f'results/{test_name}/multi_study_pathway_overlap_filtered.pdf', bbox_inches='tight')
    else:
        plt.savefig(f'results/{test_name}/multi_study_pathway_overlap_all.pdf', bbox_inches='tight')
    plt.show(block=False)

    return filtered_scores
816
+
817
+
818
def save_plot(fig, ax, save):
    """
    Write ``fig`` to the path ``save`` with a tight bounding box.

    Nothing happens when ``save`` is None. Otherwise both ``ax`` and ``fig``
    must be provided; a ValueError is raised for whichever is missing
    (``ax`` is checked first).
    """
    if save is None:
        return
    if ax is None:
        raise ValueError("ax is None, cannot save figure.")
    if fig is None:
        raise ValueError("fig is None, cannot save figure.")
    fig.savefig(save, bbox_inches='tight')
827
+
828
+
829
def check_if_matplotlib(return_mpl=False):
    """
    Import matplotlib on demand and hand back the requested module.

    Returns ``matplotlib`` itself when ``return_mpl`` is true, otherwise
    ``matplotlib.pyplot``. Any failure during the import is reported as an
    ImportError with installation instructions.
    """
    try:
        if return_mpl:
            import matplotlib as module
        else:
            import matplotlib.pyplot as module
    except Exception:
        raise ImportError('matplotlib is not installed. Please install it with: pip install matplotlib')
    return module
842
+
843
+
844
def check_if_seaborn():
    """
    Import seaborn on demand and return the module.

    Any failure during the import is reported as an ImportError with
    installation instructions.
    """
    try:
        import seaborn as seaborn_module
    except Exception:
        raise ImportError('seaborn is not installed. Please install it with: pip install seaborn')
    else:
        return seaborn_module
850
+
851
+
852
def check_if_adjustText():
    """
    Import adjustText on demand and return the module.

    Any failure during the import is reported as an ImportError with
    installation instructions.
    """
    try:
        import adjustText as adjust_text_module
    except Exception:
        raise ImportError('adjustText is not installed. Please install it with: pip install adjustText')
    else:
        return adjust_text_module
858
+
859
+
860
def filter_limits(df, sign_limit=None, lFCs_limit=None):

    """
    Keep the rows of ``df`` that lie strictly inside the plotting limits.

    Parameters
    ----------
    df : pd.DataFrame
        Table with the columns 'pvals' and 'logFCs'.
    sign_limit : float, None
        Rows are kept when ``pvals < abs(sign_limit)``. None means no limit.
    lFCs_limit : float, None
        Rows are kept when ``abs(logFCs) < abs(lFCs_limit)``. None means no limit.

    Returns
    -------
    pd.DataFrame
        The rows satisfying both conditions.
    """

    # A missing limit behaves like an infinite one.
    sign_bound = np.abs(np.inf if sign_limit is None else sign_limit)
    lfc_bound = np.abs(np.inf if lFCs_limit is None else lFCs_limit)

    within_sign = df['pvals'] < sign_bound
    within_lfc = np.abs(df['logFCs']) < lfc_bound

    return df.loc[within_sign & within_lfc]
892
+
893
+
894
def plot_volcano(data, x, y, x_label, y_label='-log10(pvals)', annotate=True,
                 annot_by='top', names=None,
                 top=5, sign_thr=0.05, lFCs_thr=0.5, sign_limit=None, lFCs_limit=None,
                 figsize=(7, 5), dpi=100, ax=None, return_fig=False, save=None,
                 fontsizes=None):
    """
    Plot logFC and p-values from a long formatted data-frame as a volcano plot.

    Points are drawn at (logFC, -log10(p-value)). Features passing both the
    significance and the logFC threshold are coloured red (up) or blue (down);
    everything else is gray. If no rows remain after applying the limits, only the
    threshold lines and axis labels are drawn.

    Parameters
    ----------
    data : pd.DataFrame
        Results of DEA in long format. Its index values are used as annotation labels.
    x : str
        Column name of data storing the logFCs.
    y : str
        Column name of data storing the p-values.
    x_label : str
        Alternate name for logFC to be included in plot. If None, defaults to x.
    y_label : str
        Alternate name for p-values to be included in plot. If None, defaults to y.
    annotate : bool
        Whether to annotate labels.
    annot_by : str
        Determines how to annotate the plot. If set to 'top', the ``top`` most
        significant features (among those passing both thresholds) are annotated.
        If set to 'name', only features passing both thresholds whose index is in
        ``names`` are annotated. Any other value annotates every feature passing
        both thresholds.
    names : list, None
        Feature names to be annotated in the plot. Only used if annot_by is set to
        'name'. None is treated as an empty list.
    top : int
        Number of top differentially expressed features to show.
    sign_thr : float
        Significance threshold for p-values.
    lFCs_thr : float
        Significance threshold for logFCs.
    sign_limit : float
        Limit of p-values to plot in -log10.
    lFCs_limit : float
        Limit of logFCs to plot in absolute value.
    figsize : tuple
        Figure size.
    dpi : int
        DPI resolution of figure.
    ax : Axes, None
        A matplotlib axes object. If None returns new figure.
    return_fig : bool
        Whether to return a Figure object or not.
    save : str, None
        Path to where to save the plot. Infer the filetype if ending on {`.pdf`, `.png`, `.svg`}.
    fontsizes : dict, None
        Font sizes; the value under the key ``"on_plot"`` is used for the annotation
        labels. None defaults to ``{"on_plot": 4}``.

    Returns
    -------
    fig : Figure, None
        If return_fig, returns Figure object. This is None when an existing ``ax``
        was supplied, because no new figure is created in that case.
    """

    # Resolve None sentinels here so that no mutable default is shared between calls
    names = [] if names is None else names
    fontsizes = {"on_plot": 4} if fontsizes is None else fontsizes

    if x_label is None:
        x_label = x

    if y_label is None:
        y_label = y

    # Load plotting packages
    # NOTE(review): both helpers are defined outside this block; presumably they
    # import and return the matplotlib.pyplot / adjustText modules - confirm.
    plt = check_if_matplotlib()
    at = check_if_adjustText()

    # Transform sign_thr to the -log10 scale used on the y axis
    sign_thr = -np.log10(sign_thr)

    # Extract df (work on a copy so the caller's frame is never modified)
    df = data.copy()
    df['logFCs'] = df[x]
    df['pvals'] = -np.log10(df[y])

    # Filter by limits
    df = filter_limits(df, sign_limit=sign_limit, lFCs_limit=lFCs_limit)

    # Define color by up or down regulation and significance
    df['weight'] = 'gray'
    up_msk = (df['logFCs'] >= lFCs_thr) & (df['pvals'] >= sign_thr)
    dw_msk = (df['logFCs'] <= -lFCs_thr) & (df['pvals'] >= sign_thr)
    df.loc[up_msk, 'weight'] = '#D62728'
    df.loc[dw_msk, 'weight'] = '#1F77B4'

    # Plot
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)

    # Marker size shrinks as the number of points grows. Skip the scatter entirely
    # when nothing is left to draw: the size formula would divide by zero.
    n = df.shape[0]
    if n > 0:
        size = 120000 / (100 * n)
        df.plot.scatter(x='logFCs', y='pvals', c='weight', sharex=False, ax=ax, s=size)

    # Draw sign lines
    ax.axhline(y=sign_thr, linestyle='--', color="black")
    ax.axvline(x=lFCs_thr, linestyle='--', color="black")
    ax.axvline(x=-lFCs_thr, linestyle='--', color="black")

    # Significant features, most significant first
    signs = df[up_msk | dw_msk].sort_values('pvals', ascending=False)

    # Add labels
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)

    if annotate:
        if annot_by == 'top':
            signs = signs.iloc[:top]
        elif annot_by == 'name':
            signs = signs.loc[signs.index.isin(names)]

        # Loop names are distinct from the x / y parameters so those are not rebound
        texts = [
            ax.text(x_pos, y_pos, label, fontsize=fontsizes['on_plot'])
            for x_pos, y_pos, label in zip(signs['logFCs'], signs['pvals'], signs.index)
        ]
        if texts:
            at.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'), ax=ax)

    # NOTE(review): save_plot is defined outside this block; presumably it writes
    # the figure to `save` when a path is given - confirm.
    save_plot(fig, ax, save)

    if return_fig:
        return fig
pathway_databases/GO_Biological_Process_2021.txt ADDED
The diff for this file is too large to render. See raw diff