Spaces:
Sleeping
Sleeping
File size: 15,205 Bytes
2999286 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 |
#!/usr/bin/env python
# coding: utf-8

# # Cell interaction with CellPhoneDB
#
# CellPhoneDB is a publicly available repository of curated receptors, ligands
# and their interactions in HUMAN. CellPhoneDB can be used to search for a
# particular ligand/receptor, or interrogate your own single-cell
# transcriptomics data.
#
# We made three improvements in integrating the CellPhoneDB algorithm in OmicVerse:
#
# - We added a tutorial showing how to run the analysis starting from any `anndata` object.
# - We added prettier heatmaps, chord diagrams and network diagrams for
#   visualising relationships between cells.
# - We added visualisation of ligand-receptor proteins in different groups.
#
# Paper: [Single-cell reconstruction of the early maternal–fetal interface in humans](https://www.nature.com/articles/s41586-018-0698-6)
#
# Code: https://github.com/ventolab/CellphoneDB
#
# This notebook will demonstrate how to use CellPhoneDB on scRNA data and visualize it.

# In[1]:

import os

import matplotlib.pyplot as plt
import numpy as np
import omicverse as ov
import pandas as pd
import scanpy as sc

ov.plot_set()

# ## The EVT Data
#
# The EVT data have already been annotated with cell types; they can be
# downloaded from the CellPhoneDB tutorial.
#
# Download: https://github.com/ventolab/CellphoneDB/blob/master/notebooks/data_tutorial.zip

# In[2]:

adata = sc.read('data/cpdb/normalised_log_counts.h5ad')
# Keep only the cell types used in this tutorial. `.copy()` materialises the
# subset so that later in-place operations act on a real AnnData object rather
# than on a view of the full dataset.
adata = adata[adata.obs['cell_labels'].isin([
    'eEVT', 'iEVT', 'EVT_1', 'EVT_2', 'DC', 'dNK1', 'dNK2', 'dNK3',
    'VCT', 'VCT_CCC', 'VCT_fusing', 'VCT_p', 'GC', 'SCT',
])].copy()
adata

# In[3]:

ov.pl.embedding(
    adata,
    basis='X_umap',
    color='cell_labels',
    frameon='small',
    palette=(ov.pl.red_color + ov.pl.blue_color + ov.pl.green_color
             + ov.pl.orange_color + ov.pl.purple_color),
)

# In[4]:

adata.X.max()

# We can clearly see that the maximum value of the data is a floating point number less than 10.
# The fact that the maximum value is not an integer means that it has been
# normalised, and the fact that it is less than 10 means that it has been
# log-transformed. Note that our data must not be `scaled`.

# ## Export the anndata object
#
# As the input to CellPhoneDB only requires the expression matrix and cell
# type, we extract only the expression matrix and cell type from adata for the
# next step of the analysis.

# In[5]:

sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# Minimal AnnData: expression matrix plus bare obs/var indices, no annotations.
adata1 = sc.AnnData(
    adata.X,
    obs=pd.DataFrame(index=adata.obs.index),
    var=pd.DataFrame(index=adata.var.index),
)
adata1.write_h5ad('data/cpdb/norm_log.h5ad', compression='gzip')
adata1

# ## Export the meta info of cells
#
# We construct a `DataFrame` object to export the meta info of cells. In the
# EVT adata object, the cell types are stored in `obs['cell_labels']`.

# In[6]:

# Export the metadata: one row per exported cell, indexed by barcode ('Cell'),
# with its label in 'cell_type'. The labels are looked up directly in
# `adata.obs` for the barcodes written to the counts file.
cell_ids = adata1.obs.index
df_meta = pd.DataFrame(
    {'cell_type': adata.obs.loc[cell_ids, 'cell_labels'].tolist()},
    index=pd.Index(cell_ids, name='Cell'),
)
df_meta.to_csv('data/cpdb/meta.tsv', sep='\t')

# ## Cell interaction analysis
#
# Now that we have prepared the meta info of cells (`meta.tsv`) and the
# scRNA-seq matrix (`norm_log.h5ad`), we can use CellPhoneDB to calculate the
# interactions between the cell types in the scRNA-seq data.
#
# Importantly, to avoid a series of bugs, we use absolute paths for the
# CellPhoneDB analysis. We use `os.getcwd()` to get the current working
# directory.

# In[7]:

import os

os.getcwd()

# Another thing to note is that we need to download the `cellphonedb.zip` file from `cellphonedb-data` for further analysis.
# I have placed it in the `data/CPDB` directory, but you can place it in any path you like.
#
# Downloads: https://github.com/ventolab/cellphonedb-data

# In[8]:

# NOTE: machine-specific location of the database archive; point this at your
# own copy of `cellphonedb.zip`.
cpdb_file_path = '/Users/fernandozeng/Desktop/analysis/cellphonedb-data/cellphonedb.zip'
# Absolute paths derived from the current working directory (resolved once).
cpdb_dir = os.path.join(os.getcwd(), 'data', 'cpdb')
meta_file_path = os.path.join(cpdb_dir, 'meta.tsv')
counts_file_path = os.path.join(cpdb_dir, 'norm_log.h5ad')
microenvs_file_path = None
active_tf_path = None
out_path = os.path.join(cpdb_dir, 'test_cellphone')

# Now we run `cpdb_statistical_analysis_method` to predict the cell interactions in the scRNA-seq data.

# In[9]:

from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

cpdb_results = cpdb_statistical_analysis_method.call(
    cpdb_file_path=cpdb_file_path,            # mandatory: CellphoneDB database zip file.
    meta_file_path=meta_file_path,            # mandatory: tsv file defining barcodes to cell label.
    counts_file_path=counts_file_path,        # mandatory: normalized count matrix - a path to the counts file, or an in-memory AnnData object.
    counts_data='hgnc_symbol',                # defines the gene annotation in counts matrix.
    active_tfs_file_path=active_tf_path,      # optional: defines cell types and their active TFs.
    microenvs_file_path=microenvs_file_path,  # optional (default: None): defines cells per microenvironment.
    score_interactions=True,                  # optional: whether to score interactions or not.
    iterations=1000,                          # denotes the number of shufflings performed in the analysis.
    threshold=0.1,                            # defines the min % of cells expressing a gene for this to be employed in the analysis.
    threads=10,                               # number of threads to use in the analysis.
    debug_seed=42,                            # debug random seed. NOTE(review): presumably a negative value disables seeding - confirm against the CellphoneDB docs.
    result_precision=3,                       # sets the rounding for the mean values in significant_means.
    pvalue=0.05,                              # P-value threshold to employ for significance.
    subsampling=False,                        # to enable subsampling the data (geometric sketching).
    subsampling_log=False,                    # (mandatory) enable subsampling log1p for non log-transformed data inputs.
    subsampling_num_pc=100,                   # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells=1000,               # number of cells to subsample (integer) (default: 1/3 of the dataset).
    separator='|',                            # sets the string to employ to separate cells in the results dataframes "cellA|CellB".
    debug=False,                              # saves all intermediate tables employed during the analysis in pkl format.
    output_path=out_path,                     # path to save results.
    output_suffix=None,                       # replaces the timestamp in the output file names with a user-defined string (default: None).
)

# In[10]:

# Persist the results so the downstream cells can start from the saved file.
ov.utils.save(cpdb_results, 'data/cpdb/gex_cpdb_test.pkl')

# In[5]:

cpdb_results = ov.utils.load('data/cpdb/gex_cpdb_test.pkl')

# ## Network of celltype analysis
#
# It is worth noting that we will be using ov for all downstream analysis,
# starting with cell network analysis, where we provide the
# `ov.single.cpdb_network_cal` function to extract interactions, and the
# `ov.single.cpdb_plot_network` function for very elegant visualization.

# In[6]:

interaction = ov.single.cpdb_network_cal(
    adata=adata,
    pvals=cpdb_results['pvalues'],
    celltype_key="cell_labels",
)

# In[7]:

interaction['interaction_edges'].head()

# In[8]:

ov.plot_set()

# Names shared by the plots below: the edge table and the cell types that the
# source/target-restricted plots focus on.
edges = interaction['interaction_edges']
focus_cells = ['EVT_1', 'EVT_2', 'dNK1', 'dNK2', 'dNK3']

# In[9]:

fig, ax = plt.subplots(figsize=(4, 4))
ov.pl.cpdb_heatmap(
    adata, edges, celltype_key='cell_labels',
    fontsize=11,
    ax=ax,
    legend_kws={'fontsize': 12, 'bbox_to_anchor': (5, -0.9), 'loc': 'center left'},
)

# In[10]:

fig, ax = plt.subplots(figsize=(2, 4))
ov.pl.cpdb_heatmap(
    adata, edges, celltype_key='cell_labels',
    source_cells=focus_cells,
    ax=ax,
    legend_kws={'fontsize': 12, 'bbox_to_anchor': (5, -0.9), 'loc': 'center left'},
)

# In[11]:

fig = ov.pl.cpdb_chord(
    adata, edges, celltype_key='cell_labels',
    count_min=60, fontsize=12, padding=50, radius=100, save=None,
)
fig.show()

# In[12]:

fig, ax = plt.subplots(figsize=(4, 4))
ov.pl.cpdb_network(
    adata, edges, celltype_key='cell_labels',
    counts_min=60,
    nodesize_scale=5,
    ax=ax,
)

# In[13]:

fig, ax = plt.subplots(figsize=(4, 4))
ov.pl.cpdb_network(
    adata, edges, celltype_key='cell_labels',
    counts_min=60,
    nodesize_scale=5,
    source_cells=focus_cells,
    ax=ax,
)

# In[14]:

fig, ax = plt.subplots(figsize=(4, 4))
ov.pl.cpdb_network(
    adata, edges, celltype_key='cell_labels',
    counts_min=60,
    nodesize_scale=5,
    target_cells=focus_cells,
    ax=ax,
)

# In[15]:

ov.single.cpdb_plot_network(
    adata=adata,
    interaction_edges=edges,
    celltype_key='cell_labels',
    nodecolor_dict=None, title='EVT Network',
    edgeswidth_scale=25, nodesize_scale=10,
    pos_scale=1, pos_size=10, figsize=(6, 6),
    legend_ncol=3, legend_bbox=(0.8, 0.2), legend_fontsize=10,
)

# Sometimes you do not want to analyse the whole network and a sub-network is
# more useful. We can extract the sub-network from it.
#
# We first need to extract the sub-interactions. Here we assume that we are
# interested in the five cell types `['EVT_1','EVT_2','dNK1','dNK2','dNK3']`.

# In[16]:

focus_cells = ['EVT_1', 'EVT_2', 'dNK1', 'dNK2', 'dNK3']
sub_i = interaction['interaction_edges']
# Keep only edges whose SOURCE and TARGET are both among the focus cell types.
sub_i = sub_i.loc[
    sub_i['SOURCE'].isin(focus_cells) & sub_i['TARGET'].isin(focus_cells)
]

# Then, we extract the sub-anndata object.

# In[17]:

sub_adata = adata[adata.obs['cell_labels'].isin(focus_cells)]
sub_adata

# Now we plot the sub-interaction network between the cells in the scRNA-seq data.

# In[18]:

ov.single.cpdb_plot_network(
    adata=sub_adata,
    interaction_edges=sub_i,
    celltype_key='cell_labels',
    nodecolor_dict=None, title='Sub-EVT Network',
    edgeswidth_scale=25, nodesize_scale=1,
    pos_scale=1, pos_size=10, figsize=(5, 5),
    legend_ncol=3, legend_bbox=(0.8, 0.2), legend_fontsize=10,
)

# In[19]:

fig = ov.pl.cpdb_chord(
    sub_adata, sub_i, celltype_key='cell_labels',
    count_min=10, fontsize=12, padding=60, radius=100, save=None,
)
fig.show()

# In[20]:

fig, ax = plt.subplots(figsize=(4, 4))
ov.pl.cpdb_network(
    sub_adata, sub_i, celltype_key='cell_labels',
    counts_min=10,
    nodesize_scale=5,
    ax=ax,
)

# In[21]:

fig, ax = plt.subplots(figsize=(3, 3))
ov.pl.cpdb_heatmap(
    sub_adata, sub_i, celltype_key='cell_labels',
    ax=ax,
    legend_kws={'fontsize': 12, 'bbox_to_anchor': (5, -0.9), 'loc': 'center left'},
)

# ## Extracting the ligand-receptor pairs
#
# We can set EVT as ligand or receptor to extract the ligand-receptor proteins from the result of CellPhoneDB.
#
#
# The most important step is to extract the subset of results we care about,
# and here we use ov's functions `ov.single.cpdb_exact_target` and
# `ov.single.cpdb_exact_source` to do this. The cell below restricts the means
# table to eEVT/iEVT via `cpdb_exact_target` and then to dNK1/dNK2/dNK3 via
# `cpdb_exact_source`.
# NOTE(review): the original text described eEVT as the ligand, whereas the
# code passes the EVT types to `cpdb_exact_target` - confirm which is intended.

# In[22]:

source_cells = ['dNK1', 'dNK2', 'dNK3']
target_cells = ['eEVT', 'iEVT']

sub_means = ov.single.cpdb_exact_target(cpdb_results['means'], target_cells)
sub_means = ov.single.cpdb_exact_source(sub_means, source_cells)
sub_means.head()

# In[23]:

ov.pl.cpdb_interacting_heatmap(
    adata=adata,
    celltype_key='cell_labels',
    means=cpdb_results['means'],
    pvalues=cpdb_results['pvalues'],
    source_cells=source_cells,
    target_cells=target_cells,
    plot_secret=True,
    min_means=3,
    nodecolor_dict=None,
    ax=None,
    figsize=(2, 6),
    fontsize=10,
)

# Sometimes we care about the expression of the ligand in SOURCE and the
# receptor in TARGET; we provide another function for inspecting that
# expression.

# In[24]:

ov.pl.cpdb_group_heatmap(
    adata=adata,
    celltype_key='cell_labels',
    means=cpdb_results['means'],
    cmap={'Target': 'Blues', 'Source': 'Reds'},
    source_cells=source_cells,
    target_cells=target_cells,
    plot_secret=True,
    min_means=3,
    nodecolor_dict=None,
    ax=None,
    figsize=(2, 6),
    fontsize=10,
)

# We can also build Ligand, Receptor, SOURCE, and TARGET into a regulatory network, which is interesting.
# In[25]:

ov.pl.cpdb_interacting_network(
    adata=adata,
    celltype_key='cell_labels',
    means=cpdb_results['means'],
    source_cells=['dNK1', 'dNK2', 'dNK3'],
    target_cells=['eEVT', 'iEVT'],
    means_min=1,
    means_sum_min=1,
    nodecolor_dict=None,
    ax=None,
    figsize=(6, 6),
    fontsize=10,
)

# Sometimes we want to analyse ligand-receptor pathway enrichment or function,
# so we need to extract ligand-receptor pairs from the significant
# ligand-receptors filtered out above. omicverse provides an easy function
# `ov.single.cpdb_interaction_filtered` for this; the cell below instead
# filters `sub_means` directly.

# In[40]:

# Drop rows lacking a gene symbol on either side of the pair, then pool both
# columns into one gene list for enrichment.
sub_means = sub_means.dropna(subset=['gene_a', 'gene_b'])
enrichr_genes = sub_means['gene_a'].tolist() + sub_means['gene_b'].tolist()

# A tutorial on enrichment can be found in the Bulk chapter of the tutorials:
#
# https://omicverse.readthedocs.io/en/latest/Tutorials-bulk/t_deg/ or https://starlitnightly.github.io/omicverse/Tutorials-bulk/t_deg/

# In[ ]:

pathway_dict = ov.utils.geneset_prepare(
    'genesets/GO_Biological_Process_2023.txt',
    organism='Human',
)

# In[14]:

enr = ov.bulk.geneset_enrichment(
    gene_list=enrichr_genes,
    pathways_dict=pathway_dict,
    pvalue_type='auto',
    organism='human',
)

# In[20]:

ov.plot_set()
ov.bulk.geneset_plot(
    enr,
    figsize=(2, 4),
    fig_title='GO-Bio(EVT)',
    cax_loc=[2, 0.45, 0.5, 0.02],
    num=8,
    bbox_to_anchor_used=(-0.25, -13),
    custom_ticks=[10, 100],
    cmap='Greens',
)

# In[ ]: