#!/usr/bin/env python
# coding: utf-8

# # Data integration and batch correction with SIMBA
#
# This walkthrough uses three scRNA-seq human pancreas datasets from
# different studies to illustrate how SIMBA performs scRNA-seq batch
# correction for multiple batches.
#
# It follows the corresponding tutorial at
# [SIMBA](https://simba-bio.readthedocs.io/en/latest/rna_human_pancreas.html).
# Little explanation is given here; refer to the original tutorial instead.
#
# Paper: [SIMBA: single-cell embedding along with features](https://www.nature.com/articles/s41592-023-01899-8)
#
# Code: https://github.com/huidongchen/simba

# In[1]:


import omicverse as ov
from omicverse.utils import mde

workdir = 'result_human_pancreas'
ov.utils.ov_plot_set()


# simba has to be installed first, either with conda
#
# ```
# conda install -c bioconda simba
# ```
#
# or with pip
#
# ```
# pip install git+https://github.com/huidongchen/simba
# pip install git+https://github.com/pinellolab/simba_pbg
# ```

# ## Read data
#
# The anndata object was concatenated from three anndata objects in simba:
# `simba.datasets.rna_baron2016()`, `simba.datasets.rna_segerstolpe2016()`,
# and `simba.datasets.rna_muraro2016()`
#
# It can be downloaded from figshare: https://figshare.com/ndownloader/files/41418600

# In[2]:


adata = ov.utils.read('simba_adata_raw.h5ad')


# The pySIMBA object is initialised with the data and the working directory

# In[3]:


simba_object = ov.single.pySIMBA(adata, workdir)


# ## Preprocess
#
# Following the original tutorial, the parameters are left at the values
# used there.

# In[4]:


simba_object.preprocess(
    batch_key='batch',
    min_n_cells=3,
    method='lib_size',
    n_top_genes=3000,
    n_bins=5,
)


# ## Generate a graph for training
#
# Observations and variables within each Anndata object are both
# represented as nodes (entities).
#
# The data is stored in `simba_object.uns['simba_batch_edge_dict']`

# In[5]:


simba_object.gen_graph()


# ## PBG training
#
# Before training, let's take a look at the current parameters:
#
# - dict_config['workers'] = 12 #The number of CPUs.
# In[10]:


# Run training, passing `num_workers=6` to the training call.
simba_object.train(num_workers=6)


# In[6]:


# Load from the `pbg/graph0` sub-directory of the working directory.
# The path is built from `workdir` rather than repeating the directory name,
# so it stays correct if `workdir` is changed; with
# workdir == 'result_human_pancreas' the resulting string is unchanged.
simba_object.load(f'{workdir}/pbg/graph0')


# ## Batch correction
#
# Here, we use `simba_object.batch_correction()` to perform the batch correction
#
#
# Note
#
# If the number of batches is greater than 10, then the batch correction is less effective
#
# In[7]:


adata = simba_object.batch_correction()
# Bare expression kept from the notebook, where it displays the object;
# it has no effect when this file is run as a script.
adata


# ## Visualize
#
# We use `mde` instead of `umap` to visualize the result

# In[8]:


adata.obsm["X_mde"] = mde(adata.obsm["X_simba"])


# In[11]:


# scanpy must be imported before its first use. In the exported notebook the
# import sat in a later cell (the cells had been executed out of order), so
# running the file top to bottom raised NameError on `sc`.
import scanpy as sc

sc.pl.embedding(adata, basis='X_mde', color=['cell_type1', 'batch'])


# UMAP can also be used to visualize the result

# In[10]:


sc.pp.neighbors(adata, use_rep="X_simba")
sc.tl.umap(adata)
sc.pl.umap(adata, color=['cell_type1', 'batch'])


# In[ ]: