Martin Proks committed on
Commit
fc741fb
·
unverified ·
1 Parent(s): dddbe1c

chore: port streamlit-portal to huggingface spaces

Browse files
Home.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import streamlit as st
3
+
4
+ st.set_page_config(layout="wide")
5
+
6
+ st.markdown(
7
+ """
8
+ # Deep Learning Based Models for Preimplantation Mouse and Human Embryos Based on Single Cell RNA Sequencing
9
+
10
+ _[Martin Proks](https://orcid.org/0000-0002-8178-3128)\*,
11
+ [Nazmus Salehin](https://orcid.org/0000-0002-8155-4296)\*,
12
+ [Joshua M. Brickman](https://orcid.org/0000-0003-1580-7491)**_
13
+
14
+ _\* There authors contributed equally to the work_
15
+
16
+ _** Corresponding author [[email protected]](mailto:[email protected])_
17
+
18
+ The rapid growth of single-cell transcriptomic technology has produced an increasing number of
19
+ datasets for both embryonic development and _in vitro_ pluripotent stem cell derived models.
20
+ This avalanche of data surrounding pluripotency and the process of lineage specification has
21
+ meant it has become increasingly difficult to define specific cell types or states and compare
22
+ these to _in vitro_ differentiation. Here we utilize a set of deep learning (DL) tools to
23
+ integrate and classify multiple datasets. This allows for the definition of both mouse and
24
+ human embryo cell types, lineages and states, thereby maximising the information one can garner
25
+ from these precious experimental resources. Our approaches are built on recent initiatives for
26
+ large scale human organ atlases, but here we focus on the difficult to obtain and process
27
+ material that spans early mouse and human development. We deploy similar approaches as the
28
+ initiatives building large reference organ atlases, however with a focus on early mammalian
29
+ development. Using publicly available data for these stages, we test different deep learning
30
+ approaches and develop a model to classify cell types in an unbiased fashion at the same time as
31
+ defining the set of genes used by the model to identify lineages, cell types and states. We have
32
+ used our models trained on _in vivo_ development to classify pluripotent stem cell models for
33
+ both mouse and human development, showcasing the importance of this resource as a dynamic
34
+ reference for early embryogenesis.
35
+ """
36
+ )
37
+
38
+ st.image(
39
+ "static/Fig-1.v4.3.png",
40
+ caption="""
41
+ Summary of datasets used to build reference models. a) Schematic overview of mouse
42
+ and human preimplantation development. b) Quantification of cells per publication which
43
+ were collected for building the mouse (grey) and human (black) reference. c) Computational
44
+ schematic of tools used to build and interpret the reference models. d) Gene expression
45
+ of canonical markers for each developmental stage in mouse (top) and human (bottom)
46
+ preimplantation. e) Reduced dimensional representation of preimplantation mouse (left)
47
+ and human (right) datasets. dpf: days post fertilization, E: embryonic day.
48
+ """,
49
+ )
README.md CHANGED
@@ -1,12 +1,15 @@
1
  ---
2
- title: Preimplantation
3
- emoji: 🏆
4
- colorFrom: green
5
- colorTo: purple
6
  sdk: streamlit
7
- sdk_version: 1.41.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Preimplantation portal
 
 
 
3
  sdk: streamlit
4
+ sdk_version: 1.38.0
5
+ app_file: Home.py
 
6
  ---
7
 
8
+ # preimplantation portal
9
+
10
+ ## Run locally
11
+
12
+ ```bash
13
+ source venv/bin/activate
14
+ streamlit run Home.py
15
+ ```
__init__.py ADDED
File without changes
constants.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Artifact version suffix; interpolated into the Zenodo filenames below
# (e.g. portal_human_v1.5.h5ad).
VERSION = 1.5

# Default dimension-reduction key and metadata column pre-selected in the UI.
DEFAULT_DR = "X_draw_graph_fa"
DEFAULT_META = "stage"
# Base URL of the Zenodo record hosting every dataset used by the portal.
ZENODO_URL = "https://zenodo.org/records/13749348/files"

# Per-species resource registry resolved by the portal pages:
#   RAW_DATASET — unprocessed AnnData (.h5ad) for download
#   DATASET     — processed AnnData used for plotting
#   DEGS        — differentially-expressed-gene tables (feather), keyed by
#                 cell type (CT) or developmental STAGE
#   SHAP        — SHAP feature-importance table for the scANVI classifier
DATA = {
    "HUMAN": {
        "RAW_DATASET": f"{ZENODO_URL}/32_human_adata.h5ad",
        "DATASET": f"{ZENODO_URL}/portal_human_v{VERSION}.h5ad",
        "DEGS": {
            "CT": f"{ZENODO_URL}/human_degs_ct_v{VERSION}.feather",
            "STAGE": f"{ZENODO_URL}/human_degs_stage_v{VERSION}.feather"
        },
        "SHAP": f"{ZENODO_URL}/human_SHAP_v{VERSION}.feather",
    },
    "MOUSE": {
        "RAW_DATASET": f"{ZENODO_URL}/01_mouse_reprocessed.h5ad",
        "DATASET": f"{ZENODO_URL}/portal_mouse_v{VERSION}.h5ad",
        "DEGS": {
            "CT": f"{ZENODO_URL}/mouse_degs_ct_v{VERSION}.feather",
            "STAGE": f"{ZENODO_URL}/mouse_degs_stage_v{VERSION}.feather",
        },
        "SHAP": f"{ZENODO_URL}/mouse_SHAP_v{VERSION}.feather",
    },
}
pages/1_Gene_Expression.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""Gene-expression page: embedding plot colored by metadata or by a gene."""
import anndata
import streamlit as st

from constants import DATA, DEFAULT_DR, DEFAULT_META
from utils import fetch_resource, plot_feature, plot_sc_embedding

st.set_page_config(layout="wide")
st.markdown("""
# Gene expression

Levels of gene activity along differentiation.
""")

ds = st.sidebar.selectbox(
    "**Load dataset**",
    DATA.keys(),
    index=None,
    # Fixed copy-pasted placeholder ("Select contact method...") from the
    # streamlit docs example.
    placeholder="Select dataset ...",
)

if ds is not None:

    # Download (cached) and load the processed AnnData for the chosen species.
    adata = anndata.read_h5ad(fetch_resource(DATA[ds]['DATASET']))

    sl_dr = st.sidebar.selectbox(
        "**Dimension reduction**",
        adata.obsm_keys(),
        index=adata.obsm_keys().index(DEFAULT_DR),
        placeholder="Select method ...",
    )

    sl_metadata = st.sidebar.selectbox(
        "**Metadata**",
        adata.obs.columns,
        index=adata.obs.columns.get_loc(DEFAULT_META),
        placeholder="Select column ...",
    )

    sl_feature = st.sidebar.selectbox(
        "**Gene**",
        adata.raw.var_names,
        index=0,
        placeholder="Select gene ...",
    )

    # Denoised (scVI) expression is only stored for genes kept in adata.var;
    # disable the checkbox for genes present only in adata.raw.
    is_imputed = sl_feature in adata.var_names
    sl_denoised = st.sidebar.checkbox(
        "Use denoised expression?",
        help="Denoised expression is sampled from the decoder.",
        disabled=(not is_imputed)
    )

    col1, col2 = st.columns(2)
    plot_sc_embedding(adata, group_by=sl_metadata, reduction_key=sl_dr, ax=col1)
    # `layer` is truthy when the user ticked the denoised checkbox.
    plot_sc_embedding(
        adata, feature=sl_feature, reduction_key=sl_dr, layer=sl_denoised, ax=col2
    )

    st.markdown("## Raw gene expression")
    plot_feature(adata, feature=sl_feature, group_by=sl_metadata, kind="box")
pages/2_Differentially_Expressed_Genes.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""DEG page: filterable table of differentially expressed genes."""
import pandas as pd
import streamlit as st

from constants import DATA
from utils import fetch_resource

st.set_page_config(layout="wide")

st.markdown("""
# Differentially expressed genes

Genes below have been determined using `sc.tl.rank_genes_groups` where `t-test_overestim_var`
is the default method.
"""
)

# Accumulates pandas.query clauses; joined with " & " at the end.
filter_flag = []
ds = st.sidebar.selectbox(
    "**Select models**",
    DATA.keys(),
    index=None,
    placeholder="Select species",
)

if ds:
    filter_by = st.sidebar.selectbox(
        "**Select by**",
        DATA[ds]["DEGS"].keys(),
        index=None,
        placeholder="Select by",
    )

if ds and filter_by:
    markers = pd.read_feather(fetch_resource(DATA[ds]['DEGS'][filter_by]))

    group = st.sidebar.multiselect(
        "**Cell type**", markers.group.unique(), placeholder="Select group ..."
    )

    genes = st.sidebar.multiselect(
        "**Gene**", markers.gene_symbol.unique(), placeholder="Select genes ..."
    )

    foldchange = st.sidebar.number_input(
        "**Log2 fold-change**",
        value=1,
    )

    pval = st.sidebar.number_input(
        "**Adjusted p-value**",
        value=0.05,
    )

    if group:
        filter_flag.append("group == @group")

    if genes:
        # Documented pandas.query membership form is "<column> in @<list>";
        # the reversed order only works via an undocumented fallback.
        filter_flag.append("gene_symbol in @genes")

    # NOTE: a threshold of exactly 0 is falsy and disables the corresponding
    # filter — this mirrors the original behavior.
    if foldchange:
        filter_flag.append(
            "logfoldchanges > @foldchange"
            if foldchange > 0
            else "logfoldchanges < @foldchange"
        )

    if pval:
        filter_flag.append("pvals_adj < @pval")

    subset = markers.query(" & ".join(filter_flag)) if filter_flag else markers
    st.dataframe(subset, use_container_width=True, height=650)
pages/3_SHAP_features.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""SHAP page: per-cell-type feature importances of the scANVI classifier."""
import pandas as pd
import streamlit as st

from constants import DATA
from utils import fetch_resource

st.set_page_config(layout="wide")

# Raw string: $\mu$ / $\sigma$ contain backslashes that are invalid escape
# sequences in a normal string literal (SyntaxWarning on Python 3.12+).
st.markdown(r"""
# SHAP features

Predicted features (genes) used by the scANVI classifier to determine a cell type. The features
have been determined using [SHAP](https://shap.readthedocs.io/en/latest/).

Each metric for a feature is determined from 10 random bootstraps with replacement.

- weight_mean: $\mu$ of SHAP value
- weight_std: $\sigma$ of SHAP value
- weight_ci_upper: $\mu$ + $\sigma$
- weight_ci_lower: $\mu$ - $\sigma$
- logfoldchanges: Log2fold change from differential expression analysis
- pvals_adj: Adjusted p-value from differential expression analysis
- scores: Estimated score from differential expression analysis
"""
)

ds = st.sidebar.selectbox(
    "**Load dataset**",
    DATA.keys(),
    index=None,
    placeholder="Select dataset ...",
)

if ds:
    data = pd.read_feather(fetch_resource(DATA[ds]["SHAP"]))

    query = st.sidebar.selectbox(
        "**Subset**",
        data.ct.unique().tolist(),
        index=None,
        placeholder="Select cell type ...",
    )

    features = st.sidebar.multiselect(
        "**Genes**", data.feature.unique(), placeholder="Select genes ..."
    )

    # Build pandas.query clauses only for the filters the user actually set.
    filter_condition = []
    if query:
        filter_condition.append("ct == @query")
    if features:
        filter_condition.append("feature in @features")

    if filter_condition:
        data = data.query(" & ".join(filter_condition))

    st.dataframe(data, use_container_width=True, height=650)
pages/4_Download.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""Download page: static links to pipelines, code, raw data and trained models."""
import streamlit as st

from constants import DATA

st.set_page_config(layout="wide")

# f-string so the raw-dataset Zenodo URLs are resolved from the DATA registry.
st.markdown(
    f"""
    # Download

    ## 1. Pipelines

    - Downloading datasets: [nf-core/fetchngs (revision 1.10.0)](https://github.com/nf-core/fetchngs)
    - Aligning datasets: [brickmanlab/scrnaseq (revision: feature/smartseq)](https://github.com/brickmanlab/scrnaseq)
    - **Ensembl Genomes**
        - Mouse: GRCm38 v102
        - Human: GRCh38 v110

    ## 2. Codebase

    - Data analysis: [brickmanlab/proks-salehin-et-al](https://github.com/brickmanlab/proks-salehin-et-al)
    - Web portal: [brickmanlab/preimplantation-portal](https://github.com/brickmanlab/preimplantation-portal)

    ## 3. Raw data

    - [Mouse]({DATA['MOUSE']['RAW_DATASET']})
    - [Human]({DATA['HUMAN']['RAW_DATASET']})

    ## 4. AI models

    Trained models with parameters were uploaded to [Hugging Face](https://huggingface.co/brickmanlab/preimplantation-models).

    ### 4.1 Models

    - [scANVI mouse](https://huggingface.co/brickmanlab/mouse-scanvi)
    - [scANVI human](https://huggingface.co/brickmanlab/human-scanvi)

    """
)
pages/__init__.py ADDED
File without changes
prepare.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ anndata==0.10.9
2
+ plotly==5.24.0
3
+ pyarrow==17.0.0
static/Fig-1.v4.3.png ADDED
utils.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import socket
2
+ import urllib.request
3
+ from pathlib import Path
4
+ from typing import Literal
5
+
6
+ import anndata
7
+ import pandas as pd
8
+ import plotly.express as px
9
+ import streamlit as st
10
+
11
+
12
@st.cache_data
def fetch_resource(url: str) -> str:
    """Download a remote dataset once and return its local path.

    The result is additionally memoized by streamlit's ``st.cache_data``,
    and the file itself is cached on disk under ``/tmp``.

    Parameters
    ----------
    url : str
        Zenodo url link

    Returns
    -------
    str
        Path where the file was downloaded to, default /tmp

    Raises
    ------
    ConnectionError
        If the resource cannot be retrieved (DNS failure or URL error).
    """

    filename = f"/tmp/{url.split('/')[-1]}"
    if not Path(filename).exists():
        try:
            urllib.request.urlretrieve(url, filename)
        except (socket.gaierror, urllib.error.URLError) as err:
            # Chain the original exception so the root cause stays visible
            # in the traceback (`raise ... from err`).
            raise ConnectionError(f"could not download {url} due to {err}") from err

    return filename
35
+
36
+
37
def get_embedding(adata: anndata.AnnData, key: str) -> pd.DataFrame:
    """
    Return the first two coordinates of a stored embedding as a DataFrame.

    Parameters
    ----------
    adata : anndata.AnnData
        scrna-seq dataset
    key : str
        Dimension reduction key, usually starts with X_

    Returns
    -------
    pd.DataFrame
        Two-column frame named ``<BASIS>_1`` / ``<BASIS>_2``

    Raises
    ------
    ValueError
        Fail if reduction key doesn't exist
    """
    if key not in adata.obsm.keys():
        raise ValueError(f"Reduction key: {key} not available")

    # Strip the leading "X_" and upper-case the basis name for the columns.
    basis = key[2:].upper()
    coords = adata.obsm[key][:, :2]
    return pd.DataFrame(coords, columns=(f"{basis}_1", f"{basis}_2"))
63
+
64
+
65
def plot_sc_embedding(
    adata: anndata.AnnData,
    reduction_key: str,
    group_by: str = None,
    feature: str = None,
    layer: str = None,
    ax = None,
):
    """
    Scatter-plot a 2D embedding of the dataset, colored by metadata or a gene.

    Parameters
    ----------
    adata : anndata.AnnData
        scrna-seq dataset
    reduction_key : str
        Reduced space key (must exist in ``adata.obsm``)
    group_by : str
        Metadata column used to color cells
    feature : str
        Gene to color by; takes precedence over ``group_by`` coloring
    layer : str
        Truthy selects the "scVI_normalized" layer, otherwise raw counts
    ax : _type_
        Streamlit container to draw into; falls back to the page itself

    NOTE(review): at least one of ``group_by``/``feature`` must be given,
    otherwise ``kwargs`` is never defined and a NameError is raised — confirm
    callers always pass one.
    """
    embeddings = get_embedding(adata, reduction_key)

    if group_by:
        embeddings[group_by] = adata.obs[group_by].values
        # Sort so categorical legend entries appear in a stable order.
        embeddings = embeddings.sort_values(by=group_by)

        # color_uns_key = f"{group_by}_colors"

        kwargs = {"color": embeddings[group_by].values.tolist()}
        if adata.obs[group_by].dtype == "category":
            # Categorical coloring uses plotly's default discrete palette.
            ...
        else:
            kwargs["color_continuous_scale"] = px.colors.sequential.Viridis

    if feature:
        # Truthy `layer` selects denoised (scVI) expression; otherwise raw.
        X = (
            adata[:, feature].layers["scVI_normalized"].toarray()
            if layer
            else adata.raw[:, feature].X.toarray()
        )
        embeddings[feature] = X.ravel()
        # Intentionally replaces any group_by kwargs: feature coloring wins.
        kwargs = {
            "color": embeddings[feature].values.tolist(),
            # "title": feature,
            "color_continuous_scale": px.colors.sequential.Viridis,
        }

    # Draw into the provided streamlit container, or the page if none given.
    ax_ = ax if ax else st
    ax_.plotly_chart(
        px.scatter(
            data_frame=embeddings,
            x=embeddings.columns[0],
            y=embeddings.columns[1],
            **kwargs,
        ),
        use_container_width=True,
        # .update_xaxes(showgrid=False)
        # .update_yaxes(showgrid=False, zeroline=False)
    )
128
+
129
+
130
def plot_feature(
    adata: anndata.AnnData,
    feature: str,
    group_by: str,
    kind: Literal["box"] = "box",
    ax = None
):
    """Plot raw expression of one gene, grouped by a metadata column.

    Parameters
    ----------
    adata : anndata.AnnData
        Dataset
    feature : str
        Gene name
    group_by : str
        Metadata column
    kind : str
        Type of plot (only "box" is supported)
    ax : _type_, optional
        Streamlit container to draw into, by default None

    Raises
    ------
    ValueError
        If ``kind`` is not a supported plot type.
    """

    # Reject unsupported plot kinds up-front.
    if kind != "box":
        raise ValueError(f"Provided kind: {kind} not supported")

    # Raw expression of the gene, one row per cell, grouped by metadata.
    frame = pd.DataFrame(adata.raw[:, feature].X.toarray(), columns=[feature])
    frame[group_by] = adata.obs[group_by].values
    frame = frame.sort_values(by=group_by)

    fig = px.box(frame, x=group_by, y=feature, color=group_by)

    # Draw into the provided container, or the page itself if none given.
    target = ax if ax else st
    target.plotly_chart(fig, use_container_width=True)